author    Unit 193 <unit193@unit193.net>    2021-05-03 23:36:53 -0400
committer Unit 193 <unit193@unit193.net>    2021-05-03 23:36:53 -0400
commit   b9e1f9713d702cdb4721c6d7005718f43170c8fd (patch)
tree     d4bae9611841843502f1bac1ceeb88f5e8aac438
parent   8ce3f41264ca43e2acd627592667ce66bc4b63fe (diff)
parent   e7eb1f9779f2e223575ab23a6bc1abf2222e7d27 (diff)
download gallery-dl-b9e1f9713d702cdb4721c6d7005718f43170c8fd.tar.bz2
         gallery-dl-b9e1f9713d702cdb4721c6d7005718f43170c8fd.tar.xz
         gallery-dl-b9e1f9713d702cdb4721c6d7005718f43170c8fd.tar.zst
Update upstream source from tag 'upstream/1.17.3'
Update to upstream version '1.17.3' with Debian dir f48eb29debef9eb4ad856e7a0a50599d29d2128a
-rw-r--r--  CHANGELOG.md                              33
-rw-r--r--  PKG-INFO                                   6
-rw-r--r--  README.rst                                 4
-rw-r--r--  data/man/gallery-dl.1                      2
-rw-r--r--  data/man/gallery-dl.conf.5                75
-rw-r--r--  docs/gallery-dl.conf                       7
-rw-r--r--  gallery_dl.egg-info/PKG-INFO               6
-rw-r--r--  gallery_dl.egg-info/SOURCES.txt            4
-rw-r--r--  gallery_dl/__init__.py                     7
-rw-r--r--  gallery_dl/extractor/500px.py              4
-rw-r--r--  gallery_dl/extractor/8muses.py            11
-rw-r--r--  gallery_dl/extractor/__init__.py           4
-rw-r--r--  gallery_dl/extractor/artstation.py        35
-rw-r--r--  gallery_dl/extractor/bcy.py               14
-rw-r--r--  gallery_dl/extractor/booru.py             17
-rw-r--r--  gallery_dl/extractor/danbooru.py           9
-rw-r--r--  gallery_dl/extractor/deviantart.py        34
-rw-r--r--  gallery_dl/extractor/erome.py             10
-rw-r--r--  gallery_dl/extractor/exhentai.py          40
-rw-r--r--  gallery_dl/extractor/fanbox.py           283
-rw-r--r--  gallery_dl/extractor/fantia.py           147
-rw-r--r--  gallery_dl/extractor/gelbooru.py          33
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py      27
-rw-r--r--  gallery_dl/extractor/hentaicosplays.py    74
-rw-r--r--  gallery_dl/extractor/instagram.py         33
-rw-r--r--  gallery_dl/extractor/komikcast.py          4
-rw-r--r--  gallery_dl/extractor/luscious.py           5
-rw-r--r--  gallery_dl/extractor/manganelo.py          2
-rw-r--r--  gallery_dl/extractor/mangapark.py          7
-rw-r--r--  gallery_dl/extractor/myportfolio.py       22
-rw-r--r--  gallery_dl/extractor/naverwebtoon.py      79
-rw-r--r--  gallery_dl/extractor/philomena.py          4
-rw-r--r--  gallery_dl/extractor/piczel.py             3
-rw-r--r--  gallery_dl/extractor/pixiv.py              4
-rw-r--r--  gallery_dl/extractor/pururin.py            6
-rw-r--r--  gallery_dl/extractor/sankaku.py            6
-rw-r--r--  gallery_dl/extractor/shopify.py            8
-rw-r--r--  gallery_dl/extractor/simplyhentai.py      14
-rw-r--r--  gallery_dl/extractor/slideshare.py        20
-rw-r--r--  gallery_dl/extractor/smugmug.py            8
-rw-r--r--  gallery_dl/extractor/unsplash.py           4
-rw-r--r--  gallery_dl/extractor/webtoons.py          97
-rw-r--r--  gallery_dl/extractor/yuki.py             125
-rw-r--r--  gallery_dl/job.py                          8
-rw-r--r--  gallery_dl/version.py                      2
-rw-r--r--  test/test_results.py                       2
46 files changed, 987 insertions, 362 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d57583e..59691b7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,38 @@
# Changelog
+## 1.17.3 - 2021-04-25
+### Additions
+- [danbooru] add option for extended metadata extraction ([#1458](https://github.com/mikf/gallery-dl/issues/1458))
+- [fanbox] add extractors ([#1459](https://github.com/mikf/gallery-dl/issues/1459))
+- [fantia] add extractors ([#1459](https://github.com/mikf/gallery-dl/issues/1459))
+- [gelbooru] add an option to extract notes ([#1457](https://github.com/mikf/gallery-dl/issues/1457))
+- [hentaicosplays] add extractor ([#907](https://github.com/mikf/gallery-dl/issues/907), [#1473](https://github.com/mikf/gallery-dl/issues/1473), [#1483](https://github.com/mikf/gallery-dl/issues/1483))
+- [instagram] add extractor for `tagged` posts ([#1439](https://github.com/mikf/gallery-dl/issues/1439))
+- [naverwebtoon] ignore non-comic images
+- [pixiv] also save untranslated tags when `translated-tags` is enabled ([#1501](https://github.com/mikf/gallery-dl/issues/1501))
+- [shopify] support omgmiamiswimwear.com ([#1280](https://github.com/mikf/gallery-dl/issues/1280))
+- implement `output.fallback` option
+- add archive format to InfoJob output ([#875](https://github.com/mikf/gallery-dl/issues/875))
+- build executables with SOCKS proxy support ([#1424](https://github.com/mikf/gallery-dl/issues/1424))
+### Fixes
+- [500px] update query hashes
+- [8muses] fix JSON deobfuscation
+- [artstation] download `/4k/` images ([#1422](https://github.com/mikf/gallery-dl/issues/1422))
+- [deviantart] fix pagination for Eclipse results ([#1444](https://github.com/mikf/gallery-dl/issues/1444))
+- [deviantart] improve folder name matching ([#1451](https://github.com/mikf/gallery-dl/issues/1451))
+- [erome] skip deleted albums ([#1447](https://github.com/mikf/gallery-dl/issues/1447))
+- [exhentai] fix image limit detection ([#1437](https://github.com/mikf/gallery-dl/issues/1437))
+- [exhentai] restore `limits` option ([#1487](https://github.com/mikf/gallery-dl/issues/1487))
+- [gelbooru] fix tag category extraction ([#1455](https://github.com/mikf/gallery-dl/issues/1455))
+- [instagram] update query hashes
+- [komikcast] fix extraction
+- [simplyhentai] fix extraction
+- [slideshare] fix extraction
+- [webtoons] update agegate/GDPR cookies ([#1431](https://github.com/mikf/gallery-dl/issues/1431))
+- fix `category-transfer` option
+### Removals
+- [yuki] remove module for yuki.la
+
## 1.17.2 - 2021-04-02
### Additions
- [deviantart] add support for posts from watched users ([#794](https://github.com/mikf/gallery-dl/issues/794))
diff --git a/PKG-INFO b/PKG-INFO
index f3ee6d3..3df2fe0 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.17.2
+Version: 1.17.3
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
@@ -75,8 +75,8 @@ Description: ==========
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/README.rst b/README.rst
index 4cbaa0e..d659faf 100644
--- a/README.rst
+++ b/README.rst
@@ -64,8 +64,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 1ab1ec6..6a22a07 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2021-04-02" "1.17.2" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2021-04-25" "1.17.3" "gallery-dl Manual"
.\" disable hyphenation
.nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 608c2e5..0190b7f 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2021-04-02" "1.17.2" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2021-04-25" "1.17.3" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -824,22 +824,6 @@ descend into subfolders
Download embedded videos hosted on https://www.blogger.com/
-.SS extractor.danbooru.ugoira
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]false\f[]
-
-.IP "Description:" 4
-Controls the download target for Ugoira posts.
-
-.br
-* \f[I]true\f[]: Original ZIP archives
-.br
-* \f[I]false\f[]: Converted video files
-
-
.SS extractor.derpibooru.api-key
.IP "Type:" 6
\f[I]string\f[]
@@ -1042,6 +1026,18 @@ or whenever your \f[I]cache file\f[] is deleted or cleared.
Minimum wait time in seconds before API requests.
+.SS extractor.exhentai.limits
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]null\f[]
+
+.IP "Description:" 4
+Sets a custom image download limit and
+stops extraction when it gets exceeded.
+
+
.SS extractor.exhentai.domain
.IP "Type:" 6
\f[I]string\f[]
@@ -1085,6 +1081,26 @@ Makes \f[I]date\f[] and \f[I]filesize\f[] more precise.
Download full-sized original images if available.
+.SS extractor.fanbox.embeds
+.IP "Type:" 6
+\f[I]bool\f[] or \f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Control behavior on embedded content from external sites.
+
+.br
+* \f[I]true\f[]: Extract embed URLs and download them if supported
+(videos are not downloaded).
+.br
+* \f[I]"ytdl"\f[]: Like \f[I]true\f[], but let \f[I]youtube-dl\f[] handle video
+extraction and download for YouTube, Vimeo and SoundCloud embeds.
+.br
+* \f[I]false\f[]: Ignore embeds.
+
+
.SS extractor.flickr.access-token & .access-token-secret
.IP "Type:" 6
\f[I]string\f[]
@@ -1963,20 +1979,6 @@ Extract media from retweeted posts.
Download video files.
-.SS extractor.[booru].tags
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]false\f[]
-
-.IP "Description:" 4
-Categorize tags by their respective types
-and provide them as \f[I]tags_<type>\f[] metadata fields.
-
-Note: This requires 1 additional HTTP request for each post.
-
-
.SS extractor.[manga-extractor].chapter-reverse
.IP "Type:" 6
\f[I]bool\f[]
@@ -2240,6 +2242,17 @@ All available options can be found in \f[I]youtube-dl's docstrings
.SH OUTPUT OPTIONS
+.SS output.fallback
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Include fallback URLs in the output of \f[I]-g/--get-urls\f[].
+
+
.SS output.mode
.IP "Type:" 6
\f[I]string\f[]
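
Taken together, the options documented above (extractor.exhentai.limits, extractor.fanbox.embeds, output.fallback) can be set in gallery-dl.conf along these lines; a minimal sketch with illustrative values:

{
    "extractor": {
        "exhentai": {"limits": 5000},
        "fanbox":   {"embeds": "ytdl"}
    },
    "output": {
        "fallback": true
    }
}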
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 8a3d9e2..4eaf1b8 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -49,7 +49,8 @@
{
"username": null,
"password": null,
- "ugoira": false
+ "ugoira": false,
+ "metadata": false
},
"derpibooru":
{
@@ -79,6 +80,7 @@
"username": null,
"password": null,
"domain": "auto",
+ "limits": true,
"metadata": false,
"original": true,
"sleep-request": 5.0
@@ -279,7 +281,8 @@
},
"booru":
{
- "tags": false
+ "tags": false,
+ "notes": false
}
},
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index f233a1a..e192d75 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery-dl
-Version: 1.17.2
+Version: 1.17.3
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
@@ -75,8 +75,8 @@ Description: ==========
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.bin>`__
| Executables built from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 09e7097..3cc2071 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -59,6 +59,8 @@ gallery_dl/extractor/e621.py
gallery_dl/extractor/erome.py
gallery_dl/extractor/exhentai.py
gallery_dl/extractor/fallenangels.py
+gallery_dl/extractor/fanbox.py
+gallery_dl/extractor/fantia.py
gallery_dl/extractor/flickr.py
gallery_dl/extractor/foolfuuka.py
gallery_dl/extractor/foolslide.py
@@ -70,6 +72,7 @@ gallery_dl/extractor/gelbooru_v02.py
gallery_dl/extractor/gfycat.py
gallery_dl/extractor/hbrowse.py
gallery_dl/extractor/hentai2read.py
+gallery_dl/extractor/hentaicosplays.py
gallery_dl/extractor/hentaifoundry.py
gallery_dl/extractor/hentaifox.py
gallery_dl/extractor/hentaihand.py
@@ -165,7 +168,6 @@ gallery_dl/extractor/weibo.py
gallery_dl/extractor/wikiart.py
gallery_dl/extractor/xhamster.py
gallery_dl/extractor/xvideos.py
-gallery_dl/extractor/yuki.py
gallery_dl/postprocessor/__init__.py
gallery_dl/postprocessor/classify.py
gallery_dl/postprocessor/common.py
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index c1f80b6..5bf229a 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2020 Mike Fährmann
+# Copyright 2014-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,7 +9,7 @@
from __future__ import unicode_literals, print_function
__author__ = "Mike Fährmann"
-__copyright__ = "Copyright 2014-2020 Mike Fährmann"
+__copyright__ = "Copyright 2014-2021 Mike Fährmann"
__license__ = "GPLv2"
__maintainer__ = "Mike Fährmann"
__email__ = "mike_faehrmann@web.de"
@@ -204,6 +204,9 @@ def main():
if args.list_urls:
jobtype = job.UrlJob
jobtype.maxdepth = args.list_urls
+ if config.get(("output",), "fallback", True):
+ jobtype.handle_url = \
+ staticmethod(jobtype.handle_url_fallback)
else:
jobtype = args.jobtype or job.DownloadJob
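
For reference, the hunk above rebinds UrlJob's URL handler at class level so that -g/--get-urls also prints fallback URLs. A minimal standalone sketch of the pattern; the UrlJob body here is a simplified assumption, not the job.py implementation, and the "| " prefix is illustrative:

class UrlJob:
    @staticmethod
    def handle_url(url, kwdict):
        print(url)

    @staticmethod
    def handle_url_fallback(url, kwdict):
        print(url)
        for fallback in kwdict.get("_fallback") or ():
            print("|", fallback)

# with output.fallback enabled, every instance now prints fallbacks too:
UrlJob.handle_url = staticmethod(UrlJob.handle_url_fallback)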
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index aa0e8ad..0583eb9 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -146,7 +146,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
}),
# unavailable photos (#1335)
("https://500px.com/p/Light_Expression_Photography/galleries/street", {
- "count": 0,
+ "count": ">= 7",
}),
("https://500px.com/fashvamp/galleries/lera"),
)
@@ -159,7 +159,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
def metadata(self):
user = self._request_graphql(
"ProfileRendererQuery", {"username": self.user_name},
- "105058632482dd2786fd5775745908dc928f537b28e28356b076522757d65c19",
+ "fcecc7028c308115b0defebc63acec3fe3c12df86a602c3e1785ba5cfb8fff47",
)["profile"]
self.user_id = str(user["legacyId"])
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index 3eb5565..c961ded 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -54,10 +54,17 @@ class _8musesAlbumExtractor(Extractor):
"private": False,
},
}),
+ # custom sorting
("https://www.8muses.com/comics/album/Fakku-Comics/8?sort=az", {
"count": ">= 70",
"keyword": {"name": r"re:^[R-Zr-z]"},
}),
+ # non-ASCII characters
+ (("https://comics.8muses.com/comics/album/Various-Authors/Chessire88"
+ "/From-Trainers-to-Pokmons"), {
+ "count": 2,
+ "keyword": {"name": "re:From Trainers to Pokémons"},
+ }),
)
def __init__(self, match):
@@ -125,6 +132,6 @@ class _8musesAlbumExtractor(Extractor):
@staticmethod
def _unobfuscate(data):
return json.loads("".join([
- chr(33 + (ord(c) + 14) % 94) if c != " " else c
+ chr(33 + (ord(c) + 14) % 94) if "!" <= c <= "~" else c
for c in text.unescape(data.strip("\t\n\r !"))
]))
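
The deobfuscation above is ROT47: chr(33 + (ord(c) + 14) % 94) equals chr(33 + (ord(c) - 33 + 47) % 94) for printable ASCII. The fix restricts the rotation to '!'..'~' so that non-ASCII characters (such as the 'é' in the new test case) pass through unchanged instead of being corrupted. A standalone sketch:

import json

def unobfuscate(data):
    # ROT47 each printable-ASCII character; leave everything else as-is
    return json.loads("".join(
        chr(33 + (ord(c) + 14) % 94) if "!" <= c <= "~" else c
        for c in data
    ))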
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 3d61515..d927d70 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -31,6 +31,8 @@ modules = [
"erome",
"exhentai",
"fallenangels",
+ "fanbox",
+ "fantia",
"flickr",
"furaffinity",
"fuskator",
@@ -40,6 +42,7 @@ modules = [
"gfycat",
"hbrowse",
"hentai2read",
+ "hentaicosplays",
"hentaifoundry",
"hentaifox",
"hentaihand",
@@ -127,7 +130,6 @@ modules = [
"wikiart",
"xhamster",
"xvideos",
- "yuki",
"booru",
"moebooru",
"foolfuuka",
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 6914f24..f2ad0ab 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://www.artstation.com/"""
+"""Extractors for https://www.artstation.com/"""
from .common import Extractor, Message
from .. import text, util, exception
@@ -29,7 +29,6 @@ class ArtstationExtractor(Extractor):
def items(self):
data = self.metadata()
- yield Message.Version, 1
yield Message.Directory, data
for project in self.projects():
@@ -49,7 +48,20 @@ class ArtstationExtractor(Extractor):
if adict["has_image"]:
url = adict["image_url"]
text.nameext_from_url(url, asset)
- yield Message.Url, self._no_cache(url), asset
+
+ url = self._no_cache(url)
+ lhs, _, rhs = url.partition("/large/")
+ if rhs:
+ url = lhs + "/4k/" + rhs
+ asset["_fallback"] = self._image_fallback(lhs, rhs)
+
+ yield Message.Url, url, asset
+
+ @staticmethod
+ def _image_fallback(lhs, rhs):
+ yield lhs + "/large/" + rhs
+ yield lhs + "/medium/" + rhs
+ yield lhs + "/small/" + rhs
def metadata(self):
"""Return general metadata"""
@@ -135,8 +147,8 @@ class ArtstationUserExtractor(ArtstationExtractor):
r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$")
test = (
("https://www.artstation.com/gaerikim/", {
- "pattern": r"https://\w+\.artstation\.com/p/assets"
- r"/images/images/\d+/\d+/\d+/large/[^/]+",
+ "pattern": r"https://\w+\.artstation\.com/p/assets/images"
+ r"/images/\d+/\d+/\d+/(4k|large|medium|small)/[^/]+",
"count": ">= 6",
}),
("https://www.artstation.com/gaerikim/albums/all/"),
@@ -202,8 +214,8 @@ class ArtstationLikesExtractor(ArtstationExtractor):
r"/(?!artwork|projects|search)([^/?#]+)/likes/?")
test = (
("https://www.artstation.com/mikf/likes", {
- "pattern": r"https://\w+\.artstation\.com/p/assets"
- r"/images/images/\d+/\d+/\d+/large/[^/]+",
+ "pattern": r"https://\w+\.artstation\.com/p/assets/images"
+ r"/images/\d+/\d+/\d+/(4k|large|medium|small)/[^/]+",
"count": 6,
}),
# no likes
@@ -250,7 +262,6 @@ class ArtstationChallengeExtractor(ArtstationExtractor):
self.root)
challenge = self.request(challenge_url).json()
- yield Message.Version, 1
yield Message.Directory, {"challenge": challenge}
params = {"sorting": self.sorting}
@@ -344,10 +355,10 @@ class ArtstationImageExtractor(ArtstationExtractor):
test = (
("https://www.artstation.com/artwork/LQVJr", {
"pattern": r"https?://\w+\.artstation\.com/p/assets"
- r"/images/images/008/760/279/large/.+",
- "content": "1f645ce7634e44675ebde8f6b634d36db0617d3c",
+ r"/images/images/008/760/279/4k/.+",
+ "content": "7b113871465fdc09d127adfdc2767d51cf45a7e9",
# SHA1 hash without _no_cache()
- # "content": "2e8aaf6400aeff2345274f45e90b6ed3f2a0d946",
+ # "content": "44b80f9af36d40efc5a2668cdd11d36d6793bae9",
}),
# multiple images per project
("https://www.artstation.com/artwork/Db3dy", {
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
index 6e0003d..d6e3683 100644
--- a/gallery_dl/extractor/bcy.py
+++ b/gallery_dl/extractor/bcy.py
@@ -170,11 +170,16 @@ class BcyPostExtractor(BcyExtractor):
},
}),
# only watermarked images available
- ("https://bcy.net/item/detail/6780546160802143236", {
+ ("https://bcy.net/item/detail/6950136331708144648", {
"pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+"
r"~tplv-banciyuan-logo-v3:.+\.image",
- "count": 8,
+ "count": 10,
"keyword": {"filter": "watermark"}
+
+ }),
+ # deleted
+ ("https://bcy.net/item/detail/6780546160802143236", {
+ "count": 0,
}),
# only visible to logged in users
("https://bcy.net/item/detail/6747523535150783495", {
@@ -183,7 +188,10 @@ class BcyPostExtractor(BcyExtractor):
)
def posts(self):
- data = self._data_from_post(self.item_id)
+ try:
+ data = self._data_from_post(self.item_id)
+ except KeyError:
+ return ()
post = data["post_data"]
post["image_list"] = post["multi"]
post["plain"] = text.parse_unicode_escapes(post["plain"])
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index c3cf3f7..a42ec53 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -24,6 +24,7 @@ class BooruExtractor(BaseExtractor):
self.login()
data = self.metadata()
tags = self.config("tags", False)
+ notes = self.config("notes", False)
for post in self.posts():
try:
@@ -35,8 +36,11 @@ class BooruExtractor(BaseExtractor):
"(md5: %s)", post.get("id"), post.get("md5"))
continue
+ page_html = None
if tags:
- self._extended_tags(post)
+ page_html = self._extended_tags(post)
+ if notes:
+ self._notes(post, page_html)
self._prepare(post)
post.update(data)
text.nameext_from_url(url, post)
@@ -66,4 +70,13 @@ class BooruExtractor(BaseExtractor):
"""Prepare the 'post's metadata"""
def _extended_tags(self, post, page=None):
- """Generate extended tag information"""
+ """Generate extended tag information
+
+ The return value of this function will be
+ passed to the _notes function as the page parameter.
+ This makes it possible to reuse the same HTML both for
+ extracting tags and notes.
+ """
+
+ def _notes(self, post, page=None):
+ """Generate information about notes"""
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 33797f9..1f86ea5 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -32,6 +32,7 @@ class DanbooruExtractor(Extractor):
super().__init__(match)
self.root = "https://{}.donmai.us".format(match.group(1))
self.ugoira = self.config("ugoira", False)
+ self.extended_metadata = self.config("metadata", False)
username, api_key = self._get_auth_info()
if username:
@@ -64,6 +65,14 @@ class DanbooruExtractor(Extractor):
url = post["large_file_url"]
post["extension"] = "webm"
+ if self.extended_metadata:
+ template = (
+ "{}/posts/{}.json"
+ "?only=artist_commentary,children,notes,parent"
+ )
+ resp = self.request(template.format(self.root, post["id"]))
+ post.update(resp.json())
+
post.update(data)
yield Message.Directory, post
yield Message.Url, url, post
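
For reference, with the new danbooru "metadata" option enabled this issues one extra request per post; for a hypothetical post 12345 the template above expands to:

https://danbooru.donmai.us/posts/12345.json?only=artist_commentary,children,notes,parent

and the returned JSON fields are merged into the post's metadata.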
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 9d1701f..47f589a 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -259,9 +259,10 @@ class DeviantartExtractor(Extractor):
@staticmethod
def _find_folder(folders, name):
- pattern = re.compile(r"(?i)\W*" + name.replace("-", r"\W+") + r"\W*$")
+ match = re.compile(name.replace(
+ "-", r"[^a-z0-9]+") + "$", re.IGNORECASE).match
for folder in folders:
- if pattern.match(folder["name"]):
+ if match(folder["name"]):
return folder
raise exception.NotFoundError("folder")
@@ -472,6 +473,12 @@ class DeviantartFolderExtractor(DeviantartExtractor):
"count": ">= 4",
"options": (("original", False),),
}),
+ # name starts with '_', special characters (#1451)
+ (("https://www.deviantart.com/justatest235723"
+ "/gallery/69302698/-test-b-c-d-e-f-"), {
+ "count": 1,
+ "options": (("original", False),),
+ }),
("https://shimoda7.deviantart.com/gallery/722019/Miscellaneous"),
("https://yakuzafc.deviantart.com/gallery/37412168/Crafts"),
)
@@ -1230,7 +1237,7 @@ class DeviantartEclipseAPI():
params = {
"username" : user,
"offset" : offset,
- "limit" : "24",
+ "limit" : 24,
"scraps_folder": "true",
}
return self._pagination(endpoint, params)
@@ -1240,8 +1247,8 @@ class DeviantartEclipseAPI():
params = {
"username": user,
"moduleid": self._module_id_watching(user),
- "offset" : None,
- "limit" : "24",
+ "offset" : offset,
+ "limit" : 24,
}
return self._pagination(endpoint, params)
@@ -1260,14 +1267,23 @@ class DeviantartEclipseAPI():
except Exception:
return {"error": response.text}
- def _pagination(self, endpoint, params=None):
+ def _pagination(self, endpoint, params):
while True:
data = self._call(endpoint, params)
- yield from data["results"]
- if not data["hasMore"]:
+ results = data.get("results")
+ if results is None:
+ return
+ yield from results
+
+ if not data.get("hasMore"):
return
- params["offset"] = data["nextOffset"]
+
+ next_offset = data.get("nextOffset")
+ if next_offset:
+ params["offset"] = next_offset
+ else:
+ params["offset"] += params["limit"]
def _module_id_watching(self, user):
url = "{}/{}/about".format(self.extractor.root, user)
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 2e2e952..d4fd826 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -9,7 +9,7 @@
"""Extractors for https://www.erome.com/"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text, util, exception
from ..cache import cache
import itertools
import time
@@ -32,7 +32,13 @@ class EromeExtractor(Extractor):
def items(self):
for album_id in self.albums():
url = "{}/a/{}".format(self.root, album_id)
- page = self.request(url).text
+
+ try:
+ page = self.request(url).text
+ except exception.HttpError as exc:
+ self.log.warning(
+ "Unable to fetch album '%s' (%s)", album_id, exc)
+ continue
title, pos = text.extract(
page, 'property="og:title" content="', '"')
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 872a338..910da7d 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -45,6 +45,13 @@ class ExhentaiExtractor(Extractor):
Extractor.__init__(self, match)
self.original = self.config("original", True)
+ limits = self.config("limits", False)
+ if limits and limits.__class__ is int:
+ self.limits = limits
+ self._remaining = 0
+ else:
+ self.limits = False
+
self.session.headers["Referer"] = self.root + "/"
if version != "ex":
self.session.cookies.set("nw", "1", domain=self.cookiedomain)
@@ -69,6 +76,7 @@ class ExhentaiExtractor(Extractor):
self.log.info("no username given; using e-hentai.org")
self.root = "https://e-hentai.org"
self.original = False
+ self.limits = False
self.session.cookies["nw"] = "1"
@cache(maxage=90*24*3600, keyarg=1)
@@ -193,12 +201,24 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
self.count = text.parse_int(data["filecount"])
yield Message.Directory, data
+ def _validate_response(response):
+ # declared inside 'items()' to be able to access 'data'
+ if not response.history and \
+ response.headers.get("content-length") == "137":
+ self._report_limits(data)
+ return True
+
images = itertools.chain(
(self.image_from_page(ipage),), self.images_from_api())
for url, image in images:
data.update(image)
+ if self.limits:
+ self._check_limits(data)
if "/fullimg.php" in url:
data["extension"] = ""
+ data["_http_validate"] = _validate_response
+ else:
+ data["_http_validate"] = None
yield Message.Url, url, data
def get_metadata(self, page):
@@ -338,6 +358,26 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"Continue with '%s/s/%s/%s-%s' as URL after resetting it.",
self.root, data["image_token"], self.gallery_id, data["num"])
+ def _check_limits(self, data):
+ if not self._remaining or data["num"] % 25 == 0:
+ self._update_limits()
+ self._remaining -= data["cost"]
+ if self._remaining <= 0:
+ self._report_limits(data)
+
+ def _update_limits(self):
+ url = "https://e-hentai.org/home.php"
+ cookies = {
+ cookie.name: cookie.value
+ for cookie in self.session.cookies
+ if cookie.domain == self.cookiedomain and cookie.name != "igneous"
+ }
+
+ page = self.request(url, cookies=cookies).text
+ current = text.extract(page, "<strong>", "</strong>")[0]
+ self.log.debug("Image Limits: %s/%s", current, self.limits)
+ self._remaining = self.limits - text.parse_int(current)
+
def _gallery_page(self):
url = "{}/g/{}/{}/".format(
self.root, self.gallery_id, self.gallery_token)
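
A worked example of the limit accounting above, with illustrative numbers: given "limits": 5000 in the config and home.php reporting 4980 images used, extraction stops once the remaining budget is spent:

remaining = 5000 - 4980      # _update_limits(): 20 left
remaining -= 1               # _check_limits(): subtract each image's cost
if remaining <= 0:
    ...                      # _report_limits(): stop and warn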
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
new file mode 100644
index 0000000..06054b2
--- /dev/null
+++ b/gallery_dl/extractor/fanbox.py
@@ -0,0 +1,283 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.fanbox.cc/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+BASE_PATTERN = (
+ r"(?:https?://)?(?:"
+ r"(?!www\.)([\w-]+)\.fanbox\.cc|"
+ r"(?:www\.)?fanbox\.cc/@([\w-]+))"
+)
+
+
+class FanboxExtractor(Extractor):
+ """Base class for Fanbox extractors"""
+ category = "fanbox"
+ root = "https://www.fanbox.cc"
+ directory_fmt = ("{category}", "{creatorId}")
+ filename_fmt = "{id}_{num}.{extension}"
+ archive_fmt = "{id}_{num}"
+ _warning = True
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.embeds = self.config("embeds", True)
+
+ def items(self):
+ yield Message.Version, 1
+
+ if self._warning:
+ if "FANBOXSESSID" not in self.session.cookies:
+ self.log.warning("no 'FANBOXSESSID' cookie set")
+ FanboxExtractor._warning = False
+
+ for content_body, post in self.posts():
+ yield Message.Directory, post
+ yield from self._get_urls_from_post(content_body, post)
+
+ def posts(self):
+ """Return all relevant post objects"""
+
+ def _pagination(self, url):
+ headers = {"Origin": self.root}
+
+ while url:
+ url = text.ensure_http_scheme(url)
+ body = self.request(url, headers=headers).json()["body"]
+ for item in body["items"]:
+ yield self._process_post(item)
+
+ url = body["nextUrl"]
+
+ def _get_post_data_from_id(self, post_id):
+ """Fetch and process post data"""
+ headers = {"Origin": self.root}
+ url = "https://api.fanbox.cc/post.info?postId="+post_id
+ post = self.request(url, headers=headers).json()["body"]
+
+ return self._process_post(post)
+
+ def _process_post(self, post):
+ content_body = post.pop("body", None)
+ if content_body:
+ if "html" in content_body:
+ post["html"] = content_body["html"]
+ if post["type"] == "article":
+ post["articleBody"] = content_body.copy()
+
+ post["date"] = text.parse_datetime(post["publishedDatetime"])
+ post["text"] = content_body.get("text") if content_body else None
+ post["isCoverImage"] = False
+
+ return content_body, post
+
+ def _get_urls_from_post(self, content_body, post):
+ num = 0
+ cover_image = post.get("coverImageUrl")
+ if cover_image:
+ final_post = post.copy()
+ final_post["isCoverImage"] = True
+ final_post["fileUrl"] = cover_image
+ text.nameext_from_url(cover_image, final_post)
+ final_post["num"] = num
+ num += 1
+ yield Message.Url, cover_image, final_post
+
+ if not content_body:
+ return
+
+ if "html" in content_body:
+ html_urls = []
+
+ for href in text.extract_iter(content_body["html"], 'href="', '"'):
+ if "fanbox.pixiv.net/images/entry" in href:
+ html_urls.append(href)
+ elif "downloads.fanbox.cc" in href:
+ html_urls.append(href)
+ for src in text.extract_iter(content_body["html"],
+ 'data-src-original="', '"'):
+ html_urls.append(src)
+
+ for url in html_urls:
+ final_post = post.copy()
+ text.nameext_from_url(url, final_post)
+ final_post["fileUrl"] = url
+ final_post["num"] = num
+ num += 1
+ yield Message.Url, url, final_post
+
+ for group in ("images", "imageMap"):
+ if group in content_body:
+ for item in content_body[group]:
+ if group == "imageMap":
+ # imageMap is a dict with image objects as values
+ item = content_body[group][item]
+
+ final_post = post.copy()
+ final_post["fileUrl"] = item["originalUrl"]
+ text.nameext_from_url(item["originalUrl"], final_post)
+ if "extension" in item:
+ final_post["extension"] = item["extension"]
+ final_post["fileId"] = item.get("id")
+ final_post["width"] = item.get("width")
+ final_post["height"] = item.get("height")
+ final_post["num"] = num
+ num += 1
+ yield Message.Url, item["originalUrl"], final_post
+
+ for group in ("files", "fileMap"):
+ if group in content_body:
+ for item in content_body[group]:
+ if group == "fileMap":
+ # fileMap is a dict with file objects as values
+ item = content_body[group][item]
+
+ final_post = post.copy()
+ final_post["fileUrl"] = item["url"]
+ text.nameext_from_url(item["url"], final_post)
+ if "extension" in item:
+ final_post["extension"] = item["extension"]
+ if "name" in item:
+ final_post["filename"] = item["name"]
+ final_post["fileId"] = item.get("id")
+ final_post["num"] = num
+ num += 1
+ yield Message.Url, item["url"], final_post
+
+ if self.embeds:
+ embeds_found = []
+ if "video" in content_body:
+ embeds_found.append(content_body["video"])
+ embeds_found.extend(content_body.get("embedMap", {}).values())
+
+ for embed in embeds_found:
+ # embed_result is (message type, url, metadata dict)
+ embed_result = self._process_embed(post, embed)
+ if not embed_result:
+ continue
+ embed_result[2]["num"] = num
+ num += 1
+ yield embed_result
+
+ def _process_embed(self, post, embed):
+ final_post = post.copy()
+ provider = embed["serviceProvider"]
+ content_id = embed.get("videoId") or embed.get("contentId")
+ prefix = "ytdl:" if self.embeds == "ytdl" else ""
+ url = None
+ is_video = False
+
+ if provider == "soundcloud":
+ url = prefix+"https://soundcloud.com/"+content_id
+ is_video = True
+ elif provider == "youtube":
+ url = prefix+"https://youtube.com/watch?v="+content_id
+ is_video = True
+ elif provider == "vimeo":
+ url = prefix+"https://vimeo.com/"+content_id
+ is_video = True
+ elif provider == "fanbox":
+ # this is an old URL format that redirects
+ # to a proper Fanbox URL
+ url = "https://www.pixiv.net/fanbox/"+content_id
+ # resolve redirect
+ response = self.request(url, method="HEAD", allow_redirects=False)
+ url = response.headers["Location"]
+ final_post["_extractor"] = FanboxPostExtractor
+ elif provider == "twitter":
+ url = "https://twitter.com/_/status/"+content_id
+ elif provider == "google_forms":
+ templ = "https://docs.google.com/forms/d/e/{}/viewform?usp=sf_link"
+ url = templ.format(content_id)
+ else:
+ self.log.warning("service not recognized: {}".format(provider))
+
+ if url:
+ final_post["embed"] = embed
+ final_post["embedUrl"] = url
+ text.nameext_from_url(url, final_post)
+ msg_type = Message.Queue
+ if is_video and self.embeds == "ytdl":
+ msg_type = Message.Url
+ return msg_type, url, final_post
+
+
+class FanboxCreatorExtractor(FanboxExtractor):
+ """Extractor for a Fanbox creator's works"""
+ subcategory = "creator"
+ pattern = BASE_PATTERN + r"(?:/posts)?/?$"
+ test = (
+ ("https://xub.fanbox.cc", {
+ "range": "1-15",
+ "count": ">= 15",
+ "keyword": {
+ "creatorId" : "xub",
+ "tags" : list,
+ "title" : str,
+ },
+ }),
+ ("https://xub.fanbox.cc/posts"),
+ ("https://www.fanbox.cc/@xub/"),
+ ("https://www.fanbox.cc/@xub/posts"),
+ )
+
+ def __init__(self, match):
+ FanboxExtractor.__init__(self, match)
+ self.creator_id = match.group(1) or match.group(2)
+
+ def posts(self):
+ url = "https://api.fanbox.cc/post.listCreator?creatorId={}&limit=10"
+
+ return self._pagination(url.format(self.creator_id))
+
+
+class FanboxPostExtractor(FanboxExtractor):
+ """Extractor for media from a single Fanbox post"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/posts/(\d+)"
+ test = (
+ ("https://www.fanbox.cc/@xub/posts/1910054", {
+ "count": 3,
+ "keyword": {
+ "title": "えま★おうがすと",
+ "tags": list,
+ "hasAdultContent": True,
+ "isCoverImage": False
+ },
+ }),
+ # entry post type, image embedded in html of the post
+ ("https://nekoworks.fanbox.cc/posts/915", {
+ "count": 2,
+ "keyword": {
+ "title": "【SAYORI FAN CLUB】お届け内容",
+ "tags": list,
+ "html": str,
+ "hasAdultContent": True
+ },
+ }),
+ # article post type, imageMap, 2 twitter embeds, fanbox embed
+ ("https://steelwire.fanbox.cc/posts/285502", {
+ "options": (("embeds", True),),
+ "count": 10,
+ "keyword": {
+ "title": "イラスト+SS|義足の炭鉱少年が義足を見せてくれるだけ 【全体公開版】",
+ "tags": list,
+ "articleBody": dict,
+ "hasAdultContent": True
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ FanboxExtractor.__init__(self, match)
+ self.post_id = match.group(3)
+
+ def posts(self):
+ return (self._get_post_data_from_id(self.post_id),)
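
For reference, BASE_PATTERN captures the creator ID from either URL style; a quick sketch of how the groups line up for FanboxPostExtractor (pattern copied from fanbox.py above):

import re

BASE_PATTERN = (r"(?:https?://)?(?:"
                r"(?!www\.)([\w-]+)\.fanbox\.cc|"
                r"(?:www\.)?fanbox\.cc/@([\w-]+))")
post_pattern = re.compile(BASE_PATTERN + r"/posts/(\d+)")

m = post_pattern.match("https://xub.fanbox.cc/posts/1910054")
# m.groups() == ("xub", None, "1910054")
m = post_pattern.match("https://www.fanbox.cc/@xub/posts/1910054")
# m.groups() == (None, "xub", "1910054")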
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
new file mode 100644
index 0000000..16fed4e
--- /dev/null
+++ b/gallery_dl/extractor/fantia.py
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://fantia.jp/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class FantiaExtractor(Extractor):
+ """Base class for Fantia extractors"""
+ category = "fantia"
+ root = "https://fantia.jp"
+ directory_fmt = ("{category}", "{fanclub_id}")
+ filename_fmt = "{post_id}_{file_id}.{extension}"
+ archive_fmt = "{post_id}_{file_id}"
+ _warning = True
+
+ def items(self):
+ yield Message.Version, 1
+
+ if self._warning:
+ if "_session_id" not in self.session.cookies:
+ self.log.warning("no '_session_id' cookie set")
+ FantiaExtractor._warning = False
+
+ for post_id in self.posts():
+ full_response, post = self._get_post_data(post_id)
+ yield Message.Directory, post
+ for url, url_data in self._get_urls_from_post(full_response, post):
+ fname = url_data["content_filename"] or url
+ text.nameext_from_url(fname, url_data)
+ url_data["file_url"] = url
+ yield Message.Url, url, url_data
+
+ def posts(self):
+ """Return post IDs"""
+
+ def _pagination(self, url):
+ params = {"page": 1}
+ headers = {"Referer": self.root}
+
+ while True:
+ page = self.request(url, params=params, headers=headers).text
+
+ post_id = None
+ for post_id in text.extract_iter(
+ page, 'class="link-block" href="/posts/', '"'):
+ yield post_id
+
+ if not post_id:
+ return
+ params["page"] += 1
+
+ def _get_post_data(self, post_id):
+ """Fetch and process post data"""
+ headers = {"Referer": self.root}
+ url = self.root+"/api/v1/posts/"+post_id
+ resp = self.request(url, headers=headers).json()["post"]
+ post = {
+ "post_id": resp["id"],
+ "post_url": self.root + "/posts/" + str(resp["id"]),
+ "post_title": resp["title"],
+ "comment": resp["comment"],
+ "rating": resp["rating"],
+ "posted_at": resp["posted_at"],
+ "fanclub_id": resp["fanclub"]["id"],
+ "fanclub_user_id": resp["fanclub"]["user"]["id"],
+ "fanclub_user_name": resp["fanclub"]["user"]["name"],
+ "fanclub_name": resp["fanclub"]["name"],
+ "fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]),
+ "tags": resp["tags"]
+ }
+ return resp, post
+
+ def _get_urls_from_post(self, resp, post):
+ """Extract individual URL data from the response"""
+ if "thumb" in resp and resp["thumb"] and "original" in resp["thumb"]:
+ post["content_filename"] = ""
+ post["content_category"] = "thumb"
+ post["file_id"] = "thumb"
+ yield resp["thumb"]["original"], post
+
+ for content in resp["post_contents"]:
+ post["content_category"] = content["category"]
+ post["content_title"] = content["title"]
+ post["content_filename"] = content.get("filename", "")
+ post["content_id"] = content["id"]
+ if "post_content_photos" in content:
+ for photo in content["post_content_photos"]:
+ post["file_id"] = photo["id"]
+ yield photo["url"]["original"], post
+ if "download_uri" in content:
+ post["file_id"] = content["id"]
+ yield self.root+"/"+content["download_uri"], post
+
+
+class FantiaCreatorExtractor(FantiaExtractor):
+ """Extractor for a Fantia creator's works"""
+ subcategory = "creator"
+ pattern = r"(?:https?://)?(?:www\.)?fantia\.jp/fanclubs/(\d+)"
+ test = (
+ ("https://fantia.jp/fanclubs/6939", {
+ "range": "1-25",
+ "count": ">= 25",
+ "keyword": {
+ "fanclub_user_id" : 52152,
+ "tags" : list,
+ "title" : str,
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ FantiaExtractor.__init__(self, match)
+ self.creator_id = match.group(1)
+
+ def posts(self):
+ url = "{}/fanclubs/{}/posts".format(self.root, self.creator_id)
+ return self._pagination(url)
+
+
+class FantiaPostExtractor(FantiaExtractor):
+ """Extractor for media from a single Fantia post"""
+ subcategory = "post"
+ pattern = r"(?:https?://)?(?:www\.)?fantia\.jp/posts/(\d+)"
+ test = (
+ ("https://fantia.jp/posts/508363", {
+ "count": 6,
+ "keyword": {
+ "post_title": "zunda逆バニーでおしりコッショリ",
+ "tags": list,
+ "rating": "adult",
+ "post_id": 508363
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ FantiaExtractor.__init__(self, match)
+ self.post_id = match.group(1)
+
+ def posts(self):
+ return (self.post_id,)
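
A minimal sketch of the listing pagination used by FantiaExtractor._pagination above: request successive ?page=N listings and stop at the first page that yields no 'link-block' post links (url, fetch_page, and handle are hypothetical stand-ins):

from gallery_dl import text  # assumes gallery-dl is installed

params = {"page": 1}
while True:
    page = fetch_page(url, params)
    post_id = None
    for post_id in text.extract_iter(
            page, 'class="link-block" href="/posts/', '"'):
        handle(post_id)
    if not post_id:          # no post links -> past the last page
        break
    params["page"] += 1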
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 0042676..863cead 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -91,10 +91,43 @@ class GelbooruPostExtractor(GelbooruBase,
"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
"count": 1,
}),
+ ("https://gelbooru.com/index.php?page=post&s=view&id=6018318", {
+ "options": (("tags", True),),
+ "content": "977caf22f27c72a5d07ea4d4d9719acdab810991",
+ "keyword": {
+ "tags_artist": "kirisaki_shuusei",
+ "tags_character": str,
+ "tags_copyright": "vocaloid",
+ "tags_general": str,
+ "tags_metadata": str,
+ },
+ }),
# video
("https://gelbooru.com/index.php?page=post&s=view&id=5938076", {
"content": "6360452fa8c2f0c1137749e81471238564df832a",
"pattern": r"https://img\d\.gelbooru\.com/images"
r"/22/61/226111273615049235b001b381707bd0\.webm",
}),
+ # notes
+ ("https://gelbooru.com/index.php?page=post&s=view&id=5997331", {
+ "options": (("notes", True),),
+ "keywords": {
+ "notes": [
+ {
+ "height": 553,
+ "body": "Look over this way when you talk~",
+ "width": 246,
+ "x": 35,
+ "y": 72
+ },
+ {
+ "height": 557,
+ "body": "Hey~\nAre you listening~?",
+ "width": 246,
+ "x": 1233,
+ "y": 109
+ }
+ ]
+ }
+ }),
)
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 51fb478..1b877b3 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -47,6 +47,8 @@ class GelbooruV02Extractor(booru.BooruExtractor):
self.root, post["id"])
page = self.request(url).text
html = text.extract(page, '<ul id="tag-', '</ul>')[0]
+ if not html:
+ html = text.extract(page, '<ul class="tag-', '</ul>')[0]
if html:
tags = collections.defaultdict(list)
pattern = re.compile(
@@ -55,6 +57,31 @@ class GelbooruV02Extractor(booru.BooruExtractor):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
post["tags_" + key] = " ".join(value)
+ return page
+
+ def _notes(self, post, page=None):
+ if not page:
+ url = "{}/index.php?page=post&s=view&id={}".format(
+ self.root, post["id"])
+ page = self.request(url).text
+ notes = []
+ notes_data = text.extract(page, '<section id="notes"', '</section>')[0]
+ if not notes_data:
+ return
+
+ note_iter = text.extract_iter(notes_data, '<article', '</article>')
+ extr = text.extract
+ for note_data in note_iter:
+ note = {
+ "width": int(extr(note_data, 'data-width="', '"')[0]),
+ "height": int(extr(note_data, 'data-height="', '"')[0]),
+ "x": int(extr(note_data, 'data-x="', '"')[0]),
+ "y": int(extr(note_data, 'data-y="', '"')[0]),
+ "body": extr(note_data, 'data-body="', '"')[0],
+ }
+ notes.append(note)
+
+ post["notes"] = notes
BASE_PATTERN = GelbooruV02Extractor.update({
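
The note markup this parser expects looks roughly like the snippet below (attribute values taken from the test case added to gelbooru.py); each data-* attribute is pulled out with text.extract():

from gallery_dl import text  # assumes gallery-dl is installed

note_html = (
    '<article data-width="246" data-height="553" data-x="35" data-y="72"'
    ' data-body="Look over this way when you talk~"></article>'
)
print(text.extract(note_html, 'data-width="', '"')[0])   # "246"
print(text.extract(note_html, 'data-body="', '"')[0])    # the note text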
diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py
new file mode 100644
index 0000000..7dd047c
--- /dev/null
+++ b/gallery_dl/extractor/hentaicosplays.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentai-cosplays.com/
+(also works for hentai-img.com and porn-images-xxx.com)"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class HentaicosplaysGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from
+ hentai-cosplays.com, hentai-img.com, and porn-images-xxx.com"""
+ category = "hentaicosplays"
+ directory_fmt = ("{site}", "{title}")
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{title}_{filename}"
+ pattern = r"((?:https?://)?(?:\w{2}\.)?" \
+ r"(hentai-cosplays|hentai-img|porn-images-xxx)\.com)/" \
+ r"(?:image|story)/([\w-]+)"
+ test = (
+ ("https://hentai-cosplays.com/image/---devilism--tide-kurihara-/", {
+ "pattern": r"https://static\d?.hentai-cosplays.com/upload/"
+ r"\d+/\d+/\d+/\d+.jpg$",
+ "keyword": {
+ "count": 18,
+ "site": "hentai-cosplays",
+ "slug": "---devilism--tide-kurihara-",
+ "title": "艦 こ れ-devilism の tide Kurihara 憂",
+ },
+ }),
+ ("https://fr.porn-images-xxx.com/image/enako-enako-24/", {
+ "pattern": r"https://static\d?.porn-images-xxx.com/upload/"
+ r"\d+/\d+/\d+/\d+.jpg$",
+ "keyword": {
+ "count": 11,
+ "site": "porn-images-xxx",
+ "title": str,
+ },
+ }),
+ ("https://ja.hentai-img.com/image/hollow-cora-502/", {
+ "pattern": r"https://static\d?.hentai-img.com/upload/"
+ r"\d+/\d+/\d+/\d+.jpg$",
+ "keyword": {
+ "count": 2,
+ "site": "hentai-img",
+ "title": str,
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ root, self.site, self.slug = match.groups()
+ self.root = text.ensure_http_scheme(root)
+ url = "{}/story/{}/".format(self.root, self.slug)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ title = text.extract(page, "<title>", "</title>")[0]
+ return {
+ "title": text.unescape(title.rpartition(" Story Viewer - ")[0]),
+ "slug" : self.slug,
+ "site" : self.site,
+ }
+
+ def images(self, page):
+ return [
+ (url, None)
+ for url in text.extract_iter(
+ page, '<amp-img class="auto-style" src="', '"')
+ ]
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 74c6197..a027be1 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -161,13 +161,18 @@ class InstagramExtractor(Extractor):
}
def _parse_post_graphql(self, post):
+ typename = post["__typename"]
if post.get("is_video") and "video_url" not in post:
url = "{}/tv/{}/".format(self.root, post["shortcode"])
post = self._extract_post_page(url)
+ elif typename == "GraphSidecar" and \
+ "edge_sidecar_to_children" not in post:
+ url = "{}/p/{}/".format(self.root, post["shortcode"])
+ post = self._extract_post_page(url)
owner = post["owner"]
data = {
- "typename" : post["__typename"],
+ "typename" : typename,
"date" : text.parse_timestamp(post["taken_at_timestamp"]),
"likes" : post["edge_media_preview_like"]["count"],
"owner_id" : owner["id"],
@@ -328,7 +333,7 @@ class InstagramExtractor(Extractor):
def _get_edge_data(self, user, key):
cursor = self.config("cursor")
- if cursor:
+ if cursor or not key:
return {
"edges" : (),
"page_info": {
@@ -386,6 +391,7 @@ class InstagramUserExtractor(InstagramExtractor):
(InstagramPostsExtractor , base + "posts/"),
(InstagramReelsExtractor , base + "reels/"),
(InstagramChannelExtractor , base + "channel/"),
+ (InstagramTaggedExtractor , base + "tagged/"),
), ("posts",))
@@ -402,12 +408,31 @@ class InstagramPostsExtractor(InstagramExtractor):
url = "{}/{}/".format(self.root, self.item)
user = self._extract_profile_page(url)
- query_hash = "003056d32c2554def87228bc3fd9668a"
+ query_hash = "42d2750e44dbac713ff30130659cd891"
variables = {"id": user["id"], "first": 50}
edge = self._get_edge_data(user, "edge_owner_to_timeline_media")
return self._pagination_graphql(query_hash, variables, edge)
+class InstagramTaggedExtractor(InstagramExtractor):
+ """Extractor for ProfilePage tagged posts"""
+ subcategory = "tagged"
+ pattern = USER_PATTERN + r"/tagged"
+ test = ("https://www.instagram.com/instagram/tagged/", {
+ "range": "1-16",
+ "count": ">= 16",
+ })
+
+ def posts(self):
+ url = "{}/{}/".format(self.root, self.item)
+ user = self._extract_profile_page(url)
+
+ query_hash = "31fe64d9463cbbe58319dced405c6206"
+ variables = {"id": user["id"], "first": 50}
+ edge = self._get_edge_data(user, None)
+ return self._pagination_graphql(query_hash, variables, edge)
+
+
class InstagramChannelExtractor(InstagramExtractor):
"""Extractor for ProfilePage channel"""
subcategory = "channel"
@@ -588,7 +613,7 @@ class InstagramPostExtractor(InstagramExtractor):
)
def posts(self):
- query_hash = "2c4c2e343a8f64c625ba02b2aa12c7f8"
+ query_hash = "cf28bf5eb45d62d4dc8e77cdb99d750d"
variables = {
"shortcode" : self.item,
"child_comment_count" : 3,
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index 6e5aec9..21ed3c7 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -60,7 +60,7 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
)
def metadata(self, page):
- info = text.extract(page, "<title>", " &ndash; Komikcast<")[0]
+ info = text.extract(page, "<title>", " – Komikcast<")[0]
return self.parse_chapter_string(info)
@staticmethod
@@ -100,7 +100,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
@staticmethod
def metadata(page):
"""Return a dict with general metadata"""
- manga , pos = text.extract(page, "<title>" , " &ndash; Komikcast<")
+ manga , pos = text.extract(page, "<title>" , " – Komikcast<")
genres, pos = text.extract(
page, 'class="komik_info-content-genre">', "</span>", pos)
author, pos = text.extract(page, ">Author:", "</span>", pos)
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index 143d00d..852c49f 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2020 Mike Fährmann
+# Copyright 2016-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -101,9 +101,6 @@ class LusciousAlbumExtractor(LusciousExtractor):
"number_of_favorites": int,
},
}),
- ("https://luscious.net/albums/virgin-killer-sweater_282582/", {
- "url": "0be0cc279be1de99f727764819e03435e2a79915",
- }),
("https://luscious.net/albums/not-found_277035/", {
"exception": exception.NotFoundError,
}),
diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
index 882031b..f8e1473 100644
--- a/gallery_dl/extractor/manganelo.py
+++ b/gallery_dl/extractor/manganelo.py
@@ -92,7 +92,7 @@ class ManganeloMangaExtractor(ManganeloBase, MangaExtractor):
r"(/(?:manga/|read_)\w+)")
test = (
("https://manganelo.com/manga/ol921234", {
- "url": "8a1810edddbafcde993ecb3558a35c99d8d4f13e",
+ "url": "6ba7f083a6944e414ad8214b74a0a40cb60d4562",
}),
("https://manganelo.com/manga/read_otome_no_teikoku", {
"pattern": ManganeloChapterExtractor.pattern,
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
index 0a6fba4..558e682 100644
--- a/gallery_dl/extractor/mangapark.py
+++ b/gallery_dl/extractor/mangapark.py
@@ -18,6 +18,7 @@ class MangaparkBase():
"""Base class for mangapark extractors"""
category = "mangapark"
root_fmt = "https://mangapark.{}"
+ browser = "firefox"
@staticmethod
def parse_chapter_path(path, data):
@@ -65,7 +66,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
(("https://mangapark.net/manga"
"/gekkan-shoujo-nozaki-kun/i2067426/v7/c70/1"), {
"count": 15,
- "keyword": "edc14993c4752cee3a76e09b2f024d40d854bfd1",
+ "keyword": "8f18f1c977ebe049ef35e3a877eaaab97fb25274",
}),
("https://mangapark.me/manga/gosu/i811615/c55/1"),
("https://mangapark.com/manga/gosu/i811615/c55/1"),
@@ -120,8 +121,8 @@ class MangaparkMangaExtractor(MangaparkBase, MangaExtractor):
r"(/manga/[^/?#]+)/?$")
test = (
("https://mangapark.net/manga/aria", {
- "url": "9b62883c25c8de471f8ab43651e1448536c4ce3f",
- "keyword": "eb4a9b273c69acf31efa731eba713e1cfa14bab6",
+ "url": "f07caf0bc5097c9b32c8c0d6f446bce1bf4bd329",
+ "keyword": "2c0d28efaf84fcfe62932b6931ef3c3987cd48c0",
}),
("https://mangapark.me/manga/aria"),
("https://mangapark.com/manga/aria"),
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
index abb937f..5c202f3 100644
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -51,9 +51,11 @@ class MyportfolioGalleryExtractor(Extractor):
self.prefix = "myportfolio:" if domain1 else ""
def items(self):
- yield Message.Version, 1
url = "https://" + self.domain + (self.path or "")
- page = self.request(url).text
+ response = self.request(url)
+ if response.history and response.url.endswith(".adobe.com/missing"):
+ raise exception.NotFoundError()
+ page = response.text
projects = text.extract(
page, '<section class="project-covers', '</section>')[0]
@@ -78,12 +80,12 @@ class MyportfolioGalleryExtractor(Extractor):
# <user> and <title> can contain a "-" as well, so we get the title
# from somewhere else and cut that amount from the og:title content
- user, pos = text.extract(
- page, 'property=og:title content="', '"')
- desc, pos = text.extract(
- page, 'property=og:description content="', '"', pos)
- title, pos = text.extract(
- page, '<h1 ', '</h1>', pos)
+ extr = text.extract_from(page)
+ user = extr('property="og:title" content="', '"') or \
+ extr('property=og:title content="', '"')
+ descr = extr('property="og:description" content="', '"') or \
+ extr('property=og:description content="', '"')
+ title = extr('<h1 ', '</h1>')
if title:
title = title.partition(">")[2]
@@ -96,7 +98,7 @@ class MyportfolioGalleryExtractor(Extractor):
return {
"user": text.unescape(user),
"title": text.unescape(title),
- "description": text.unescape(desc or ""),
+ "description": text.unescape(descr),
}
@staticmethod
diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py
index db15572..1da3e49 100644
--- a/gallery_dl/extractor/naverwebtoon.py
+++ b/gallery_dl/extractor/naverwebtoon.py
@@ -8,27 +8,24 @@
"""Extractors for https://comic.naver.com/"""
-from .common import Extractor, Message
-from .. import exception, text
+from .common import GalleryExtractor, Extractor, Message
+from .. import text
BASE_PATTERN = r"(?:https?://)?comic\.naver\.com/webtoon"
-class NaverwebtoonExtractor(Extractor):
+class NaverwebtoonBase():
+ """Base class for naver webtoon extractors"""
category = "naverwebtoon"
root = "https://comic.naver.com"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.query = match.group(1)
-
-class NaverwebtoonEpisodeExtractor(NaverwebtoonExtractor):
+class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
subcategory = "episode"
directory_fmt = ("{category}", "{comic}")
filename_fmt = "{episode:>03}-{num:>02}.{extension}"
archive_fmt = "{title_id}_{episode}_{num}"
- pattern = (BASE_PATTERN + r"/detail\.nhn\?([^#]+)")
+ pattern = BASE_PATTERN + r"/detail\.nhn\?([^#]+)"
test = (
(("https://comic.naver.com/webtoon/detail.nhn?"
"titleId=26458&no=1&weekday=tue"), {
@@ -39,52 +36,38 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonExtractor):
)
def __init__(self, match):
- NaverwebtoonExtractor.__init__(self, match)
- query = text.parse_query(self.query)
+ query = match.group(1)
+ url = "{}/webtoon/detail.nhn?{}".format(self.root, query)
+ GalleryExtractor.__init__(self, match, url)
+
+ query = text.parse_query(query)
self.title_id = query.get("titleId")
- if not self.title_id:
- raise exception.NotFoundError("titleId")
self.episode = query.get("no")
- if not self.episode:
- raise exception.NotFoundError("no")
-
- def items(self):
- url = "{}/webtoon/detail.nhn?{}".format(self.root, self.query)
- page = self.request(url).text
- data = self.get_job_metadata(page)
-
- yield Message.Directory, data
- for data["num"], url in enumerate(self.get_image_urls(page), 1):
- yield Message.Url, url, text.nameext_from_url(url, data)
-
- def get_job_metadata(self, page):
- """Collect metadata for extractor-job"""
- title, pos = text.extract(page, 'property="og:title" content="', '"')
- comic, pos = text.extract(page, '<h2>', '<span', pos)
- authors, pos = text.extract(page, 'class="wrt_nm">', '</span>', pos)
- authors = authors.strip().split("/")
- descr, pos = text.extract(page, '<p class="txt">', '</p>', pos)
- genre, pos = text.extract(page, '<span class="genre">', '</span>', pos)
- date, pos = text.extract(page, '<dd class="date">', '</dd>', pos)
+ def metadata(self, page):
+ extr = text.extract_from(page)
return {
- "title": title,
- "comic": comic,
- "authors": authors,
- "description": descr,
- "genre": genre,
"title_id": self.title_id,
- "episode": self.episode,
- "date": date,
+ "episode" : self.episode,
+ "title" : extr('property="og:title" content="', '"'),
+ "comic" : extr('<h2>', '<span'),
+ "authors" : extr('class="wrt_nm">', '</span>').strip().split("/"),
+ "description": extr('<p class="txt">', '</p>'),
+ "genre" : extr('<span class="genre">', '</span>'),
+ "date" : extr('<dd class="date">', '</dd>'),
}
@staticmethod
- def get_image_urls(page):
+ def images(page):
view_area = text.extract(page, 'id="comic_view_area"', '</div>')[0]
- return text.extract_iter(view_area, '<img src="', '"')
+ return [
+ (url, None)
+ for url in text.extract_iter(view_area, '<img src="', '"')
+ if "/static/" not in url
+ ]
-class NaverwebtoonComicExtractor(NaverwebtoonExtractor):
+class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor):
subcategory = "comic"
categorytransfer = True
pattern = (BASE_PATTERN + r"/list\.nhn\?([^#]+)")
@@ -96,12 +79,10 @@ class NaverwebtoonComicExtractor(NaverwebtoonExtractor):
)
def __init__(self, match):
- NaverwebtoonExtractor.__init__(self, match)
- query = text.parse_query(self.query)
+ Extractor.__init__(self, match)
+ query = text.parse_query(match.group(1))
self.title_id = query.get("titleId")
- if not self.title_id:
- raise exception.NotFoundError("titleId")
- self.page_no = text.parse_int(query.get("page", 1))
+ self.page_no = text.parse_int(query.get("page"), 1)
def items(self):
url = self.root + "/webtoon/list.nhn"
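
Both naverwebtoon extractors above now parse the query string once in __init__: text.parse_query() turns it into a dict, and text.parse_int(value, default) substitutes the default instead of raising on a missing or malformed value. A standalone sketch of those two helpers, simplified approximations of the ones in gallery_dl.text:

    from urllib.parse import parse_qsl

    def parse_query(qs):
        """Parse a query string into a dict, keeping the first value per key."""
        result = {}
        for key, value in parse_qsl(qs or ""):
            result.setdefault(key, value)
        return result

    def parse_int(value, default=0):
        """Convert to int; fall back to 'default' on None or invalid input."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    query = parse_query("titleId=26458&no=1&weekday=tue")
    assert query.get("titleId") == "26458"
    assert parse_int(query.get("page"), 1) == 1  # "page" is absent -> default
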
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index f3c5ac2..3cfcb0e 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -107,11 +107,11 @@ class PhilomenaPostExtractor(PhilomenaExtractor):
"source_url": "https://www.deviantart.com/speccysy/art"
"/Afternoon-Flight-215193985",
"spoilered": False,
- "tag_count": 36,
+ "tag_count": 37,
"tag_ids": list,
"tags": list,
"thumbnails_generated": True,
- "updated_at": "2020-05-28T13:14:07Z",
+ "updated_at": "2021-04-07T06:01:30Z",
"uploader": "Clover the Clever",
"uploader_id": 211188,
"upvotes": int,
diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py
index 38f94e0..45ce7f8 100644
--- a/gallery_dl/extractor/piczel.py
+++ b/gallery_dl/extractor/piczel.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2020 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -117,7 +117,6 @@ class PiczelImageExtractor(PiczelExtractor):
"description": None,
"extension": "png",
"favorites_count": int,
- "folder": dict,
"folder_id": 1113,
"id": 7807,
"is_flash": False,
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index ebbce67..8bfae06 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -46,6 +46,10 @@ class PixivExtractor(Extractor):
del work["image_urls"]
del work["meta_pages"]
work["num"] = 0
+ if self.translated_tags:
+ work["untranslated_tags"] = [
+ tag["name"] for tag in work["tags"]
+ ]
work["tags"] = [tag[tkey] or tag["name"] for tag in work["tags"]]
work["date"] = text.parse_datetime(work["create_date"])
work["rating"] = ratings.get(work["x_restrict"])
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
index 26a5cd9..49c24bc 100644
--- a/gallery_dl/extractor/pururin.py
+++ b/gallery_dl/extractor/pururin.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -29,10 +29,10 @@ class PururinGalleryExtractor(GalleryExtractor):
"artist" : ["Shoda Norihiro"],
"group" : ["Obsidian Order"],
"parody" : ["Kantai Collection"],
- "characters": ["Admiral", "Iowa"],
+ "characters": ["Iowa", "Teitoku"],
"tags" : list,
"type" : "Doujinshi",
- "collection": "",
+ "collection": "I owant you!",
"convention": "C92",
"rating" : float,
"uploader" : "demo",
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index ea5bb6d..5579017 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -147,9 +147,9 @@ class SankakuPostExtractor(SankakuExtractor):
"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
"options": (("tags", True),),
"keyword": {
- "tags_artist": ["bonocho"],
- "tags_studio": ["dc_comics"],
- "tags_medium": ["sketch", "copyright_name"],
+ "tags_artist" : ["bonocho"],
+ "tags_studio" : ["dc_comics"],
+ "tags_medium" : list,
"tags_copyright": list,
"tags_character": list,
"tags_general" : list,
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index ba1ab08..1bc353a 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -58,6 +58,9 @@ BASE_PATTERN = ShopifyExtractor.update({
"root": "https://www.fashionnova.com",
"pattern": r"(?:www\.)?fashionnova\.com",
},
+ "omgmiamiswimwear": {
+ "root": "https://www.omgmiamiswimwear.com"
+ },
})
@@ -74,6 +77,7 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
}),
("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
("https://www.fashionnova.com/collections/mini-dresses#1"),
+ ("https://www.omgmiamiswimwear.com/collections/fajas"),
)
def metadata(self):
@@ -120,6 +124,10 @@ class ShopifyProductExtractor(ShopifyExtractor):
"pattern": r"https?://cdn\d*\.shopify.com/",
"count": 3,
}),
+ ("https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", {
+ "pattern": r"https://cdn\.shopify\.com/s/files/1/1819/6171/",
+ "count": 5,
+ }),
("https://www.fashionnova.com/collections/flats/products/name"),
)
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
index 7301cbc..e1b14ef 100644
--- a/gallery_dl/extractor/simplyhentai.py
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -16,9 +16,9 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
"""Extractor for image galleries from simply-hentai.com"""
category = "simplyhentai"
archive_fmt = "{image_id}"
- pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com"
+ pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.)?simply-hentai\.com"
r"(?!/(?:album|gifs?|images?|series)(?:/|$))"
- r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?#]+)+)")
+ r"((?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?#]+)+)")
test = (
(("https://original-work.simply-hentai.com"
"/amazon-no-hiyaku-amazon-elixir"), {
@@ -35,7 +35,10 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
)
def __init__(self, match):
- url = "https://" + match.group(1)
+ subdomain, path = match.groups()
+ if subdomain and subdomain not in ("www.", "old."):
+ path = "/" + subdomain.rstrip(".") + path
+ url = "https://old.simply-hentai.com" + path
GalleryExtractor.__init__(self, match, url)
self.session.headers["Referer"] = url
@@ -43,7 +46,6 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
extr = text.extract_from(page)
split = text.split_html
- self.gallery_url = extr('<link rel="canonical" href="', '"')
title = extr('<meta property="og:title" content="', '"')
image = extr('<meta property="og:image" content="', '"')
if not title:
@@ -99,7 +101,7 @@ class SimplyhentaiImageExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.page_url = "https://www." + match.group(1)
+ self.page_url = "https://old." + match.group(1)
self.type = match.group(2)
def items(self):
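
Both simplyhentai extractors above now route through the old.simply-hentai.com frontend; any meaningful subdomain (anything other than "www." or "old.") is folded into the path as its first segment. A standalone sketch of that rewrite, mirroring the __init__ logic in the hunk:

    def canonicalize(subdomain, path):
        """Map '<sub>.simply-hentai.com<path>' onto the 'old.' frontend."""
        if subdomain and subdomain not in ("www.", "old."):
            # fold the gallery subdomain into the path
            path = "/" + subdomain.rstrip(".") + path
        return "https://old.simply-hentai.com" + path

    assert canonicalize("original-work.", "/amazon-no-hiyaku-amazon-elixir") \
        == "https://old.simply-hentai.com/original-work/amazon-no-hiyaku-amazon-elixir"
    assert canonicalize("www.", "/some-gallery") \
        == "https://old.simply-hentai.com/some-gallery"
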
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index 0b970cc..15dbb85 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://www.slideshare.net/"""
+"""Extractors for https://www.slideshare.net/"""
from .common import Extractor, Message
from .. import text
@@ -58,15 +58,16 @@ class SlidesharePresentationExtractor(Extractor):
"""Collect metadata for extractor-job"""
descr, pos = text.extract(
page, '<meta name="description" content="', '"')
- title, pos = text.extract(
- page, '<span class="j-title-breadcrumb">', '</span>', pos)
+ category, pos = text.extract(
+ page, '<div class="metadata-item">', '</div>', pos)
views, pos = text.extract(
- page, '<span class="notranslate">', 'views<', pos)
+ page, '<div class="metadata-item">', '</div>', pos)
published, pos = text.extract(
- page, '<time datetime="', '"', pos)
+ page, '<div class="metadata-item">', '</div>', pos)
+ title, pos = text.extract(
+ page, '<span class="j-title-breadcrumb">', '</span>', pos)
alt_descr, pos = text.extract(
- page, 'id="slideshow-description-paragraph" class="notranslate">',
- '</p>', pos)
+ page, '<p class="slideshow-description notranslate">', '</p>', pos)
if descr.endswith("…") and alt_descr:
descr = text.remove_html(alt_descr).strip()
@@ -76,8 +77,9 @@ class SlidesharePresentationExtractor(Extractor):
"presentation": self.presentation,
"title": text.unescape(title.strip()),
"description": text.unescape(descr),
- "views": text.parse_int(views.replace(",", "")),
- "published": published,
+ "views": text.parse_int(views.rpartition(
+ " views")[0].replace(",", "")),
+ "published": published.strip(),
}
@staticmethod
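
Two details in the slideshare hunk above: the reworked markup exposes category, view count, and publication date as three consecutive '<div class="metadata-item">' blocks, so they are read positionally with the same needle, and the view count now arrives as text like "1,234 views" rather than a bare number. A standalone sketch of the count parsing:

    def parse_views(views):
        """Turn a string like '1,234 views' into the integer 1234."""
        digits = views.rpartition(" views")[0].replace(",", "")
        try:
            return int(digits)
        except ValueError:
            return 0  # mirror text.parse_int()'s default on bad input

    assert parse_views("1,234 views") == 1234
    assert parse_views("nonsense") == 0
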
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index cfbd5eb..02cf832 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2020 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -71,7 +71,7 @@ class SmugmugAlbumExtractor(SmugmugExtractor):
pattern = r"smugmug:album:([^:]+)$"
test = (
("smugmug:album:cr4C7f", {
- "url": "1436ee98d5797b308ecce5862e4885944f59c03c",
+ "url": "2c2e576e47d4e9ce60b44871f08a8c66b5ebaceb",
}),
# empty
("smugmug:album:Fb7hMs", {
@@ -111,8 +111,8 @@ class SmugmugImageExtractor(SmugmugExtractor):
pattern = BASE_PATTERN + r"(?:/[^/?#]+)+/i-([^/?#-]+)"
test = (
("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", {
- "url": "f624ad7293afd6412a7d34e3950a118596c36c85",
- "keyword": "d69c69c1517b8ea77bc763cffc4d0a4002dfee3f",
+ "url": "e6408fd2c64e721fd146130dceb56a971ceb4259",
+ "keyword": "05c8d50aa6ea08d458f83c38d7f9e92148362f0e",
"content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0",
}),
# video
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index 886353f..d13ce0f 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -69,7 +69,7 @@ class UnsplashImageExtractor(UnsplashExtractor):
subcategory = "image"
pattern = BASE_PATTERN + r"/photos/([^/?#]+)"
test = ("https://unsplash.com/photos/lsoogGC_5dg", {
- "url": "ac9d194f58b3fc9aacdfc9784c1b69868f212b6e",
+ "url": "b99a5829ca955b768a206aa9afc391bd3f3dd55e",
"keyword": {
"alt_description": "re:silhouette of trees near body of water ",
"blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz",
@@ -190,7 +190,7 @@ class UnsplashSearchExtractor(UnsplashExtractor):
subcategory = "search"
pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^/?#]+))?"
test = ("https://unsplash.com/s/photos/nature", {
- "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+"
+ "pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
"range": "1-30",
"count": 30,
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 1a26264..cebb421 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -6,36 +6,38 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://www.webtoons.com/"""
+"""Extractors for https://www.webtoons.com/"""
-from .common import Extractor, Message
+from .common import GalleryExtractor, Extractor, Message
from .. import exception, text, util
BASE_PATTERN = r"(?:https?://)?(?:www\.)?webtoons\.com/((en|fr)"
-class WebtoonsExtractor(Extractor):
+class WebtoonsBase():
category = "webtoons"
root = "https://www.webtoons.com"
- cookiedomain = "www.webtoons.com"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.path, self.lang, self.genre , self.comic, self.query = \
- match.groups()
- cookies = self.session.cookies
- cookies.set("pagGDPR", "true", domain=self.cookiedomain)
- cookies.set("ageGatePass", "true", domain=self.cookiedomain)
+ cookiedomain = ".webtoons.com"
+
+ def setup_agegate_cookies(self):
+ self._update_cookies({
+ "atGDPR" : "AD_CONSENT",
+ "needCCPA" : "false",
+ "needCOPPA" : "false",
+ "needGDPR" : "false",
+ "pagGDPR" : "true",
+ "ageGatePass": "true",
+ })
def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs)
- if response.history and "/ageGate" in response.request.url:
+ if response.history and "/ageGate" in response.url:
raise exception.StopExtraction(
- "Redirected to age gate check ('%s')", response.request.url)
+ "HTTP redirect to age gate check ('%s')", response.request.url)
return response
-class WebtoonsEpisodeExtractor(WebtoonsExtractor):
+class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
"""Extractor for an episode on webtoons.com"""
subcategory = "episode"
directory_fmt = ("{category}", "{comic}")
@@ -55,54 +57,44 @@ class WebtoonsEpisodeExtractor(WebtoonsExtractor):
)
def __init__(self, match):
- WebtoonsExtractor.__init__(self, match)
- query = text.parse_query(self.query)
- self.title_no = query.get("title_no")
- if not self.title_no:
- raise exception.NotFoundError("title_no")
- self.episode = query.get("episode_no")
- if not self.episode:
- raise exception.NotFoundError("episode_no")
+ self.path, self.lang, self.genre, self.comic, query = match.groups()
- def items(self):
- url = "{}/{}/viewer?{}".format(self.root, self.path, self.query)
+ url = "{}/{}/viewer?{}".format(self.root, self.path, query)
+ GalleryExtractor.__init__(self, match, url)
+ self.setup_agegate_cookies()
self.session.headers["Referer"] = url
- page = self.request(url).text
- data = self.get_job_metadata(page)
- imgs = self.get_image_urls(page)
- data["count"] = len(imgs)
-
- yield Message.Version, 1
- yield Message.Directory, data
- for data["num"], url in enumerate(imgs, 1):
- yield Message.Url, url, text.nameext_from_url(url, data)
+ query = text.parse_query(query)
+ self.title_no = query.get("title_no")
+ self.episode = query.get("episode_no")
- def get_job_metadata(self, page):
- """Collect metadata for extractor-job"""
+ def metadata(self, page):
title, pos = text.extract(
page, '<meta property="og:title" content="', '"')
descr, pos = text.extract(
page, '<meta property="og:description" content="', '"', pos)
return {
- "genre": self.genre,
- "comic": self.comic,
- "title_no": self.title_no,
- "episode": self.episode,
- "title": text.unescape(title),
+ "genre" : self.genre,
+ "comic" : self.comic,
+ "title_no" : self.title_no,
+ "episode" : self.episode,
+ "title" : text.unescape(title),
"description": text.unescape(descr),
- "lang": self.lang,
- "language": util.code_to_language(self.lang),
+ "lang" : self.lang,
+ "language" : util.code_to_language(self.lang),
}
@staticmethod
- def get_image_urls(page):
- """Extract and return a list of all image urls"""
- return list(text.extract_iter(page, 'class="_images" data-url="', '"'))
+ def images(page):
+ return [
+ (url, None)
+ for url in text.extract_iter(
+ page, 'class="_images" data-url="', '"')
+ ]
-class WebtoonsComicExtractor(WebtoonsExtractor):
+class WebtoonsComicExtractor(WebtoonsBase, Extractor):
"""Extractor for an entire comic on webtoons.com"""
subcategory = "comic"
categorytransfer = True
@@ -129,12 +121,13 @@ class WebtoonsComicExtractor(WebtoonsExtractor):
)
def __init__(self, match):
- WebtoonsExtractor.__init__(self, match)
- query = text.parse_query(self.query)
+ Extractor.__init__(self, match)
+ self.setup_agegate_cookies()
+
+ self.path, self.lang, self.genre, self.comic, query = match.groups()
+ query = text.parse_query(query)
self.title_no = query.get("title_no")
- if not self.title_no:
- raise exception.NotFoundError("title_no")
- self.page_no = int(query.get("page", 1))
+ self.page_no = text.parse_int(query.get("page"), 1)
def items(self):
page = None
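
The webtoons refactor above front-loads consent and age-gate cookies onto the shared session (setup_agegate_cookies) and stops extraction when a request is still redirected to the '/ageGate' page. A standalone sketch of the same idea using requests; the cookie names and domain come from the hunk, the rest is illustrative:

    import requests

    AGEGATE_COOKIES = {
        "atGDPR":      "AD_CONSENT",
        "needCCPA":    "false",
        "needCOPPA":   "false",
        "needGDPR":    "false",
        "pagGDPR":     "true",
        "ageGatePass": "true",
    }

    session = requests.Session()
    for name, value in AGEGATE_COOKIES.items():
        session.cookies.set(name, value, domain=".webtoons.com")

    response = session.get("https://www.webtoons.com/en/")
    if response.history and "/ageGate" in response.url:
        # a redirect back to the age gate means the cookies did not take
        raise RuntimeError("redirected to age gate: " + response.url)
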
diff --git a/gallery_dl/extractor/yuki.py b/gallery_dl/extractor/yuki.py
deleted file mode 100644
index 72d7cad..0000000
--- a/gallery_dl/extractor/yuki.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2018-2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://yuki.la/"""
-
-from .common import Extractor, Message
-from .. import text
-
-
-class YukiThreadExtractor(Extractor):
- """Extractor for images from threads on yuki.la"""
- category = "yuki"
- subcategory = "thread"
- directory_fmt = ("{category}", "{board}", "{thread}{title:? - //}")
- filename_fmt = "{time}-{filename}.{extension}"
- archive_fmt = "{board}_{thread}_{tim}"
- pattern = r"(?:https?://)?yuki\.la/([^/?#]+)/(\d+)"
- test = (
- ("https://yuki.la/gd/309639", {
- "url": "289e86c5caf673a2515ec5f5f521ac0ae7e189e9",
- "keyword": "01cbe29ae207a5cb7556bcbd5ed481ecdaf32727",
- "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
- }),
- ("https://yuki.la/a/159767162", {
- "url": "cd94d0eb646d279c3b7efb9b7898888e5d44fa93",
- "keyword": "7a4ff90e423c74bd3126fb65d13015decec2fa45",
- }),
- # old thread - missing board name in title and multi-line HTML
- ("https://yuki.la/gif/6877752", {
- "url": "3dbb2f8453490d002416c5fc2fe95b56c129faf9",
- "keyword": "563ef4ae80134d845dddaed7ebe56f5fc41056be",
- }),
- # even older thread - no thread title
- ("https://yuki.la/a/9357051", {
- "url": "010560bf254bd485e48366c3531728bda4b22583",
- "keyword": "7b736c41e307dcfcb84ef495f29299a6ddd06d67",
- }),
- )
- root = "https://yuki.la"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.board, self.thread = match.groups()
-
- def items(self):
- url = "{}/{}/{}".format(self.root, self.board, self.thread)
- page = self.request(url).text
- data = self.get_metadata(page)
-
- yield Message.Version, 1
- yield Message.Directory, data
- for post in self.posts(page):
- if "image" in post:
- for key in ("w", "h", "no", "time"):
- post[key] = text.parse_int(post[key])
- post.update(data)
- yield Message.Url, post["image"], post
-
- def get_metadata(self, page):
- """Collect metadata for extractor-job"""
- title = text.extract(page, "<title>", "</title>")[0]
- try:
- title, boardname, _ = title.rsplit(" - ", 2)
- except ValueError:
- title = boardname = ""
- else:
- title = title.partition(" - ")[2]
- if not title:
- title, boardname = boardname, ""
- return {
- "board": self.board,
- "board_name": boardname,
- "thread": text.parse_int(self.thread),
- "title": text.unescape(title),
- }
-
- def posts(self, page):
- """Build a list of all post-objects"""
- return [
- self.parse(post) for post in text.extract_iter(
- page, '<div class="postContainer', '</blockquote>')
- ]
-
- def parse(self, post):
- """Build post-object by extracting data from an HTML post"""
- data = self._extract_post(post)
- if 'class="file"' in post:
- self._extract_image(post, data)
- part = data["image"].rpartition("/")[2]
- data["tim"], _, data["extension"] = part.partition(".")
- data["ext"] = "." + data["extension"]
- return data
-
- @staticmethod
- def _extract_post(post):
- data, pos = text.extract_all(post, (
- ("no" , 'id="pc', '"'),
- ("name", '<span class="name">', '</span>'),
- ("time", 'data-utc="', '"'),
- ("now" , '>', ' <'),
- ))
- data["com"] = text.unescape(text.remove_html(
- post[post.index("<blockquote ", pos):].partition(">")[2]))
- return data
-
- @staticmethod
- def _extract_image(post, data):
- text.extract_all(post, (
- (None , '>File:', ''),
- ("fullname", '<a title="', '"'),
- ("image" , 'href="', '"'),
- ("filename", '>', '<'),
- ("fsize" , '(', ', '),
- ("w" , '', 'x'),
- ("h" , '', ')'),
- ), 0, data)
- filename = data["fullname"] or data["filename"]
- data["filename"] = text.unescape(filename.rpartition(".")[0])
- data["image"] = "https:" + data["image"]
- del data["fullname"]
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index d3b4a90..99f61d8 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -57,6 +57,7 @@ class Job():
if pextr.config("category-transfer", pextr.categorytransfer):
extr.category = pextr.category
extr.subcategory = pextr.subcategory
+ extr._cfgpath = pextr._cfgpath
# transfer parent directory
extr._parentdir = pextr._parentdir
@@ -575,7 +576,11 @@ class UrlJob(Job):
self.handle_queue = self.handle_url
@staticmethod
- def handle_url(url, kwdict):
+ def handle_url(url, _):
+ print(url)
+
+ @staticmethod
+ def handle_url_fallback(url, kwdict):
print(url)
if "_fallback" in kwdict:
for url in kwdict["_fallback"]:
@@ -604,6 +609,7 @@ class InfoJob(Job):
pc("Filename format", "filename", ex.filename_fmt)
pc("Directory format", "directory", ex.directory_fmt)
+ pc("Archive format", "archive-format", ex.archive_fmt)
pc("Request interval", "sleep-request", ex.request_interval)
return 0
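
In the job.py hunks above, UrlJob's URL printing is split into a plain default handler and a separate fallback-aware one, so the '_fallback' check is only paid when fallback output is wanted (the hunk is truncated before the fallback loop's body, so its exact output format is not shown; the added _cfgpath line appears to keep a category-transferred child extractor reading configuration from its parent's path). A standalone sketch of that dispatch pattern; the class wiring here is an assumption, not gallery-dl's verified structure:

    class UrlPrinter:
        """Print URLs; optionally include '_fallback' URLs from the metadata."""

        def __init__(self, fallback=False):
            # pick the handler once instead of branching on every URL
            self.handle_url = (self.handle_url_fallback
                               if fallback else self.handle_url_plain)

        @staticmethod
        def handle_url_plain(url, _):
            print(url)

        @staticmethod
        def handle_url_fallback(url, kwdict):
            print(url)
            for fallback_url in kwdict.get("_fallback", ()):
                print(fallback_url)  # exact formatting is elided in the hunk

    printer = UrlPrinter(fallback=True)
    printer.handle_url("https://example.org/a.jpg",
                       {"_fallback": ["https://mirror.example.org/a.jpg"]})
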
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index b75f444..630da7d 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.17.2"
+__version__ = "1.17.3"
diff --git a/test/test_results.py b/test/test_results.py
index 223ef57..ed6b2eb 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -312,7 +312,7 @@ def setup_test_config():
config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma")
for category in ("danbooru", "instagram", "twitter", "subscribestar",
- "e621", "inkbunny"):
+ "e621", "inkbunny", "tapas"):
config.set(("extractor", category), "username", None)
config.set(("extractor", "mastodon.social"), "access-token",