diff options
author | Unit 193 <unit193@unit193.net> | 2021-05-03 23:36:53 -0400 |
---|---|---|
committer | Unit 193 <unit193@unit193.net> | 2021-05-03 23:36:53 -0400 |
commit | b9e1f9713d702cdb4721c6d7005718f43170c8fd (patch) | |
tree | d4bae9611841843502f1bac1ceeb88f5e8aac438 | |
parent | 8ce3f41264ca43e2acd627592667ce66bc4b63fe (diff) | |
parent | e7eb1f9779f2e223575ab23a6bc1abf2222e7d27 (diff) | |
download | gallery-dl-b9e1f9713d702cdb4721c6d7005718f43170c8fd.tar.bz2 gallery-dl-b9e1f9713d702cdb4721c6d7005718f43170c8fd.tar.xz gallery-dl-b9e1f9713d702cdb4721c6d7005718f43170c8fd.tar.zst |
Update upstream source from tag 'upstream/1.17.3'
Update to upstream version '1.17.3'
with Debian dir f48eb29debef9eb4ad856e7a0a50599d29d2128a
46 files changed, 987 insertions, 362 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index d57583e..59691b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog +## 1.17.3 - 2021-04-25 +### Additions +- [danbooru] add option for extended metadata extraction ([#1458](https://github.com/mikf/gallery-dl/issues/1458)) +- [fanbox] add extractors ([#1459](https://github.com/mikf/gallery-dl/issues/1459)) +- [fantia] add extractors ([#1459](https://github.com/mikf/gallery-dl/issues/1459)) +- [gelbooru] add an option to extract notes ([#1457](https://github.com/mikf/gallery-dl/issues/1457)) +- [hentaicosplays] add extractor ([#907](https://github.com/mikf/gallery-dl/issues/907), [#1473](https://github.com/mikf/gallery-dl/issues/1473), [#1483](https://github.com/mikf/gallery-dl/issues/1483)) +- [instagram] add extractor for `tagged` posts ([#1439](https://github.com/mikf/gallery-dl/issues/1439)) +- [naverwebtoon] ignore non-comic images +- [pixiv] also save untranslated tags when `translated-tags` is enabled ([#1501](https://github.com/mikf/gallery-dl/issues/1501)) +- [shopify] support omgmiamiswimwear.com ([#1280](https://github.com/mikf/gallery-dl/issues/1280)) +- implement `output.fallback` option +- add archive format to InfoJob output ([#875](https://github.com/mikf/gallery-dl/issues/875)) +- build executables with SOCKS proxy support ([#1424](https://github.com/mikf/gallery-dl/issues/1424)) +### Fixes +- [500px] update query hashes +- [8muses] fix JSON deobfuscation +- [artstation] download `/4k/` images ([#1422](https://github.com/mikf/gallery-dl/issues/1422)) +- [deviantart] fix pagination for Eclipse results ([#1444](https://github.com/mikf/gallery-dl/issues/1444)) +- [deviantart] improve folder name matching ([#1451](https://github.com/mikf/gallery-dl/issues/1451)) +- [erome] skip deleted albums ([#1447](https://github.com/mikf/gallery-dl/issues/1447)) +- [exhentai] fix image limit detection ([#1437](https://github.com/mikf/gallery-dl/issues/1437)) +- [exhentai] restore `limits` option ([#1487](https://github.com/mikf/gallery-dl/issues/1487)) +- [gelbooru] fix tag category extraction ([#1455](https://github.com/mikf/gallery-dl/issues/1455)) +- [instagram] update query hashes +- [komikcast] fix extraction +- [simplyhentai] fix extraction +- [slideshare] fix extraction +- [webtoons] update agegate/GDPR cookies ([#1431](https://github.com/mikf/gallery-dl/issues/1431)) +- fix `category-transfer` option +### Removals +- [yuki] remove module for yuki.la + ## 1.17.2 - 2021-04-02 ### Additions - [deviantart] add support for posts from watched users ([#794](https://github.com/mikf/gallery-dl/issues/794)) @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.17.2 +Version: 1.17.3 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -75,8 +75,8 @@ Description: ========== Prebuilt executable files with a Python interpreter and required Python packages included are available for - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -64,8 +64,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 1ab1ec6..6a22a07 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2021-04-02" "1.17.2" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2021-04-25" "1.17.3" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 608c2e5..0190b7f 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2021-04-02" "1.17.2" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2021-04-25" "1.17.3" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -824,22 +824,6 @@ descend into subfolders Download embedded videos hosted on https://www.blogger.com/ -.SS extractor.danbooru.ugoira -.IP "Type:" 6 -\f[I]bool\f[] - -.IP "Default:" 9 -\f[I]false\f[] - -.IP "Description:" 4 -Controls the download target for Ugoira posts. - -.br -* \f[I]true\f[]: Original ZIP archives -.br -* \f[I]false\f[]: Converted video files - - .SS extractor.derpibooru.api-key .IP "Type:" 6 \f[I]string\f[] @@ -1042,6 +1026,18 @@ or whenever your \f[I]cache file\f[] is deleted or cleared. Minimum wait time in seconds before API requests. +.SS extractor.exhentai.limits +.IP "Type:" 6 +\f[I]integer\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Sets a custom image download limit and +stops extraction when it gets exceeded. + + .SS extractor.exhentai.domain .IP "Type:" 6 \f[I]string\f[] @@ -1085,6 +1081,26 @@ Makes \f[I]date\f[] and \f[I]filesize\f[] more precise. Download full-sized original images if available. +.SS extractor.fanbox.embeds +.IP "Type:" 6 +\f[I]bool\f[] or \f[I]string\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Control behavior on embedded content from external sites. + +.br +* \f[I]true\f[]: Extract embed URLs and download them if supported +(videos are not downloaded). +.br +* \f[I]"ytdl"\f[]: Like \f[I]true\f[], but let \f[I]youtube-dl\f[] handle video +extraction and download for YouTube, Vimeo and SoundCloud embeds. +.br +* \f[I]false\f[]: Ignore embeds. + + .SS extractor.flickr.access-token & .access-token-secret .IP "Type:" 6 \f[I]string\f[] @@ -1963,20 +1979,6 @@ Extract media from retweeted posts. Download video files. -.SS extractor.[booru].tags -.IP "Type:" 6 -\f[I]bool\f[] - -.IP "Default:" 9 -\f[I]false\f[] - -.IP "Description:" 4 -Categorize tags by their respective types -and provide them as \f[I]tags_<type>\f[] metadata fields. - -Note: This requires 1 additional HTTP request for each post. - - .SS extractor.[manga-extractor].chapter-reverse .IP "Type:" 6 \f[I]bool\f[] @@ -2240,6 +2242,17 @@ All available options can be found in \f[I]youtube-dl's docstrings .SH OUTPUT OPTIONS +.SS output.fallback +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Include fallback URLs in the output of \f[I]-g/--get-urls\f[]. + + .SS output.mode .IP "Type:" 6 \f[I]string\f[] diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 8a3d9e2..4eaf1b8 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -49,7 +49,8 @@ { "username": null, "password": null, - "ugoira": false + "ugoira": false, + "metadata": false }, "derpibooru": { @@ -79,6 +80,7 @@ "username": null, "password": null, "domain": "auto", + "limits": true, "metadata": false, "original": true, "sleep-request": 5.0 @@ -279,7 +281,8 @@ }, "booru": { - "tags": false + "tags": false, + "notes": false } }, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index f233a1a..e192d75 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.17.2 +Version: 1.17.3 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -75,8 +75,8 @@ Description: ========== Prebuilt executable files with a Python interpreter and required Python packages included are available for - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.3/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 09e7097..3cc2071 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -59,6 +59,8 @@ gallery_dl/extractor/e621.py gallery_dl/extractor/erome.py gallery_dl/extractor/exhentai.py gallery_dl/extractor/fallenangels.py +gallery_dl/extractor/fanbox.py +gallery_dl/extractor/fantia.py gallery_dl/extractor/flickr.py gallery_dl/extractor/foolfuuka.py gallery_dl/extractor/foolslide.py @@ -70,6 +72,7 @@ gallery_dl/extractor/gelbooru_v02.py gallery_dl/extractor/gfycat.py gallery_dl/extractor/hbrowse.py gallery_dl/extractor/hentai2read.py +gallery_dl/extractor/hentaicosplays.py gallery_dl/extractor/hentaifoundry.py gallery_dl/extractor/hentaifox.py gallery_dl/extractor/hentaihand.py @@ -165,7 +168,6 @@ gallery_dl/extractor/weibo.py gallery_dl/extractor/wikiart.py gallery_dl/extractor/xhamster.py gallery_dl/extractor/xvideos.py -gallery_dl/extractor/yuki.py gallery_dl/postprocessor/__init__.py gallery_dl/postprocessor/classify.py gallery_dl/postprocessor/common.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index c1f80b6..5bf229a 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2020 Mike Fährmann +# Copyright 2014-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,7 +9,7 @@ from __future__ import unicode_literals, print_function __author__ = "Mike Fährmann" -__copyright__ = "Copyright 2014-2020 Mike Fährmann" +__copyright__ = "Copyright 2014-2021 Mike Fährmann" __license__ = "GPLv2" __maintainer__ = "Mike Fährmann" __email__ = "mike_faehrmann@web.de" @@ -204,6 +204,9 @@ def main(): if args.list_urls: jobtype = job.UrlJob jobtype.maxdepth = args.list_urls + if config.get(("output",), "fallback", True): + jobtype.handle_url = \ + staticmethod(jobtype.handle_url_fallback) else: jobtype = args.jobtype or job.DownloadJob diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index aa0e8ad..0583eb9 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -146,7 +146,7 @@ class _500pxGalleryExtractor(_500pxExtractor): }), # unavailable photos (#1335) ("https://500px.com/p/Light_Expression_Photography/galleries/street", { - "count": 0, + "count": ">= 7", }), ("https://500px.com/fashvamp/galleries/lera"), ) @@ -159,7 +159,7 @@ class _500pxGalleryExtractor(_500pxExtractor): def metadata(self): user = self._request_graphql( "ProfileRendererQuery", {"username": self.user_name}, - "105058632482dd2786fd5775745908dc928f537b28e28356b076522757d65c19", + "fcecc7028c308115b0defebc63acec3fe3c12df86a602c3e1785ba5cfb8fff47", )["profile"] self.user_id = str(user["legacyId"]) diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py index 3eb5565..c961ded 100644 --- a/gallery_dl/extractor/8muses.py +++ b/gallery_dl/extractor/8muses.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -54,10 +54,17 @@ class _8musesAlbumExtractor(Extractor): "private": False, }, }), + # custom sorting ("https://www.8muses.com/comics/album/Fakku-Comics/8?sort=az", { "count": ">= 70", "keyword": {"name": r"re:^[R-Zr-z]"}, }), + # non-ASCII characters + (("https://comics.8muses.com/comics/album/Various-Authors/Chessire88" + "/From-Trainers-to-Pokmons"), { + "count": 2, + "keyword": {"name": "re:From Trainers to Pokémons"}, + }), ) def __init__(self, match): @@ -125,6 +132,6 @@ class _8musesAlbumExtractor(Extractor): @staticmethod def _unobfuscate(data): return json.loads("".join([ - chr(33 + (ord(c) + 14) % 94) if c != " " else c + chr(33 + (ord(c) + 14) % 94) if "!" <= c <= "~" else c for c in text.unescape(data.strip("\t\n\r !")) ])) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 3d61515..d927d70 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -31,6 +31,8 @@ modules = [ "erome", "exhentai", "fallenangels", + "fanbox", + "fantia", "flickr", "furaffinity", "fuskator", @@ -40,6 +42,7 @@ modules = [ "gfycat", "hbrowse", "hentai2read", + "hentaicosplays", "hentaifoundry", "hentaifox", "hentaihand", @@ -127,7 +130,6 @@ modules = [ "wikiart", "xhamster", "xvideos", - "yuki", "booru", "moebooru", "foolfuuka", diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 6914f24..f2ad0ab 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.artstation.com/""" +"""Extractors for https://www.artstation.com/""" from .common import Extractor, Message from .. import text, util, exception @@ -29,7 +29,6 @@ class ArtstationExtractor(Extractor): def items(self): data = self.metadata() - yield Message.Version, 1 yield Message.Directory, data for project in self.projects(): @@ -49,7 +48,20 @@ class ArtstationExtractor(Extractor): if adict["has_image"]: url = adict["image_url"] text.nameext_from_url(url, asset) - yield Message.Url, self._no_cache(url), asset + + url = self._no_cache(url) + lhs, _, rhs = url.partition("/large/") + if rhs: + url = lhs + "/4k/" + rhs + asset["_fallback"] = self._image_fallback(lhs, rhs) + + yield Message.Url, url, asset + + @staticmethod + def _image_fallback(lhs, rhs): + yield lhs + "/large/" + rhs + yield lhs + "/medium/" + rhs + yield lhs + "/small/" + rhs def metadata(self): """Return general metadata""" @@ -135,8 +147,8 @@ class ArtstationUserExtractor(ArtstationExtractor): r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$") test = ( ("https://www.artstation.com/gaerikim/", { - "pattern": r"https://\w+\.artstation\.com/p/assets" - r"/images/images/\d+/\d+/\d+/large/[^/]+", + "pattern": r"https://\w+\.artstation\.com/p/assets/images" + r"/images/\d+/\d+/\d+/(4k|large|medium|small)/[^/]+", "count": ">= 6", }), ("https://www.artstation.com/gaerikim/albums/all/"), @@ -202,8 +214,8 @@ class ArtstationLikesExtractor(ArtstationExtractor): r"/(?!artwork|projects|search)([^/?#]+)/likes/?") test = ( ("https://www.artstation.com/mikf/likes", { - "pattern": r"https://\w+\.artstation\.com/p/assets" - r"/images/images/\d+/\d+/\d+/large/[^/]+", + "pattern": r"https://\w+\.artstation\.com/p/assets/images" + r"/images/\d+/\d+/\d+/(4k|large|medium|small)/[^/]+", "count": 6, }), # no likes @@ -250,7 +262,6 @@ class ArtstationChallengeExtractor(ArtstationExtractor): self.root) challenge = self.request(challenge_url).json() - yield Message.Version, 1 yield Message.Directory, {"challenge": challenge} params = {"sorting": self.sorting} @@ -344,10 +355,10 @@ class ArtstationImageExtractor(ArtstationExtractor): test = ( ("https://www.artstation.com/artwork/LQVJr", { "pattern": r"https?://\w+\.artstation\.com/p/assets" - r"/images/images/008/760/279/large/.+", - "content": "1f645ce7634e44675ebde8f6b634d36db0617d3c", + r"/images/images/008/760/279/4k/.+", + "content": "7b113871465fdc09d127adfdc2767d51cf45a7e9", # SHA1 hash without _no_cache() - # "content": "2e8aaf6400aeff2345274f45e90b6ed3f2a0d946", + # "content": "44b80f9af36d40efc5a2668cdd11d36d6793bae9", }), # multiple images per project ("https://www.artstation.com/artwork/Db3dy", { diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py index 6e0003d..d6e3683 100644 --- a/gallery_dl/extractor/bcy.py +++ b/gallery_dl/extractor/bcy.py @@ -170,11 +170,16 @@ class BcyPostExtractor(BcyExtractor): }, }), # only watermarked images available - ("https://bcy.net/item/detail/6780546160802143236", { + ("https://bcy.net/item/detail/6950136331708144648", { "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+" r"~tplv-banciyuan-logo-v3:.+\.image", - "count": 8, + "count": 10, "keyword": {"filter": "watermark"} + + }), + # deleted + ("https://bcy.net/item/detail/6780546160802143236", { + "count": 0, }), # only visible to logged in users ("https://bcy.net/item/detail/6747523535150783495", { @@ -183,7 +188,10 @@ class BcyPostExtractor(BcyExtractor): ) def posts(self): - data = self._data_from_post(self.item_id) + try: + data = self._data_from_post(self.item_id) + except KeyError: + return () post = data["post_data"] post["image_list"] = post["multi"] post["plain"] = text.parse_unicode_escapes(post["plain"]) diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index c3cf3f7..a42ec53 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -24,6 +24,7 @@ class BooruExtractor(BaseExtractor): self.login() data = self.metadata() tags = self.config("tags", False) + notes = self.config("notes", False) for post in self.posts(): try: @@ -35,8 +36,11 @@ class BooruExtractor(BaseExtractor): "(md5: %s)", post.get("id"), post.get("md5")) continue + page_html = None if tags: - self._extended_tags(post) + page_html = self._extended_tags(post) + if notes: + self._notes(post, page_html) self._prepare(post) post.update(data) text.nameext_from_url(url, post) @@ -66,4 +70,13 @@ class BooruExtractor(BaseExtractor): """Prepare the 'post's metadata""" def _extended_tags(self, post, page=None): - """Generate extended tag information""" + """Generate extended tag information + + The return value of this function will be + passed to the _notes function as the page parameter. + This makes it possible to reuse the same HTML both for + extracting tags and notes. + """ + + def _notes(self, post, page=None): + """Generate information about notes""" diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 33797f9..1f86ea5 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -32,6 +32,7 @@ class DanbooruExtractor(Extractor): super().__init__(match) self.root = "https://{}.donmai.us".format(match.group(1)) self.ugoira = self.config("ugoira", False) + self.extended_metadata = self.config("metadata", False) username, api_key = self._get_auth_info() if username: @@ -64,6 +65,14 @@ class DanbooruExtractor(Extractor): url = post["large_file_url"] post["extension"] = "webm" + if self.extended_metadata: + template = ( + "{}/posts/{}.json" + "?only=artist_commentary,children,notes,parent" + ) + resp = self.request(template.format(self.root, post["id"])) + post.update(resp.json()) + post.update(data) yield Message.Directory, post yield Message.Url, url, post diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 9d1701f..47f589a 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -259,9 +259,10 @@ class DeviantartExtractor(Extractor): @staticmethod def _find_folder(folders, name): - pattern = re.compile(r"(?i)\W*" + name.replace("-", r"\W+") + r"\W*$") + match = re.compile(name.replace( + "-", r"[^a-z0-9]+") + "$", re.IGNORECASE).match for folder in folders: - if pattern.match(folder["name"]): + if match(folder["name"]): return folder raise exception.NotFoundError("folder") @@ -472,6 +473,12 @@ class DeviantartFolderExtractor(DeviantartExtractor): "count": ">= 4", "options": (("original", False),), }), + # name starts with '_', special characters (#1451) + (("https://www.deviantart.com/justatest235723" + "/gallery/69302698/-test-b-c-d-e-f-"), { + "count": 1, + "options": (("original", False),), + }), ("https://shimoda7.deviantart.com/gallery/722019/Miscellaneous"), ("https://yakuzafc.deviantart.com/gallery/37412168/Crafts"), ) @@ -1230,7 +1237,7 @@ class DeviantartEclipseAPI(): params = { "username" : user, "offset" : offset, - "limit" : "24", + "limit" : 24, "scraps_folder": "true", } return self._pagination(endpoint, params) @@ -1240,8 +1247,8 @@ class DeviantartEclipseAPI(): params = { "username": user, "moduleid": self._module_id_watching(user), - "offset" : None, - "limit" : "24", + "offset" : offset, + "limit" : 24, } return self._pagination(endpoint, params) @@ -1260,14 +1267,23 @@ class DeviantartEclipseAPI(): except Exception: return {"error": response.text} - def _pagination(self, endpoint, params=None): + def _pagination(self, endpoint, params): while True: data = self._call(endpoint, params) - yield from data["results"] - if not data["hasMore"]: + results = data.get("results") + if results is None: + return + yield from results + + if not data.get("hasMore"): return - params["offset"] = data["nextOffset"] + + next_offset = data.get("nextOffset") + if next_offset: + params["offset"] = next_offset + else: + params["offset"] += params["limit"] def _module_id_watching(self, user): url = "{}/{}/about".format(self.extractor.root, user) diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 2e2e952..d4fd826 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -9,7 +9,7 @@ """Extractors for https://www.erome.com/""" from .common import Extractor, Message -from .. import text, util +from .. import text, util, exception from ..cache import cache import itertools import time @@ -32,7 +32,13 @@ class EromeExtractor(Extractor): def items(self): for album_id in self.albums(): url = "{}/a/{}".format(self.root, album_id) - page = self.request(url).text + + try: + page = self.request(url).text + except exception.HttpError as exc: + self.log.warning( + "Unable to fetch album '%s' (%s)", album_id, exc) + continue title, pos = text.extract( page, 'property="og:title" content="', '"') diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 872a338..910da7d 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -45,6 +45,13 @@ class ExhentaiExtractor(Extractor): Extractor.__init__(self, match) self.original = self.config("original", True) + limits = self.config("limits", False) + if limits and limits.__class__ is int: + self.limits = limits + self._remaining = 0 + else: + self.limits = False + self.session.headers["Referer"] = self.root + "/" if version != "ex": self.session.cookies.set("nw", "1", domain=self.cookiedomain) @@ -69,6 +76,7 @@ class ExhentaiExtractor(Extractor): self.log.info("no username given; using e-hentai.org") self.root = "https://e-hentai.org" self.original = False + self.limits = False self.session.cookies["nw"] = "1" @cache(maxage=90*24*3600, keyarg=1) @@ -193,12 +201,24 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.count = text.parse_int(data["filecount"]) yield Message.Directory, data + def _validate_response(response): + # declared inside 'items()' to be able to access 'data' + if not response.history and \ + response.headers.get("content-length") == "137": + self._report_limits(data) + return True + images = itertools.chain( (self.image_from_page(ipage),), self.images_from_api()) for url, image in images: data.update(image) + if self.limits: + self._check_limits(data) if "/fullimg.php" in url: data["extension"] = "" + data["_http_validate"] = _validate_response + else: + data["_http_validate"] = None yield Message.Url, url, data def get_metadata(self, page): @@ -338,6 +358,26 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "Continue with '%s/s/%s/%s-%s' as URL after resetting it.", self.root, data["image_token"], self.gallery_id, data["num"]) + def _check_limits(self, data): + if not self._remaining or data["num"] % 25 == 0: + self._update_limits() + self._remaining -= data["cost"] + if self._remaining <= 0: + self._report_limits(data) + + def _update_limits(self): + url = "https://e-hentai.org/home.php" + cookies = { + cookie.name: cookie.value + for cookie in self.session.cookies + if cookie.domain == self.cookiedomain and cookie.name != "igneous" + } + + page = self.request(url, cookies=cookies).text + current = text.extract(page, "<strong>", "</strong>")[0] + self.log.debug("Image Limits: %s/%s", current, self.limits) + self._remaining = self.limits - text.parse_int(current) + def _gallery_page(self): url = "{}/g/{}/{}/".format( self.root, self.gallery_id, self.gallery_token) diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py new file mode 100644 index 0000000..06054b2 --- /dev/null +++ b/gallery_dl/extractor/fanbox.py @@ -0,0 +1,283 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.fanbox.cc/""" + +from .common import Extractor, Message +from .. import text + + +BASE_PATTERN = ( + r"(?:https?://)?(?:" + r"(?!www\.)([\w-]+)\.fanbox\.cc|" + r"(?:www\.)?fanbox\.cc/@([\w-]+))" +) + + +class FanboxExtractor(Extractor): + """Base class for Fanbox extractors""" + category = "fanbox" + root = "https://www.fanbox.cc" + directory_fmt = ("{category}", "{creatorId}") + filename_fmt = "{id}_{num}.{extension}" + archive_fmt = "{id}_{num}" + _warning = True + + def __init__(self, match): + Extractor.__init__(self, match) + self.embeds = self.config("embeds", True) + + def items(self): + yield Message.Version, 1 + + if self._warning: + if "FANBOXSESSID" not in self.session.cookies: + self.log.warning("no 'FANBOXSESSID' cookie set") + FanboxExtractor._warning = False + + for content_body, post in self.posts(): + yield Message.Directory, post + yield from self._get_urls_from_post(content_body, post) + + def posts(self): + """Return all relevant post objects""" + + def _pagination(self, url): + headers = {"Origin": self.root} + + while url: + url = text.ensure_http_scheme(url) + body = self.request(url, headers=headers).json()["body"] + for item in body["items"]: + yield self._process_post(item) + + url = body["nextUrl"] + + def _get_post_data_from_id(self, post_id): + """Fetch and process post data""" + headers = {"Origin": self.root} + url = "https://api.fanbox.cc/post.info?postId="+post_id + post = self.request(url, headers=headers).json()["body"] + + return self._process_post(post) + + def _process_post(self, post): + content_body = post.pop("body", None) + if content_body: + if "html" in content_body: + post["html"] = content_body["html"] + if post["type"] == "article": + post["articleBody"] = content_body.copy() + + post["date"] = text.parse_datetime(post["publishedDatetime"]) + post["text"] = content_body.get("text") if content_body else None + post["isCoverImage"] = False + + return content_body, post + + def _get_urls_from_post(self, content_body, post): + num = 0 + cover_image = post.get("coverImageUrl") + if cover_image: + final_post = post.copy() + final_post["isCoverImage"] = True + final_post["fileUrl"] = cover_image + text.nameext_from_url(cover_image, final_post) + final_post["num"] = num + num += 1 + yield Message.Url, cover_image, final_post + + if not content_body: + return + + if "html" in content_body: + html_urls = [] + + for href in text.extract_iter(content_body["html"], 'href="', '"'): + if "fanbox.pixiv.net/images/entry" in href: + html_urls.append(href) + elif "downloads.fanbox.cc" in href: + html_urls.append(href) + for src in text.extract_iter(content_body["html"], + 'data-src-original="', '"'): + html_urls.append(src) + + for url in html_urls: + final_post = post.copy() + text.nameext_from_url(url, final_post) + final_post["fileUrl"] = url + final_post["num"] = num + num += 1 + yield Message.Url, url, final_post + + for group in ("images", "imageMap"): + if group in content_body: + for item in content_body[group]: + if group == "imageMap": + # imageMap is a dict with image objects as values + item = content_body[group][item] + + final_post = post.copy() + final_post["fileUrl"] = item["originalUrl"] + text.nameext_from_url(item["originalUrl"], final_post) + if "extension" in item: + final_post["extension"] = item["extension"] + final_post["fileId"] = item.get("id") + final_post["width"] = item.get("width") + final_post["height"] = item.get("height") + final_post["num"] = num + num += 1 + yield Message.Url, item["originalUrl"], final_post + + for group in ("files", "fileMap"): + if group in content_body: + for item in content_body[group]: + if group == "fileMap": + # fileMap is a dict with file objects as values + item = content_body[group][item] + + final_post = post.copy() + final_post["fileUrl"] = item["url"] + text.nameext_from_url(item["url"], final_post) + if "extension" in item: + final_post["extension"] = item["extension"] + if "name" in item: + final_post["filename"] = item["name"] + final_post["fileId"] = item.get("id") + final_post["num"] = num + num += 1 + yield Message.Url, item["url"], final_post + + if self.embeds: + embeds_found = [] + if "video" in content_body: + embeds_found.append(content_body["video"]) + embeds_found.extend(content_body.get("embedMap", {}).values()) + + for embed in embeds_found: + # embed_result is (message type, url, metadata dict) + embed_result = self._process_embed(post, embed) + if not embed_result: + continue + embed_result[2]["num"] = num + num += 1 + yield embed_result + + def _process_embed(self, post, embed): + final_post = post.copy() + provider = embed["serviceProvider"] + content_id = embed.get("videoId") or embed.get("contentId") + prefix = "ytdl:" if self.embeds == "ytdl" else "" + url = None + is_video = False + + if provider == "soundcloud": + url = prefix+"https://soundcloud.com/"+content_id + is_video = True + elif provider == "youtube": + url = prefix+"https://youtube.com/watch?v="+content_id + is_video = True + elif provider == "vimeo": + url = prefix+"https://vimeo.com/"+content_id + is_video = True + elif provider == "fanbox": + # this is an old URL format that redirects + # to a proper Fanbox URL + url = "https://www.pixiv.net/fanbox/"+content_id + # resolve redirect + response = self.request(url, method="HEAD", allow_redirects=False) + url = response.headers["Location"] + final_post["_extractor"] = FanboxPostExtractor + elif provider == "twitter": + url = "https://twitter.com/_/status/"+content_id + elif provider == "google_forms": + templ = "https://docs.google.com/forms/d/e/{}/viewform?usp=sf_link" + url = templ.format(content_id) + else: + self.log.warning("service not recognized: {}".format(provider)) + + if url: + final_post["embed"] = embed + final_post["embedUrl"] = url + text.nameext_from_url(url, final_post) + msg_type = Message.Queue + if is_video and self.embeds == "ytdl": + msg_type = Message.Url + return msg_type, url, final_post + + +class FanboxCreatorExtractor(FanboxExtractor): + """Extractor for a Fanbox creator's works""" + subcategory = "creator" + pattern = BASE_PATTERN + r"(?:/posts)?/?$" + test = ( + ("https://xub.fanbox.cc", { + "range": "1-15", + "count": ">= 15", + "keyword": { + "creatorId" : "xub", + "tags" : list, + "title" : str, + }, + }), + ("https://xub.fanbox.cc/posts"), + ("https://www.fanbox.cc/@xub/"), + ("https://www.fanbox.cc/@xub/posts"), + ) + + def __init__(self, match): + FanboxExtractor.__init__(self, match) + self.creator_id = match.group(1) or match.group(2) + + def posts(self): + url = "https://api.fanbox.cc/post.listCreator?creatorId={}&limit=10" + + return self._pagination(url.format(self.creator_id)) + + +class FanboxPostExtractor(FanboxExtractor): + """Extractor for media from a single Fanbox post""" + subcategory = "post" + pattern = BASE_PATTERN + r"/posts/(\d+)" + test = ( + ("https://www.fanbox.cc/@xub/posts/1910054", { + "count": 3, + "keyword": { + "title": "えま★おうがすと", + "tags": list, + "hasAdultContent": True, + "isCoverImage": False + }, + }), + # entry post type, image embedded in html of the post + ("https://nekoworks.fanbox.cc/posts/915", { + "count": 2, + "keyword": { + "title": "【SAYORI FAN CLUB】お届け内容", + "tags": list, + "html": str, + "hasAdultContent": True + }, + }), + # article post type, imageMap, 2 twitter embeds, fanbox embed + ("https://steelwire.fanbox.cc/posts/285502", { + "options": (("embeds", True),), + "count": 10, + "keyword": { + "title": "イラスト+SS|義足の炭鉱少年が義足を見せてくれるだけ 【全体公開版】", + "tags": list, + "articleBody": dict, + "hasAdultContent": True + }, + }), + ) + + def __init__(self, match): + FanboxExtractor.__init__(self, match) + self.post_id = match.group(3) + + def posts(self): + return (self._get_post_data_from_id(self.post_id),) diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py new file mode 100644 index 0000000..16fed4e --- /dev/null +++ b/gallery_dl/extractor/fantia.py @@ -0,0 +1,147 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://fantia.jp/""" + +from .common import Extractor, Message +from .. import text + + +class FantiaExtractor(Extractor): + """Base class for Fantia extractors""" + category = "fantia" + root = "https://fantia.jp" + directory_fmt = ("{category}", "{fanclub_id}") + filename_fmt = "{post_id}_{file_id}.{extension}" + archive_fmt = "{post_id}_{file_id}" + _warning = True + + def items(self): + yield Message.Version, 1 + + if self._warning: + if "_session_id" not in self.session.cookies: + self.log.warning("no '_session_id' cookie set") + FantiaExtractor._warning = False + + for post_id in self.posts(): + full_response, post = self._get_post_data(post_id) + yield Message.Directory, post + for url, url_data in self._get_urls_from_post(full_response, post): + fname = url_data["content_filename"] or url + text.nameext_from_url(fname, url_data) + url_data["file_url"] = url + yield Message.Url, url, url_data + + def posts(self): + """Return post IDs""" + + def _pagination(self, url): + params = {"page": 1} + headers = {"Referer": self.root} + + while True: + page = self.request(url, params=params, headers=headers).text + + post_id = None + for post_id in text.extract_iter( + page, 'class="link-block" href="/posts/', '"'): + yield post_id + + if not post_id: + return + params["page"] += 1 + + def _get_post_data(self, post_id): + """Fetch and process post data""" + headers = {"Referer": self.root} + url = self.root+"/api/v1/posts/"+post_id + resp = self.request(url, headers=headers).json()["post"] + post = { + "post_id": resp["id"], + "post_url": self.root + "/posts/" + str(resp["id"]), + "post_title": resp["title"], + "comment": resp["comment"], + "rating": resp["rating"], + "posted_at": resp["posted_at"], + "fanclub_id": resp["fanclub"]["id"], + "fanclub_user_id": resp["fanclub"]["user"]["id"], + "fanclub_user_name": resp["fanclub"]["user"]["name"], + "fanclub_name": resp["fanclub"]["name"], + "fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]), + "tags": resp["tags"] + } + return resp, post + + def _get_urls_from_post(self, resp, post): + """Extract individual URL data from the response""" + if "thumb" in resp and resp["thumb"] and "original" in resp["thumb"]: + post["content_filename"] = "" + post["content_category"] = "thumb" + post["file_id"] = "thumb" + yield resp["thumb"]["original"], post + + for content in resp["post_contents"]: + post["content_category"] = content["category"] + post["content_title"] = content["title"] + post["content_filename"] = content.get("filename", "") + post["content_id"] = content["id"] + if "post_content_photos" in content: + for photo in content["post_content_photos"]: + post["file_id"] = photo["id"] + yield photo["url"]["original"], post + if "download_uri" in content: + post["file_id"] = content["id"] + yield self.root+"/"+content["download_uri"], post + + +class FantiaCreatorExtractor(FantiaExtractor): + """Extractor for a Fantia creator's works""" + subcategory = "creator" + pattern = r"(?:https?://)?(?:www\.)?fantia\.jp/fanclubs/(\d+)" + test = ( + ("https://fantia.jp/fanclubs/6939", { + "range": "1-25", + "count": ">= 25", + "keyword": { + "fanclub_user_id" : 52152, + "tags" : list, + "title" : str, + }, + }), + ) + + def __init__(self, match): + FantiaExtractor.__init__(self, match) + self.creator_id = match.group(1) + + def posts(self): + url = "{}/fanclubs/{}/posts".format(self.root, self.creator_id) + return self._pagination(url) + + +class FantiaPostExtractor(FantiaExtractor): + """Extractor for media from a single Fantia post""" + subcategory = "post" + pattern = r"(?:https?://)?(?:www\.)?fantia\.jp/posts/(\d+)" + test = ( + ("https://fantia.jp/posts/508363", { + "count": 6, + "keyword": { + "post_title": "zunda逆バニーでおしりコッショリ", + "tags": list, + "rating": "adult", + "post_id": 508363 + }, + }), + ) + + def __init__(self, match): + FantiaExtractor.__init__(self, match) + self.post_id = match.group(1) + + def posts(self): + return (self.post_id,) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 0042676..863cead 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -91,10 +91,43 @@ class GelbooruPostExtractor(GelbooruBase, "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", "count": 1, }), + ("https://gelbooru.com/index.php?page=post&s=view&id=6018318", { + "options": (("tags", True),), + "content": "977caf22f27c72a5d07ea4d4d9719acdab810991", + "keyword": { + "tags_artist": "kirisaki_shuusei", + "tags_character": str, + "tags_copyright": "vocaloid", + "tags_general": str, + "tags_metadata": str, + }, + }), # video ("https://gelbooru.com/index.php?page=post&s=view&id=5938076", { "content": "6360452fa8c2f0c1137749e81471238564df832a", "pattern": r"https://img\d\.gelbooru\.com/images" r"/22/61/226111273615049235b001b381707bd0\.webm", }), + # notes + ("https://gelbooru.com/index.php?page=post&s=view&id=5997331", { + "options": (("notes", True),), + "keywords": { + "notes": [ + { + "height": 553, + "body": "Look over this way when you talk~", + "width": 246, + "x": 35, + "y": 72 + }, + { + "height": 557, + "body": "Hey~\nAre you listening~?", + "width": 246, + "x": 1233, + "y": 109 + } + ] + } + }), ) diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 51fb478..1b877b3 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -47,6 +47,8 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.root, post["id"]) page = self.request(url).text html = text.extract(page, '<ul id="tag-', '</ul>')[0] + if not html: + html = text.extract(page, '<ul class="tag-', '</ul>')[0] if html: tags = collections.defaultdict(list) pattern = re.compile( @@ -55,6 +57,31 @@ class GelbooruV02Extractor(booru.BooruExtractor): tags[tag_type].append(text.unquote(tag_name)) for key, value in tags.items(): post["tags_" + key] = " ".join(value) + return page + + def _notes(self, post, page=None): + if not page: + url = "{}/index.php?page=post&s=view&id={}".format( + self.root, post["id"]) + page = self.request(url).text + notes = [] + notes_data = text.extract(page, '<section id="notes"', '</section>')[0] + if not notes_data: + return + + note_iter = text.extract_iter(notes_data, '<article', '</article>') + extr = text.extract + for note_data in note_iter: + note = { + "width": int(extr(note_data, 'data-width="', '"')[0]), + "height": int(extr(note_data, 'data-height="', '"')[0]), + "x": int(extr(note_data, 'data-x="', '"')[0]), + "y": int(extr(note_data, 'data-y="', '"')[0]), + "body": extr(note_data, 'data-body="', '"')[0], + } + notes.append(note) + + post["notes"] = notes BASE_PATTERN = GelbooruV02Extractor.update({ diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py new file mode 100644 index 0000000..7dd047c --- /dev/null +++ b/gallery_dl/extractor/hentaicosplays.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hentai-cosplays.com/ +(also works for hentai-img.com and porn-images-xxx.com)""" + +from .common import GalleryExtractor +from .. import text + + +class HentaicosplaysGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from + hentai-cosplays.com, hentai-img.com, and porn-images-xxx.com""" + category = "hentaicosplays" + directory_fmt = ("{site}", "{title}") + filename_fmt = "{filename}.{extension}" + archive_fmt = "{title}_{filename}" + pattern = r"((?:https?://)?(?:\w{2}\.)?" \ + r"(hentai-cosplays|hentai-img|porn-images-xxx)\.com)/" \ + r"(?:image|story)/([\w-]+)" + test = ( + ("https://hentai-cosplays.com/image/---devilism--tide-kurihara-/", { + "pattern": r"https://static\d?.hentai-cosplays.com/upload/" + r"\d+/\d+/\d+/\d+.jpg$", + "keyword": { + "count": 18, + "site": "hentai-cosplays", + "slug": "---devilism--tide-kurihara-", + "title": "艦 こ れ-devilism の tide Kurihara 憂", + }, + }), + ("https://fr.porn-images-xxx.com/image/enako-enako-24/", { + "pattern": r"https://static\d?.porn-images-xxx.com/upload/" + r"\d+/\d+/\d+/\d+.jpg$", + "keyword": { + "count": 11, + "site": "porn-images-xxx", + "title": str, + }, + }), + ("https://ja.hentai-img.com/image/hollow-cora-502/", { + "pattern": r"https://static\d?.hentai-img.com/upload/" + r"\d+/\d+/\d+/\d+.jpg$", + "keyword": { + "count": 2, + "site": "hentai-img", + "title": str, + }, + }), + ) + + def __init__(self, match): + root, self.site, self.slug = match.groups() + self.root = text.ensure_http_scheme(root) + url = "{}/story/{}/".format(self.root, self.slug) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + title = text.extract(page, "<title>", "</title>")[0] + return { + "title": text.unescape(title.rpartition(" Story Viewer - ")[0]), + "slug" : self.slug, + "site" : self.site, + } + + def images(self, page): + return [ + (url, None) + for url in text.extract_iter( + page, '<amp-img class="auto-style" src="', '"') + ] diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 74c6197..a027be1 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -161,13 +161,18 @@ class InstagramExtractor(Extractor): } def _parse_post_graphql(self, post): + typename = post["__typename"] if post.get("is_video") and "video_url" not in post: url = "{}/tv/{}/".format(self.root, post["shortcode"]) post = self._extract_post_page(url) + elif typename == "GraphSidecar" and \ + "edge_sidecar_to_children" not in post: + url = "{}/p/{}/".format(self.root, post["shortcode"]) + post = self._extract_post_page(url) owner = post["owner"] data = { - "typename" : post["__typename"], + "typename" : typename, "date" : text.parse_timestamp(post["taken_at_timestamp"]), "likes" : post["edge_media_preview_like"]["count"], "owner_id" : owner["id"], @@ -328,7 +333,7 @@ class InstagramExtractor(Extractor): def _get_edge_data(self, user, key): cursor = self.config("cursor") - if cursor: + if cursor or not key: return { "edges" : (), "page_info": { @@ -386,6 +391,7 @@ class InstagramUserExtractor(InstagramExtractor): (InstagramPostsExtractor , base + "posts/"), (InstagramReelsExtractor , base + "reels/"), (InstagramChannelExtractor , base + "channel/"), + (InstagramTaggedExtractor , base + "tagged/"), ), ("posts",)) @@ -402,12 +408,31 @@ class InstagramPostsExtractor(InstagramExtractor): url = "{}/{}/".format(self.root, self.item) user = self._extract_profile_page(url) - query_hash = "003056d32c2554def87228bc3fd9668a" + query_hash = "42d2750e44dbac713ff30130659cd891" variables = {"id": user["id"], "first": 50} edge = self._get_edge_data(user, "edge_owner_to_timeline_media") return self._pagination_graphql(query_hash, variables, edge) +class InstagramTaggedExtractor(InstagramExtractor): + """Extractor for ProfilePage tagged posts""" + subcategory = "tagged" + pattern = USER_PATTERN + r"/tagged" + test = ("https://www.instagram.com/instagram/tagged/", { + "range": "1-16", + "count": ">= 16", + }) + + def posts(self): + url = "{}/{}/".format(self.root, self.item) + user = self._extract_profile_page(url) + + query_hash = "31fe64d9463cbbe58319dced405c6206" + variables = {"id": user["id"], "first": 50} + edge = self._get_edge_data(user, None) + return self._pagination_graphql(query_hash, variables, edge) + + class InstagramChannelExtractor(InstagramExtractor): """Extractor for ProfilePage channel""" subcategory = "channel" @@ -588,7 +613,7 @@ class InstagramPostExtractor(InstagramExtractor): ) def posts(self): - query_hash = "2c4c2e343a8f64c625ba02b2aa12c7f8" + query_hash = "cf28bf5eb45d62d4dc8e77cdb99d750d" variables = { "shortcode" : self.item, "child_comment_count" : 3, diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index 6e5aec9..21ed3c7 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -60,7 +60,7 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): ) def metadata(self, page): - info = text.extract(page, "<title>", " – Komikcast<")[0] + info = text.extract(page, "<title>", " – Komikcast<")[0] return self.parse_chapter_string(info) @staticmethod @@ -100,7 +100,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): @staticmethod def metadata(page): """Return a dict with general metadata""" - manga , pos = text.extract(page, "<title>" , " – Komikcast<") + manga , pos = text.extract(page, "<title>" , " – Komikcast<") genres, pos = text.extract( page, 'class="komik_info-content-genre">', "</span>", pos) author, pos = text.extract(page, ">Author:", "</span>", pos) diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index 143d00d..852c49f 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2020 Mike Fährmann +# Copyright 2016-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -101,9 +101,6 @@ class LusciousAlbumExtractor(LusciousExtractor): "number_of_favorites": int, }, }), - ("https://luscious.net/albums/virgin-killer-sweater_282582/", { - "url": "0be0cc279be1de99f727764819e03435e2a79915", - }), ("https://luscious.net/albums/not-found_277035/", { "exception": exception.NotFoundError, }), diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index 882031b..f8e1473 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -92,7 +92,7 @@ class ManganeloMangaExtractor(ManganeloBase, MangaExtractor): r"(/(?:manga/|read_)\w+)") test = ( ("https://manganelo.com/manga/ol921234", { - "url": "8a1810edddbafcde993ecb3558a35c99d8d4f13e", + "url": "6ba7f083a6944e414ad8214b74a0a40cb60d4562", }), ("https://manganelo.com/manga/read_otome_no_teikoku", { "pattern": ManganeloChapterExtractor.pattern, diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 0a6fba4..558e682 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -18,6 +18,7 @@ class MangaparkBase(): """Base class for mangapark extractors""" category = "mangapark" root_fmt = "https://mangapark.{}" + browser = "firefox" @staticmethod def parse_chapter_path(path, data): @@ -65,7 +66,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): (("https://mangapark.net/manga" "/gekkan-shoujo-nozaki-kun/i2067426/v7/c70/1"), { "count": 15, - "keyword": "edc14993c4752cee3a76e09b2f024d40d854bfd1", + "keyword": "8f18f1c977ebe049ef35e3a877eaaab97fb25274", }), ("https://mangapark.me/manga/gosu/i811615/c55/1"), ("https://mangapark.com/manga/gosu/i811615/c55/1"), @@ -120,8 +121,8 @@ class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): r"(/manga/[^/?#]+)/?$") test = ( ("https://mangapark.net/manga/aria", { - "url": "9b62883c25c8de471f8ab43651e1448536c4ce3f", - "keyword": "eb4a9b273c69acf31efa731eba713e1cfa14bab6", + "url": "f07caf0bc5097c9b32c8c0d6f446bce1bf4bd329", + "keyword": "2c0d28efaf84fcfe62932b6931ef3c3987cd48c0", }), ("https://mangapark.me/manga/aria"), ("https://mangapark.com/manga/aria"), diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py index abb937f..5c202f3 100644 --- a/gallery_dl/extractor/myportfolio.py +++ b/gallery_dl/extractor/myportfolio.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -51,9 +51,11 @@ class MyportfolioGalleryExtractor(Extractor): self.prefix = "myportfolio:" if domain1 else "" def items(self): - yield Message.Version, 1 url = "https://" + self.domain + (self.path or "") - page = self.request(url).text + response = self.request(url) + if response.history and response.url.endswith(".adobe.com/missing"): + raise exception.NotFoundError() + page = response.text projects = text.extract( page, '<section class="project-covers', '</section>')[0] @@ -78,12 +80,12 @@ class MyportfolioGalleryExtractor(Extractor): # <user> and <title> can contain a "-" as well, so we get the title # from somewhere else and cut that amount from the og:title content - user, pos = text.extract( - page, 'property=og:title content="', '"') - desc, pos = text.extract( - page, 'property=og:description content="', '"', pos) - title, pos = text.extract( - page, '<h1 ', '</h1>', pos) + extr = text.extract_from(page) + user = extr('property="og:title" content="', '"') or \ + extr('property=og:title content="', '"') + descr = extr('property="og:description" content="', '"') or \ + extr('property=og:description content="', '"') + title = extr('<h1 ', '</h1>') if title: title = title.partition(">")[2] @@ -96,7 +98,7 @@ class MyportfolioGalleryExtractor(Extractor): return { "user": text.unescape(user), "title": text.unescape(title), - "description": text.unescape(desc or ""), + "description": text.unescape(descr), } @staticmethod diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py index db15572..1da3e49 100644 --- a/gallery_dl/extractor/naverwebtoon.py +++ b/gallery_dl/extractor/naverwebtoon.py @@ -8,27 +8,24 @@ """Extractors for https://comic.naver.com/""" -from .common import Extractor, Message -from .. import exception, text +from .common import GalleryExtractor, Extractor, Message +from .. import text BASE_PATTERN = r"(?:https?://)?comic\.naver\.com/webtoon" -class NaverwebtoonExtractor(Extractor): +class NaverwebtoonBase(): + """Base class for naver webtoon extractors""" category = "naverwebtoon" root = "https://comic.naver.com" - def __init__(self, match): - Extractor.__init__(self, match) - self.query = match.group(1) - -class NaverwebtoonEpisodeExtractor(NaverwebtoonExtractor): +class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor): subcategory = "episode" directory_fmt = ("{category}", "{comic}") filename_fmt = "{episode:>03}-{num:>02}.{extension}" archive_fmt = "{title_id}_{episode}_{num}" - pattern = (BASE_PATTERN + r"/detail\.nhn\?([^#]+)") + pattern = BASE_PATTERN + r"/detail\.nhn\?([^#]+)" test = ( (("https://comic.naver.com/webtoon/detail.nhn?" "titleId=26458&no=1&weekday=tue"), { @@ -39,52 +36,38 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonExtractor): ) def __init__(self, match): - NaverwebtoonExtractor.__init__(self, match) - query = text.parse_query(self.query) + query = match.group(1) + url = "{}/webtoon/detail.nhn?{}".format(self.root, query) + GalleryExtractor.__init__(self, match, url) + + query = text.parse_query(query) self.title_id = query.get("titleId") - if not self.title_id: - raise exception.NotFoundError("titleId") self.episode = query.get("no") - if not self.episode: - raise exception.NotFoundError("no") - - def items(self): - url = "{}/webtoon/detail.nhn?{}".format(self.root, self.query) - page = self.request(url).text - data = self.get_job_metadata(page) - - yield Message.Directory, data - for data["num"], url in enumerate(self.get_image_urls(page), 1): - yield Message.Url, url, text.nameext_from_url(url, data) - - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" - title, pos = text.extract(page, 'property="og:title" content="', '"') - comic, pos = text.extract(page, '<h2>', '<span', pos) - authors, pos = text.extract(page, 'class="wrt_nm">', '</span>', pos) - authors = authors.strip().split("/") - descr, pos = text.extract(page, '<p class="txt">', '</p>', pos) - genre, pos = text.extract(page, '<span class="genre">', '</span>', pos) - date, pos = text.extract(page, '<dd class="date">', '</dd>', pos) + def metadata(self, page): + extr = text.extract_from(page) return { - "title": title, - "comic": comic, - "authors": authors, - "description": descr, - "genre": genre, "title_id": self.title_id, - "episode": self.episode, - "date": date, + "episode" : self.episode, + "title" : extr('property="og:title" content="', '"'), + "comic" : extr('<h2>', '<span'), + "authors" : extr('class="wrt_nm">', '</span>').strip().split("/"), + "description": extr('<p class="txt">', '</p>'), + "genre" : extr('<span class="genre">', '</span>'), + "date" : extr('<dd class="date">', '</dd>'), } @staticmethod - def get_image_urls(page): + def images(page): view_area = text.extract(page, 'id="comic_view_area"', '</div>')[0] - return text.extract_iter(view_area, '<img src="', '"') + return [ + (url, None) + for url in text.extract_iter(view_area, '<img src="', '"') + if "/static/" not in url + ] -class NaverwebtoonComicExtractor(NaverwebtoonExtractor): +class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor): subcategory = "comic" categorytransfer = True pattern = (BASE_PATTERN + r"/list\.nhn\?([^#]+)") @@ -96,12 +79,10 @@ class NaverwebtoonComicExtractor(NaverwebtoonExtractor): ) def __init__(self, match): - NaverwebtoonExtractor.__init__(self, match) - query = text.parse_query(self.query) + Extractor.__init__(self, match) + query = text.parse_query(match.group(1)) self.title_id = query.get("titleId") - if not self.title_id: - raise exception.NotFoundError("titleId") - self.page_no = text.parse_int(query.get("page", 1)) + self.page_no = text.parse_int(query.get("page"), 1) def items(self): url = self.root + "/webtoon/list.nhn" diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index f3c5ac2..3cfcb0e 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -107,11 +107,11 @@ class PhilomenaPostExtractor(PhilomenaExtractor): "source_url": "https://www.deviantart.com/speccysy/art" "/Afternoon-Flight-215193985", "spoilered": False, - "tag_count": 36, + "tag_count": 37, "tag_ids": list, "tags": list, "thumbnails_generated": True, - "updated_at": "2020-05-28T13:14:07Z", + "updated_at": "2021-04-07T06:01:30Z", "uploader": "Clover the Clever", "uploader_id": 211188, "upvotes": int, diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py index 38f94e0..45ce7f8 100644 --- a/gallery_dl/extractor/piczel.py +++ b/gallery_dl/extractor/piczel.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2020 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -117,7 +117,6 @@ class PiczelImageExtractor(PiczelExtractor): "description": None, "extension": "png", "favorites_count": int, - "folder": dict, "folder_id": 1113, "id": 7807, "is_flash": False, diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index ebbce67..8bfae06 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -46,6 +46,10 @@ class PixivExtractor(Extractor): del work["image_urls"] del work["meta_pages"] work["num"] = 0 + if self.translated_tags: + work["untranslated_tags"] = [ + tag["name"] for tag in work["tags"] + ] work["tags"] = [tag[tkey] or tag["name"] for tag in work["tags"]] work["date"] = text.parse_datetime(work["create_date"]) work["rating"] = ratings.get(work["x_restrict"]) diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py index 26a5cd9..49c24bc 100644 --- a/gallery_dl/extractor/pururin.py +++ b/gallery_dl/extractor/pururin.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -29,10 +29,10 @@ class PururinGalleryExtractor(GalleryExtractor): "artist" : ["Shoda Norihiro"], "group" : ["Obsidian Order"], "parody" : ["Kantai Collection"], - "characters": ["Admiral", "Iowa"], + "characters": ["Iowa", "Teitoku"], "tags" : list, "type" : "Doujinshi", - "collection": "", + "collection": "I owant you!", "convention": "C92", "rating" : float, "uploader" : "demo", diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index ea5bb6d..5579017 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -147,9 +147,9 @@ class SankakuPostExtractor(SankakuExtractor): "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", "options": (("tags", True),), "keyword": { - "tags_artist": ["bonocho"], - "tags_studio": ["dc_comics"], - "tags_medium": ["sketch", "copyright_name"], + "tags_artist" : ["bonocho"], + "tags_studio" : ["dc_comics"], + "tags_medium" : list, "tags_copyright": list, "tags_character": list, "tags_general" : list, diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index ba1ab08..1bc353a 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -58,6 +58,9 @@ BASE_PATTERN = ShopifyExtractor.update({ "root": "https://www.fashionnova.com", "pattern": r"(?:www\.)?fashionnova\.com", }, + "omgmiamiswimwear": { + "root": "https://www.omgmiamiswimwear.com" + }, }) @@ -74,6 +77,7 @@ class ShopifyCollectionExtractor(ShopifyExtractor): }), ("https://www.fashionnova.com/collections/mini-dresses/?page=1"), ("https://www.fashionnova.com/collections/mini-dresses#1"), + ("https://www.omgmiamiswimwear.com/collections/fajas"), ) def metadata(self): @@ -120,6 +124,10 @@ class ShopifyProductExtractor(ShopifyExtractor): "pattern": r"https?://cdn\d*\.shopify.com/", "count": 3, }), + ("https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", { + "pattern": r"https://cdn\.shopify\.com/s/files/1/1819/6171/", + "count": 5, + }), ("https://www.fashionnova.com/collections/flats/products/name"), ) diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index 7301cbc..e1b14ef 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -16,9 +16,9 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): """Extractor for image galleries from simply-hentai.com""" category = "simplyhentai" archive_fmt = "{image_id}" - pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com" + pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.)?simply-hentai\.com" r"(?!/(?:album|gifs?|images?|series)(?:/|$))" - r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?#]+)+)") + r"((?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?#]+)+)") test = ( (("https://original-work.simply-hentai.com" "/amazon-no-hiyaku-amazon-elixir"), { @@ -35,7 +35,10 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): ) def __init__(self, match): - url = "https://" + match.group(1) + subdomain, path = match.groups() + if subdomain and subdomain not in ("www.", "old."): + path = "/" + subdomain.rstrip(".") + path + url = "https://old.simply-hentai.com" + path GalleryExtractor.__init__(self, match, url) self.session.headers["Referer"] = url @@ -43,7 +46,6 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): extr = text.extract_from(page) split = text.split_html - self.gallery_url = extr('<link rel="canonical" href="', '"') title = extr('<meta property="og:title" content="', '"') image = extr('<meta property="og:image" content="', '"') if not title: @@ -99,7 +101,7 @@ class SimplyhentaiImageExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.page_url = "https://www." + match.group(1) + self.page_url = "https://old." + match.group(1) self.type = match.group(2) def items(self): diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index 0b970cc..15dbb85 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.slideshare.net/""" +"""Extractors for https://www.slideshare.net/""" from .common import Extractor, Message from .. import text @@ -58,15 +58,16 @@ class SlidesharePresentationExtractor(Extractor): """Collect metadata for extractor-job""" descr, pos = text.extract( page, '<meta name="description" content="', '"') - title, pos = text.extract( - page, '<span class="j-title-breadcrumb">', '</span>', pos) + category, pos = text.extract( + page, '<div class="metadata-item">', '</div>', pos) views, pos = text.extract( - page, '<span class="notranslate">', 'views<', pos) + page, '<div class="metadata-item">', '</div>', pos) published, pos = text.extract( - page, '<time datetime="', '"', pos) + page, '<div class="metadata-item">', '</div>', pos) + title, pos = text.extract( + page, '<span class="j-title-breadcrumb">', '</span>', pos) alt_descr, pos = text.extract( - page, 'id="slideshow-description-paragraph" class="notranslate">', - '</p>', pos) + page, '<p class="slideshow-description notranslate">', '</p>', pos) if descr.endswith("…") and alt_descr: descr = text.remove_html(alt_descr).strip() @@ -76,8 +77,9 @@ class SlidesharePresentationExtractor(Extractor): "presentation": self.presentation, "title": text.unescape(title.strip()), "description": text.unescape(descr), - "views": text.parse_int(views.replace(",", "")), - "published": published, + "views": text.parse_int(views.rpartition( + " views")[0].replace(",", "")), + "published": published.strip(), } @staticmethod diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index cfbd5eb..02cf832 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2020 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -71,7 +71,7 @@ class SmugmugAlbumExtractor(SmugmugExtractor): pattern = r"smugmug:album:([^:]+)$" test = ( ("smugmug:album:cr4C7f", { - "url": "1436ee98d5797b308ecce5862e4885944f59c03c", + "url": "2c2e576e47d4e9ce60b44871f08a8c66b5ebaceb", }), # empty ("smugmug:album:Fb7hMs", { @@ -111,8 +111,8 @@ class SmugmugImageExtractor(SmugmugExtractor): pattern = BASE_PATTERN + r"(?:/[^/?#]+)+/i-([^/?#-]+)" test = ( ("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", { - "url": "f624ad7293afd6412a7d34e3950a118596c36c85", - "keyword": "d69c69c1517b8ea77bc763cffc4d0a4002dfee3f", + "url": "e6408fd2c64e721fd146130dceb56a971ceb4259", + "keyword": "05c8d50aa6ea08d458f83c38d7f9e92148362f0e", "content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0", }), # video diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py index 886353f..d13ce0f 100644 --- a/gallery_dl/extractor/unsplash.py +++ b/gallery_dl/extractor/unsplash.py @@ -69,7 +69,7 @@ class UnsplashImageExtractor(UnsplashExtractor): subcategory = "image" pattern = BASE_PATTERN + r"/photos/([^/?#]+)" test = ("https://unsplash.com/photos/lsoogGC_5dg", { - "url": "ac9d194f58b3fc9aacdfc9784c1b69868f212b6e", + "url": "b99a5829ca955b768a206aa9afc391bd3f3dd55e", "keyword": { "alt_description": "re:silhouette of trees near body of water ", "blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz", @@ -190,7 +190,7 @@ class UnsplashSearchExtractor(UnsplashExtractor): subcategory = "search" pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^/?#]+))?" test = ("https://unsplash.com/s/photos/nature", { - "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+" + "pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+" r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", "range": "1-30", "count": 30, diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 1a26264..cebb421 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -6,36 +6,38 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.webtoons.com/""" +"""Extractors for https://www.webtoons.com/""" -from .common import Extractor, Message +from .common import GalleryExtractor, Extractor, Message from .. import exception, text, util BASE_PATTERN = r"(?:https?://)?(?:www\.)?webtoons\.com/((en|fr)" -class WebtoonsExtractor(Extractor): +class WebtoonsBase(): category = "webtoons" root = "https://www.webtoons.com" - cookiedomain = "www.webtoons.com" - - def __init__(self, match): - Extractor.__init__(self, match) - self.path, self.lang, self.genre , self.comic, self.query = \ - match.groups() - cookies = self.session.cookies - cookies.set("pagGDPR", "true", domain=self.cookiedomain) - cookies.set("ageGatePass", "true", domain=self.cookiedomain) + cookiedomain = ".webtoons.com" + + def setup_agegate_cookies(self): + self._update_cookies({ + "atGDPR" : "AD_CONSENT", + "needCCPA" : "false", + "needCOPPA" : "false", + "needGDPR" : "false", + "pagGDPR" : "true", + "ageGatePass": "true", + }) def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) - if response.history and "/ageGate" in response.request.url: + if response.history and "/ageGate" in response.url: raise exception.StopExtraction( - "Redirected to age gate check ('%s')", response.request.url) + "HTTP redirect to age gate check ('%s')", response.request.url) return response -class WebtoonsEpisodeExtractor(WebtoonsExtractor): +class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): """Extractor for an episode on webtoons.com""" subcategory = "episode" directory_fmt = ("{category}", "{comic}") @@ -55,54 +57,44 @@ class WebtoonsEpisodeExtractor(WebtoonsExtractor): ) def __init__(self, match): - WebtoonsExtractor.__init__(self, match) - query = text.parse_query(self.query) - self.title_no = query.get("title_no") - if not self.title_no: - raise exception.NotFoundError("title_no") - self.episode = query.get("episode_no") - if not self.episode: - raise exception.NotFoundError("episode_no") + self.path, self.lang, self.genre, self.comic, query = match.groups() - def items(self): - url = "{}/{}/viewer?{}".format(self.root, self.path, self.query) + url = "{}/{}/viewer?{}".format(self.root, self.path, query) + GalleryExtractor.__init__(self, match, url) + self.setup_agegate_cookies() self.session.headers["Referer"] = url - page = self.request(url).text - data = self.get_job_metadata(page) - imgs = self.get_image_urls(page) - data["count"] = len(imgs) - - yield Message.Version, 1 - yield Message.Directory, data - for data["num"], url in enumerate(imgs, 1): - yield Message.Url, url, text.nameext_from_url(url, data) + query = text.parse_query(query) + self.title_no = query.get("title_no") + self.episode = query.get("episode_no") - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" + def metadata(self, page): title, pos = text.extract( page, '<meta property="og:title" content="', '"') descr, pos = text.extract( page, '<meta property="og:description" content="', '"', pos) return { - "genre": self.genre, - "comic": self.comic, - "title_no": self.title_no, - "episode": self.episode, - "title": text.unescape(title), + "genre" : self.genre, + "comic" : self.comic, + "title_no" : self.title_no, + "episode" : self.episode, + "title" : text.unescape(title), "description": text.unescape(descr), - "lang": self.lang, - "language": util.code_to_language(self.lang), + "lang" : self.lang, + "language" : util.code_to_language(self.lang), } @staticmethod - def get_image_urls(page): - """Extract and return a list of all image urls""" - return list(text.extract_iter(page, 'class="_images" data-url="', '"')) + def images(page): + return [ + (url, None) + for url in text.extract_iter( + page, 'class="_images" data-url="', '"') + ] -class WebtoonsComicExtractor(WebtoonsExtractor): +class WebtoonsComicExtractor(WebtoonsBase, Extractor): """Extractor for an entire comic on webtoons.com""" subcategory = "comic" categorytransfer = True @@ -129,12 +121,13 @@ class WebtoonsComicExtractor(WebtoonsExtractor): ) def __init__(self, match): - WebtoonsExtractor.__init__(self, match) - query = text.parse_query(self.query) + Extractor.__init__(self, match) + self.setup_agegate_cookies() + + self.path, self.lang, self.genre, self.comic, query = match.groups() + query = text.parse_query(query) self.title_no = query.get("title_no") - if not self.title_no: - raise exception.NotFoundError("title_no") - self.page_no = int(query.get("page", 1)) + self.page_no = text.parse_int(query.get("page"), 1) def items(self): page = None diff --git a/gallery_dl/extractor/yuki.py b/gallery_dl/extractor/yuki.py deleted file mode 100644 index 72d7cad..0000000 --- a/gallery_dl/extractor/yuki.py +++ /dev/null @@ -1,125 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2018-2019 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extract images from https://yuki.la/""" - -from .common import Extractor, Message -from .. import text - - -class YukiThreadExtractor(Extractor): - """Extractor for images from threads on yuki.la""" - category = "yuki" - subcategory = "thread" - directory_fmt = ("{category}", "{board}", "{thread}{title:? - //}") - filename_fmt = "{time}-{filename}.{extension}" - archive_fmt = "{board}_{thread}_{tim}" - pattern = r"(?:https?://)?yuki\.la/([^/?#]+)/(\d+)" - test = ( - ("https://yuki.la/gd/309639", { - "url": "289e86c5caf673a2515ec5f5f521ac0ae7e189e9", - "keyword": "01cbe29ae207a5cb7556bcbd5ed481ecdaf32727", - "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573", - }), - ("https://yuki.la/a/159767162", { - "url": "cd94d0eb646d279c3b7efb9b7898888e5d44fa93", - "keyword": "7a4ff90e423c74bd3126fb65d13015decec2fa45", - }), - # old thread - missing board name in title and multi-line HTML - ("https://yuki.la/gif/6877752", { - "url": "3dbb2f8453490d002416c5fc2fe95b56c129faf9", - "keyword": "563ef4ae80134d845dddaed7ebe56f5fc41056be", - }), - # even older thread - no thread title - ("https://yuki.la/a/9357051", { - "url": "010560bf254bd485e48366c3531728bda4b22583", - "keyword": "7b736c41e307dcfcb84ef495f29299a6ddd06d67", - }), - ) - root = "https://yuki.la" - - def __init__(self, match): - Extractor.__init__(self, match) - self.board, self.thread = match.groups() - - def items(self): - url = "{}/{}/{}".format(self.root, self.board, self.thread) - page = self.request(url).text - data = self.get_metadata(page) - - yield Message.Version, 1 - yield Message.Directory, data - for post in self.posts(page): - if "image" in post: - for key in ("w", "h", "no", "time"): - post[key] = text.parse_int(post[key]) - post.update(data) - yield Message.Url, post["image"], post - - def get_metadata(self, page): - """Collect metadata for extractor-job""" - title = text.extract(page, "<title>", "</title>")[0] - try: - title, boardname, _ = title.rsplit(" - ", 2) - except ValueError: - title = boardname = "" - else: - title = title.partition(" - ")[2] - if not title: - title, boardname = boardname, "" - return { - "board": self.board, - "board_name": boardname, - "thread": text.parse_int(self.thread), - "title": text.unescape(title), - } - - def posts(self, page): - """Build a list of all post-objects""" - return [ - self.parse(post) for post in text.extract_iter( - page, '<div class="postContainer', '</blockquote>') - ] - - def parse(self, post): - """Build post-object by extracting data from an HTML post""" - data = self._extract_post(post) - if 'class="file"' in post: - self._extract_image(post, data) - part = data["image"].rpartition("/")[2] - data["tim"], _, data["extension"] = part.partition(".") - data["ext"] = "." + data["extension"] - return data - - @staticmethod - def _extract_post(post): - data, pos = text.extract_all(post, ( - ("no" , 'id="pc', '"'), - ("name", '<span class="name">', '</span>'), - ("time", 'data-utc="', '"'), - ("now" , '>', ' <'), - )) - data["com"] = text.unescape(text.remove_html( - post[post.index("<blockquote ", pos):].partition(">")[2])) - return data - - @staticmethod - def _extract_image(post, data): - text.extract_all(post, ( - (None , '>File:', ''), - ("fullname", '<a title="', '"'), - ("image" , 'href="', '"'), - ("filename", '>', '<'), - ("fsize" , '(', ', '), - ("w" , '', 'x'), - ("h" , '', ')'), - ), 0, data) - filename = data["fullname"] or data["filename"] - data["filename"] = text.unescape(filename.rpartition(".")[0]) - data["image"] = "https:" + data["image"] - del data["fullname"] diff --git a/gallery_dl/job.py b/gallery_dl/job.py index d3b4a90..99f61d8 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -57,6 +57,7 @@ class Job(): if pextr.config("category-transfer", pextr.categorytransfer): extr.category = pextr.category extr.subcategory = pextr.subcategory + extr._cfgpath = pextr._cfgpath # transfer parent directory extr._parentdir = pextr._parentdir @@ -575,7 +576,11 @@ class UrlJob(Job): self.handle_queue = self.handle_url @staticmethod - def handle_url(url, kwdict): + def handle_url(url, _): + print(url) + + @staticmethod + def handle_url_fallback(url, kwdict): print(url) if "_fallback" in kwdict: for url in kwdict["_fallback"]: @@ -604,6 +609,7 @@ class InfoJob(Job): pc("Filename format", "filename", ex.filename_fmt) pc("Directory format", "directory", ex.directory_fmt) + pc("Archive format", "archive-format", ex.archive_fmt) pc("Request interval", "sleep-request", ex.request_interval) return 0 diff --git a/gallery_dl/version.py b/gallery_dl/version.py index b75f444..630da7d 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.17.2" +__version__ = "1.17.3" diff --git a/test/test_results.py b/test/test_results.py index 223ef57..ed6b2eb 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -312,7 +312,7 @@ def setup_test_config(): config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma") for category in ("danbooru", "instagram", "twitter", "subscribestar", - "e621", "inkbunny"): + "e621", "inkbunny", "tapas"): config.set(("extractor", category), "username", None) config.set(("extractor", "mastodon.social"), "access-token", |