diff options
author | Unit 193 <unit193@unit193.net> | 2021-01-11 03:25:49 -0500 |
---|---|---|
committer | Unit 193 <unit193@unit193.net> | 2021-01-11 03:25:49 -0500 |
commit | 99541ac9da24070cd241149ee54db4bf81d564b7 (patch) | |
tree | 073fefa95f7ddedf777db89bb237b51860c1f32f | |
parent | bc18b55e1dd21a4528241926df6302182404fb5a (diff) | |
parent | 6335711bbe769b6b9301a88d88790d7a2f8aa82e (diff) | |
download | gallery-dl-99541ac9da24070cd241149ee54db4bf81d564b7.tar.bz2 gallery-dl-99541ac9da24070cd241149ee54db4bf81d564b7.tar.xz gallery-dl-99541ac9da24070cd241149ee54db4bf81d564b7.tar.zst |
Update upstream source from tag 'upstream/1.16.3'
Update to upstream version '1.16.3'
with Debian dir 9f836edb40c3d6f8084f75a5c1e53b6a52613e4f
27 files changed, 660 insertions, 145 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 3531352..3b3060a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,26 @@ # Changelog +## 1.16.3 - 2021-01-10 + +## 1.16.2 - 2021-01-09 +### Additions +- [derpibooru] add `search` and `gallery` extractors ([#862](https://github.com/mikf/gallery-dl/issues/862)) +- [foolfuuka] add `board` and `search` extractors ([#1044](https://github.com/mikf/gallery-dl/issues/1044), [#1174](https://github.com/mikf/gallery-dl/issues/1174)) +- [gfycat] add `date` metadata field ([#1138](https://github.com/mikf/gallery-dl/issues/1138)) +- [pinterest] add support for getting all boards of a user ([#1205](https://github.com/mikf/gallery-dl/issues/1205)) +- [sankaku] add support for book searches ([#1204](https://github.com/mikf/gallery-dl/issues/1204)) +- [twitter] fetch media from pinned tweets ([#1203](https://github.com/mikf/gallery-dl/issues/1203)) +- [wikiart] add extractor for single paintings ([#1233](https://github.com/mikf/gallery-dl/issues/1233)) +- [downloader:http] add MIME type and signature for `.ico` files ([#1211](https://github.com/mikf/gallery-dl/issues/1211)) +- add a `d` format string conversion for timestamp values +- add `"ascii"` as a special `path-restrict` value +### Fixes +- [hentainexus] fix extraction ([#1234](https://github.com/mikf/gallery-dl/issues/1234)) +- [instagram] categorize single highlight URLs as `highlights` ([#1222](https://github.com/mikf/gallery-dl/issues/1222)) +- [redgifs] fix search results +- [twitter] fix login with username & password +- [twitter] fetch tweets from `homeConversation` entries + ## 1.16.1 - 2020-12-27 ### Additions - [instagram] add `include` option ([#1180](https://github.com/mikf/gallery-dl/issues/1180)) @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.16.1 +Version: 1.16.3 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.3/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.3/gallery-dl.bin>`__ These executables include a Python interpreter and all required Python packages. @@ -332,7 +332,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.1.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.3.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ @@ -83,8 +83,8 @@ Download a standalone executable file, put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.3/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.3/gallery-dl.bin>`__ These executables include a Python interpreter and all required Python packages. @@ -321,7 +321,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst -.. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.1.tar.gz +.. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.3.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl index 5194312..f134f63 100644 --- a/data/completion/_gallery-dl +++ b/data/completion/_gallery-dl @@ -15,6 +15,7 @@ _arguments -C -S \ {-q,--quiet}'[Activate quiet mode]' \ {-v,--verbose}'[Print various debugging information]' \ {-g,--get-urls}'[Print URLs instead of downloading]' \ +-G'[==SUPPRESS==]' \ {-j,--dump-json}'[Print JSON information]' \ {-s,--simulate}'[Simulate data extraction; do not download anything]' \ {-K,--list-keywords}'[Print a list of available keywords and example values for the given URLs]' \ diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index c3df997..a260907 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2020-12-27" "1.16.1" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2021-01-10" "1.16.3" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 40efa15..609d1de 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2020-12-27" "1.16.1" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2021-01-10" "1.16.3" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -186,6 +186,8 @@ depending on the local operating system * \f[I]"unix"\f[]: \f[I]"/"\f[] .br * \f[I]"windows"\f[]: \f[I]"\\\\\\\\|/<>:\\"?*"\f[] +.br +* \f[I]"ascii"\f[]: \f[I]"^0-9A-Za-z_."\f[] Note: In a string with 2 or more characters, \f[I][]^-\\\f[] need to be escaped with backslashes, e.g. \f[I]"\\\\[\\\\]"\f[] @@ -803,6 +805,37 @@ Controls the download target for Ugoira posts. * \f[I]false\f[]: Converted video files +.SS extractor.derpibooru.api-key +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Your \f[I]Derpibooru API Key\f[], +to use your account's browsing settings and filters. + + +.SS extractor.derpibooru.filter +.IP "Type:" 6 +\f[I]integer\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Example:" 4 +56027 (\f[I]Everything\f[] filter) + +.IP "Description:" 4 +The content filter ID to use. + +Setting an explicit filter ID overrides any default filters and can be used +to access 18+ content without \f[I]API Key\f[]. + +See \f[I]Filters\f[] for details. + + .SS extractor.deviantart.extra .IP "Type:" 6 \f[I]bool\f[] diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 7b2006e..2ce1d97 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.16.1 +Version: 1.16.3 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -94,8 +94,8 @@ Description: ========== put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__, and run it inside a command prompt (like ``cmd.exe``). - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.1/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.3/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.3/gallery-dl.bin>`__ These executables include a Python interpreter and all required Python packages. @@ -332,7 +332,7 @@ Description: ========== .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst - .. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.1.tar.gz + .. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.3.tar.gz .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz .. _Python: https://www.python.org/downloads/ diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 3b28345..cb025ff 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -51,6 +51,7 @@ gallery_dl/extractor/blogger.py gallery_dl/extractor/booru.py gallery_dl/extractor/common.py gallery_dl/extractor/danbooru.py +gallery_dl/extractor/derpibooru.py gallery_dl/extractor/deviantart.py gallery_dl/extractor/directlink.py gallery_dl/extractor/dynastyscans.py diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index b8546a8..179a552 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -283,7 +283,10 @@ MIME_TYPES = { "image/x-ms-bmp": "bmp", "image/webp" : "webp", "image/svg+xml" : "svg", - + "image/ico" : "ico", + "image/icon" : "ico", + "image/x-icon" : "ico", + "image/vnd.microsoft.icon" : "ico", "image/x-photoshop" : "psd", "application/x-photoshop" : "psd", "image/vnd.adobe.photoshop": "psd", @@ -314,7 +317,7 @@ MIME_TYPES = { "application/octet-stream": "bin", } -# taken from https://en.wikipedia.org/wiki/List_of_file_signatures +# https://en.wikipedia.org/wiki/List_of_file_signatures FILE_SIGNATURES = { "jpg" : b"\xFF\xD8\xFF", "png" : b"\x89PNG\r\n\x1A\n", @@ -322,6 +325,8 @@ FILE_SIGNATURES = { "bmp" : b"BM", "webp": b"RIFF", "svg" : b"<?xml", + "ico" : b"\x00\x00\x01\x00", + "cur" : b"\x00\x00\x02\x00", "psd" : b"8BPS", "webm": b"\x1A\x45\xDF\xA3", "ogg" : b"OggS", @@ -333,8 +338,7 @@ FILE_SIGNATURES = { "pdf" : b"%PDF-", "swf" : (b"CWS", b"FWS"), # check 'bin' files against all other file signatures - "bin" : b"\x00\x00\x00\x00", + "bin" : b"\x00\x00\x00\x00\x00\x00\x00\x00", } - __downloader__ = HttpDownloader diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index b38cddc..36107d9 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -24,6 +24,7 @@ modules = [ "behance", "blogger", "danbooru", + "derpibooru", "deviantart", "dynastyscans", "e621", diff --git a/gallery_dl/extractor/derpibooru.py b/gallery_dl/extractor/derpibooru.py new file mode 100644 index 0000000..3b20fa5 --- /dev/null +++ b/gallery_dl/extractor/derpibooru.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://derpibooru.org/""" + +from .booru import BooruExtractor +from .. import text, exception +import operator + +BASE_PATTERN = r"(?:https?://)?derpibooru\.org" + + +class DerpibooruExtractor(BooruExtractor): + """Base class for derpibooru extractors""" + category = "derpibooru" + filename_fmt = "{filename}.{extension}" + archive_fmt = "{id}" + root = "https://derpibooru.org" + request_interval = 1.0 + per_page = 50 + + _file_url = operator.itemgetter("view_url") + + @staticmethod + def _prepare(post): + post["date"] = text.parse_datetime( + post["created_at"], "%Y-%m-%dT%H:%M:%S") + + @staticmethod + def _extended_tags(post): + pass + + def _pagination(self, url, params): + params["page"] = 1 + params["per_page"] = self.per_page + + api_key = self.config("api-key") + if api_key: + params["key"] = api_key + + filter_id = self.config("filter") + if filter_id: + params["filter_id"] = filter_id + + while True: + data = self.request(url, params=params).json() + yield from data["images"] + + if len(data["images"]) < self.per_page: + return + params["page"] += 1 + + +class DerpibooruPostExtractor(DerpibooruExtractor): + """Extractor for single posts from derpibooru.org""" + subcategory = "post" + pattern = BASE_PATTERN + r"/images/(\d+)" + test = ("https://derpibooru.org/images/1", { + "content": "88449eeb0c4fa5d3583d0b794f6bc1d70bf7f889", + "count": 1, + "keyword": { + "animated": False, + "aspect_ratio": 1.0, + "comment_count": int, + "created_at": "2012-01-02T03:12:33", + "date": "dt:2012-01-02 03:12:33", + "deletion_reason": None, + "description": "", + "downvotes": int, + "duplicate_of": None, + "duration": 0.04, + "extension": "png", + "faves": int, + "first_seen_at": "2012-01-02T03:12:33", + "format": "png", + "height": 900, + "hidden_from_users": False, + "id": 1, + "mime_type": "image/png", + "name": "1__safe_fluttershy_solo_cloud_happy_flying_upvotes+galore" + "_artist-colon-speccysy_get_sunshine", + "orig_sha512_hash": None, + "processed": True, + "representations": dict, + "score": int, + "sha512_hash": "f16c98e2848c2f1bfff3985e8f1a54375cc49f78125391aeb8" + "0534ce011ead14e3e452a5c4bc98a66f56bdfcd07ef7800663" + "b994f3f343c572da5ecc22a9660f", + "size": 860914, + "source_url": "https://www.deviantart.com/speccysy/art" + "/Afternoon-Flight-215193985", + "spoilered": False, + "tag_count": 36, + "tag_ids": list, + "tags": list, + "thumbnails_generated": True, + "updated_at": "2020-05-28T13:14:07", + "uploader": "Clover the Clever", + "uploader_id": 211188, + "upvotes": int, + "view_url": str, + "width": 900, + "wilson_score": float, + }, + }) + + def __init__(self, match): + DerpibooruExtractor.__init__(self, match) + self.image_id = match.group(1) + + def posts(self): + url = self.root + "/api/v1/json/images/" + self.image_id + return (self.request(url).json()["image"],) + + +class DerpibooruSearchExtractor(DerpibooruExtractor): + """Extractor for search results on derpibooru.org""" + subcategory = "search" + directory_fmt = ("{category}", "{search_tags}") + pattern = BASE_PATTERN + r"/(?:search/?\?([^#]+)|tags/([^/?#]+))" + test = ( + ("https://derpibooru.org/search?q=cute", { + "range": "40-60", + "count": 21, + }), + ("https://derpibooru.org/tags/cute", { + "range": "40-60", + "count": 21, + }), + ) + + def __init__(self, match): + DerpibooruExtractor.__init__(self, match) + query, tags = match.groups() + self.params = text.parse_query(query) if query else {"q": tags} + + def metadata(self): + return {"search_tags": self.params.get("q", "")} + + def posts(self): + url = self.root + "/api/v1/json/search/images" + return self._pagination(url, self.params) + + +class DerpibooruGalleryExtractor(DerpibooruExtractor): + """Extractor for galleries on derpibooru.org""" + subcategory = "gallery" + directory_fmt = ("{category}", "galleries", + "{gallery[id]} {gallery[title]}") + pattern = BASE_PATTERN + r"/galleries/(\d+)" + test = ("https://derpibooru.org/galleries/1", { + "pattern": r"https://derpicdn\.net/img/view/\d+/\d+/\d+/\d+[^/]+$", + "keyword": { + "gallery": { + "description": "Indexes start at 1 :P", + "id": 1, + "spoiler_warning": "", + "thumbnail_id": 1, + "title": "The Very First Gallery", + "user": "DeliciousBlackInk", + "user_id": 365446, + }, + }, + }) + + def __init__(self, match): + DerpibooruExtractor.__init__(self, match) + self.gallery_id = match.group(1) + + def metadata(self): + url = self.root + "/api/v1/json/search/galleries" + params = {"q": "id:" + self.gallery_id} + galleries = self.request(url, params=params).json()["galleries"] + if not galleries: + raise exception.NotFoundError("gallery") + return {"gallery": galleries[0]} + + def posts(self): + gallery_id = "gallery_id:" + self.gallery_id + url = self.root + "/api/v1/json/search/images" + params = {"sd": "desc", "sf": gallery_id, "q" : gallery_id} + return self._pagination(url, params) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 456a173..a9c63a9 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -588,10 +588,13 @@ class DeviantartFavoriteExtractor(DeviantartExtractor): def deviations(self): folders = self.api.collections_folders(self.user) if self.flat: - return itertools.chain.from_iterable( + deviations = itertools.chain.from_iterable( self.api.collections(self.user, folder["folderid"]) for folder in folders ) + if self.offset: + deviations = util.advance(deviations, self.offset) + return deviations return self._folder_urls( folders, "favourites", DeviantartCollectionExtractor) diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 8a03dc9..81f2bc2 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,37 +11,26 @@ from .common import Extractor, Message, generate_extractors from .. import text import itertools -import operator -class FoolfuukaThreadExtractor(Extractor): +class FoolfuukaExtractor(Extractor): """Base extractor for FoolFuuka based boards/archives""" basecategory = "foolfuuka" - subcategory = "thread" - directory_fmt = ("{category}", "{board[shortname]}", - "{thread_num}{title:? - //}") archive_fmt = "{board[shortname]}_{num}_{timestamp}" - pattern_fmt = r"/([^/]+)/thread/(\d+)" external = "default" def __init__(self, match): Extractor.__init__(self, match) - self.board, self.thread = match.groups() self.session.headers["Referer"] = self.root if self.external == "direct": self.remote = self._remote_direct def items(self): - op = True - yield Message.Version, 1 + yield Message.Directory, self.metadata() for post in self.posts(): - if op: - yield Message.Directory, post - op = False - if not post["media"]: - continue - media = post["media"] + if not media: + continue url = media["media_link"] if not url and "remote_media_link" in media: @@ -53,17 +42,11 @@ class FoolfuukaThreadExtractor(Extractor): media["media"].rpartition(".") yield Message.Url, url, post - def posts(self): - """Return an iterable with all posts in this thread""" - url = self.root + "/_/api/chan/thread/" - params = {"board": self.board, "num": self.thread} - data = self.request(url, params=params).json()[self.thread] - - # sort post-objects by key - posts = sorted(data.get("posts", {}).items()) - posts = map(operator.itemgetter(1), posts) + def metadata(self): + """ """ - return itertools.chain((data["op"],), posts) + def posts(self): + """Return an iterable with all relevant posts""" def remote(self, media): """Resolve a remote media link""" @@ -76,6 +59,117 @@ class FoolfuukaThreadExtractor(Extractor): return media["remote_media_link"] +class FoolfuukaThreadExtractor(FoolfuukaExtractor): + """Base extractor for threads on FoolFuuka based boards/archives""" + subcategory = "thread" + directory_fmt = ("{category}", "{board[shortname]}", + "{thread_num}{title:? - //}") + pattern_fmt = r"/([^/?#]+)/thread/(\d+)" + + def __init__(self, match): + FoolfuukaExtractor.__init__(self, match) + self.board, self.thread = match.groups() + self.data = None + + def metadata(self): + url = self.root + "/_/api/chan/thread/" + params = {"board": self.board, "num": self.thread} + self.data = self.request(url, params=params).json()[self.thread] + return self.data["op"] + + def posts(self): + posts = self.data.get("posts") + if posts: + posts = list(posts.values()) + posts.sort(key=lambda p: p["timestamp"]) + else: + posts = () + return itertools.chain((self.data["op"],), posts) + + +class FoolfuukaBoardExtractor(FoolfuukaExtractor): + """Base extractor for FoolFuuka based boards/archives""" + subcategory = "board" + pattern_fmt = r"/([^/?#]+)/\d*$" + + def __init__(self, match): + FoolfuukaExtractor.__init__(self, match) + self.board = match.group(1) + + def items(self): + index_base = "{}/_/api/chan/index/?board={}&page=".format( + self.root, self.board) + thread_base = "{}/{}/thread/".format(self.root, self.board) + + for page in itertools.count(1): + with self.request(index_base + format(page)) as response: + try: + threads = response.json() + except ValueError: + threads = None + + if not threads: + return + + for num, thread in threads.items(): + thread["url"] = thread_base + format(num) + thread["_extractor"] = self.childclass + yield Message.Queue, thread["url"], thread + + +class FoolfuukaSearchExtractor(FoolfuukaExtractor): + """Base extractor for search results on FoolFuuka based boards/archives""" + subcategory = "search" + directory_fmt = ("{category}", "search", "{search}") + pattern_fmt = r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)" + request_interval = 1.0 + + def __init__(self, match): + FoolfuukaExtractor.__init__(self, match) + board, search = match.groups() + + self.params = params = {} + args = search.split("/") + key = None + + for arg in args: + if key: + params[key] = text.unescape(arg) + key = None + else: + key = arg + if board != "_": + params["boards"] = board + + def metadata(self): + return {"search": self.params.get("text", "")} + + def posts(self): + url = self.root + "/_/api/chan/search/" + params = self.params.copy() + params["page"] = text.parse_int(params.get("page"), 1) + if "filter" not in params: + params["filter"] = "text" + + while True: + try: + data = self.request(url, params=params).json() + except ValueError: + return + + if isinstance(data, dict): + if data.get("error"): + return + posts = data["0"]["posts"] + elif isinstance(data, list): + posts = data[0]["posts"] + else: + return + + yield from posts + params["page"] += 1 + + EXTRACTORS = { "4plebs": { "name": "_4plebs", @@ -84,6 +178,8 @@ EXTRACTORS = { "test-thread": ("https://archive.4plebs.org/tg/thread/54059290", { "url": "07452944164b602502b02b24521f8cee5c484d2a", }), + "test-board": ("https://archive.4plebs.org/tg/",), + "test-search": ("https://archive.4plebs.org/_/search/text/test/",), }, "archivedmoe": { "root": "https://archived.moe", @@ -96,6 +192,8 @@ EXTRACTORS = { "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87", }), ), + "test-board": ("https://archived.moe/gd/",), + "test-search": ("https://archived.moe/_/search/text/test/",), }, "archiveofsins": { "root": "https://archiveofsins.com", @@ -104,6 +202,8 @@ EXTRACTORS = { "url": "f612d287087e10a228ef69517cf811539db9a102", "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4", }), + "test-board": ("https://archiveofsins.com/h/",), + "test-search": ("https://archiveofsins.com/_/search/text/test/",), }, "b4k": { "root": "https://arch.b4k.co", @@ -111,18 +211,24 @@ EXTRACTORS = { "test-thread": ("https://arch.b4k.co/meta/thread/196/", { "url": "d309713d2f838797096b3e9cb44fe514a9c9d07a", }), + "test-board": ("https://arch.b4k.co/meta/",), + "test-search": ("https://arch.b4k.co/_/search/text/test/",), }, "desuarchive": { "root": "https://desuarchive.org", "test-thread": ("https://desuarchive.org/a/thread/159542679/", { "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406", }), + "test-board": ("https://desuarchive.org/a/",), + "test-search": ("https://desuarchive.org/_/search/text/test/",), }, "fireden": { "root": "https://boards.fireden.net", "test-thread": ("https://boards.fireden.net/sci/thread/11264294/", { "url": "3adfe181ee86a8c23021c705f623b3657a9b0a43", }), + "test-board": ("https://boards.fireden.net/sci/",), + "test-search": ("https://boards.fireden.net/_/search/text/test/",), }, "nyafuu": { "root": "https://archive.nyafuu.org", @@ -130,6 +236,8 @@ EXTRACTORS = { "test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", { "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f", }), + "test-board": ("https://archive.nyafuu.org/c/",), + "test-search": ("https://archive.nyafuu.org/_/search/text/test/",), }, "rbt": { "root": "https://rbt.asia", @@ -142,6 +250,8 @@ EXTRACTORS = { "url": "61896d9d9a2edb556b619000a308a984307b6d30", }), ), + "test-board": ("https://rbt.asia/g/",), + "test-search": ("https://rbt.asia/_/search/text/test/",), }, "thebarchive": { "root": "https://thebarchive.com", @@ -149,9 +259,14 @@ EXTRACTORS = { "test-thread": ("https://thebarchive.com/b/thread/739772332/", { "url": "e8b18001307d130d67db31740ce57c8561b5d80c", }), + "test-board": ("https://thebarchive.com/b/",), + "test-search": ("https://thebarchive.com/_/search/text/test/",), }, + "_ckey": "childclass", } generate_extractors(EXTRACTORS, globals(), ( FoolfuukaThreadExtractor, + FoolfuukaBoardExtractor, + FoolfuukaSearchExtractor, )) diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index f878dbd..4e62165 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2020 Mike Fährmann +# Copyright 2017-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,7 +22,7 @@ class GfycatExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.key = match.group(1) + self.key = match.group(1).lower() self.formats = (self.config("format", "mp4"), "mp4", "webm", "gif") def items(self): @@ -33,6 +33,7 @@ class GfycatExtractor(Extractor): continue url = self._select_format(gfycat) gfycat.update(metadata) + gfycat["date"] = text.parse_timestamp(gfycat.get("createDate")) yield Message.Directory, gfycat yield Message.Url, url, gfycat @@ -41,8 +42,11 @@ class GfycatExtractor(Extractor): key = fmt + "Url" if key in gfyitem: url = gfyitem[key] + if url.startswith("http:"): + url = "https" + url[4:] gfyitem["extension"] = url.rpartition(".")[2] return url + gfyitem["extension"] = "" return "" def metadata(self): @@ -102,6 +106,7 @@ class GfycatImageExtractor(GfycatExtractor): "title": "Bottom's up", "username": "jackson3oh3", "createDate": 1495884169, + "date": "dt:2017-05-27 11:22:49", "md5": "a4796e05b0db9ba9ce5140145cd318aa", "width": 400, "height": 224, @@ -143,6 +148,7 @@ class GfycatImageExtractor(GfycatExtractor): self.log.warning("Skipping '%s' (malformed)", gfycat["gfyId"]) return url = self._select_format(gfycat) + gfycat["date"] = text.parse_timestamp(gfycat.get("createDate")) yield Message.Directory, gfycat yield Message.Url, url, gfycat @@ -190,7 +196,11 @@ class GfycatAPI(): while True: data = self._call(endpoint, params) gfycats = data["gfycats"] - yield from gfycats + + for gfycat in gfycats: + if "gfyName" not in gfycat: + gfycat.update(self.gfycat(gfycat["gfyId"])) + yield gfycat if "found" not in data and len(gfycats) < params["count"] or \ not data["gfycats"]: diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index 519453b..6d3ed74 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -22,7 +22,7 @@ class HentainexusGalleryExtractor(GalleryExtractor): r"/(?:view|read)/(\d+)") test = ( ("https://hentainexus.com/view/5688", { - "url": "746d0043e20030f1171aae5ea113176607302517", + "url": "f1761895fb7aca2f6ff9e09f839c0ee2fa7a5e54", "keyword": "5e5bb4b1553b1c6e126b198f9ae017a1a5d0a5ad", }), ("https://hentainexus.com/read/5688"), @@ -60,12 +60,15 @@ class HentainexusGalleryExtractor(GalleryExtractor): def images(self, _): url = "{}/read/{}".format(self.root, self.gallery_id) page = self.request(url).text - data = json.loads(self._decode(text.extract( page, 'initReader("', '"')[0])) + + pages = data.get("pages") + if pages: + return [(page, None) for page in pages] + base = data["b"] + data["r"] gid = data["i"] - return [ ("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), None) for page in data["f"] diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 930c8b4..c3e7fe4 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -609,8 +609,10 @@ class InstagramStoriesExtractor(InstagramExtractor): request_interval = 1.0 def __init__(self, match): - InstagramExtractor.__init__(self, match) self.highlight_id, self.user = match.groups() + if self.highlight_id: + self.subcategory = InstagramHighlightsExtractor.subcategory + InstagramExtractor.__init__(self, match) def posts(self): if self.highlight_id: diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 739e67e..334412d 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -22,6 +22,7 @@ class PinterestExtractor(Extractor): category = "pinterest" filename_fmt = "{category}_{id}.{extension}" archive_fmt = "{id}" + root = "https://www.pinterest.com" def __init__(self, match): Extractor.__init__(self, match) @@ -123,7 +124,7 @@ class PinterestBoardExtractor(PinterestExtractor): subcategory = "board" directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") archive_fmt = "{board[id]}_{id}" - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)/?$" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/(?!_saved)([^/?#&]+)/?$" test = ( ("https://www.pinterest.com/g1952849/test-/", { "pattern": r"https://i\.pinimg\.com/originals/", @@ -167,6 +168,30 @@ class PinterestBoardExtractor(PinterestExtractor): return self.api.board_pins(board["id"]) +class PinterestUserExtractor(PinterestExtractor): + """Extractor for a user's boards""" + subcategory = "user" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)(?:/_saved)?/?$" + test = ( + ("https://www.pinterest.de/g1952849/", { + "pattern": PinterestBoardExtractor.pattern, + "count": ">= 2", + }), + ("https://www.pinterest.de/g1952849/_saved/"), + ) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.user = text.unquote(match.group(1)) + + def items(self): + for board in self.api.boards(self.user): + url = board.get("url") + if url: + board["_extractor"] = PinterestBoardExtractor + yield Message.Queue, self.root + url, board + + class PinterestSectionExtractor(PinterestExtractor): """Extractor for board sections on pinterest.com""" subcategory = "section" @@ -301,6 +326,18 @@ class PinterestAPI(): "field_set_key": "detailed"} return self._call("Board", options)["resource_response"]["data"] + def boards(self, user): + """Yield all boards from 'user'""" + options = { + "sort" : "last_pinned_to", + "field_set_key" : "profile_grid_item", + "filter_stories" : False, + "username" : user, + "page_size" : 25, + "include_archived": True, + } + return self._pagination("Boards", options) + def board_pins(self, board_id): """Yield all pins of a specific board""" options = {"board_id": board_id} diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 0a85be6..8611dcb 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020 Mike Fährmann +# Copyright 2020-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -15,7 +15,7 @@ from .. import text class RedgifsExtractor(GfycatExtractor): """Base class for redgifs extractors""" category = "redgifs" - root = "https://www.redgifs.com/" + root = "https://www.redgifs.com" class RedgifsUserExtractor(RedgifsExtractor): @@ -39,8 +39,8 @@ class RedgifsSearchExtractor(RedgifsExtractor): pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/gifs/browse/([^/?#]+)" test = ("https://www.redgifs.com/gifs/browse/jav", { "pattern": r"https://\w+\.(redgifs|gfycat)\.com/[A-Za-z]+\.mp4", - "range": "100-300", - "count": "> 200", + "range": "1-10", + "count": 10, }) def metadata(self): @@ -54,7 +54,7 @@ class RedgifsSearchExtractor(RedgifsExtractor): class RedgifsImageExtractor(RedgifsExtractor): """Extractor for individual gifs from redgifs.com""" subcategory = "image" - pattern = (r"(?:https?://)?(?:www\.)?(?:redgifs\.com/watch" + pattern = (r"(?:https?://)?(?:www\.)?(?:redgifs\.com/(?:watch|ifr)" r"|gifdeliverynetwork.com)/([A-Za-z]+)") test = ( ("https://redgifs.com/watch/foolishforkedabyssiniancat", { @@ -62,6 +62,7 @@ class RedgifsImageExtractor(RedgifsExtractor): r"/FoolishForkedAbyssiniancat\.mp4", "content": "f6e03f1df9a2ff2a74092f53ee7580d2fb943533", }), + ("https://redgifs.com/ifr/FoolishForkedAbyssiniancat"), ("https://www.gifdeliverynetwork.com/foolishforkedabyssiniancat"), ) @@ -70,6 +71,6 @@ class RedgifsImageExtractor(RedgifsExtractor): class RedgifsAPI(GfycatAPI): - API_ROOT = "https://napi.redgifs.com/" + API_ROOT = "https://napi.redgifs.com" ACCESS_KEY = ("dBLwVuGn9eq4dtXLs8WSfpjcYFY7bPQe" "AqGPSFgqeW5B9uzj2cMVhF63pTFF4Rg9") diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 9e64eac..6a499a3 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -9,6 +9,7 @@ """Extractors for https://sankaku.app/""" from .booru import BooruExtractor +from .common import Message from .. import text, exception from ..cache import cache import collections @@ -163,6 +164,31 @@ class SankakuPostExtractor(SankakuExtractor): return SankakuAPI(self).posts(self.post_id) +class SankakuBooksExtractor(SankakuExtractor): + """Extractor for books by tag search on sankaku.app""" + subcategory = "books" + pattern = BASE_PATTERN + r"/books/?\?([^#]*)" + test = ( + ("https://sankaku.app/books?tags=aiue_oka", { + "range": "1-20", + "count": 20, + }), + ("https://beta.sankakucomplex.com/books?tags=aiue_oka"), + ) + + def __init__(self, match): + SankakuExtractor.__init__(self, match) + query = text.parse_query(match.group(1)) + self.tags = text.unquote(query.get("tags", "").replace("+", " ")) + + def items(self): + params = {"tags": self.tags, "pool_type": "0"} + for pool in SankakuAPI(self).pools_keyset(params): + pool["_extractor"] = SankakuPoolExtractor + url = "https://sankaku.app/books/{}".format(pool["id"]) + yield Message.Queue, url, pool + + class SankakuAPI(): """Interface for the sankaku.app API""" @@ -178,6 +204,9 @@ class SankakuAPI(): params = {"lang": "en"} return self._call("/pools/" + pool_id, params) + def pools_keyset(self, params): + return self._pagination("/pools/keyset", params) + def posts(self, post_id): params = { "lang" : "en", diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index a77ea06..b769912 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -239,30 +239,29 @@ class TwitterExtractor(Extractor): def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - url = "https://mobile.twitter.com/i/nojs_router" - params = {"path": "/login"} - headers = {"Referer": self.root + "/", "Origin": self.root} - page = self.request( - url, method="POST", params=params, headers=headers, data={}).text + token = util.generate_csrf_token() + self.session.cookies.clear() + self.request(self.root + "/login") - pos = page.index('name="authenticity_token"') - token = text.extract(page, 'value="', '"', pos)[0] - - url = "https://mobile.twitter.com/sessions" + url = self.root + "/sessions" + cookies = { + "_mb_tk": token, + } data = { + "redirect_after_login" : "/", + "remember_me" : "1", "authenticity_token" : token, + "wfa" : "1", + "ui_metrics" : "{}", "session[username_or_email]": username, "session[password]" : password, - "remember_me" : "1", - "wfa" : "1", - "commit" : "+Log+in+", - "ui_metrics" : "", } - response = self.request(url, method="POST", data=data) + response = self.request( + url, method="POST", cookies=cookies, data=data) + cookies = { cookie.name: cookie.value for cookie in self.session.cookies - if cookie.domain == self.cookiedomain } if "/error" in response.url or "auth_token" not in cookies: @@ -464,15 +463,17 @@ class TwitterAPI(): def __init__(self, extractor): self.extractor = extractor + + self.root = "https://twitter.com/i/api" self.headers = { "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" "4FA33AGWWjCpTnA", "x-guest-token": None, + "x-twitter-auth-type": None, "x-twitter-client-language": "en", "x-twitter-active-user": "yes", "x-csrf-token": None, - "Origin": "https://twitter.com", "Referer": "https://twitter.com/", } self.params = { @@ -487,8 +488,8 @@ class TwitterAPI(): "skip_status": "1", "cards_platform": "Web-12", "include_cards": "1", - "include_composer_source": "true", "include_ext_alt_text": "true", + "include_quote_count": "true", "include_reply_count": "1", "tweet_mode": "extended", "include_entities": "true", @@ -497,11 +498,9 @@ class TwitterAPI(): "include_ext_media_availability": "true", "send_error_codes": "true", "simple_quoted_tweet": "true", - # "count": "20", "count": "100", "cursor": None, - "ext": "mediaStats,highlightedLabel,cameraMoment", - "include_quote_count": "true", + "ext": "mediaStats,highlightedLabel", } cookies = self.extractor.session.cookies @@ -516,17 +515,15 @@ class TwitterAPI(): if cookies.get("auth_token", domain=cookiedomain): # logged in - self.root = "https://twitter.com/i/api/" self.headers["x-twitter-auth-type"] = "OAuth2Session" else: # guest - self.root = "https://api.twitter.com/" guest_token = self._guest_token() cookies.set("gt", guest_token, domain=cookiedomain) self.headers["x-guest-token"] = guest_token def tweet(self, tweet_id): - endpoint = "2/timeline/conversation/{}.json".format(tweet_id) + endpoint = "/2/timeline/conversation/{}.json".format(tweet_id) tweets = [] for tweet in self._pagination(endpoint): if tweet["id_str"] == tweet_id or \ @@ -540,43 +537,46 @@ class TwitterAPI(): def timeline_profile(self, screen_name): user_id = self._user_id_by_screen_name(screen_name) - endpoint = "2/timeline/profile/{}.json".format(user_id) - return self._pagination(endpoint) + endpoint = "/2/timeline/profile/{}.json".format(user_id) + params = self.params.copy() + params["include_tweet_replies"] = "false" + return self._pagination(endpoint, params) def timeline_media(self, screen_name): user_id = self._user_id_by_screen_name(screen_name) - endpoint = "2/timeline/media/{}.json".format(user_id) + endpoint = "/2/timeline/media/{}.json".format(user_id) return self._pagination(endpoint) def timeline_favorites(self, screen_name): user_id = self._user_id_by_screen_name(screen_name) - endpoint = "2/timeline/favorites/{}.json".format(user_id) + endpoint = "/2/timeline/favorites/{}.json".format(user_id) + params = self.params.copy() + params["sorted_by_time"] = "true" return self._pagination(endpoint) def timeline_bookmark(self): - endpoint = "2/timeline/bookmark.json" + endpoint = "/2/timeline/bookmark.json" return self._pagination(endpoint) def timeline_list(self, list_id): - endpoint = "2/timeline/list.json" + endpoint = "/2/timeline/list.json" params = self.params.copy() params["list_id"] = list_id params["ranking_mode"] = "reverse_chronological" return self._pagination(endpoint, params) def search(self, query): - endpoint = "2/search/adaptive.json" + endpoint = "/2/search/adaptive.json" params = self.params.copy() params["q"] = query params["tweet_search_mode"] = "live" params["query_source"] = "typed_query" params["pc"] = "1" params["spelling_corrections"] = "1" - return self._pagination( - endpoint, params, "sq-I-t-", "sq-cursor-bottom") + return self._pagination(endpoint, params) def list_members(self, list_id): - endpoint = "graphql/M74V2EwlxxVYGB4DbyAphQ/ListMembers" + endpoint = "/graphql/3pV4YlpljXUTFAa1jVNWQw/ListMembers" variables = { "listId": list_id, "count" : 20, @@ -586,7 +586,7 @@ class TwitterAPI(): return self._pagination_members(endpoint, variables) def list_by_rest_id(self, list_id): - endpoint = "graphql/LXXTUytSX1QY-2p8Xp9BFA/ListByRestId" + endpoint = "/graphql/EhaI2uiCBJI97e28GN8WjQ/ListByRestId" params = {"variables": '{"listId":"' + list_id + '"' ',"withUserResult":false}'} try: @@ -595,7 +595,7 @@ class TwitterAPI(): raise exception.NotFoundError("list") def user_by_screen_name(self, screen_name): - endpoint = "graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName" + endpoint = "/graphql/ZRnOhhXPwue_JGILb9TNug/UserByScreenName" params = {"variables": '{"screen_name":"' + screen_name + '"' ',"withHighlightedLabel":true}'} try: @@ -610,14 +610,16 @@ class TwitterAPI(): @cache(maxage=3600) def _guest_token(self): - endpoint = "1.1/guest/activate.json" - return self._call(endpoint, None, "POST")["guest_token"] + root = "https://api.twitter.com" + endpoint = "/1.1/guest/activate.json" + return self._call(endpoint, None, root, "POST")["guest_token"] - def _call(self, endpoint, params, method="GET"): - url = self.root + endpoint + def _call(self, endpoint, params, root=None, method="GET"): + if root is None: + root = self.root response = self.extractor.request( - url, method=method, params=params, headers=self.headers, - fatal=None) + root + endpoint, method=method, params=params, + headers=self.headers, fatal=None) # update 'x-csrf-token' header (#1170) csrf_token = response.cookies.get("ct0") @@ -641,11 +643,11 @@ class TwitterAPI(): raise exception.StopExtraction( "%s %s (%s)", response.status_code, response.reason, msg) - def _pagination(self, endpoint, params=None, - entry_tweet="tweet-", entry_cursor="cursor-bottom-"): + def _pagination(self, endpoint, params=None): if params is None: params = self.params.copy() original_retweets = (self.extractor.retweets == "original") + pinned_tweet = True while True: cursor = tweet = None @@ -654,48 +656,65 @@ class TwitterAPI(): instr = data["timeline"]["instructions"] if not instr: return + tweet_ids = [] tweets = data["globalObjects"]["tweets"] users = data["globalObjects"]["users"] + if pinned_tweet: + if "pinEntry" in instr[-1]: + tweet_ids.append(instr[-1]["pinEntry"]["entry"]["content"] + ["item"]["content"]["tweet"]["id"]) + pinned_tweet = False + + # collect tweet IDs and cursor value for entry in instr[0]["addEntries"]["entries"]: + entry_startswith = entry["entryId"].startswith + + if entry_startswith(("tweet-", "sq-I-t-")): + tweet_ids.append( + entry["content"]["item"]["content"]["tweet"]["id"]) - if entry["entryId"].startswith(entry_tweet): - try: - tweet = tweets[ - entry["content"]["item"]["content"]["tweet"]["id"]] - except KeyError: - self.extractor.log.debug( - "Skipping %s (deleted)", - entry["entryId"][len(entry_tweet):]) - continue - - if "retweeted_status_id_str" in tweet: - retweet = tweets.get(tweet["retweeted_status_id_str"]) - if original_retweets: - if not retweet: - continue - retweet["_retweet_id_str"] = tweet["id_str"] - tweet = retweet - elif retweet: - tweet["author"] = users[retweet["user_id_str"]] - tweet["user"] = users[tweet["user_id_str"]] - yield tweet - - if "quoted_status_id_str" in tweet: - quoted = tweets.get(tweet["quoted_status_id_str"]) - if quoted: - quoted["author"] = users[quoted["user_id_str"]] - quoted["user"] = tweet["user"] - quoted["quoted"] = True - yield quoted - - elif entry["entryId"].startswith(entry_cursor): + elif entry_startswith("homeConversation-"): + tweet_ids.extend( + entry["content"]["timelineModule"]["metadata"] + ["conversationMetadata"]["allTweetIds"][::-1]) + + elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")): cursor = entry["content"]["operation"]["cursor"] if not cursor.get("stopOnEmptyResponse"): # keep going even if there are no tweets tweet = True cursor = cursor["value"] + # process tweets + for tweet_id in tweet_ids: + try: + tweet = tweets[tweet_id] + except KeyError: + self.extractor.log.debug("Skipping %s (deleted)", tweet_id) + continue + + if "retweeted_status_id_str" in tweet: + retweet = tweets.get(tweet["retweeted_status_id_str"]) + if original_retweets: + if not retweet: + continue + retweet["_retweet_id_str"] = tweet["id_str"] + tweet = retweet + elif retweet: + tweet["author"] = users[retweet["user_id_str"]] + tweet["user"] = users[tweet["user_id_str"]] + yield tweet + + if "quoted_status_id_str" in tweet: + quoted = tweets.get(tweet["quoted_status_id_str"]) + if quoted: + quoted["author"] = users[quoted["user_id_str"]] + quoted["user"] = tweet["user"] + quoted["quoted"] = True + yield quoted + + # update cursor value if "replaceEntry" in instr[-1] : cursor = (instr[-1]["replaceEntry"]["entry"] ["content"]["operation"]["cursor"]["value"]) diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index 9238590..428c6b5 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -40,7 +40,7 @@ class WikiartExtractor(Extractor): def paintings(self): """Return an iterable containing all relevant 'painting' objects""" - def _pagination(self, url, extra_params=None, key="Paintings"): + def _pagination(self, url, extra_params=None, key="Paintings", stop=False): headers = { "X-Requested-With": "XMLHttpRequest", "Referer": url, @@ -60,6 +60,8 @@ class WikiartExtractor(Extractor): if not items: return yield from items + if stop: + return params["page"] += 1 @@ -67,7 +69,7 @@ class WikiartArtistExtractor(WikiartExtractor): """Extractor for an artist's paintings on wikiart.org""" subcategory = "artist" directory_fmt = ("{category}", "{artist[artistName]}") - pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)" + pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)/?$" test = ("https://www.wikiart.org/en/thomas-cole", { "url": "5ba2fbe6783fcce34e65014d16e5fbc581490c98", "keyword": "6d92913c55675e05553f000cfee5daff0b4107cf", @@ -75,18 +77,50 @@ class WikiartArtistExtractor(WikiartExtractor): def __init__(self, match): WikiartExtractor.__init__(self, match) - self.artist = match.group(2) + self.artist_name = match.group(2) + self.artist = None def metadata(self): - url = "{}/{}/{}?json=2".format(self.root, self.lang, self.artist) - return {"artist": self.request(url).json()} + url = "{}/{}/{}?json=2".format(self.root, self.lang, self.artist_name) + self.artist = self.request(url).json() + return {"artist": self.artist} def paintings(self): url = "{}/{}/{}/mode/all-paintings".format( - self.root, self.lang, self.artist) + self.root, self.lang, self.artist_name) return self._pagination(url) +class WikiartImageExtractor(WikiartArtistExtractor): + """Extractor for individual paintings on wikiart.org""" + subcategory = "image" + pattern = BASE_PATTERN + r"/(?!(?:paintings|artists)-by-)([\w-]+)/([\w-]+)" + test = ( + ("https://www.wikiart.org/en/thomas-cole/the-departure-1838", { + "url": "4d9fd87680a2620eaeaf1f13e3273475dec93231", + "keyword": "a1b083d500ce2fd364128e35b026e4ca526000cc", + }), + # no year or '-' in slug + ("https://www.wikiart.org/en/huang-shen/summer", { + "url": "d7f60118c34067b2b37d9577e412dc1477b94207", + }), + ) + + def __init__(self, match): + WikiartArtistExtractor.__init__(self, match) + self.title = match.group(3) + + def paintings(self): + title, sep, year = self.title.rpartition("-") + if not sep or not year.isdecimal(): + title = self.title + url = "{}/{}/Search/{} {}".format( + self.root, self.lang, + self.artist.get("artistName") or self.artist_name, title, + ) + return self._pagination(url, stop=True) + + class WikiartArtworksExtractor(WikiartExtractor): """Extractor for artwork collections on wikiart.org""" subcategory = "artworks" diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 01537d6..367b934 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -136,6 +136,11 @@ def build_parser(): help="Print URLs instead of downloading", ) output.add_argument( + "-G", + dest="list_urls", action="store_const", const=128, + help=argparse.SUPPRESS, + ) + output.add_argument( "-j", "--dump-json", dest="jobtype", action="store_const", const=job.DataJob, help="Print JSON information", diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 27f9c03..71a67c1 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -86,7 +86,8 @@ class MetadataPP(PostProcessor): return (pathfmt.filename or "metadata") + "." + self.extension def _filename_custom(self, pathfmt): - return self._filename_fmt(pathfmt.kwdict) + return pathfmt.clean_path(pathfmt.clean_segment( + self._filename_fmt(pathfmt.kwdict))) def _filename_extfmt(self, pathfmt): kwdict = pathfmt.kwdict diff --git a/gallery_dl/util.py b/gallery_dl/util.py index d91d29a..ffd686e 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -506,6 +506,7 @@ class Formatter(): - "c": calls str.capitalize - "C": calls string.capwords - "t": calls str.strip + - "d": calls text.parse_timestamp - "U": calls urllib.parse.unquote - "S": calls util.to_string() - Example: {f!l} -> "example"; {f!u} -> "EXAMPLE" @@ -537,6 +538,7 @@ class Formatter(): "c": str.capitalize, "C": string.capwords, "t": str.strip, + "d": text.parse_timestamp, "U": urllib.parse.unquote, "S": to_string, "s": str, @@ -767,13 +769,14 @@ class PathFormat(): restrict = extractor.config("path-restrict", "auto") replace = extractor.config("path-replace", "_") - if restrict == "auto": restrict = "\\\\|/<>:\"?*" if WINDOWS else "/" elif restrict == "unix": restrict = "/" elif restrict == "windows": restrict = "\\\\|/<>:\"?*" + elif restrict == "ascii": + restrict = "^0-9A-Za-z_." self.clean_segment = self._build_cleanfunc(restrict, replace) remove = extractor.config("path-remove", "\x00-\x1f\x7f") diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 21541be..601eeed 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.16.1" +__version__ = "1.16.3" diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index 74e8742..4e98a97 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -306,14 +306,14 @@ class MetadataTest(BasePostprocessorTest): def test_metadata_filename(self): self._create({ - "filename" : "{category}_{filename}_meta.data", + "filename" : "{category}_{filename}_/meta/\n\r.data", "extension-format": "json", }) with patch("builtins.open", mock_open()) as m: self._trigger() - path = self.pathfmt.realdirectory + "test_file_meta.data" + path = self.pathfmt.realdirectory + "test_file__meta_.data" m.assert_called_once_with(path, "w", encoding="utf-8") @staticmethod diff --git a/test/test_util.py b/test/test_util.py index fd659a0..159c4bc 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -14,6 +14,7 @@ import unittest import io import random import string +import datetime import http.cookiejar sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -267,6 +268,7 @@ class TestFormatter(unittest.TestCase): "n": None, "s": " \n\r\tSPACE ", "u": "%27%3C%20/%20%3E%27", + "t": 1262304000, "name": "Name", "title1": "Title", "title2": "", @@ -289,6 +291,9 @@ class TestFormatter(unittest.TestCase): self._run_test("{a!S}", self.kwdict["a"]) self._run_test("{l!S}", "a, b, c") self._run_test("{n!S}", "") + self._run_test("{t!d}", datetime.datetime(2010, 1, 1)) + self._run_test("{t!d:%Y-%m-%d}", "2010-01-01") + with self.assertRaises(KeyError): self._run_test("{a!q}", "hello world") |