author | Unit 193 <unit193@unit193.net> | 2020-06-16 02:01:17 -0400
---|---|---
committer | Unit 193 <unit193@unit193.net> | 2020-06-16 02:01:17 -0400
commit | 8c911e3d62a430f5630c13d51b47201fa8ff3cd1 (patch) |
tree | 6e0e6f65abc37d7f35ea96d323031a52c7fa966d |
parent | a70a3246927b72f1ded37acd55ee719515441b5b (diff) |
download | gallery-dl-8c911e3d62a430f5630c13d51b47201fa8ff3cd1.tar.bz2, gallery-dl-8c911e3d62a430f5630c13d51b47201fa8ff3cd1.tar.xz, gallery-dl-8c911e3d62a430f5630c13d51b47201fa8ff3cd1.tar.zst |
New upstream version 1.14.1 (tag: upstream/1.14.1)
-rw-r--r-- | CHANGELOG.md | 44
-rw-r--r-- | PKG-INFO | 8
-rw-r--r-- | README.rst | 6
-rw-r--r-- | data/man/gallery-dl.1 | 2
-rw-r--r-- | data/man/gallery-dl.conf.5 | 43
-rw-r--r-- | docs/gallery-dl.conf | 5
-rw-r--r-- | gallery_dl.egg-info/PKG-INFO | 8
-rw-r--r-- | gallery_dl/cloudflare.py | 29
-rw-r--r-- | gallery_dl/extractor/8muses.py | 2
-rw-r--r-- | gallery_dl/extractor/deviantart.py | 7
-rw-r--r-- | gallery_dl/extractor/foolslide.py | 14
-rw-r--r-- | gallery_dl/extractor/furaffinity.py | 13
-rw-r--r-- | gallery_dl/extractor/gfycat.py | 31
-rw-r--r-- | gallery_dl/extractor/kissmanga.py | 16
-rw-r--r-- | gallery_dl/extractor/mangadex.py | 2
-rw-r--r-- | gallery_dl/extractor/mangoxo.py | 2
-rw-r--r-- | gallery_dl/extractor/nhentai.py | 4
-rw-r--r-- | gallery_dl/extractor/realbooru.py | 12
-rw-r--r-- | gallery_dl/extractor/reddit.py | 6
-rw-r--r-- | gallery_dl/extractor/redgifs.py | 76
-rw-r--r-- | gallery_dl/extractor/twitter.py | 582
-rw-r--r-- | gallery_dl/extractor/webtoons.py | 15
-rw-r--r-- | gallery_dl/util.py | 7
-rw-r--r-- | gallery_dl/version.py | 2
-rw-r--r-- | test/test_results.py | 4

25 files changed, 538 insertions, 402 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index df67569..043d964 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,32 +1,44 @@
 # Changelog
 
-## Unreleased
+## 1.14.1 - 2020-06-12
 ### Additions
-- [imagechest] add new extractor for imgchest.com (#750)
-- [instagram] add `post_url`, `tags`, `location`, `tagged_users` metadata (#743)
-- [redgifs] add image extractor (#724)
-- [webtoons] add new extractor for webtoons.com (#761)
-- implement `--write-pages` option (#736)
-- extend `path-restrict` option (#662)
-- implement `path-replace` option (#662, #755)
-- make `path` and `keywords` available in logging messages (#574, #575)
+- [furaffinity] add `artist_url` metadata field ([#821](https://github.com/mikf/gallery-dl/issues/821))
+- [redgifs] add `user` and `search` extractors ([#724](https://github.com/mikf/gallery-dl/issues/724))
+### Changes
+- [deviantart] extend `extra` option; also search journals for sta.sh links ([#712](https://github.com/mikf/gallery-dl/issues/712))
+- [twitter] rewrite; use new interface ([#806](https://github.com/mikf/gallery-dl/issues/806), [#740](https://github.com/mikf/gallery-dl/issues/740))
+### Fixes
+- [kissmanga] work around CAPTCHAs ([#818](https://github.com/mikf/gallery-dl/issues/818))
+- [nhentai] fix extraction ([#819](https://github.com/mikf/gallery-dl/issues/819))
+- [webtoons] generalize comic extraction code ([#820](https://github.com/mikf/gallery-dl/issues/820))
+
+## 1.14.0 - 2020-05-31
+### Additions
+- [imagechest] add new extractor for imgchest.com ([#750](https://github.com/mikf/gallery-dl/issues/750))
+- [instagram] add `post_url`, `tags`, `location`, `tagged_users` metadata ([#743](https://github.com/mikf/gallery-dl/issues/743))
+- [redgifs] add image extractor ([#724](https://github.com/mikf/gallery-dl/issues/724))
+- [webtoons] add new extractor for webtoons.com ([#761](https://github.com/mikf/gallery-dl/issues/761))
+- implement `--write-pages` option ([#736](https://github.com/mikf/gallery-dl/issues/736))
+- extend `path-restrict` option ([#662](https://github.com/mikf/gallery-dl/issues/662))
+- implement `path-replace` option ([#662](https://github.com/mikf/gallery-dl/issues/662), [#755](https://github.com/mikf/gallery-dl/issues/755))
+- make `path` and `keywords` available in logging messages ([#574](https://github.com/mikf/gallery-dl/issues/574), [#575](https://github.com/mikf/gallery-dl/issues/575))
 ### Changes
 - [danbooru] change default value of `ugoira` to `false`
 - [downloader:ytdl] change default value of `forward-cookies` to `false`
-- [downloader:ytdl] fix file extensions when merging into `.mkv` (#720)
-- write OAuth tokens to cache (#616)
+- [downloader:ytdl] fix file extensions when merging into `.mkv` ([#720](https://github.com/mikf/gallery-dl/issues/720))
+- write OAuth tokens to cache ([#616](https://github.com/mikf/gallery-dl/issues/616))
 - use `%APPDATA%\gallery-dl` for config files and cache on Windows
 - use `util.Formatter` for formatting logging messages
 - reuse HTTP connections from parent extractors
 ### Fixes
-- [deviantart] use private access tokens for Journals (#738)
+- [deviantart] use private access tokens for Journals ([#738](https://github.com/mikf/gallery-dl/issues/738))
 - [gelbooru] simplify and fix pool extraction
 - [imgur] fix extraction of animated images without `mp4` entry
 - [imgur] treat `/t/unmuted/` URLs as galleries
-- [instagram] fix login with username & password (#756, #771, #797, #803)
-- [reddit] don't send OAuth headers for file downloads (#729)
-- fix/improve Cloudflare bypass code (#728, #757)
-- reset filenames on empty file extensions (#733)
+- [instagram] fix login with username & password ([#756](https://github.com/mikf/gallery-dl/issues/756), [#771](https://github.com/mikf/gallery-dl/issues/771), [#797](https://github.com/mikf/gallery-dl/issues/797), [#803](https://github.com/mikf/gallery-dl/issues/803))
+- [reddit] don't send OAuth headers for file downloads ([#729](https://github.com/mikf/gallery-dl/issues/729))
+- fix/improve Cloudflare bypass code ([#728](https://github.com/mikf/gallery-dl/issues/728), [#757](https://github.com/mikf/gallery-dl/issues/757))
+- reset filenames on empty file extensions ([#733](https://github.com/mikf/gallery-dl/issues/733))
 
 ## 1.13.6 - 2020-05-02
 ### Additions
diff --git a/PKG-INFO b/PKG-INFO
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.14.0
+Version: 1.14.1
 Summary: Command-line program to download image-galleries and -collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Author: Mike Fährmann
@@ -94,8 +94,8 @@ Description: ==========
         put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
         and run it inside a command prompt (like ``cmd.exe``).
 
-        - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.exe>`__
-        - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.bin>`__
+        - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.exe>`__
+        - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.bin>`__
 
         These executables include a Python 3.8 interpreter
         and all required Python packages.
@@ -302,7 +302,7 @@ Description: ==========
         .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
         .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
         .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-        .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.0.tar.gz
+        .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.1.tar.gz
         .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
 
         .. _Python: https://www.python.org/downloads/
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -83,8 +83,8 @@ Download a standalone executable file,
 put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
 and run it inside a command prompt (like ``cmd.exe``).
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.bin>`__
 
 These executables include a Python 3.8 interpreter
 and all required Python packages.
@@ -291,7 +291,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more
 .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
 .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
 .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.0.tar.gz
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.1.tar.gz
 .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
 
 .. _Python: https://www.python.org/downloads/
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index fe9a684..76a57d1 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2020-05-31" "1.14.0" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2020-06-12" "1.14.1" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 5a37463..88f8ebc 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2020-05-31" "1.14.0" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2020-06-12" "1.14.1" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -220,7 +220,7 @@ escaped with backslashes, e.g. \f[I]"\\\\[\\\\]"\f[]
 .IP "Description:" 4
 Controls the behavior when downloading files that have been
 downloaded before, i.e. a file with the same filename already
-exists or its ID is in a \f[I]download archive\f[].
+exists or its ID is in a \f[I]download archive <extractor.*.archive_>\f[].
 .br
 * \f[I]true\f[]: Skip downloads
@@ -604,8 +604,8 @@ current extractor run.
 \f[I]false\f[]
 
 .IP "Description:" 4
-Like \f[I]image-unique\f[], but applies to delegated URLs
-like manga-chapters, etc.
+Like \f[I]image-unique <extractor.*.image-unique_>\f[],
+but applies to delegated URLs like manga-chapters, etc.
 
 .SS extractor.*.date-format
 .IP "Type:" 6
@@ -664,7 +664,8 @@ Controls the download target for Ugoira posts.
 \f[I]false\f[]
 
 .IP "Description:" 4
-Download extra Sta.sh resources from description texts.
+Download extra Sta.sh resources from
+description texts and journals.
 
 Note: Enabling this option also enables deviantart.metadata_.
@@ -1259,6 +1260,22 @@ video extraction and download
 .br
 * \f[I]false\f[]: Ignore videos
 
+.SS extractor.redgifs.format
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"mp4"\f[]
+
+.IP "Description:" 4
+The name of the preferred format, which can be one of
+\f[I]"mp4"\f[], \f[I]"webm"\f[], \f[I]"gif"\f[], \f[I]"webp"\f[], \f[I]"mobile"\f[],
+or \f[I]"mini"\f[].
+
+If the selected format is not available, \f[I]"mp4"\f[], \f[I]"webm"\f[]
+and \f[I]"gif"\f[] (in that order) will be tried instead, until an
+available format is found.
+
 .SS extractor.sankaku.wait-min & .wait-max
 .IP "Type:" 6
 \f[I]float\f[]
@@ -1358,16 +1375,6 @@ Possible types are \f[I]text\f[], \f[I]quote\f[], \f[I]link\f[], \f[I]answer\f[],
 You can use \f[I]"all"\f[] instead of listing all types separately.
 
-.SS extractor.twitter.content
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]false\f[]
-
-.IP "Description:" 4
-Extract tweet text as \f[I]content\f[] metadata.
-
 .SS extractor.twitter.replies
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -1409,11 +1416,9 @@ Extract \f[I]TwitPic <https://twitpic.com/>\f[] embeds.
 Control video download behavior.
 .br
-* \f[I]true\f[]: Download videos and use \f[I]youtube-dl\f[] to handle
-HLS \f[I].m3u8\f[] manifests
+* \f[I]true\f[]: Download videos
 .br
-* \f[I]"ytdl"\f[]: Download videos and let \f[I]youtube-dl\f[] handle all of
-video extraction and download
+* \f[I]"ytdl"\f[]: Download videos using \f[I]youtube-dl\f[]
 .br
 * \f[I]false\f[]: Skip video Tweets
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index c120d25..ae4839d 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -129,6 +129,10 @@
         "videos": true,
         "user-agent": "Python:gallery-dl:0.8.4 (by /u/mikf1)"
     },
+    "redgifs":
+    {
+        "format": "mp4"
+    },
     "sankaku":
     {
         "username": null,
@@ -151,7 +155,6 @@
     },
     "twitter":
     {
-        "content": false,
         "replies": true,
         "retweets": true,
         "twitpic": false,
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index c7189b2..3f6f077 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery-dl
-Version: 1.14.0
+Version: 1.14.1
 Summary: Command-line program to download image-galleries and -collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Author: Mike Fährmann
@@ -94,8 +94,8 @@ Description: ==========
         put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
         and run it inside a command prompt (like ``cmd.exe``).
 
-        - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.exe>`__
-        - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.bin>`__
+        - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.exe>`__
+        - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.bin>`__
 
         These executables include a Python 3.8 interpreter
         and all required Python packages.
@@ -302,7 +302,7 @@ Description: ==========
         .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
         .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
         .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-        .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.0.tar.gz
+        .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.1.tar.gz
         .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
 
         .. _Python: https://www.python.org/downloads/
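Note on the new `extractor.redgifs.format` option documented above: it shares gfycat's selection logic, where the preferred format is tried first and `mp4`, `webm`, `gif` serve as fallbacks. A minimal standalone sketch of that behavior (the dict below is made-up sample data, not a real API response):

```python
def select_format(gfyitem, preferred="mp4"):
    """Return the URL of the preferred format, falling back to mp4/webm/gif."""
    for fmt in (preferred, "mp4", "webm", "gif"):
        url = gfyitem.get(fmt + "Url")
        if url:
            return url
    return ""

# "webp" is unavailable here, so the chain falls back to "mp4".
item = {"mp4Url": "https://example.org/clip.mp4",
        "gifUrl": "https://example.org/clip.gif"}
print(select_format(item, "webp"))  # -> https://example.org/clip.mp4
```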
diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py
index 0cf5a57..88068d5 100644
--- a/gallery_dl/cloudflare.py
+++ b/gallery_dl/cloudflare.py
@@ -33,11 +33,21 @@ def solve_challenge(session, response, kwargs):
     parsed = urllib.parse.urlsplit(response.url)
     root = parsed.scheme + "://" + parsed.netloc
 
+    page = response.text
+    try:
+        params = {"ray": text.extract(page, '?ray=', '"')[0]}
+
+        url = root + "/cdn-cgi/images/trace/jschal/nojs/transparent.gif"
+        session.request("GET", url, params=params)
+
+        url = root + "/cdn-cgi/images/trace/jschal/js/nocookie/transparent.gif"
+        session.request("GET", url, params=params)
+    except Exception:
+        pass
+
     cf_kwargs = {}
     headers = cf_kwargs["headers"] = collections.OrderedDict()
     params = cf_kwargs["data"] = collections.OrderedDict()
-
-    page = response.text
     url = root + text.unescape(text.extract(page, 'action="', '"')[0])
     headers["Referer"] = response.url
@@ -54,23 +64,26 @@ def solve_challenge(session, response, kwargs):
             params[name] = value
 
     time.sleep(4)
-
-    cf_kwargs["allow_redirects"] = False
     cf_response = session.request("POST", url, **cf_kwargs)
 
+    if cf_response.history:
+        initial_response = cf_response.history[0]
+    else:
+        initial_response = cf_response
+
     cookies = {
         cookie.name: cookie.value
-        for cookie in cf_response.cookies
+        for cookie in initial_response.cookies
     }
+
     if not cookies:
         import logging
         log = logging.getLogger("cloudflare")
-        log.debug("Headers:\n%s", cf_response.headers)
-        log.debug("Content:\n%s", cf_response.text)
+        log.debug("Headers:\n%s", initial_response.headers)
+        log.debug("Content:\n%s", initial_response.text)
         return cf_response, None, None
 
-    domain = next(iter(cf_response.cookies)).domain
+    domain = next(iter(initial_response.cookies)).domain
     cookies["__cfduid"] = response.cookies.get("__cfduid", "")
     return cf_response, domain, cookies
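The decisive change above is dropping `allow_redirects = False`: the challenge POST now follows redirects, so the clearance cookies must be read from the first response in the redirect chain rather than from the final page. A hedged `requests` sketch of that pattern (placeholder URL and form data, not Cloudflare's actual endpoint):

```python
import requests

session = requests.Session()
response = session.post("https://example.org/challenge",  # placeholder URL
                        data={"answer": "42"})             # placeholder form data

# When the POST was redirected, the interesting Set-Cookie headers live on
# the first response of the chain, not on the page we finally landed on.
initial = response.history[0] if response.history else response
cookies = {cookie.name: cookie.value for cookie in initial.cookies}
```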
"pattern": r"(?:(?:www\.)?sensescans\.com/reader" r"|reader\.sensescans\.com)", "test-chapter": ( - (("http://sensescans.com/reader/read/" + (("https://sensescans.com/reader/read/" "magi__labyrinth_of_magic/en/37/369/"), { - "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812", + "url": "8bbc59a995640bbb944c0b1be06a490909b58be1", "keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988", }), - (("http://reader.sensescans.com/read/" + (("https://reader.sensescans.com/read/" "magi__labyrinth_of_magic/en/37/369/"), { - "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812", + "url": "8bbc59a995640bbb944c0b1be06a490909b58be1", "keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988", }), ), "test-manga": - ("http://sensescans.com/reader/series/hakkenden/", { - "url": "2360ccb0ead0ff2f5e27b7aef7eb17b9329de2f2", + ("https://sensescans.com/reader/series/hakkenden/", { + "url": "3e0559029c21ca5af8a2082dd6de1567fcec4d83", "keyword": "4919f2bfed38e3a34dc984ec8d1dbd7a03044e23", }), }, diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 9af7274..61226b6 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -48,6 +48,7 @@ class FuraffinityExtractor(Extractor): extr = text.extract_from(self.request(url).text) title, _, artist = text.unescape(extr( 'property="og:title" content="', '"')).rpartition(" by ") + artist_url = artist.replace("_", "").lower() path = extr('href="//d.facdn.net/', '"') if not path: @@ -64,11 +65,12 @@ class FuraffinityExtractor(Extractor): rh = text.remove_html data = text.nameext_from_url(path, { - "id" : pi(post_id), - "title" : title, - "artist": artist, - "user" : self.user or artist, - "url" : "https://d.facdn.net/" + path + "id" : pi(post_id), + "title" : title, + "artist" : artist, + "artist_url": artist_url, + "user" : self.user or artist_url, + "url" : "https://d.facdn.net/" + path }) tags = extr('class="tags-row">', '</section>') @@ -178,6 +180,7 @@ class FuraffinityPostExtractor(FuraffinityExtractor): "url": "eae4ef93d99365c69b31a37561bd800c03d336ad", "keyword": { "artist" : "mirlinthloth", + "artist_url" : "mirlinthloth", "date" : "dt:2016-11-27 17:24:06", "description": "A Song made playing the game Cosmic DJ.", "extension" : "mp3", diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index 2ebbec8..9cd3b95 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2019 Mike Fährmann +# Copyright 2017-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,8 +20,17 @@ class GfycatExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) + self.key = match.group(1) self.formats = (self.config("format", "mp4"), "mp4", "webm", "gif") + def items(self): + metadata = self.metadata() + for gfycat in self.gfycats(): + url = self._select_format(gfycat) + gfycat.update(metadata) + yield Message.Directory, gfycat + yield Message.Url, url, gfycat + def _select_format(self, gfyitem): for fmt in self.formats: key = fmt + "Url" @@ -31,9 +40,11 @@ class GfycatExtractor(Extractor): return url return "" - def _get_info(self, gfycat_id): - url = "https://api.gfycat.com/v1/gfycats/" + gfycat_id - return self.request(url).json()["gfyItem"] + def metadata(self): + return {} + + def gfycats(self): + return () class GfycatImageExtractor(GfycatExtractor): @@ -72,12 +83,6 @@ class 
GfycatImageExtractor(GfycatExtractor): ("https://gfycat.com/ru/UnequaledHastyAnkole"), ) - def __init__(self, match): - GfycatExtractor.__init__(self, match) - self.gfycat_id = match.group(1) - - def items(self): - gfyitem = self._get_info(self.gfycat_id) - yield Message.Version, 1 - yield Message.Directory, gfyitem - yield Message.Url, self._select_format(gfyitem), gfyitem + def gfycats(self): + url = "https://api.gfycat.com/v1/gfycats/" + self.key + return (self.request(url).json()["gfyItem"],) diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py index 348453d..ade245b 100644 --- a/gallery_dl/extractor/kissmanga.py +++ b/gallery_dl/extractor/kissmanga.py @@ -9,9 +9,10 @@ """Extract manga-chapters and entire manga from https://kissmanga.com/""" from .common import ChapterExtractor, MangaExtractor, Extractor -from .. import text, aes, exception +from .. import text, aes from ..cache import cache import hashlib +import time import ast import re @@ -24,18 +25,7 @@ class RedirectMixin(): response = Extractor.request(self, url, **kwargs) if not response.history or "/AreYouHuman" not in response.url: return response - if self.config("captcha", "stop") == "wait": - self.log.warning( - "Redirect to \n%s\nVisit this URL in your browser, solve " - "the CAPTCHA, and press ENTER to continue", response.url) - try: - input() - except (EOFError, OSError): - pass - else: - raise exception.StopExtraction( - "Redirect to \n%s\nVisit this URL in your browser and " - "solve the CAPTCHA to continue", response.url) + time.sleep(2) class KissmangaBase(RedirectMixin): diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 72465f7..7e2d613 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -51,7 +51,7 @@ class MangadexChapterExtractor(MangadexExtractor): test = ( ("https://mangadex.org/chapter/122094", { "keyword": "ef1084c2845825979e150512fed8fdc209baf05a", - "content": "50383a4c15124682057b197d40261641a98db514", + # "content": "50383a4c15124682057b197d40261641a98db514", }), # oneshot ("https://mangadex.cc/chapter/138086", { diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index 8cd7fa5..25fba70 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -85,7 +85,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor): }, "album": { "id": "lzVOv1Q9", - "name": "池永康晟 Ikenaga Yasunari 透出古朴气息的日本美女人像画作", + "name": "re:池永康晟 Ikenaga Yasunari 透出古朴", "date": "2019.3.22 14:42", "description": str, }, diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py index 746144a..fd83328 100644 --- a/gallery_dl/extractor/nhentai.py +++ b/gallery_dl/extractor/nhentai.py @@ -54,8 +54,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor): self.data = None def metadata(self, page): - data = json.loads(text.extract(page, "N.gallery(", ");")[0]) - self.data = data + self.data = data = json.loads(text.parse_unicode_escapes(text.extract( + page, 'JSON.parse("', '");')[0])) title_en = data["title"].get("english", "") title_ja = data["title"].get("japanese", "") diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py index 4841743..1d2140a 100644 --- a/gallery_dl/extractor/realbooru.py +++ b/gallery_dl/extractor/realbooru.py @@ -50,10 +50,10 @@ class RealbooruPostExtractor(booru.PostMixin, RealbooruExtractor): test = ("https://realbooru.com/index.php?page=post&s=view&id=668483", { "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9", 
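The kissmanga hunk above replaces the interactive CAPTCHA prompt with a plain sleep-and-retry loop: keep reissuing the request while it is redirected to the `/AreYouHuman` page. An isolated sketch of the idea — the bounded retry count is an added safeguard, not upstream behavior:

```python
import time
import requests

def request_with_captcha_retry(session, url, retries=10, delay=2.0):
    """Re-request a page while it redirects to the CAPTCHA interstitial."""
    for _ in range(retries):
        response = session.get(url)
        if not response.history or "/AreYouHuman" not in response.url:
            return response
        time.sleep(delay)  # back off briefly, then try again
    raise RuntimeError("still redirected to a CAPTCHA after %d tries" % retries)
```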
"content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", - "options": (("tags", True),), - "keyword": { - "tags_general" : str, - "tags_metadata": str, - "tags_model" : "jennifer_lawrence", - }, + # "options": (("tags", True),), + # "keyword": { + # "tags_general" : str, + # "tags_metadata": str, + # "tags_model" : "jennifer_lawrence", + # }, }) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 2e3864a..6331b77 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -324,7 +324,11 @@ class RedditAPI(): self.extractor.wait(seconds=response.headers["x-ratelimit-reset"]) return self._call(endpoint, params) - data = response.json() + try: + data = response.json() + except ValueError: + raise exception.StopExtraction(text.remove_html(response.text)) + if "error" in data: if data["error"] == 403: raise exception.AuthorizationError() diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 7855eab..dcbbc0d 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -8,22 +8,60 @@ """Extractors for https://redgifs.com/""" -from .gfycat import GfycatImageExtractor +from .gfycat import GfycatExtractor from ..cache import cache -class RedgifsImageExtractor(GfycatImageExtractor): - """Extractor for individual images from redgifs.com""" +class RedgifsExtractor(GfycatExtractor): + """Base class for redgifs extractors""" category = "redgifs" + root = "https://www.redgifs.com/" + + +class RedgifsUserExtractor(RedgifsExtractor): + """Extractor for redgifs user profiles""" + subcategory = "user" + directory_fmt = ("{category}", "{userName}") + pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?&#]+)" + test = ("https://www.redgifs.com/users/Natalifiction", { + "pattern": r"https://thcf\d+\.redgifs\.com/[A-Za-z]+\.mp4", + "count": ">= 100", + }) + + def gfycats(self): + return RedgifsAPI(self).user(self.key) + + +class RedgifsSearchExtractor(RedgifsExtractor): + """Extractor for redgifs search results""" + subcategory = "search" + directory_fmt = ("{category}", "Search", "{search}") + pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/gifs/browse/([^/?&#]+)" + test = ("https://www.redgifs.com/gifs/browse/jav", { + "pattern": r"https://thcf\d+\.redgifs\.com/[A-Za-z]+\.mp4", + "range": "100-300", + "count": "> 200", + }) + + def metadata(self): + self.key = self.key.replace("-", " ") + return {"search": self.key} + + def gfycats(self): + return RedgifsAPI(self).search(self.key) + + +class RedgifsImageExtractor(RedgifsExtractor): + """Extractor for individual gifs from redgifs.com""" + subcategory = "image" pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/watch/([A-Za-z]+)" test = ("https://redgifs.com/watch/foolishforkedabyssiniancat", { "pattern": r"https://\w+.redgifs.com/FoolishForkedAbyssiniancat.mp4", "content": "f6e03f1df9a2ff2a74092f53ee7580d2fb943533", }) - def _get_info(self, gfycat_id): - api = RedgifsAPI(self) - return api.gfycat(gfycat_id) + def gfycats(self): + return (RedgifsAPI(self).gfycat(self.key),) class RedgifsAPI(): @@ -36,6 +74,16 @@ class RedgifsAPI(): endpoint = "v1/gfycats/" + gfycat_id return self._call(endpoint)["gfyItem"] + def user(self, user): + endpoint = "v1/users/{}/gfycats".format(user.lower()) + params = {"count": 100} + return self._pagination(endpoint, params) + + def search(self, query): + endpoint = "v1/gfycats/search" + params = {"search_text": query, "count": 150} + return self._pagination(endpoint, params) + @cache(maxage=3600) def 
diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py
index 4841743..1d2140a 100644
--- a/gallery_dl/extractor/realbooru.py
+++ b/gallery_dl/extractor/realbooru.py
@@ -50,10 +50,10 @@ class RealbooruPostExtractor(booru.PostMixin, RealbooruExtractor):
     test = ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
         "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
         "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
-        "options": (("tags", True),),
-        "keyword": {
-            "tags_general" : str,
-            "tags_metadata": str,
-            "tags_model"   : "jennifer_lawrence",
-        },
+        # "options": (("tags", True),),
+        # "keyword": {
+        #     "tags_general" : str,
+        #     "tags_metadata": str,
+        #     "tags_model"   : "jennifer_lawrence",
+        # },
     })
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 2e3864a..6331b77 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -324,7 +324,11 @@ class RedditAPI():
             self.extractor.wait(seconds=response.headers["x-ratelimit-reset"])
             return self._call(endpoint, params)
 
-        data = response.json()
+        try:
+            data = response.json()
+        except ValueError:
+            raise exception.StopExtraction(text.remove_html(response.text))
+
         if "error" in data:
             if data["error"] == 403:
                 raise exception.AuthorizationError()
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 7855eab..dcbbc0d 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -8,22 +8,60 @@
 
 """Extractors for https://redgifs.com/"""
 
-from .gfycat import GfycatImageExtractor
+from .gfycat import GfycatExtractor
 from ..cache import cache
 
 
-class RedgifsImageExtractor(GfycatImageExtractor):
-    """Extractor for individual images from redgifs.com"""
+class RedgifsExtractor(GfycatExtractor):
+    """Base class for redgifs extractors"""
     category = "redgifs"
+    root = "https://www.redgifs.com/"
+
+
+class RedgifsUserExtractor(RedgifsExtractor):
+    """Extractor for redgifs user profiles"""
+    subcategory = "user"
+    directory_fmt = ("{category}", "{userName}")
+    pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?&#]+)"
+    test = ("https://www.redgifs.com/users/Natalifiction", {
+        "pattern": r"https://thcf\d+\.redgifs\.com/[A-Za-z]+\.mp4",
+        "count": ">= 100",
+    })
+
+    def gfycats(self):
+        return RedgifsAPI(self).user(self.key)
+
+
+class RedgifsSearchExtractor(RedgifsExtractor):
+    """Extractor for redgifs search results"""
+    subcategory = "search"
+    directory_fmt = ("{category}", "Search", "{search}")
+    pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/gifs/browse/([^/?&#]+)"
+    test = ("https://www.redgifs.com/gifs/browse/jav", {
+        "pattern": r"https://thcf\d+\.redgifs\.com/[A-Za-z]+\.mp4",
+        "range": "100-300",
+        "count": "> 200",
+    })
+
+    def metadata(self):
+        self.key = self.key.replace("-", " ")
+        return {"search": self.key}
+
+    def gfycats(self):
+        return RedgifsAPI(self).search(self.key)
+
+
+class RedgifsImageExtractor(RedgifsExtractor):
+    """Extractor for individual gifs from redgifs.com"""
+    subcategory = "image"
     pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/watch/([A-Za-z]+)"
     test = ("https://redgifs.com/watch/foolishforkedabyssiniancat", {
         "pattern": r"https://\w+.redgifs.com/FoolishForkedAbyssiniancat.mp4",
         "content": "f6e03f1df9a2ff2a74092f53ee7580d2fb943533",
     })
 
-    def _get_info(self, gfycat_id):
-        api = RedgifsAPI(self)
-        return api.gfycat(gfycat_id)
+    def gfycats(self):
+        return (RedgifsAPI(self).gfycat(self.key),)
 
 
 class RedgifsAPI():
@@ -36,6 +74,16 @@ class RedgifsAPI():
         endpoint = "v1/gfycats/" + gfycat_id
         return self._call(endpoint)["gfyItem"]
 
+    def user(self, user):
+        endpoint = "v1/users/{}/gfycats".format(user.lower())
+        params = {"count": 100}
+        return self._pagination(endpoint, params)
+
+    def search(self, query):
+        endpoint = "v1/gfycats/search"
+        params = {"search_text": query, "count": 150}
+        return self._pagination(endpoint, params)
+
     @cache(maxage=3600)
     def _authenticate_impl(self):
         url = "https://weblogin.redgifs.com/oauth/webtoken"
@@ -52,7 +100,19 @@ class RedgifsAPI():
             url, method="POST", headers=headers, json=data)
         return "Bearer " + response.json()["access_token"]
 
-    def _call(self, endpoint):
+    def _call(self, endpoint, params=None):
         self.headers["Authorization"] = self._authenticate_impl()
         url = "https://napi.redgifs.com/" + endpoint
-        return self.extractor.request(url, headers=self.headers).json()
+        return self.extractor.request(
+            url, params=params, headers=self.headers).json()
+
+    def _pagination(self, endpoint, params):
+        while True:
+            data = self._call(endpoint, params)
+            gfycats = data["gfycats"]
+            yield from gfycats
+
+            if "found" not in data and len(gfycats) < params["count"] or \
+                    not data["gfycats"]:
+                return
+            params["cursor"] = data["cursor"]
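The new `_pagination` helper drives both the `user` and `search` endpoints above. Its shape, lifted out as a standalone sketch — `fetch` stands in for the authenticated `_call`, and the stop conditions follow the diff: an empty page always ends iteration, and a short page ends it for responses without a `found` total:

```python
def paginate(fetch, endpoint, params):
    """Yield items from a cursor-paginated API until it runs dry."""
    while True:
        data = fetch(endpoint, params)
        gfycats = data["gfycats"]
        yield from gfycats

        if not gfycats or ("found" not in data
                           and len(gfycats) < params["count"]):
            return
        params["cursor"] = data["cursor"]  # request the next page
```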
media["original_info"].get("height", 0) + + if "video_info" in media and self.videos: + + if self.videos == "ytdl": + url = "ytdl:{}/i/web/status/{}".format( + self.root, tweet["id_str"]) + tdata["extension"] = None + yield Message.Url, url, tdata + + else: + video_info = media["video_info"] + variant = max( + video_info["variants"], + key=lambda v: v.get("bitrate", 0), + ) + tdata["duration"] = video_info.get( + "duration_millis", 0) / 1000 + tdata["bitrate"] = variant.get("bitrate", 0) + + url = variant["url"] + text.nameext_from_url(url, tdata) + yield Message.Url, url, tdata + + elif "media_url_https" in media: + url = media["media_url_https"] urls = [url + size for size in self.sizes] - yield Message.Urllist, urls, data - - if self.twitpic and "//twitpic.com/" in tweet: - urls = [ - url for url in text.extract_iter( - tweet, 'data-expanded-url="', '"') - if "//twitpic.com/" in url - ] - - if "num" not in data: - if urls: - yield Message.Directory, data - data["num"] = 0 - - for data["num"], url in enumerate(urls, data["num"]+1): - response = self.request(url, fatal=False) - if response.status_code >= 400: - continue - url = text.extract( - response.text, 'name="twitter:image" value="', '"')[0] - yield Message.Url, url, text.nameext_from_url(url, data) + text.nameext_from_url(url, tdata) + yield Message.Urllist, urls, tdata + + else: + url = media["media_url"] + text.nameext_from_url(url, tdata) + yield Message.Url, url, tdata + + def _extract_twitpic(self, tweet): + twitpics = [] + for url in tweet["entities"].get("urls", ()): + url = url["expanded_url"] + if "//twitpic.com/" in url: + response = self.request(url, fatal=False) + if response.status_code >= 400: + continue + url = text.extract( + response.text, 'name="twitter:image" value="', '"')[0] + twitpics.append({ + "original_info": {}, + "media_url" : url, + }) + if twitpics: + if "extended_entities" in tweet: + tweet["extended_entities"]["media"].extend(twitpics) + else: + tweet["extended_entities"] = {"media": twitpics} + + def _transform_tweet(self, tweet): + entities = tweet["entities"] + tdata = { + "tweet_id" : text.parse_int(tweet["id_str"]), + "retweet_id" : text.parse_int( + tweet.get("retweeted_status_id_str")), + "quote_id" : text.parse_int( + tweet.get("quoted_status_id_str")), + "reply_id" : text.parse_int( + tweet.get("in_reply_to_status_id_str")), + "date" : text.parse_datetime( + tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"), + "user" : self._transform_user(tweet["user"]), + "lang" : tweet["lang"], + "content" : tweet["full_text"], + "favorite_count": tweet["favorite_count"], + "quote_count" : tweet["quote_count"], + "reply_count" : tweet["reply_count"], + "retweet_count" : tweet["retweet_count"], + } + + hashtags = entities.get("hashtags") + if hashtags: + tdata["hashtags"] = [t["text"] for t in hashtags] + + mentions = entities.get("user_mentions") + if mentions: + tdata["mentions"] = [{ + "id": text.parse_int(u["id_str"]), + "name": u["screen_name"], + "nick": u["name"], + } for u in mentions] + + if "in_reply_to_screen_name" in tweet: + tdata["reply_to"] = tweet["in_reply_to_screen_name"] + + if "full_text_quoted" in tweet: + tdata["content_quoted"] = tweet["full_text_quoted"] + + if "author" in tweet: + tdata["author"] = self._transform_user(tweet["author"]) + + return tdata + + def _transform_user(self, user): + uid = user["id_str"] + cache = self._user_cache + + if uid not in cache: + cache[uid] = { + "id" : text.parse_int(uid), + "name" : user["screen_name"], + "nick" : user["name"], + "description" : 
user["description"], + "location" : user["location"], + "date" : text.parse_datetime( + user["created_at"], "%a %b %d %H:%M:%S %z %Y"), + "verified" : user.get("verified", False), + "profile_banner" : user.get("profile_banner_url", ""), + "profile_image" : user.get( + "profile_image_url_https", "").replace("_normal.", "."), + "favourites_count": user["favourites_count"], + "followers_count" : user["followers_count"], + "friends_count" : user["friends_count"], + "listed_count" : user["listed_count"], + "media_count" : user["media_count"], + "statuses_count" : user["statuses_count"], + } + return cache[uid] def metadata(self): """Return general metadata""" return {} def tweets(self): - """Yield HTML content of all relevant tweets""" + """Yield all relevant tweet objects""" def login(self): username, password = self._get_auth_info() if username: self._update_cookies(self._login_impl(username, password)) - self.logged_in = True @cache(maxage=360*24*3600, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - headers = {"User-Agent": self.user_agent} - page = self.request(self.root + "/login", headers=headers).text + url = "https://mobile.twitter.com/i/nojs_router" + params = {"path": "/login"} + headers = {"Referer": self.root + "/", "Origin": self.root} + page = self.request( + url, method="POST", params=params, headers=headers, data={}).text + pos = page.index('name="authenticity_token"') - token = text.extract(page, 'value="', '"', pos-80)[0] + token = text.extract(page, 'value="', '"', pos)[0] - url = self.root + "/sessions" + url = "https://mobile.twitter.com/sessions" data = { + "authenticity_token" : token, "session[username_or_email]": username, "session[password]" : password, - "authenticity_token" : token, - "ui_metrics" : '{"rf":{},"s":""}', - "scribe_log" : "", - "redirect_after_login" : "", "remember_me" : "1", + "wfa" : "1", + "commit" : "+Log+in+", + "ui_metrics" : "", } - response = self.request(url, method="POST", headers=headers, data=data) - if "/error" in response.url: - raise exception.AuthenticationError() - - return { + response = self.request(url, method="POST", data=data) + cookies = { cookie.name: cookie.value for cookie in self.session.cookies - if cookie.domain and "twitter.com" in cookie.domain - } - - def _data_from_tweet(self, tweet): - extr = text.extract_from(tweet) - data = { - "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')), - "reply" : bool(extr('data-is-reply-to="' , '"')), - "retweet_id": text.parse_int(extr('data-retweet-id="', '"')), - "retweeter" : extr('data-retweeter="' , '"'), - "author" : { - "name" : extr('data-screen-name="', '"'), - "nick" : text.unescape(extr('data-name="' , '"')), - "id" : text.parse_int(extr('data-user-id="' , '"')), - }, - } - - if not self._user_dict: - if data["retweet_id"]: - for user in json.loads(text.unescape(extr( - 'data-reply-to-users-json="', '"'))): - if user["screen_name"] == data["retweeter"]: - break - else: - self.log.warning("Unable to extract user info") - return None - self._user_dict = { - "name": user["screen_name"], - "nick": text.unescape(user["name"]), - "id" : text.parse_int(user["id_str"]), - } - else: - self._user_dict = data["author"] - - data["user"] = self._user_dict - data["date"] = text.parse_timestamp(extr('data-time="', '"')) - - if self.content: - content = extr('<div class="js-tweet-text-container">', '\n</div>') - if '<img class="Emoji ' in content: - content = self._emoji_sub(r"\1", content) - content = 
text.unescape(text.remove_html(content, "", "")) - cl, _, cr = content.rpartition("pic.twitter.com/") - data["content"] = cl if cl and len(cr) < 16 else content - - if extr('<div class="QuoteTweet', '>'): - data["retweet_id"] = text.parse_int(extr('data-item-id="', '"')) - data["retweeter"] = data["user"]["name"] - data["author"] = { - "name" : extr('data-screen-name="', '"'), - "id" : text.parse_int(extr('data-user-id="' , '"')), - "nick" : text.unescape(extr( - 'QuoteTweet-fullname', '<').partition('>')[2]), - } - - return data - - def _video_from_tweet(self, tweet_id): - url = "https://api.twitter.com/1.1/videos/tweet/config/{}.json".format( - tweet_id) - cookies = None - headers = { - "Origin" : self.root, - "Referer" : "{}/i/web/status/{}".format(self.root, tweet_id), - "x-csrf-token" : self.session.cookies.get("ct0"), - "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekM" - "xqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28N" - "HfOPqkca3qaAxGfsyKCs0wRbw", - } - - if self.logged_in: - headers["x-twitter-auth-type"] = "OAuth2Session" - else: - token = _guest_token(self, headers) - cookies = {"gt": token} - headers["x-guest-token"] = token - - response = self.request( - url, cookies=cookies, headers=headers, fatal=None) - - if response.status_code == 429 or \ - response.headers.get("x-rate-limit-remaining") == "0": - if self.logged_in: - self.wait(until=response.headers.get("x-rate-limit-reset")) - else: - _guest_token.invalidate() - return self._video_from_tweet(tweet_id) - - elif response.status_code >= 400: - self.log.warning("Unable to fetch video data for %s ('%s %s')", - tweet_id, response.status_code, response.reason) - return None - - return response.json()["track"]["playbackUrl"] - - def _tweets_from_api(self, url, max_position=None): - params = { - "include_available_features": "1", - "include_entities": "1", - "max_position": max_position, - "reset_error_state": "false", - "lang": "en", + if cookie.domain == self.cookiedomain } - headers = { - "X-Requested-With": "XMLHttpRequest", - "X-Twitter-Active-User": "yes", - "Referer": self.root + "/", - } - - while True: - data = self.request(url, params=params, headers=headers).json() - if "inner" in data: - data = data["inner"] - - for tweet in text.extract_iter( - data["items_html"], '<div class="tweet ', '\n</li>'): - yield tweet - if data.get("min_position") is None: - if data["has_more_items"] and "min_position" not in data: - pass - else: - return - - if "min_position" in data: - position = data["min_position"] - if position == max_position or position is None: - return - else: - position = text.parse_int(text.extract( - tweet, 'data-tweet-id="', '"')[0]) - if max_position and position >= max_position: - return - params["max_position"] = max_position = position + if "/error" in response.url or "auth_token" not in cookies: + raise exception.AuthenticationError() + return cookies class TwitterTimelineExtractor(TwitterExtractor): @@ -288,15 +239,12 @@ class TwitterTimelineExtractor(TwitterExtractor): ("https://twitter.com/supernaturepics", { "range": "1-40", "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", - "keyword": "4a3d28cc9f7a39e27333d56f3fe19e6e07ee979e", }), ("https://mobile.twitter.com/supernaturepics?p=i"), ) def tweets(self): - url = "{}/i/profiles/show/{}/timeline/tweets".format( - self.root, self.user) - return self._tweets_from_api(url) + return TwitterAPI(self).timeline_profile(self.user) class TwitterMediaExtractor(TwitterExtractor): @@ -313,9 +261,7 @@ class 
TwitterMediaExtractor(TwitterExtractor): ) def tweets(self): - url = "{}/i/profiles/show/{}/media_timeline".format( - self.root, self.user) - return self._tweets_from_api(url) + return TwitterAPI(self).timeline_media(self.user) class TwitterSearchExtractor(TwitterExtractor): @@ -330,12 +276,10 @@ class TwitterSearchExtractor(TwitterExtractor): }) def metadata(self): - return {"search": self.user} + return {"search": text.unquote(self.user)} def tweets(self): - url = "{}/i/search/timeline?f=tweets&q={}".format( - self.root, self.user) - return self._tweets_from_api(url, "-1") + return TwitterAPI(self).search(self.user) class TwitterTweetExtractor(TwitterExtractor): @@ -346,22 +290,19 @@ class TwitterTweetExtractor(TwitterExtractor): test = ( ("https://twitter.com/supernaturepics/status/604341487988576256", { "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580", - "keyword": "76e018cf3f4c8b82d3bdd425e01e28078c98373b", "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", }), # 4 images ("https://twitter.com/perrypumas/status/894001459754180609", { "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6", - "keyword": "c9251b1fd79d547b0c6b4577f06c937d0e9b63d2", }), # video ("https://twitter.com/perrypumas/status/1065692031626829824", { "options": (("videos", True),), - "pattern": r"ytdl:https://video.twimg.com/ext_tw_video/.*.m3u8", + "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5", }), # content with emoji, newlines, hashtags (#338) ("https://twitter.com/playpokemon/status/1263832915173048321", { - "options": (("content", True),), "keyword": {"content": ( r"re:Gear up for #PokemonSwordShieldEX with special Mystery " "Gifts! \n\nYou’ll be able to receive four Galarian form " @@ -386,10 +327,6 @@ class TwitterTweetExtractor(TwitterExtractor): # quoted tweet (#526) ("https://twitter.com/Pistachio/status/1222690391817932803", { "pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg", - "keyword": { - "author": {"name": "Afro_Herper", "id": 786047748508221440}, - "user" : {"name": "Pistachio" , "id": 3533231}, - }, }), # TwitPic embeds (#579) ("https://twitter.com/i/web/status/112900228289540096", { @@ -404,18 +341,7 @@ class TwitterTweetExtractor(TwitterExtractor): self.tweet_id = match.group(2) def tweets(self): - url = "{}/i/web/status/{}".format(self.root, self.tweet_id) - cookies = {"app_shell_visited": "1"} - headers = {"User-Agent": self.user_agent, "Referer": url} - - response = self.request(url, cookies=cookies, headers=headers) - if response.history and response.url == self.root + "/": - raise exception.AuthorizationError() - page = response.text - - end = page.index('class="js-tweet-stats-container') - beg = page.rindex('<div class="tweet ', 0, end) - return (page[beg:end],) + return TwitterAPI(self).tweet(self.tweet_id) class TwitterBookmarkExtractor(TwitterExtractor): @@ -424,15 +350,26 @@ class TwitterBookmarkExtractor(TwitterExtractor): pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()" test = ("https://twitter.com/i/bookmarks",) - def items(self): - self.login() - if not self.logged_in: - raise exception.AuthorizationError("Login required") - for cookie in self.session.cookies: - cookie.expires = None + def tweets(self): + return TwitterAPI(self).bookmarks() - url = "https://api.twitter.com/2/timeline/bookmark.json" - params = { + +class TwitterAPI(): + + def __init__(self, extractor): + self.extractor = extractor + self.headers = { + "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" + 
"COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" + "4FA33AGWWjCpTnA", + "x-guest-token": None, + "x-twitter-client-language": "en", + "x-twitter-active-user": "yes", + "x-csrf-token": None, + "Origin": "https://twitter.com", + "Referer": "https://twitter.com/", + } + self.params = { "include_profile_interstitial_type": "1", "include_blocking": "1", "include_blocked_by": "1", @@ -453,47 +390,134 @@ class TwitterBookmarkExtractor(TwitterExtractor): "include_ext_media_color": "true", "include_ext_media_availability": "true", "send_error_codes": "true", - "simple_quoted_tweets": "true", + "simple_quoted_tweet": "true", + # "count": "20", "count": "100", "cursor": None, - "ext": "mediaStats%2CcameraMoment", + "ext": "mediaStats,highlightedLabel,cameraMoment", + "include_quote_count": "true", } - headers = { - "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" - "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" - "4FA33AGWWjCpTnA", - "Origin": self.root, - "Referer": self.root + "/i/bookmarks", - "x-csrf-token": self.session.cookies.get("ct0"), - "x-twitter-active-user": "yes", - "x-twitter-auth-type": "OAuth2Session", - "x-twitter-client-language": "en", + + cookies = self.extractor.session.cookies + + # CSRF + csrf = hashlib.md5(str(time.time()).encode()).hexdigest() + self.headers["x-csrf-token"] = csrf + cookies.set("ct0", csrf, domain=".twitter.com") + + if cookies.get("auth_token", domain=".twitter.com"): + self.headers["x-twitter-auth-type"] = "OAuth2Session" + else: + # guest token + guest_token = _guest_token(self.extractor, self.headers) + self.headers["x-guest-token"] = guest_token + cookies.set("gt", guest_token, domain=".twitter.com") + + def tweet(self, tweet_id): + endpoint = "2/timeline/conversation/{}.json".format(tweet_id) + for tweet in self._pagination(endpoint): + if tweet["id_str"] == tweet_id: + return (tweet,) + return () + + def timeline_profile(self, screen_name): + user = self.user_by_screen_name(screen_name) + endpoint = "2/timeline/profile/{}.json".format(user["rest_id"]) + return self._pagination(endpoint) + + def timeline_media(self, screen_name): + user = self.user_by_screen_name(screen_name) + endpoint = "2/timeline/media/{}.json".format(user["rest_id"]) + return self._pagination(endpoint) + + def search(self, query): + endpoint = "2/search/adaptive.json" + params = self.params.copy() + params["q"] = text.unquote(query) + return self._pagination( + endpoint, params, "sq-I-t-", "sq-cursor-bottom") + + def bookmarks(self): + endpoint = "2/timeline/bookmark.json" + return self._pagination(endpoint) + + def user_by_screen_name(self, screen_name): + endpoint = "graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName" + params = { + "variables": '{"screen_name":"' + screen_name + '"' + ',"withHighlightedLabel":true}' } + return self._call(endpoint, params)["data"]["user"] + + def _call(self, endpoint, params): + url = "https://api.twitter.com/" + endpoint + response = self.extractor.request( + url, params=params, headers=self.headers, fatal=None) + if response.status_code < 400: + return response.json() + if response.status_code == 429: + self.extractor.wait(until=response.headers["x-rate-limit-reset"]) + return self._call(endpoint, params) + raise exception.StopExtraction( + "%s %s (%s)", response.status_code, response.reason, response.text) + + def _pagination(self, endpoint, params=None, + entry_tweet="tweet-", entry_cursor="cursor-bottom-"): + if params is None: + params = self.params.copy() while True: - response = self.request( - url, params=params, 
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 86ada49..3b992a2 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -35,7 +35,7 @@ class WebtoonsEpisodeExtractor(WebtoonsExtractor):
     filename_fmt = "{episode}-{num:>02}.{extension}"
     archive_fmt = "{title_no}_{episode}_{num}"
     pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+)/(?:[^/?&#]+))"
-               r"/viewer(?:\?([^#]+))")
+               r"/viewer(?:\?([^#'\"]+))")
     test = (
         (("https://www.webtoons.com/en/comedy/safely-endangered"
           "/ep-572-earth/viewer?title_no=352&episode_no=572"), {
@@ -111,6 +111,11 @@ class WebtoonsComicExtractor(WebtoonsExtractor):
           "list?title_no=1845&page=3"), {
             "count": ">= 15",
         }),
+        # (#820)
+        (("https://www.webtoons.com/en/challenge/scoob-and-shag/"
+          "list?title_no=210827&page=9"), {
+            "count": ">= 18",
+        }),
     )
 
     def __init__(self, match):
@@ -143,6 +148,8 @@ class WebtoonsComicExtractor(WebtoonsExtractor):
     @staticmethod
     def get_episode_urls(page):
         """Extract and return all episode urls in 'page'"""
-        pos = page.find('id="_listUl"')
-        return text.extract_iter(
-            page, '<a href="', '" class="NPI=a:list', pos)
+        page = text.extract(page, 'id="_listUl"', '</ul>')[0]
+        return [
+            match.group(0)
+            for match in WebtoonsEpisodeExtractor.pattern.finditer(page)
+        ]
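The webtoons fix above (#820) stops relying on a `class="NPI=a:list"` attribute and instead scans the `_listUl` element with the episode URL pattern itself. A reduced demonstration against invented markup, using a simplified form of that pattern:

```python
import re

# Simplified episode-URL pattern; the real one builds on BASE_PATTERN.
EPISODE = re.compile(
    r"(?:https?://)?(?:www\.)?webtoons\.com"
    r"/[^/?&#]+/[^/?&#]+/[^/?&#]+/[^/?&#]+/viewer\?[^#'\"]+")

html = """<ul id="_listUl">
<li><a href="https://www.webtoons.com/en/comedy/sample/ep-2/viewer?title_no=1&episode_no=2">2</a></li>
<li><a href="https://www.webtoons.com/en/comedy/sample/ep-1/viewer?title_no=1&episode_no=1">1</a></li>
</ul>"""

listing = html[html.index('id="_listUl"'):html.index("</ul>")]
print([m.group(0) for m in EPISODE.finditer(listing)])
```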
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 85b871b..afd96b8 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -84,6 +84,13 @@ def filter_dict(a):
     return {k: v for k, v in a.items() if k[0] != "_"}
 
 
+def delete_items(obj, keys):
+    """Remove all 'keys' from 'obj'"""
+    for key in keys:
+        if key in obj:
+            del obj[key]
+
+
 def number_to_string(value, numbers=(int, float)):
     """Convert numbers (int, float) to string; Return everything else as is."""
     return str(value) if value.__class__ in numbers else value
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index dd6f373..8509e1e 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.14.0"
+__version__ = "1.14.1"
diff --git a/test/test_results.py b/test/test_results.py
index 5bef1a4..196d859 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -31,10 +31,10 @@ TRAVIS_SKIP = {
 
 # temporary issues, etc.
 BROKEN = {
-    "e621",
     "imagevenue",
-    "jaiminisbox",
     "photobucket",
+    "seiga",
+    "twitter",
     "worldthree",
 }