author    Unit 193 <unit193@unit193.net>  2020-06-16 02:01:17 -0400
committer Unit 193 <unit193@unit193.net>  2020-06-16 02:01:17 -0400
commit    8c911e3d62a430f5630c13d51b47201fa8ff3cd1 (patch)
tree      6e0e6f65abc37d7f35ea96d323031a52c7fa966d
parent    a70a3246927b72f1ded37acd55ee719515441b5b (diff)
download  gallery-dl-8c911e3d62a430f5630c13d51b47201fa8ff3cd1.tar.bz2
          gallery-dl-8c911e3d62a430f5630c13d51b47201fa8ff3cd1.tar.xz
          gallery-dl-8c911e3d62a430f5630c13d51b47201fa8ff3cd1.tar.zst
New upstream version 1.14.1 (upstream/1.14.1)
-rw-r--r--  CHANGELOG.md                          |  44
-rw-r--r--  PKG-INFO                              |   8
-rw-r--r--  README.rst                            |   6
-rw-r--r--  data/man/gallery-dl.1                 |   2
-rw-r--r--  data/man/gallery-dl.conf.5            |  43
-rw-r--r--  docs/gallery-dl.conf                  |   5
-rw-r--r--  gallery_dl.egg-info/PKG-INFO          |   8
-rw-r--r--  gallery_dl/cloudflare.py              |  29
-rw-r--r--  gallery_dl/extractor/8muses.py        |   2
-rw-r--r--  gallery_dl/extractor/deviantart.py    |   7
-rw-r--r--  gallery_dl/extractor/foolslide.py     |  14
-rw-r--r--  gallery_dl/extractor/furaffinity.py   |  13
-rw-r--r--  gallery_dl/extractor/gfycat.py        |  31
-rw-r--r--  gallery_dl/extractor/kissmanga.py     |  16
-rw-r--r--  gallery_dl/extractor/mangadex.py      |   2
-rw-r--r--  gallery_dl/extractor/mangoxo.py       |   2
-rw-r--r--  gallery_dl/extractor/nhentai.py       |   4
-rw-r--r--  gallery_dl/extractor/realbooru.py     |  12
-rw-r--r--  gallery_dl/extractor/reddit.py        |   6
-rw-r--r--  gallery_dl/extractor/redgifs.py       |  76
-rw-r--r--  gallery_dl/extractor/twitter.py       | 582
-rw-r--r--  gallery_dl/extractor/webtoons.py      |  15
-rw-r--r--  gallery_dl/util.py                    |   7
-rw-r--r--  gallery_dl/version.py                 |   2
-rw-r--r--  test/test_results.py                  |   4
25 files changed, 538 insertions, 402 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index df67569..043d964 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,32 +1,44 @@
# Changelog
-## Unreleased
+## 1.14.1 - 2020-06-12
### Additions
-- [imagechest] add new extractor for imgchest.com (#750)
-- [instagram] add `post_url`, `tags`, `location`, `tagged_users` metadata (#743)
-- [redgifs] add image extractor (#724)
-- [webtoons] add new extractor for webtoons.com (#761)
-- implement `--write-pages` option (#736)
-- extend `path-restrict` option (#662)
-- implement `path-replace` option (#662, #755)
-- make `path` and `keywords` available in logging messages (#574, #575)
+- [furaffinity] add `artist_url` metadata field ([#821](https://github.com/mikf/gallery-dl/issues/821))
+- [redgifs] add `user` and `search` extractors ([#724](https://github.com/mikf/gallery-dl/issues/724))
+### Changes
+- [deviantart] extend `extra` option; also search journals for sta.sh links ([#712](https://github.com/mikf/gallery-dl/issues/712))
+- [twitter] rewrite; use new interface ([#806](https://github.com/mikf/gallery-dl/issues/806), [#740](https://github.com/mikf/gallery-dl/issues/740))
+### Fixes
+- [kissmanga] work around CAPTCHAs ([#818](https://github.com/mikf/gallery-dl/issues/818))
+- [nhentai] fix extraction ([#819](https://github.com/mikf/gallery-dl/issues/819))
+- [webtoons] generalize comic extraction code ([#820](https://github.com/mikf/gallery-dl/issues/820))
+
+## 1.14.0 - 2020-05-31
+### Additions
+- [imagechest] add new extractor for imgchest.com ([#750](https://github.com/mikf/gallery-dl/issues/750))
+- [instagram] add `post_url`, `tags`, `location`, `tagged_users` metadata ([#743](https://github.com/mikf/gallery-dl/issues/743))
+- [redgifs] add image extractor ([#724](https://github.com/mikf/gallery-dl/issues/724))
+- [webtoons] add new extractor for webtoons.com ([#761](https://github.com/mikf/gallery-dl/issues/761))
+- implement `--write-pages` option ([#736](https://github.com/mikf/gallery-dl/issues/736))
+- extend `path-restrict` option ([#662](https://github.com/mikf/gallery-dl/issues/662))
+- implement `path-replace` option ([#662](https://github.com/mikf/gallery-dl/issues/662), [#755](https://github.com/mikf/gallery-dl/issues/755))
+- make `path` and `keywords` available in logging messages ([#574](https://github.com/mikf/gallery-dl/issues/574), [#575](https://github.com/mikf/gallery-dl/issues/575))
### Changes
- [danbooru] change default value of `ugoira` to `false`
- [downloader:ytdl] change default value of `forward-cookies` to `false`
-- [downloader:ytdl] fix file extensions when merging into `.mkv` (#720)
-- write OAuth tokens to cache (#616)
+- [downloader:ytdl] fix file extensions when merging into `.mkv` ([#720](https://github.com/mikf/gallery-dl/issues/720))
+- write OAuth tokens to cache ([#616](https://github.com/mikf/gallery-dl/issues/616))
- use `%APPDATA%\gallery-dl` for config files and cache on Windows
- use `util.Formatter` for formatting logging messages
- reuse HTTP connections from parent extractors
### Fixes
-- [deviantart] use private access tokens for Journals (#738)
+- [deviantart] use private access tokens for Journals ([#738](https://github.com/mikf/gallery-dl/issues/738))
- [gelbooru] simplify and fix pool extraction
- [imgur] fix extraction of animated images without `mp4` entry
- [imgur] treat `/t/unmuted/` URLs as galleries
-- [instagram] fix login with username & password (#756, #771, #797, #803)
-- [reddit] don't send OAuth headers for file downloads (#729)
-- fix/improve Cloudflare bypass code (#728, #757)
-- reset filenames on empty file extensions (#733)
+- [instagram] fix login with username & password ([#756](https://github.com/mikf/gallery-dl/issues/756), [#771](https://github.com/mikf/gallery-dl/issues/771), [#797](https://github.com/mikf/gallery-dl/issues/797), [#803](https://github.com/mikf/gallery-dl/issues/803))
+- [reddit] don't send OAuth headers for file downloads ([#729](https://github.com/mikf/gallery-dl/issues/729))
+- fix/improve Cloudflare bypass code ([#728](https://github.com/mikf/gallery-dl/issues/728), [#757](https://github.com/mikf/gallery-dl/issues/757))
+- reset filenames on empty file extensions ([#733](https://github.com/mikf/gallery-dl/issues/733))
## 1.13.6 - 2020-05-02
### Additions
diff --git a/PKG-INFO b/PKG-INFO
index e520fc1..51e514a 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.14.0
+Version: 1.14.1
Summary: Command-line program to download image-galleries and -collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
@@ -94,8 +94,8 @@ Description: ==========
put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
and run it inside a command prompt (like ``cmd.exe``).
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.bin>`__
These executables include a Python 3.8 interpreter
and all required Python packages.
@@ -302,7 +302,7 @@ Description: ==========
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
- .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.0.tar.gz
+ .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.1.tar.gz
.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
diff --git a/README.rst b/README.rst
index a2258db..b66efb7 100644
--- a/README.rst
+++ b/README.rst
@@ -83,8 +83,8 @@ Download a standalone executable file,
put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
and run it inside a command prompt (like ``cmd.exe``).
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.bin>`__
These executables include a Python 3.8 interpreter
and all required Python packages.
@@ -291,7 +291,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.0.tar.gz
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.1.tar.gz
.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index fe9a684..76a57d1 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2020-05-31" "1.14.0" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2020-06-12" "1.14.1" "gallery-dl Manual"
.\" disable hyphenation
.nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 5a37463..88f8ebc 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2020-05-31" "1.14.0" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2020-06-12" "1.14.1" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -220,7 +220,7 @@ escaped with backslashes, e.g. \f[I]"\\\\[\\\\]"\f[]
.IP "Description:" 4
Controls the behavior when downloading files that have been
downloaded before, i.e. a file with the same filename already
-exists or its ID is in a \f[I]download archive\f[].
+exists or its ID is in a \f[I]download archive <extractor.*.archive_>\f[].
.br
* \f[I]true\f[]: Skip downloads
@@ -604,8 +604,8 @@ current extractor run.
\f[I]false\f[]
.IP "Description:" 4
-Like \f[I]image-unique\f[], but applies to delegated URLs
-like manga-chapters, etc.
+Like \f[I]image-unique <extractor.*.image-unique_>\f[],
+but applies to delegated URLs like manga-chapters, etc.
.SS extractor.*.date-format
.IP "Type:" 6
@@ -664,7 +664,8 @@ Controls the download target for Ugoira posts.
\f[I]false\f[]
.IP "Description:" 4
-Download extra Sta.sh resources from description texts.
+Download extra Sta.sh resources from
+description texts and journals.
Note: Enabling this option also enables deviantart.metadata_.
@@ -1259,6 +1260,22 @@ video extraction and download
.br
* \f[I]false\f[]: Ignore videos
+.SS extractor.redgifs.format
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"mp4"\f[]
+
+.IP "Description:" 4
+The name of the preferred format, which can be one of
+\f[I]"mp4"\f[], \f[I]"webm"\f[], \f[I]"gif"\f[], \f[I]"webp"\f[], \f[I]"mobile"\f[],
+or \f[I]"mini"\f[].
+
+If the selected format is not available, \f[I]"mp4"\f[], \f[I]"webm"\f[]
+and \f[I]"gif"\f[] (in that order) will be tried instead, until an
+available format is found.
+
.SS extractor.sankaku.wait-min & .wait-max
.IP "Type:" 6
\f[I]float\f[]
@@ -1358,16 +1375,6 @@ Possible types are \f[I]text\f[], \f[I]quote\f[], \f[I]link\f[], \f[I]answer\f[]
You can use \f[I]"all"\f[] instead of listing all types separately.
-.SS extractor.twitter.content
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]false\f[]
-
-.IP "Description:" 4
-Extract tweet text as \f[I]content\f[] metadata.
-
.SS extractor.twitter.replies
.IP "Type:" 6
\f[I]bool\f[]
@@ -1409,11 +1416,9 @@ Extract \f[I]TwitPic <https://twitpic.com/>\f[] embeds.
Control video download behavior.
.br
-* \f[I]true\f[]: Download videos and use \f[I]youtube-dl\f[] to handle
-HLS \f[I].m3u8\f[] manifests
+* \f[I]true\f[]: Download videos
.br
-* \f[I]"ytdl"\f[]: Download videos and let \f[I]youtube-dl\f[] handle all of
-video extraction and download
+* \f[I]"ytdl"\f[]: Download videos using \f[I]youtube-dl\f[]
.br
* \f[I]false\f[]: Skip video Tweets
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index c120d25..ae4839d 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -129,6 +129,10 @@
"videos": true,
"user-agent": "Python:gallery-dl:0.8.4 (by /u/mikf1)"
},
+ "redgifs":
+ {
+ "format": "mp4"
+ },
"sankaku":
{
"username": null,
@@ -151,7 +155,6 @@
},
"twitter":
{
- "content": false,
"replies": true,
"retweets": true,
"twitpic": false,
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index c7189b2..3f6f077 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery-dl
-Version: 1.14.0
+Version: 1.14.1
Summary: Command-line program to download image-galleries and -collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
@@ -94,8 +94,8 @@ Description: ==========
put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
and run it inside a command prompt (like ``cmd.exe``).
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.0/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.14.1/gallery-dl.bin>`__
These executables include a Python 3.8 interpreter
and all required Python packages.
@@ -302,7 +302,7 @@ Description: ==========
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
- .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.0.tar.gz
+ .. _stable: https://github.com/mikf/gallery-dl/archive/v1.14.1.tar.gz
.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py
index 0cf5a57..88068d5 100644
--- a/gallery_dl/cloudflare.py
+++ b/gallery_dl/cloudflare.py
@@ -33,11 +33,21 @@ def solve_challenge(session, response, kwargs):
parsed = urllib.parse.urlsplit(response.url)
root = parsed.scheme + "://" + parsed.netloc
+ page = response.text
+ try:
+ params = {"ray": text.extract(page, '?ray=', '"')[0]}
+
+ url = root + "/cdn-cgi/images/trace/jschal/nojs/transparent.gif"
+ session.request("GET", url, params=params)
+
+ url = root + "/cdn-cgi/images/trace/jschal/js/nocookie/transparent.gif"
+ session.request("GET", url, params=params)
+ except Exception:
+ pass
+
cf_kwargs = {}
headers = cf_kwargs["headers"] = collections.OrderedDict()
params = cf_kwargs["data"] = collections.OrderedDict()
-
- page = response.text
url = root + text.unescape(text.extract(page, 'action="', '"')[0])
headers["Referer"] = response.url
@@ -54,23 +64,26 @@ def solve_challenge(session, response, kwargs):
params[name] = value
time.sleep(4)
-
- cf_kwargs["allow_redirects"] = False
cf_response = session.request("POST", url, **cf_kwargs)
+ if cf_response.history:
+ initial_response = cf_response.history[0]
+ else:
+ initial_response = cf_response
+
cookies = {
cookie.name: cookie.value
- for cookie in cf_response.cookies
+ for cookie in initial_response.cookies
}
if not cookies:
import logging
log = logging.getLogger("cloudflare")
- log.debug("Headers:\n%s", cf_response.headers)
- log.debug("Content:\n%s", cf_response.text)
+ log.debug("Headers:\n%s", initial_response.headers)
+ log.debug("Content:\n%s", initial_response.text)
return cf_response, None, None
- domain = next(iter(cf_response.cookies)).domain
+ domain = next(iter(initial_response.cookies)).domain
cookies["__cfduid"] = response.cookies.get("__cfduid", "")
return cf_response, domain, cookies
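
A note on the cookie change above: since the POST no longer sets allow_redirects=False, the Set-Cookie headers of interest arrive on the first response in the redirect chain, not on the final one. A minimal sketch with the requests library (the challenge URL is a stand-in):

    import requests

    # Cookies set by the intermediate redirect live on response.history[0];
    # the final response after following it usually carries none.
    response = requests.post("https://example.org/challenge-form", data={})
    initial = response.history[0] if response.history else response
    cookies = {cookie.name: cookie.value for cookie in initial.cookies}
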
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index dec5972..42fbe12 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -53,7 +53,7 @@ class _8musesAlbumExtractor(Extractor):
"private": False,
},
}),
- ("https://www.8muses.com/comics/album/Fakku-Comics/7?sort=az", {
+ ("https://www.8muses.com/comics/album/Fakku-Comics/8?sort=az", {
"count": ">= 70",
"keyword": {"name": r"re:^[R-Zr-z]"},
}),
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index cda357a..73ef20d 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -121,11 +121,14 @@ class DeviantartExtractor(Extractor):
if "excerpt" in deviation and self.commit_journal:
journal = self.api.deviation_content(deviation["deviationid"])
+ if self.extra:
+ deviation["_journal"] = journal["html"]
yield self.commit_journal(deviation, journal)
if self.extra:
- for match in DeviantartStashExtractor.pattern.finditer(
- deviation.get("description", "")):
+ txt = (deviation.get("description", "") +
+ deviation.get("_journal", ""))
+ for match in DeviantartStashExtractor.pattern.finditer(txt):
url = text.ensure_http_scheme(match.group(0))
deviation["_extractor"] = DeviantartStashExtractor
yield Message.Queue, url, deviation
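
The deviantart change stores the rendered journal HTML on the deviation and scans it together with the description in a single finditer pass. A reduced sketch (the pattern below is a stand-in for DeviantartStashExtractor.pattern):

    import re

    # Stand-in for DeviantartStashExtractor.pattern
    STASH = re.compile(r"(?:https?://)?sta\.sh/[0-9a-z]+")

    description = 'linked: <a href="https://sta.sh/abc123">stash</a>'
    journal = "and another one in the journal body: sta.sh/xyz789"
    urls = [m.group(0) for m in STASH.finditer(description + journal)]
    # ['https://sta.sh/abc123', 'sta.sh/xyz789']
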
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index 3cc263c..86f63ae 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -209,24 +209,24 @@ EXTRACTORS = {
}),
},
"sensescans": {
- "root": "http://sensescans.com/reader",
+ "root": "https://sensescans.com/reader",
"pattern": r"(?:(?:www\.)?sensescans\.com/reader"
r"|reader\.sensescans\.com)",
"test-chapter": (
- (("http://sensescans.com/reader/read/"
+ (("https://sensescans.com/reader/read/"
"magi__labyrinth_of_magic/en/37/369/"), {
- "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812",
+ "url": "8bbc59a995640bbb944c0b1be06a490909b58be1",
"keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988",
}),
- (("http://reader.sensescans.com/read/"
+ (("https://reader.sensescans.com/read/"
"magi__labyrinth_of_magic/en/37/369/"), {
- "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812",
+ "url": "8bbc59a995640bbb944c0b1be06a490909b58be1",
"keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988",
}),
),
"test-manga":
- ("http://sensescans.com/reader/series/hakkenden/", {
- "url": "2360ccb0ead0ff2f5e27b7aef7eb17b9329de2f2",
+ ("https://sensescans.com/reader/series/hakkenden/", {
+ "url": "3e0559029c21ca5af8a2082dd6de1567fcec4d83",
"keyword": "4919f2bfed38e3a34dc984ec8d1dbd7a03044e23",
}),
},
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 9af7274..61226b6 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -48,6 +48,7 @@ class FuraffinityExtractor(Extractor):
extr = text.extract_from(self.request(url).text)
title, _, artist = text.unescape(extr(
'property="og:title" content="', '"')).rpartition(" by ")
+ artist_url = artist.replace("_", "").lower()
path = extr('href="//d.facdn.net/', '"')
if not path:
@@ -64,11 +65,12 @@ class FuraffinityExtractor(Extractor):
rh = text.remove_html
data = text.nameext_from_url(path, {
- "id" : pi(post_id),
- "title" : title,
- "artist": artist,
- "user" : self.user or artist,
- "url" : "https://d.facdn.net/" + path
+ "id" : pi(post_id),
+ "title" : title,
+ "artist" : artist,
+ "artist_url": artist_url,
+ "user" : self.user or artist_url,
+ "url" : "https://d.facdn.net/" + path
})
tags = extr('class="tags-row">', '</section>')
@@ -178,6 +180,7 @@ class FuraffinityPostExtractor(FuraffinityExtractor):
"url": "eae4ef93d99365c69b31a37561bd800c03d336ad",
"keyword": {
"artist" : "mirlinthloth",
+ "artist_url" : "mirlinthloth",
"date" : "dt:2016-11-27 17:24:06",
"description": "A Song made playing the game Cosmic DJ.",
"extension" : "mp3",
diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py
index 2ebbec8..9cd3b95 100644
--- a/gallery_dl/extractor/gfycat.py
+++ b/gallery_dl/extractor/gfycat.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2019 Mike Fährmann
+# Copyright 2017-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -20,8 +20,17 @@ class GfycatExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
+ self.key = match.group(1)
self.formats = (self.config("format", "mp4"), "mp4", "webm", "gif")
+ def items(self):
+ metadata = self.metadata()
+ for gfycat in self.gfycats():
+ url = self._select_format(gfycat)
+ gfycat.update(metadata)
+ yield Message.Directory, gfycat
+ yield Message.Url, url, gfycat
+
def _select_format(self, gfyitem):
for fmt in self.formats:
key = fmt + "Url"
@@ -31,9 +40,11 @@ class GfycatExtractor(Extractor):
return url
return ""
- def _get_info(self, gfycat_id):
- url = "https://api.gfycat.com/v1/gfycats/" + gfycat_id
- return self.request(url).json()["gfyItem"]
+ def metadata(self):
+ return {}
+
+ def gfycats(self):
+ return ()
class GfycatImageExtractor(GfycatExtractor):
@@ -72,12 +83,6 @@ class GfycatImageExtractor(GfycatExtractor):
("https://gfycat.com/ru/UnequaledHastyAnkole"),
)
- def __init__(self, match):
- GfycatExtractor.__init__(self, match)
- self.gfycat_id = match.group(1)
-
- def items(self):
- gfyitem = self._get_info(self.gfycat_id)
- yield Message.Version, 1
- yield Message.Directory, gfyitem
- yield Message.Url, self._select_format(gfyitem), gfyitem
+ def gfycats(self):
+ url = "https://api.gfycat.com/v1/gfycats/" + self.key
+ return (self.request(url).json()["gfyItem"],)
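
With items() moved into the base class, subclasses now only supply gfycats() and metadata(), while _select_format keeps its fallback chain; that chain is also what the new extractor.redgifs.format option documents. A condensed sketch of the fallback (gfyItem field names as in the API objects above):

    # Try the preferred format first, then mp4, webm, gif,
    # until the gfyItem offers a URL for one of them.
    def select_format(gfyitem, preferred="mp4"):
        for fmt in (preferred, "mp4", "webm", "gif"):
            url = gfyitem.get(fmt + "Url")
            if url:
                return url
        return ""

    select_format({"webmUrl": "https://giant.gfycat.com/Example.webm"})
    # -> 'https://giant.gfycat.com/Example.webm'
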
diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py
index 348453d..ade245b 100644
--- a/gallery_dl/extractor/kissmanga.py
+++ b/gallery_dl/extractor/kissmanga.py
@@ -9,9 +9,10 @@
"""Extract manga-chapters and entire manga from https://kissmanga.com/"""
from .common import ChapterExtractor, MangaExtractor, Extractor
-from .. import text, aes, exception
+from .. import text, aes
from ..cache import cache
import hashlib
+import time
import ast
import re
@@ -24,18 +25,7 @@ class RedirectMixin():
response = Extractor.request(self, url, **kwargs)
if not response.history or "/AreYouHuman" not in response.url:
return response
- if self.config("captcha", "stop") == "wait":
- self.log.warning(
- "Redirect to \n%s\nVisit this URL in your browser, solve "
- "the CAPTCHA, and press ENTER to continue", response.url)
- try:
- input()
- except (EOFError, OSError):
- pass
- else:
- raise exception.StopExtraction(
- "Redirect to \n%s\nVisit this URL in your browser and "
- "solve the CAPTCHA to continue", response.url)
+ time.sleep(2)
class KissmangaBase(RedirectMixin):
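
The kissmanga CAPTCHA workaround drops the interactive prompt in favor of a delay. A self-contained sketch of the resulting shape (fetch stands in for Extractor.request; the enclosing retry loop is an assumption, as the hunk only shows the sleep):

    import time

    def request_with_retry(fetch, url, delay=2):
        # Retry until the response is no longer a CAPTCHA redirect.
        while True:
            response = fetch(url)
            if not response.history or "/AreYouHuman" not in response.url:
                return response
            time.sleep(delay)
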
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 72465f7..7e2d613 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -51,7 +51,7 @@ class MangadexChapterExtractor(MangadexExtractor):
test = (
("https://mangadex.org/chapter/122094", {
"keyword": "ef1084c2845825979e150512fed8fdc209baf05a",
- "content": "50383a4c15124682057b197d40261641a98db514",
+ # "content": "50383a4c15124682057b197d40261641a98db514",
}),
# oneshot
("https://mangadex.cc/chapter/138086", {
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
index 8cd7fa5..25fba70 100644
--- a/gallery_dl/extractor/mangoxo.py
+++ b/gallery_dl/extractor/mangoxo.py
@@ -85,7 +85,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
},
"album": {
"id": "lzVOv1Q9",
- "name": "池永康晟 Ikenaga Yasunari 透出古朴气息的日本美女人像画作",
+ "name": "re:池永康晟 Ikenaga Yasunari 透出古朴",
"date": "2019.3.22 14:42",
"description": str,
},
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
index 746144a..fd83328 100644
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -54,8 +54,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
self.data = None
def metadata(self, page):
- data = json.loads(text.extract(page, "N.gallery(", ");")[0])
- self.data = data
+ self.data = data = json.loads(text.parse_unicode_escapes(text.extract(
+ page, 'JSON.parse("', '");')[0]))
title_en = data["title"].get("english", "")
title_ja = data["title"].get("japanese", "")
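
nhentai now embeds its gallery data as a JSON.parse("...") argument full of \u0022-style escapes, so the fix decodes the escapes before parsing. A stdlib-only sketch (unicode_escape is a rough stand-in for text.parse_unicode_escapes and can mangle non-ASCII input):

    import json

    page = 'JSON.parse("{\\u0022title\\u0022: \\u0022Example\\u0022}");'
    raw = page.partition('JSON.parse("')[2].partition('");')[0]
    data = json.loads(raw.encode().decode("unicode_escape"))
    # data == {'title': 'Example'}
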
diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py
index 4841743..1d2140a 100644
--- a/gallery_dl/extractor/realbooru.py
+++ b/gallery_dl/extractor/realbooru.py
@@ -50,10 +50,10 @@ class RealbooruPostExtractor(booru.PostMixin, RealbooruExtractor):
test = ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
"url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
"content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
- "options": (("tags", True),),
- "keyword": {
- "tags_general" : str,
- "tags_metadata": str,
- "tags_model" : "jennifer_lawrence",
- },
+ # "options": (("tags", True),),
+ # "keyword": {
+ # "tags_general" : str,
+ # "tags_metadata": str,
+ # "tags_model" : "jennifer_lawrence",
+ # },
})
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 2e3864a..6331b77 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -324,7 +324,11 @@ class RedditAPI():
self.extractor.wait(seconds=response.headers["x-ratelimit-reset"])
return self._call(endpoint, params)
- data = response.json()
+ try:
+ data = response.json()
+ except ValueError:
+ raise exception.StopExtraction(text.remove_html(response.text))
+
if "error" in data:
if data["error"] == 403:
raise exception.AuthorizationError()
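
The reddit change guards response.json() so an HTML error page surfaces as a clean StopExtraction rather than an unhandled ValueError. The same shape with plain requests (URL illustrative):

    import requests

    response = requests.get("https://oauth.reddit.com/r/pics/hot",
                            headers={"User-Agent": "demo"})
    try:
        data = response.json()
    except ValueError:
        # body was HTML, not JSON: report it instead of crashing
        raise SystemExit(response.text[:200])
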
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 7855eab..dcbbc0d 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -8,22 +8,60 @@
"""Extractors for https://redgifs.com/"""
-from .gfycat import GfycatImageExtractor
+from .gfycat import GfycatExtractor
from ..cache import cache
-class RedgifsImageExtractor(GfycatImageExtractor):
- """Extractor for individual images from redgifs.com"""
+class RedgifsExtractor(GfycatExtractor):
+ """Base class for redgifs extractors"""
category = "redgifs"
+ root = "https://www.redgifs.com/"
+
+
+class RedgifsUserExtractor(RedgifsExtractor):
+ """Extractor for redgifs user profiles"""
+ subcategory = "user"
+ directory_fmt = ("{category}", "{userName}")
+ pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?&#]+)"
+ test = ("https://www.redgifs.com/users/Natalifiction", {
+ "pattern": r"https://thcf\d+\.redgifs\.com/[A-Za-z]+\.mp4",
+ "count": ">= 100",
+ })
+
+ def gfycats(self):
+ return RedgifsAPI(self).user(self.key)
+
+
+class RedgifsSearchExtractor(RedgifsExtractor):
+ """Extractor for redgifs search results"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "Search", "{search}")
+ pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/gifs/browse/([^/?&#]+)"
+ test = ("https://www.redgifs.com/gifs/browse/jav", {
+ "pattern": r"https://thcf\d+\.redgifs\.com/[A-Za-z]+\.mp4",
+ "range": "100-300",
+ "count": "> 200",
+ })
+
+ def metadata(self):
+ self.key = self.key.replace("-", " ")
+ return {"search": self.key}
+
+ def gfycats(self):
+ return RedgifsAPI(self).search(self.key)
+
+
+class RedgifsImageExtractor(RedgifsExtractor):
+ """Extractor for individual gifs from redgifs.com"""
+ subcategory = "image"
pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/watch/([A-Za-z]+)"
test = ("https://redgifs.com/watch/foolishforkedabyssiniancat", {
"pattern": r"https://\w+.redgifs.com/FoolishForkedAbyssiniancat.mp4",
"content": "f6e03f1df9a2ff2a74092f53ee7580d2fb943533",
})
- def _get_info(self, gfycat_id):
- api = RedgifsAPI(self)
- return api.gfycat(gfycat_id)
+ def gfycats(self):
+ return (RedgifsAPI(self).gfycat(self.key),)
class RedgifsAPI():
@@ -36,6 +74,16 @@ class RedgifsAPI():
endpoint = "v1/gfycats/" + gfycat_id
return self._call(endpoint)["gfyItem"]
+ def user(self, user):
+ endpoint = "v1/users/{}/gfycats".format(user.lower())
+ params = {"count": 100}
+ return self._pagination(endpoint, params)
+
+ def search(self, query):
+ endpoint = "v1/gfycats/search"
+ params = {"search_text": query, "count": 150}
+ return self._pagination(endpoint, params)
+
@cache(maxage=3600)
def _authenticate_impl(self):
url = "https://weblogin.redgifs.com/oauth/webtoken"
@@ -52,7 +100,19 @@ class RedgifsAPI():
url, method="POST", headers=headers, json=data)
return "Bearer " + response.json()["access_token"]
- def _call(self, endpoint):
+ def _call(self, endpoint, params=None):
self.headers["Authorization"] = self._authenticate_impl()
url = "https://napi.redgifs.com/" + endpoint
- return self.extractor.request(url, headers=self.headers).json()
+ return self.extractor.request(
+ url, params=params, headers=self.headers).json()
+
+ def _pagination(self, endpoint, params):
+ while True:
+ data = self._call(endpoint, params)
+ gfycats = data["gfycats"]
+ yield from gfycats
+
+ if "found" not in data and len(gfycats) < params["count"] or \
+ not data["gfycats"]:
+ return
+ params["cursor"] = data["cursor"]
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 4c7b757..7cabb8c 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -10,9 +10,9 @@
from .common import Extractor, Message
from .. import text, exception
-from ..cache import cache, memcache
-import json
-import re
+from ..cache import cache
+import hashlib
+import time
class TwitterExtractor(Extractor):
@@ -24,23 +24,15 @@ class TwitterExtractor(Extractor):
cookiedomain = ".twitter.com"
root = "https://twitter.com"
sizes = (":orig", ":large", ":medium", ":small")
- user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64; "
- "Trident/7.0; rv:11.0) like Gecko")
def __init__(self, match):
Extractor.__init__(self, match)
self.user = match.group(1)
- self._user_dict = None
- self.logged_in = False
self.retweets = self.config("retweets", True)
self.replies = self.config("replies", True)
self.twitpic = self.config("twitpic", False)
- self.content = self.config("content", False)
self.videos = self.config("videos", True)
-
- if self.content:
- self._emoji_sub = re.compile(
- r'<img class="Emoji [^>]+ alt="([^"]+)"[^>]*>').sub
+ self._user_cache = {}
def items(self):
self.login()
@@ -48,235 +40,194 @@ class TwitterExtractor(Extractor):
yield Message.Version, 1
for tweet in self.tweets():
- data = self._data_from_tweet(tweet)
- if not data or \
- not self.retweets and data["retweet_id"] or \
- not self.replies and data["reply"]:
+
+ if not self.retweets and "retweeted_status_id_str" in tweet or \
+ not self.replies and "in_reply_to_user_id_str" in tweet:
continue
- data.update(metadata)
- if self.videos and "-videoContainer" in tweet:
- yield Message.Directory, data
+ if self.twitpic:
+ self._extract_twitpic(tweet)
+ if "extended_entities" not in tweet:
+ continue
- if self.videos == "ytdl":
- data["extension"] = None
- url = "ytdl:{}/i/web/status/{}".format(
- self.root, data["tweet_id"])
- else:
- url = self._video_from_tweet(data["tweet_id"])
- if not url:
- continue
- text.nameext_from_url(url, data)
- if data["extension"] == "m3u8":
- url = "ytdl:" + url
- data["extension"] = "mp4"
- data["_ytdl_extra"] = {"protocol": "m3u8_native"}
- data["num"] = 1
- yield Message.Url, url, data
-
- elif "data-image-url=" in tweet:
- yield Message.Directory, data
-
- images = text.extract_iter(
- tweet, 'data-image-url="', '"')
- for data["num"], url in enumerate(images, 1):
- text.nameext_from_url(url, data)
+ tdata = self._transform_tweet(tweet)
+ tdata.update(metadata)
+
+ yield Message.Directory, tdata
+ for tdata["num"], media in enumerate(
+ tweet["extended_entities"]["media"], 1):
+
+ tdata["width"] = media["original_info"].get("width", 0)
+ tdata["height"] = media["original_info"].get("height", 0)
+
+ if "video_info" in media and self.videos:
+
+ if self.videos == "ytdl":
+ url = "ytdl:{}/i/web/status/{}".format(
+ self.root, tweet["id_str"])
+ tdata["extension"] = None
+ yield Message.Url, url, tdata
+
+ else:
+ video_info = media["video_info"]
+ variant = max(
+ video_info["variants"],
+ key=lambda v: v.get("bitrate", 0),
+ )
+ tdata["duration"] = video_info.get(
+ "duration_millis", 0) / 1000
+ tdata["bitrate"] = variant.get("bitrate", 0)
+
+ url = variant["url"]
+ text.nameext_from_url(url, tdata)
+ yield Message.Url, url, tdata
+
+ elif "media_url_https" in media:
+ url = media["media_url_https"]
urls = [url + size for size in self.sizes]
- yield Message.Urllist, urls, data
-
- if self.twitpic and "//twitpic.com/" in tweet:
- urls = [
- url for url in text.extract_iter(
- tweet, 'data-expanded-url="', '"')
- if "//twitpic.com/" in url
- ]
-
- if "num" not in data:
- if urls:
- yield Message.Directory, data
- data["num"] = 0
-
- for data["num"], url in enumerate(urls, data["num"]+1):
- response = self.request(url, fatal=False)
- if response.status_code >= 400:
- continue
- url = text.extract(
- response.text, 'name="twitter:image" value="', '"')[0]
- yield Message.Url, url, text.nameext_from_url(url, data)
+ text.nameext_from_url(url, tdata)
+ yield Message.Urllist, urls, tdata
+
+ else:
+ url = media["media_url"]
+ text.nameext_from_url(url, tdata)
+ yield Message.Url, url, tdata
+
+ def _extract_twitpic(self, tweet):
+ twitpics = []
+ for url in tweet["entities"].get("urls", ()):
+ url = url["expanded_url"]
+ if "//twitpic.com/" in url:
+ response = self.request(url, fatal=False)
+ if response.status_code >= 400:
+ continue
+ url = text.extract(
+ response.text, 'name="twitter:image" value="', '"')[0]
+ twitpics.append({
+ "original_info": {},
+ "media_url" : url,
+ })
+ if twitpics:
+ if "extended_entities" in tweet:
+ tweet["extended_entities"]["media"].extend(twitpics)
+ else:
+ tweet["extended_entities"] = {"media": twitpics}
+
+ def _transform_tweet(self, tweet):
+ entities = tweet["entities"]
+ tdata = {
+ "tweet_id" : text.parse_int(tweet["id_str"]),
+ "retweet_id" : text.parse_int(
+ tweet.get("retweeted_status_id_str")),
+ "quote_id" : text.parse_int(
+ tweet.get("quoted_status_id_str")),
+ "reply_id" : text.parse_int(
+ tweet.get("in_reply_to_status_id_str")),
+ "date" : text.parse_datetime(
+ tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
+ "user" : self._transform_user(tweet["user"]),
+ "lang" : tweet["lang"],
+ "content" : tweet["full_text"],
+ "favorite_count": tweet["favorite_count"],
+ "quote_count" : tweet["quote_count"],
+ "reply_count" : tweet["reply_count"],
+ "retweet_count" : tweet["retweet_count"],
+ }
+
+ hashtags = entities.get("hashtags")
+ if hashtags:
+ tdata["hashtags"] = [t["text"] for t in hashtags]
+
+ mentions = entities.get("user_mentions")
+ if mentions:
+ tdata["mentions"] = [{
+ "id": text.parse_int(u["id_str"]),
+ "name": u["screen_name"],
+ "nick": u["name"],
+ } for u in mentions]
+
+ if "in_reply_to_screen_name" in tweet:
+ tdata["reply_to"] = tweet["in_reply_to_screen_name"]
+
+ if "full_text_quoted" in tweet:
+ tdata["content_quoted"] = tweet["full_text_quoted"]
+
+ if "author" in tweet:
+ tdata["author"] = self._transform_user(tweet["author"])
+
+ return tdata
+
+ def _transform_user(self, user):
+ uid = user["id_str"]
+ cache = self._user_cache
+
+ if uid not in cache:
+ cache[uid] = {
+ "id" : text.parse_int(uid),
+ "name" : user["screen_name"],
+ "nick" : user["name"],
+ "description" : user["description"],
+ "location" : user["location"],
+ "date" : text.parse_datetime(
+ user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
+ "verified" : user.get("verified", False),
+ "profile_banner" : user.get("profile_banner_url", ""),
+ "profile_image" : user.get(
+ "profile_image_url_https", "").replace("_normal.", "."),
+ "favourites_count": user["favourites_count"],
+ "followers_count" : user["followers_count"],
+ "friends_count" : user["friends_count"],
+ "listed_count" : user["listed_count"],
+ "media_count" : user["media_count"],
+ "statuses_count" : user["statuses_count"],
+ }
+ return cache[uid]
def metadata(self):
"""Return general metadata"""
return {}
def tweets(self):
- """Yield HTML content of all relevant tweets"""
+ """Yield all relevant tweet objects"""
def login(self):
username, password = self._get_auth_info()
if username:
self._update_cookies(self._login_impl(username, password))
- self.logged_in = True
@cache(maxage=360*24*3600, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- headers = {"User-Agent": self.user_agent}
- page = self.request(self.root + "/login", headers=headers).text
+ url = "https://mobile.twitter.com/i/nojs_router"
+ params = {"path": "/login"}
+ headers = {"Referer": self.root + "/", "Origin": self.root}
+ page = self.request(
+ url, method="POST", params=params, headers=headers, data={}).text
+
pos = page.index('name="authenticity_token"')
- token = text.extract(page, 'value="', '"', pos-80)[0]
+ token = text.extract(page, 'value="', '"', pos)[0]
- url = self.root + "/sessions"
+ url = "https://mobile.twitter.com/sessions"
data = {
+ "authenticity_token" : token,
"session[username_or_email]": username,
"session[password]" : password,
- "authenticity_token" : token,
- "ui_metrics" : '{"rf":{},"s":""}',
- "scribe_log" : "",
- "redirect_after_login" : "",
"remember_me" : "1",
+ "wfa" : "1",
+ "commit" : "+Log+in+",
+ "ui_metrics" : "",
}
- response = self.request(url, method="POST", headers=headers, data=data)
- if "/error" in response.url:
- raise exception.AuthenticationError()
-
- return {
+ response = self.request(url, method="POST", data=data)
+ cookies = {
cookie.name: cookie.value
for cookie in self.session.cookies
- if cookie.domain and "twitter.com" in cookie.domain
- }
-
- def _data_from_tweet(self, tweet):
- extr = text.extract_from(tweet)
- data = {
- "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')),
- "reply" : bool(extr('data-is-reply-to="' , '"')),
- "retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
- "retweeter" : extr('data-retweeter="' , '"'),
- "author" : {
- "name" : extr('data-screen-name="', '"'),
- "nick" : text.unescape(extr('data-name="' , '"')),
- "id" : text.parse_int(extr('data-user-id="' , '"')),
- },
- }
-
- if not self._user_dict:
- if data["retweet_id"]:
- for user in json.loads(text.unescape(extr(
- 'data-reply-to-users-json="', '"'))):
- if user["screen_name"] == data["retweeter"]:
- break
- else:
- self.log.warning("Unable to extract user info")
- return None
- self._user_dict = {
- "name": user["screen_name"],
- "nick": text.unescape(user["name"]),
- "id" : text.parse_int(user["id_str"]),
- }
- else:
- self._user_dict = data["author"]
-
- data["user"] = self._user_dict
- data["date"] = text.parse_timestamp(extr('data-time="', '"'))
-
- if self.content:
- content = extr('<div class="js-tweet-text-container">', '\n</div>')
- if '<img class="Emoji ' in content:
- content = self._emoji_sub(r"\1", content)
- content = text.unescape(text.remove_html(content, "", ""))
- cl, _, cr = content.rpartition("pic.twitter.com/")
- data["content"] = cl if cl and len(cr) < 16 else content
-
- if extr('<div class="QuoteTweet', '>'):
- data["retweet_id"] = text.parse_int(extr('data-item-id="', '"'))
- data["retweeter"] = data["user"]["name"]
- data["author"] = {
- "name" : extr('data-screen-name="', '"'),
- "id" : text.parse_int(extr('data-user-id="' , '"')),
- "nick" : text.unescape(extr(
- 'QuoteTweet-fullname', '<').partition('>')[2]),
- }
-
- return data
-
- def _video_from_tweet(self, tweet_id):
- url = "https://api.twitter.com/1.1/videos/tweet/config/{}.json".format(
- tweet_id)
- cookies = None
- headers = {
- "Origin" : self.root,
- "Referer" : "{}/i/web/status/{}".format(self.root, tweet_id),
- "x-csrf-token" : self.session.cookies.get("ct0"),
- "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekM"
- "xqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28N"
- "HfOPqkca3qaAxGfsyKCs0wRbw",
- }
-
- if self.logged_in:
- headers["x-twitter-auth-type"] = "OAuth2Session"
- else:
- token = _guest_token(self, headers)
- cookies = {"gt": token}
- headers["x-guest-token"] = token
-
- response = self.request(
- url, cookies=cookies, headers=headers, fatal=None)
-
- if response.status_code == 429 or \
- response.headers.get("x-rate-limit-remaining") == "0":
- if self.logged_in:
- self.wait(until=response.headers.get("x-rate-limit-reset"))
- else:
- _guest_token.invalidate()
- return self._video_from_tweet(tweet_id)
-
- elif response.status_code >= 400:
- self.log.warning("Unable to fetch video data for %s ('%s %s')",
- tweet_id, response.status_code, response.reason)
- return None
-
- return response.json()["track"]["playbackUrl"]
-
- def _tweets_from_api(self, url, max_position=None):
- params = {
- "include_available_features": "1",
- "include_entities": "1",
- "max_position": max_position,
- "reset_error_state": "false",
- "lang": "en",
+ if cookie.domain == self.cookiedomain
}
- headers = {
- "X-Requested-With": "XMLHttpRequest",
- "X-Twitter-Active-User": "yes",
- "Referer": self.root + "/",
- }
-
- while True:
- data = self.request(url, params=params, headers=headers).json()
- if "inner" in data:
- data = data["inner"]
-
- for tweet in text.extract_iter(
- data["items_html"], '<div class="tweet ', '\n</li>'):
- yield tweet
- if data.get("min_position") is None:
- if data["has_more_items"] and "min_position" not in data:
- pass
- else:
- return
-
- if "min_position" in data:
- position = data["min_position"]
- if position == max_position or position is None:
- return
- else:
- position = text.parse_int(text.extract(
- tweet, 'data-tweet-id="', '"')[0])
- if max_position and position >= max_position:
- return
- params["max_position"] = max_position = position
+ if "/error" in response.url or "auth_token" not in cookies:
+ raise exception.AuthenticationError()
+ return cookies
class TwitterTimelineExtractor(TwitterExtractor):
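
One detail of the new items() loop worth isolating: video variants are ranked by bitrate, and bitrate-less entries (HLS playlists) default to 0, so the highest-bitrate MP4 wins. Standalone, with made-up variant data:

    variants = [
        {"url": "https://video.twimg.com/pl/example.m3u8"},  # no bitrate
        {"bitrate": 320000, "url": "https://video.twimg.com/low.mp4"},
        {"bitrate": 2176000, "url": "https://video.twimg.com/high.mp4"},
    ]
    best = max(variants, key=lambda v: v.get("bitrate", 0))
    # best["url"] == 'https://video.twimg.com/high.mp4'
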
@@ -288,15 +239,12 @@ class TwitterTimelineExtractor(TwitterExtractor):
("https://twitter.com/supernaturepics", {
"range": "1-40",
"url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
- "keyword": "4a3d28cc9f7a39e27333d56f3fe19e6e07ee979e",
}),
("https://mobile.twitter.com/supernaturepics?p=i"),
)
def tweets(self):
- url = "{}/i/profiles/show/{}/timeline/tweets".format(
- self.root, self.user)
- return self._tweets_from_api(url)
+ return TwitterAPI(self).timeline_profile(self.user)
class TwitterMediaExtractor(TwitterExtractor):
@@ -313,9 +261,7 @@ class TwitterMediaExtractor(TwitterExtractor):
)
def tweets(self):
- url = "{}/i/profiles/show/{}/media_timeline".format(
- self.root, self.user)
- return self._tweets_from_api(url)
+ return TwitterAPI(self).timeline_media(self.user)
class TwitterSearchExtractor(TwitterExtractor):
@@ -330,12 +276,10 @@ class TwitterSearchExtractor(TwitterExtractor):
})
def metadata(self):
- return {"search": self.user}
+ return {"search": text.unquote(self.user)}
def tweets(self):
- url = "{}/i/search/timeline?f=tweets&q={}".format(
- self.root, self.user)
- return self._tweets_from_api(url, "-1")
+ return TwitterAPI(self).search(self.user)
class TwitterTweetExtractor(TwitterExtractor):
@@ -346,22 +290,19 @@ class TwitterTweetExtractor(TwitterExtractor):
test = (
("https://twitter.com/supernaturepics/status/604341487988576256", {
"url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
- "keyword": "76e018cf3f4c8b82d3bdd425e01e28078c98373b",
"content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
}),
# 4 images
("https://twitter.com/perrypumas/status/894001459754180609", {
"url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
- "keyword": "c9251b1fd79d547b0c6b4577f06c937d0e9b63d2",
}),
# video
("https://twitter.com/perrypumas/status/1065692031626829824", {
"options": (("videos", True),),
- "pattern": r"ytdl:https://video.twimg.com/ext_tw_video/.*.m3u8",
+ "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
}),
# content with emoji, newlines, hashtags (#338)
("https://twitter.com/playpokemon/status/1263832915173048321", {
- "options": (("content", True),),
"keyword": {"content": (
r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
"Gifts! \n\nYou’ll be able to receive four Galarian form "
@@ -386,10 +327,6 @@ class TwitterTweetExtractor(TwitterExtractor):
# quoted tweet (#526)
("https://twitter.com/Pistachio/status/1222690391817932803", {
"pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg",
- "keyword": {
- "author": {"name": "Afro_Herper", "id": 786047748508221440},
- "user" : {"name": "Pistachio" , "id": 3533231},
- },
}),
# TwitPic embeds (#579)
("https://twitter.com/i/web/status/112900228289540096", {
@@ -404,18 +341,7 @@ class TwitterTweetExtractor(TwitterExtractor):
self.tweet_id = match.group(2)
def tweets(self):
- url = "{}/i/web/status/{}".format(self.root, self.tweet_id)
- cookies = {"app_shell_visited": "1"}
- headers = {"User-Agent": self.user_agent, "Referer": url}
-
- response = self.request(url, cookies=cookies, headers=headers)
- if response.history and response.url == self.root + "/":
- raise exception.AuthorizationError()
- page = response.text
-
- end = page.index('class="js-tweet-stats-container')
- beg = page.rindex('<div class="tweet ', 0, end)
- return (page[beg:end],)
+ return TwitterAPI(self).tweet(self.tweet_id)
class TwitterBookmarkExtractor(TwitterExtractor):
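
_transform_user above also memoizes its result per user id, so repeated tweets by the same account reuse a single dict. Reduced to the core pattern (field set trimmed):

    _user_cache = {}

    def transform_user(user):
        uid = user["id_str"]
        if uid not in _user_cache:
            _user_cache[uid] = {
                "id": int(uid),
                "name": user["screen_name"],
                "nick": user["name"],
            }
        return _user_cache[uid]
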
@@ -424,15 +350,26 @@ class TwitterBookmarkExtractor(TwitterExtractor):
pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()"
test = ("https://twitter.com/i/bookmarks",)
- def items(self):
- self.login()
- if not self.logged_in:
- raise exception.AuthorizationError("Login required")
- for cookie in self.session.cookies:
- cookie.expires = None
+ def tweets(self):
+ return TwitterAPI(self).bookmarks()
- url = "https://api.twitter.com/2/timeline/bookmark.json"
- params = {
+
+class TwitterAPI():
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.headers = {
+ "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
+ "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
+ "4FA33AGWWjCpTnA",
+ "x-guest-token": None,
+ "x-twitter-client-language": "en",
+ "x-twitter-active-user": "yes",
+ "x-csrf-token": None,
+ "Origin": "https://twitter.com",
+ "Referer": "https://twitter.com/",
+ }
+ self.params = {
"include_profile_interstitial_type": "1",
"include_blocking": "1",
"include_blocked_by": "1",
@@ -453,47 +390,134 @@ class TwitterBookmarkExtractor(TwitterExtractor):
"include_ext_media_color": "true",
"include_ext_media_availability": "true",
"send_error_codes": "true",
- "simple_quoted_tweets": "true",
+ "simple_quoted_tweet": "true",
+ # "count": "20",
"count": "100",
"cursor": None,
- "ext": "mediaStats%2CcameraMoment",
+ "ext": "mediaStats,highlightedLabel,cameraMoment",
+ "include_quote_count": "true",
}
- headers = {
- "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
- "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
- "4FA33AGWWjCpTnA",
- "Origin": self.root,
- "Referer": self.root + "/i/bookmarks",
- "x-csrf-token": self.session.cookies.get("ct0"),
- "x-twitter-active-user": "yes",
- "x-twitter-auth-type": "OAuth2Session",
- "x-twitter-client-language": "en",
+
+ cookies = self.extractor.session.cookies
+
+ # CSRF
+ csrf = hashlib.md5(str(time.time()).encode()).hexdigest()
+ self.headers["x-csrf-token"] = csrf
+ cookies.set("ct0", csrf, domain=".twitter.com")
+
+ if cookies.get("auth_token", domain=".twitter.com"):
+ self.headers["x-twitter-auth-type"] = "OAuth2Session"
+ else:
+ # guest token
+ guest_token = _guest_token(self.extractor, self.headers)
+ self.headers["x-guest-token"] = guest_token
+ cookies.set("gt", guest_token, domain=".twitter.com")
+
+ def tweet(self, tweet_id):
+ endpoint = "2/timeline/conversation/{}.json".format(tweet_id)
+ for tweet in self._pagination(endpoint):
+ if tweet["id_str"] == tweet_id:
+ return (tweet,)
+ return ()
+
+ def timeline_profile(self, screen_name):
+ user = self.user_by_screen_name(screen_name)
+ endpoint = "2/timeline/profile/{}.json".format(user["rest_id"])
+ return self._pagination(endpoint)
+
+ def timeline_media(self, screen_name):
+ user = self.user_by_screen_name(screen_name)
+ endpoint = "2/timeline/media/{}.json".format(user["rest_id"])
+ return self._pagination(endpoint)
+
+ def search(self, query):
+ endpoint = "2/search/adaptive.json"
+ params = self.params.copy()
+ params["q"] = text.unquote(query)
+ return self._pagination(
+ endpoint, params, "sq-I-t-", "sq-cursor-bottom")
+
+ def bookmarks(self):
+ endpoint = "2/timeline/bookmark.json"
+ return self._pagination(endpoint)
+
+ def user_by_screen_name(self, screen_name):
+ endpoint = "graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName"
+ params = {
+ "variables": '{"screen_name":"' + screen_name + '"'
+ ',"withHighlightedLabel":true}'
}
+ return self._call(endpoint, params)["data"]["user"]
+
+ def _call(self, endpoint, params):
+ url = "https://api.twitter.com/" + endpoint
+ response = self.extractor.request(
+ url, params=params, headers=self.headers, fatal=None)
+ if response.status_code < 400:
+ return response.json()
+ if response.status_code == 429:
+ self.extractor.wait(until=response.headers["x-rate-limit-reset"])
+ return self._call(endpoint, params)
+ raise exception.StopExtraction(
+ "%s %s (%s)", response.status_code, response.reason, response.text)
+
+ def _pagination(self, endpoint, params=None,
+ entry_tweet="tweet-", entry_cursor="cursor-bottom-"):
+ if params is None:
+ params = self.params.copy()
while True:
- response = self.request(
- url, params=params, headers=headers, fatal=False)
- if response.status_code >= 400:
- raise exception.StopExtraction(response.text)
- data = response.json()
- tweets = data["globalObjects"]["tweets"]
+ cursor = tweet = None
+ data = self._call(endpoint, params)
- if not tweets:
+ instr = data["timeline"]["instructions"]
+ if not instr:
return
- for tweet_id, tweet_data in tweets.items():
- tweet_url = "{}/i/web/status/{}".format(self.root, tweet_id)
- tweet_data["_extractor"] = TwitterTweetExtractor
- yield Message.Queue, tweet_url, tweet_data
+ tweets = data["globalObjects"]["tweets"]
+ users = data["globalObjects"]["users"]
+
+ for entry in instr[0]["addEntries"]["entries"]:
- inst = data["timeline"]["instructions"][0]
- for entry in inst["addEntries"]["entries"]:
- if entry["entryId"].startswith("cursor-bottom-"):
- params["cursor"] = \
- entry["content"]["operation"]["cursor"]["value"]
- break
+ if entry["entryId"].startswith(entry_tweet):
+ tid = entry["content"]["item"]["content"]["tweet"]["id"]
+ if tid not in tweets:
+ self.extractor.log.debug(
+ "Skipping unavailable Tweet %s", tid)
+ continue
+ tweet = tweets[tid]
+ tweet["user"] = users[tweet["user_id_str"]]
+
+ if "quoted_status_id_str" in tweet:
+ quoted = tweets.get(tweet["quoted_status_id_str"])
+ if quoted:
+ tweet["full_text_quoted"] = quoted["full_text"]
+ if "extended_entities" in quoted:
+ tweet["extended_entities"] = \
+ quoted["extended_entities"]
+ elif "retweeted_status_id_str" in tweet:
+ retweet = tweets.get(tweet["retweeted_status_id_str"])
+ if retweet:
+ tweet["author"] = users[retweet["user_id_str"]]
+
+ yield tweet
+
+ elif entry["entryId"].startswith(entry_cursor):
+ cursor = entry["content"]["operation"]["cursor"]
+ if not cursor.get("stopOnEmptyResponse"):
+ # keep going even if there are no tweets
+ tweet = True
+ cursor = cursor["value"]
+
+ if "replaceEntry" in instr[-1] :
+ cursor = (instr[-1]["replaceEntry"]["entry"]
+ ["content"]["operation"]["cursor"]["value"])
+
+ if not cursor or not tweet:
+ return
+ params["cursor"] = cursor
-@memcache()
+@cache(maxage=3600)
def _guest_token(extr, headers):
return extr.request(
"https://api.twitter.com/1.1/guest/activate.json",
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 86ada49..3b992a2 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -35,7 +35,7 @@ class WebtoonsEpisodeExtractor(WebtoonsExtractor):
filename_fmt = "{episode}-{num:>02}.{extension}"
archive_fmt = "{title_no}_{episode}_{num}"
pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+)/(?:[^/?&#]+)"
- r"/viewer(?:\?([^#]+))")
+ r"/viewer(?:\?([^#'\"]+))")
test = (
(("https://www.webtoons.com/en/comedy/safely-endangered"
"/ep-572-earth/viewer?title_no=352&episode_no=572"), {
@@ -111,6 +111,11 @@ class WebtoonsComicExtractor(WebtoonsExtractor):
"list?title_no=1845&page=3"), {
"count": ">= 15",
}),
+ # (#820)
+ (("https://www.webtoons.com/en/challenge/scoob-and-shag/"
+ "list?title_no=210827&page=9"), {
+ "count": ">= 18",
+ }),
)
def __init__(self, match):
@@ -143,6 +148,8 @@ class WebtoonsComicExtractor(WebtoonsExtractor):
@staticmethod
def get_episode_urls(page):
"""Extract and return all episode urls in 'page'"""
- pos = page.find('id="_listUl"')
- return text.extract_iter(
- page, '<a href="', '" class="NPI=a:list', pos)
+ page = text.extract(page, 'id="_listUl"', '</ul>')[0]
+ return [
+ match.group(0)
+ for match in WebtoonsEpisodeExtractor.pattern.finditer(page)
+ ]
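
get_episode_urls() now bounds the search to the _listUl element and reuses the episode URL pattern itself instead of scraping anchor attributes. A simplified sketch with a stand-in pattern:

    import re

    # Stand-in for WebtoonsEpisodeExtractor.pattern
    EPISODE = re.compile(r"https://www\.webtoons\.com/[^\"]+?/viewer\?[^#\"]+")

    page = ('<ul id="_listUl"><li><a href='
            '"https://www.webtoons.com/en/a/b/ep-1/viewer?episode_no=1">'
            '</a></li></ul>')
    section = page.partition('id="_listUl"')[2].partition("</ul>")[0]
    urls = [m.group(0) for m in EPISODE.finditer(section)]
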
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 85b871b..afd96b8 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -84,6 +84,13 @@ def filter_dict(a):
return {k: v for k, v in a.items() if k[0] != "_"}
+def delete_items(obj, keys):
+ """Remove all 'keys' from 'obj'"""
+ for key in keys:
+ if key in obj:
+ del obj[key]
+
+
def number_to_string(value, numbers=(int, float)):
"""Convert numbers (int, float) to string; Return everything else as is."""
return str(value) if value.__class__ in numbers else value
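
The new util.delete_items() helper removes a set of keys while tolerating missing ones; a quick usage sketch:

    def delete_items(obj, keys):
        """Remove all 'keys' from 'obj'"""
        for key in keys:
            if key in obj:
                del obj[key]

    record = {"a": 1, "b": 2, "_private": 3}
    delete_items(record, ("b", "missing", "_private"))
    # record == {"a": 1}
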
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index dd6f373..8509e1e 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.14.0"
+__version__ = "1.14.1"
diff --git a/test/test_results.py b/test/test_results.py
index 5bef1a4..196d859 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -31,10 +31,10 @@ TRAVIS_SKIP = {
# temporary issues, etc.
BROKEN = {
- "e621",
"imagevenue",
- "jaiminisbox",
"photobucket",
+ "seiga",
+ "twitter",
"worldthree",
}