author     Unit 193 <unit193@unit193.net>    2020-12-13 23:07:48 -0500
committer  Unit 193 <unit193@unit193.net>    2020-12-13 23:07:48 -0500
commit     91a9e964cb02f9c0d6e5c3b254378778f295b6b3 (patch)
tree       9befdc2987a39d015c9829f836002de3eda8253f
parent     9a430174e9ee690675d3772b27994b60041de51a (diff)
parent     8f7c87a2697113134c311aaeafd9c919555a2741 (diff)
download   gallery-dl-91a9e964cb02f9c0d6e5c3b254378778f295b6b3.tar.bz2
           gallery-dl-91a9e964cb02f9c0d6e5c3b254378778f295b6b3.tar.xz
           gallery-dl-91a9e964cb02f9c0d6e5c3b254378778f295b6b3.tar.zst
Update upstream source from tag 'upstream/1.16.0'
Update to upstream version '1.16.0' with Debian dir a323dcdcc7c5a90d7d0ad623d392e091f56dbe52
-rw-r--r--  CHANGELOG.md | 26
-rw-r--r--  PKG-INFO | 63
-rw-r--r--  README.rst | 58
-rw-r--r--  data/man/gallery-dl.1 | 2
-rw-r--r--  data/man/gallery-dl.conf.5 | 87
-rw-r--r--  docs/gallery-dl.conf | 7
-rw-r--r--  gallery_dl.egg-info/PKG-INFO | 63
-rw-r--r--  gallery_dl.egg-info/SOURCES.txt | 7
-rw-r--r--  gallery_dl/downloader/http.py | 212
-rw-r--r--  gallery_dl/extractor/3dbooru.py | 39
-rw-r--r--  gallery_dl/extractor/__init__.py | 8
-rw-r--r--  gallery_dl/extractor/booru.py | 381
-rw-r--r--  gallery_dl/extractor/common.py | 65
-rw-r--r--  gallery_dl/extractor/danbooru.py | 4
-rw-r--r--  gallery_dl/extractor/e621.py | 15
-rw-r--r--  gallery_dl/extractor/flickr.py | 47
-rw-r--r--  gallery_dl/extractor/foolfuuka.py | 4
-rw-r--r--  gallery_dl/extractor/foolslide.py | 3
-rw-r--r--  gallery_dl/extractor/gelbooru.py | 111
-rw-r--r--  gallery_dl/extractor/hentainexus.py | 36
-rw-r--r--  gallery_dl/extractor/hypnohub.py | 68
-rw-r--r--  gallery_dl/extractor/idolcomplex.py | 238
-rw-r--r--  gallery_dl/extractor/imagehosts.py | 6
-rw-r--r--  gallery_dl/extractor/instagram.py | 818
-rw-r--r--  gallery_dl/extractor/konachan.py | 85
-rw-r--r--  gallery_dl/extractor/mangadex.py | 12
-rw-r--r--  gallery_dl/extractor/moebooru.py | 257
-rw-r--r--  gallery_dl/extractor/nozomi.py | 13
-rw-r--r--  gallery_dl/extractor/paheal.py | 24
-rw-r--r--  gallery_dl/extractor/piczel.py | 10
-rw-r--r--  gallery_dl/extractor/reactor.py | 6
-rw-r--r--  gallery_dl/extractor/realbooru.py | 59
-rw-r--r--  gallery_dl/extractor/rule34.py | 63
-rw-r--r--  gallery_dl/extractor/safebooru.py | 61
-rw-r--r--  gallery_dl/extractor/sankaku.py | 332
-rw-r--r--  gallery_dl/extractor/shopify.py | 4
-rw-r--r--  gallery_dl/extractor/twitter.py | 59
-rw-r--r--  gallery_dl/extractor/webtoons.py | 13
-rw-r--r--  gallery_dl/extractor/yandere.py | 68
-rw-r--r--  gallery_dl/job.py | 64
-rw-r--r--  gallery_dl/option.py | 3
-rw-r--r--  gallery_dl/postprocessor/classify.py | 5
-rw-r--r--  gallery_dl/postprocessor/common.py | 20
-rw-r--r--  gallery_dl/postprocessor/compare.py | 19
-rw-r--r--  gallery_dl/postprocessor/exec.py | 72
-rw-r--r--  gallery_dl/postprocessor/metadata.py | 64
-rw-r--r--  gallery_dl/postprocessor/mtime.py | 1
-rw-r--r--  gallery_dl/postprocessor/ugoira.py | 5
-rw-r--r--  gallery_dl/postprocessor/zip.py | 13
-rw-r--r--  gallery_dl/util.py | 10
-rw-r--r--  gallery_dl/version.py | 2
-rw-r--r--  setup.py | 5
-rw-r--r--  test/test_cookies.py | 10
-rw-r--r--  test/test_postprocessor.py | 115
-rw-r--r--  test/test_results.py | 7
55 files changed, 2005 insertions, 1844 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e08f243..c536269 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,31 @@
# Changelog
+## 1.16.0 - 2020-12-12
+### Additions
+- [booru] implement generalized extractors for `*booru` and `moebooru` sites
+ - add support for sakugabooru.com ([#1136](https://github.com/mikf/gallery-dl/issues/1136))
+ - add support for lolibooru.moe ([#1050](https://github.com/mikf/gallery-dl/issues/1050))
+ - provide formattable `date` metadata fields ([#1138](https://github.com/mikf/gallery-dl/issues/1138))
+- [postprocessor:metadata] add `event` and `filename` options ([#315](https://github.com/mikf/gallery-dl/issues/315), [#866](https://github.com/mikf/gallery-dl/issues/866), [#984](https://github.com/mikf/gallery-dl/issues/984))
+- [postprocessor:exec] add `event` option ([#992](https://github.com/mikf/gallery-dl/issues/992))
+### Changes
+- [flickr] update default directories and improve metadata consistency ([#828](https://github.com/mikf/gallery-dl/issues/828))
+- [sankaku] use API endpoints from `beta.sankakucomplex.com`
+- [downloader:http] improve filename extension handling ([#776](https://github.com/mikf/gallery-dl/issues/776))
+- replace all JPEG filename extensions with `jpg` by default
+### Fixes
+- [hentainexus] fix extraction ([#1166](https://github.com/mikf/gallery-dl/issues/1166))
+- [instagram] rewrite ([#1113](https://github.com/mikf/gallery-dl/issues/1113), [#1122](https://github.com/mikf/gallery-dl/issues/1122), [#1128](https://github.com/mikf/gallery-dl/issues/1128), [#1130](https://github.com/mikf/gallery-dl/issues/1130), [#1149](https://github.com/mikf/gallery-dl/issues/1149))
+- [mangadex] handle external chapters ([#1154](https://github.com/mikf/gallery-dl/issues/1154))
+- [nozomi] handle empty `date` fields ([#1163](https://github.com/mikf/gallery-dl/issues/1163))
+- [paheal] create directory for each post ([#1147](https://github.com/mikf/gallery-dl/issues/1147))
+- [piczel] update API URLs
+- [twitter] update image URL format ([#1145](https://github.com/mikf/gallery-dl/issues/1145))
+- [twitter] improve `x-csrf-token` header handling ([#1170](https://github.com/mikf/gallery-dl/issues/1170))
+- [webtoons] update `ageGate` cookies
+### Removals
+- [sankaku] remove login support
+
## 1.15.4 - 2020-11-27
### Fixes
- [2chan] skip external links
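
The `event` and `filename` options added to the `metadata` post-processor in this release (and the matching `event` option for `exec`) are documented in the man-page changes further down. As a quick illustration, a minimal post-processor entry using them might look as follows; the surrounding config layout and the `{id}.data.json` pattern are assumptions chosen for the example, not part of this diff.

.. code:: python

    # Minimal sketch (assumed config layout): a "metadata" post-processor
    # entry using the new 1.16.0 "event" and "filename" options.
    metadata_postprocessor = {
        "name": "metadata",
        "event": "file",               # default; see metadata.event below
        "filename": "{id}.data.json",  # makes extension/extension-format be ignored
    }

    # This dict would go into the "postprocessors" list of an extractor
    # section in gallery-dl's config.json (assumed placement).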
diff --git a/PKG-INFO b/PKG-INFO
index db9cba2..049e111 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,7 +1,7 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.15.4
-Summary: Command-line program to download image-galleries and -collections from several image hosting sites
+Version: 1.16.0
+Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
Author-email: mike_faehrmann@web.de
@@ -13,8 +13,8 @@ Description: ==========
gallery-dl
==========
- *gallery-dl* is a command-line program to download image-galleries and
- -collections from several image hosting sites (see `Supported Sites`_).
+ *gallery-dl* is a command-line program to download image galleries and
+ collections from several image hosting sites (see `Supported Sites`_).
It is a cross-platform tool with many configuration options
and powerful filenaming capabilities.
@@ -46,14 +46,14 @@ Description: ==========
.. code:: bash
- $ python3 -m pip install --upgrade gallery-dl
+ $ python3 -m pip install -U gallery-dl
- Installing the latest dev-version directly from GitHub can be done with
+ Installing the latest dev version directly from GitHub can be done with
pip_ as well:
.. code:: bash
- $ python3 -m pip install --upgrade https://github.com/mikf/gallery-dl/archive/master.tar.gz
+ $ python3 -m pip install -U -I --no-deps --no-cache-dir https://github.com/mikf/gallery-dl/archive/master.tar.gz
Note: Windows users should use :code:`py -3` instead of :code:`python3`.
@@ -94,10 +94,10 @@ Description: ==========
put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
and run it inside a command prompt (like ``cmd.exe``).
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.4/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.4/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.bin>`__
- These executables include a Python 3.8 interpreter
+ These executables include a Python interpreter
and all required Python packages.
@@ -192,19 +192,22 @@ Description: ==========
see gallery-dl-example.conf_.
| A list of all available configuration options and their
descriptions can be found in configuration.rst_.
+ |
*gallery-dl* searches for configuration files in the following places:
- +--------------------------------------------+------------------------------------------+
- | Linux | Windows |
- +--------------------------------------------+------------------------------------------+
- |* ``/etc/gallery-dl.conf`` |* ``%APPDATA%\gallery-dl\config.json`` |
- |* ``${HOME}/.config/gallery-dl/config.json``|* ``%USERPROFILE%\gallery-dl\config.json``|
- |* ``${HOME}/.gallery-dl.conf`` |* ``%USERPROFILE%\gallery-dl.conf`` |
- +--------------------------------------------+------------------------------------------+
+ Windows:
+ * ``%APPDATA%\gallery-dl\config.json``
+ * ``%USERPROFILE%\gallery-dl\config.json``
+ * ``%USERPROFILE%\gallery-dl.conf``
- (``%USERPROFILE%`` usually refers to the user's home directory,
- i.e. ``C:\Users\<username>\``)
+ (``%USERPROFILE%`` usually refers to the user's home directory,
+ i.e. ``C:\Users\<username>\``)
+
+ Linux, macOS, etc.:
+ * ``/etc/gallery-dl.conf``
+ * ``${HOME}/.config/gallery-dl/config.json``
+ * ``${HOME}/.gallery-dl.conf``
Values in later configuration files will override previous ones.
@@ -224,9 +227,18 @@ Description: ==========
a username & password pair. This is necessary for
``pixiv``, ``nijie``, and ``seiga``
and optional for
- ``aryion``, ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``,
- ``instagram``, ``luscious``, ``pinterest``, ``sankaku``, ``subscribestar``,
- ``tsumino``, and ``twitter``.
+ ``aryion``,
+ ``danbooru``,
+ ``e621``,
+ ``exhentai``,
+ ``idolcomplex``,
+ ``inkbunny``,
+ ``instagram``,
+ ``luscious``,
+ ``pinterest``,
+ ``subscribestar``,
+ ``tsumino``,
+ and ``twitter``.
You can set the necessary information in your configuration file
(cf. gallery-dl.conf_)
@@ -319,7 +331,7 @@ Description: ==========
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
- .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.4.tar.gz
+ .. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.0.tar.gz
.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
@@ -337,8 +349,8 @@ Description: ==========
.. |pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg
:target: https://pypi.org/project/gallery-dl/
- .. |build| image:: https://travis-ci.com/mikf/gallery-dl.svg?branch=master
- :target: https://travis-ci.com/mikf/gallery-dl
+ .. |build| image:: https://github.com/mikf/gallery-dl/workflows/tests/badge.svg
+ :target: https://github.com/mikf/gallery-dl/actions
.. |gitter| image:: https://badges.gitter.im/gallery-dl/main.svg
:target: https://gitter.im/gallery-dl/main
@@ -357,6 +369,7 @@ Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Multimedia :: Graphics
diff --git a/README.rst b/README.rst
index 1cde544..4bfb821 100644
--- a/README.rst
+++ b/README.rst
@@ -2,8 +2,8 @@
gallery-dl
==========
-*gallery-dl* is a command-line program to download image-galleries and
--collections from several image hosting sites (see `Supported Sites`_).
+*gallery-dl* is a command-line program to download image galleries and
+collections from several image hosting sites (see `Supported Sites`_).
It is a cross-platform tool with many configuration options
and powerful filenaming capabilities.
@@ -35,14 +35,14 @@ easily installed or upgraded using pip_:
.. code:: bash
- $ python3 -m pip install --upgrade gallery-dl
+ $ python3 -m pip install -U gallery-dl
-Installing the latest dev-version directly from GitHub can be done with
+Installing the latest dev version directly from GitHub can be done with
pip_ as well:
.. code:: bash
- $ python3 -m pip install --upgrade https://github.com/mikf/gallery-dl/archive/master.tar.gz
+ $ python3 -m pip install -U -I --no-deps --no-cache-dir https://github.com/mikf/gallery-dl/archive/master.tar.gz
Note: Windows users should use :code:`py -3` instead of :code:`python3`.
@@ -83,10 +83,10 @@ Download a standalone executable file,
put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
and run it inside a command prompt (like ``cmd.exe``).
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.4/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.4/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.bin>`__
-These executables include a Python 3.8 interpreter
+These executables include a Python interpreter
and all required Python packages.
@@ -181,19 +181,22 @@ Configuration files for *gallery-dl* use a JSON-based file format.
see gallery-dl-example.conf_.
| A list of all available configuration options and their
descriptions can be found in configuration.rst_.
+|
*gallery-dl* searches for configuration files in the following places:
-+--------------------------------------------+------------------------------------------+
-| Linux | Windows |
-+--------------------------------------------+------------------------------------------+
-|* ``/etc/gallery-dl.conf`` |* ``%APPDATA%\gallery-dl\config.json`` |
-|* ``${HOME}/.config/gallery-dl/config.json``|* ``%USERPROFILE%\gallery-dl\config.json``|
-|* ``${HOME}/.gallery-dl.conf`` |* ``%USERPROFILE%\gallery-dl.conf`` |
-+--------------------------------------------+------------------------------------------+
+Windows:
+ * ``%APPDATA%\gallery-dl\config.json``
+ * ``%USERPROFILE%\gallery-dl\config.json``
+ * ``%USERPROFILE%\gallery-dl.conf``
-(``%USERPROFILE%`` usually refers to the user's home directory,
-i.e. ``C:\Users\<username>\``)
+ (``%USERPROFILE%`` usually refers to the user's home directory,
+ i.e. ``C:\Users\<username>\``)
+
+Linux, macOS, etc.:
+ * ``/etc/gallery-dl.conf``
+ * ``${HOME}/.config/gallery-dl/config.json``
+ * ``${HOME}/.gallery-dl.conf``
Values in later configuration files will override previous ones.
@@ -213,9 +216,18 @@ Some extractors require you to provide valid login credentials in the form of
a username & password pair. This is necessary for
``pixiv``, ``nijie``, and ``seiga``
and optional for
-``aryion``, ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``,
-``instagram``, ``luscious``, ``pinterest``, ``sankaku``, ``subscribestar``,
-``tsumino``, and ``twitter``.
+``aryion``,
+``danbooru``,
+``e621``,
+``exhentai``,
+``idolcomplex``,
+``inkbunny``,
+``instagram``,
+``luscious``,
+``pinterest``,
+``subscribestar``,
+``tsumino``,
+and ``twitter``.
You can set the necessary information in your configuration file
(cf. gallery-dl.conf_)
@@ -308,7 +320,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.4.tar.gz
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.0.tar.gz
.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
@@ -326,8 +338,8 @@ access to *gallery-dl*. Authorize it and you will be shown one or more
.. |pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg
:target: https://pypi.org/project/gallery-dl/
-.. |build| image:: https://travis-ci.com/mikf/gallery-dl.svg?branch=master
- :target: https://travis-ci.com/mikf/gallery-dl
+.. |build| image:: https://github.com/mikf/gallery-dl/workflows/tests/badge.svg
+ :target: https://github.com/mikf/gallery-dl/actions
.. |gitter| image:: https://badges.gitter.im/gallery-dl/main.svg
:target: https://gitter.im/gallery-dl/main
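
The README changes above replace the Linux/Windows table of configuration-file locations with two plain lists and trim the list of extractors that accept optional credentials. Below is a small sketch of creating a configuration file at one of the listed Linux locations, with placeholder credentials for one of the extractors from that list; the chosen extractor and values are illustrative only.

.. code:: python

    import json
    import os

    # Sketch: write ${HOME}/.config/gallery-dl/config.json, one of the
    # search locations listed above. Username/password are placeholders.
    config = {
        "extractor": {
            "twitter": {
                "username": "your-username",
                "password": "your-password",
            }
        }
    }

    path = os.path.expanduser("~/.config/gallery-dl/config.json")
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as fp:
        json.dump(config, fp, indent=4)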
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 114502a..af6eaf3 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2020-11-27" "1.15.4" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2020-12-12" "1.16.0" "gallery-dl Manual"
.\" disable hyphenation
.nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 34ac377..8c291fb 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2020-11-27" "1.15.4" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2020-12-12" "1.16.0" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -222,9 +222,6 @@ escaped with backslashes, e.g. \f[I]"\\\\[\\\\]"\f[]
\f[I]object\f[]
.IP "Default:" 9
-\f[I]null\f[]
-
-.IP "Example:" 4
.. code:: json
{
@@ -236,7 +233,7 @@ escaped with backslashes, e.g. \f[I]"\\\\[\\\\]"\f[]
}
.IP "Description:" 4
-A JSON \f[I]object\f[] mapping filename extensions to alternatives.
+A JSON \f[I]object\f[] mapping filename extensions to their replacements.
.SS extractor.*.skip
@@ -349,8 +346,6 @@ and optional for
.br
* \f[I]pinterest\f[]
.br
-* \f[I]sankaku\f[]
-.br
* \f[I]subscribestar\f[]
.br
* \f[I]tsumino\f[]
@@ -1537,21 +1532,6 @@ and \f[I]"gif"\f[] (in that order) will be tried instead, until an
available format is found.
-.SS extractor.sankaku.wait-min & .wait-max
-.IP "Type:" 6
-\f[I]float\f[]
-
-.IP "Default:" 9
-\f[I]3.0\f[] and \f[I]6.0\f[]
-
-.IP "Description:" 4
-Minimum and maximum wait time in seconds between each image
-
-Sankaku Channel responds with \f[I]429 Too Many Requests\f[] if it
-receives too many HTTP requests in a certain amount of time.
-Waiting a few seconds between each request tries to prevent that.
-
-
.SS extractor.sankakucomplex.embeds
.IP "Type:" 6
\f[I]bool\f[]
@@ -2243,7 +2223,7 @@ The command to run.
* If this is a \f[I]string\f[], it will be executed using the system's
shell, e.g. \f[I]/bin/sh\f[]. Any \f[I]{}\f[] will be replaced
with the full path of a file or target directory, depending on
-\f[I]exec.final\f[]
+\f[I]exec.event\f[]
.br
* If this is a \f[I]list\f[], the first element specifies the program
@@ -2253,17 +2233,17 @@ the files' metadata as well as \f[I]{_path}\f[], \f[I]{_directory}\f[],
and \f[I]{_filename}\f[].
-.SS exec.final
+.SS exec.event
.IP "Type:" 6
-\f[I]bool\f[]
+\f[I]string\f[]
.IP "Default:" 9
-\f[I]false\f[]
+\f[I]"after"\f[]
.IP "Description:" 4
-Controls whether to execute \f[I]exec.command\f[] for each
-downloaded file or only once after all files
-have been downloaded successfully.
+The event for which \f[I]exec.command\f[] is run.
+
+See \f[I]metadata.event\f[] for a list of available events.
.SS metadata.mode
@@ -2286,6 +2266,24 @@ Select how to write metadata.
to a file's metadata dictionary
+.SS metadata.filename
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]null\f[]
+
+.IP "Example:" 4
+"{id}.data.json"
+
+.IP "Description:" 4
+A \f[I]format string\f[] to build the filenames for metadata files with.
+(see \f[I]extractor.filename\f[])
+
+If this option is set, \f[I]metadata.extension\f[] and
+\f[I]metadata.extension-format\f[] will be ignored.
+
+
.SS metadata.directory
.IP "Type:" 6
\f[I]string\f[]
@@ -2330,6 +2328,37 @@ files with, which will replace the original filename extensions.
Note: \f[I]metadata.extension\f[] is ignored if this option is set.
+.SS metadata.event
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"file"\f[]
+
+.IP "Description:" 4
+The event for which metadata gets written to a file.
+
+The available events are:
+
+\f[I]init\f[]
+After post processor initialization
+and before the first file download
+\f[I]finalize\f[]
+On extractor shutdown, e.g. after all files were downloaded
+\f[I]prepare\f[]
+Before a file download
+\f[I]file\f[]
+When completing a file download,
+but before it gets moved to its target location
+\f[I]after\f[]
+After a file got moved to its target location
+\f[I]skip\f[]
+When skipping a file download
+\f[I]post\f[]
+When starting to download all files of a post,
+e.g. a Tweet on Twitter or a post on Patreon.
+
+
.SS metadata.content-format
.IP "Type:" 6
\f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[]
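
The new `exec.event` option above defaults to ``"after"`` and accepts the same event names listed under `metadata.event`. A hedged example of an ``exec`` post-processor entry built from those documented pieces (the command itself is an arbitrary illustration):

.. code:: python

    # Sketch (assumed config layout): an "exec" post-processor entry using
    # the new exec.event option. With a list command, the first element is
    # the program and "{_path}" is replaced with the downloaded file's path.
    exec_postprocessor = {
        "name": "exec",
        "event": "after",                # run once a file was moved into place
        "command": ["echo", "{_path}"],  # illustrative command only
    }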
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index ecb9f9b..bc9999b 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -128,13 +128,6 @@
{
"format": "mp4"
},
- "sankaku":
- {
- "username": null,
- "password": null,
- "wait-min": 3.0,
- "wait-max": 6.0
- },
"seiga":
{
"username": null,
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index d22ca6b..e0eda0d 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,7 +1,7 @@
Metadata-Version: 2.1
Name: gallery-dl
-Version: 1.15.4
-Summary: Command-line program to download image-galleries and -collections from several image hosting sites
+Version: 1.16.0
+Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
Author-email: mike_faehrmann@web.de
@@ -13,8 +13,8 @@ Description: ==========
gallery-dl
==========
- *gallery-dl* is a command-line program to download image-galleries and
- -collections from several image hosting sites (see `Supported Sites`_).
+ *gallery-dl* is a command-line program to download image galleries and
+ collections from several image hosting sites (see `Supported Sites`_).
It is a cross-platform tool with many configuration options
and powerful filenaming capabilities.
@@ -46,14 +46,14 @@ Description: ==========
.. code:: bash
- $ python3 -m pip install --upgrade gallery-dl
+ $ python3 -m pip install -U gallery-dl
- Installing the latest dev-version directly from GitHub can be done with
+ Installing the latest dev version directly from GitHub can be done with
pip_ as well:
.. code:: bash
- $ python3 -m pip install --upgrade https://github.com/mikf/gallery-dl/archive/master.tar.gz
+ $ python3 -m pip install -U -I --no-deps --no-cache-dir https://github.com/mikf/gallery-dl/archive/master.tar.gz
Note: Windows users should use :code:`py -3` instead of :code:`python3`.
@@ -94,10 +94,10 @@ Description: ==========
put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
and run it inside a command prompt (like ``cmd.exe``).
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.15.4/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.15.4/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.0/gallery-dl.bin>`__
- These executables include a Python 3.8 interpreter
+ These executables include a Python interpreter
and all required Python packages.
@@ -192,19 +192,22 @@ Description: ==========
see gallery-dl-example.conf_.
| A list of all available configuration options and their
descriptions can be found in configuration.rst_.
+ |
*gallery-dl* searches for configuration files in the following places:
- +--------------------------------------------+------------------------------------------+
- | Linux | Windows |
- +--------------------------------------------+------------------------------------------+
- |* ``/etc/gallery-dl.conf`` |* ``%APPDATA%\gallery-dl\config.json`` |
- |* ``${HOME}/.config/gallery-dl/config.json``|* ``%USERPROFILE%\gallery-dl\config.json``|
- |* ``${HOME}/.gallery-dl.conf`` |* ``%USERPROFILE%\gallery-dl.conf`` |
- +--------------------------------------------+------------------------------------------+
+ Windows:
+ * ``%APPDATA%\gallery-dl\config.json``
+ * ``%USERPROFILE%\gallery-dl\config.json``
+ * ``%USERPROFILE%\gallery-dl.conf``
- (``%USERPROFILE%`` usually refers to the user's home directory,
- i.e. ``C:\Users\<username>\``)
+ (``%USERPROFILE%`` usually refers to the user's home directory,
+ i.e. ``C:\Users\<username>\``)
+
+ Linux, macOS, etc.:
+ * ``/etc/gallery-dl.conf``
+ * ``${HOME}/.config/gallery-dl/config.json``
+ * ``${HOME}/.gallery-dl.conf``
Values in later configuration files will override previous ones.
@@ -224,9 +227,18 @@ Description: ==========
a username & password pair. This is necessary for
``pixiv``, ``nijie``, and ``seiga``
and optional for
- ``aryion``, ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``inkbunny``,
- ``instagram``, ``luscious``, ``pinterest``, ``sankaku``, ``subscribestar``,
- ``tsumino``, and ``twitter``.
+ ``aryion``,
+ ``danbooru``,
+ ``e621``,
+ ``exhentai``,
+ ``idolcomplex``,
+ ``inkbunny``,
+ ``instagram``,
+ ``luscious``,
+ ``pinterest``,
+ ``subscribestar``,
+ ``tsumino``,
+ and ``twitter``.
You can set the necessary information in your configuration file
(cf. gallery-dl.conf_)
@@ -319,7 +331,7 @@ Description: ==========
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
- .. _stable: https://github.com/mikf/gallery-dl/archive/v1.15.4.tar.gz
+ .. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.0.tar.gz
.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
@@ -337,8 +349,8 @@ Description: ==========
.. |pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg
:target: https://pypi.org/project/gallery-dl/
- .. |build| image:: https://travis-ci.com/mikf/gallery-dl.svg?branch=master
- :target: https://travis-ci.com/mikf/gallery-dl
+ .. |build| image:: https://github.com/mikf/gallery-dl/workflows/tests/badge.svg
+ :target: https://github.com/mikf/gallery-dl/actions
.. |gitter| image:: https://badges.gitter.im/gallery-dl/main.svg
:target: https://gitter.im/gallery-dl/main
@@ -357,6 +369,7 @@ Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Multimedia :: Graphics
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index c2e5cb4..fd1b4a1 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -74,7 +74,6 @@ gallery_dl/extractor/hentaihere.py
gallery_dl/extractor/hentainexus.py
gallery_dl/extractor/hiperdex.py
gallery_dl/extractor/hitomi.py
-gallery_dl/extractor/hypnohub.py
gallery_dl/extractor/idolcomplex.py
gallery_dl/extractor/imagebam.py
gallery_dl/extractor/imagechest.py
@@ -91,7 +90,6 @@ gallery_dl/extractor/kabeuchi.py
gallery_dl/extractor/keenspot.py
gallery_dl/extractor/khinsider.py
gallery_dl/extractor/komikcast.py
-gallery_dl/extractor/konachan.py
gallery_dl/extractor/lineblog.py
gallery_dl/extractor/livedoor.py
gallery_dl/extractor/luscious.py
@@ -106,6 +104,7 @@ gallery_dl/extractor/mangastream.py
gallery_dl/extractor/mangoxo.py
gallery_dl/extractor/mastodon.py
gallery_dl/extractor/message.py
+gallery_dl/extractor/moebooru.py
gallery_dl/extractor/myhentaigallery.py
gallery_dl/extractor/myportfolio.py
gallery_dl/extractor/naver.py
@@ -128,12 +127,9 @@ gallery_dl/extractor/pornhub.py
gallery_dl/extractor/pururin.py
gallery_dl/extractor/reactor.py
gallery_dl/extractor/readcomiconline.py
-gallery_dl/extractor/realbooru.py
gallery_dl/extractor/recursive.py
gallery_dl/extractor/reddit.py
gallery_dl/extractor/redgifs.py
-gallery_dl/extractor/rule34.py
-gallery_dl/extractor/safebooru.py
gallery_dl/extractor/sankaku.py
gallery_dl/extractor/sankakucomplex.py
gallery_dl/extractor/seiga.py
@@ -160,7 +156,6 @@ gallery_dl/extractor/weibo.py
gallery_dl/extractor/wikiart.py
gallery_dl/extractor/xhamster.py
gallery_dl/extractor/xvideos.py
-gallery_dl/extractor/yandere.py
gallery_dl/extractor/yuki.py
gallery_dl/postprocessor/__init__.py
gallery_dl/postprocessor/classify.py
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 0e67330..b8546a8 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -44,12 +44,14 @@ class HttpDownloader(DownloaderBase):
if self.minsize:
minsize = text.parse_bytes(self.minsize)
if not minsize:
- self.log.warning("Invalid minimum filesize (%r)", self.minsize)
+ self.log.warning(
+ "Invalid minimum file size (%r)", self.minsize)
self.minsize = minsize
if self.maxsize:
maxsize = text.parse_bytes(self.maxsize)
if not maxsize:
- self.log.warning("Invalid maximum filesize (%r)", self.maxsize)
+ self.log.warning(
+ "Invalid maximum file size (%r)", self.maxsize)
self.maxsize = maxsize
if self.rate:
rate = text.parse_bytes(self.rate)
@@ -84,17 +86,20 @@ class HttpDownloader(DownloaderBase):
if tries:
if response:
response.close()
+ response = None
self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
if tries > self.retries:
return False
time.sleep(tries)
- tries += 1
+ tries += 1
headers = {}
+ file_header = None
+
# check for .part file
- filesize = pathfmt.part_size()
- if filesize:
- headers["Range"] = "bytes={}-".format(filesize)
+ file_size = pathfmt.part_size()
+ if file_size:
+ headers["Range"] = "bytes={}-".format(file_size)
# file-specific headers
extra = pathfmt.kwdict.get("_http_headers")
if extra:
@@ -118,9 +123,9 @@ class HttpDownloader(DownloaderBase):
offset = 0
size = response.headers.get("Content-Length")
elif code == 206: # Partial Content
- offset = filesize
+ offset = file_size
size = response.headers["Content-Range"].rpartition("/")[2]
- elif code == 416 and filesize: # Requested Range Not Satisfiable
+ elif code == 416 and file_size: # Requested Range Not Satisfiable
break
else:
msg = "'{} {}' for '{}'".format(code, response.reason, url)
@@ -129,7 +134,14 @@ class HttpDownloader(DownloaderBase):
self.log.warning(msg)
return False
- # check filesize
+ # set missing filename extension from MIME type
+ if not pathfmt.extension:
+ pathfmt.set_extension(self._find_extension(response))
+ if pathfmt.exists():
+ pathfmt.temppath = ""
+ return True
+
+ # check file size
size = text.parse_int(size, None)
if size is not None:
if self.minsize and size < self.minsize:
@@ -143,50 +155,59 @@ class HttpDownloader(DownloaderBase):
size, self.maxsize)
return False
- # set missing filename extension
- if not pathfmt.extension:
- pathfmt.set_extension(self.get_extension(response))
- if pathfmt.exists():
+ content = response.iter_content(self.chunk_size)
+
+ # check filename extension against file header
+ if self.adjust_extension and not offset and \
+ pathfmt.extension in FILE_SIGNATURES:
+ try:
+ file_header = next(
+ content if response.raw.chunked
+ else response.iter_content(16), b"")
+ except (RequestException, SSLError, OpenSSLError) as exc:
+ msg = str(exc)
+ print()
+ continue
+ if self._adjust_extension(pathfmt, file_header) and \
+ pathfmt.exists():
pathfmt.temppath = ""
return True
# set open mode
if not offset:
mode = "w+b"
- if filesize:
+ if file_size:
self.log.debug("Unable to resume partial download")
else:
mode = "r+b"
self.log.debug("Resuming download at byte %d", offset)
- # start downloading
- self.out.start(pathfmt.path)
+ # download content
self.downloading = True
- with pathfmt.open(mode) as file:
- if offset:
- file.seek(offset)
-
- # download content
+ with pathfmt.open(mode) as fp:
+ if file_header:
+ fp.write(file_header)
+ elif offset:
+ if self.adjust_extension and \
+ pathfmt.extension in FILE_SIGNATURES:
+ self._adjust_extension(pathfmt, fp.read(16))
+ fp.seek(offset)
+
+ self.out.start(pathfmt.path)
try:
- self.receive(response, file)
+ self.receive(fp, content)
except (RequestException, SSLError, OpenSSLError) as exc:
msg = str(exc)
print()
continue
- # check filesize
- if size and file.tell() < size:
- msg = "filesize mismatch ({} < {})".format(
- file.tell(), size)
+ # check file size
+ if size and fp.tell() < size:
+ msg = "file size mismatch ({} < {})".format(
+ fp.tell(), size)
print()
continue
- # check filename extension
- if self.adjust_extension:
- adj_ext = self.check_extension(file, pathfmt.extension)
- if adj_ext:
- pathfmt.set_extension(adj_ext)
-
break
self.downloading = False
@@ -198,16 +219,18 @@ class HttpDownloader(DownloaderBase):
return True
- def receive(self, response, file):
- for data in response.iter_content(self.chunk_size):
- file.write(data)
+ @staticmethod
+ def receive(fp, content):
+ write = fp.write
+ for data in content:
+ write(data)
- def _receive_rate(self, response, file):
- t1 = time.time()
+ def _receive_rate(self, fp, content):
rt = self.rate
+ t1 = time.time()
- for data in response.iter_content(self.chunk_size):
- file.write(data)
+ for data in content:
+ fp.write(data)
t2 = time.time() # current time
actual = t2 - t1 # actual elapsed time
@@ -220,81 +243,98 @@ class HttpDownloader(DownloaderBase):
else:
t1 = t2
- def get_extension(self, response):
+ def _find_extension(self, response):
+ """Get filename extension from MIME type"""
mtype = response.headers.get("Content-Type", "image/jpeg")
mtype = mtype.partition(";")[0]
if "/" not in mtype:
mtype = "image/" + mtype
- if mtype in MIMETYPE_MAP:
- return MIMETYPE_MAP[mtype]
+ if mtype in MIME_TYPES:
+ return MIME_TYPES[mtype]
- exts = mimetypes.guess_all_extensions(mtype, strict=False)
- if exts:
- exts.sort()
- return exts[-1][1:]
+ ext = mimetypes.guess_extension(mtype, strict=False)
+ if ext:
+ return ext[1:]
- self.log.warning(
- "No filename extension found for MIME type '%s'", mtype)
- return "txt"
+ self.log.warning("Unknown MIME type '%s'", mtype)
+ return "bin"
@staticmethod
- def check_extension(file, extension):
- """Check filename extension against fileheader"""
- if extension in FILETYPE_CHECK:
- file.seek(0)
- header = file.read(8)
- if len(header) >= 8 and not FILETYPE_CHECK[extension](header):
- for ext, check in FILETYPE_CHECK.items():
- if ext != extension and check(header):
- return ext
- return None
-
-
-FILETYPE_CHECK = {
- "jpg": lambda h: h[0:2] == b"\xff\xd8",
- "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",
- "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97,
-}
+ def _adjust_extension(pathfmt, file_header):
+ """Check filename extension against file header"""
+ sig = FILE_SIGNATURES[pathfmt.extension]
+ if not file_header.startswith(sig):
+ for ext, sig in FILE_SIGNATURES.items():
+ if file_header.startswith(sig):
+ pathfmt.set_extension(ext)
+ return True
+ return False
-MIMETYPE_MAP = {
- "image/jpeg": "jpg",
- "image/jpg": "jpg",
- "image/png": "png",
- "image/gif": "gif",
- "image/bmp": "bmp",
- "image/x-bmp": "bmp",
+MIME_TYPES = {
+ "image/jpeg" : "jpg",
+ "image/jpg" : "jpg",
+ "image/png" : "png",
+ "image/gif" : "gif",
+ "image/bmp" : "bmp",
+ "image/x-bmp" : "bmp",
"image/x-ms-bmp": "bmp",
- "image/webp": "webp",
- "image/svg+xml": "svg",
+ "image/webp" : "webp",
+ "image/svg+xml" : "svg",
+ "image/x-photoshop" : "psd",
+ "application/x-photoshop" : "psd",
"image/vnd.adobe.photoshop": "psd",
- "image/x-photoshop": "psd",
- "application/x-photoshop": "psd",
"video/webm": "webm",
- "video/ogg": "ogg",
- "video/mp4": "mp4",
+ "video/ogg" : "ogg",
+ "video/mp4" : "mp4",
- "audio/wav": "wav",
+ "audio/wav" : "wav",
"audio/x-wav": "wav",
- "audio/webm": "webm",
- "audio/ogg": "ogg",
- "audio/mpeg": "mp3",
+ "audio/webm" : "webm",
+ "audio/ogg" : "ogg",
+ "audio/mpeg" : "mp3",
- "application/zip": "zip",
+ "application/zip" : "zip",
"application/x-zip": "zip",
"application/x-zip-compressed": "zip",
- "application/rar": "rar",
+ "application/rar" : "rar",
"application/x-rar": "rar",
"application/x-rar-compressed": "rar",
- "application/x-7z-compressed": "7z",
+ "application/x-7z-compressed" : "7z",
+
+ "application/pdf" : "pdf",
+ "application/x-pdf": "pdf",
+ "application/x-shockwave-flash": "swf",
"application/ogg": "ogg",
"application/octet-stream": "bin",
}
+# taken from https://en.wikipedia.org/wiki/List_of_file_signatures
+FILE_SIGNATURES = {
+ "jpg" : b"\xFF\xD8\xFF",
+ "png" : b"\x89PNG\r\n\x1A\n",
+ "gif" : (b"GIF87a", b"GIF89a"),
+ "bmp" : b"BM",
+ "webp": b"RIFF",
+ "svg" : b"<?xml",
+ "psd" : b"8BPS",
+ "webm": b"\x1A\x45\xDF\xA3",
+ "ogg" : b"OggS",
+ "wav" : b"RIFF",
+ "mp3" : (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2", b"ID3"),
+ "zip" : (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"),
+ "rar" : b"\x52\x61\x72\x21\x1A\x07",
+ "7z" : b"\x37\x7A\xBC\xAF\x27\x1C",
+ "pdf" : b"%PDF-",
+ "swf" : (b"CWS", b"FWS"),
+ # check 'bin' files against all other file signatures
+ "bin" : b"\x00\x00\x00\x00",
+}
+
__downloader__ = HttpDownloader
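
The rewritten downloader reads the first bytes of a download and compares them against ``FILE_SIGNATURES`` to correct a wrong filename extension (``_adjust_extension`` above). A standalone, simplified sketch of that check, using a subset of the signature table from this diff:

.. code:: python

    # Simplified, standalone version of the signature check performed by
    # HttpDownloader._adjust_extension (subset of FILE_SIGNATURES above).
    FILE_SIGNATURES = {
        "jpg": b"\xFF\xD8\xFF",
        "png": b"\x89PNG\r\n\x1A\n",
        "gif": (b"GIF87a", b"GIF89a"),
    }

    def adjusted_extension(extension, file_header):
        """Return the extension matching the file header, if it differs."""
        sig = FILE_SIGNATURES[extension]
        if not file_header.startswith(sig):   # startswith() accepts tuples
            for ext, sig in FILE_SIGNATURES.items():
                if file_header.startswith(sig):
                    return ext
        return extension

    # A file saved as *.jpg whose header is actually a PNG signature:
    print(adjusted_extension("jpg", b"\x89PNG\r\n\x1A\n\x00\x00"))  # -> "png"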
diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py
index 3773ee5..e0066cb 100644
--- a/gallery_dl/extractor/3dbooru.py
+++ b/gallery_dl/extractor/3dbooru.py
@@ -1,22 +1,21 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from http://behoimi.org/"""
+"""Extractors for http://behoimi.org/"""
-from . import booru
+from . import moebooru
-class _3dbooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+class _3dbooruBase():
"""Base class for 3dbooru extractors"""
category = "3dbooru"
- api_url = "http://behoimi.org/post/index.json"
- post_url = "http://behoimi.org/post/show/{}"
- page_limit = 1000
+ basecategory = "booru"
+ root = "http://behoimi.org"
def __init__(self, match):
super().__init__(match)
@@ -26,7 +25,7 @@ class _3dbooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
})
-class _3dbooruTagExtractor(booru.TagMixin, _3dbooruExtractor):
+class _3dbooruTagExtractor(_3dbooruBase, moebooru.MoebooruTagExtractor):
"""Extractor for images from behoimi.org based on search-tags"""
pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org/post"
r"(?:/(?:index)?)?\?tags=(?P<tags>[^&#]+)")
@@ -35,8 +34,12 @@ class _3dbooruTagExtractor(booru.TagMixin, _3dbooruExtractor):
"content": "11cbda40c287e026c1ce4ca430810f761f2d0b2a",
})
+ def posts(self):
+ params = {"tags": self.tags}
+ return self._pagination(self.root + "/post/index.json", params)
-class _3dbooruPoolExtractor(booru.PoolMixin, _3dbooruExtractor):
+
+class _3dbooruPoolExtractor(_3dbooruBase, moebooru.MoebooruPoolExtractor):
"""Extractor for image-pools from behoimi.org"""
pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/pool/show/(?P<pool>\d+)"
test = ("http://behoimi.org/pool/show/27", {
@@ -44,8 +47,12 @@ class _3dbooruPoolExtractor(booru.PoolMixin, _3dbooruExtractor):
"content": "fd5b37c5c6c2de4b4d6f1facffdefa1e28176554",
})
+ def posts(self):
+ params = {"tags": "pool:" + self.pool_id}
+ return self._pagination(self.root + "/post/index.json", params)
+
-class _3dbooruPostExtractor(booru.PostMixin, _3dbooruExtractor):
+class _3dbooruPostExtractor(_3dbooruBase, moebooru.MoebooruPostExtractor):
"""Extractor for single images from behoimi.org"""
pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/post/show/(?P<post>\d+)"
test = ("http://behoimi.org/post/show/140852", {
@@ -60,8 +67,13 @@ class _3dbooruPostExtractor(booru.PostMixin, _3dbooruExtractor):
},
})
+ def posts(self):
+ params = {"tags": "id:" + self.post_id}
+ return self._pagination(self.root + "/post/index.json", params)
+
-class _3dbooruPopularExtractor(booru.MoebooruPopularMixin, _3dbooruExtractor):
+class _3dbooruPopularExtractor(
+ _3dbooruBase, moebooru.MoebooruPopularExtractor):
"""Extractor for popular images from behoimi.org"""
pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org"
r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
@@ -70,8 +82,3 @@ class _3dbooruPopularExtractor(booru.MoebooruPopularMixin, _3dbooruExtractor):
"pattern": r"http://behoimi\.org/data/../../[0-9a-f]{32}\.jpg",
"count": 20,
})
-
- def __init__(self, match):
- super().__init__(match)
- self.api_url = "http://behoimi.org/post/popular_{scale}.json".format(
- scale=self.scale)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index d0c327a..611603e 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -44,7 +44,6 @@ modules = [
"hentainexus",
"hiperdex",
"hitomi",
- "hypnohub",
"idolcomplex",
"imagebam",
"imagechest",
@@ -60,7 +59,6 @@ modules = [
"keenspot",
"khinsider",
"komikcast",
- "konachan",
"lineblog",
"livedoor",
"luscious",
@@ -94,11 +92,8 @@ modules = [
"pururin",
"reactor",
"readcomiconline",
- "realbooru",
"reddit",
"redgifs",
- "rule34",
- "safebooru",
"sankaku",
"sankakucomplex",
"seiga",
@@ -123,8 +118,9 @@ modules = [
"wikiart",
"xhamster",
"xvideos",
- "yandere",
"yuki",
+ "booru",
+ "moebooru",
"foolfuuka",
"foolslide",
"mastodon",
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 0176d76..517df93 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -1,247 +1,248 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2020 Mike Fährmann
+# Copyright 2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Base classes for extractors for danbooru and co"""
+"""Extractors for *booru sites"""
+
+from .common import Extractor, Message, generate_extractors
+from .. import text, util, exception
-from .common import Extractor, Message, SharedConfigMixin
-from .. import text, exception
from xml.etree import ElementTree
import collections
-import datetime
-import operator
import re
-class BooruExtractor(SharedConfigMixin, Extractor):
- """Base class for all booru extractors"""
+class BooruExtractor(Extractor):
+ """Base class for *booru extractors"""
basecategory = "booru"
filename_fmt = "{category}_{id}_{md5}.{extension}"
- api_url = ""
- post_url = ""
- per_page = 50
- page_start = 1
- page_limit = None
- sort = False
+ page_start = 0
+ per_page = 100
- def __init__(self, match):
- super().__init__(match)
- self.params = {}
- self.extags = self.post_url and self.config("tags", False)
+ def items(self):
+ self.login()
+ extended_tags = self.config("tags", False)
+ data = self.metadata()
+ for post in self.posts():
+ try:
+ url = self._prepare_post(post, extended_tags)
+ except KeyError:
+ continue
+ post.update(data)
+ text.nameext_from_url(url, post)
+ yield Message.Directory, post
+ yield Message.Url, url, post
def skip(self, num):
pages = num // self.per_page
- if self.page_limit and pages + self.page_start > self.page_limit:
- pages = self.page_limit - self.page_start
self.page_start += pages
return pages * self.per_page
- def items(self):
- yield Message.Version, 1
- data = self.get_metadata()
+ def login(self):
+ """Login and set necessary cookies"""
- self.reset_page()
- while True:
- images = self.parse_response(
- self.request(self.api_url, params=self.params))
-
- for image in images:
- try:
- url = self.get_file_url(image)
- except KeyError:
- continue
- if url.startswith("/"):
- url = text.urljoin(self.api_url, url)
- image.update(data)
- text.nameext_from_url(url, image)
- if self.extags:
- self.extended_tags(image)
- yield Message.Directory, image
- yield Message.Url, url, image
-
- if len(images) < self.per_page:
- return
- self.update_page(image)
+ def metadata(self):
+ """Return a dict with general metadata"""
+ return ()
- def reset_page(self):
- """Initialize params to point to the first page"""
- self.params["page"] = self.page_start
+ def posts(self):
+ """Return an iterable with post objects"""
+ return ()
- def update_page(self, data):
- """Update params to point to the next page"""
+ def _prepare_post(self, post, extended_tags=False):
+ url = post["file_url"]
+ if url[0] == "/":
+ url = self.root + url
+ if extended_tags:
+ self._fetch_extended_tags(post)
+ post["date"] = text.parse_datetime(
+ post["created_at"], "%a %b %d %H:%M:%S %z %Y")
+ return url
- def parse_response(self, response):
- """Parse JSON API response"""
- images = response.json()
- if self.sort:
- images.sort(key=operator.itemgetter("score", "id"),
- reverse=True)
- return images
+ def _fetch_extended_tags(self, post, page=None):
+ if not page:
+ url = "{}/index.php?page=post&s=view&id={}".format(
+ self.root, post["id"])
+ page = self.request(url).text
+ html = text.extract(page, '<ul id="tag-', '</ul>')[0]
+ if html:
+ tags = collections.defaultdict(list)
+ pattern = re.compile(
+ r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
+ for tag_type, tag_name in pattern.findall(html):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ post["tags_" + key] = " ".join(value)
+
+ def _api_request(self, params):
+ url = self.root + "/index.php?page=dapi&s=post&q=index"
+ return ElementTree.fromstring(self.request(url, params=params).text)
+
+ def _pagination(self, params):
+ params["pid"] = self.page_start
+ params["limit"] = self.per_page
- def get_metadata(self):
- """Collect metadata for extractor-job"""
- return {}
+ while True:
+ root = self._api_request(params)
+ for post in root:
+ yield post.attrib
- @staticmethod
- def get_file_url(image):
- return image["file_url"]
+ if len(root) < self.per_page:
+ return
+ params["pid"] += 1
- def extended_tags(self, image, page=None):
- """Retrieve extended tag information"""
- if not page:
- url = self.post_url.format(image["id"])
- page = self.request(url).text
- tags = collections.defaultdict(list)
- tags_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
- pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
- for tag_type, tag_name in pattern.findall(tags_html or ""):
- tags[tag_type].append(text.unquote(tag_name))
- for key, value in tags.items():
- image["tags_" + key] = " ".join(value)
-
-
-class XmlParserMixin():
- """Mixin for XML based API responses"""
- def parse_response(self, response):
- root = ElementTree.fromstring(response.text)
- return [post.attrib for post in root]
-
-
-class MoebooruPageMixin():
- """Pagination for Moebooru and Danbooru v1"""
- def update_page(self, data):
- if self.page_limit:
- self.params["page"] = None
- self.params["before_id"] = data["id"]
- else:
- self.params["page"] += 1
-
-
-class GelbooruPageMixin():
- """Pagination for Gelbooru-like sites"""
- page_start = 0
- def reset_page(self):
- self.params["pid"] = self.page_start
+class BooruPostExtractor(BooruExtractor):
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern_fmt = r"/index\.php\?page=post&s=view&id=(\d+)"
- def update_page(self, data):
- self.params["pid"] += 1
+ def __init__(self, match):
+ BooruExtractor.__init__(self, match)
+ self.post_id = match.group(1)
+ def posts(self):
+ return self._pagination({"id": self.post_id})
-class TagMixin():
- """Extraction of images based on search-tags"""
+
+class BooruTagExtractor(BooruExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
+ pattern_fmt = r"/index\.php\?page=post&s=list&tags=([^&#]+)"
def __init__(self, match):
- super().__init__(match)
- self.tags = text.unquote(match.group("tags").replace("+", " "))
- self.params["tags"] = self.tags
- self.params["limit"] = self.per_page
+ BooruExtractor.__init__(self, match)
+ self.tags = text.unquote(match.group(1).replace("+", " "))
- def get_metadata(self):
+ def metadata(self):
return {"search_tags": self.tags}
+ def posts(self):
+ return self._pagination({"tags" : self.tags})
+
-class PoolMixin():
- """Extraction of image-pools"""
+class BooruPoolExtractor(BooruExtractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool}")
archive_fmt = "p_{pool}_{id}"
+ pattern_fmt = r"/index\.php\?page=pool&s=show&id=(\d+)"
def __init__(self, match):
- super().__init__(match)
- self.pool = match.group("pool")
- self.params["tags"] = "pool:" + self.pool
- self.params["limit"] = self.per_page
-
- def get_metadata(self):
- return {"pool": text.parse_int(self.pool)}
+ BooruExtractor.__init__(self, match)
+ self.pool_id = match.group(1)
+ self.post_ids = ()
+ def skip(self, num):
+ self.page_start += num
+ return num
-class GelbooruPoolMixin(PoolMixin):
- """Image-pool extraction for Gelbooru-like sites"""
- per_page = 1
+ def metadata(self):
+ url = "{}/index.php?page=pool&s=show&id={}".format(
+ self.root, self.pool_id)
+ page = self.request(url).text
- def get_metadata(self):
- page = self.request(self.pool_url.format(self.pool)).text
- name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
- if not name:
- name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
+ name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
if not name:
raise exception.NotFoundError("pool")
- self.posts = list(text.extract_iter(
- page, 'class="thumb" id="p', '"', pos))
+ self.post_ids = text.extract_iter(
+ page, 'class="thumb" id="p', '"', pos)
return {
- "pool": text.parse_int(self.pool),
+ "pool": text.parse_int(self.pool_id),
"pool_name": text.unescape(name),
- "count": len(self.posts),
}
- def reset_page(self):
- self.index = self.page_start
- self.update_page(None)
-
- def update_page(self, data):
- try:
- post = self.posts[self.index]
- self.index += 1
- except IndexError:
- post = "0"
- self.params["tags"] = "id:" + post
-
-
-class PostMixin():
- """Extraction of a single image-post"""
- subcategory = "post"
- archive_fmt = "{id}"
-
- def __init__(self, match):
- super().__init__(match)
- self.post = match.group("post")
- self.params["tags"] = "id:" + self.post
-
-
-class MoebooruPopularMixin():
- """Extraction and metadata handling for Moebooru and Danbooru v1"""
- subcategory = "popular"
- directory_fmt = ("{category}", "popular", "{scale}", "{date}")
- archive_fmt = "P_{scale[0]}_{date}_{id}"
- page_start = None
- sort = True
-
- def __init__(self, match):
- super().__init__(match)
- self.params.update(text.parse_query(match.group("query")))
- self.scale = match.group("scale")
-
- def get_metadata(self, fmt="%Y-%m-%d"):
- date = self.get_date() or datetime.date.today().isoformat()
- scale = self.get_scale() or "day"
-
- if scale == "week":
- date = datetime.date.fromisoformat(date)
- date = (date - datetime.timedelta(days=date.weekday())).isoformat()
- elif scale == "month":
- date = date[:-3]
-
- return {"date": date, "scale": scale}
-
- def get_date(self):
- if "year" in self.params:
- return "{:>04}-{:>02}-{:>02}".format(
- self.params["year"],
- self.params.get("month", "01"),
- self.params.get("day", "01"))
- return None
-
- def get_scale(self):
- if self.scale and self.scale.startswith("by_"):
- return self.scale[3:]
- return self.scale
+ def posts(self):
+ params = {}
+ for params["id"] in util.advance(self.post_ids, self.page_start):
+ for post in self._api_request(params):
+ yield post.attrib
+
+
+EXTRACTORS = {
+ "rule34": {
+ "root": "https://rule34.xxx",
+ "test-tag": (
+ ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
+ "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+ "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
+ "count": 1,
+ }),
+ ),
+ "test-pool": (
+ ("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
+ "count": 3,
+ }),
+ ),
+ "test-post": (
+ ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
+ "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "danraku",
+ "tags_character": "kashima_(kantai_collection)",
+ "tags_copyright": "kantai_collection",
+ "tags_general": str,
+ "tags_metadata": str,
+ },
+ }),
+ ),
+ },
+ "safebooru": {
+ "root": "https://safebooru.org",
+ "test-tag": (
+ ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
+ "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
+ "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
+ }),
+ ),
+ "test-pool": (
+ ("https://safebooru.org/index.php?page=pool&s=show&id=11", {
+ "count": 5,
+ }),
+ ),
+ "test-post": (
+ ("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
+ "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
+ "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "kawanakajima",
+ "tags_character": "heath_ledger ronald_mcdonald the_joker",
+ "tags_copyright": "dc_comics mcdonald's the_dark_knight",
+ "tags_general": str,
+ },
+ }),
+ ),
+ },
+ "realbooru": {
+ "root": "https://realbooru.com",
+ "test-tag": (
+ ("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
+ "count": ">= 64",
+ }),
+ ),
+ "test-pool": (
+ ("https://realbooru.com/index.php?page=pool&s=show&id=1", {
+ "count": 3,
+ }),
+ ),
+ "test-post": (
+ ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
+ "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
+ "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
+ }),
+ ),
+ },
+}
+
+generate_extractors(EXTRACTORS, globals(), (
+ BooruTagExtractor,
+ BooruPoolExtractor,
+ BooruPostExtractor,
+))
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 5efea4a..15cc776 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -24,6 +24,7 @@ class Extractor():
category = ""
subcategory = ""
+ basecategory = ""
categorytransfer = False
directory_fmt = ("{category}",)
filename_fmt = "{filename}.{extension}"
@@ -31,8 +32,9 @@ class Extractor():
cookiedomain = ""
root = ""
test = None
- _request_last = 0
- _request_interval = 0
+ request_interval = 0.0
+ request_interval_min = 0.0
+ request_timestamp = 0.0
def __init__(self, match):
self.session = requests.Session()
@@ -48,11 +50,17 @@ class Extractor():
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
- self._request_interval = self.config(
- "sleep-request", self._request_interval)
+ self.request_interval = self.config(
+ "sleep-request", self.request_interval)
if self._retries < 0:
self._retries = float("inf")
+ if self.request_interval < self.request_interval_min:
+ self.request_interval = self.request_interval_min
+
+ if self.basecategory:
+ self.config = self._config_shared
+ self.config_accumulate = self._config_shared_accumulate
self._init_headers()
self._init_cookies()
@@ -80,6 +88,19 @@ class Extractor():
def config_accumulate(self, key):
return config.accumulate(self._cfgpath, key)
+ def _config_shared(self, key, default=None):
+ return config.interpolate_common(("extractor",), (
+ (self.category, self.subcategory),
+ (self.basecategory, self.subcategory),
+ ), key, default)
+
+ def _config_shared_accumulate(self, key):
+ values = config.accumulate(self._cfgpath, key)
+ conf = config.get(("extractor",), self.basecategory)
+ if conf:
+ values[:0] = config.accumulate((self.subcategory,), key, conf=conf)
+ return values
+
def request(self, url, *, method="GET", session=None, retries=None,
encoding=None, fatal=True, notfound=None, **kwargs):
tries = 1
@@ -89,10 +110,10 @@ class Extractor():
kwargs.setdefault("verify", self._verify)
response = None
- if self._request_interval:
- seconds = (self._request_interval -
- (time.time() - Extractor._request_last))
- if seconds > 0:
+ if self.request_interval:
+ seconds = (self.request_interval -
+ (time.time() - Extractor.request_timestamp))
+ if seconds > 0.0:
self.log.debug("Sleeping for %.5s seconds", seconds)
time.sleep(seconds)
@@ -135,12 +156,12 @@ class Extractor():
if code < 500 and code != 429 and code != 430:
break
finally:
- Extractor._request_last = time.time()
+ Extractor.request_timestamp = time.time()
self.log.debug("%s (%s/%s)", msg, tries, retries+1)
if tries > retries:
break
- time.sleep(tries)
+ time.sleep(max(tries, self.request_interval))
tries += 1
raise exception.HttpError(msg, response)
@@ -506,28 +527,6 @@ class AsynchronousMixin():
messages.put(None)
-class SharedConfigMixin():
- """Enable sharing of config settings based on 'basecategory'"""
- basecategory = ""
-
- def config(self, key, default=None):
- return config.interpolate_common(
- ("extractor",), (
- (self.category, self.subcategory),
- (self.basecategory, self.subcategory),
- ), key, default,
- )
-
- def config_accumulate(self, key):
- values = config.accumulate(self._cfgpath, key)
-
- conf = config.get(("extractor",), self.basecategory)
- if conf:
- values[:0] = config.accumulate((self.subcategory,), key, conf=conf)
-
- return values
-
-
def generate_extractors(extractor_data, symtable, classes):
"""Dynamically generate Extractor classes"""
extractors = config.get(("extractor",), classes[0].basecategory)
@@ -539,7 +538,7 @@ def generate_extractors(extractor_data, symtable, classes):
for category, info in extractor_data.items():
- if not isinstance(info, dict):
+ if not isinstance(info, dict) or "root" not in info:
continue
root = info["root"]
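Note on the common.py hunk above: the private _request_interval/_request_last pair becomes the public class attributes request_interval, request_interval_min and request_timestamp, so a subclass can declare a built-in minimum delay and the "sleep-request" option can only raise, never lower, the effective interval. A minimal sketch of the resulting throttling behaviour (not the actual Extractor class, HTTP call stubbed out) could look like this:

# Minimal sketch of the throttling logic from the hunk above: one
# timestamp shared at class level enforces at least 'request_interval'
# seconds between any two requests, and a per-subclass minimum clamps
# whatever the user configures.
import time


class ThrottleSketch:
    request_interval = 0.0       # set from the "sleep-request" option
    request_interval_min = 0.0   # per-extractor lower bound
    request_timestamp = 0.0      # shared by all instances

    def __init__(self, configured_interval=0.0):
        self.request_interval = max(
            configured_interval, self.request_interval_min)

    def request(self, url):
        if self.request_interval:
            seconds = (self.request_interval -
                       (time.time() - ThrottleSketch.request_timestamp))
            if seconds > 0.0:
                time.sleep(seconds)
        try:
            return "<response for %s>" % url   # placeholder for the real call
        finally:
            ThrottleSketch.request_timestamp = time.time()


class OneSecondMinimum(ThrottleSketch):
    request_interval_min = 1.0   # mirrors E621Extractor in this release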
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 1ebaf5b..ca37cb4 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -8,7 +8,7 @@
"""Extractors for https://danbooru.donmai.us/"""
-from .common import Extractor, Message, SharedConfigMixin
+from .common import Extractor, Message
from .. import text
import datetime
@@ -20,7 +20,7 @@ BASE_PATTERN = (
)
-class DanbooruExtractor(SharedConfigMixin, Extractor):
+class DanbooruExtractor(Extractor):
"""Base class for danbooru extractors"""
basecategory = "booru"
category = "danbooru"
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index 5c5c36c..591fe33 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -10,7 +10,6 @@
from .common import Extractor, Message
from . import danbooru
-import time
BASE_PATTERN = r"(?:https?://)?e(621|926)\.net"
@@ -23,22 +22,16 @@ class E621Extractor(danbooru.DanbooruExtractor):
page_limit = 750
page_start = None
per_page = 320
- _last_request = 0
+ request_interval_min = 1.0
def __init__(self, match):
super().__init__(match)
self.root = "https://e{}.net".format(match.group(1))
+ self.headers = {"User-Agent": "gallery-dl/1.14.0 (by mikf)"}
def request(self, url, **kwargs):
- diff = time.time() - E621Extractor._last_request
- if diff < 1.0:
- delay = 1.0 - diff
- self.log.debug("Sleeping for %s seconds", delay)
- time.sleep(delay)
- kwargs["headers"] = {"User-Agent": "gallery-dl/1.14.0 (by mikf)"}
- response = Extractor.request(self, url, **kwargs)
- E621Extractor._last_request = time.time()
- return response
+ kwargs["headers"] = self.headers
+ return Extractor.request(self, url, **kwargs)
def items(self):
data = self.metadata()
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index a9d3c9d..cf4c033 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://www.flickr.com/"""
+"""Extractors for https://www.flickr.com/"""
from .common import Extractor, Message
from .. import text, oauth, util, exception
@@ -16,6 +16,8 @@ class FlickrExtractor(Extractor):
"""Base class for flickr extractors"""
category = "flickr"
filename_fmt = "{category}_{id}.{extension}"
+ directory_fmt = ("{category}", "{user[username]}")
+ archive_fmt = "{id}"
cookiedomain = None
def __init__(self, match):
@@ -27,8 +29,6 @@ class FlickrExtractor(Extractor):
def items(self):
data = self.metadata()
extract = self.api._extract_format
- yield Message.Version, 1
- yield Message.Directory, data
for photo in self.photos():
try:
photo = extract(photo)
@@ -39,6 +39,7 @@ class FlickrExtractor(Extractor):
else:
photo.update(data)
url = photo["url"]
+ yield Message.Directory, photo
yield Message.Url, url, text.nameext_from_url(url, photo)
def metadata(self):
@@ -53,7 +54,6 @@ class FlickrExtractor(Extractor):
class FlickrImageExtractor(FlickrExtractor):
"""Extractor for individual images from flickr.com"""
subcategory = "image"
- archive_fmt = "{id}"
pattern = (r"(?:https?://)?(?:"
r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
@@ -106,6 +106,7 @@ class FlickrImageExtractor(FlickrExtractor):
else:
self.api._extract_photo(photo)
+ photo["user"] = photo["owner"]
photo["title"] = photo["title"]["_content"]
photo["comments"] = text.parse_int(photo["comments"]["_content"])
photo["description"] = photo["description"]["_content"]
@@ -121,7 +122,6 @@ class FlickrImageExtractor(FlickrExtractor):
location[key] = value["_content"]
url = photo["url"]
- yield Message.Version, 1
yield Message.Directory, photo
yield Message.Url, url, text.nameext_from_url(url, photo)
@@ -129,8 +129,8 @@ class FlickrImageExtractor(FlickrExtractor):
class FlickrAlbumExtractor(FlickrExtractor):
"""Extractor for photo albums from flickr.com"""
subcategory = "album"
- directory_fmt = ("{category}", "{subcategory}s",
- "{album[id]} - {album[title]}")
+ directory_fmt = ("{category}", "{user[username]}",
+ "Albums", "{album[id]} {album[title]}")
archive_fmt = "a_{album[id]}_{id}"
pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?")
@@ -178,8 +178,8 @@ class FlickrAlbumExtractor(FlickrExtractor):
class FlickrGalleryExtractor(FlickrExtractor):
"""Extractor for photo galleries from flickr.com"""
subcategory = "gallery"
- directory_fmt = ("{category}", "galleries",
- "{user[username]} {gallery[id]}")
+ directory_fmt = ("{category}", "{user[username]}",
+ "Galleries", "{gallery[gallery_id]} {gallery[title]}")
archive_fmt = "g_{gallery[id]}_{id}"
pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
r"photos/([^/]+)/galleries/(\d+)")
@@ -205,7 +205,7 @@ class FlickrGalleryExtractor(FlickrExtractor):
class FlickrGroupExtractor(FlickrExtractor):
"""Extractor for group pools from flickr.com"""
subcategory = "group"
- directory_fmt = ("{category}", "{subcategory}s", "{group[groupname]}")
+ directory_fmt = ("{category}", "Groups", "{group[groupname]}")
archive_fmt = "G_{group[nsid]}_{id}"
pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"
test = ("https://www.flickr.com/groups/bird_headshots/", {
@@ -224,7 +224,6 @@ class FlickrGroupExtractor(FlickrExtractor):
class FlickrUserExtractor(FlickrExtractor):
"""Extractor for the photostream of a flickr user"""
subcategory = "user"
- directory_fmt = ("{category}", "{user[username]}")
archive_fmt = "u_{user[nsid]}_{id}"
pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"
test = ("https://www.flickr.com/photos/shona_s/", {
@@ -239,7 +238,7 @@ class FlickrUserExtractor(FlickrExtractor):
class FlickrFavoriteExtractor(FlickrExtractor):
"""Extractor for favorite photos of a flickr user"""
subcategory = "favorite"
- directory_fmt = ("{category}", "{subcategory}s", "{user[username]}")
+ directory_fmt = ("{category}", "{user[username]}", "Favorites")
archive_fmt = "f_{user[nsid]}_{id}"
pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"
test = ("https://www.flickr.com/photos/shona_s/favorites", {
@@ -254,7 +253,7 @@ class FlickrFavoriteExtractor(FlickrExtractor):
class FlickrSearchExtractor(FlickrExtractor):
"""Extractor for flickr photos based on search results"""
subcategory = "search"
- directory_fmt = ("{category}", "{subcategory}", "{search[text]}")
+ directory_fmt = ("{category}", "Search", "{search[text]}")
archive_fmt = "s_{search}_{id}"
pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"
test = (
@@ -408,9 +407,11 @@ class FlickrAPI(oauth.OAuth1API):
"""Returns a user NSID, given the url to a user's photos or profile."""
params = {"url": "https://www.flickr.com/photos/" + username}
user = self._call("urls.lookupUser", params)["user"]
- return {"nsid": user["id"],
- "path_alias": username,
- "username": user["username"]["_content"]}
+ return {
+ "nsid" : user["id"],
+ "username" : user["username"]["_content"],
+ "path_alias": username,
+ }
def video_getStreamInfo(self, video_id, secret=None):
"""Returns all available video streams"""
@@ -441,7 +442,8 @@ class FlickrAPI(oauth.OAuth1API):
return data
def _pagination(self, method, params, key="photos"):
- params["extras"] = "description,date_upload,tags,views,media,"
+ params["extras"] = ("description,date_upload,tags,views,media,"
+ "path_alias,owner_name,")
params["extras"] += ",".join("url_" + fmt[0] for fmt in self.formats)
params["page"] = 1
@@ -469,6 +471,17 @@ class FlickrAPI(oauth.OAuth1API):
photo["tags"] = photo["tags"].split()
photo["id"] = text.parse_int(photo["id"])
+ if "owner" in photo:
+ photo["owner"] = {
+ "nsid" : photo["owner"],
+ "username" : photo["ownername"],
+ "path_alias": photo["pathalias"],
+ }
+ else:
+ photo["owner"] = self.extractor.user
+ del photo["pathalias"]
+ del photo["ownername"]
+
if photo["media"] == "video" and self.videos:
return self._extract_video(photo)
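The flickr.py changes above add path_alias and owner_name to the requested extras and fold the flat owner fields into a nested "owner" dict, falling back to the extractor's own user data when the API response carries no owner. A small sketch of that normalization (pop() is used here for robustness; fallback_user stands in for self.extractor.user):

# Hedged sketch of the owner-metadata handling added to
# FlickrAPI._extract_format above; field names follow the diff.
def build_owner(photo, fallback_user):
    if "owner" in photo:
        photo["owner"] = {
            "nsid"      : photo["owner"],
            "username"  : photo.get("ownername"),
            "path_alias": photo.get("pathalias"),
        }
    else:
        photo["owner"] = fallback_user
    photo.pop("pathalias", None)
    photo.pop("ownername", None)
    return photo


photo = {"owner": "12345678@N00", "ownername": "example", "pathalias": "example"}
print(build_owner(photo, {})["owner"]["username"])  # example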
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index f2019ca..8a03dc9 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -8,13 +8,13 @@
"""Extractors for 4chan archives based on FoolFuuka"""
-from .common import Extractor, Message, SharedConfigMixin, generate_extractors
+from .common import Extractor, Message, generate_extractors
from .. import text
import itertools
import operator
-class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
+class FoolfuukaThreadExtractor(Extractor):
"""Base extractor for FoolFuuka based boards/archives"""
basecategory = "foolfuuka"
subcategory = "thread"
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index 4245617..db5e250 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -12,7 +12,6 @@ from .common import (
Extractor,
ChapterExtractor,
MangaExtractor,
- SharedConfigMixin,
Message,
generate_extractors,
)
@@ -20,7 +19,7 @@ from .. import text, util
import json
-class FoolslideBase(SharedConfigMixin):
+class FoolslideBase():
"""Base class for FoOlSlide extractors"""
basecategory = "foolslide"
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index c32ba5c..b0614e2 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -6,98 +6,27 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://gelbooru.com/"""
+"""Extractors for https://gelbooru.com/"""
from . import booru
-from .common import Message
-from .. import text
+from .. import text, exception
-class GelbooruExtractor(booru.XmlParserMixin,
- booru.GelbooruPageMixin,
- booru.BooruExtractor):
+class GelbooruBase():
"""Base class for gelbooru extractors"""
category = "gelbooru"
- api_url = "https://gelbooru.com/index.php"
- post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}"
- pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}"
+ root = "https://gelbooru.com"
- def __init__(self, match):
- super().__init__(match)
-
- self.use_api = self.config("api", True)
- if self.use_api:
- self.params.update({"page": "dapi", "s": "post", "q": "index"})
- else:
- self.items = self.items_noapi
- self.session.cookies["fringeBenefits"] = "yup"
- self.per_page = 42
-
- @staticmethod
- def get_file_url(image):
- url = image["file_url"]
+ def _prepare_post(self, post, extended_tags=False):
+ url = booru.BooruExtractor._prepare_post(self, post, extended_tags)
if url.startswith("https://mp4.gelbooru.com/"):
- ihash = image["md5"]
+ md5 = post["md5"]
return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
- ihash[0:2], ihash[2:4], ihash)
+ md5[0:2], md5[2:4], md5)
return url
- def items_noapi(self):
- yield Message.Version, 1
- data = self.get_metadata()
-
- for post in self.get_posts():
- post = self.get_post_data(post)
- url = post["file_url"]
- post.update(data)
- text.nameext_from_url(url, post)
- yield Message.Directory, post
- yield Message.Url, url, post
-
- def get_posts(self):
- """Return an iterable containing all relevant post objects"""
- url = "https://gelbooru.com/index.php?page=post&s=list"
- params = {
- "tags": self.params["tags"],
- "pid" : self.page_start * self.per_page
- }
-
- while True:
- page = self.request(url, params=params).text
- ids = list(text.extract_iter(page, '<span id="s', '"'))
- yield from ids
- if len(ids) < self.per_page:
- return
- params["pid"] += self.per_page
-
- def get_post_data(self, post_id):
- """Extract metadata of a single post"""
- page = self.request(self.post_url.format(post_id)).text
- data = text.extract_all(page, (
- (None , '<meta name="keywords"', ''),
- ("tags" , ' imageboard- ', '"'),
- ("id" , '<li>Id: ', '<'),
- ("created_at", '<li>Posted: ', '<'),
- ("width" , '<li>Size: ', 'x'),
- ("height" , '', '<'),
- ("source" , '<li>Source: <a href="', '"'),
- ("rating" , '<li>Rating: ', '<'),
- (None , '<li>Score: ', ''),
- ("score" , '>', '<'),
- ("file_url" , '<li><a href="http', '"'),
- ("change" , ' id="lupdated" value="', '"'),
- ))[0]
- data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1)
- data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0]
- data["rating"] = (data["rating"] or "?")[0].lower()
- data["tags"] = " ".join(
- [tag.replace(" ", "_") for tag in data["tags"].split(", ")])
- if self.extags:
- self.extended_tags(data, page)
- return data
-
-class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
+class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor):
"""Extractor for images from gelbooru.com based on search-tags"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
@@ -112,7 +41,7 @@ class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
)
-class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
+class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor):
"""Extractor for image-pools from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=pool&s=show&id=(?P<pool>\d+)")
@@ -126,8 +55,23 @@ class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
}),
)
+ def metadata(self):
+ url = "{}/index.php?page=pool&s=show&id={}".format(
+ self.root, self.pool_id)
+ page = self.request(url).text
+
+ name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
+ if not name:
+ raise exception.NotFoundError("pool")
+ self.post_ids = text.extract_iter(page, 'class="" id="p', '"', pos)
+
+ return {
+ "pool": text.parse_int(self.pool_id),
+ "pool_name": text.unescape(name),
+ }
+
-class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
+class GelbooruPostExtractor(GelbooruBase, booru.BooruPostExtractor):
"""Extractor for single images from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=view&id=(?P<post>\d+)")
@@ -135,6 +79,3 @@ class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
"count": 1,
})
-
- def get_posts(self):
- return (self.post,)
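In the gelbooru.py rewrite above, the old mixin-based extractor is replaced by a thin GelbooruBase on top of the generalized booru module; the only site-specific post handling left is remapping mp4.gelbooru.com video URLs onto the img2 host using the post's md5. That remapping in isolation:

# Standalone version of the URL rewrite in GelbooruBase._prepare_post
# above; 'post' is the post dict returned by the Gelbooru API.
def rewrite_video_url(post):
    url = post["file_url"]
    if url.startswith("https://mp4.gelbooru.com/"):
        md5 = post["md5"]
        return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
            md5[0:2], md5[2:4], md5)
    return url


post = {"file_url": "https://mp4.gelbooru.com/images/ab/cd/example.mp4",
        "md5": "abcd56789012345678901234567890ef"}
print(rewrite_video_url(post))
# https://img2.gelbooru.com/images/ab/cd/abcd56789012345678901234567890ef.webm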
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
index a6db0d5..519453b 100644
--- a/gallery_dl/extractor/hentainexus.py
+++ b/gallery_dl/extractor/hentainexus.py
@@ -58,7 +58,7 @@ class HentainexusGalleryExtractor(GalleryExtractor):
return data
def images(self, _):
- url = "{}/read/{}/001".format(self.root, self.gallery_id)
+ url = "{}/read/{}".format(self.root, self.gallery_id)
page = self.request(url).text
data = json.loads(self._decode(text.extract(
@@ -73,23 +73,37 @@ class HentainexusGalleryExtractor(GalleryExtractor):
@staticmethod
def _decode(data):
- # https://hentainexus.com/static/js/reader.min.js?r=6
+ # https://hentainexus.com/static/js/reader.min.js?r=13
+ primes = (2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53)
blob = binascii.a2b_base64(data)
key = blob[0:64]
- indices = list(range(256))
- result = ""
+
+ C = 0
+ for k in key:
+ C = C ^ k
+ for _ in range(8):
+ if C & 1:
+ C = C >> 1 ^ 0xc
+ else:
+ C = C >> 1
+ k = primes[C & 0x7]
x = 0
+ S = list(range(256))
for i in range(256):
- x = (x + indices[i] + key[i % len(key)]) % 256
- indices[i], indices[x] = indices[x], indices[i]
+ x = (x + S[i] + key[i % len(key)]) % 256
+ S[i], S[x] = S[x], S[i]
- x = i = 0
+ result = ""
+ a = c = m = x = 0
for n in range(64, len(blob)):
- i = (i + 1) % 256
- x = (x + indices[i]) % 256
- indices[i], indices[x] = indices[x], indices[i]
- result += chr(blob[n] ^ indices[(indices[i] + indices[x]) % 256])
+ a = (a + k) % 256
+ x = (c + S[(x + S[a]) % 256]) % 256
+ c = (c + a + S[a]) % 256
+
+ S[a], S[x] = S[x], S[a]
+ m = S[(x + S[(a + S[(m + c) % 256]) % 256]) % 256]
+ result += chr(blob[n] ^ m)
return result
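Because the hentainexus.py hunk above interleaves the removed and added halves of _decode(), the updated routine is easier to follow assembled in one piece: it base64-decodes the blob, takes the first 64 bytes as key, derives a step constant k from a CRC-like reduction of the key (indexing the primes table), runs an RC4-style key schedule, and then uses a modified output loop to build the keystream.

# The decoding routine after this change, assembled from the hunk above.
import binascii


def decode(data):
    # https://hentainexus.com/static/js/reader.min.js?r=13
    primes = (2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53)
    blob = binascii.a2b_base64(data)
    key = blob[0:64]

    C = 0
    for k in key:
        C = C ^ k
        for _ in range(8):
            if C & 1:
                C = C >> 1 ^ 0xc
            else:
                C = C >> 1
    k = primes[C & 0x7]

    x = 0
    S = list(range(256))
    for i in range(256):
        x = (x + S[i] + key[i % len(key)]) % 256
        S[i], S[x] = S[x], S[i]

    result = ""
    a = c = m = x = 0
    for n in range(64, len(blob)):
        a = (a + k) % 256
        x = (c + S[(x + S[a]) % 256]) % 256
        c = (c + a + S[a]) % 256

        S[a], S[x] = S[x], S[a]
        m = S[(x + S[(a + S[(m + c) % 256]) % 256]) % 256]
        result += chr(blob[n] ^ m)
    return result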
diff --git a/gallery_dl/extractor/hypnohub.py b/gallery_dl/extractor/hypnohub.py
deleted file mode 100644
index 17f9a88..0000000
--- a/gallery_dl/extractor/hypnohub.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://hypnohub.net/"""
-
-from . import booru
-
-
-class HypnohubExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
- """Base class for hypnohub extractors"""
- category = "hypnohub"
- api_url = "https://hypnohub.net/post.json"
- post_url = "https://hypnohub.net/post/show/{}"
-
-
-class HypnohubTagExtractor(booru.TagMixin, HypnohubExtractor):
- """Extractor for images from hypnohub.net based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net"
- r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)")
- test = ("https://hypnohub.net/post?tags=gonoike_biwa", {
- "url": "2848abe3e433ad39bfdf5be5874682faaccea5be",
- })
-
-
-class HypnohubPoolExtractor(booru.PoolMixin, HypnohubExtractor):
- """Extractor for image-pools from hypnohub.net"""
- pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/pool/show/(?P<pool>\d+)"
- test = ("https://hypnohub.net/pool/show/61", {
- "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf",
- })
-
-
-class HypnohubPostExtractor(booru.PostMixin, HypnohubExtractor):
- """Extractor for single images from hypnohub.net"""
- pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/post/show/(?P<post>\d+)"
- test = ("https://hypnohub.net/post/show/73964", {
- "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "gonoike_biwa icontrol_(manipper)",
- "tags_character": "komaru_naegi",
- "tags_copyright": "dangan_ronpa dangan_ronpa_another_episode",
- "tags_general": str,
- },
- })
-
-
-class HypnohubPopularExtractor(booru.MoebooruPopularMixin, HypnohubExtractor):
- """Extractor for popular images from hypnohub.net"""
- pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net"
- r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
- r"(?:\?(?P<query>[^#]*))?")
- test = (
- ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", {
- "count": 20,
- }),
- ("https://hypnohub.net/post/popular_recent"),
- )
-
- def __init__(self, match):
- super().__init__(match)
- self.api_url = "https://hypnohub.net/post/popular_{scale}.json".format(
- scale=self.scale)
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index dcb4a54..16fe0a0 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -1,26 +1,145 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://idol.sankakucomplex.com/"""
+"""Extractors for https://idol.sankakucomplex.com/"""
-from . import sankaku
+from .sankaku import SankakuExtractor
+from .common import Message
+from ..cache import cache
+from .. import text, util, exception
+import collections
+import random
+import time
+import re
-class IdolcomplexExtractor(sankaku.SankakuExtractor):
+class IdolcomplexExtractor(SankakuExtractor):
"""Base class for idolcomplex extractors"""
category = "idolcomplex"
+ cookienames = ("login", "pass_hash")
cookiedomain = "idol.sankakucomplex.com"
- subdomain = "idol"
+ root = "https://" + cookiedomain
+ def __init__(self, match):
+ SankakuExtractor.__init__(self, match)
+ self.logged_in = True
+ self.start_page = 1
+ self.start_post = 0
+ self.extags = self.config("tags", False)
+ self.wait_min = self.config("wait-min", 3.0)
+ self.wait_max = self.config("wait-max", 6.0)
+ if self.wait_max < self.wait_min:
+ self.wait_max = self.wait_min
-class IdolcomplexTagExtractor(IdolcomplexExtractor,
- sankaku.SankakuTagExtractor):
+ def items(self):
+ self.login()
+ data = self.metadata()
+
+ for post_id in util.advance(self.post_ids(), self.start_post):
+ self.wait()
+ post = self._parse_post(post_id)
+ url = post["file_url"]
+ post.update(data)
+ text.nameext_from_url(url, post)
+ yield Message.Directory, post
+ yield Message.Url, url, post
+
+ def skip(self, num):
+ self.start_post += num
+ return num
+
+ def post_ids(self):
+ """Return an iterable containing all relevant post ids"""
+
+ def login(self):
+ if self._check_cookies(self.cookienames):
+ return
+ username, password = self._get_auth_info()
+ if username:
+ cookies = self._login_impl(username, password)
+ self._update_cookies(cookies)
+ else:
+ self.logged_in = False
+
+ @cache(maxage=90*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ url = self.root + "/user/authenticate"
+ data = {
+ "url" : "",
+ "user[name]" : username,
+ "user[password]": password,
+ "commit" : "Login",
+ }
+ response = self.request(url, method="POST", data=data)
+
+ if not response.history or response.url != self.root + "/user/home":
+ raise exception.AuthenticationError()
+ cookies = response.history[0].cookies
+ return {c: cookies[c] for c in self.cookienames}
+
+ def _parse_post(self, post_id):
+ """Extract metadata of a single post"""
+ url = self.root + "/post/show/" + post_id
+ page = self.request(url, retries=10).text
+ extr = text.extract
+
+ tags , pos = extr(page, "<title>", " | ")
+ vavg , pos = extr(page, "itemprop=ratingValue>", "<", pos)
+ vcnt , pos = extr(page, "itemprop=reviewCount>", "<", pos)
+ _ , pos = extr(page, "Posted: <", "", pos)
+ created, pos = extr(page, ' title="', '"', pos)
+ rating = extr(page, "<li>Rating: ", "<", pos)[0]
+
+ file_url, pos = extr(page, '<li>Original: <a href="', '"', pos)
+ if file_url:
+ width , pos = extr(page, '>', 'x', pos)
+ height, pos = extr(page, '', ' ', pos)
+ else:
+ width , pos = extr(page, '<object width=', ' ', pos)
+ height, pos = extr(page, 'height=', '>', pos)
+ file_url = extr(page, '<embed src="', '"', pos)[0]
+
+ data = {
+ "id": text.parse_int(post_id),
+ "md5": file_url.rpartition("/")[2].partition(".")[0],
+ "tags": text.unescape(tags),
+ "vote_average": text.parse_float(vavg),
+ "vote_count": text.parse_int(vcnt),
+ "created_at": created,
+ "rating": (rating or "?")[0].lower(),
+ "file_url": "https:" + text.unescape(file_url),
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
+ }
+
+ if self.extags:
+ tags = collections.defaultdict(list)
+ tags_html = text.extract(page, '<ul id=tag-sidebar>', '</ul>')[0]
+ pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)')
+ for tag_type, tag_name in pattern.findall(tags_html or ""):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ data["tags_" + key] = " ".join(value)
+
+ return data
+
+ def wait(self):
+ """Wait for a randomly chosen amount of seconds"""
+ time.sleep(random.uniform(self.wait_min, self.wait_max))
+
+
+class IdolcomplexTagExtractor(IdolcomplexExtractor):
"""Extractor for images from idol.sankakucomplex.com by search-tags"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)"
test = (
("https://idol.sankakucomplex.com/?tags=lyumos+wreath", {
@@ -31,20 +150,110 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor,
("https://idol.sankakucomplex.com"
"/?tags=lyumos+wreath&page=3&next=694215"),
)
+ per_page = 20
+
+ def __init__(self, match):
+ IdolcomplexExtractor.__init__(self, match)
+ query = text.parse_query(match.group(1))
+ self.tags = text.unquote(query.get("tags", "").replace("+", " "))
+ self.start_page = text.parse_int(query.get("page"), 1)
+ self.next = text.parse_int(query.get("next"), 0)
+
+ def skip(self, num):
+ if self.next:
+ self.start_post += num
+ else:
+ pages, posts = divmod(num, self.per_page)
+ self.start_page += pages
+ self.start_post += posts
+ return num
+ def metadata(self):
+ if not self.next:
+ max_page = 50 if self.logged_in else 25
+ if self.start_page > max_page:
+ self.log.info("Traversing from page %d to page %d",
+ max_page, self.start_page)
+ self.start_post += self.per_page * (self.start_page - max_page)
+ self.start_page = max_page
-class IdolcomplexPoolExtractor(IdolcomplexExtractor,
- sankaku.SankakuPoolExtractor):
+ tags = self.tags.split()
+ if not self.logged_in and len(tags) > 4:
+ raise exception.StopExtraction(
+ "Non-members can only search up to 4 tags at once")
+ return {"search_tags": " ".join(tags)}
+
+ def post_ids(self):
+ params = {"tags": self.tags}
+
+ if self.next:
+ params["next"] = self.next
+ else:
+ params["page"] = self.start_page
+
+ while True:
+ self.wait()
+ page = self.request(self.root, params=params, retries=10).text
+ pos = page.find("<div id=more-popular-posts-link>") + 1
+
+ ids = list(text.extract_iter(page, '" id=p', '>', pos))
+ if not ids:
+ return
+ yield from ids
+
+ next_qs = text.extract(page, 'next-page-url="/?', '"', pos)[0]
+ next_id = text.parse_query(next_qs).get("next")
+
+ # stop if the same "next" parameter occurs twice in a row (#265)
+ if "next" in params and params["next"] == next_id:
+ return
+
+ params["next"] = next_id or (text.parse_int(ids[-1]) - 1)
+ params["page"] = "2"
+
+
+class IdolcomplexPoolExtractor(IdolcomplexExtractor):
"""Extractor for image-pools from idol.sankakucomplex.com"""
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool}")
+ archive_fmt = "p_{pool}_{id}"
pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)"
test = ("https://idol.sankakucomplex.com/pool/show/145", {
"count": 3,
})
+ per_page = 24
+ def __init__(self, match):
+ IdolcomplexExtractor.__init__(self, match)
+ self.pool_id = match.group(1)
-class IdolcomplexPostExtractor(IdolcomplexExtractor,
- sankaku.SankakuPostExtractor):
+ def skip(self, num):
+ pages, posts = divmod(num, self.per_page)
+ self.start_page += pages
+ self.start_post += posts
+ return num
+
+ def metadata(self):
+ return {"pool": self.pool_id}
+
+ def post_ids(self):
+ url = self.root + "/pool/show/" + self.pool_id
+ params = {"page": self.start_page}
+
+ while True:
+ page = self.request(url, params=params, retries=10).text
+ ids = list(text.extract_iter(page, '" id=p', '>'))
+
+ yield from ids
+ if len(ids) < self.per_page:
+ return
+ params["page"] += 1
+
+
+class IdolcomplexPostExtractor(IdolcomplexExtractor):
"""Extractor for single images from idol.sankakucomplex.com"""
+ subcategory = "post"
+ archive_fmt = "{id}"
pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)"
test = ("https://idol.sankakucomplex.com/post/show/694215", {
"content": "694ec2491240787d75bf5d0c75d0082b53a85afd",
@@ -57,3 +266,10 @@ class IdolcomplexPostExtractor(IdolcomplexExtractor,
"tags_general": str,
},
})
+
+ def __init__(self, match):
+ IdolcomplexExtractor.__init__(self, match)
+ self.post_id = match.group(1)
+
+ def post_ids(self):
+ return (self.post_id,)
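The idolcomplex.py rewrite above drops the old sankaku mixins and paginates tag searches by following the "next" parameter advertised in each page's next-page-url, stopping when the same value occurs twice in a row (#265). A self-contained sketch of that loop, where fetch is a hypothetical callable standing in for one self.request(...) call plus the two text.extract steps:

# Hedged sketch of IdolcomplexTagExtractor.post_ids() above.
# fetch(params) returns (post_ids, next_id) for one result page.
def paginate(fetch, tags):
    params = {"tags": tags, "page": 1}
    while True:
        ids, next_id = fetch(params)
        if not ids:
            return
        yield from ids

        # stop if the same "next" parameter occurs twice in a row (#265)
        if params.get("next") == next_id:
            return

        params["next"] = next_id or str(int(ids[-1]) - 1)
        params["page"] = "2"


pages = iter([(["103", "102"], "101"), (["101", "100"], "101")])
print(list(paginate(lambda p: next(pages), "lyumos wreath")))
# ['103', '102', '101', '100']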
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index ad5a508..28af179 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,13 +8,13 @@
"""Collection of extractors for various imagehosts"""
-from .common import Extractor, Message, SharedConfigMixin
+from .common import Extractor, Message
from .. import text, exception
from ..cache import memcache
from os.path import splitext
-class ImagehostImageExtractor(SharedConfigMixin, Extractor):
+class ImagehostImageExtractor(Extractor):
"""Base class for single-image extractors for various imagehosts"""
basecategory = "imagehost"
subcategory = "image"
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 1194626..9870824 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -10,7 +10,7 @@
"""Extractors for https://www.instagram.com/"""
from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
from ..cache import cache
import itertools
import json
@@ -27,43 +27,101 @@ class InstagramExtractor(Extractor):
root = "https://www.instagram.com"
cookiedomain = ".instagram.com"
cookienames = ("sessionid",)
+ request_interval = 5.0
def __init__(self, match):
Extractor.__init__(self, match)
- self._find_tags = re.compile(r'#\w+').findall
-
- def get_metadata(self):
- return {}
+ self.www_claim = "0"
+ self.csrf_token = util.generate_csrf_token()
+ self._find_tags = re.compile(r"#\w+").findall
+ self._cursor = None
def items(self):
self.login()
- yield Message.Version, 1
-
+ data = self.metadata()
videos = self.config("videos", True)
- metadata = self.get_metadata()
- for data in self.instagrams():
- data.update(metadata)
- yield Message.Directory, data
-
- if data['typename'] == 'GraphHighlightReel':
- url = '{}/stories/highlights/{}/'.format(self.root, data['id'])
- data['_extractor'] = InstagramStoriesExtractor
- yield Message.Queue, url, data
+
+ for post in self.posts():
+
+ if post["__typename"] == "GraphReel":
+ post = self._parse_reel(post["id"])
else:
- url = data.get('video_url')
+ post = self._parse_post(post)
+ post.update(data)
+ files = post.pop("_files")
+
+ yield Message.Directory, post
+ for file in files:
+ url = file.get("video_url")
if not url:
- url = data['display_url']
+ url = file["display_url"]
elif not videos:
continue
- yield Message.Url, url, text.nameext_from_url(url, data)
+ file.update(post)
+ yield Message.Url, url, text.nameext_from_url(url, file)
+
+ def metadata(self):
+ return ()
+
+ def posts(self):
+ return ()
+
+ def request(self, url, **kwargs):
+ response = Extractor.request(self, url, **kwargs)
+ if response.history and "/accounts/login/" in response.request.url:
+ if self._cursor:
+ self.log.info("Use '-o cursor=%s' to continue downloading "
+ "from the current position", self._cursor)
+ raise exception.StopExtraction(
+ "Redirected to login page (%s)", response.request.url)
+ www_claim = response.headers.get("x-ig-set-www-claim")
+ if www_claim is not None:
+ self.www_claim = www_claim
+ return response
+
+ def _api_request(self, endpoint, params):
+ url = "https://i.instagram.com/api/" + endpoint
+ headers = {
+ "X-CSRFToken" : self.csrf_token,
+ "X-IG-App-ID" : "936619743392459",
+ "X-IG-WWW-Claim": self.www_claim,
+ }
+ cookies = {
+ "csrftoken": self.csrf_token,
+ }
+ return self.request(
+ url, params=params, headers=headers, cookies=cookies,
+ ).json()
+
+ def _graphql_request(self, query_hash, variables):
+ url = self.root + "/graphql/query/"
+ params = {
+ "query_hash": query_hash,
+ "variables" : json.dumps(variables),
+ }
+ headers = {
+ "X-CSRFToken" : self.csrf_token,
+ "X-IG-App-ID" : "936619743392459",
+ "X-IG-WWW-Claim" : self.www_claim,
+ "X-Requested-With": "XMLHttpRequest",
+ }
+ cookies = {
+ "csrftoken": self.csrf_token,
+ }
+ return self.request(
+ url, params=params, headers=headers, cookies=cookies,
+ ).json()["data"]
def login(self):
- if self._check_cookies(self.cookienames):
- return
- username, password = self._get_auth_info()
- if username:
- self.session.cookies.set("ig_cb", "1", domain="www.instagram.com")
- self._update_cookies(self._login_impl(username, password))
+ if not self._check_cookies(self.cookienames):
+ username, password = self._get_auth_info()
+ if username:
+ self.session.cookies.set(
+ "ig_cb", "2", domain="www.instagram.com")
+ self._update_cookies(self._login_impl(username, password))
+
+ self.session.cookies.set(
+ "csrftoken", self.csrf_token, domain=self.cookiedomain)
@cache(maxage=360*24*3600, keyarg=1)
def _login_impl(self, username, password):
@@ -98,250 +156,346 @@ class InstagramExtractor(Extractor):
for key in ("sessionid", "mid", "csrftoken")
}
- def _request_graphql(self, variables, query_hash, csrf=None):
- headers = {
- 'X-CSRFToken': csrf,
- 'X-IG-App-ID': '936619743392459',
- 'X-Requested-With': 'XMLHttpRequest',
- }
- url = '{}/graphql/query/?query_hash={}&variables={}'.format(
- self.root, query_hash, variables,
- )
- return self.request(url, headers=headers).json()
-
- def _extract_shared_data(self, url):
- page = self.request(url).text
- shared_data, pos = text.extract(
- page, 'window._sharedData =', ';</script>')
- additional_data, pos = text.extract(
- page, 'window.__additionalDataLoaded(', ');</script>', pos)
-
- data = json.loads(shared_data)
- if additional_data:
- next(iter(data['entry_data'].values()))[0] = \
- json.loads(additional_data.partition(',')[2])
- return data
+ def _parse_post(self, post):
+ if post.get("is_video") and "video_url" not in post:
+ url = "{}/tv/{}/".format(self.root, post["shortcode"])
+ post = self._extract_post_page(url)
- def _extract_postpage(self, url):
- try:
- with self.request(url + '?__a=1', fatal=False) as response:
- media = response.json()['graphql']['shortcode_media']
- except (KeyError, ValueError) as exc:
- self.log.warning("Unable to fetch data from '%s': %s: %s",
- url, exc.__class__.__name__, exc)
- self.log.debug("Server response: %s", response.text)
- return ()
-
- common = {
- 'date': text.parse_timestamp(media['taken_at_timestamp']),
- 'likes': text.parse_int(media['edge_media_preview_like']['count']),
- 'owner_id': media['owner']['id'],
- 'username': media['owner']['username'],
- 'fullname': media['owner']['full_name'],
- 'post_id': media['id'],
- 'post_shortcode': media['shortcode'],
- 'post_url': url,
- 'description': text.parse_unicode_escapes('\n'.join(
- edge['node']['text']
- for edge in media['edge_media_to_caption']['edges']
+ owner = post["owner"]
+ data = {
+ "typename" : post["__typename"],
+ "date" : text.parse_timestamp(post["taken_at_timestamp"]),
+ "likes" : post["edge_media_preview_like"]["count"],
+ "owner_id" : owner["id"],
+ "username" : owner.get("username"),
+ "fullname" : owner.get("full_name"),
+ "post_id" : post["id"],
+ "post_shortcode": post["shortcode"],
+ "post_url" : "{}/p/{}/".format(self.root, post["shortcode"]),
+ "description": text.parse_unicode_escapes("\n".join(
+ edge["node"]["text"]
+ for edge in post["edge_media_to_caption"]["edges"]
)),
}
- tags = self._find_tags(common['description'])
+ tags = self._find_tags(data["description"])
if tags:
- common['tags'] = sorted(set(tags))
+ data["tags"] = sorted(set(tags))
- location = media['location']
+ location = post.get("location")
if location:
- common['location_id'] = location['id']
- common['location_slug'] = location['slug']
- common['location_url'] = "{}/explore/locations/{}/{}/".format(
- self.root, location['id'], location['slug'])
+ data["location_id"] = location["id"]
+ data["location_slug"] = location["slug"]
+ data["location_url"] = "{}/explore/locations/{}/{}/".format(
+ self.root, location["id"], location["slug"])
- medias = []
- if media['__typename'] == 'GraphSidecar':
+ data["_files"] = files = []
+ if "edge_sidecar_to_children" in post:
for num, edge in enumerate(
- media['edge_sidecar_to_children']['edges'], 1):
- children = edge['node']
- media_data = {
- 'num': num,
- 'media_id': children['id'],
- 'shortcode': children['shortcode'],
- 'typename': children['__typename'],
- 'display_url': children['display_url'],
- 'video_url': children.get('video_url'),
- 'height': text.parse_int(children['dimensions']['height']),
- 'width': text.parse_int(children['dimensions']['width']),
- 'sidecar_media_id': media['id'],
- 'sidecar_shortcode': media['shortcode'],
+ post["edge_sidecar_to_children"]["edges"], 1):
+ node = edge["node"]
+ dimensions = node["dimensions"]
+ media = {
+ "num": num,
+ "media_id" : node["id"],
+ "shortcode" : (node.get("shortcode") or
+ self._shortcode_from_id(node["id"])),
+ "display_url": node["display_url"],
+ "video_url" : node.get("video_url"),
+ "width" : dimensions["width"],
+ "height" : dimensions["height"],
+ "sidecar_media_id" : post["id"],
+ "sidecar_shortcode": post["shortcode"],
}
- self._extract_tagged_users(children, media_data)
- media_data.update(common)
- medias.append(media_data)
-
+ self._extract_tagged_users(node, media)
+ files.append(media)
else:
- media_data = {
- 'media_id': media['id'],
- 'shortcode': media['shortcode'],
- 'typename': media['__typename'],
- 'display_url': media['display_url'],
- 'video_url': media.get('video_url'),
- 'height': text.parse_int(media['dimensions']['height']),
- 'width': text.parse_int(media['dimensions']['width']),
+ dimensions = post["dimensions"]
+ media = {
+ "media_id" : post["id"],
+ "shortcode" : post["shortcode"],
+ "display_url": post["display_url"],
+ "video_url" : post.get("video_url"),
+ "width" : dimensions["width"],
+ "height" : dimensions["height"],
}
- self._extract_tagged_users(media, media_data)
- media_data.update(common)
- medias.append(media_data)
+ self._extract_tagged_users(post, media)
+ files.append(media)
- return medias
+ return data
- def _extract_stories(self, url):
- if self.highlight_id:
- user_id = ''
- highlight_id = '"{}"'.format(self.highlight_id)
- query_hash = '30a89afdd826d78a5376008a7b81c205'
- else:
- shared_data = self._extract_shared_data(url)
-
- # If no stories are present the URL redirects to `ProfilePage'
- if 'StoriesPage' not in shared_data['entry_data']:
- return []
-
- user_id = '"{}"'.format(
- shared_data['entry_data']['StoriesPage'][0]['user']['id'])
- highlight_id = ''
- query_hash = '0a85e6ea60a4c99edc58ab2f3d17cfdf'
-
- variables = (
- '{{'
- '"reel_ids":[{}],"tag_names":[],"location_ids":[],'
- '"highlight_reel_ids":[{}],"precomposed_overlay":false,'
- '"show_story_viewer_list":true,'
- '"story_viewer_fetch_count":50,"story_viewer_cursor":"",'
- '"stories_video_dash_manifest":false'
- '}}'
- ).format(user_id, highlight_id)
- shared_data = self._request_graphql(variables, query_hash)
-
- # If there are stories present but the user is not authenticated or
- # does not have permissions no stories are returned.
- if not shared_data['data']['reels_media']:
- return [] # no stories present
-
- medias = []
- for media in shared_data['data']['reels_media'][0]['items']:
- media_data = {
- 'owner_id': media['owner']['id'],
- 'username': media['owner']['username'],
- 'date' : text.parse_timestamp(
- media['taken_at_timestamp']),
- 'expires' : text.parse_timestamp(
- media['expiring_at_timestamp']),
- 'media_id': media['id'],
- 'typename': media['__typename'],
- 'display_url': media['display_url'],
- }
- if media['__typename'] == 'GraphStoryImage':
- media_data.update({
- 'height': text.parse_int(media['dimensions']['height']),
- 'width': text.parse_int(media['dimensions']['width']),
- })
- elif media['__typename'] == 'GraphStoryVideo':
- vr = media['video_resources'][0]
- media_data.update({
- 'duration': text.parse_float(media['video_duration']),
- 'video_url': vr['src'],
- 'height': text.parse_int(vr['config_height']),
- 'width': text.parse_int(vr['config_width']),
- })
- medias.append(media_data)
-
- return medias
-
- def _extract_story_highlights(self, shared_data):
- graphql = shared_data['entry_data']['ProfilePage'][0]['graphql']
- variables = (
- '{{'
- '"user_id":"{}","include_chaining":true,'
- '"include_reel":true,"include_suggested_users":false,'
- '"include_logged_out_extras":false,'
- '"include_highlight_reels":true'
- '}}'
- ).format(graphql['user']['id'])
-
- data = self._request_graphql(
- variables,
- 'ad99dd9d3646cc3c0dda65debcd266a7',
- shared_data['config']['csrf_token'],
- )
-
- highlights = []
- for edge in data['data']['user']['edge_highlight_reels']['edges']:
- story = edge['node']
- highlights.append({
- 'id' : story['id'],
- 'title' : story['title'],
- 'owner_id': story['owner']['id'],
- 'username': story['owner']['username'],
- 'typename': story['__typename'],
- })
+ def _parse_reel(self, reel_id):
+ params = {"reel_ids": reel_id}
+ data = self._api_request("v1/feed/reels_media/", params)
+ if not data["reels_media"]:
+ raise exception.NotFoundError("reel")
+ reel = data["reels_media"][0]
- return highlights
+ reel_id = reel_id.rpartition(":")[2]
+ owner = reel["user"]
- def _extract_page(self, shared_data, psdf):
- csrf = shared_data['config']['csrf_token']
+ data = {
+ "expires" : text.parse_timestamp(reel.get("expiring_at")),
+ "owner_id" : owner["pk"],
+ "username" : owner.get("username"),
+ "fullname" : owner.get("full_name"),
+ "post_id" : reel_id,
+ "post_shortcode": self._shortcode_from_id(reel_id),
+ }
- while True:
- # Deal with different structure of pages: the first page
- # has interesting data in `entry_data', next pages in `data'.
- if 'entry_data' in shared_data:
- entry_data = shared_data['entry_data']
- if 'HttpErrorPage' in entry_data:
- return
- base_shared_data = entry_data[psdf['page']][0]['graphql']
-
- # variables_id is available only in the first page
- variables_id = base_shared_data[psdf['node']][psdf['node_id']]
+ data["_files"] = files = []
+ for num, item in enumerate(reel["items"], 1):
+
+ image = item["image_versions2"]["candidates"][0]
+
+ if "video_versions" in item:
+ video = max(
+ item["video_versions"],
+ key=lambda x: (x["width"], x["height"], x["type"]),
+ )
+ media = video
else:
- base_shared_data = shared_data['data']
-
- medias = base_shared_data[psdf['node']][psdf['edge_to_medias']]
- has_next_page = medias['page_info']['has_next_page']
- shortcodes = [n['node']['shortcode'] for n in medias['edges']]
-
- for s in shortcodes:
- url = '{}/p/{}/'.format(self.root, s)
- yield from self._extract_postpage(url)
-
- if not has_next_page:
- break
- time.sleep(3)
- end_cursor = medias['page_info']['end_cursor']
- variables = '{{"{}":"{}","first":12,"after":"{}"}}'.format(
- psdf['variables_id'],
- variables_id,
- end_cursor,
- )
- shared_data = self._request_graphql(
- variables, psdf['query_hash'], csrf,
- )
-
- def _extract_tagged_users(self, src_media, dest_dict):
- edges = src_media['edge_media_to_tagged_user']['edges']
+ video = None
+ media = image
+
+ files.append({
+ "num" : num,
+ "date" : text.parse_timestamp(item["taken_at"]),
+ "media_id" : item["pk"],
+ "shortcode" : item["code"],
+ "display_url": image["url"],
+ "video_url" : video["url"] if video else None,
+ "width" : media["width"],
+ "height" : media["height"],
+ })
+
+ return data
+
+ @staticmethod
+ def _shortcode_from_id(post_id):
+ return util.bencode(
+ int(post_id),
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789-_")
+
+ def _extract_tagged_users(self, src, dest):
+ if "edge_media_to_tagged_user" not in src:
+ return
+ edges = src["edge_media_to_tagged_user"]["edges"]
if edges:
- dest_dict['tagged_users'] = tagged_users = []
+ dest["tagged_users"] = tagged_users = []
for edge in edges:
- user = edge['node']['user']
+ user = edge["node"]["user"]
tagged_users.append({
- 'id' : user['id'],
- 'username' : user['username'],
- 'full_name': user['full_name'],
+ "id" : user["id"],
+ "username" : user["username"],
+ "full_name": user["full_name"],
})
+ def _extract_shared_data(self, url):
+ page = self.request(url).text
+ shared_data, pos = text.extract(
+ page, "window._sharedData =", ";</script>")
+ additional_data, pos = text.extract(
+ page, "window.__additionalDataLoaded(", ");</script>", pos)
+
+ data = json.loads(shared_data)
+ if additional_data:
+ next(iter(data["entry_data"].values()))[0] = \
+ json.loads(additional_data.partition(",")[2])
+ return data
+
+ def _extract_profile_page(self, url):
+ data = self._extract_shared_data(url)["entry_data"]
+ if "HttpErrorPage" in data:
+ raise exception.NotFoundError("user")
+ return data["ProfilePage"][0]["graphql"]["user"]
+
+ def _extract_post_page(self, url):
+ data = self._extract_shared_data(url)["entry_data"]
+ if "HttpErrorPage" in data:
+ raise exception.NotFoundError("post")
+ return data["PostPage"][0]["graphql"]["shortcode_media"]
+
+ def _get_edge_data(self, user, key):
+ cursor = self.config("cursor")
+ if cursor:
+ return {
+ "edges": (),
+ "page_info": {
+ "end_cursor": cursor,
+ "has_next_page": True,
+ },
+ }
+ return user[key]
+
+ def _pagination(self, query_hash, variables, data):
+ while True:
+ for edge in data["edges"]:
+ yield edge["node"]
+
+ info = data["page_info"]
+ if not info["has_next_page"]:
+ return
+
+ variables["after"] = self._cursor = info["end_cursor"]
+ self.log.debug("Cursor: %s", self._cursor)
+ data = next(iter(self._graphql_request(
+ query_hash, variables)["user"].values()))
+
+
+class InstagramUserExtractor(InstagramExtractor):
+ """Extractor for ProfilePage"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)"
+ r"([^/?#]+)/?(?:$|[?#])")
+ test = (
+ ("https://www.instagram.com/instagram/", {
+ "range": "1-16",
+ "count": ">= 16",
+ }),
+ # ("https://www.instagram.com/instagram/", {
+ # "options": (("highlights", True),),
+ # "pattern": InstagramStoriesExtractor.pattern,
+ # "range": "1-2",
+ # "count": 2,
+ # }),
+ ("https://www.instagram.com/instagram/?hl=en"),
+ )
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def posts(self):
+ url = "{}/{}/".format(self.root, self.user)
+ user = self._extract_profile_page(url)
+
+ if user.get("highlight_reel_count") and self.config("highlights"):
+ query_hash = "d4d88dc1500312af6f937f7b804c68c3"
+ variables = {
+ "user_id": user["id"],
+ "include_chaining": False,
+ "include_reel": True,
+ "include_suggested_users": False,
+ "include_logged_out_extras": False,
+ "include_highlight_reels": True,
+ "include_live_status": True,
+ }
+ data = self._graphql_request(query_hash, variables)
+ highlights = [
+ {
+ "__typename": "GraphReel",
+ "id" : "highlight:" + edge["node"]["id"],
+ }
+ for edge in data["user"]["edge_highlight_reels"]["edges"]
+ ]
+ else:
+ highlights = None
+
+ query_hash = "003056d32c2554def87228bc3fd9668a"
+ variables = {"id": user["id"], "first": 50}
+ edge = self._get_edge_data(user, "edge_owner_to_timeline_media")
+ posts = self._pagination(query_hash, variables, edge)
+
+ return itertools.chain(highlights, posts) if highlights else posts
+
+
+class InstagramChannelExtractor(InstagramExtractor):
+ """Extractor for ProfilePage channel"""
+ subcategory = "channel"
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
+ r"([^/?#]+)/channel")
+ test = ("https://www.instagram.com/instagram/channel/", {
+ "range": "1-16",
+ "count": ">= 16",
+ })
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def posts(self):
+ url = "{}/{}/channel/".format(self.root, self.user)
+ user = self._extract_profile_page(url)
+
+ query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
+ variables = {"id": user["id"], "first": 50}
+ edge = self._get_edge_data(user, "edge_felix_video_timeline")
+ return self._pagination(query_hash, variables, edge)
+
+
+class InstagramSavedExtractor(InstagramExtractor):
+ """Extractor for ProfilePage saved media"""
+ subcategory = "saved"
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
+ r"([^/?#]+)/saved")
+ test = ("https://www.instagram.com/instagram/saved/",)
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def posts(self):
+ url = "{}/{}/saved/".format(self.root, self.user)
+ user = self._extract_profile_page(url)
+
+ query_hash = "2ce1d673055b99250e93b6f88f878fde"
+ variables = {"id": user["id"], "first": 50}
+ edge = self._get_edge_data(user, "edge_saved_media")
+ return self._pagination(query_hash, variables, edge)
+
-class InstagramImageExtractor(InstagramExtractor):
- """Extractor for PostPage"""
- subcategory = "image"
+class InstagramTagExtractor(InstagramExtractor):
+ """Extractor for TagPage"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{subcategory}", "{tag}")
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/explore/tags/([^/?#]+)")
+ test = ("https://www.instagram.com/explore/tags/instagram/", {
+ "range": "1-16",
+ "count": ">= 16",
+ })
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.tag = match.group(1)
+
+ def metadata(self):
+ return {"tag": self.tag}
+
+ def posts(self):
+ url = "{}/explore/tags/{}/".format(self.root, self.tag)
+ data = self._extract_shared_data(url)
+ hashtag = data["entry_data"]["TagPage"][0]["graphql"]["hashtag"]
+
+ query_hash = "9b498c08113f1e09617a1703c22b2f32"
+ variables = {"tag_name": hashtag["name"], "first": 50}
+ edge = self._get_edge_data(hashtag, "edge_hashtag_to_media")
+ return self._pagination(query_hash, variables, edge)
+
+ def _pagination(self, query_hash, variables, data):
+ while True:
+ for edge in data["edges"]:
+ yield edge["node"]
+
+ info = data["page_info"]
+ if not info["has_next_page"]:
+ return
+
+ variables["after"] = self._cursor = info["end_cursor"]
+ self.log.debug("Cursor: %s", self._cursor)
+ data = self._graphql_request(
+ query_hash, variables)["hashtag"]["edge_hashtag_to_media"]
+
+
+class InstagramPostExtractor(InstagramExtractor):
+ """Extractor for an Instagram post"""
+ subcategory = "post"
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
r"/(?:p|tv|reel)/([^/?#]+)")
test = (
@@ -435,8 +589,8 @@ class InstagramImageExtractor(InstagramExtractor):
("https://www.instagram.com/p/B_2lf3qAd3y/", {
"keyword": {
"tagged_users": [{
- "id": "1246468638",
- "username": "kaaymbl",
+ "id" : "1246468638",
+ "username" : "kaaymbl",
"full_name": "Call Me Kay",
}]
}
@@ -449,158 +603,44 @@ class InstagramImageExtractor(InstagramExtractor):
InstagramExtractor.__init__(self, match)
self.shortcode = match.group(1)
- def instagrams(self):
- url = '{}/p/{}/'.format(self.root, self.shortcode)
- return self._extract_postpage(url)
+ def posts(self):
+ query_hash = "a9441f24ac73000fa17fe6e6da11d59d"
+ variables = {
+ "shortcode" : self.shortcode,
+ "child_comment_count" : 3,
+ "fetch_comment_count" : 40,
+ "parent_comment_count" : 24,
+ "has_threaded_comments": True
+ }
+ data = self._graphql_request(query_hash, variables)
+ return (data["shortcode_media"],)
class InstagramStoriesExtractor(InstagramExtractor):
- """Extractor for StoriesPage"""
+ """Extractor for Instagram stories"""
subcategory = "stories"
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
- r"/stories/([^/?#]+)(?:/(\d+))?")
+ r"/stories/(?:highlights/(\d+)|([^/?#]+))")
test = (
("https://www.instagram.com/stories/instagram/"),
("https://www.instagram.com/stories/highlights/18042509488170095/"),
)
+ request_interval = 1.0
def __init__(self, match):
InstagramExtractor.__init__(self, match)
- self.username, self.highlight_id = match.groups()
-
- def instagrams(self):
- url = '{}/stories/{}/'.format(self.root, self.username)
- return self._extract_stories(url)
-
-
-class InstagramSavedExtractor(InstagramExtractor):
- """Extractor for ProfilePage saved media"""
- subcategory = "saved"
- pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
- r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
- r"([^/?#]+)/saved")
- test = ("https://www.instagram.com/instagram/saved/",)
+ self.highlight_id, self.user = match.groups()
- def __init__(self, match):
- InstagramExtractor.__init__(self, match)
- self.username = match.group(1)
-
- def instagrams(self):
- url = '{}/{}/saved/'.format(self.root, self.username)
- shared_data = self._extract_shared_data(url)
-
- return self._extract_page(shared_data, {
- 'page': 'ProfilePage',
- 'node': 'user',
- 'node_id': 'id',
- 'variables_id': 'id',
- 'edge_to_medias': 'edge_saved_media',
- 'query_hash': '8c86fed24fa03a8a2eea2a70a80c7b6b',
- })
-
-
-class InstagramUserExtractor(InstagramExtractor):
- """Extractor for ProfilePage"""
- subcategory = "user"
- pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
- r"/(?!(?:p|explore|directory|accounts|stories|tv|reel)/)"
- r"([^/?#]+)/?(?:$|[?#])")
- test = (
- ("https://www.instagram.com/instagram/", {
- "range": "1-16",
- "count": ">= 16",
- }),
- ("https://www.instagram.com/instagram/", {
- "options": (("highlights", True),),
- "pattern": InstagramStoriesExtractor.pattern,
- "range": "1-2",
- "count": 2,
- }),
- ("https://www.instagram.com/instagram/?hl=en"),
- )
-
- def __init__(self, match):
- InstagramExtractor.__init__(self, match)
- self.username = match.group(1)
-
- def instagrams(self):
- url = '{}/{}/'.format(self.root, self.username)
- shared_data = self._extract_shared_data(url)
-
- instagrams = self._extract_page(shared_data, {
- 'page': 'ProfilePage',
- 'node': 'user',
- 'node_id': 'id',
- 'variables_id': 'id',
- 'edge_to_medias': 'edge_owner_to_timeline_media',
- 'query_hash': '15bf78a4ad24e33cbd838fdb31353ac1',
- })
-
- if self.config('highlights'):
- instagrams = itertools.chain(
- self._extract_story_highlights(shared_data),
- instagrams,
- )
-
- return instagrams
-
-
-class InstagramChannelExtractor(InstagramExtractor):
- """Extractor for ProfilePage channel"""
- subcategory = "channel"
- pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
- r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
- r"([^/?#]+)/channel")
- test = ("https://www.instagram.com/instagram/channel/", {
- "range": "1-16",
- "count": ">= 16",
- })
-
- def __init__(self, match):
- InstagramExtractor.__init__(self, match)
- self.username = match.group(1)
-
- def instagrams(self):
- url = '{}/{}/channel/'.format(self.root, self.username)
- shared_data = self._extract_shared_data(url)
-
- return self._extract_page(shared_data, {
- 'page': 'ProfilePage',
- 'node': 'user',
- 'node_id': 'id',
- 'variables_id': 'id',
- 'edge_to_medias': 'edge_felix_video_timeline',
- 'query_hash': 'bc78b344a68ed16dd5d7f264681c4c76',
- })
-
-
-class InstagramTagExtractor(InstagramExtractor):
- """Extractor for TagPage"""
- subcategory = "tag"
- directory_fmt = ("{category}", "{subcategory}", "{tag}")
- pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
- r"/explore/tags/([^/?#]+)")
- test = ("https://www.instagram.com/explore/tags/instagram/", {
- "range": "1-16",
- "count": ">= 16",
- })
-
- def __init__(self, match):
- InstagramExtractor.__init__(self, match)
- self.tag = match.group(1)
-
- def get_metadata(self):
- return {"tag": self.tag}
-
- def instagrams(self):
- url = '{}/explore/tags/{}/'.format(self.root, self.tag)
- shared_data = self._extract_shared_data(url)
-
- return self._extract_page(shared_data, {
- 'page': 'TagPage',
- 'node': 'hashtag',
- 'node_id': 'name',
- 'variables_id': 'tag_name',
- 'edge_to_medias': 'edge_hashtag_to_media',
- 'query_hash': 'c769cb6c71b24c8a86590b22402fda50',
- })
+ def posts(self):
+ if self.highlight_id:
+ reel_id = "highlight:" + self.highlight_id
+ else:
+ url = "{}/stories/{}/".format(self.root, self.user)
+ try:
+ data = self._extract_shared_data(url)["entry_data"]
+ user = data["StoriesPage"][0]["user"]
+ except KeyError:
+ return ()
+ reel_id = user["id"]
+
+ return ({"__typename": "GraphReel", "id": reel_id},)
diff --git a/gallery_dl/extractor/konachan.py b/gallery_dl/extractor/konachan.py
deleted file mode 100644
index a9d8b3a..0000000
--- a/gallery_dl/extractor/konachan.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://konachan.com/"""
-
-from . import booru
-
-
-class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
- """Base class for konachan extractors"""
- category = "konachan"
-
- def __init__(self, match):
- root = "https://konachan." + match.group("tld")
- self.api_url = root + "/post.json"
- self.post_url = root + "/post/show/{}"
- super().__init__(match)
-
-
-class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
- """Extractor for images from konachan.com based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
- r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)")
- test = (
- ("https://konachan.com/post?tags=patata", {
- "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
- }),
- ("https://konachan.net/post?tags=patata"),
- )
-
-
-class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor):
- """Extractor for image-pools from konachan.com"""
- pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
- r"/pool/show/(?P<pool>\d+)")
- test = (
- ("https://konachan.com/pool/show/95", {
- "content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
- }),
- ("https://konachan.net/pool/show/95"),
- )
-
-
-class KonachanPostExtractor(booru.PostMixin, KonachanExtractor):
- """Extractor for single images from konachan.com"""
- pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
- r"/post/show/(?P<post>\d+)")
- test = (
- ("https://konachan.com/post/show/205189", {
- "content": "674e75a753df82f5ad80803f575818b8e46e4b65",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "patata",
- "tags_character": "clownpiece",
- "tags_copyright": "touhou",
- "tags_general": str,
- },
- }),
- ("https://konachan.net/post/show/205189"),
- )
-
-
-class KonachanPopularExtractor(booru.MoebooruPopularMixin, KonachanExtractor):
- """Extractor for popular images from konachan.com"""
- pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
- r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
- r"(?:\?(?P<query>[^#]*))?")
- test = (
- ("https://konachan.com/post/popular_by_month?month=11&year=2010", {
- "count": 20,
- }),
- ("https://konachan.com/post/popular_recent"),
- ("https://konachan.net/post/popular_recent"),
- )
-
- def __init__(self, match):
- super().__init__(match)
- self.api_url = (
- "https://konachan.{tld}/post/popular_{scale}.json".format(
- tld=match.group("tld"), scale=self.scale))
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index c91e9a8..96c81c7 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -9,7 +9,7 @@
"""Extractors for https://mangadex.org/"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text, util, exception
from ..cache import memcache
@@ -74,6 +74,10 @@ class MangadexChapterExtractor(MangadexExtractor):
"count": 64,
"keyword": "c53a0e4c12250578a4e630281085875e59532c03",
}),
+ # MANGA Plus (#1154)
+ ("https://mangadex.org/chapter/1122815", {
+            "exception": exception.StopExtraction,
+ }),
)
def __init__(self, match):
@@ -82,6 +86,12 @@ class MangadexChapterExtractor(MangadexExtractor):
def items(self):
cdata = self.chapter_data(self.chapter_id)
+ if "server" not in cdata:
+ if cdata["status"] == "external":
+ raise exception.StopExtraction(
+ "Chapter is not available on MangaDex and can be read on "
+ "the official publisher's website at %s.", cdata["pages"])
+ raise exception.StopExtraction("No download server available.")
mdata = self.manga_data(cdata["mangaId"])
chapter, sep, minor = cdata["chapter"].partition(".")
diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py
new file mode 100644
index 0000000..cbc8680
--- /dev/null
+++ b/gallery_dl/extractor/moebooru.py
@@ -0,0 +1,257 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Moebooru based sites"""
+
+from .common import generate_extractors
+from .booru import BooruExtractor
+from .. import text
+
+import collections
+import datetime
+import re
+
+
+class MoebooruExtractor(BooruExtractor):
+ """Base class for Moebooru extractors"""
+ basecategory = "moebooru"
+ filename_fmt = "{category}_{id}_{md5}.{extension}"
+ page_start = 1
+
+ def _prepare_post(self, post, extended_tags=False):
+ url = post["file_url"]
+ if url[0] == "/":
+ url = self.root + url
+ if extended_tags:
+ self._fetch_extended_tags(post)
+ post["date"] = text.parse_timestamp(post["created_at"])
+ return url
+
+ def _fetch_extended_tags(self, post):
+ url = "{}/post/show/{}".format(self.root, post["id"])
+ page = self.request(url).text
+ html = text.extract(page, '<ul id="tag-', '</ul>')[0]
+ if html:
+ tags = collections.defaultdict(list)
+ pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)")
+ for tag_type, tag_name in pattern.findall(html):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ post["tags_" + key] = " ".join(value)
+
+ def _pagination(self, url, params):
+ params["page"] = self.page_start
+ params["limit"] = self.per_page
+
+ while True:
+ posts = self.request(url, params=params).json()
+ yield from posts
+
+ if len(posts) < self.per_page:
+ return
+ params["page"] += 1
+
+
+class MoebooruTagExtractor(MoebooruExtractor):
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern_fmt = r"/post\?(?:[^&#]*&)*tags=([^&#]+)"
+
+ def __init__(self, match):
+ MoebooruExtractor.__init__(self, match)
+ self.tags = text.unquote(match.group(1).replace("+", " "))
+
+ def metadata(self):
+ return {"search_tags": self.tags}
+
+ def posts(self):
+ params = {"tags": self.tags}
+ return self._pagination(self.root + "/post.json", params)
+
+
+class MoebooruPoolExtractor(MoebooruExtractor):
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool}")
+ archive_fmt = "p_{pool}_{id}"
+ pattern_fmt = r"/pool/show/(\d+)"
+
+ def __init__(self, match):
+ MoebooruExtractor.__init__(self, match)
+ self.pool_id = match.group(1)
+
+ def metadata(self):
+ return {"pool": text.parse_int(self.pool_id)}
+
+ def posts(self):
+ params = {"tags": "pool:" + self.pool_id}
+ return self._pagination(self.root + "/post.json", params)
+
+
+class MoebooruPostExtractor(MoebooruExtractor):
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern_fmt = r"/post/show/(\d+)"
+
+ def __init__(self, match):
+ MoebooruExtractor.__init__(self, match)
+ self.post_id = match.group(1)
+
+ def posts(self):
+ params = {"tags": "id:" + self.post_id}
+ return self.request(self.root + "/post.json", params=params).json()
+
+
+class MoebooruPopularExtractor(MoebooruExtractor):
+ subcategory = "popular"
+ directory_fmt = ("{category}", "popular", "{scale}", "{date}")
+ archive_fmt = "P_{scale[0]}_{date}_{id}"
+ pattern_fmt = r"/post/popular_(by_(?:day|week|month)|recent)(?:\?([^#]*))?"
+
+ def __init__(self, match):
+ MoebooruExtractor.__init__(self, match)
+ self.scale, self.query = match.groups()
+
+ def metadata(self):
+ self.params = params = text.parse_query(self.query)
+
+ if "year" in params:
+ date = "{:>04}-{:>02}-{:>02}".format(
+ params["year"],
+ params.get("month", "01"),
+ params.get("day", "01"),
+ )
+ else:
+ date = datetime.date.today().isoformat()
+
+ scale = self.scale
+ if scale.startswith("by_"):
+ scale = scale[3:]
+ if scale == "week":
+ date = datetime.date.fromisoformat(date)
+ date = (date - datetime.timedelta(days=date.weekday())).isoformat()
+ elif scale == "month":
+ date = date[:-3]
+
+ return {"date": date, "scale": scale}
+
+ def posts(self):
+ url = "{}/post/popular_{}.json".format(self.root, self.scale)
+ return self.request(url, params=self.params).json()
+
+
+EXTRACTORS = {
+ "yandere": {
+ "root": "https://yande.re",
+ "test-tag": ("https://yande.re/post?tags=ouzoku+armor", {
+ "content": "59201811c728096b2d95ce6896fd0009235fe683",
+ }),
+ "test-pool": ("https://yande.re/pool/show/318", {
+ "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68",
+ }),
+ "test-post": ("https://yande.re/post/show/51824", {
+ "content": "59201811c728096b2d95ce6896fd0009235fe683",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "sasaki_tamaru",
+ "tags_circle": "softhouse_chara",
+ "tags_copyright": "ouzoku",
+ "tags_general": str,
+ },
+ }),
+ "test-popular": (
+ ("https://yande.re/post/popular_by_month?month=6&year=2014", {
+ "count": 40,
+ }),
+ ("https://yande.re/post/popular_recent"),
+ ),
+ },
+ "konachan": {
+ "root": "https://konachan.com",
+ "pattern": r"konachan\.(?:com|net)",
+ "test-tag": (
+ ("https://konachan.com/post?tags=patata", {
+ "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
+ }),
+ ("https://konachan.net/post?tags=patata"),
+ ),
+ "test-pool": (
+ ("https://konachan.com/pool/show/95", {
+ "content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
+ }),
+ ("https://konachan.net/pool/show/95"),
+ ),
+ "test-post": (
+ ("https://konachan.com/post/show/205189", {
+ "content": "674e75a753df82f5ad80803f575818b8e46e4b65",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "patata",
+ "tags_character": "clownpiece",
+ "tags_copyright": "touhou",
+ "tags_general": str,
+ },
+ }),
+ ("https://konachan.net/post/show/205189"),
+ ),
+ "test-popular": (
+ ("https://konachan.com/post/popular_by_month?month=11&year=2010", {
+ "count": 20,
+ }),
+ ("https://konachan.com/post/popular_recent"),
+ ("https://konachan.net/post/popular_recent"),
+ ),
+ },
+ "hypnohub": {
+ "root": "https://hypnohub.net",
+ "test-tag": ("https://hypnohub.net/post?tags=gonoike_biwa", {
+ "url": "072330c34a1e773d0cafd00e64b8060d34b078b6",
+ }),
+ "test-pool": ("https://hypnohub.net/pool/show/61", {
+ "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf",
+ }),
+ "test-post": ("https://hypnohub.net/post/show/73964", {
+ "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "gonoike_biwa icontrol_(manipper)",
+ "tags_character": "komaru_naegi",
+ "tags_copyright": "dangan_ronpa dangan_ronpa_another_episode",
+ "tags_general": str,
+ },
+ }),
+ "test-popular": (
+ ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", {
+ "count": 20,
+ }),
+ ("https://hypnohub.net/post/popular_recent"),
+ ),
+ },
+ "lolibooru": {
+ "root": "https://lolibooru.moe",
+ "test-tag" : ("https://lolibooru.moe/post?tags=ruu_%28tksymkw%29",),
+ "test-pool" : ("https://lolibooru.moe/pool/show/239",),
+ "test-post" : ("https://lolibooru.moe/post/show/287835",),
+ "test-popular": ("https://lolibooru.moe/post/popular_recent",),
+ },
+ "sakugabooru": {
+ "root": "https://www.sakugabooru.com",
+ "pattern": r"(?:www\.)?sakugabooru\.com",
+ "test-tag" : ("https://www.sakugabooru.com/post?tags=nichijou",),
+ "test-pool" : ("https://www.sakugabooru.com/pool/show/54",),
+ "test-post" : ("https://www.sakugabooru.com/post/show/125570",),
+ "test-popular": ("https://www.sakugabooru.com/post/popular_recent",),
+ },
+}
+
+generate_extractors(EXTRACTORS, globals(), (
+ MoebooruTagExtractor,
+ MoebooruPoolExtractor,
+ MoebooruPostExtractor,
+ MoebooruPopularExtractor,
+))
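
The new moebooru.py module ends by instantiating its extractor classes through generate_extractors(). As a rough sketch of how a further Moebooru instance could be declared in the same way (the "examplebooru" site, its URLs, and the resulting class names are hypothetical and not part of this release):

    # Hypothetical sketch only: the site name and URLs below do not exist.
    from gallery_dl.extractor import moebooru
    from gallery_dl.extractor.common import generate_extractors

    EXTRA_SITES = {
        "examplebooru": {
            "root": "https://examplebooru.example",       # made-up instance
            "pattern": r"examplebooru\.example",          # optional URL pattern override
            "test-tag": ("https://examplebooru.example/post?tags=landscape",),
        },
    }

    # generate_extractors() derives one subclass per base class (tag, pool,
    # post, popular), fills in root and pattern, and places the generated
    # classes into the given namespace so the extractor registry finds them.
    generate_extractors(EXTRA_SITES, globals(), (
        moebooru.MoebooruTagExtractor,
        moebooru.MoebooruPoolExtractor,
        moebooru.MoebooruPostExtractor,
        moebooru.MoebooruPopularExtractor,
    ))
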
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 15bb576..99e397b 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -47,8 +47,13 @@ class NozomiExtractor(Extractor):
post["artist"] = self._list(post.get("artist"))
post["copyright"] = self._list(post.get("copyright"))
post["character"] = self._list(post.get("character"))
- post["date"] = text.parse_datetime(
- post["date"] + ":00", "%Y-%m-%d %H:%M:%S%z")
+
+ try:
+ post["date"] = text.parse_datetime(
+ post["date"] + ":00", "%Y-%m-%d %H:%M:%S%z")
+ except Exception:
+ post["date"] = None
+
post.update(data)
images = post["imageurls"]
@@ -109,6 +114,10 @@ class NozomiPostExtractor(NozomiExtractor):
"keyword": "8c3a2561ccc9ad429be9850d1383a952d0b4a8ab",
"count": 7,
}),
+ # empty 'date' (#1163)
+ ("https://nozomi.la/post/130309.html", {
+ "keyword": {"date": None},
+ })
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index e0b0496..abcc33d 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -6,13 +6,13 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://rule34.paheal.net/"""
+"""Extractors for https://rule34.paheal.net/"""
-from .common import Extractor, Message, SharedConfigMixin
+from .common import Extractor, Message
from .. import text
-class PahealExtractor(SharedConfigMixin, Extractor):
+class PahealExtractor(Extractor):
"""Base class for paheal extractors"""
basecategory = "booru"
category = "paheal"
@@ -23,16 +23,16 @@ class PahealExtractor(SharedConfigMixin, Extractor):
def items(self):
self.session.cookies.set(
"ui-tnc-agreed", "true", domain="rule34.paheal.net")
+ data = self.get_metadata()
- yield Message.Version, 1
- yield Message.Directory, self.get_metadata()
-
- for data in self.get_posts():
- url = data["file_url"]
+ for post in self.get_posts():
+ url = post["file_url"]
for key in ("id", "width", "height"):
- data[key] = text.parse_int(data[key])
- data["tags"] = text.unquote(data["tags"])
- yield Message.Url, url, text.nameext_from_url(url, data)
+ post[key] = text.parse_int(post[key])
+ post["tags"] = text.unquote(post["tags"])
+ post.update(data)
+ yield Message.Directory, post
+ yield Message.Url, url, text.nameext_from_url(url, post)
def get_metadata(self):
"""Return general metadata"""
@@ -100,7 +100,7 @@ class PahealPostExtractor(PahealExtractor):
r"/post/view/(\d+)")
test = ("https://rule34.paheal.net/post/view/481609", {
"url": "a91d579be030753282f55b8cb4eeaa89c45a9116",
- "keyword": "44154bdac3d6cf289d0d9739a566acd8b7839e50",
+ "keyword": "e02e4dcf8cdf4e9c206e695253c9024d79a2e20a",
"content": "7b924bcf150b352ac75c9d281d061e174c851a11",
})
diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py
index 45bd8b5..38f94e0 100644
--- a/gallery_dl/extractor/piczel.py
+++ b/gallery_dl/extractor/piczel.py
@@ -19,6 +19,7 @@ class PiczelExtractor(Extractor):
filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}"
archive_fmt = "{id}_{num}"
root = "https://piczel.tv"
+ api_root = "https://tombstone.piczel.tv"
def items(self):
yield Message.Version, 1
@@ -78,7 +79,7 @@ class PiczelUserExtractor(PiczelExtractor):
self.user = match.group(1)
def posts(self):
- url = "{}/api/users/{}/gallery".format(self.root, self.user)
+ url = "{}/api/users/{}/gallery".format(self.api_root, self.user)
return self._pagination(url)
@@ -98,7 +99,7 @@ class PiczelFolderExtractor(PiczelExtractor):
self.user, self.folder_id = match.groups()
def posts(self):
- url = "{}/api/users/{}/gallery".format(self.root, self.user)
+ url = "{}/api/users/{}/gallery".format(self.api_root, self.user)
return self._pagination(url, int(self.folder_id))
@@ -107,7 +108,8 @@ class PiczelImageExtractor(PiczelExtractor):
subcategory = "image"
pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)"
test = ("https://piczel.tv/gallery/image/7807", {
- "url": "85225dd53a03c3b6028f6c4a45b71eccc07f7066",
+ "pattern": r"https://(\w+\.)?piczel\.tv/static/uploads/gallery_image"
+ r"/32920/image/7807/25737334-Lulena\.png",
"content": "df9a053a24234474a19bce2b7e27e0dec23bff87",
"keyword": {
"created_at": "2018-07-22T05:13:58.000Z",
@@ -136,5 +138,5 @@ class PiczelImageExtractor(PiczelExtractor):
self.image_id = match.group(1)
def posts(self):
- url = "{}/api/gallery/{}".format(self.root, self.image_id)
+ url = "{}/api/gallery/{}".format(self.api_root, self.image_id)
return (self.request(url).json(),)
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index a20312f..cfbab1d 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Generic extractors for *reactor sites"""
-from .common import Extractor, Message, SharedConfigMixin
+from .common import Extractor, Message
from .. import text
import urllib.parse
import random
@@ -19,7 +19,7 @@ import json
BASE_PATTERN = r"(?:https?://)?((?:[^/.]+\.)?reactor\.cc)"
-class ReactorExtractor(SharedConfigMixin, Extractor):
+class ReactorExtractor(Extractor):
"""Base class for *reactor.cc extractors"""
basecategory = "reactor"
filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}"
diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py
deleted file mode 100644
index 1d2140a..0000000
--- a/gallery_dl/extractor/realbooru.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://realbooru.com/"""
-
-from . import booru
-
-
-class RealbooruExtractor(booru.XmlParserMixin,
- booru.GelbooruPageMixin,
- booru.BooruExtractor):
- """Base class for realbooru extractors"""
- category = "realbooru"
- api_url = "https://realbooru.com/index.php"
- post_url = "https://realbooru.com/index.php?page=post&s=view&id={}"
- pool_url = "https://realbooru.com/index.php?page=pool&s=show&id={}"
-
- def __init__(self, match):
- super().__init__(match)
- self.params.update({"page": "dapi", "s": "post", "q": "index"})
-
-
-class RealbooruTagExtractor(booru.TagMixin, RealbooruExtractor):
- """Extractor for images from realbooru.com based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?"
- r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
- test = ("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
- "count": ">= 64",
- })
-
-
-class RealbooruPoolExtractor(booru.GelbooruPoolMixin, RealbooruExtractor):
- """Extractor for image-pools from realbooru.com"""
- pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?"
- r"\?page=pool&s=show&id=(?P<pool>\d+)")
- test = ("https://realbooru.com/index.php?page=pool&s=show&id=1", {
- "count": 3,
- })
-
-
-class RealbooruPostExtractor(booru.PostMixin, RealbooruExtractor):
- """Extractor for single images from realbooru.com"""
- pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?"
- r"\?page=post&s=view&id=(?P<post>\d+)")
- test = ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
- "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
- "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
- # "options": (("tags", True),),
- # "keyword": {
- # "tags_general" : str,
- # "tags_metadata": str,
- # "tags_model" : "jennifer_lawrence",
- # },
- })
diff --git a/gallery_dl/extractor/rule34.py b/gallery_dl/extractor/rule34.py
deleted file mode 100644
index de7ef45..0000000
--- a/gallery_dl/extractor/rule34.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2016-2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://rule34.xxx/"""
-
-from . import booru
-
-
-class Rule34Extractor(booru.XmlParserMixin,
- booru.GelbooruPageMixin,
- booru.BooruExtractor):
- """Base class for rule34 extractors"""
- category = "rule34"
- api_url = "https://rule34.xxx/index.php"
- post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}"
- pool_url = "https://rule34.xxx/index.php?page=pool&s=show&id={}"
- page_limit = 4000
-
- def __init__(self, match):
- super().__init__(match)
- self.params.update({"page": "dapi", "s": "post", "q": "index"})
-
-
-class Rule34TagExtractor(booru.TagMixin, Rule34Extractor):
- """Extractor for images from rule34.xxx based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
- r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
- test = ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
- "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
- "pattern": r"https?://([^.]+\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
- "count": 1,
- })
-
-
-class Rule34PoolExtractor(booru.GelbooruPoolMixin, Rule34Extractor):
- """Extractor for image-pools from rule34.xxx"""
- pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
- r"\?page=pool&s=show&id=(?P<pool>\d+)")
- test = ("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
- "count": 3,
- })
-
-
-class Rule34PostExtractor(booru.PostMixin, Rule34Extractor):
- """Extractor for single images from rule34.xxx"""
- pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
- r"\?page=post&s=view&id=(?P<post>\d+)")
- test = ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
- "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "danraku",
- "tags_character": "kashima_(kantai_collection)",
- "tags_copyright": "kantai_collection",
- "tags_general": str,
- "tags_metadata": str,
- },
- })
diff --git a/gallery_dl/extractor/safebooru.py b/gallery_dl/extractor/safebooru.py
deleted file mode 100644
index f5f058c..0000000
--- a/gallery_dl/extractor/safebooru.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://safebooru.org/"""
-
-from . import booru
-
-
-class SafebooruExtractor(booru.XmlParserMixin,
- booru.GelbooruPageMixin,
- booru.BooruExtractor):
- """Base class for safebooru extractors"""
- category = "safebooru"
- api_url = "https://safebooru.org/index.php"
- post_url = "https://safebooru.org/index.php?page=post&s=view&id={}"
- pool_url = "https://safebooru.org/index.php?page=pool&s=show&id={}"
-
- def __init__(self, match):
- super().__init__(match)
- self.params.update({"page": "dapi", "s": "post", "q": "index"})
-
-
-class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor):
- """Extractor for images from safebooru.org based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
- r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
- test = ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
- "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
- "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
- })
-
-
-class SafebooruPoolExtractor(booru.GelbooruPoolMixin, SafebooruExtractor):
- """Extractor for image-pools from safebooru.org"""
- pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
- r"\?page=pool&s=show&id=(?P<pool>\d+)")
- test = ("https://safebooru.org/index.php?page=pool&s=show&id=11", {
- "count": 5,
- })
-
-
-class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor):
- """Extractor for single images from safebooru.org"""
- pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
- r"\?page=post&s=view&id=(?P<post>\d+)")
- test = ("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
- "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
- "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "kawanakajima",
- "tags_character": "heath_ledger ronald_mcdonald the_joker",
- "tags_copyright": "dc_comics mcdonald's the_dark_knight",
- "tags_general": str,
- },
- })
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index a9252f5..438dd9f 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2019 Mike Fährmann
+# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,139 +8,76 @@
"""Extractors for https://chan.sankakucomplex.com/"""
-from .common import Extractor, Message, SharedConfigMixin
-from .. import text, util, exception
-from ..cache import cache
+from .booru import BooruExtractor
+from .. import text, exception
import collections
-import random
-import time
-import re
+BASE_PATTERN = r"(?:https?://)?(?:beta|chan)\.sankakucomplex\.com"
-class SankakuExtractor(SharedConfigMixin, Extractor):
- """Base class for sankaku extractors"""
+
+class SankakuExtractor(BooruExtractor):
+ """Base class for sankaku channel extractors"""
basecategory = "booru"
category = "sankaku"
filename_fmt = "{category}_{id}_{md5}.{extension}"
- cookienames = ("login", "pass_hash")
- cookiedomain = "chan.sankakucomplex.com"
- subdomain = "chan"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.root = "https://" + self.cookiedomain
- self.logged_in = True
- self.start_page = 1
- self.start_post = 0
- self.extags = self.config("tags", False)
- self.wait_min = self.config("wait-min", 3.0)
- self.wait_max = self.config("wait-max", 6.0)
- if self.wait_max < self.wait_min:
- self.wait_max = self.wait_min
-
- def items(self):
- self.login()
-
- yield Message.Version, 1
- data = self.get_metadata()
-
- for post_id in util.advance(self.get_posts(), self.start_post):
- self.wait()
- post = self.get_post_data(post_id)
- url = post["file_url"]
- post.update(data)
- text.nameext_from_url(url, post)
- yield Message.Directory, post
- yield Message.Url, url, post
-
- def skip(self, num):
- self.start_post += num
- return num
-
- def get_metadata(self):
- """Return general metadata"""
- return {}
-
- def get_posts(self):
- """Return an iterable containing all relevant post ids"""
-
- def get_post_data(self, post_id, extr=text.extract):
- """Extract metadata of a single post"""
- url = self.root + "/post/show/" + post_id
- page = self.request(url, retries=10).text
-
- tags , pos = extr(page, "<title>", " | ")
- vavg , pos = extr(page, "itemprop=ratingValue>", "<", pos)
- vcnt , pos = extr(page, "itemprop=reviewCount>", "<", pos)
- _ , pos = extr(page, "Posted: <", "", pos)
- created, pos = extr(page, ' title="', '"', pos)
- rating = extr(page, "<li>Rating: ", "<", pos)[0]
-
- file_url, pos = extr(page, '<li>Original: <a href="', '"', pos)
- if file_url:
- width , pos = extr(page, '>', 'x', pos)
- height, pos = extr(page, '', ' ', pos)
- else:
- width , pos = extr(page, '<object width=', ' ', pos)
- height, pos = extr(page, 'height=', '>', pos)
- file_url = extr(page, '<embed src="', '"', pos)[0]
-
- data = {
- "id": text.parse_int(post_id),
- "md5": file_url.rpartition("/")[2].partition(".")[0],
- "tags": text.unescape(tags),
- "vote_average": text.parse_float(vavg),
- "vote_count": text.parse_int(vcnt),
- "created_at": created,
- "rating": (rating or "?")[0].lower(),
- "file_url": "https:" + text.unescape(file_url),
- "width": text.parse_int(width),
- "height": text.parse_int(height),
- }
-
- if self.extags:
- tags = collections.defaultdict(list)
- tags_html = text.extract(page, '<ul id=tag-sidebar>', '</ul>')[0]
- pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)')
- for tag_type, tag_name in pattern.findall(tags_html or ""):
- tags[tag_type].append(text.unquote(tag_name))
- for key, value in tags.items():
- data["tags_" + key] = " ".join(value)
-
- return data
-
- def wait(self):
- """Wait for a randomly chosen amount of seconds"""
- time.sleep(random.uniform(self.wait_min, self.wait_max))
+ request_interval_min = 1.0
+ per_page = 100
+
+ TAG_TYPES = {
+ 0: "general",
+ 1: "artist",
+ 2: "studio",
+ 3: "copyright",
+ 4: "character",
+ 5: "genre",
+ 6: "",
+ 7: "",
+ 8: "medium",
+ 9: "meta",
+ }
+
+ def _prepare_post(self, post, extended_tags=False):
+ url = post["file_url"]
+ if url[0] == "/":
+ url = self.root + url
+ if extended_tags:
+ self._fetch_extended_tags(post)
+ post["date"] = text.parse_timestamp(post["created_at"]["s"])
+ post["tags"] = [tag["name"] for tag in post["tags"]]
+ return url
+
+ def _fetch_extended_tags(self, post):
+ tags = collections.defaultdict(list)
+ types = self.TAG_TYPES
+ for tag in post["tags"]:
+ tags[types[tag["type"]]].append(tag["name"])
+ for key, value in tags.items():
+ post["tags_" + key] = value
+
+ def _api_request(self, endpoint, params=None):
+ url = "https://capi-v2.sankakucomplex.com" + endpoint
+ while True:
+ response = self.request(url, params=params, fatal=False)
+ if response.status_code == 429:
+ self.wait(until=response.headers.get("X-RateLimit-Reset"))
+ continue
+ return response.json()
- def login(self):
- """Login and set necessary cookies"""
- if self._check_cookies(self.cookienames):
- return
- username, password = self._get_auth_info()
- if username:
- cookies = self._login_impl((username, self.subdomain), password)
- self._update_cookies(cookies)
- else:
- self.logged_in = False
+ def _pagination(self, params):
+ params["lang"] = "en"
+ params["limit"] = str(self.per_page)
- @cache(maxage=90*24*3600, keyarg=1)
- def _login_impl(self, usertuple, password):
- username = usertuple[0]
- self.log.info("Logging in as %s", username)
- url = self.root + "/user/authenticate"
- data = {
- "url": "",
- "user[name]": username,
- "user[password]": password,
- "commit": "Login",
- }
- response = self.request(url, method="POST", data=data)
+ while True:
+ data = self._api_request("/posts/keyset", params)
+ if not data.get("success", True):
+ raise exception.StopExtraction(data.get("code"))
+ yield from data["data"]
- if not response.history or response.url != self.root + "/user/home":
- raise exception.AuthenticationError()
- cookies = response.history[0].cookies
- return {c: cookies[c] for c in self.cookienames}
+ params["next"] = data["meta"]["next"]
+ if not params["next"]:
+ return
+ if "page" in params:
+ del params["page"]
class SankakuTagExtractor(SankakuExtractor):
@@ -148,21 +85,13 @@ class SankakuTagExtractor(SankakuExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = r"(?:https?://)?chan\.sankakucomplex\.com/\?([^#]*)"
+ pattern = BASE_PATTERN + r"/\?([^#]*)"
test = (
- ("https://chan.sankakucomplex.com/?tags=bonocho", {
+ ("https://beta.sankakucomplex.com/?tags=bonocho", {
"count": 5,
"pattern": r"https://c?s\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+",
}),
- # respect 'page' query parameter
- ("https://chan.sankakucomplex.com/?tags=bonocho&page=2", {
- "count": 0,
- }),
- # respect 'next' query parameter
- ("https://chan.sankakucomplex.com/?tags=bonocho&next=182284", {
- "count": 1,
- }),
# error on five or more tags
("https://chan.sankakucomplex.com/?tags=bonocho+a+b+c+d", {
"options": (("username", None),),
@@ -172,128 +101,69 @@ class SankakuTagExtractor(SankakuExtractor):
("https://chan.sankakucomplex.com"
"/?tags=marie_rose&page=98&next=3874906&commit=Search"),
)
- per_page = 20
def __init__(self, match):
SankakuExtractor.__init__(self, match)
query = text.parse_query(match.group(1))
self.tags = text.unquote(query.get("tags", "").replace("+", " "))
- self.start_page = text.parse_int(query.get("page"), 1)
- self.next = text.parse_int(query.get("next"), 0)
-
- def skip(self, num):
- if self.next:
- self.start_post += num
- else:
- pages, posts = divmod(num, self.per_page)
- self.start_page += pages
- self.start_post += posts
- return num
-
- def get_metadata(self):
- if not self.next:
- max_page = 50 if self.logged_in else 25
- if self.start_page > max_page:
- self.log.info("Traversing from page %d to page %d",
- max_page, self.start_page)
- self.start_post += self.per_page * (self.start_page - max_page)
- self.start_page = max_page
- tags = self.tags.split()
- if not self.logged_in and len(tags) > 4:
- raise exception.StopExtraction(
- "Unauthenticated users cannot use more than 4 tags at once.")
- return {"search_tags": " ".join(tags)}
+ def metadata(self):
+ return {"search_tags": self.tags}
- def get_posts(self):
- params = {"tags": self.tags}
-
- if self.next:
- params["next"] = self.next
- else:
- params["page"] = self.start_page
-
- while True:
- self.wait()
- page = self.request(self.root, params=params, retries=10).text
- pos = page.find("<div id=more-popular-posts-link>") + 1
-
- ids = list(text.extract_iter(page, '" id=p', '>', pos))
- if not ids:
- return
- yield from ids
-
- next_qs = text.extract(page, 'next-page-url="/?', '"', pos)[0]
- next_id = text.parse_query(next_qs).get("next")
-
- # stop if the same "next" parameter occurs twice in a row (#265)
- if "next" in params and params["next"] == next_id:
- return
-
- params["next"] = next_id or (text.parse_int(ids[-1]) - 1)
- params["page"] = "2"
+ def posts(self):
+ return self._pagination({"tags": self.tags})
class SankakuPoolExtractor(SankakuExtractor):
- """Extractor for image-pools from chan.sankakucomplex.com"""
+ """Extractor for image pools or books from chan.sankakucomplex.com"""
subcategory = "pool"
- directory_fmt = ("{category}", "pool", "{pool}")
+ directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}")
archive_fmt = "p_{pool}_{id}"
- pattern = r"(?:https?://)?chan\.sankakucomplex\.com/pool/show/(\d+)"
- test = ("https://chan.sankakucomplex.com/pool/show/90", {
- "count": 5,
- })
- per_page = 24
+ pattern = BASE_PATTERN + r"/(?:books|pool/show)/(\d+)"
+ test = (
+ ("https://beta.sankakucomplex.com/books/90", {
+ "count": 5,
+ }),
+ ("https://chan.sankakucomplex.com/pool/show/90"),
+ )
def __init__(self, match):
SankakuExtractor.__init__(self, match)
self.pool_id = match.group(1)
- def skip(self, num):
- pages, posts = divmod(num, self.per_page)
- self.start_page += pages
- self.start_post += posts
- return num
-
- def get_metadata(self):
- return {"pool": self.pool_id}
+ def metadata(self):
+ pool = self._api_request("/pools/" + self.pool_id)
+ self._posts = pool.pop("posts")
+ return {"pool": pool}
- def get_posts(self):
- url = self.root + "/pool/show/" + self.pool_id
- params = {"page": self.start_page}
-
- while True:
- page = self.request(url, params=params, retries=10).text
- ids = list(text.extract_iter(page, '" id=p', '>'))
-
- yield from ids
- if len(ids) < self.per_page:
- return
-
- params["page"] += 1
+ def posts(self):
+ return self._posts
class SankakuPostExtractor(SankakuExtractor):
"""Extractor for single images from chan.sankakucomplex.com"""
subcategory = "post"
archive_fmt = "{id}"
- pattern = r"(?:https?://)?chan\.sankakucomplex\.com/post/show/(\d+)"
- test = ("https://chan.sankakucomplex.com/post/show/360451", {
- "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "bonocho",
- "tags_studio": "dc_comics",
- "tags_medium": "sketch copyright_name",
- "tags_copyright": str,
- "tags_character": str,
- "tags_general": str,
- },
- })
+ pattern = BASE_PATTERN + r"/post/show/(\d+)"
+ test = (
+ ("https://beta.sankakucomplex.com/post/show/360451", {
+ "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": ["bonocho"],
+ "tags_studio": ["dc_comics"],
+ "tags_medium": ["sketch", "copyright_name"],
+ "tags_copyright": list,
+ "tags_character": list,
+ "tags_general" : list,
+ },
+ }),
+ ("https://chan.sankakucomplex.com/post/show/360451"),
+ )
def __init__(self, match):
SankakuExtractor.__init__(self, match)
self.post_id = match.group(1)
- def get_posts(self):
- return (self.post_id,)
+ def posts(self):
+ return self._pagination({"tags": "id:" + self.post_id})
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index 9d1df18..d65f334 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -8,12 +8,12 @@
"""Extractors for Shopify instances"""
-from .common import Extractor, Message, SharedConfigMixin, generate_extractors
+from .common import Extractor, Message, generate_extractors
from .. import text
import re
-class ShopifyExtractor(SharedConfigMixin, Extractor):
+class ShopifyExtractor(Extractor):
"""Base class for Shopify extractors"""
basecategory = "shopify"
filename_fmt = "{product[title]}_{num:>02}_{id}.{extension}"
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index fe0b3c5..a77ea06 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -106,15 +106,26 @@ class TwitterExtractor(Extractor):
})
elif "media_url_https" in media:
url = media["media_url_https"]
+ base, _, fmt = url.rpartition(".")
+ base += "?format=" + fmt + "&name="
files.append(text.nameext_from_url(url, {
- "url" : url + ":orig",
- "_fallback": [url+":large", url+":medium", url+":small"],
+ "url" : base + "orig",
"width" : width,
"height" : height,
+ "_fallback": self._image_fallback(base, url),
}))
else:
files.append({"url": media["media_url"]})
+ @staticmethod
+ def _image_fallback(base, url):
+ url += ":"
+ yield url + "orig"
+
+ for size in ("large", "medium", "small"):
+ yield base + size
+ yield url + size
+
def _extract_card(self, tweet, files):
card = tweet["card"]
if card["name"] in ("summary", "summary_large_image"):
@@ -267,7 +278,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
test = (
("https://twitter.com/supernaturepics", {
"range": "1-40",
- "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
+ "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
}),
("https://mobile.twitter.com/supernaturepics?p=i"),
("https://www.twitter.com/id:2976459548"),
@@ -291,7 +302,7 @@ class TwitterMediaExtractor(TwitterExtractor):
test = (
("https://twitter.com/supernaturepics/media", {
"range": "1-40",
- "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
+ "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
}),
("https://mobile.twitter.com/supernaturepics/media#t"),
("https://www.twitter.com/id:2976459548/media"),
@@ -374,12 +385,12 @@ class TwitterTweetExtractor(TwitterExtractor):
pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
test = (
("https://twitter.com/supernaturepics/status/604341487988576256", {
- "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
+ "url": "88a40f7d25529c2501c46f2218f9e0de9aa634b4",
"content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
}),
# 4 images
("https://twitter.com/perrypumas/status/894001459754180609", {
- "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
+ "url": "3a2a43dc5fb79dd5432c701d8e55e87c4e551f47",
}),
# video
("https://twitter.com/perrypumas/status/1065692031626829824", {
@@ -396,7 +407,7 @@ class TwitterTweetExtractor(TwitterExtractor):
}),
# Reply to deleted tweet (#403, #838)
("https://twitter.com/i/web/status/1170041925560258560", {
- "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_.jpg:orig",
+ "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_",
}),
# 'replies' option (#705)
("https://twitter.com/i/web/status/1170041925560258560", {
@@ -405,13 +416,13 @@ class TwitterTweetExtractor(TwitterExtractor):
}),
# quoted tweet (#526, #854)
("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
- "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+\.jpg",
+ "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+=jpg",
"count": 8,
}),
# "quoted" option (#854)
("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
"options": (("quoted", False),),
- "pattern": r"https://pbs\.twimg\.com/media/EaK.+\.jpg",
+ "pattern": r"https://pbs\.twimg\.com/media/EaK.+=jpg",
"count": 4,
}),
# TwitPic embeds (#579)
@@ -422,7 +433,7 @@ class TwitterTweetExtractor(TwitterExtractor):
}),
# Nitter tweet (#890)
("https://nitter.net/ed1conf/status/1163841619336007680", {
- "url": "0f6a841e23948e4320af7ae41125e0c5b3cadc98",
+ "url": "4a9ea898b14d3c112f98562d0df75c9785e239d9",
"content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
}),
# Twitter card (#1005)
@@ -494,19 +505,25 @@ class TwitterAPI():
}
cookies = self.extractor.session.cookies
+ cookiedomain = ".twitter.com"
# CSRF
- csrf = util.generate_csrf_token()
- self.headers["x-csrf-token"] = csrf
- cookies.set("ct0", csrf, domain=".twitter.com")
-
- if cookies.get("auth_token", domain=".twitter.com"):
+ csrf_token = cookies.get("ct0", domain=cookiedomain)
+ if not csrf_token:
+ csrf_token = util.generate_csrf_token()
+ cookies.set("ct0", csrf_token, domain=cookiedomain)
+ self.headers["x-csrf-token"] = csrf_token
+
+ if cookies.get("auth_token", domain=cookiedomain):
+ # logged in
+ self.root = "https://twitter.com/i/api/"
self.headers["x-twitter-auth-type"] = "OAuth2Session"
else:
- # guest token
+ # guest
+ self.root = "https://api.twitter.com/"
guest_token = self._guest_token()
+ cookies.set("gt", guest_token, domain=cookiedomain)
self.headers["x-guest-token"] = guest_token
- cookies.set("gt", guest_token, domain=".twitter.com")
def tweet(self, tweet_id):
endpoint = "2/timeline/conversation/{}.json".format(tweet_id)
@@ -597,10 +614,16 @@ class TwitterAPI():
return self._call(endpoint, None, "POST")["guest_token"]
def _call(self, endpoint, params, method="GET"):
- url = "https://api.twitter.com/" + endpoint
+ url = self.root + endpoint
response = self.extractor.request(
url, method=method, params=params, headers=self.headers,
fatal=None)
+
+ # update 'x-csrf-token' header (#1170)
+ csrf_token = response.cookies.get("ct0")
+ if csrf_token:
+ self.headers["x-csrf-token"] = csrf_token
+
if response.status_code < 400:
return response.json()
if response.status_code == 429:
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 55324cb..a3dc6a0 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -11,7 +11,6 @@
from .common import Extractor, Message
from .. import exception, text, util
-
BASE_PATTERN = r"(?:https?://)?(?:www\.)?webtoons\.com/((en|fr)"
@@ -22,10 +21,18 @@ class WebtoonsExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.session.cookies.set("ageGatePass", "true",
- domain=self.cookiedomain)
self.path, self.lang, self.genre , self.comic, self.query = \
match.groups()
+ cookies = self.session.cookies
+ cookies.set("pagGDPR", "true", domain=self.cookiedomain)
+ cookies.set("ageGatePass", "true", domain=self.cookiedomain)
+
+ def request(self, url, **kwargs):
+ response = Extractor.request(self, url, **kwargs)
+ if response.history and "/ageGate" in response.request.url:
+ raise exception.StopExtraction(
+ "Redirected to age gate check ('%s')", response.request.url)
+ return response
class WebtoonsEpisodeExtractor(WebtoonsExtractor):
diff --git a/gallery_dl/extractor/yandere.py b/gallery_dl/extractor/yandere.py
deleted file mode 100644
index 623e7a8..0000000
--- a/gallery_dl/extractor/yandere.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://yande.re/"""
-
-from . import booru
-
-
-class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
- """Base class for yandere extractors"""
- category = "yandere"
- api_url = "https://yande.re/post.json"
- post_url = "https://yande.re/post/show/{}"
-
-
-class YandereTagExtractor(booru.TagMixin, YandereExtractor):
- """Extractor for images from yande.re based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?yande\.re"
- r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)")
- test = ("https://yande.re/post?tags=ouzoku+armor", {
- "content": "59201811c728096b2d95ce6896fd0009235fe683",
- })
-
-
-class YanderePoolExtractor(booru.PoolMixin, YandereExtractor):
- """Extractor for image-pools from yande.re"""
- pattern = r"(?:https?://)?(?:www\.)?yande\.re/pool/show/(?P<pool>\d+)"
- test = ("https://yande.re/pool/show/318", {
- "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68",
- })
-
-
-class YanderePostExtractor(booru.PostMixin, YandereExtractor):
- """Extractor for single images from yande.re"""
- pattern = r"(?:https?://)?(?:www\.)?yande\.re/post/show/(?P<post>\d+)"
- test = ("https://yande.re/post/show/51824", {
- "content": "59201811c728096b2d95ce6896fd0009235fe683",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "sasaki_tamaru",
- "tags_circle": "softhouse_chara",
- "tags_copyright": "ouzoku",
- "tags_general": str,
- },
- })
-
-
-class YanderePopularExtractor(booru.MoebooruPopularMixin, YandereExtractor):
- """Extractor for popular images from yande.re"""
- pattern = (r"(?:https?://)?(?:www\.)?yande\.re"
- r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
- r"(?:\?(?P<query>[^#]*))?")
- test = (
- ("https://yande.re/post/popular_by_month?month=6&year=2014", {
- "count": 40,
- }),
- ("https://yande.re/post/popular_recent"),
- )
-
- def __init__(self, match):
- super().__init__(match)
- self.api_url = "https://yande.re/post/popular_{scale}.json".format(
- scale=self.scale)
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 66dea08..c1d32ef 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -10,6 +10,7 @@ import sys
import time
import errno
import logging
+import collections
from . import extractor, downloader, postprocessor
from . import config, text, util, output, exception
from .extractor.message import Message
@@ -193,8 +194,8 @@ class DownloadJob(Job):
self.blacklist = None
self.archive = None
self.sleep = None
+ self.hooks = ()
self.downloaders = {}
- self.postprocessors = None
self.out = output.select()
if parent:
@@ -207,16 +208,16 @@ class DownloadJob(Job):
def handle_url(self, url, kwdict):
"""Download the resource specified in 'url'"""
- postprocessors = self.postprocessors
+ hooks = self.hooks
pathfmt = self.pathfmt
archive = self.archive
# prepare download
pathfmt.set_filename(kwdict)
- if postprocessors:
- for pp in postprocessors:
- pp.prepare(pathfmt)
+ if "prepare" in hooks:
+ for callback in hooks["prepare"]:
+ callback(pathfmt)
if archive and archive.check(kwdict):
pathfmt.fix_extension()
@@ -255,19 +256,19 @@ class DownloadJob(Job):
return
# run post processors
- if postprocessors:
- for pp in postprocessors:
- pp.run(pathfmt)
+ if "file" in hooks:
+ for callback in hooks["file"]:
+ callback(pathfmt)
# download succeeded
pathfmt.finalize()
self.out.success(pathfmt.path, 0)
+ self._skipcnt = 0
if archive:
archive.add(kwdict)
- if postprocessors:
- for pp in postprocessors:
- pp.run_after(pathfmt)
- self._skipcnt = 0
+ if "after" in hooks:
+ for callback in hooks["after"]:
+ callback(pathfmt)
def handle_directory(self, kwdict):
"""Set and create the target directory for downloads"""
@@ -275,17 +276,18 @@ class DownloadJob(Job):
self.initialize(kwdict)
else:
self.pathfmt.set_directory(kwdict)
+ if "post" in self.hooks:
+ for callback in self.hooks["post"]:
+ callback(self.pathfmt)
def handle_metadata(self, kwdict):
"""Run postprocessors with metadata from 'kwdict'"""
- postprocessors = self.postprocessors
-
- if postprocessors:
+ if "metadata" in self.hooks:
kwdict["extension"] = "metadata"
pathfmt = self.pathfmt
pathfmt.set_filename(kwdict)
- for pp in postprocessors:
- pp.run_metadata(pathfmt)
+ for callback in self.hooks["metadata"]:
+ callback(pathfmt)
def handle_queue(self, url, kwdict):
if url in self.visited:
@@ -313,13 +315,17 @@ class DownloadJob(Job):
self.archive.close()
if pathfmt:
self.extractor._store_cookies()
- if self.postprocessors:
+ if "finalize" in self.hooks:
status = self.status
- for pp in self.postprocessors:
- pp.run_final(pathfmt, status)
+ for callback in self.hooks["finalize"]:
+ callback(pathfmt, status)
def handle_skip(self):
- self.out.skip(self.pathfmt.path)
+ pathfmt = self.pathfmt
+ self.out.skip(pathfmt.path)
+ if "skip" in self.hooks:
+ for callback in self.hooks["skip"]:
+ callback(pathfmt)
if self._skipexc:
self._skipcnt += 1
if self._skipcnt >= self._skipmax:
@@ -407,16 +413,24 @@ class DownloadJob(Job):
postprocessors = self.extractor.config_accumulate("postprocessors")
if postprocessors:
+ self.hooks = collections.defaultdict(list)
pp_log = self.get_logger("postprocessor")
pp_list = []
category = self.extractor.category
+ basecategory = self.extractor.basecategory
for pp_dict in postprocessors:
+
whitelist = pp_dict.get("whitelist")
+ if whitelist and category not in whitelist and \
+ basecategory not in whitelist:
+ continue
+
blacklist = pp_dict.get("blacklist")
- if (whitelist and category not in whitelist or
- blacklist and category in blacklist):
+ if blacklist and (
+ category in blacklist or basecategory in blacklist):
continue
+
name = pp_dict.get("name")
pp_cls = postprocessor.find(name)
if not pp_cls:
@@ -431,9 +445,11 @@ class DownloadJob(Job):
pp_list.append(pp_obj)
if pp_list:
- self.postprocessors = pp_list
self.extractor.log.debug(
"Active postprocessor modules: %s", pp_list)
+ if "init" in self.hooks:
+ for callback in self.hooks["init"]:
+ callback(pathfmt)
def _build_blacklist(self):
wlist = self.extractor.config("whitelist")
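
The job.py changes above replace the flat postprocessor list with per-event callback hooks: each postprocessor appends callables to job.hooks["<event>"], and DownloadJob only dispatches events that have registered callbacks. A minimal standalone sketch of that pattern, with simplified names that are not the actual gallery-dl API:

    # Simplified illustration of the hook mechanism introduced above;
    # class and event names are reduced to the bare pattern.
    import collections

    class Job:
        def __init__(self):
            self.hooks = collections.defaultdict(list)

        def dispatch(self, event, *args):
            # only callbacks registered for 'event' are invoked
            for callback in self.hooks[event]:
                callback(*args)

    class LoggingPP:
        def __init__(self, job):
            job.hooks["file"].append(self.on_file)
            job.hooks["finalize"].append(self.on_finalize)

        def on_file(self, path):
            print("file hook:", path)

        def on_finalize(self, path, status):
            print("finalize hook, exit status", status)

    job = Job()
    LoggingPP(job)
    job.dispatch("file", "/tmp/example.jpg")
    job.dispatch("finalize", None, 0)
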
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index 2a48c87..01537d6 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -376,7 +376,8 @@ def build_parser():
postprocessor.add_argument(
"--exec-after",
dest="postprocessors", metavar="CMD",
- action=AppendCommandAction, const={"name": "exec", "final": True},
+ action=AppendCommandAction, const={
+ "name": "exec", "event": "finalize"},
help=("Execute CMD after all files were downloaded successfully. "
"Example: --exec-after 'cd {} && convert * ../doc.pdf'"),
)
diff --git a/gallery_dl/postprocessor/classify.py b/gallery_dl/postprocessor/classify.py
index 0106903..eda092d 100644
--- a/gallery_dl/postprocessor/classify.py
+++ b/gallery_dl/postprocessor/classify.py
@@ -32,13 +32,16 @@ class ClassifyPP(PostProcessor):
for ext in exts
}
+ job.hooks["prepare"].append(self.prepare)
+ job.hooks["file"].append(self.move)
+
def prepare(self, pathfmt):
ext = pathfmt.extension
if ext in self.mapping:
# set initial paths to enable download skips
self._build_paths(pathfmt, self.mapping[ext])
- def run(self, pathfmt):
+ def move(self, pathfmt):
ext = pathfmt.extension
if ext in self.mapping:
# rebuild paths in case the filename extension changed
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
index 64f978e..ef211e6 100644
--- a/gallery_dl/postprocessor/common.py
+++ b/gallery_dl/postprocessor/common.py
@@ -16,25 +16,5 @@ class PostProcessor():
name = self.__class__.__name__[:-2].lower()
self.log = job.get_logger("postprocessor." + name)
- @staticmethod
- def prepare(pathfmt):
- """Update file paths, etc."""
-
- @staticmethod
- def run(pathfmt):
- """Execute the postprocessor for a file"""
-
- @staticmethod
- def run_metadata(pathfmt):
- """Execute the postprocessor for a file"""
-
- @staticmethod
- def run_after(pathfmt):
- """Execute postprocessor after moving a file to its target location"""
-
- @staticmethod
- def run_final(pathfmt, status):
- """Postprocessor finalization after all files have been downloaded"""
-
def __repr__(self):
return self.__class__.__name__
diff --git a/gallery_dl/postprocessor/compare.py b/gallery_dl/postprocessor/compare.py
index 0d11844..ca416c9 100644
--- a/gallery_dl/postprocessor/compare.py
+++ b/gallery_dl/postprocessor/compare.py
@@ -16,22 +16,25 @@ class ComparePP(PostProcessor):
def __init__(self, job, options):
PostProcessor.__init__(self, job)
- if options.get("action") == "enumerate":
- self.run = self._run_enumerate
if options.get("shallow"):
- self.compare = self._compare_size
+ self._compare = self._compare_size
+ job.hooks["file"].append(
+ self.enumerate
+ if options.get("action") == "enumerate" else
+ self.compare
+ )
- def run(self, pathfmt):
+ def compare(self, pathfmt):
try:
- if self.compare(pathfmt.realpath, pathfmt.temppath):
+ if self._compare(pathfmt.realpath, pathfmt.temppath):
pathfmt.delete = True
except OSError:
pass
- def _run_enumerate(self, pathfmt):
+ def enumerate(self, pathfmt):
num = 1
try:
- while not self.compare(pathfmt.realpath, pathfmt.temppath):
+ while not self._compare(pathfmt.realpath, pathfmt.temppath):
pathfmt.prefix = str(num) + "."
pathfmt.set_extension(pathfmt.extension, False)
num += 1
@@ -39,7 +42,7 @@ class ComparePP(PostProcessor):
except OSError:
pass
- def compare(self, f1, f2):
+ def _compare(self, f1, f2):
return self._compare_size(f1, f2) and self._compare_content(f1, f2)
@staticmethod
diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py
index cbe51ae..205f42e 100644
--- a/gallery_dl/postprocessor/exec.py
+++ b/gallery_dl/postprocessor/exec.py
@@ -24,54 +24,58 @@ class ExecPP(PostProcessor):
def __init__(self, job, options):
PostProcessor.__init__(self, job)
- args = options["command"]
- final = options.get("final", False)
+ if options.get("async", False):
+ self._exec = self._exec_async
+
+ args = options["command"]
if isinstance(args, str):
- if final:
- self._format = self._format_args_directory
- else:
- self._format = self._format_args_path
if "{}" not in args:
args += " {}"
self.args = args
- self.shell = True
+ execute = self.exec_string
else:
- self._format = self._format_args_list
self.args = [util.Formatter(arg) for arg in args]
- self.shell = False
-
- if final:
- self.run_after = PostProcessor.run_after
- else:
- self.run_final = PostProcessor.run_final
-
- if options.get("async", False):
- self._exec = self._exec_async
+ execute = self.exec_list
+
+ events = options.get("event")
+ if events is None:
+ events = ("after",)
+ if options.get("final"):
+ self.log.warning("'final' is deprecated, "
+ "use '\"event\": \"finalize\"' instead")
+ events = ("finalize",)
+ elif isinstance(events, str):
+ events = events.split(",")
+ for event in events:
+ job.hooks[event].append(execute)
+
+ def exec_list(self, pathfmt, status=None):
+ if status:
+ return
- def run_after(self, pathfmt):
- self._exec(self._format(pathfmt))
-
- def run_final(self, pathfmt, status):
- if status == 0:
- self._exec(self._format(pathfmt))
-
- def _format_args_path(self, pathfmt):
- return self.args.replace("{}", quote(pathfmt.realpath))
-
- def _format_args_directory(self, pathfmt):
- return self.args.replace("{}", quote(pathfmt.realdirectory))
-
- def _format_args_list(self, pathfmt):
kwdict = pathfmt.kwdict
kwdict["_directory"] = pathfmt.realdirectory
kwdict["_filename"] = pathfmt.filename
kwdict["_path"] = pathfmt.realpath
- return [arg.format_map(kwdict) for arg in self.args]
- def _exec(self, args):
+ args = [arg.format_map(kwdict) for arg in self.args]
+ self._exec(args, False)
+
+ def exec_string(self, pathfmt, status=None):
+ if status:
+ return
+
+ if status is None and pathfmt.realpath:
+ args = self.args.replace("{}", quote(pathfmt.realpath))
+ else:
+ args = self.args.replace("{}", quote(pathfmt.realdirectory))
+
+ self._exec(args, True)
+
+ def _exec(self, args, shell):
self.log.debug("Running '%s'", args)
- retcode = subprocess.Popen(args, shell=self.shell).wait()
+ retcode = subprocess.Popen(args, shell=shell).wait()
if retcode:
self.log.warning(
"Executing '%s' returned with non-zero exit status (%d)",
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index f88dde7..27f9c03 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Write metadata to JSON files"""
+"""Write metadata to external files"""
from .common import PostProcessor
from .. import util
@@ -24,7 +24,7 @@ class MetadataPP(PostProcessor):
cfmt = options.get("content-format") or options.get("format")
if isinstance(cfmt, list):
cfmt = "\n".join(cfmt) + "\n"
- self.contentfmt = util.Formatter(cfmt).format_map
+ self._content_fmt = util.Formatter(cfmt).format_map
ext = "txt"
elif mode == "tags":
self.write = self._write_tags
@@ -39,47 +39,68 @@ class MetadataPP(PostProcessor):
if directory:
self._directory = self._directory_custom
sep = os.sep + (os.altsep or "")
- self.metadir = directory.rstrip(sep) + os.sep
+ self._metadir = directory.rstrip(sep) + os.sep
+ filename = options.get("filename")
extfmt = options.get("extension-format")
- if extfmt:
+ if filename:
self._filename = self._filename_custom
- self.extfmt = util.Formatter(extfmt).format_map
+ self._filename_fmt = util.Formatter(filename).format_map
+ elif extfmt:
+ self._filename = self._filename_extfmt
+ self._extension_fmt = util.Formatter(extfmt).format_map
else:
self.extension = options.get("extension", ext)
- if options.get("bypost"):
- self.run_metadata, self.run = self.run, self.run_metadata
+ events = options.get("event")
+ if events is None:
+ events = ("file",)
+ if options.get("bypost"):
+ self.log.warning("'bypost' is deprecated, use '\"event\": "
+ "\"post\"' and 'filename' instead")
+ events = ("metadata",)
+ elif isinstance(events, str):
+ events = events.split(",")
+ for event in events:
+ job.hooks[event].append(self.run)
def run(self, pathfmt):
- path = self._directory(pathfmt) + self._filename(pathfmt)
- with open(path, "w", encoding="utf-8") as file:
- self.write(file, pathfmt.kwdict)
+ directory = self._directory(pathfmt)
+ path = directory + self._filename(pathfmt)
+
+ try:
+ with open(path, "w", encoding="utf-8") as fp:
+ self.write(fp, pathfmt.kwdict)
+ except FileNotFoundError:
+ os.makedirs(directory, exist_ok=True)
+ with open(path, "w", encoding="utf-8") as fp:
+ self.write(fp, pathfmt.kwdict)
def _directory(self, pathfmt):
return pathfmt.realdirectory
def _directory_custom(self, pathfmt):
- directory = os.path.join(pathfmt.realdirectory, self.metadir)
- os.makedirs(directory, exist_ok=True)
- return directory
+ return os.path.join(pathfmt.realdirectory, self._metadir)
def _filename(self, pathfmt):
- return pathfmt.filename + "." + self.extension
+ return (pathfmt.filename or "metadata") + "." + self.extension
def _filename_custom(self, pathfmt):
+ return self._filename_fmt(pathfmt.kwdict)
+
+ def _filename_extfmt(self, pathfmt):
kwdict = pathfmt.kwdict
ext = kwdict["extension"]
kwdict["extension"] = pathfmt.extension
- kwdict["extension"] = pathfmt.prefix + self.extfmt(kwdict)
+ kwdict["extension"] = pathfmt.prefix + self._extension_fmt(kwdict)
filename = pathfmt.build_filename()
kwdict["extension"] = ext
return filename
- def _write_custom(self, file, kwdict):
- file.write(self.contentfmt(kwdict))
+ def _write_custom(self, fp, kwdict):
+ fp.write(self._content_fmt(kwdict))
- def _write_tags(self, file, kwdict):
+ def _write_tags(self, fp, kwdict):
tags = kwdict.get("tags") or kwdict.get("tag_string")
if not tags:
@@ -91,11 +112,10 @@ class MetadataPP(PostProcessor):
taglist = tags.split(" ")
tags = taglist
- file.write("\n".join(tags))
- file.write("\n")
+ fp.write("\n".join(tags) + "\n")
- def _write_json(self, file, kwdict):
- util.dump_json(util.filter_dict(kwdict), file, self.ascii, self.indent)
+ def _write_json(self, fp, kwdict):
+ util.dump_json(util.filter_dict(kwdict), fp, self.ascii, self.indent)
__postprocessor__ = MetadataPP
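
Note: with this hunk the metadata postprocessor registers itself on the job's hook table according to the new "event" option (defaulting to "file"; the "metadata" event replaces the deprecated "bypost"), and a "filename" format string now takes precedence over "extension-format". A hedged example of how these options might be combined follows; it is written as a Python dict for brevity (an actual gallery-dl.conf entry would be the equivalent JSON), and the format string and directory name are illustrative, not defaults.

    # illustrative postprocessor options using the keys read in the diff above
    metadata_options = {
        "name"     : "metadata",
        "event"    : "file",                            # "metadata" is what 'bypost' maps to
        "directory": "metadata",                        # write into a ./metadata/ subdirectory
        "filename" : "{category}_{filename}_meta.json", # overrides "extension-format"
    }
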
diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py
index b8a4988..e4c28ea 100644
--- a/gallery_dl/postprocessor/mtime.py
+++ b/gallery_dl/postprocessor/mtime.py
@@ -17,6 +17,7 @@ class MtimePP(PostProcessor):
def __init__(self, job, options):
PostProcessor.__init__(self, job)
self.key = options.get("key", "date")
+ job.hooks["file"].append(self.run)
def run(self, pathfmt):
mtime = pathfmt.kwdict.get(self.key)
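
Note: mtime.py here, and ugoira.py and zip.py below, all switch to the same pattern: instead of the job calling a fixed run()/run_final() pair, each postprocessor appends its callbacks to job.hooks[event] during __init__ and the job fires every callback registered for an event. A minimal, self-contained sketch of that dispatch follows; DummyJob and MtimeLike are illustrative stand-ins, not gallery-dl classes.

    import collections

    class DummyJob:
        # stand-in for the job object; only the hooks mapping matters here
        def __init__(self):
            self.hooks = collections.defaultdict(list)

    class MtimeLike:
        # stand-in postprocessor: registers its callback on construction,
        # mirroring job.hooks["file"].append(self.run) in the hunk above
        def __init__(self, job):
            job.hooks["file"].append(self.run)

        def run(self, pathfmt):
            print("set mtime for", pathfmt)

    job = DummyJob()
    MtimeLike(job)

    # the download job later fires every callback registered for an event
    for callback in job.hooks["file"]:
        callback("file.ext")
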
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 1afba86..14eaa8d 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -49,6 +49,9 @@ class UgoiraPP(PostProcessor):
else:
self.prevent_odd = False
+ job.hooks["prepare"].append(self.prepare)
+ job.hooks["file"].append(self.convert)
+
def prepare(self, pathfmt):
self._frames = None
@@ -65,7 +68,7 @@ class UgoiraPP(PostProcessor):
if self.delete:
pathfmt.set_extension(self.extension)
- def run(self, pathfmt):
+ def convert(self, pathfmt):
if not self._frames:
return
diff --git a/gallery_dl/postprocessor/zip.py b/gallery_dl/postprocessor/zip.py
index a6e5bc3..e820280 100644
--- a/gallery_dl/postprocessor/zip.py
+++ b/gallery_dl/postprocessor/zip.py
@@ -38,12 +38,11 @@ class ZipPP(PostProcessor):
self.args = (self.path[:-1] + ext, "a",
self.COMPRESSION_ALGORITHMS[algorithm], True)
- if options.get("mode") == "safe":
- self.run = self._write_safe
- else:
- self.run = self._write
+ job.hooks["file"].append(
+ self.write_safe if options.get("mode") == "safe" else self.write)
+ job.hooks["finalize"].append(self.finalize)
- def _write(self, pathfmt, zfile=None):
+ def write(self, pathfmt, zfile=None):
# 'NameToInfo' is not officially documented, but it's available
# for all supported Python versions and using it directly is a lot
# faster than calling getinfo()
@@ -55,11 +54,11 @@ class ZipPP(PostProcessor):
zfile.write(pathfmt.temppath, pathfmt.filename)
pathfmt.delete = self.delete
- def _write_safe(self, pathfmt):
+ def write_safe(self, pathfmt):
with zipfile.ZipFile(*self.args) as zfile:
self._write(pathfmt, zfile)
- def run_final(self, pathfmt, status):
+ def finalize(self, pathfmt, status):
if self.zfile:
self.zfile.close()
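
Note: for the zip postprocessor the hooks also make the two write modes explicit: write() keeps a single archive open across files and closes it once on the "finalize" hook, while write_safe() reopens the archive for every file so an interrupted run still leaves a valid zip on disk. A rough, hedged illustration of that difference; the archive and member names are stand-ins.

    import zipfile

    args = ("archive.zip", "a", zipfile.ZIP_STORED, True)

    # default mode: one ZipFile stays open and is closed once, on "finalize"
    zfile = zipfile.ZipFile(*args)
    zfile.writestr("file1.ext", b"data")
    zfile.close()

    # "safe" mode: open and close the archive for every single file
    with zipfile.ZipFile(*args) as zfile:
        zfile.writestr("file2.ext", b"data")
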
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index a334b6e..4c0d17b 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -717,6 +717,13 @@ class Formatter():
class PathFormat():
+ EXTENSION_MAP = {
+ "jpeg": "jpg",
+ "jpe" : "jpg",
+ "jfif": "jpg",
+ "jif" : "jpg",
+ "jfi" : "jpg",
+ }
def __init__(self, extractor):
filename_fmt = extractor.config("filename", extractor.filename_fmt)
@@ -725,8 +732,7 @@ class PathFormat():
extension_map = extractor.config("extension-map")
if extension_map is None:
- # TODO: better default value in 1.16.0
- extension_map = {}
+ extension_map = self.EXTENSION_MAP
self.extension_map = extension_map.get
try:
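
Note: with the new EXTENSION_MAP default, common JPEG extension variants are normalized to "jpg" unless the user supplies an "extension-map" of their own. A small illustration of the lookup, mirroring `self.extension_map = extension_map.get`; the sample extensions are illustrative.

    EXTENSION_MAP = {
        "jpeg": "jpg",
        "jpe" : "jpg",
        "jfif": "jpg",
        "jif" : "jpg",
        "jfi" : "jpg",
    }
    remap = EXTENSION_MAP.get   # same bound-method trick as PathFormat uses

    for ext in ("jpeg", "jfif", "png"):
        print(ext, "->", remap(ext, ext))   # unmapped extensions fall through unchanged
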
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 0683276..0b01ad2 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.15.4"
+__version__ = "1.16.0"
diff --git a/setup.py b/setup.py
index d7226ea..a82176e 100644
--- a/setup.py
+++ b/setup.py
@@ -47,8 +47,8 @@ FILES = [
setup(
name="gallery_dl",
version=VERSION,
- description=("Command-line program to download image-galleries and "
- "-collections from several image hosting sites"),
+ description=("Command-line program to download image galleries and "
+ "collections from several image hosting sites"),
long_description=read("README.rst"),
url="https://github.com/mikf/gallery-dl",
download_url="https://github.com/mikf/gallery-dl/releases/latest",
@@ -92,6 +92,7 @@ setup(
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3 :: Only",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Multimedia :: Graphics",
diff --git a/test/test_cookies.py b/test/test_cookies.py
index f691980..d103d02 100644
--- a/test/test_cookies.py
+++ b/test/test_cookies.py
@@ -88,7 +88,7 @@ class TestCookiedict(unittest.TestCase):
self.assertEqual(sorted(cookies.values()), sorted(self.cdict.values()))
def test_domain(self):
- for category in ["exhentai", "nijie", "sankaku", "seiga"]:
+ for category in ["exhentai", "idolcomplex", "nijie", "seiga"]:
extr = _get_extractor(category)
cookies = extr.session.cookies
for key in self.cdict:
@@ -104,10 +104,10 @@ class TestCookieLogin(unittest.TestCase):
def test_cookie_login(self):
extr_cookies = {
- "exhentai": ("ipb_member_id", "ipb_pass_hash"),
- "nijie" : ("nemail", "nlogin"),
- "sankaku" : ("login", "pass_hash"),
- "seiga" : ("user_session",),
+ "exhentai" : ("ipb_member_id", "ipb_pass_hash"),
+ "idolcomplex": ("login", "pass_hash"),
+ "nijie" : ("nemail", "nlogin"),
+ "seiga" : ("user_session",),
}
for category, cookienames in extr_cookies.items():
cookies = {name: "value" for name in cookienames}
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index 524e501..74e8742 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -15,11 +15,12 @@ from unittest.mock import Mock, mock_open, patch
import logging
import zipfile
import tempfile
+import collections
from datetime import datetime, timezone as tz
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gallery_dl import extractor, output, util # noqa E402
-from gallery_dl import postprocessor, util, config # noqa E402
+from gallery_dl import postprocessor, config # noqa E402
from gallery_dl.postprocessor.common import PostProcessor # noqa E402
@@ -34,6 +35,7 @@ class FakeJob():
self.pathfmt = util.PathFormat(self.extractor)
self.out = output.NullOutput()
self.get_logger = logging.getLogger
+ self.hooks = collections.defaultdict(list)
class TestPostprocessorModule(unittest.TestCase):
@@ -78,6 +80,9 @@ class BasePostprocessorTest(unittest.TestCase):
cls.dir.cleanup()
config.clear()
+ def tearDown(self):
+ self.job.hooks.clear()
+
def _create(self, options=None, data=None):
kwdict = {"category": "test", "filename": "file", "extension": "ext"}
if options is None:
@@ -92,6 +97,11 @@ class BasePostprocessorTest(unittest.TestCase):
pp = postprocessor.find(self.__class__.__name__[:-4].lower())
return pp(self.job, options)
+ def _trigger(self, events=None, *args):
+ for event in (events or ("prepare", "file")):
+ for callback in self.job.hooks[event]:
+ callback(self.pathfmt, *args)
+
class ClassifyTest(BasePostprocessorTest):
@@ -111,7 +121,7 @@ class ClassifyTest(BasePostprocessorTest):
self.assertEqual(self.pathfmt.realpath, path + "/file.jpg")
with patch("os.makedirs") as mkdirs:
- pp.run(self.pathfmt)
+ self._trigger()
mkdirs.assert_called_once_with(path, exist_ok=True)
def test_classify_noop(self):
@@ -123,7 +133,7 @@ class ClassifyTest(BasePostprocessorTest):
self.assertEqual(self.pathfmt.realpath, rp)
with patch("os.makedirs") as mkdirs:
- pp.run(self.pathfmt)
+ self._trigger()
self.assertEqual(mkdirs.call_count, 0)
def test_classify_custom(self):
@@ -143,7 +153,7 @@ class ClassifyTest(BasePostprocessorTest):
self.assertEqual(self.pathfmt.realpath, path + "/file.foo")
with patch("os.makedirs") as mkdirs:
- pp.run(self.pathfmt)
+ self._trigger()
mkdirs.assert_called_once_with(path, exist_ok=True)
@@ -175,8 +185,7 @@ class MetadataTest(BasePostprocessorTest):
self.assertEqual(pp.extension, "JSON")
with patch("builtins.open", mock_open()) as m:
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
path = self.pathfmt.realpath + ".JSON"
m.assert_called_once_with(path, "w", encoding="utf-8")
@@ -197,41 +206,37 @@ class MetadataTest(BasePostprocessorTest):
self.assertEqual(pp.extension, "txt")
with patch("builtins.open", mock_open()) as m:
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
path = self.pathfmt.realpath + ".txt"
m.assert_called_once_with(path, "w", encoding="utf-8")
self.assertEqual(self._output(m), "foo\nbar\nbaz\n")
def test_metadata_tags_split_1(self):
- pp = self._create(
+ self._create(
{"mode": "tags"},
{"tags": "foo, bar, baz"},
)
with patch("builtins.open", mock_open()) as m:
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
self.assertEqual(self._output(m), "foo\nbar\nbaz\n")
def test_metadata_tags_split_2(self):
- pp = self._create(
+ self._create(
{"mode": "tags"},
{"tags": "foobar1 foobar2 foobarbaz"},
)
with patch("builtins.open", mock_open()) as m:
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
self.assertEqual(self._output(m), "foobar1\nfoobar2\nfoobarbaz\n")
def test_metadata_tags_tagstring(self):
- pp = self._create(
+ self._create(
{"mode": "tags"},
{"tag_string": "foo, bar, baz"},
)
with patch("builtins.open", mock_open()) as m:
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
self.assertEqual(self._output(m), "foo\nbar\nbaz\n")
def test_metadata_custom(self):
@@ -239,12 +244,12 @@ class MetadataTest(BasePostprocessorTest):
pp = self._create(pp_info, {"foo": "bar"})
self.assertEqual(pp.write, pp._write_custom)
self.assertEqual(pp.extension, "txt")
- self.assertTrue(pp.contentfmt)
+ self.assertTrue(pp._content_fmt)
with patch("builtins.open", mock_open()) as m:
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
self.assertEqual(self._output(m), "bar\nNone\n")
+ self.job.hooks.clear()
test({"mode": "custom", "content-format": "{foo}\n{missing}\n"})
test({"mode": "custom", "content-format": ["{foo}", "{missing}"]})
@@ -256,53 +261,61 @@ class MetadataTest(BasePostprocessorTest):
"extension-format": "json",
})
- self.assertEqual(pp._filename, pp._filename_custom)
+ self.assertEqual(pp._filename, pp._filename_extfmt)
with patch("builtins.open", mock_open()) as m:
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
path = self.pathfmt.realdirectory + "file.json"
m.assert_called_once_with(path, "w", encoding="utf-8")
def test_metadata_extfmt_2(self):
- pp = self._create({
+ self._create({
"extension-format": "{extension!u}-data:{category:Res/ES/}",
})
self.pathfmt.prefix = "2."
with patch("builtins.open", mock_open()) as m:
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
path = self.pathfmt.realdirectory + "file.2.EXT-data:tESt"
m.assert_called_once_with(path, "w", encoding="utf-8")
def test_metadata_directory(self):
- pp = self._create({
+ self._create({
"directory": "metadata",
})
with patch("builtins.open", mock_open()) as m:
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
path = self.pathfmt.realdirectory + "metadata/file.ext.json"
m.assert_called_once_with(path, "w", encoding="utf-8")
def test_metadata_directory_2(self):
- pp = self._create({
+ self._create({
"directory" : "metadata////",
"extension-format": "json",
})
with patch("builtins.open", mock_open()) as m:
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
path = self.pathfmt.realdirectory + "metadata/file.json"
m.assert_called_once_with(path, "w", encoding="utf-8")
+ def test_metadata_filename(self):
+ self._create({
+ "filename" : "{category}_{filename}_meta.data",
+ "extension-format": "json",
+ })
+
+ with patch("builtins.open", mock_open()) as m:
+ self._trigger()
+
+ path = self.pathfmt.realdirectory + "test_file_meta.data"
+ m.assert_called_once_with(path, "w", encoding="utf-8")
+
@staticmethod
def _output(mock):
return "".join(
@@ -319,21 +332,18 @@ class MtimeTest(BasePostprocessorTest):
self.assertEqual(pp.key, "date")
def test_mtime_datetime(self):
- pp = self._create(None, {"date": datetime(1980, 1, 1, tzinfo=tz.utc)})
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._create(None, {"date": datetime(1980, 1, 1, tzinfo=tz.utc)})
+ self._trigger()
self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
def test_mtime_timestamp(self):
- pp = self._create(None, {"date": 315532800})
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._create(None, {"date": 315532800})
+ self._trigger()
self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
def test_mtime_custom(self):
- pp = self._create({"key": "foo"}, {"foo": 315532800})
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._create({"key": "foo"}, {"foo": 315532800})
+ self._trigger()
self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800)
@@ -341,8 +351,8 @@ class ZipTest(BasePostprocessorTest):
def test_zip_default(self):
pp = self._create()
+ self.assertEqual(self.job.hooks["file"][0], pp.write)
self.assertEqual(pp.path, self.pathfmt.realdirectory)
- self.assertEqual(pp.run, pp._write)
self.assertEqual(pp.delete, True)
self.assertEqual(pp.args, (
pp.path[:-1] + ".zip", "a", zipfile.ZIP_STORED, True,
@@ -351,8 +361,8 @@ class ZipTest(BasePostprocessorTest):
def test_zip_safe(self):
pp = self._create({"mode": "safe"})
+ self.assertEqual(self.job.hooks["file"][0], pp.write_safe)
self.assertEqual(pp.path, self.pathfmt.realdirectory)
- self.assertEqual(pp.run, pp._write_safe)
self.assertEqual(pp.delete, True)
self.assertEqual(pp.args, (
pp.path[:-1] + ".zip", "a", zipfile.ZIP_STORED, True,
@@ -383,8 +393,7 @@ class ZipTest(BasePostprocessorTest):
self.pathfmt.temppath = file.name
self.pathfmt.filename = name
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
nti = pp.zfile.NameToInfo
self.assertEqual(len(nti), i+1)
@@ -397,12 +406,11 @@ class ZipTest(BasePostprocessorTest):
self.assertIn("file2.ext", nti)
# write the last file a second time (will be skipped)
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
self.assertEqual(len(pp.zfile.NameToInfo), 3)
# close file
- pp.run_final(self.pathfmt, 0)
+ self._trigger(("finalize",), 0)
# reopen to check persistence
with zipfile.ZipFile(pp.zfile.filename) as file:
@@ -428,14 +436,13 @@ class ZipTest(BasePostprocessorTest):
for i in range(3):
self.pathfmt.temppath = self.pathfmt.realdirectory + "file.ext"
self.pathfmt.filename = "file{}.ext".format(i)
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ self._trigger()
- # write the last file a second time (will be skipped)
- pp.prepare(self.pathfmt)
- pp.run(self.pathfmt)
+ # write the last file a second time (should be skipped)
+ self._trigger()
- pp.run_final(self.pathfmt, 0)
+ # close file
+ self._trigger(("finalize",), 0)
self.assertEqual(pp.zfile.write.call_count, 3)
for call in pp.zfile.write.call_args_list:
diff --git a/test/test_results.py b/test/test_results.py
index 759a4b4..4e9f4b2 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -30,7 +30,7 @@ TRAVIS_SKIP = {
# temporary issues, etc.
BROKEN = {
- "4plebs",
+ "dokireader",
"imagevenue",
"photobucket",
}
@@ -208,6 +208,9 @@ class ResultJob(job.DownloadJob):
self._update_kwdict(kwdict, False)
self.format_directory(kwdict)
+ def handle_metadata(self, kwdict):
+ pass
+
def handle_queue(self, url, kwdict):
self.queue = True
self._update_url(url)
@@ -367,7 +370,7 @@ def generate_tests():
# filter available extractor classes
extractors = [
extr for extr in extractor.extractors()
- if fltr(extr.category, getattr(extr, "basecategory", None))
+ if fltr(extr.category, extr.basecategory)
]
# add 'test_...' methods