[zerochan] use API by default (#3669)

add 'pagination' option
mikf · Feb 24, 2024 · cc6b9e4 · cc6b9e4
1 parent efccd3d
commit cc6b9e4
Show file tree

Hide file tree

Showing 3 changed files with 117 additions and 8 deletions.
diff --git a/docs/configuration.rst b/docs/configuration.rst
@@ -4131,6 +4131,21 @@ Description
     Note: This requires 1-2 additional HTTP requests per post.
 
 
+extractor.zerochan.pagination
+-----------------------------
+Type
+    ``string``
+Default
+    ``"api"``
+Description
+    Controls how to paginate over tag search results.
+
+    * ``"api"``: Use the `JSON API <https://www.zerochan.net/api>`__
+      (no ``extension`` metadata)
+    * ``"html"``: Parse HTML pages
+      (limited to 100 pages * 24 posts)
+
+
 extractor.[booru].tags
 ----------------------
 Type

diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
@@ -10,7 +10,7 @@
 
 from .booru import BooruExtractor
 from ..cache import cache
-from .. import text, exception
+from .. import text, util, exception
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
 
@@ -21,8 +21,11 @@ class ZerochanExtractor(BooruExtractor):
     root = "https://www.zerochan.net"
     filename_fmt = "{id}.{extension}"
     archive_fmt = "{id}"
+    page_start = 1
+    per_page = 250
     cookies_domain = ".zerochan.net"
     cookies_names = ("z_id", "z_hash")
+    request_interval = (0.5, 1.5)
 
     def login(self):
         self._logged_in = True
@@ -86,7 +89,7 @@ def _parse_entry_html(self, entry_id):
 
         return data
 
-    def _parse_entry_json(self, entry_id):
+    def _parse_entry_api(self, entry_id):
         url = "{}/{}?json".format(self.root, entry_id)
         item = self.request(url).json()
 
@@ -117,14 +120,22 @@ def __init__(self, match):
         ZerochanExtractor.__init__(self, match)
         self.search_tag, self.query = match.groups()
 
+    def _init(self):
+        """Bind 'self.posts' according to the 'pagination' option."""
+        use_html = (self.config("pagination") == "html")
+
+        if not use_html:
+            # Every value other than "html" selects the JSON API.
+            self.posts = self.posts_api
+            self.session.headers["User-Agent"] = util.USERAGENT
+            return
+
+        # HTML mode: switch the generator and set 'per_page' to 24.
+        self.posts = self.posts_html
+        self.per_page = 24
+
     def metadata(self):
         return {"search_tags": text.unquote(
             self.search_tag.replace("+", " "))}
 
-    def posts(self):
+    def posts_html(self):
         url = self.root + "/" + self.search_tag
         params = text.parse_query(self.query)
-        params["p"] = text.parse_int(params.get("p"), 1)
+        params["p"] = text.parse_int(params.get("p"), self.page_start)
         metadata = self.config("metadata")
 
         while True:
@@ -140,7 +151,7 @@ def posts(self):
                 if metadata:
                     entry_id = extr('href="/', '"')
                     post = self._parse_entry_html(entry_id)
-                    post.update(self._parse_entry_json(entry_id))
+                    post.update(self._parse_entry_api(entry_id))
                     yield post
                 else:
                     yield {
@@ -157,6 +168,41 @@ def posts(self):
                 break
             params["p"] += 1
 
+    def posts_api(self):
+        """Yield post dicts for the searched tag using the JSON API.
+
+        Requests '<root>/<search_tag>' with 'json', 'l' (page size) and
+        'p' (page number) parameters and keeps requesting the following
+        page for as long as the response has a truthy 'next' entry.
+
+        When the 'metadata' option is enabled, each post is additionally
+        updated with the results of _parse_entry_html() and
+        _parse_entry_api(). Otherwise 'file_url' is built from the post ID
+        with a '.jpg' suffix and the '.png' variant is stored as fallback.
+
+        Iteration ends silently when a response is not valid JSON or does
+        not contain an 'items' entry.
+        """
+        url = self.root + "/" + self.search_tag
+        metadata = self.config("metadata")
+        params = {
+            "json": "1",
+            "l"   : self.per_page,
+            "p"   : self.page_start,
+        }
+
+        static = "https://static.zerochan.net/.full."
+
+        while True:
+            response = self.request(url, params=params)
+            try:
+                # ValueError: response body is not valid JSON
+                # KeyError:   JSON object without an 'items' entry
+                # TypeError:  JSON value that is not an object
+                data = response.json()
+                posts = data["items"]
+            except (ValueError, KeyError, TypeError):
+                return
+
+            if metadata:
+                for post in posts:
+                    post_id = post["id"]
+                    post.update(self._parse_entry_html(post_id))
+                    post.update(self._parse_entry_api(post_id))
+            else:
+                for post in posts:
+                    # No extra requests here: use '.jpg' as the primary
+                    # URL and keep '.png' as the alternative in '_fallback'.
+                    base = static + str(post["id"])
+                    post["file_url"] = base + ".jpg"
+                    post["_fallback"] = (base + ".png",)
+
+            yield from posts
+
+            if not data.get("next"):
+                return
+            params["p"] += 1
+
 
 class ZerochanImageExtractor(ZerochanExtractor):
     subcategory = "image"
@@ -170,5 +216,5 @@ def __init__(self, match):
     def posts(self):
+        # The post's HTML page is always parsed; the additional '?json'
+        # request is only made when the 'metadata' option is enabled.
         post = self._parse_entry_html(self.image_id)
         if self.config("metadata"):
-            post.update(self._parse_entry_json(self.image_id))
+            post.update(self._parse_entry_api(self.image_id))
         return (post,)
diff --git a/test/results/zerochan.py b/test/results/zerochan.py
@@ -12,8 +12,27 @@
     "#url"     : "https://www.zerochan.net/Perth+%28Kantai+Collection%29",
     "#category": ("booru", "zerochan", "tag"),
     "#class"   : zerochan.ZerochanTagExtractor,
+    "#pattern" : r"https://static\.zerochan\.net/\.full\.\d+\.jpg",
+    "#count"   : "> 50",
+
+    "extension"  : r"jpg",
+    "file_url"   : r"re:https://static\.zerochan\.net/\.full\.\d+\.jpg",
+    "filename"   : r"re:\.full\.\d+",
+    "height"     : int,
+    "id"         : int,
+    "search_tags": "Perth (Kantai Collection)",
+    "tag"        : r"re:(Perth \(Kantai Collection\)|Kantai Collection)",
+    "tags"       : list,
+    "width"      : int,
+},
+
+{
+    "#url"     : "https://www.zerochan.net/Perth+%28Kantai+Collection%29",
+    "#category": ("booru", "zerochan", "tag"),
+    "#class"   : zerochan.ZerochanTagExtractor,
+    "#options" : {"pagination": "html"},
     "#pattern" : r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)",
-    "#count"   : "> 24",
+    "#count"   : "> 45",
 
     "extension"  : r"re:jpg|png",
     "file_url"   : r"re:https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)",
@@ -58,8 +77,37 @@
         "Theme:Personification",
         "Theme:Pins",
         "Theme:Ribbon",
-        "Theme:Shirt",
         "Theme:Short Hair",
+        "Theme:Top",
+    ],
+    "uploader": "YukinoTokisaki",
+    "width"   : 1920,
+},
+
+{
+    "#url"     : "https://www.zerochan.net/2920445",
+    "#category": ("booru", "zerochan", "image"),
+    "#class"   : zerochan.ZerochanImageExtractor,
+    "#pattern" : r"https://static\.zerochan\.net/Perth\.%28Kantai\.Collection%29\.full.2920445\.jpg",
+    "#auth"    : False,
+
+    "author"  : "YeFan 葉凡",
+    "date"    : "dt:2020-04-24 21:33:44",
+    "file_url": "https://static.zerochan.net/Perth.%28Kantai.Collection%29.full.2920445.jpg",
+    "filename": "Perth.(Kantai.Collection).full.2920445",
+    "height"  : 1366,
+    "id"      : 2920445,
+    "path"    : [
+        "Kantai Collection",
+        "Perth (Kantai Collection)",
+    ],
+    "size"    : 1975296,
+    "tags"    : [
+        "Mangaka:YeFan 葉凡",
+        "Game:Kantai Collection",
+        "Character:Perth (Kantai Collection)",
+        "Theme:Firefighter Outfit",
+        "Theme:Pins",
     ],
     "uploader": "YukinoTokisaki",
     "width"   : 1920,