From 1418cccb200a016b6e6f7b5fc74023bb7ef7af78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 16 Jan 2024 00:24:30 +0100 Subject: [PATCH] [kemonoparty] add 'revision_hash' metadata (#4706, #4727, #5013) A SHA1 hexdigest of other relevant metadata fields like title, content, file and attachment URLs. This value does NOT reflect which revisions are listed on the website. Neither does 'edited' or any other metadata field (combinations). --- gallery_dl/extractor/kemonoparty.py | 26 ++++++++++++++++++++++---- test/results/kemonoparty.py | 2 ++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index c24e57d16d8..10228b5c78e 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -9,9 +9,10 @@ """Extractors for https://kemono.party/""" from .common import Extractor, Message -from .. import text, exception +from .. import text, util, exception from ..cache import cache, memcache import itertools +import json import re BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)" @@ -37,10 +38,14 @@ def __init__(self, match): Extractor.__init__(self, match) def _init(self): + self.revisions = self.config("revisions") self._prepare_ddosguard_cookies() self._find_inline = re.compile( r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall + self._json_dumps = json.JSONEncoder( + ensure_ascii=False, check_circular=False, + sort_keys=True, separators=(",", ":")).encode def items(self): find_hash = re.compile(HASH_PATTERN).match @@ -223,11 +228,23 @@ def _post_revisions(self, url): idx = len(revs) for rev in revs: + rev["revision_hash"] = self._revision_hash(rev) rev["revision_index"] = idx idx -= 1 return revs + def _revision_hash(self, revision): + rev = revision.copy() + rev.pop("revision_id", None) + rev.pop("added", None) + rev.pop("next", None) + rev.pop("prev", None) + rev["file"].pop("name", None) + for a in rev["attachments"]: + a.pop("name", None) + return util.sha1(self._json_dumps(rev)) + def _validate(response): return (response.headers["content-length"] != "9" or @@ -252,13 +269,13 @@ def posts(self): url = self.api_url params = text.parse_query(self.query) params["o"] = text.parse_int(params.get("o")) - revisions = self.config("revisions") while True: posts = self.request(url, params=params).json() - if revisions: + if self.revisions: for post in posts: + post["revision_hash"] = self._revision_hash(post) post["revision_id"] = 0 post_url = "{}/post/{}".format(self.api_url, post["id"]) try: @@ -296,7 +313,8 @@ def __init__(self, match): def posts(self): if not self.revision: post = self.request(self.api_url).json() - if self.config("revisions"): + if self.revisions: + post["revision_hash"] = self._revision_hash(post) post["revision_id"] = 0 try: revs = self._post_revisions(self.api_url) diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py index 5bd541a3ae1..c3dbdf7325c 100644 --- a/test/results/kemonoparty.py +++ b/test/results/kemonoparty.py @@ -177,6 +177,7 @@ "revision_id": 142470, "revision_index": 2, + "revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40", }, { @@ -190,6 +191,7 @@ "revision_id": range(134996, 3052965), "revision_index": range(1, 9), + "revision_hash": r"re:^[0-9a-f]{40}$", },