From 61f3b2f820f4687837e10fa9b067782807d49a4c Mon Sep 17 00:00:00 2001 From: blankie Date: Tue, 9 Jan 2024 01:29:47 +1100 Subject: [PATCH 1/5] [hatenablog] add support --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/hatenablog.py | 167 +++++++++++++++++++++++++++++ scripts/supportedsites.py | 7 +- test/results/hatenablog.py | 144 +++++++++++++++++++++++++ 5 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 gallery_dl/extractor/hatenablog.py create mode 100644 test/results/hatenablog.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d046aad4b5..188d829498 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -259,6 +259,12 @@ Consider all listed sites to potentially be NSFW. Folders + + HatenaBlog + https://hatenablog.com + Archive, Individual Posts, Home Feed, Search Results + + HBrowse https://www.hbrowse.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9e33f2c3c2..26ce209373 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -53,6 +53,7 @@ "gelbooru_v01", "gelbooru_v02", "gofile", + "hatenablog", "hbrowse", "hentai2read", "hentaicosplays", diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py new file mode 100644 index 0000000000..59e2f94e27 --- /dev/null +++ b/gallery_dl/extractor/hatenablog.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hatenablog.com""" + +import re +from .common import Extractor, Message +from .. import text + + +BASE_PATTERN = ( + r"(?:hatenablog:https?://([^/]+)|(?:https?://)?" + r"([\w-]+\.(?:hatenablog\.com|hatenablog\.jp" + r"|hatenadiary\.com|hateblo\.jp)))" +) +QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" + + +class HatenaBlogExtractor(Extractor): + """Base class for HatenaBlog extractors""" + category = "hatenablog" + directory_fmt = ("{category}", "{domain}") + filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}" + archive_fmt = "{filename}" + + def __init__(self, match): + Extractor.__init__(self, match) + + self.domain = match.group(1) or match.group(2) + self._find_img = re.compile(r'').finditer + self._is_image = re.compile( + r'(?: |^)class="hatena-fotolife"(?: |$)').search + self._find_img_src = re.compile(r'(?: |^)src="(.+?)"(?: |$)').search + + def _handle_article(self, article: str): + extr = text.extract_from(article) + date = text.parse_datetime(extr('