From cc15fbe71aa2c34f4075a687daef641490a765a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 1 Dec 2020 15:31:28 +0100 Subject: [PATCH] [moebooru] add generalized extractors for moebooru sites - add support for sakugabooru.com (closes #1136) - add support for lolibooru.moe (closes #1050) This allows users to dynamically add support for moebooru/myimouto based sites by adding an entry to their config file (like for foolslide, foolfuuka, etc) For example: { "extractor": { "moebooru": { "new-site-1": {"root": "https://site1.net"}, "new-site-2": {"root": "https://www.site2.moe"} } } } --- docs/supportedsites.rst | 4 +- gallery_dl/extractor/3dbooru.py | 39 +++-- gallery_dl/extractor/__init__.py | 4 +- gallery_dl/extractor/hypnohub.py | 68 -------- gallery_dl/extractor/konachan.py | 85 ---------- gallery_dl/extractor/moebooru.py | 281 +++++++++++++++++++++++++++++++ gallery_dl/extractor/yandere.py | 68 -------- 7 files changed, 308 insertions(+), 241 deletions(-) delete mode 100644 gallery_dl/extractor/hypnohub.py delete mode 100644 gallery_dl/extractor/konachan.py create mode 100644 gallery_dl/extractor/moebooru.py delete mode 100644 gallery_dl/extractor/yandere.py diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index 8780e28939..97254e7259 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -69,13 +69,14 @@ Komikcast https://komikcast.com/ Chapters, Manga Konachan https://konachan.com/ Pools, Popular Images, Posts, Tag Searches LINE BLOG https://www.lineblog.me/ Blogs, Posts livedoor Blog http://blog.livedoor.jp/ Blogs, Posts +Lolibooru https://lolibooru.moe/ Pools, Popular Images, Posts, Tag Searches Luscious https://members.luscious.net/ Albums, Search Results Manga Fox https://fanfox.net/ Chapters Manga Here https://www.mangahere.cc/ Chapters, Manga Manga Stream https://readms.net/ Chapters MangaDex https://mangadex.org/ Chapters, Manga MangaKakalot https://mangakakalot.com/ Chapters, Manga -Mangapanda https://www.mangapanda.com/ Chapters, Manga +Mangapanda http://www.mangapanda.com/ Chapters, Manga MangaPark https://mangapark.net/ Chapters, Manga Mangareader https://www.mangareader.net/ Chapters, Manga Mangoxo https://www.mangoxo.com/ Albums, Channels Supported @@ -110,6 +111,7 @@ RedGIFs https://redgifs.com/ individual Images, Sear rule #34 https://rule34.paheal.net/ Posts, Tag Searches Rule 34 https://rule34.xxx/ Pools, Posts, Tag Searches Safebooru https://safebooru.org/ Pools, Posts, Tag Searches +Sakugabooru https://www.sakugabooru.com/ Pools, Popular Images, Posts, Tag Searches Sankaku Channel https://chan.sankakucomplex.com/ Pools, Posts, Tag Searches Supported Sankaku Complex https://www.sankakucomplex.com/ Articles, Tag Searches Sen Manga https://raw.senmanga.com/ Chapters diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py index 3773ee583d..e0066cb9d6 100644 --- a/gallery_dl/extractor/3dbooru.py +++ b/gallery_dl/extractor/3dbooru.py @@ -1,22 +1,21 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from http://behoimi.org/""" +"""Extractors for http://behoimi.org/""" -from . import booru +from . import moebooru -class _3dbooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): +class _3dbooruBase(): """Base class for 3dbooru extractors""" category = "3dbooru" - api_url = "http://behoimi.org/post/index.json" - post_url = "http://behoimi.org/post/show/{}" - page_limit = 1000 + basecategory = "booru" + root = "http://behoimi.org" def __init__(self, match): super().__init__(match) @@ -26,7 +25,7 @@ def __init__(self, match): }) -class _3dbooruTagExtractor(booru.TagMixin, _3dbooruExtractor): +class _3dbooruTagExtractor(_3dbooruBase, moebooru.MoebooruTagExtractor): """Extractor for images from behoimi.org based on search-tags""" pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org/post" r"(?:/(?:index)?)?\?tags=(?P[^&#]+)") @@ -35,8 +34,12 @@ class _3dbooruTagExtractor(booru.TagMixin, _3dbooruExtractor): "content": "11cbda40c287e026c1ce4ca430810f761f2d0b2a", }) + def posts(self): + params = {"tags": self.tags} + return self._pagination(self.root + "/post/index.json", params) -class _3dbooruPoolExtractor(booru.PoolMixin, _3dbooruExtractor): + +class _3dbooruPoolExtractor(_3dbooruBase, moebooru.MoebooruPoolExtractor): """Extractor for image-pools from behoimi.org""" pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/pool/show/(?P\d+)" test = ("http://behoimi.org/pool/show/27", { @@ -44,8 +47,12 @@ class _3dbooruPoolExtractor(booru.PoolMixin, _3dbooruExtractor): "content": "fd5b37c5c6c2de4b4d6f1facffdefa1e28176554", }) + def posts(self): + params = {"tags": "pool:" + self.pool_id} + return self._pagination(self.root + "/post/index.json", params) + -class _3dbooruPostExtractor(booru.PostMixin, _3dbooruExtractor): +class _3dbooruPostExtractor(_3dbooruBase, moebooru.MoebooruPostExtractor): """Extractor for single images from behoimi.org""" pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/post/show/(?P\d+)" test = ("http://behoimi.org/post/show/140852", { @@ -60,8 +67,13 @@ class _3dbooruPostExtractor(booru.PostMixin, _3dbooruExtractor): }, }) + def posts(self): + params = {"tags": "id:" + self.post_id} + return self._pagination(self.root + "/post/index.json", params) + -class _3dbooruPopularExtractor(booru.MoebooruPopularMixin, _3dbooruExtractor): +class _3dbooruPopularExtractor( + _3dbooruBase, moebooru.MoebooruPopularExtractor): """Extractor for popular images from behoimi.org""" pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org" r"/post/popular_(?Pby_(?:day|week|month)|recent)" @@ -70,8 +82,3 @@ class _3dbooruPopularExtractor(booru.MoebooruPopularMixin, _3dbooruExtractor): "pattern": r"http://behoimi\.org/data/../../[0-9a-f]{32}\.jpg", "count": 20, }) - - def __init__(self, match): - super().__init__(match) - self.api_url = "http://behoimi.org/post/popular_{scale}.json".format( - scale=self.scale) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index d0c327addd..3577c3e669 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -44,7 +44,6 @@ "hentainexus", "hiperdex", "hitomi", - "hypnohub", "idolcomplex", "imagebam", "imagechest", @@ -60,7 +59,6 @@ "keenspot", "khinsider", "komikcast", - "konachan", "lineblog", "livedoor", "luscious", @@ -123,8 +121,8 @@ "wikiart", "xhamster", "xvideos", - "yandere", "yuki", + "moebooru", "foolfuuka", "foolslide", "mastodon", diff --git a/gallery_dl/extractor/hypnohub.py b/gallery_dl/extractor/hypnohub.py deleted file mode 100644 index 17f9a88c39..0000000000 --- a/gallery_dl/extractor/hypnohub.py +++ /dev/null @@ -1,68 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2019 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://hypnohub.net/""" - -from . import booru - - -class HypnohubExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): - """Base class for hypnohub extractors""" - category = "hypnohub" - api_url = "https://hypnohub.net/post.json" - post_url = "https://hypnohub.net/post/show/{}" - - -class HypnohubTagExtractor(booru.TagMixin, HypnohubExtractor): - """Extractor for images from hypnohub.net based on search-tags""" - pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net" - r"/post\?(?:[^&#]*&)*tags=(?P[^&#]+)") - test = ("https://hypnohub.net/post?tags=gonoike_biwa", { - "url": "2848abe3e433ad39bfdf5be5874682faaccea5be", - }) - - -class HypnohubPoolExtractor(booru.PoolMixin, HypnohubExtractor): - """Extractor for image-pools from hypnohub.net""" - pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/pool/show/(?P\d+)" - test = ("https://hypnohub.net/pool/show/61", { - "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf", - }) - - -class HypnohubPostExtractor(booru.PostMixin, HypnohubExtractor): - """Extractor for single images from hypnohub.net""" - pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/post/show/(?P\d+)" - test = ("https://hypnohub.net/post/show/73964", { - "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee", - "options": (("tags", True),), - "keyword": { - "tags_artist": "gonoike_biwa icontrol_(manipper)", - "tags_character": "komaru_naegi", - "tags_copyright": "dangan_ronpa dangan_ronpa_another_episode", - "tags_general": str, - }, - }) - - -class HypnohubPopularExtractor(booru.MoebooruPopularMixin, HypnohubExtractor): - """Extractor for popular images from hypnohub.net""" - pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net" - r"/post/popular_(?Pby_(?:day|week|month)|recent)" - r"(?:\?(?P[^#]*))?") - test = ( - ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", { - "count": 20, - }), - ("https://hypnohub.net/post/popular_recent"), - ) - - def __init__(self, match): - super().__init__(match) - self.api_url = "https://hypnohub.net/post/popular_{scale}.json".format( - scale=self.scale) diff --git a/gallery_dl/extractor/konachan.py b/gallery_dl/extractor/konachan.py deleted file mode 100644 index a9d8b3a9de..0000000000 --- a/gallery_dl/extractor/konachan.py +++ /dev/null @@ -1,85 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2015-2019 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extract images from https://konachan.com/""" - -from . import booru - - -class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): - """Base class for konachan extractors""" - category = "konachan" - - def __init__(self, match): - root = "https://konachan." + match.group("tld") - self.api_url = root + "/post.json" - self.post_url = root + "/post/show/{}" - super().__init__(match) - - -class KonachanTagExtractor(booru.TagMixin, KonachanExtractor): - """Extractor for images from konachan.com based on search-tags""" - pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?Pcom|net)" - r"/post\?(?:[^&#]*&)*tags=(?P[^&#]+)") - test = ( - ("https://konachan.com/post?tags=patata", { - "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d", - }), - ("https://konachan.net/post?tags=patata"), - ) - - -class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor): - """Extractor for image-pools from konachan.com""" - pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?Pcom|net)" - r"/pool/show/(?P\d+)") - test = ( - ("https://konachan.com/pool/show/95", { - "content": "cf0546e38a93c2c510a478f8744e60687b7a8426", - }), - ("https://konachan.net/pool/show/95"), - ) - - -class KonachanPostExtractor(booru.PostMixin, KonachanExtractor): - """Extractor for single images from konachan.com""" - pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?Pcom|net)" - r"/post/show/(?P\d+)") - test = ( - ("https://konachan.com/post/show/205189", { - "content": "674e75a753df82f5ad80803f575818b8e46e4b65", - "options": (("tags", True),), - "keyword": { - "tags_artist": "patata", - "tags_character": "clownpiece", - "tags_copyright": "touhou", - "tags_general": str, - }, - }), - ("https://konachan.net/post/show/205189"), - ) - - -class KonachanPopularExtractor(booru.MoebooruPopularMixin, KonachanExtractor): - """Extractor for popular images from konachan.com""" - pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?Pcom|net)" - r"/post/popular_(?Pby_(?:day|week|month)|recent)" - r"(?:\?(?P[^#]*))?") - test = ( - ("https://konachan.com/post/popular_by_month?month=11&year=2010", { - "count": 20, - }), - ("https://konachan.com/post/popular_recent"), - ("https://konachan.net/post/popular_recent"), - ) - - def __init__(self, match): - super().__init__(match) - self.api_url = ( - "https://konachan.{tld}/post/popular_{scale}.json".format( - tld=match.group("tld"), scale=self.scale)) diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py new file mode 100644 index 0000000000..a87753158d --- /dev/null +++ b/gallery_dl/extractor/moebooru.py @@ -0,0 +1,281 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Moebooru based sites""" + +from .common import Extractor, Message, generate_extractors +from .. import text + +import re +import datetime +import collections + + +class MoebooruExtractor(Extractor): + """Base class for Moebooru extractors""" + basecategory = "moebooru" + filename_fmt = "{category}_{id}_{md5}.{extension}" + page_start = 1 + per_page = 50 + + def items(self): + extended_tags = self.config("tags", False) + data = self.metadata() + for post in self.posts(): + try: + url = self._prepare_post(post, extended_tags) + except KeyError: + continue + post.update(data) + text.nameext_from_url(url, post) + yield Message.Directory, post + yield Message.Url, url, post + + def metadata(self): + return () + + def posts(self): + return () + + def skip(self, num): + pages = num // self.per_page + self.page_start += pages + return pages * self.per_page + + def _prepare_post(self, post, extended_tags=False): + url = post["file_url"] + if url[0] == "/": + url = self.root + url + if extended_tags: + self._fetch_extended_tags(post) + post["date"] = text.parse_timestamp(post["created_at"]) + return url + + def _fetch_extended_tags(self, post): + url = "{}/post/show/{}".format(self.root, post["id"]) + page = self.request(url).text + html = text.extract(page, '
    [^&#]+)") - test = ("https://yande.re/post?tags=ouzoku+armor", { - "content": "59201811c728096b2d95ce6896fd0009235fe683", - }) - - -class YanderePoolExtractor(booru.PoolMixin, YandereExtractor): - """Extractor for image-pools from yande.re""" - pattern = r"(?:https?://)?(?:www\.)?yande\.re/pool/show/(?P\d+)" - test = ("https://yande.re/pool/show/318", { - "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68", - }) - - -class YanderePostExtractor(booru.PostMixin, YandereExtractor): - """Extractor for single images from yande.re""" - pattern = r"(?:https?://)?(?:www\.)?yande\.re/post/show/(?P\d+)" - test = ("https://yande.re/post/show/51824", { - "content": "59201811c728096b2d95ce6896fd0009235fe683", - "options": (("tags", True),), - "keyword": { - "tags_artist": "sasaki_tamaru", - "tags_circle": "softhouse_chara", - "tags_copyright": "ouzoku", - "tags_general": str, - }, - }) - - -class YanderePopularExtractor(booru.MoebooruPopularMixin, YandereExtractor): - """Extractor for popular images from yande.re""" - pattern = (r"(?:https?://)?(?:www\.)?yande\.re" - r"/post/popular_(?Pby_(?:day|week|month)|recent)" - r"(?:\?(?P[^#]*))?") - test = ( - ("https://yande.re/post/popular_by_month?month=6&year=2014", { - "count": 40, - }), - ("https://yande.re/post/popular_recent"), - ) - - def __init__(self, match): - super().__init__(match) - self.api_url = "https://yande.re/post/popular_{scale}.json".format( - scale=self.scale)