Skip to content

Commit

Permalink
decouple extractor initialization
Browse files Browse the repository at this point in the history
Introduce an 'initialize()' function that does the actual init
(session, cookies, config options) and can be called separately from
the constructor __init__().

This makes it possible, for example, to adjust config access inside a Job
before most of that access has already happened in 'extractor.find()'.
  • Loading branch information
mikf committed Jul 25, 2023
1 parent f0203b7 commit a383eca
Show file tree
Hide file tree
Showing 71 changed files with 314 additions and 193 deletions.
12 changes: 5 additions & 7 deletions gallery_dl/extractor/3dbooru.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2015-2020 Mike Fährmann
# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
Expand All @@ -17,12 +17,10 @@ class _3dbooruBase():
basecategory = "booru"
root = "http://behoimi.org"

def __init__(self, match):
super().__init__(match)
self.session.headers.update({
"Referer": "http://behoimi.org/post/show/",
"Accept-Encoding": "identity",
})
def _init(self):
headers = self.session.headers
headers["Referer"] = "http://behoimi.org/post/show/"
headers["Accept-Encoding"] = "identity"


class _3dbooruTagExtractor(_3dbooruBase, moebooru.MoebooruTagExtractor):
Expand Down
3 changes: 1 addition & 2 deletions gallery_dl/extractor/500px.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ class _500pxExtractor(Extractor):
root = "https://500px.com"
cookies_domain = ".500px.com"

def __init__(self, match):
Extractor.__init__(self, match)
def _init(self):
self.session.headers["Referer"] = self.root + "/"

def items(self):
Expand Down
4 changes: 3 additions & 1 deletion gallery_dl/extractor/8chan.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2022 Mike Fährmann
# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
Expand Down Expand Up @@ -150,6 +150,8 @@ class _8chanBoardExtractor(_8chanExtractor):
def __init__(self, match):
_8chanExtractor.__init__(self, match)
_, self.board, self.page = match.groups()

def _init(self):
self.session.headers["Referer"] = self.root + "/"

def items(self):
Expand Down
6 changes: 3 additions & 3 deletions gallery_dl/extractor/artstation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2018-2022 Mike Fährmann
# Copyright 2018-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
Expand All @@ -27,12 +27,12 @@ class ArtstationExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.user = match.group(1) or match.group(2)
self.external = self.config("external", False)

def items(self):
data = self.metadata()

projects = self.projects()
external = self.config("external", False)
max_posts = self.config("max-posts")
if max_posts:
projects = itertools.islice(projects, max_posts)
Expand All @@ -45,7 +45,7 @@ def items(self):
asset["num"] = num
yield Message.Directory, asset

if adict["has_embedded_player"] and self.external:
if adict["has_embedded_player"] and external:
player = adict["player_embedded"]
url = (text.extr(player, 'src="', '"') or
text.extr(player, "src='", "'"))
Expand Down
8 changes: 6 additions & 2 deletions gallery_dl/extractor/aryion.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,11 @@ class AryionGalleryExtractor(AryionExtractor):

def __init__(self, match):
AryionExtractor.__init__(self, match)
self.recursive = self.config("recursive", True)
self.offset = 0

def _init(self):
self.recursive = self.config("recursive", True)

def skip(self, num):
if self.recursive:
return 0
Expand All @@ -217,9 +219,11 @@ class AryionTagExtractor(AryionExtractor):
"count": ">= 5",
})

def metadata(self):
def _init(self):
self.params = text.parse_query(self.user)
self.user = None

def metadata(self):
return {"search_tags": self.params.get("tag")}

def posts(self):
Expand Down
5 changes: 3 additions & 2 deletions gallery_dl/extractor/blogger.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ class BloggerExtractor(Extractor):

def __init__(self, match):
Extractor.__init__(self, match)
self.videos = self.config("videos", True)
self.blog = match.group(1) or match.group(2)

def _init(self):
self.api = BloggerAPI(self)
self.videos = self.config("videos", True)

def items(self):

blog = self.api.blog_by_url("http://" + self.blog)
blog["pages"] = blog["pages"]["totalItems"]
blog["posts"] = blog["posts"]["totalItems"]
Expand Down
56 changes: 37 additions & 19 deletions gallery_dl/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,25 +52,6 @@ def __init__(self, match):
self._cfgpath = ("extractor", self.category, self.subcategory)
self._parentdir = ""

self._write_pages = self.config("write-pages", False)
self._retry_codes = self.config("retry-codes")
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
self._proxies = util.build_proxy_map(self.config("proxy"), self.log)
self._interval = util.build_duration_func(
self.config("sleep-request", self.request_interval),
self.request_interval_min,
)

if self._retries < 0:
self._retries = float("inf")
if not self._retry_codes:
self._retry_codes = ()

self._init_session()
self._init_cookies()

@classmethod
def from_url(cls, url):
if isinstance(cls.pattern, str):
Expand All @@ -79,8 +60,16 @@ def from_url(cls, url):
return cls(match) if match else None

def __iter__(self):
self.initialize()
return self.items()

def initialize(self):
self._init_options()
self._init_session()
self._init_cookies()
self._init()
self.initialize = util.noop

def items(self):
yield Message.Version, 1

Expand Down Expand Up @@ -245,6 +234,26 @@ def _get_auth_info(self):

return username, password

def _init(self):
pass

def _init_options(self):
self._write_pages = self.config("write-pages", False)
self._retry_codes = self.config("retry-codes")
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
self._proxies = util.build_proxy_map(self.config("proxy"), self.log)
self._interval = util.build_duration_func(
self.config("sleep-request", self.request_interval),
self.request_interval_min,
)

if self._retries < 0:
self._retries = float("inf")
if not self._retry_codes:
self._retry_codes = ()

def _init_session(self):
self.session = session = requests.Session()
headers = session.headers
Expand Down Expand Up @@ -454,6 +463,13 @@ def _prepare_ddosguard_cookies(self):
self.cookies.set(
"__ddg2", util.generate_token(), domain=self.cookies_domain)

def _cache(self, func, maxage, keyarg=None):
# return cache.DatabaseCacheDecorator(func, maxage, keyarg)
return cache.DatabaseCacheDecorator(func, keyarg, maxage)

def _cache_memory(self, func, maxage=None, keyarg=None):
return cache.Memcache()

def _get_date_min_max(self, dmin=None, dmax=None):
"""Retrieve and parse 'date-min' and 'date-max' config values"""
def get(key, default):
Expand Down Expand Up @@ -654,6 +670,8 @@ class AsynchronousMixin():
"""Run info extraction in a separate thread"""

def __iter__(self):
self.initialize()

messages = queue.Queue(5)
thread = threading.Thread(
target=self.async_items,
Expand Down
3 changes: 1 addition & 2 deletions gallery_dl/extractor/danbooru.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@ class DanbooruExtractor(BaseExtractor):
per_page = 200
request_interval = 1.0

def __init__(self, match):
BaseExtractor.__init__(self, match)
def _init(self):
self.ugoira = self.config("ugoira", False)
self.external = self.config("external", False)
self.includes = False
Expand Down
25 changes: 16 additions & 9 deletions gallery_dl/extractor/deviantart.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,18 @@ class DeviantartExtractor(Extractor):

def __init__(self, match):
Extractor.__init__(self, match)
self.user = match.group(1) or match.group(2)

def _init(self):
self.flat = self.config("flat", True)
self.extra = self.config("extra", False)
self.original = self.config("original", True)
self.comments = self.config("comments", False)
self.user = match.group(1) or match.group(2)

self.api = DeviantartOAuthAPI(self)
self.group = False
self.offset = 0
self.api = None
self._premium_cache = {}

unwatch = self.config("auto-unwatch")
if unwatch:
Expand All @@ -60,11 +64,13 @@ def __init__(self, match):
self._update_content = self._update_content_image
self.original = True

self._premium_cache = {}
self.commit_journal = {
"html": self._commit_journal_html,
"text": self._commit_journal_text,
}.get(self.config("journals", "html"))
journals = self.config("journals", "html")
if journals == "html":
self.commit_journal = self._commit_journal_html
elif journals == "text":
self.commit_journal = self._commit_journal_text
else:
self.commit_journal = None

def skip(self, num):
self.offset += num
Expand All @@ -80,8 +86,6 @@ def login(self):
return True

def items(self):
self.api = DeviantartOAuthAPI(self)

if self.user and self.config("group", True):
profile = self.api.user_profile(self.user)
self.group = not profile
Expand Down Expand Up @@ -449,6 +453,9 @@ class DeviantartUserExtractor(DeviantartExtractor):
("https://shimoda7.deviantart.com/"),
)

def initialize(self):
pass

def items(self):
base = "{}/{}/".format(self.root, self.user)
return self._dispatch_extractors((
Expand Down
19 changes: 10 additions & 9 deletions gallery_dl/extractor/exhentai.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,21 @@ class ExhentaiExtractor(Extractor):
LIMIT = False

def __init__(self, match):
# allow calling 'self.config()' before 'Extractor.__init__()'
self._cfgpath = ("extractor", self.category, self.subcategory)
Extractor.__init__(self, match)
self.version = match.group(1)

version = match.group(1)
def initialize(self):
domain = self.config("domain", "auto")
if domain == "auto":
domain = ("ex" if version == "ex" else "e-") + "hentai.org"
domain = ("ex" if self.version == "ex" else "e-") + "hentai.org"
self.root = "https://" + domain
self.cookies_domain = "." + domain

Extractor.__init__(self, match)
Extractor.initialize(self)

if self.version != "ex":
self.cookies.set("nw", "1", domain=self.cookies_domain)
self.session.headers["Referer"] = self.root + "/"
self.original = self.config("original", True)

limits = self.config("limits", False)
Expand All @@ -51,10 +55,6 @@ def __init__(self, match):
else:
self.limits = False

self.session.headers["Referer"] = self.root + "/"
if version != "ex":
self.cookies.set("nw", "1", domain=self.cookies_domain)

def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs)
if response.history and response.headers.get("Content-Length") == "0":
Expand Down Expand Up @@ -174,6 +174,7 @@ def __init__(self, match):
self.image_token = match.group(4)
self.image_num = text.parse_int(match.group(6), 1)

def _init(self):
source = self.config("source")
if source == "hitomi":
self.items = self._items_hitomi
Expand Down
5 changes: 2 additions & 3 deletions gallery_dl/extractor/fanbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

"""Extractors for https://www.fanbox.cc/"""

import re
from .common import Extractor, Message
from .. import text
import re


BASE_PATTERN = (
Expand All @@ -27,8 +27,7 @@ class FanboxExtractor(Extractor):
archive_fmt = "{id}_{num}"
_warning = True

def __init__(self, match):
Extractor.__init__(self, match)
def _init(self):
self.embeds = self.config("embeds", True)

def items(self):
Expand Down
4 changes: 3 additions & 1 deletion gallery_dl/extractor/flickr.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ class FlickrExtractor(Extractor):

def __init__(self, match):
Extractor.__init__(self, match)
self.api = FlickrAPI(self)
self.item_id = match.group(1)

def _init(self):
self.api = FlickrAPI(self)
self.user = None

def items(self):
Expand Down
4 changes: 3 additions & 1 deletion gallery_dl/extractor/foolfuuka.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,12 @@ class FoolfuukaExtractor(BaseExtractor):

def __init__(self, match):
BaseExtractor.__init__(self, match)
self.session.headers["Referer"] = self.root
if self.category == "b4k":
self.remote = self._remote_direct

def _init(self):
self.session.headers["Referer"] = self.root + "/"

def items(self):
yield Message.Directory, self.metadata()
for post in self.posts():
Expand Down
5 changes: 5 additions & 0 deletions gallery_dl/extractor/furaffinity.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ class FuraffinityExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.user = match.group(1)

def _init(self):
self.offset = 0

if self.config("descriptions") == "html":
Expand Down Expand Up @@ -384,6 +386,9 @@ class FuraffinityUserExtractor(FuraffinityExtractor):
}),
)

def initialize(self):
pass

def items(self):
base = "{}/{{}}/{}/".format(self.root, self.user)
return self._dispatch_extractors((
Expand Down
Loading

0 comments on commit a383eca

Please sign in to comment.