Skip to content

Commit

Permalink
[realbooru] fix extraction
Browse files Browse the repository at this point in the history
get file URLs from HTML pages
  • Loading branch information
mikf committed Apr 2, 2023
1 parent 75666cf commit ac97aca
Showing 1 changed file with 24 additions and 2 deletions.
26 changes: 24 additions & 2 deletions gallery_dl/extractor/gelbooru_v02.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def __init__(self, match):
self.api_root = self.root

if self.category == "realbooru":
self._file_url = self._file_url_realbooru
self.items = self._items_realbooru
self._tags = self._tags_realbooru

def _api_request(self, params):
Expand Down Expand Up @@ -129,6 +129,28 @@ def _file_url_realbooru(self, post):
self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
return url

def _items_realbooru(self):
    """Yield download messages for realbooru posts.

    For each post from ``self.posts()``, the post's HTML page is fetched
    via ``self._html(post)`` and the file URL is taken from the ``href``
    attribute associated with the ">Original<" link. Posts for which no
    URL can be determined are logged at debug level and skipped.

    Yields:
        ``(Message.Directory, post)`` followed by
        ``(Message.Url, url, post)`` for every post with a usable URL.
    """
    # Function-scope import, kept as in the original implementation.
    from .common import Message
    data = self.metadata()

    for post in self.posts():
        try:
            html = self._html(post)
            # str.index() raises ValueError when the ">Original<" marker
            # is missing, which is handled below like any other failure.
            # NOTE(review): rextract() presumably searches backwards from
            # the given position for 'href="' -- confirm against 'text'.
            url = text.rextract(
                html, 'href="', '"', html.index(">Original<"))[0]
            if not url:
                # Never emit a Url message without an actual URL.
                raise ValueError("empty download URL")
        except Exception as exc:
            self.log.debug("Unable to fetch download URL for post %s "
                           "(md5: %s)", post.get("id"), post.get("md5"))
            # Keep the root cause visible when debugging instead of
            # silently discarding it.
            self.log.debug("%s: %s", exc.__class__.__name__, exc)
            continue

        # Only store the URL once it is known to be non-empty.
        post["file_url"] = url

        text.nameext_from_url(url, post)
        post.update(data)
        self._prepare(post)
        self._tags(post, html)

        yield Message.Directory, post
        yield Message.Url, url, post

def _tags_realbooru(self, post, page):
tag_container = text.extr(page, 'id="tagLink"', '</div>')
tags = collections.defaultdict(list)
Expand Down Expand Up @@ -404,7 +426,7 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
},
}),
("https://realbooru.com/index.php?page=post&s=view&id=668483", {
"pattern": r"https://realbooru\.com/images/dc/b5"
"pattern": r"https://realbooru\.com//?images/dc/b5"
r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg",
"content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
"options": (("tags", True),),
Expand Down

0 comments on commit ac97aca

Please sign in to comment.