Skip to content

Commit

Permalink
[realbooru] fix extraction (fixes #2530)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikf committed May 2, 2022
1 parent 82eee72 commit 3e926bd
Showing 1 changed file with 14 additions and 2 deletions.
16 changes: 14 additions & 2 deletions gallery_dl/extractor/gelbooru_v02.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def __init__(self, match):
except KeyError:
self.api_root = self.root

if self.category == "realbooru":
self._file_url = self._file_url_realbooru

def _api_request(self, params):
    """Query the 'dapi' post index and return the parsed XML root element.

    'params' is forwarded unchanged as the 'params' keyword argument of
    self.request(); the text of the response is parsed with
    ElementTree.fromstring().
    """
    url = self.api_root + "/index.php?page=dapi&s=post&q=index"
    return ElementTree.fromstring(self.request(url, params=params).text)
Expand Down Expand Up @@ -61,6 +64,14 @@ def _prepare(post):
post["date"] = text.parse_datetime(
post["created_at"], "%a %b %d %H:%M:%S %z %Y")

def _file_url_realbooru(self, post):
    """Return the file URL for 'post', rebuilding it when it is too short.

    A 'file_url' containing exactly five '/' characters is replaced by
    '<root>/images/<md5[0:2]>/<md5[2:4]>/<md5>.<ext>', where <ext> is the
    text after the last '.' of the original URL. Any other URL is
    returned unchanged.
    """
    url = post["file_url"]
    if url.count("/") == 5:
        # only this branch needs the md5 value
        md5 = post["md5"]
        url = "{}/images/{}/{}/{}.{}".format(
            self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
    return url

def _extended_tags(self, post, page=None):
if not page:
url = "{}/index.php?page=post&s=view&id={}".format(
Expand Down Expand Up @@ -213,7 +224,7 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor):
"count": 2,
}),
("https://realbooru.com/index.php?page=favorites&s=view&id=274", {
"count": 4,
"count": 2,
}),
("https://tbib.org/index.php?page=favorites&s=view&id=7881", {
"count": 3,
Expand Down Expand Up @@ -279,7 +290,8 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
},
}),
("https://realbooru.com/index.php?page=post&s=view&id=668483", {
"url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
"pattern": r"https://realbooru\.com/images/dc/b5"
r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg",
"content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
}),
("https://tbib.org/index.php?page=post&s=view&id=9233957", {
Expand Down

1 comment on commit 3e926bd

@mo-han
Copy link
Contributor

@mo-han mo-han commented on 3e926bd Nov 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

    def _file_url_realbooru(self, post):
        """Return a usable file URL for 'post', rebuilding it if necessary.

        The URL is rebuilt as
        '<root>/images/<md5[0:2]>/<md5[2:4]>/<md5>.<ext>' when either
          - the post's md5 does not occur in its 'preview_url', or
          - 'file_url' contains exactly five '/' characters.
        Otherwise 'file_url' is returned unchanged.

        Example reported for the first case
        (https://realbooru.com/index.php?page=post&s=view&id=722682):
          preview_url: https://realbooru.com/thumbnails/698/thumbnail_bbcc334dd6204d9c8c41522725718df9d6dd66bb.jpg
          sample_url:  https://realbooru.com/samples/698/sample_bbcc334dd6204d9c8c41522725718df9d6dd66bb.jpg
          file_url:    https://realbooru.com/images/18/27/bbcc334dd6204d9c8c41522725718df9d6dd66bb.jpeg
                       (the "original" link; responds with 404)
          working URL: <root>/images/18/27/1827047517204d1b321e17ab23c293ae.jpeg
                       (does not appear anywhere in the page source)
        """
        file_url = post["file_url"]
        digest = post["md5"]

        needs_rebuild = (digest not in post['preview_url'] or
                         file_url.count("/") == 5)
        if not needs_rebuild:
            return file_url

        # keep the original extension, replace everything before it
        extension = file_url.rpartition(".")[2]
        return "{}/images/{}/{}/{}.{}".format(
            self.root, digest[0:2], digest[2:4], digest, extension)
    def _extended_tags_realbooru(self, post, page=None):
        """Store per-category tag strings in 'post' and return the page.

        When 'page' is empty, the post's view page is requested first.
        Tag links are read from the '<div id="tagLink' section; each
        link's CSS class (minus a leading 'tag-type-') names the category
        and every category is written to post["tags_<category>"] as a
        space-separated string. If the section is missing, 'post' is left
        untouched.

        Example output reported for
        https://realbooru.com/index.php?page=post&s=view&id=722682:
          tags
            2009 3girls ass brynn_tyler daisy_young lexi_belle
            multiple_girls photo three_sisters x-art
          tags_copyright
            x-art
          tags_general
            2009 3girls ass multiple_girls
          tags_metadata
            photo three_sisters
          tags_model
            brynn_tyler daisy_young lexi_belle
        """
        if not page:
            page = self.request(
                "{}/index.php?page=post&s=view&id={}".format(
                    self.root, post["id"])).text

        tag_html = text.extract(page, '<div id="tagLink', '</div>')[0]
        if not tag_html:
            return page

        find_links = re.compile(r'''<a class="([^"' ]+?)" href=".*?[?;]tags=([^"' ]+)">.+?</a>''').findall
        grouped = collections.defaultdict(list)
        for css_class, raw_name in find_links(tag_html):
            # 9 == len('tag-type-')
            category = (css_class[9:] if css_class.startswith('tag-type-')
                        else css_class)
            grouped[category].append(text.unquote(raw_name))

        for category, names in grouped.items():
            post["tags_" + category] = " ".join(names)
        return page
        if self.category == "realbooru":
            self._file_url = self._file_url_realbooru
            self._extended_tags = self._extended_tags_realbooru

Please sign in to comment.