Skip to content

Commit

Permalink
use sitemap for goldenaudiobooks
Browse files Browse the repository at this point in the history
  • Loading branch information
JarbasAl committed Mar 25, 2024
1 parent e136a52 commit 1b01984
Showing 1 changed file with 24 additions and 9 deletions.
33 changes: 24 additions & 9 deletions audiobooker/scrappers/goldenaudiobooks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import requests
from audiobooker import AudioBook, BookTag, BookAuthor
from audiobooker.scrappers import AudioBookSource
from sitemapparser import SiteMapParser


class GoldenAudioBooksAudioBook(AudioBook):
Expand Down Expand Up @@ -69,11 +70,14 @@ class GoldenAudioBooks(AudioBookSource):
@classmethod
def scrap_tags(cls):
    """Collect the site's category pages from its category sitemap.

    Returns:
        dict: maps a display title to the category URL. The title is the
        last path segment of the URL with hyphens replaced by spaces and
        title-cased.
    """
    # The category sitemap is read directly rather than scraping the
    # categories widget out of the home page HTML.
    sitemap = SiteMapParser('https://goldenaudiobook.co/category-sitemap.xml')
    bucket = {}
    for entry in sitemap.get_urls():
        # The string form of a sitemap entry is used as its URL.
        link = str(entry)
        slug = link.strip("/").split("/")[-1]
        bucket[slug.replace("-", " ").title()] = link
    return bucket

@property
Expand Down Expand Up @@ -194,9 +198,19 @@ def get_audiobook(cls, book_id):

@classmethod
def scrap_all_audiobooks(cls, limit=-1, offset=0):
    """Yield an audiobook for every URL listed in the post sitemaps.

    The two post sitemaps are read in order and treated as one continuous
    stream; the second sitemap is only requested once the first has been
    exhausted.

    Args:
        limit: maximum number of books to yield. A negative value (the
            default) or None means no limit.
        offset: number of leading sitemap entries to skip before
            yielding.

    Yields:
        GoldenAudioBooksAudioBook: built from the entry URL and a title
        derived from the last path segment of that URL (hyphens replaced
        by spaces, title-cased).
    """
    # NOTE(review): limit/offset are applied to the combined sitemap
    # stream; confirm this matches what callers expect from the other
    # scrap_* methods.
    from itertools import chain, islice

    sitemap_urls = (
        'https://goldenaudiobook.co/post-sitemap.xml',
        'https://goldenaudiobook.co/post-sitemap2.xml',
    )
    # The generator keeps parser construction lazy, one sitemap at a time.
    entries = chain.from_iterable(
        SiteMapParser(sitemap_url).get_urls() for sitemap_url in sitemap_urls
    )

    start = max(offset or 0, 0)
    stop = None if limit is None or limit < 0 else start + limit

    for entry in islice(entries, start, stop):
        # The string form of a sitemap entry is used as its URL.
        link = str(entry)
        slug = link.strip("/").split("/")[-1]
        yield GoldenAudioBooksAudioBook(
            url=link, title=slug.replace("-", " ").title())


if __name__ == "__main__":
Expand All @@ -207,8 +221,9 @@ def scrap_all_audiobooks(cls, limit=-1, offset=0):
for a in book.authors:
# print(a.as_json)
pass

tags = GoldenAudioBooks.scrap_tags()
# print(tags)
print(tags)

for book in GoldenAudioBooks.search_audiobooks(author="Lovecraft"):
pprint(book.as_json)
Expand Down

0 comments on commit 1b01984

Please sign in to comment.