#!/usr/bin/env python3
import concurrent.futures
import json
from logging import DEBUG, INFO
import os
import sys
import re
from typing import Dict, List, Optional, Tuple, Union
from urllib.error import HTTPError
from urllib.parse import urlparse
from html2text import html2text
from pydantic import HttpUrl, PositiveInt
from pydantic.json import pydantic_encoder
import requests
import yaml
from bs4 import BeautifulSoup, ResultSet
from bs4.element import Tag
from loguru import logger
from models import Episode, Person, Sponsor
from models.config import ConfigData, ShowDetails
from models.episode import Chapters
from models.fireside import FsShowItem, FsShowItemAttachment, ShowJson
from models.misc import Jb_Episode_Record
from models.person import PersonType
from frontmatter import Post, dumps as f_dumps
# DO NOT REMOVE, even though not used. (JupiterBroadcasting/show-scraper #21)
from pydantic_yaml import YamlModelMixin
config = {}
# Limit scraping to only the latest episodes of each show (makes the script run much faster!)
# Used with GitHub Actions to run on a daily schedule and scrape the latest episodes.
# NOTE: bool() of any non-empty string is True (even "false"), so compare against known truthy values.
IS_LATEST_ONLY = os.getenv("LATEST_ONLY", "").lower() in ("1", "true", "yes")
LATEST_ONLY_EP_LIMIT = 5
# Root dir where all the scraped data should be saved to.
# The data saved to this dir follows the directory structure of the Hugo files relative
# to the root of the repo.
# Can be set via env variable to point at the Hugo root directory.
# Any files that already exist in this directory will not be overwritten.
DATA_ROOT_DIR = os.getenv("DATA_DIR", "./data")
# The sponsors' data is collected into this global while episode files are scraped.
# This data is saved to files after the episode files have been created.
SPONSORS: Dict[str, Sponsor] = {} # filename as key (e.g. "linode.com-lup.md")
# Global that holds scraped show episode data from jupiterbroadcasting.com.
# The data consists of links to the different types of episode media files (mp3, youtube,
# ogg, video, etc.) - whatever is available on the episode page under the "Direct Download:" header
#
# The structure of this is:
# {
# "coderradio": { # <-- `show_slug`` as defined in config.yml
# "123": { # <-- ep number
# # rest defined in models.misc.EPISODE_RECORD
# "youtube_link": "https://www.youtube.com/watch?v=98Mh0BP__gE",
# ...
# }
# },
# "show_slug_2": { ... }
# }
JB_DATA: Dict[str, Dict[int, Jb_Episode_Record]] = {}
CHAPTERS_URL_TPL = "https://feeds.fireside.fm/{show}/json/episodes/{ep_id}/chapters"
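# Illustrative example (values invented): CHAPTERS_URL_TPL.format(show="coderradio", ep_id="abc-123")
# -> "https://feeds.fireside.fm/coderradio/json/episodes/abc-123/chapters"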
# Regex to strip Episode Numbers and information after the |
# https://regex101.com/r/gkUzld/
SHOW_TITLE_REGEX = re.compile(r"^(?:(?:Episode)?\s?[0-9]+:+\s+)?(.+?)(?:(\s+\|+.*)|\s+)?$")
def makedirs_safe(directory):
try:
os.makedirs(directory)
except FileExistsError:
pass
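# Note: this mirrors os.makedirs(directory, exist_ok=True), available since Python 3.2.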
def get_list(soup: BeautifulSoup,
pre_title: str,
find_tag: str = "p",
sibling_tag: str = "ul"):
"""
Blocks of links are preceded by a find_tag (`p` default) saying what it is.
"""
pre_element = soup.find(find_tag, string=pre_title)
if pre_element is None:
return None
return pre_element.find_next_sibling(sibling_tag)
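# Illustrative example of the markup this targets (invented, not from a real page):
#   <p>Links:</p>
#   <ul><li><a href="https://example.com">Some link</a></li></ul>
# get_list(soup, "Links:") would return that <ul> Tag, or None if no match is found.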
# Convert a duration in seconds (e.g. from the FsShowItemAttachment.duration_in_seconds property) to "HH:MM:SS"
def seconds_2_hhmmss_str(seconds: PositiveInt) -> str:
minutes, seconds = divmod(seconds, 60)
hours, minutes = divmod(minutes, 60)
return f"{hours:02}:{minutes:02}:{seconds:02}"
def get_plain_title(title: str) -> str:
"""
Get just the show title, without any numbering etc
"""
return SHOW_TITLE_REGEX.match(title)[1]
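# Example with an invented title:
# get_plain_title("Episode 123: Some Title | Coder Radio") == "Some Title"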
def create_episode(api_episode: FsShowItem,
show_config: ShowDetails,
show_slug: str,
output_dir: str):
try:
# RANT: What kind of API doesn't give the episode number?!
try:
episode_number = int(api_episode.url.path.split("/")[-1])
episode_number_padded = f"{episode_number:04}"
except ValueError:
episode_number = api_episode.url.path.split("/")[-1]
episode_number_padded = episode_number
episode_guid = api_episode.id
output_file = f"{output_dir}/{episode_number_padded}.md"
if not IS_LATEST_ONLY and os.path.isfile(output_file):
# Overwrite only when IS_LATEST_ONLY mode is true, because the episode is
# published on the JB website after Fireside and may still change.
logger.warning(f"Skipping saving `{output_file}` as it already exists")
return
podcast_chapters = get_podcast_chapters(api_episode, show_config)
publish_date = api_episode.date_published
api_soup = BeautifulSoup(api_episode.content_html, "html.parser")
page_soup = BeautifulSoup(requests.get(
api_episode.url).content, "html.parser")
blurb = api_episode.summary
if not blurb:
# `summary` might be empty; fall back to the content of the first <p> inside
# `content_html`
_p_tag = api_soup.find("p")
if _p_tag:
blurb = _p_tag.text
sponsors = parse_sponsors(
api_soup, page_soup, show_config.acronym, episode_number)
links_list = get_list(api_soup, "Links:") or get_list(api_soup, "Episode Links:")
links = html2text(str(links_list)) if links_list else None
tags = []
for link in page_soup.find_all("a", class_="tag"):
_tag = link.get_text().strip()
# escape inner quotes (occurs in coderradio 434)
_tag = _tag.replace('"', r'\"')
tags.append(_tag)
tags = sorted(tags)
hosts = parse_hosts_in_ep(page_soup, show_config, episode_number)
guests = parse_guests_in_ep(page_soup, show_config, episode_number)
show_attachment = api_episode.attachments[0]
# Intentionally NOT defaulting to empty values (which is what .get() with {} as the
# second parameter would do), so we get informed about issues like GH issue 16.
jb_ep_data = JB_DATA.get(show_slug).get(episode_number)
# logger.debug(f"{episode_number} jb_ep_data: {jb_ep_data}")
jb_ep_data: Jb_Episode_Record
# TODO: handle this use case:
# https://github.com/JupiterBroadcasting/show-scraper/issues/16#issuecomment-1196751641
try:
jb_url = jb_ep_data.jb_url
except AttributeError as errorz:
# TODO: create some notification when issue 16 happens.
# We still want to create the episode, since the jb_url
# doesn't get used on the new website; it just means we're
# pulling info directly from Fireside and have no
# direct downloads.
logger.warning("Show won't have direct download links!\n"
f"episode_url: {api_episode.url}\n")
if LOG_LVL == DEBUG:
logger.exception(f"data we have: {jb_ep_data}\n"
f"error: {errorz}")
jb_ep_data = Jb_Episode_Record()
jb_url = None
if jb_url:
jb_url = urlparse(jb_url).path
episode = Episode(
show_slug=show_slug,
show_name=show_config.name,
episode=episode_number,
episode_padded=episode_number_padded,
episode_guid=episode_guid,
title=get_plain_title(api_episode.title),
description=blurb,
date=publish_date,
tags=tags,
hosts=hosts,
guests=guests,
sponsors=sponsors,
podcast_duration=seconds_2_hhmmss_str(show_attachment.duration_in_seconds),
podcast_file=show_attachment.url,
podcast_bytes=show_attachment.size_in_bytes,
podcast_chapters=podcast_chapters,
podcast_alt_file=jb_ep_data.mp3_audio,
podcast_ogg_file=jb_ep_data.ogg_audio,
video_file=jb_ep_data.video,
video_hd_file=jb_ep_data.hd_video,
video_mobile_file=jb_ep_data.mobile_video,
youtube_link=jb_ep_data.youtube,
jb_url=jb_url,
fireside_url=api_episode.url.path,
episode_links=links
)
save_file(output_file, episode.get_hugo_md_file_content(), overwrite=IS_LATEST_ONLY)
except Exception as e:
logger.exception("Failed to create an episode from url!\n"
f"episode_url: {api_episode.url}")
def get_podcast_chapters(api_episode: FsShowItem, show_config: ShowDetails) -> Optional[Chapters]:
try:
chapters_url = CHAPTERS_URL_TPL.format(
show=show_config.fireside_slug,
ep_id=api_episode.id)
resp = requests.get(chapters_url)
resp.raise_for_status()
# TODO: use pydantic to validate
return Chapters(**resp.json())
except requests.HTTPError:
# No chapters
pass
def save_file(file_path: str, content: Union[bytes,str], mode: str = "w", overwrite: bool = False) -> bool:
if not overwrite and os.path.exists(file_path):
logger.warning(f"Skipping saving `{file_path}` as it already exists")
return False
makedirs_safe(os.path.dirname(file_path))
with open(file_path, mode) as f:
f.write(content)
logger.info(f"Saved file: {file_path}")
return True
def parse_hosts_in_ep(page_soup: BeautifulSoup, show_config: ShowDetails, ep: int):
show = show_config.acronym
base_url = show_config.fireside_url
episode_hosts = []
# assumes the hosts are ALWAYS the first <ul> and guests are in the second one
hosts_links = page_soup.find("ul", class_="episode-hosts").find_all("a")
# hosts_links = page_soup.select(".episode-hosts ul:first-child a")
for link in hosts_links:
try:
host_page_url = base_url + link.get("href")
episode_hosts.append(get_username_from_url(host_page_url))
except Exception as e:
logger.exception(f"Failed to parse HOST for link href!\n"
f" show: {show}\n"
f" ep: {ep}\n"
f" href: {link.get('hrerf')}")
return episode_hosts
def parse_guests_in_ep(page_soup: BeautifulSoup, show_config: ShowDetails, ep: int):
show = show_config.acronym
base_url = show_config.fireside_url
episode_guests = []
# assumes the hosts are ALWAYS the first <ul> and guests are in the second one
# <- this would always be the hosts list
hosts_list = page_soup.find("ul", class_="episode-hosts")
# look for the NEXT `ul.episode-hosts`, that should be the guests list (might not exist)
guests_list = hosts_list.find_next("ul", class_="episode-hosts")
if not guests_list:
return episode_guests
guests_links = guests_list.find_all("a")
for link in guests_links:
try:
guest_page_url = base_url + link.get("href")
episode_guests.append(get_username_from_url(guest_page_url))
except Exception as e:
logger.exception(f"Failed to parse GUEST for link href!\n"
f" show: {show}\n"
f" ep: {ep}\n"
f" href: {link.get('hrerf')}")
return episode_guests
def parse_sponsors(api_soup: BeautifulSoup, page_soup: BeautifulSoup, show: str, ep: int) -> List[str]:
# Get only the links of all the sponsors
sponsors_ul = get_list(api_soup, "Sponsored By:")
if not sponsors_ul:
logger.warning("No sponsors found for this episode.\n"
f" show: {show}\n"
f" ep: {ep}")
return []
sponsors_links = [a["href"]
for a in sponsors_ul.select('li > a:first-child')]
sponsors = []
for sl in sponsors_links:
try:
# FIXME: eventually get around to doing a more "official" solution.
# Very ugly but works. The goal is to get the hostname of the sponsor
# link without the subdomain. It would fail on TLDs like "co.uk", but I
# don't think JB has had any sponsors like that, so it's fine.
sponsor_slug = ".".join(urlparse(sl).hostname.split(".")[-2:])
shortname = f"{sponsor_slug}-{show}".lower()
sponsors.append(shortname)
filename = f"{shortname}.md"
# Find the <a> element on the page with the link
sponsor_a = page_soup.find(
"div", class_="episode-sponsors").find("a", attrs={"href": sl})
if sponsor_a and not SPONSORS.get(filename):
SPONSORS.update({
filename: Sponsor(
shortname=shortname,
title=sponsor_a.find("header").text.strip(),
description=sponsor_a.find("p").text.strip(),
link=sl
)
})
except Exception as e:
logger.exception("Failed to collect/parse sponsor data!\n"
f" show: {show}\n"
f" ep: {ep}")
return sponsors
def save_post_obj_file(filename: str, post_obj: Post, dest_dir: str, overwrite: bool = False) -> None:
data_dont_override = set(config.get("data_dont_override") or [])
if IS_LATEST_ONLY and filename in data_dont_override:
logger.warning(f"Filename `{filename}` found in `data_dont_override`! Will not save to it.")
overwrite = False
file_path = os.path.join(dest_dir, filename)
save_file(file_path, f_dumps(post_obj), overwrite=overwrite)
def get_username_from_url(url):
"""
Get the last path part of the url, which is the username for hosts and guests.
Replace it using the `usernames_map` from config.
"""
username = urlparse(url).path.split("/")[-1]
# Replace username if found in usernames_map
usernames_map = config.get("usernames_map")
if usernames_map:
username = usernames_map.get(
username, # get by the key that should be replaced
username) # default to the key if not found
return username
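# e.g. (illustrative URL) get_username_from_url("https://linuxunplugged.com/hosts/chris")
# returns "chris", unless "chris" is remapped in the config's usernames_map.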
def save_avatar_img(img_url: str, username: str, is_small=False) -> str:
"""Save the avatar image only if it doesn't exist.
Return the file path relative to the `static` folder.
For example: "images/people/chris.jpg"
"""
try:
relative_filepath = get_avatar_relative_path(username, is_small)
full_filepath = os.path.join(DATA_ROOT_DIR, "static", relative_filepath)
# Check if the file exists BEFORE the request. This is more efficient as it saves
# time and bandwidth.
if os.path.exists(full_filepath):
logger.warning(f"Skipping saving `{full_filepath}` as it already exists")
return relative_filepath
resp = requests.get(img_url)
resp.raise_for_status()
save_file(full_filepath, resp.content, mode="wb")
return relative_filepath
except Exception:
logger.exception("Failed to save avatar!\n"
f" img_url: {img_url}"
f" username: {username}")
def get_avatar_relative_path(username, is_small=False):
# Assume all images are JPG.
# Might need to use the `python-magic` lib to get the actual mime-type and append
# the appropriate file extension.
filename_suffix = "_small.jpg" if is_small else ".jpg"
filename = username + filename_suffix
relative_filepath = os.path.join("images", "people", filename)
return relative_filepath
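# e.g. get_avatar_relative_path("chris", is_small=True) -> "images/people/chris_small.jpg" (POSIX paths)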
def parse_name(page_soup, username, guest_data):
# Fall back to using the username as the name
name = username
name_h1 = page_soup.find("h1")
if name_h1:
name = name_h1.text.strip()
elif guest_data:
name = guest_data.get("name", username)
return name
def scrape_data_from_jb(shows: Dict[str,ShowDetails], executor):
logger.info(">>> Scraping data from jupiterbroadcasting.com...")
# Collect all links for episode page of each show into JB_DATA
for show_slug, show_config in shows.items():
show_base_url = show_config.jb_url
jb_populate_episodes_urls(show_slug, show_base_url)
logger.success(">>> Finished collecting urls of episode pages")
logger.info(">>> Scraping data from each episode page...")
# Scrape each page for data
futures = []
for show, show_episodes in JB_DATA.items():
for ep, ep_data in show_episodes.items():
futures.append(executor.submit(
jb_get_ep_page_content, ep_data.jb_url, ep_data, show, ep))
# As each future completes, parse the page and load the direct download
# info into JB_DATA
for future in concurrent.futures.as_completed(futures):
page_content, ep_data, show, ep = future.result()
page_content: requests.Response
ep_data: Jb_Episode_Record
show: str # episode slug
ep: int # episode number
jb_populate_direct_links_for_episode(page_content, ep_data, show, ep)
# save to a json file - this might be useful for file migrations
# save_json_file("jb_all_shows_links.json", JB_DATA, DATA_ROOT_DIR)
logger.success(">>> Finished scraping data from jupiterbroadcasting.com")
def jb_get_ep_page_content(page_url: HttpUrl, ep_data: Jb_Episode_Record, show: str, ep: int) -> Tuple[requests.Response, Jb_Episode_Record, str, int]:
"""
Returns a tuple with the page's response, the Jb_Episode_Record, the show slug, and the episode number.
"""
resp = requests.get(page_url)
return resp, ep_data, show, ep
def jb_populate_direct_links_for_episode(ep_page_content: requests.Response, ep_data: Jb_Episode_Record, show: str, ep: int) -> None:
"""
This populates the rest of the Jb_Episode_Record object with direct
download links to various services (YouTube, OGG audio, etc.).
It dynamically adds them based on their name (i.e. video, hd_video, youtube, mp3_audio, etc.)
as well as the corresponding URL for the direct download.
This also modifies the ep_data parameter in place, which is why it doesn't return anything.
"""
try:
ep_soup = BeautifulSoup(ep_page_content.content, "html.parser")
dd_div = ep_soup.find("div", attrs={"id": "direct-downloads"})
if dd_div:
dl_links = dd_div.find_all("a")
else:
# older episodes have different structure. (example below)
# https://web.archive.org/web/20200227001055/https://www.jupiterbroadcasting.com/90751/budgie-jumping-lup-120/
p_links = get_list(ep_soup, "Direct Download:", "h3", "p")
if p_links:
dl_links = p_links.find_all("a")
else:
logger.warning(
"Failed to find Direct Download links for the episode.\n"
f" show: {show} \n"
f" ep: {ep}")
return
# this uses the resulting anchor tags and creates them dynamically based on text
for dl_link in dl_links:
url = dl_link.get("href").strip("\\\"")
slug = dl_link.text.lower().replace(" ", "_")
# check if it's a defined property on the dataclass
if slug in ep_data.__match_args__:
setattr(ep_data, slug, url)
continue
logger.error(f"New field {slug} (value of: {url}) is not already explicitly defined in the Jbd_Episode_Record")
except Exception as e:
logger.exception(
"Failed to parse direct links for episode.\n"
f" show: {show} \n"
f" ep: {ep}")
def jb_populate_episodes_urls(show_slug: str, show_base_url: HttpUrl) -> None:
"""
Populates the JB_DATA global dictionary with
{
<show_slug>: {
<episode_number>: {
"jb_url": "<episode_link>"
}
}
}
"""
# ensure JB_DATA[show_slug] exists, defaulting to an empty dictionary
JB_DATA.setdefault(show_slug, {})
# alias the show's dict as show_data
show_data = JB_DATA[show_slug]
last_page = jb_get_last_page_of_show(show_base_url)
# these are edge cases for the loop below which don't match
# the typical episode title format of "<title> <episode_number>"
show_exceptions = {
# LAN edge case. This episode is between ep152 and 153, hence it
# shall be officially titled as episode 152.5 from now forth
# (hopefully having a floaty number won't break things 😛)
# TODO: create the episode file for this, cuz it's not in Fireside
'Goodbye from Linux Action News': 152.5,
# Some Coder exceptions
'Say My Functional Name | Coder Radio': 343,
'New Show! | Coder Radio': 0,
"Someone Else’s Computer | Self-Hosted 59": 60,
}
futures = []
with concurrent.futures.ThreadPoolExecutor() as executor:
for page in range(1, last_page+1):
page_url = f"{show_base_url}/page/{page}/"
futures.append(executor.submit(requests.get, page_url))
for future in concurrent.futures.as_completed(futures):
resp = future.result()
resp: requests.Response
page_soup = BeautifulSoup(resp.content, "html.parser")
videoitems = page_soup.find_all("div", class_="videoitem")
for idx, item in enumerate(videoitems):
item: Tag
if IS_LATEST_ONLY and idx >= LATEST_ONLY_EP_LIMIT:
logger.debug(f"Limiting scraping to only {LATEST_ONLY_EP_LIMIT} most"
" recent episodes")
break
try:
# finds anchor tag that links to the episode
link = item.find("a")
link_href = link.get("href")
title = link.get("title")
ep_num = title.split(" ")[-1]
if ep_num == "LU1":
# LUP edge case for ep 1
ep_num = 1
elif title in show_exceptions.keys():
ep_num = show_exceptions[title]
else:
# if ep_num != link_href.split('-')[-1].strip('/'):
# raise ValueError(f"Episode URL ({link_href}) doesn't have the same episode number as the title: {ep_num}")
ep_num = int(ep_num)
# guard against overwriting an existing episode entry in JB_DATA
if ep_num in show_data.keys():
raise ValueError(f"There is already an existing show for episode number: {ep_num}\nWhich is: {show_data[ep_num]}\nCurrent attempted info: {item.contents}\nAll current info: {JB_DATA}")
show_data.update({ep_num: Jb_Episode_Record(jb_url=link_href)})
except Exception as e:
logger.exception(
"Failed to get episode page link and number from JB site.\n"
f" show: {show_slug}\n"
f" page: {page}\n"
f" ep_idx: {idx}\n"
f" html: {item.string}")
def jb_get_last_page_of_show(show_base_url) -> int:
"""
This uses the pagination element on https://www.jupiterbroadcasting.com/show/<show_name> to determine
how many pages of the show there are to process.
"""
# this is an override to only get the most recent page of the show
if IS_LATEST_ONLY:
logger.debug(f"Force only scraping of the most recent page")
# Scrape only the most recent page
return 1
# requests the first page of the show
page_soup = BeautifulSoup(requests.get(
show_base_url).content, "html.parser")
# parses the pagination numbers i.e. "Page 1 of 7"
pages_span = page_soup.find("span", class_="pages")
# if the pagination exists
if pages_span:
# grabs the last space delimited text
# i.e. 7 with "Page 1 of 7"
last_page = int(pages_span.text.split(" ")[-1])
# if no pagination element exists
else:
last_page = 1 # Just one page
return last_page
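# e.g. a pagination span of "Page 1 of 7" yields last_page = 7,
# since "Page 1 of 7".split(" ")[-1] == "7"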
def scrape_hosts_and_guests(shows: Dict[str, ShowDetails], executor):
logger.info(">>> Scraping hosts and guests from Fireside...")
people_dir = os.path.join(DATA_ROOT_DIR, "content", "people")
guests = scrape_show_guests(shows, executor)
hosts = scrape_show_hosts(shows, executor)
people = guests | hosts # combine the two dicts (hosts data overrides guests)
# Save json files asynchronously
futures = []
for username, person in people.items():
futures.append(
executor.submit(save_post_obj_file,
f"{username}.md", Post('', **person.dict()),
people_dir, overwrite=True)
)
# Drain all threads
for future in concurrent.futures.as_completed(futures):
future.result()
logger.success(">>> Finished scraping hosts and guests")
def scrape_show_hosts(shows: Dict[str, ShowDetails], executor) -> Dict[str, Person]:
show_hosts = {}
for show_data in shows.values():
show_fireside_url = show_data.fireside_url
all_hosts_url = f"{show_fireside_url}/hosts"
hosts_soup = BeautifulSoup(requests.get(all_hosts_url).content, "html.parser")
for host_soup in hosts_soup.find_all("div", class_="host"):
host_info_soup = host_soup.find("div", class_="host-info")
host_link = host_info_soup.find("h3").find("a")
name = host_link.text.strip()
host_url = show_fireside_url + host_link.get("href")
username = get_username_from_url(host_url)
bio = host_info_soup.find("p").text
links = host_info_soup.find("ul", class_="host-links").find_all("a")
links_data = parse_social_links(links)
avatar_small_url = host_soup.find("div", class_="host-avatar").find("img").get("src")
avatar_url = avatar_small_url.replace("_small.jpg", ".jpg")
avatar_small = save_avatar_img(avatar_small_url, username, is_small=True)
avatar = save_avatar_img(avatar_url, username)
append_person_to_dict("host", show_hosts, username, show_data.acronym,
title=name,
avatar="/"+avatar,
avatar_small="/"+avatar_small,
bio=bio,
**links_data)
return show_hosts
def scrape_show_guests(shows: Dict[str, ShowDetails], executor) -> Dict[str, Person]:
"""Return dict of Person by username
"""
show_guests = {} # username as key
# no need to thread this since there are only a handful of shows
for show_data in shows.values():
show_fireside_url = show_data.fireside_url
all_guests_url = f"{show_fireside_url}/guests"
guests_soup = BeautifulSoup(requests.get(all_guests_url).content, "html.parser")
links = guests_soup.find("ul", class_="show-guests").find_all("a")
all_urls = [show_fireside_url + a.get("href") for a in links]
guest_pages = get_pages_content_threaded(all_urls, executor)
for l in links:
url = show_fireside_url + l.get("href")
username = get_username_from_url(url)
name = l.find("h5").text.strip()
avatar_small_url = l.find("img").get("src").split("?")[0]
avatar_url = avatar_small_url.replace("_small.jpg", ".jpg")
avatar_small = save_avatar_img(avatar_small_url, username, is_small=True)
avatar = save_avatar_img(avatar_url, username)
html_page = guest_pages.get(url)
page_data = parse_person_page(html_page)
append_person_to_dict("guest", show_guests, username, show_data.acronym,
title=name,
avatar="/"+avatar,
avatar_small="/"+avatar_small,
**page_data)
return show_guests
def append_person_to_dict(p_type: PersonType, the_dict: dict, username, show_acr: str, **data):
new = Person(type=p_type, username=username, **data)
existing = the_dict.get(username)
if existing and existing.dict() != new.dict() and not IS_LATEST_ONLY:
# If different, save as an alternative version
the_dict[f"__{username}_{show_acr}"] = new
else:
the_dict[username] = new
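# e.g. (hypothetical) if "alex" already exists with different data on a full run,
# the new Person is stored under "__alex_CR" (for a show acronym of "CR") instead of replacing it.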
def parse_person_page(html_page):
if not html_page:
return {}
page_soup = BeautifulSoup(html_page, "html.parser")
page_data = {}
# Parse bio
bio = page_soup.find("section")
if bio:
page_data["bio"] = bio.text.strip()
# Parse social links
nav = page_soup.find("nav", class_="links")
if nav:
links = nav.find_all("a")
page_data = {**page_data, **parse_social_links(links)}
return page_data
def parse_social_links(links: ResultSet):
result = {}
for link in links:
href = link.get("href").lower()
label = link.text.lower()
if "website" in label:
result["homepage"] = href
elif "twitter" in label:
result["twitter"] = href
elif "linkedin" in label:
result["linkedin"] = href
elif "instagram" in label:
result["instagram"] = href
elif "youtube" in label:
result["youtube"] = href
return result
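# Illustrative example: a link like <a href="https://twitter.com/example">Twitter</a>
# maps to {"twitter": "https://twitter.com/example"}; unmatched labels are ignored.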
def get_pages_content_threaded(urls: List[str], executor) -> Dict[str, str]:
result = {} # by request url as key
futures = []
for url in urls:
futures.append(executor.submit(requests.get, url))
for f in concurrent.futures.as_completed(futures):
resp: requests.Response = f.result()
if not resp.ok:
logger.error("GET Request failed!\n"
f" url: {resp.request.url}\n"
f" status code: {resp.status_code}\n"
f" msg: {resp.reason}")
continue
result[resp.request.url] = resp.content
return result
def scrape_episodes_from_fireside(shows: Dict[str, ShowDetails], executor):
logger.info(">>> Scraping episodes from Fireside...")
futures = []
for show_slug, show_config in shows.items():
# Use same structure as in the root project for easy copy over
output_dir = os.path.join(
DATA_ROOT_DIR, "content", "show", show_slug)
try:
api_data = ShowJson(
**requests.get(show_config.fireside_url + "/json")
.json()
)
except Exception:
logger.exception(f"Failed to retrieve JSON for {show_config.name}.")
continue # api_data would be undefined (or stale from a previous show); skip this show
for idx, api_episode in enumerate(api_data.items):
if IS_LATEST_ONLY and idx >= LATEST_ONLY_EP_LIMIT:
logger.debug(f"Limiting scraping to only {LATEST_ONLY_EP_LIMIT} most"
" recent episodes")
break
futures.append(executor.submit(
create_episode, api_episode, show_config,
show_slug, output_dir
))
# Drain to surface exceptions. This also ensures the SPONSORS global is
# fully collected before save_sponsors() runs
for future in concurrent.futures.as_completed(futures):
future.result()
logger.success(">>> Finished scraping from episodes ✓")
def save_sponsors(executor):
logger.info(">>> Saving the sponsors found in episodes from Fireside...")
sponsors_dir = os.path.join(DATA_ROOT_DIR, "content", "sponsors")
futures = []
for filename, sponsor in SPONSORS.items():
futures.append(executor.submit(
save_post_obj_file,
filename, Post('',**sponsor.dict()), sponsors_dir, overwrite=True))
# Drain all threads
for future in concurrent.futures.as_completed(futures):
future.result()
logger.success(">>> Finished saving sponsors")
def main():
global config
with open("config.yml") as f:
config = yaml.load(f, Loader=yaml.SafeLoader)
validated_config = ConfigData(shows=config['shows'], usernames_map=config['usernames_map'])
with concurrent.futures.ThreadPoolExecutor() as executor:
# Must be first. Here the JB_DATA global is populated
scrape_data_from_jb(validated_config.shows, executor)
scrape_episodes_from_fireside(validated_config.shows, executor)
save_sponsors(executor)
scrape_hosts_and_guests(validated_config.shows, executor)
if __name__ == "__main__":
LOG_LVL = int(os.getenv("LOG_LVL", INFO)) # Defaults to INFO, 10 for debug
logger.remove() # Remove default logger
logger.add(sys.stderr, level=LOG_LVL)
logger.info("🚀🚀🚀 SCRAPER STARTED! 🚀🚀🚀")
main()
logger.success("🔥🔥🔥 ALL DONE :) 🔥🔥🔥\n\n")
sys.exit(0)