-
Notifications
You must be signed in to change notification settings - Fork 0
/
goodreads_web_scraping.py
338 lines (270 loc) · 14.2 KB
/
goodreads_web_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
# -*- coding: utf-8 -*-
"""GoodReads - Web Scraping.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1l69q6S-56TyaMGKgDx5Sjd_O7uKrwPvY
# Web Scraping
Scraping is simply the process of extracting, copying, and screening data from various sources. Web scraping provides a way for developers to collect and analyze data from the internet.
Web-scraping provides one of the great tools to automate most of the things a human does while browsing.
In this project we will explore "How to extract information from the popular Good Reads platform to analyze and generate interesting insights around Book Trends"
Goodreads is one of the world’s largest communities for reviewing and recommending books. It's a favorite platform for many voracious readers!
This project is partly inspired by following linked project. [Reference link :](https://medium.com/@soodakriti175/goodreads-web-scraping-92345b620f9c)
I have structured this first python notebook detailing below tasks:
1. How to scrape certain sections of a page using Beautiful Soup? In particular, all books listed under a Good Reads user defined list in a given page.
2. How to iteratively scrape all pages to obtain specific attributes on all books related information belonging to a particular list?
3. How to load the scraped contents into a Pandas dataframe?
4. How to expand the scope and iteratively scrape all lists for books related information for a list of user defined tags and append the extracted info to an existing .csv file loaded in Google Drive? Example: Tags such as "fiction", "science-fiction" etc.
# Install the required libraries
"""
!pip install requests
!pip install beautifulsoup4
!pip install pandas
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
"""# Import drive library and mount google drive path to access files stored in the drive"""
from google.colab import drive
drive.mount('/content/drive')
"""# Change to working directory
"""
# Commented out IPython magic to ensure Python compatibility.
# %cd /content/drive/My Drive/
"""Command to read and load a given file to a dataframe."""
df = pd.read_csv("/content/drive/My Drive/goodreads.csv", delim_whitespace=True)
"""# ***Section 1***
How to scrape certain sections of a page using Beautiful Soup? In particular, all books listed under a Good Reads user defined list in a given page.
Import "get" method from python requests module.
Below is a sample command to fetch contents of a URL from GoodReads platform.
Note: Below URL fetches contents of the first page of the list **"12362.All_Time_Favorite_Romance_Novels" **
"""
from requests import get
url = 'https://www.goodreads.com/list/show/12362.All_Time_Favorite_Romance_Novels?page=1'
response = get(url)
print(response.text[:500])
"""# Beautiful Soup - An Overview
The Beautiful Soup is a python library which is named after a Lewis Carroll poem of the same name in “Alice’s Adventures in the Wonderland”. Beautiful Soup is a python package and as the name suggests, parses the unwanted data and helps to organize and format the messy web data by fixing bad HTML and present to us in an easily-traversible XML structures.
In short, Beautiful Soup is a python package which allows us to pull data out of HTML and XML documents.
[Click for more info:](https://www.tutorialspoint.com/beautiful_soup/beautiful_soup_quick_guide.htm)
Below section used find_all method to extract all <tr> tags of itemtype "http://schema.org/Book". This information on which tag to parse to extract relevant book containers was obtained by right clicking on [this page](https://www.goodreads.com/list/show/12362.All_Time_Favorite_Romance_Novels) and choosing inspect option.
"""
response = requests.get(url)
html = response.content
html_soup = bs(html, 'html.parser')
book_containers = html_soup.find_all('tr',itemtype="http://schema.org/Book")
print(type(book_containers))
print(len(book_containers))
"""Below section extracts and prints the **html content** of a single book container."""
first_book = book_containers[0]
first_book
"""Next we see how to extract the entire "a" tag belonging to the class "bookTitle" for the first book. """
name = first_book.find('a',class_="bookTitle")
name
"""We then use the strip() method to extract specific details like the **Book Title**, **AuthorName**, **Scoring** and **Ratings** etc."""
name = first_book.find('a',class_="bookTitle").text.strip()
name
authors = first_book.find('a',class_="authorName").text.strip()
authors
scoring = first_book.find('span',class_="greyText smallText uitext").text.strip().split()
scoring
avg_scores=scoring[0]
rates = scoring[4]
print("average scores:",avg_scores)
print("ratings", rates)
scores_and_votes = first_book.find('span',class_="smallText uitext").text.strip().split()
scores_and_votes
"""**PLEASE NOTE**: A book’s total score is based on multiple factors, including the number of people who have voted for it and how highly those voters ranked the book."""
scores = scores_and_votes[1]
votes = scores_and_votes[3]
print("scores:", scores)
print("votes:", votes)
"""# Section 2
How to iteratively scrape all pages to obtain all books related information belonging to a particular list?
Below piece iterates through all the pages of a given list and constructs and prints a valid URL to retrieve the page content from. Here we have hardcoded the total number of pages as 53( at the time of retrieval ).
"""
page = 1
while page != 53:
url = f"https://www.goodreads.com/list/show/12362.All_Time_Favorite_Romance_Novels?page={page}"
print(url)
page = page + 1
"""Combining all the pieces of code we have seen so far, we see a sample on how to extract all books related info from all pages for a specific Book List, eg: "https://www.goodreads.com/list/show/12362.All_Time_Favorite_Romance_Novels"
Please note that we have still hardcoded both the list url and the total number of pages in this sample.
"""
from requests import get
page = 1
names = []
ratings = []
avgscores = []
author=[]
score=[]
votes=[]
while page != 53:
url = f"https://www.goodreads.com/list/show/12362.All_Time_Favorite_Romance_Novels?page={page}"
#print(url)
response = get(url)
html = response.content
html_soup = bs(html, 'html.parser')
book_containers = html_soup.find_all('tr',itemtype="http://schema.org/Book")
#print(book_containers[0])
for container in book_containers:
#print(container)
if container.find('td', width='100%') is not None:
book = container.find('td', width='100%')
#print(book)
name = book.find('a',class_="bookTitle").text.strip()
#print(name)
names.append(name)
authors = book.find('a',class_="authorName").text.strip()
author.append(authors)
scoring = book.find('span',class_="greyText smallText uitext").text.strip().split()
avg_score = scoring[0]
rating = scoring[4]
avgscores.append(avg_score)
ratings.append(rating)
scoring_and_rating = book.find('span',class_="smallText uitext").text.strip().split()
#print(scoring_and_rating)
scores = scoring_and_rating[1]
score.append(scores)
voted = scoring_and_rating[3]
votes.append(voted)
page = page + 1
"""# Section 3
How to load the scraped contents into a Pandas dataframe?
We now see how to create and populate a pandas dataframe with the specific book related information we have scraped for all books in a given list.
"""
df = pd.DataFrame({'book title': names,
'author': author,
'avg_score': avg_scores,
'rating': ratings,
'score': score,
'votes': votes,
'list': 'All Time Favorite Romance Novels'
})
df
"""# Section 4
How to expand the scope and iteratively scrape all lists for books related information for a list of user defined tags and append the extracted info to an existing .csv file loaded in Google Drive? Example: Tags such as "fiction", "science-fiction" etc.
For execution purpose we have limited the value of tags to include only "fiction" and "science-fiction".
However the scope of the tags can be increased to include more genres such as below
```
# tags = ["romance", "fiction", "young-adult", "fantasy", "science-fiction", "non-fiction", "children", "history", "mystery", "covers", "horror", "historical-fiction", "best", "gay", "titles", "paranormal", "love", "middle-grade", "contemporary", "historical-romance", "thriller", "nonfiction", "biography", "women", "series", "lgbt", "queer", "classics", "graphic-novels", "memoir"]
```
Also please note how the hardcoding of list URLs and number of pages are eliminated here and are instead derived using below beautiful soup lambda expressions as highlighted below.
```
html_soup.find_all("a", href=re.compile(f"/list/tag/{tags[i]}?page="))
all_links = html_soup.find_all("a", href=lambda href: href and f"/list/tag/{tags[i]}?page=" in href)
```
"""
import re
from requests import get
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
names = []
ratings = []
avgscores = []
author=[]
score=[]
votes=[]
baseUrl = f"https://www.goodreads.com"
page = 1
urls = []
dfs = []
#Below line is commented out to indicate how the scope of the tags can be increased to include more genres"
#tags = ["romance", "fiction", "young-adult", "fantasy", "science-fiction", "non-fiction", "children", "history", "mystery", "covers", "horror", "historical-fiction", "best", "gay", "titles", "paranormal", "love", "middle-grade", "contemporary", "historical-romance", "thriller", "nonfiction", "biography", "women", "series", "lgbt", "queer", "classics", "graphic-novels", "memoir"]
tags = ["fiction", "science-fiction"]
listTitles = []
listTitleLinks = []
for tag in tags:
url = f"https://www.goodreads.com/list/tag/{tag}"
urls.append(url)
for i in range(len(tags)):
response = get(urls[i])
html = response.content
html_soup = bs(html, 'html.parser')
html_soup.find_all("a", href=re.compile(f"/list/tag/{tags[i]}?page="))
all_links = html_soup.find_all("a", href=lambda href: href and f"/list/tag/{tags[i]}?page=" in href)
#print(all_links)
#extract total number of pages
pages = all_links[-2].text.strip()
print(f"Total pages for {tags[i]} genre is : {pages}")
page = 1
while page <= int(pages):
#print(page)
for link in all_links:
currentUrl = baseUrl + link.get('href')
response = get(currentUrl)
html = response.content
html_soup = bs(html, 'html.parser')
listTitles_for_tag = html_soup.find_all("a",class_="listTitle")
for listTitle in listTitles_for_tag:
listTitles.append(listTitle.text.strip())
print(f"Total number of listTiles in genre {tags[i]} are : {len(listTitles_for_tag)}")
currentLink = baseUrl + listTitle.get('href')
listTitleLinks.append(currentLink)
print(f"Fetching content for {currentLink}")
response = get(currentLink)
html = response.content
html_soup = bs(html, 'html.parser')
html_soup.find_all("a", href=re.compile(f"{listTitle.get('href')}?page="))
all_links = html_soup.find_all("a", href=lambda href: href and f"{listTitle.get('href')}?page=" in href)
for j in range(len(all_links)):
m = re.search(r'\d+$', all_links[j].text.strip())
if m is None:
pages_in_listTitle = all_links[j-1].text.strip()
break
#pages_in_listTitle = all_links[-2].text.strip()
print(f"Total pages in {currentLink} : {pages_in_listTitle}")
currentPage=1
while currentPage <=int(pages_in_listTitle):
currentLink = baseUrl + listTitle.get('href')
currentLink = currentLink + f"?page={currentPage}"
print(f"Fetching content for {currentLink}")
response = get(currentLink)
html = response.content
html_soup = bs(html, 'html.parser')
book_containers = html_soup.find_all('tr',itemtype="http://schema.org/Book")
print(f"Total number of books in {listTitle.text.strip()} are : {len(book_containers)}")
if len(book_containers) == 0:
break
for container in book_containers:
if container.find('td', width='100%') is not None:
book = container.find('td', width='100%')
#print(book)
name = book.find('a',class_="bookTitle").text.strip()
#print(name)
names.append(name)
authors = book.find('a',class_="authorName").text.strip()
author.append(authors)
scoring = book.find('span',class_="greyText smallText uitext").text.strip().split()
if len(scoring) >=0:
avg_score = scoring[0]
avgscores.append(avg_score)
if len(scoring) >= 4:
rating = scoring[4]
ratings.append(rating)
scoring_and_rating = book.find('span',class_="smallText uitext").text.strip().split()
#print(scoring_and_rating)
if len(scoring_and_rating) >=1:
scores = scoring_and_rating[1]
score.append(scores)
if len(scoring_and_rating) >= 3:
voted = scoring_and_rating[3]
votes.append(voted)
else:
currentPage = currentPage + 1
#print(f"Book Titles: {names}, Author: {author}, Avg_Score: {avgscores}, Rating: {ratings}, Score: {score}, Votes: {votes}, ListTitle: {listTitle}, Genre: {tags[0]}")
a = {"book title": names, "author": author, "avg_score": avgscores, "rating": ratings, "score": score, "votes": votes, "list title": listTitle.text.strip() }
df = pd.DataFrame.from_dict(a, orient='index')
df = df.transpose()
for index, row in df.iterrows():
df.at[index, 'list title'] = listTitle.text.strip()
df.at[index, 'genre'] = tags[i]
print("Appending the dataframe to file: goodreads_fiction_types.csv")
df.to_csv('/content/drive/My Drive/goodreads_fiction_types.csv', mode='a', index=False, header=False)
names = []
ratings = []
avgscores = []
author=[]
score=[]
votes=[]
currentPage = currentPage + 1
page = page + 1