-
Notifications
You must be signed in to change notification settings - Fork 0
/
bbcscrape.py
137 lines (111 loc) · 4.42 KB
/
bbcscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import numpy as np
from cs50 import SQL
from bs4 import BeautifulSoup
import urllib.request
import os
import csv
from tempfile import mkdtemp
# Database that receives the scraped recipes and their ingredients.
db = SQL("sqlite:///recipes.db")

# Bookkeeping lists filled in while crawling. clinks holds the category
# links; the nested lists are indexed by category, then chef, then recipe,
# and each starts with one empty placeholder at every level.
clinks = []
chlinks = [[]]
recip_links = [[[]]]
ingredient = [[[[]]]]

# Get the categories
# The response is used as a context manager so the connection is closed as
# soon as the page body has been read.
with urllib.request.urlopen('http://www.bbc.co.uk/food/chefs') as response:
    r = response.read()
soup = BeautifulSoup(r, "html.parser")
menu = soup.find("ol", class_="resource-nav")
categories = menu.find_all("li")
# Only navigation entries that actually wrap a link contribute a category.
clinks.extend(category.a["href"] for category in categories if category.a)
# Crawl every category page in clinks, then every chef on it, then every
# recipe of that chef, and store each recipe (name, ingredients, link) plus
# one row per ingredient in the database.

# Scheme and host that every site-relative href found in the pages is
# appended to.
BASE_URL = 'http://www.bbc.co.uk'
# Divisor used to turn the reported result count into a number of result
# pages to walk.
RESULTS_PER_PAGE = 15


def _fetch_soup(path):
    """Download BASE_URL + path and return the page parsed with html.parser.

    The response is used as a context manager so the connection is closed
    as soon as the body has been read.
    """
    with urllib.request.urlopen(BASE_URL + path) as response:
        return BeautifulSoup(response.read(), "html.parser")


i = 0
# Iterate over lists of chefs
for clink in clinks:
    ssoup = _fetch_soup(clink)
    # Grow the per-category lists so index i (and i + 1) always exists.
    chlinks.append([])
    ingredient.append([])
    recip_links.append([])
    # Find each chef
    for c in ssoup.find_all("li", class_="resource chef"):
        chlinks[i].append(c.a["href"])
    j = 0
    # Iterate over chefs
    for chlink in chlinks[i]:
        tsoup = _fetch_soup(chlink)
        rls = tsoup.find("p", class_="see-more")
        if rls is None:
            # This chef has no link to a recipe list: skip this chef only.
            # j is not advanced, so it keeps counting processed chefs and
            # stays aligned with the per-chef lists appended below.
            continue
        # Make sure all your lists stay in range
        ingredient[i].append([])
        recip_links[i].append([])
        # Get link to the recipe list
        usoup = _fetch_soup(rls.a["href"])
        # Identify the number of results
        resultnum = usoup.find("div", class_="pagInfo-recipe-numbers")
        if resultnum is None:
            # No result-count element: fall back to scanning only the page
            # that was just downloaded instead of crashing.
            divs = 1
        else:
            # Drop the first nine characters of the text, then add up every
            # purely numeric token that remains.
            resulttxt = resultnum.get_text()[9:]
            rsltnm = sum(int(it) for it in resulttxt.split() if it.isdigit())
            divs = int(np.ceil(rsltnm / RESULTS_PER_PAGE))
        # Find links to ALL the recipes
        # The first results page is the one already fetched above, so it is
        # reused rather than requested a second time.
        page = usoup
        for div in range(divs):
            recips = page.find_all("li", class_="article with-image")
            recips = recips + page.find_all("li", class_="article no-image")
            # Get them links
            for recip in recips:
                recip_links[i][j].append(recip.div.h3.a["href"])
            # Deal with annoying results thing
            if div + 1 < divs:
                nextlink = page.find("a", class_="see-all-search")
                if nextlink is None:
                    # Fewer pages than the count suggested: stop paginating.
                    break
                page = _fetch_soup(nextlink["href"])
        k = 0
        # Iterate over recipes
        for reci_link in recip_links[i][j]:
            wsoup = _fetch_soup(reci_link)
            ingredient[i][j].append([])
            recipe_ings = ingredient[i][j][k]
            # Advance right away so k stays aligned on every exit path.
            k += 1
            # Iterate over ingredients
            for ing in wsoup.find_all("li", class_="recipe-ingredients__list-item"):
                if ing.a:
                    # Make the ingredients pretty
                    ingred = ing.a["href"].replace("/food/", "").replace("_", " ")
                    # Append them; the trailing "\r" separates the entries
                    # once they are joined into a single string below.
                    recipe_ings.append(ingred + "\r")
            # Name the recipes
            title = wsoup.find("h1", class_="content-title__text")
            if title is None:
                # Without a title there is nothing to store for this page.
                continue
            name = title.get_text()
            print(name)
            # Give them a link
            link = BASE_URL + reci_link
            # Insert info into database. The recipe row is written even when
            # no linked ingredients were found, as before.
            ingredients = ''.join(recipe_ings)
            db.execute("INSERT INTO recipes (name, ingredients, link) VALUES(:name, :ingredients, :link)",
                       name=name, ingredients=ingredients, link=link)
            # Match on the link as well as the name, so that two recipes
            # sharing a title do not resolve to the same recipeid.
            selection = db.execute("SELECT recipeid FROM recipes WHERE name=:name AND link=:link",
                                   name=name, link=link)
            recipeid = selection[0]["recipeid"]
            for q in recipe_ings:
                db.execute("INSERT INTO ingredients (ingredient, recipeid) VALUES(:ingredient, :recipeid)",
                           ingredient=q, recipeid=recipeid)
        j += 1
    i += 1