-
Notifications
You must be signed in to change notification settings - Fork 0
/
fill_database.py
80 lines (64 loc) · 2.57 KB
/
fill_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import io
import os
import re

import pytesseract
import requests as req
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
from PIL import Image
# cloud_config = {
# 'secure_connect_bundle': './secure-connect-m4cro-database.zip'
# }
# auth_provider = PlainTextAuthProvider('m4cro', 'M@VnDu2D7#tc')
# cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
# session = cluster.connect()
# Headers sent with every Zomato API request.
# The API key can be supplied through the ZOMATO_USER_KEY environment
# variable; the literal below is only the fallback used when that variable
# is unset or empty, so existing setups keep working unchanged.
# NOTE(review): a key committed to source control should be treated as
# leaked -- rotate it and drop the fallback once the variable is set
# everywhere this script runs.
zomato_headers = {
    'Accept': 'application/json',
    'user-key': os.environ.get('ZOMATO_USER_KEY') or '8b3dc6c1a42f7efdc2da8c6dab9f8778',
}
# city is a string, "Athens, GA" for example
def get_restaurants_in_city(city):
    """Look up a city on the Zomato API and return its restaurant search results.

    Args:
        city: Free-form city name, e.g. ``"Athens, GA"``.

    Returns:
        The list stored under ``'restaurants'`` in the search response, or an
        empty list when Zomato returns no location suggestion for ``city``.

    Raises:
        requests.HTTPError: If either API call answers with an error status.
    """
    api_root = 'https://developers.zomato.com/api/v2.1'
    timeout_seconds = 30  # without a timeout, requests can block forever

    # Pass the query via ``params`` so spaces, commas, '&' and '#' in the city
    # name are URL-encoded instead of being spliced raw into the URL.
    city_resp = req.get(
        api_root + '/cities',
        params={'q': city},
        headers=zomato_headers,
        timeout=timeout_seconds,
    )
    city_resp.raise_for_status()

    # An unknown city yields no suggestions; report "no restaurants" rather
    # than failing with an IndexError on the first element.
    suggestions = city_resp.json().get('location_suggestions') or []
    if not suggestions:
        return []
    city_id = suggestions[0]['id']

    search_resp = req.get(
        api_root + '/search',
        params={'entity_id': city_id, 'entity_type': 'city'},
        headers=zomato_headers,
        timeout=timeout_seconds,
    )
    search_resp.raise_for_status()
    return search_resp.json()['restaurants']
# Menu image URLs as they appear, backslash-escaped, inside the menu page
# source. NOTE(review): the pattern requires a literal backslash before each
# "/" and before the character standing in for each "." -- confirm this still
# matches what the live page embeds.
_MENU_IMAGE_URL_RE = re.compile(r'https:\\/\\/b\\.zmtcdn\\.com\\/data\\/menus\\/[0-9]*\\/[0-9]*\\/.*?\\.(png|jpg|jpeg)')


def _extract_menu_image_urls(content):
    """Return the distinct menu image URLs found in ``content``, in first-seen order."""
    # Use group(0): with a capturing group in the pattern, findall() would
    # return only the captured file extension, not the URL.
    escaped_urls = (match.group(0) for match in _MENU_IMAGE_URL_RE.finditer(content))
    # Strip the escaping backslashes so the result is a fetchable URL.
    plain_urls = (url.replace('\\', '') for url in escaped_urls)
    # dict.fromkeys de-duplicates while keeping page order (a set would not).
    return list(dict.fromkeys(plain_urls))


def parse_menu(restaurant):
    """Download a restaurant's menu images and OCR each of them.

    Args:
        restaurant: Mapping with a ``'menu_url'`` key holding the URL of the
            restaurant's menu web page.

    Returns:
        A list with the OCR text of every distinct menu image, in the order
        the images first appear in the page. Empty when the page contains no
        matching image URL.

    Raises:
        KeyError: If ``restaurant`` has no ``'menu_url'`` entry.
    """
    timeout_seconds = 30  # without a timeout, requests can block forever

    # The menu page is requested with a browser-like User-Agent.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0',
    }
    content = req.get(
        restaurant['menu_url'],
        headers=browser_headers,
        timeout=timeout_seconds,
    ).text

    # OCR every menu image referenced by the page.
    menu_pages = []
    for menu_url in _extract_menu_image_urls(content):
        resp = req.get(menu_url, timeout=timeout_seconds)
        img = Image.open(io.BytesIO(resp.content))
        menu_pages.append(pytesseract.image_to_string(img))

    # list of OCRed strings
    return menu_pages
def fill_database():
    """Fetch the restaurants of Athens, GA, OCR their menus and print the results.

    The nutrition lookup and the database write are still placeholders; for
    now each restaurant's name and its OCRed menu pages are printed.
    """
    for entry in get_restaurants_in_city('Athens, GA'):
        # each search result wraps the actual record under 'restaurant'
        info = entry['restaurant']

        name = info['name']
        website = info['url']
        coords = info['location']
        record = dict(
            name=name,
            website=website,
            latitude=float(coords['latitude']),
            longitude=float(coords['longitude']),
            items=[],
        )

        # OCR the menu images
        ocr_pages = parse_menu(info)
        print(record['name'])
        print(ocr_pages)
        print('----------------------------\n')

        # TODO: nutrition for each menu item
        # TODO: throw the restaurant into the database
    # done :)
# Run the import only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    fill_database()