-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawlerSapoPraias.py
62 lines (51 loc) · 2.26 KB
/
crawlerSapoPraias.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from bs4 import BeautifulSoup
import urllib
# Open Graph <meta property="..."> values copied from each beach detail page.
# The "og:" prefix is stripped when they are stored ('og:title' -> 'title').
_OG_PREFIX = 'og:'
_OG_PROPERTIES = ('og:title', 'og:latitude', 'og:longitude')

# (record key, CSS class of the <li> icon whose presence sets the flag).
_AMENITY_ICONS = (
    ('restaurant', 'ico-restaurante'),
    ('blueFlag', 'ico-bandeira-azul'),
    ('parking', 'ico-parque-estacionamento'),
    ('umbrella', 'ico-toldos'),
)

# Every key the INSERT statement interpolates; a record missing any of them
# is skipped instead of raising KeyError half-way through a page.
_REQUIRED_FIELDS = ('title', 'latitude', 'longitude', 'place', 'picture',
                    'parking', 'blueFlag', 'restaurant', 'umbrella')


def _open_url(url):
    """Open *url* and return the response object (Python 2 and Python 3)."""
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    return urlopen(url)


def _absolute(base_url, href):
    """Resolve *href* against *base_url*; absolute hrefs are returned as-is."""
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2
    return urljoin(base_url, href)


def _fetch_soup(url):
    """Download *url* and return its content parsed as a BeautifulSoup tree."""
    from contextlib import closing

    # closing() guarantees the connection is released even if read() fails.
    with closing(_open_url(url)) as response:
        html = response.read()
    # Naming the parser avoids bs4's "no parser was explicitly specified"
    # warning and keeps results independent of which parsers are installed.
    return BeautifulSoup(html, 'html.parser')


def _read_og_fields(soup):
    """Return the title/latitude/longitude Open Graph values found in *soup*."""
    fields = {}
    for meta in soup.find_all('meta'):
        prop = meta.get('property')
        content = meta.get('content')
        if prop in _OG_PROPERTIES and content is not None:
            fields[prop[len(_OG_PREFIX):]] = content
    return fields


def _scrape_listing(soup, base_url):
    """Build one dict per beach listed in *soup*.

    The listing page exposes titles, pictures and amenity icons as three
    parallel element lists; they are matched up by position.  Each beach's
    detail page is fetched as well to read its Open Graph metadata.
    Records may be incomplete when the markup is missing pieces.
    """
    beaches = []
    for title in soup.find_all('p', {'class': 'title-details'}):
        beach = {}
        municipality = title.find('span', {'class': 'municipality'})
        if municipality is not None:
            beach['place'] = municipality.get_text()
        link = title.a
        href = link.get('href') if link is not None else None
        if href:
            detail_soup = _fetch_soup(_absolute(base_url, href))
            beach.update(_read_og_fields(detail_soup))
        # Always append, even when incomplete, so positions keep lining up
        # with the picture and amenity lists below.
        beaches.append(beach)

    # zip() stops at the shorter sequence, so surplus figures or info blocks
    # can no longer index past the end of the beach list.
    for beach, figure in zip(beaches, soup.find_all('p', {'class': 'figure'})):
        image = figure.a.img if figure.a is not None else None
        source = image.get('src') if image is not None else None
        if source is not None:
            beach['picture'] = source

    info_blocks = soup.find_all('div', {'class': 'info-detail'})
    for beach, info in zip(beaches, info_blocks):
        for key, icon_class in _AMENITY_ICONS:
            beach[key] = info.find('li', {'class': icon_class}) is not None

    return beaches


def _sql_text(value):
    """Escape *value* for use inside a double-quoted SQL string literal.

    Embedded double quotes are doubled so scraped text cannot terminate the
    literal early.
    NOTE(review): assumes the database consuming this output accepts the
    doubled-delimiter convention - confirm for the actual target.
    """
    return value.replace('"', '""')


def _insert_statement(beach):
    """Format the INSERT statement for one complete beach record."""
    return 'INSERT INTO BEACH(name, latitude, longitude, place, picture, parking, blueFlag, restaurant, umbrella) VALUES ("%s", %s, %s, "%s", "%s", %s, %s, %s, %s);' % (
        _sql_text(beach['title']),
        beach['latitude'],
        beach['longitude'],
        _sql_text(beach['place']),
        _sql_text(beach['picture']),
        beach['parking'],
        beach['blueFlag'],
        beach['restaurant'],
        beach['umbrella'],
    )


def search_page(page):
    """Crawl a beach listing and print one SQL INSERT per beach to stdout.

    Starting at the URL *page*, every listed beach is scraped (including a
    request to its detail page) and printed as an ``INSERT INTO BEACH``
    statement.  The ``linkNext`` anchor is then followed to the next listing
    page until there is none.

    Pages are walked in a loop rather than by recursion, so the number of
    pages is not limited by the interpreter's recursion depth, and a "next"
    link that points back to an already-visited page ends the crawl instead
    of looping forever.

    Beaches for which any required field could not be scraped are skipped.

    Args:
        page: URL of the first listing page.

    Returns:
        None.  Output is written to stdout.
    """
    visited = set()
    while page and page not in visited:
        visited.add(page)
        soup = _fetch_soup(page)

        for beach in _scrape_listing(soup, page):
            if all(field in beach for field in _REQUIRED_FIELDS):
                # Single parenthesised argument: valid as both the Python 2
                # print statement and the Python 3 print function.
                print(_insert_statement(beach))

        next_link = soup.find('a', {'class': 'linkNext'})
        next_href = next_link.get('href') if next_link is not None else None
        page = _absolute(page, next_href) if next_href else None
# Script entry: start crawling at the "norte" listing page.
search_page(page="http://praias.sapo.pt/praias/norte/")