This repository has been archived by the owner on Dec 2, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
manufacturers-scraper.py
145 lines (126 loc) · 5.4 KB
/
manufacturers-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
import os
import sys
import requests
import requests_cache
from bs4 import BeautifulSoup
from pprint import pprint
from lib.csvtools import dict_to_csv
# Page listing the manufacturers to scrape.
_LISTING_URL = 'http://southcarolinasccoc.weblinkconnect.com/Manufacturing'

# Detail columns, in the order they are stored and printed (after the name).
_DETAIL_FIELDS = ('street', 'city', 'state', 'zipcode',
                  'phone', 'contact', 'website')

# CSS classes of the per-manufacturer listing containers, in scan order.
_LISTING_CLASSES = (
    'ListingResults_Level1_CONTAINER',
    'ListingResults_Level2_CONTAINER',
    'ListingResults_Level3_CONTAINER',
    'ListingResults_Level4_CONTAINER',
)

# Link captions that are navigation aids rather than the manufacturer link.
_NON_NAME_LINK_TEXT = ('learn more', 'visit site', 'show on map')

# Record field -> ``itemprop`` of the span holding it inside the address div.
_ADDRESS_ITEMPROPS = (
    ('street', 'street-address'),
    ('city', 'locality'),
    ('state', 'region'),
    ('zipcode', 'postal-code'),
)

_PHONE_ICON_SRC = '/external/wcpages/images/phone.gif'


def _blank_record(name):
    """Return a record for *name* with every detail field set to ""."""
    record = {'name': name}
    for field in _DETAIL_FIELDS:
        record[field] = ""
    return record


def _tag_text(tag):
    """Return ``tag.text``, or "" when *tag* is None (element not found)."""
    return tag.text if tag is not None else ""


def _parse_map_line(line):
    """Build a record from one line of the map script.

    The line is split positionally on ';', '>' and '<', so any line that
    does not have the expected layout raises IndexError.
    """
    if line.startswith('var'):
        # NOTE(review): 135 is a hard-coded prefix length, presumably the
        # declaration text preceding the first entry -- confirm against the
        # live page before relying on it.
        line = line[135:]
    parts = line.split(';')
    chunks = parts[2].split('>')
    city_state = chunks[5].split('<')[0].split(',')

    record = _blank_record(chunks[2].split('<')[0])
    record['street'] = chunks[4].split('<')[0]
    record['city'] = city_state[0]
    record['state'] = city_state[1].replace(' ', '').strip()
    record['zipcode'] = parts[4].split('<')[0]
    return record


def _scrape_map_script(soup, manufacturers):
    """Add name/address records found in the map <script> to *manufacturers*."""
    for script in soup.find_all('script'):
        if 'var listingLatitude' not in script.text:
            continue
        for line in script.text.split('\n'):
            if 'addressesToMap' in line or not line.strip():
                continue
            try:
                record = _parse_map_line(line)
            except IndexError:
                # Report and move on instead of dropping into a debugger and
                # then continuing with a stale or undefined name.
                sys.stderr.write(
                    'skipping unparseable map line: %r\n' % (line,))
                continue
            manufacturers[record['name']] = record


def _listing_name(listing):
    """Return the manufacturer name shown in *listing*, or "" if none.

    The name is the ``alt`` text of the image inside the first link that is
    not a navigation link; links without such an image are passed over.
    """
    for link in listing.find_all('a'):
        if link.text.lower() in _NON_NAME_LINK_TEXT:
            continue
        logo = link.find('img')
        if logo is not None and logo.attrs.get('alt'):
            return logo.attrs['alt']
    return ""


def _scrape_listing(listing, manufacturers):
    """Merge the details of one listing container into *manufacturers*."""
    name = _listing_name(listing)
    if not name:
        sys.stderr.write('skipping listing with no manufacturer name\n')
        return

    # The listing divs contain names that the map script does not.
    if name not in manufacturers:
        manufacturers[name] = _blank_record(name)
    record = manufacturers[name]

    # Only fill in the location when the map script did not provide one.
    if not record['state']:
        address = listing.find('div', {'itemprop': 'address'})
        if address is not None:
            for field, itemprop in _ADDRESS_ITEMPROPS:
                record[field] = _tag_text(
                    address.find('span', {'itemprop': itemprop}))

    # Not every listing has a phone icon.
    # NOTE(review): this reads the text *inside* the <img> tag, which is
    # probably always empty; the number may live in a sibling node -- verify.
    phone_icon = listing.find('img', {'src': _PHONE_ICON_SRC})
    if phone_icon is not None:
        # Drop non-ASCII characters but keep a text string so the value can
        # be concatenated with other text on both Python 2 and Python 3.
        record['phone'] = (phone_icon.text.strip()
                           .encode('ascii', 'ignore').decode('ascii'))

    site_link = listing.find('a', {'target': '_blank'})
    if site_link is not None and 'href' in site_link.attrs:
        record['website'] = site_link.attrs['href']

    contact = listing.find(
        'div', {'class': 'ListingResults_Level3_MAINCONTACT'})
    if contact is not None:
        record['contact'] = contact.text


def _csv_row(values):
    """Return *values* as one CSV line: every field quoted, '"' doubled."""
    return ','.join('"' + value.replace('"', '""') + '"' for value in values)


def _print_csv(manufacturers):
    """Write all records to stdout as CSV, sorted by manufacturer name."""
    write = sys.stdout.write
    write(','.join(('manufacturer',) + _DETAIL_FIELDS) + '\n')
    for name in sorted(manufacturers):
        record = manufacturers[name]
        details = [record.get(field, "") for field in _DETAIL_FIELDS]
        write(_csv_row([name] + details) + '\n')


def main():
    """Scrape the manufacturer directory and export it as CSV.

    Fetches the listing page (through the ``scraper_cache`` requests cache),
    collects names and addresses from the embedded map script, merges in
    the address, phone, website and contact found in the listing divs,
    prints the result to stdout as CSV and finally passes the records to
    ``dict_to_csv`` with the target ``manufacturers.csv``.

    Malformed map lines and listings without a recognisable name are
    reported on stderr and skipped.

    Raises:
        requests.HTTPError: if the listing page returns an error status.
    """
    requests_cache.install_cache('scraper_cache')
    response = requests.get(_LISTING_URL, timeout=30)
    # Fail loudly rather than parse an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    manufacturers = {}
    _scrape_map_script(soup, manufacturers)

    for css_class in _LISTING_CLASSES:
        for listing in soup.find_all('div', {'class': css_class}):
            _scrape_listing(listing, manufacturers)

    _print_csv(manufacturers)
    dict_to_csv(manufacturers, 'manufacturers.csv')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()