# encoding=utf-8
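"""Crawler for mm.taobao.com model albums.

Reads the site's top-model ranking pages, then downloads the photos of each
album for every listed model, one directory per model and per album.
"""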
import os
import re
import sys
import imghdr

from bs4 import BeautifulSoup as bs

PY3 = sys.version.startswith('3')
if PY3:
    from urllib.request import Request, urlopen
    print('python 3')
else:
    from urllib2 import Request, urlopen
    print('python 2')


class Crawler(object):
    def __init__(self):
        super(Crawler, self).__init__()
        # URL templates: the album list of one model, and the photo list of one album.
        self.album_prefix = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20={0}&page={1}'
        self.image_prefix = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id={0}&album_id={1}&page={2}'
        # Thumbnail URLs end in _290x10000.jpg; getImages cuts the suffix off for full size.
        self.image_pattern = re.compile('''img.*290x10000.jpg''', re.U)
        self.image_name_pattern = re.compile('''"picId":"(.*?)"''', re.U)
        self.model_pattern = re.compile('''<a class="lady-name" href="(.*?)".*>(.*?)</a>''', re.U)
        self.album_pattern = re.compile('''.*album_id=(.*?)&.*''', re.U)
        self.links = []
        self.ids = []
        self.names = []

    def readHtml(self, html):
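        """Fetch a URL and return the body, decoded when the server declares a charset."""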
        req = Request(html)
        resp = urlopen(req)
        content = resp.read()
        if PY3:
            charset = resp.headers.get_content_charset()
        else:
            charset = resp.headers.getparam('charset')
        if charset:
            # Only decode when a charset is declared; image bodies stay raw bytes.
            content = content.decode(charset)
        return content

    def getLinkIdAndNames(self, htmlData):
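        """Parse model links, names and user ids out of a ranking-page HTML string."""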
        items = re.findall(self.model_pattern, htmlData)
        self.links = [link for link, name in items]
        self.names = [name for link, name in items]
        # The model id is the part of each link after its first '='.
        self.ids = [link[link.index('=') + 1:] for link in self.links]

    def getAlbums(self):
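        """Walk every model's album pages and download each album into its own directory."""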
        for i, model_id in enumerate(self.ids):
            print('start downloading %s' % self.names[i])
            try:
                os.mkdir(self.names[i])
            except OSError:
                pass  # directory already exists
            os.chdir(self.names[i])
            for page in range(1, 10):
                print('current page %s' % page)
                model_url = self.album_prefix.format(model_id, page)
                soup = bs(self.readHtml(model_url), 'html.parser')
                albums = soup.find_all('div', class_='mm-photo-cell-middle')
                if not albums:
                    break
                for album in albums:
                    album_name = album.find('h4').a.string.strip().rstrip('.')
                    album_link = album.find('h4').a['href']
                    album_id = re.findall(self.album_pattern, album_link)[0]
                    # Drop the leading '创建时间: ' ("created: ") label from the date.
                    album_create_time = album.find('p', class_='mm-photo-date').string.lstrip(u'创建时间: ')
                    # The photo count is rendered as '(N张)' ("N photos"); keep 'N张'.
                    album_img_count = album.find('span', class_='mm-pic-number').string.strip('()')
                    # print(album_name, album_create_time, album_img_count, album_id)
                    subDir = ''.join([album_name, '_', album_create_time, u'共', album_img_count])
                    try:
                        os.mkdir(subDir)
                    except OSError:
                        pass  # directory already exists
                    os.chdir(subDir)
                    self.getImages(model_id, album_id, album_img_count.strip(u'张'))
                    os.chdir('..')
            os.chdir('..')
            print('finish downloading', self.names[i])

    def getImages(self, model_id, album_id, image_count):
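        """Download all photos of one album into the current working directory."""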
        print('start downloading album %s %s%s' % (album_id, image_count, u'张'))
        # The photo-list endpoint serves 16 images per page.
        for page in range(1, (int(image_count) - 1) // 16 + 2):
            link = self.image_prefix.format(model_id, album_id, page)
            body = self.readHtml(link)
            images = re.findall(self.image_pattern, body)
            # Descriptions duplicate across photos, so picIds are used as file names instead.
            names = re.findall(self.image_name_pattern, body)
            for idx, image in enumerate(images):
                # Cut the '_290x10000.jpg' thumbnail suffix to get the full-size URL.
                image = image[:image.find('_290')]
                image_content = self.readHtml('http://' + image)
                # Guess the image type from the raw bytes (imghdr ignores its first
                # argument when a byte string is passed); fall back to jpg.
                image_type = imghdr.what(None, image_content) or 'jpg'
                with open(names[idx] + '.%s' % image_type, 'wb') as img:
                    img.write(image_content)


if __name__ == '__main__':
    test_html = 'https://mm.taobao.com/json/request_top_list.htm?page={0}'
    c = Crawler()
    for page in range(1, 5):
        data = c.readHtml(test_html.format(page))
        c.getLinkIdAndNames(data)
        c.getAlbums()