-
Notifications
You must be signed in to change notification settings - Fork 13
/
fetcher.py
90 lines (83 loc) · 2.77 KB
/
fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import requests
from bs4 import BeautifulSoup
# Request headers sent with every zhihu.com request made in this module.
# The user-agent string identifies as mobile Chrome on Android.
headers = {
    "authority": "www.zhihu.com",
    "user-agent": (
        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.110 Mobile Safari/537.36"
    ),
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
}
def _parse_hot_item(item):
    """Build one hot-list entry from an ``<a>`` element of the hot page.

    The first ``<div>`` child of the anchor is skipped (presumably the rank
    badge -- verify against the live markup). For every later ``<div>`` child,
    the text of each direct ``<div>``/``<h1>`` child is collected in document
    order.

    Args:
        item: A BeautifulSoup tag for the ``<a>`` element.

    Returns:
        A dict with the keys ``"link"``, ``"title"``, ``"description"`` and
        ``"hot"``. With exactly two collected texts the description is ``""``;
        otherwise the first three texts are title, description and hot.

    Raises:
        KeyError: If the anchor has no ``href`` attribute.
        IndexError: If fewer than two texts were collected.
    """
    link = item.attrs['href']
    texts = []
    seen_first_div = False
    for tag in item.children:
        if tag.name != 'div':
            continue
        if not seen_first_div:
            # Only divs after the first one carry the texts we want.
            seen_first_div = True
            continue
        texts.extend(
            sub_tag.text
            for sub_tag in tag.children
            if sub_tag.name in ('div', 'h1')
        )

    if len(texts) == 2:
        # Entry without a description: only title and hot value are present.
        title, description, hot = texts[0], "", texts[1]
    else:
        title, description, hot = texts[0], texts[1], texts[2]
    return {
        "link": link,
        "title": title,
        "description": description,
        "hot": hot,
    }


def fetch_by_html(timeout=10):
    """Scrape the hot list from the https://www.zhihu.com/hot HTML page.

    Args:
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        A list of dicts with the keys ``"link"``, ``"title"``,
        ``"description"`` and ``"hot"``. An entry that fails to parse is
        reported in place as a row whose title is ``"Error"`` and whose
        description is the exception text.

    Raises:
        SystemExit: With code -1 when the response status is not 200 or no
            ``<div>`` is found directly under ``<main>``.
        IndexError: When the page has no ``<main>`` element.
    """
    res = requests.get("https://www.zhihu.com/hot", headers=headers, timeout=timeout)
    if res.status_code != 200:
        print("Failed to fetch html.")
        # Raise SystemExit directly: the ``exit`` builtin is injected by the
        # ``site`` module for interactive use and is not always available.
        raise SystemExit(-1)

    soup = BeautifulSoup(res.text, 'html.parser')
    main_tag = soup.find_all('main')[0]

    # The list container is the first <div> directly under <main>.
    list_tag = None
    for child in main_tag.children:
        if child.name == 'div':
            list_tag = child
            break
    if list_tag is None:
        print("Failed to find list tag.")
        raise SystemExit(-1)

    results = []
    for item in list_tag.children:
        try:
            if item.name != "a":
                continue
            result = _parse_hot_item(item)
        except Exception as e:
            # Best effort: keep going and surface the failure as a row.
            result = {
                "link": "",
                "title": "Error",
                "description": str(e),
                "hot": "",
            }
            print(e)
        results.append(result)
    return results
def fetch_by_api(timeout=10):
    """Fetch the hot list from zhihu's JSON hot-lists endpoint.

    Args:
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        A list of dicts with the keys ``"link"``, ``"title"``,
        ``"description"`` and ``"hot"``, one per element of the response's
        ``"data"`` array.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If the payload lacks ``"data"``, or an entry lacks
            ``"target"``, its ``"id"``/``"title"``, or ``"detail_text"``.
    """
    res = requests.get(
        "https://www.zhihu.com/api/v3/feed/topstory/hot-lists/total",
        headers=headers,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError later.
    res.raise_for_status()
    data = res.json()

    results = []
    for item in data['data']:
        target = item['target']
        results.append({
            "link": f"https://www.zhihu.com/question/{target['id']}",
            "title": target['title'],
            # A missing excerpt becomes an empty description rather than
            # aborting the whole list.
            "description": target.get('excerpt', ""),
            "hot": item['detail_text'],
        })
    return results
def fetch():
    """Return the hot list, preferring the HTML page and falling back to the API.

    Returns:
        The list produced by ``fetch_by_html()``, or by ``fetch_by_api()``
        when the HTML path fails.

    Raises:
        Whatever ``fetch_by_api()`` raises if the fallback fails too.
    """
    try:
        data = fetch_by_html()
    except (Exception, SystemExit):
        # SystemExit is listed because fetch_by_html signals some failures by
        # exiting. KeyboardInterrupt is deliberately not caught, so Ctrl-C
        # still stops the program.
        print("Failed to fetch from html.")
        data = fetch_by_api()
    return data