forked from zhangchenchen/proxyspider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.py
90 lines (76 loc) · 3.22 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- author by pekingzcc -*-
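# -*- date : 2017-05-19 -*-
"""Configuration file.

Holds the proxy source sites, the output path, request settings and the
Qiniu upload credentials as module-level constants.
"""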
# Sites from which proxy addresses can be collected directly with a single
# regular expression.
PROXY_SITES_BY_REGX = {
    # Pages to download (presumably scanned as raw text by the spider --
    # confirm against the caller).
    'urls': [
        'http://ab57.ru/downloads/proxyold.txt',
        'http://www.proxylists.net/http_highanon.txt',
        'http://www.atomintersoft.com/high_anonymity_elite_proxy_list',
        'http://tools.rosinstrument.com/proxy/plab100.xml',
        'http://www.rmccurdy.com/scripts/proxy/good.txt',
        'http://proxy.ipcn.org/proxylist2.html',
        'http://www.proxylists.net/?HTTP',
        'http://uks.pl.ua/script/getproxy.php?last',
    ],
    # Matches "a.b.c.d:port". The port accepts 2-5 digits: port numbers go
    # up to 65535, and a 4-digit cap would silently truncate high ports
    # (e.g. "1.2.3.4:53281" would be captured as "1.2.3.4:5328").
    'proxy_regx': r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5}",
}
# Sites where the proxy IP and port have to be located with XPath
# expressions. Each entry lists the pages to fetch plus one XPath for the
# IP column and one for the port column.
PROXY_SITES_BY_XPATH = [
    # Disabled entry (66ip.cn), kept for reference:
    # {
    #     'urls': ['http://www.66ip.cn/%s.html' % page for page in ['index'] + list(range(2, 11))],
    #     'ip_xpath': ".//*[@id='main']/div/div[1]/table/tr[position()>1]/td[1]/text()" ,
    #     'port_xpath': ".//*[@id='main']/div/div[1]/table/tr[position()>1]/td[2]/text()"
    # },
    {
        'urls': [
            'http://cn-proxy.com/',
            'http://cn-proxy.com/archives/218',
        ],
        'ip_xpath': ".//table[@class='sortable']/tbody/tr/td[1]/text()",
        'port_xpath': ".//table[@class='sortable']/tbody/tr/td[2]/text()",
    },
    {
        # range(1, 2) yields page 1 only.
        'urls': [
            'http://www.mimiip.com/gngao/{0}'.format(page)
            for page in range(1, 2)
        ],
        'ip_xpath': ".//table[@class='list']/tbody/tr/td[1]/text()",
        'port_xpath': ".//table[@class='list']/tbody/tr/td[2]/text()",
    },
    {
        # range(1, 2) yields page 1 only.
        'urls': [
            'http://www.kuaidaili.com/free/inha/{0}/'.format(page)
            for page in range(1, 2)
        ],
        'ip_xpath': ".//*[@id='index_free_list']/table/tbody/tr/td[1]/text()",
        'port_xpath': ".//*[@id='index_free_list']/table/tbody/tr/td[2]/text()",
    },
    {
        # Pages 1 and 2; the XPaths skip the first table row via position()>1.
        'urls': [
            'http://www.ip181.com/daili/{0}.html'.format(page)
            for page in range(1, 3)
        ],
        'ip_xpath': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]/td[1]/text()",
        'port_xpath': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]/td[2]/text()",
    },
]
# Location the collected proxies are written to.
OUTPUT_FILE = '/tmp/proxy.txt'
# User-Agent header values (presumably chosen from by the spider when it
# issues requests -- confirm against the caller).
USER_AGENT_LIST = [
    'Mozilla/4.0 (compatible; MSIE 5.0; SunOS 5.10 sun4u; X11)',
    # NOTE(review): this entry is cut off after "Avant Browser;" and has no
    # closing parenthesis. Left unchanged because the intended tail is unknown.
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser;',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)',
    'Microsoft Internet Explorer/4.0b1 (Windows 95)',
    'Opera/8.00 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 5.0; AOL 4.0; Windows 95; c_athome)',
    'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; ZoomSpider.net bot; .NET CLR 1.1.4322)',
    # The contact address is spelled out here; an e-mail-obfuscation
    # placeholder ("[email protected]") in its place is not a real
    # User-Agent token.
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; QihooBot 1.0 qihoobot@qihoo.net)',
]
# Timeout value (unit is not stated in this file; presumably seconds --
# confirm against the code that uses it).
TIME_OUT = 4

# Number of retries.
RETRY_NUM = 1

# URL used for testing.
TEST_URL = 'http://www.baidu.com'
# Qiniu AccessKey / SecretKey; see the official Qiniu documentation for
# their exact meaning. The values here are masked placeholders.
QINIU_AUTH = {
    'AccessKey': '*********************',
    'SecretKey': '*********************',
}

# Qiniu bucket to upload to; see the official Qiniu documentation for
# details. The value here is a masked placeholder.
QINIU_BUCKET = '*****'