# Max number of task_threads running in parallel. Each thread fetches an ourl from
# ourl_queue, crawls the page, and spawns more threads according to cur_thread_num.
# Adjust this to suit your network.
max_job_num=4
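# On a faster connection you might raise the thread count; the value below is
# purely illustrative.
#max_job_num=8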
# The URLs to start the job from.
# Comma-separated if you have more than one seed.
#seeds=http://www.imeiding.com
seeds=http://www.cnblogs.com/gaorong
#seeds=http://hi.baidu.com/qteqpid_pku
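# A hypothetical multi-seed example (the hosts below are placeholders):
#seeds=http://www.example.com,http://www.example.org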
# If include_prefixes is set, only the URLs that match it are crawled
#include_prefixes=hi.baidu.com/qteqpid_pku/item
# If exclude_prefixes is set, the URLs that match it will NOT be crawled
#exclude_prefixes=www.imeiding.com/user
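# Hypothetical example: stay within the seed blog while skipping one of its
# sub-paths (the admin path below is invented for illustration):
#include_prefixes=www.cnblogs.com/gaorong
#exclude_prefixes=www.cnblogs.com/gaorong/admin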
# When daemonized, the process's output is written to the logfile rather than to the console
logfile=spiderq.log
# Set the log level. The possible values are listed below:
# 0 DEBUG
# 1 INFO
# 2 WARN
# 3 ERROR
# 4 CRIT
# The spider only logs messages whose level is greater than or equal to log_level.
# That means if you set log_level to 0, you will get all logs.
log_level=0
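# For example, to keep only warnings and errors (illustrative, based on the
# levels listed above):
#log_level=2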
# How deep to crawl from the seeds. If 0, only the seeds are crawled before exiting.
# Comment out the following line if you want to go as deep as possible.
max_depth=3
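# For example, to crawl nothing beyond the seed pages themselves (per the note above):
#max_depth=0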
# The interval (in seconds) at which to print stat data.
# If you need it, uncomment the following line.
#stat_interval=2
# How to save the crawled pages. "yes" means respect each site's directory hierarchy.
# NOT supported yet!
#make_hostdir=yes
# Dynamic Shared Object (DSO) Support
# The path where modules (.so) are located.
module_path=./modules/
# Which modules to load, one per line.
# The source code for the available modules is in the modules directory.
# They are all compiled to .so and copied to ${module_path} during make.
#   pre_surl:     domainlimit, maxdepth
#   after_header: headerfilter
#   after_resp:   savehtml, saveimage
load_module=savehtml
load_module=saveimage
load_module=maxdepth
load_module=domainlimit
load_module=headerfilter
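# If you only want HTML pages and no images, a minimal module set might look
# like the following (illustrative sketch; see the hook list above):
#load_module=savehtml
#load_module=maxdepth
#load_module=domainlimit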
# Specify which types of resource to accept, one per line.
# text/html is accepted by default.
accept_types=image/jpeg
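# To also accept PNG images, a hypothetical additional line (assuming the image
# modules handle that type) would be:
#accept_types=image/png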