v0.003 Implement deep crawling
yihui committed Jun 29, 2017
1 parent a0843cd commit f900c55
Showing 14 changed files with 575 additions and 63 deletions.
11 changes: 10 additions & 1 deletion README.md
@@ -12,6 +12,15 @@
> Implemented the simplest, most basic crawler; it is at a barely-usable stage

### 2. [v.0.002](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.002)
### 2. [v0.002](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.002)

> Use HttpClient in place of the JDK HTTP request; add HTTP parameter configuration

### 3. [v0.003](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.003)

> Implement deep crawling of pages (see the usage sketch after this list)
- Support positive and negative link filtering
- Keep crawl records in memory for de-duplication
- Provide a callback invoked after a page is crawled, for result handling
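
A minimal end-to-end sketch of the new deep-crawl API introduced in this commit (the demo class, start URL, CSS selector, and regex below are illustrative, not part of the repository):

import com.quick.hui.crawler.core.entity.CrawlMeta;
import com.quick.hui.crawler.core.entity.CrawlResult;
import com.quick.hui.crawler.core.job.SimpleCrawlJob;

import java.util.HashSet;

public class DeepCrawlDemo {

    public static void main(String[] args) throws Exception {
        // build the crawl configuration with the @AllArgsConstructor added in this commit
        CrawlMeta meta = new CrawlMeta("http://example.com/", new HashSet<>(), new HashSet<>(), new HashSet<>());
        meta.addSelectorRule("h1");                          // collect the text of every <h1> on each page
        meta.addPositiveRegex("^http://example\\.com/.*$");  // only follow same-site links

        SimpleCrawlJob job = new SimpleCrawlJob(1);          // depth 1: the start page plus its direct links
        job.setCrawlMeta(meta);
        job.doFetchPage();                                   // crawl synchronously

        for (CrawlResult r : job.getCrawlResults()) {
            System.out.println(r.getUrl() + " -> " + r.getResult());
        }
    }
}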
core/src/main/java/com/quick/hui/crawler/core/entity/CrawlMeta.java
@@ -1,17 +1,18 @@
package com.quick.hui.crawler.core.entity;


import lombok.Getter;
import lombok.Setter;
import lombok.ToString;
import lombok.*;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

/**
* Created by yihui on 2017/6/27.
*/
@ToString
@NoArgsConstructor
@AllArgsConstructor
public class CrawlMeta {

/**
@@ -26,12 +27,41 @@ public class CrawlMeta {
* Rules for extracting the desired content; a single page may contain several different pieces of content to extract, so the rules are kept in a set
*/
@Setter
private Set<String> selectorRules;
@Getter
private Set<String> selectorRules = new HashSet<>();


/**
* Positive (include) link-filter rules
*/
@Setter
@Getter
private Set<Pattern> positiveRegex = new HashSet<>();


/**
* Negative (exclude) link-filter rules
*/
@Setter
@Getter
private Set<Pattern> negativeRegex = new HashSet<>();



public Set<String> getSelectorRules() {
return selectorRules != null ? selectorRules : new HashSet<>();
public Set<String> addSelectorRule(String rule) {
this.selectorRules.add(rule);
return selectorRules;
}


public Set<Pattern> addPositiveRegex(String regex) {
this.positiveRegex.add(Pattern.compile(regex));
return this.positiveRegex;
}


public Set<Pattern> addNegativeRegex(String regex) {
this.negativeRegex.add(Pattern.compile(regex));
return this.negativeRegex;
}
}
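
For illustration, the new CrawlMeta helpers might be used like this (a fragment only; the selectors and URL patterns are made up):

    CrawlMeta meta = new CrawlMeta();                         // @NoArgsConstructor
    meta.addSelectorRule("div.blog-title");                   // one CSS selector per piece of content to extract
    meta.addSelectorRule("div.blog-content");
    meta.addPositiveRegex("^https?://example\\.com/.*$");     // if positive rules exist, only matching links are followed
    meta.addNegativeRegex(".*\\.(png|jpg|css|js)$");          // skip static resources (checked only when no positive rule matched)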
core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java
@@ -14,7 +14,7 @@
*/
@Getter
@Setter
@ToString(callSuper = true)
@ToString
public class CrawlResult {


10 changes: 10 additions & 0 deletions core/src/main/java/com/quick/hui/crawler/core/job/AbstractJob.java
@@ -1,5 +1,7 @@
package com.quick.hui.crawler.core.job;

import com.quick.hui.crawler.core.entity.CrawlResult;

/**
* Created by yihui on 2017/6/27.
*/
@@ -12,6 +14,14 @@ public void afterRun() {
}


/**
* Callback invoked after a page has been parsed
*
* @param crawlResult the parsed crawl result
*/
protected abstract void visit(CrawlResult crawlResult);


@Override
public void run() {
this.beforeRun();
core/src/main/java/com/quick/hui/crawler/core/job/DefaultAbstractCrawlJob.java
@@ -0,0 +1,157 @@
package com.quick.hui.crawler.core.job;

import com.quick.hui.crawler.core.entity.CrawlHttpConf;
import com.quick.hui.crawler.core.entity.CrawlMeta;
import com.quick.hui.crawler.core.entity.CrawlResult;
import com.quick.hui.crawler.core.storage.StorageWrapper;
import com.quick.hui.crawler.core.utils.HttpUtils;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Created by yihui on 2017/6/29.
*/
@Getter
@Setter
@NoArgsConstructor
public abstract class DefaultAbstractCrawlJob extends AbstractJob {
/**
* Crawl configuration
*/
private CrawlMeta crawlMeta;


/**
* HTTP configuration
*/
private CrawlHttpConf httpConf = new CrawlHttpConf();


/**
* Crawl depth; defaults to 0, meaning only the current page is crawled
*/
protected int depth = 0;


public DefaultAbstractCrawlJob(int depth) {
this.depth = depth;
}


/**
* Fetch the start page and, within the configured depth, the pages it links to
*/
public void doFetchPage() throws Exception {
doFetchNextPage(0, this.crawlMeta.getUrl());
}


// FIXME: not thread-safe
private void doFetchNextPage(int currentDepth, String url) throws Exception {
CrawlResult result = null;
try {
// Check whether this URL has already been fetched; if not, lock it and continue crawling
if (StorageWrapper.getInstance().ifUrlFetched(url)) {
return;
}

CrawlMeta subMeta = new CrawlMeta(url, this.crawlMeta.getSelectorRules(), this.crawlMeta.getPositiveRegex(), this.crawlMeta.getNegativeRegex());
HttpResponse response = HttpUtils.request(subMeta, httpConf);
String res = EntityUtils.toString(response.getEntity());
if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { // request failed
result = new CrawlResult();
result.setStatus(response.getStatusLine().getStatusCode(), response.getStatusLine().getReasonPhrase());
result.setUrl(url);
this.visit(result);
return;
}


// Parse the page
result = doParse(res, subMeta);
} finally {
// Record this fetch and release the lock
StorageWrapper.getInstance().addFetchRecord(url, result);
}

// Invoke the user's callback with the parsed content
this.visit(result);


// Stop following links once the maximum depth has been reached
if (currentDepth >= depth) {
return;
}


Elements elements = result.getHtmlDoc().select("a[href]");
String src;
for(Element element: elements) {
// Convert relative URLs to absolute ones
src = element.attr("abs:href");
if (matchRegex(src)) {
doFetchNextPage(currentDepth + 1, src);
}
}
}


private CrawlResult doParse(String html, CrawlMeta meta) {
Document doc = Jsoup.parse(html, meta.getUrl());

Map<String, List<String>> map = new HashMap<>(meta.getSelectorRules().size());
for (String rule : meta.getSelectorRules()) {
List<String> list = new ArrayList<>();
for (Element element : doc.select(rule)) {
list.add(element.text());
}

map.put(rule, list);
}


CrawlResult result = new CrawlResult();
result.setHtmlDoc(doc);
result.setUrl(meta.getUrl());
result.setResult(map);
result.setStatus(CrawlResult.SUCCESS);
return result;
}


private boolean matchRegex(String url) {
Matcher matcher;
// A link matching any positive rule is always followed
for (Pattern pattern : crawlMeta.getPositiveRegex()) {
matcher = pattern.matcher(url);
if (matcher.find()) {
return true;
}
}


// Otherwise, a link matching any negative rule is skipped
for (Pattern pattern : crawlMeta.getNegativeRegex()) {
matcher = pattern.matcher(url);
if (matcher.find()) {
return false;
}
}


// With no positive rules configured, every remaining link is followed
return crawlMeta.getPositiveRegex().isEmpty();
}
}
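
A sketch of using DefaultAbstractCrawlJob directly via an anonymous subclass (not part of this commit; the class name, URL, and rules are illustrative). Note the precedence implemented by matchRegex above: a positive match always wins, a negative match is only consulted afterwards, and with no positive rules every remaining link is followed.

import com.quick.hui.crawler.core.entity.CrawlMeta;
import com.quick.hui.crawler.core.entity.CrawlResult;
import com.quick.hui.crawler.core.job.DefaultAbstractCrawlJob;

import java.util.HashSet;

public class PrintTitleCrawl {

    public static void main(String[] args) throws Exception {
        CrawlMeta meta = new CrawlMeta("http://example.com/", new HashSet<>(), new HashSet<>(), new HashSet<>());
        meta.addSelectorRule("title");                   // grab each page's <title>
        meta.addNegativeRegex(".*(login|logout).*");     // skip auth pages; everything else is followed

        DefaultAbstractCrawlJob job = new DefaultAbstractCrawlJob(2) {   // start page plus links up to two levels away
            @Override
            protected void visit(CrawlResult result) {
                // callback fired once per crawled page
                System.out.println(result.getUrl() + " -> " + result.getResult().get("title"));
            }
        };
        job.setCrawlMeta(meta);
        job.doFetchPage();
    }
}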
core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java
@@ -1,21 +1,12 @@
package com.quick.hui.crawler.core.job;

import com.quick.hui.crawler.core.entity.CrawlHttpConf;
import com.quick.hui.crawler.core.entity.CrawlMeta;
import com.quick.hui.crawler.core.entity.CrawlResult;
import com.quick.hui.crawler.core.utils.HttpUtils;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import org.apache.http.HttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* The simplest crawl job
@@ -24,61 +15,38 @@
*/
@Getter
@Setter
public class SimpleCrawlJob extends AbstractJob {
@NoArgsConstructor
public class SimpleCrawlJob extends DefaultAbstractCrawlJob {

/**
* Crawl configuration
* Stores the crawl result
*/
private CrawlMeta crawlMeta;
private CrawlResult crawlResult;


/**
* HTTP configuration
* Results of the batch crawl
*/
private CrawlHttpConf httpConf = new CrawlHttpConf();

private List<CrawlResult> crawlResults = new ArrayList<>();

/**
* Stores the crawl result
*/
private CrawlResult crawlResult = new CrawlResult();


/**
* Fetch the page
*/
public void doFetchPage() throws Exception {
HttpResponse response = HttpUtils.request(crawlMeta, httpConf);
String res = EntityUtils.toString(response.getEntity());
if (response.getStatusLine().getStatusCode() == 200) { // request succeeded
doParse(res);
} else {
this.crawlResult = new CrawlResult();
this.crawlResult.setStatus(response.getStatusLine().getStatusCode(), response.getStatusLine().getReasonPhrase());
this.crawlResult.setUrl(crawlMeta.getUrl());
}
public SimpleCrawlJob(int depth) {
super(depth);
}


@Override
protected void visit(CrawlResult crawlResult) {
crawlResults.add(crawlResult);
}

private void doParse(String html) {
Document doc = Jsoup.parse(html);

Map<String, List<String>> map = new HashMap<>(crawlMeta.getSelectorRules().size());
for (String rule: crawlMeta.getSelectorRules()) {
List<String> list = new ArrayList<>();
for (Element element: doc.select(rule)) {
list.add(element.text());
}

map.put(rule, list);
public CrawlResult getCrawlResult() {
if (crawlResults.isEmpty()) {
return null;
}


this.crawlResult = new CrawlResult();
this.crawlResult.setHtmlDoc(doc);
this.crawlResult.setUrl(crawlMeta.getUrl());
this.crawlResult.setResult(map);
this.crawlResult.setStatus(CrawlResult.SUCCESS);
return crawlResults.get(0);
}
}
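
With the rework above, SimpleCrawlJob collects one CrawlResult per visited page; getCrawlResult() keeps the old single-result accessor by returning the first page (or null), while getCrawlResults() exposes everything reached within the configured depth. A small illustrative fragment (the meta object is assumed to be configured as in the earlier sketches):

    SimpleCrawlJob job = new SimpleCrawlJob(1);        // depth-1 job
    job.setCrawlMeta(meta);                            // meta built elsewhere
    job.doFetchPage();

    CrawlResult first = job.getCrawlResult();          // first visited page, or null if nothing was crawled
    int pages = job.getCrawlResults().size();          // every page reached within the depth limit
    System.out.println("crawled " + pages + " page(s); first: " + (first == null ? "none" : first.getUrl()));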