v0.002 Introduce HttpClient & HTTP conf
yihui committed Jun 27, 2017
1 parent c7c40dd commit a0843cd
Showing 6 changed files with 242 additions and 46 deletions.
17 changes: 17 additions & 0 deletions README.md
@@ -0,0 +1,17 @@
# quick-crawler
> A crawler framework implemented in Java
Starting from scratch, build a usable crawler framework step by step; a milestone tag is added at each stage, mainly to record how this project came into being



## tag 记录列表

### 1. [v0.001](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.001)

> Implements the simplest, most basic crawler; it is just barely usable

### 2. [v0.002](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.002)

> Replaces the JDK HTTP request with HttpClient; adds HTTP parameter configuration
17 changes: 17 additions & 0 deletions core/pom.xml
@@ -32,6 +32,23 @@
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>

<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.3.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.5</version>
</dependency>

<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.2.2</version>
</dependency>
</dependencies>


66 changes: 66 additions & 0 deletions core/src/main/java/com/quick/hui/crawler/core/entity/CrawlHttpConf.java
@@ -0,0 +1,66 @@
package com.quick.hui.crawler.core.entity;

import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
* HTTP-related configuration
*
* 1. request headers
* 2. settings for the response
*
* Created by yihui on 2017/6/27.
*/
@ToString
public class CrawlHttpConf {

private static Map<String, String> DEFAULT_HEADERS;

static {
DEFAULT_HEADERS = new HashMap<>();
DEFAULT_HEADERS.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
DEFAULT_HEADERS.put("connection", "Keep-Alive");
DEFAULT_HEADERS.put("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36");
}


public enum HttpMethod {
GET,
POST,
OPTIONS,
PUT;
}


@Getter
@Setter
private HttpMethod method = HttpMethod.GET;


/**
* Request headers
*/
@Setter
private Map<String, String> requestHeaders;


/**
* Request parameters
*/
@Setter
private Map<String, Object> requestParams;


public Map<String, String> getRequestHeaders() {
return requestHeaders == null ? DEFAULT_HEADERS : requestHeaders;
}

public Map<String, Object> getRequestParams() {
return requestParams == null ? Collections.emptyMap() : requestParams;
}
}
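As a quick illustration of how this config class is meant to be consumed, here is a minimal sketch; the header and parameter values are made up for the example:

```java
import com.quick.hui.crawler.core.entity.CrawlHttpConf;

import java.util.HashMap;
import java.util.Map;

public class HttpConfDemo {

    public static void main(String[] args) {
        // defaults: GET, the built-in DEFAULT_HEADERS, an empty param map
        CrawlHttpConf conf = new CrawlHttpConf();
        System.out.println(conf.getMethod());         // GET
        System.out.println(conf.getRequestHeaders()); // falls back to DEFAULT_HEADERS

        // switching to POST with custom headers and params (values made up)
        Map<String, String> headers = new HashMap<>();
        headers.put("referer", "https://example.com");

        Map<String, Object> params = new HashMap<>();
        params.put("page", 1);

        conf.setMethod(CrawlHttpConf.HttpMethod.POST);
        conf.setRequestHeaders(headers);
        conf.setRequestParams(params);
    }
}
```

The null-checking getters mean a freshly constructed CrawlHttpConf is immediately usable; callers only set what they want to override.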
core/src/main/java/com/quick/hui/crawler/core/entity/CrawlResult.java
@@ -1,5 +1,6 @@
package com.quick.hui.crawler.core.entity;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;
@@ -13,9 +14,17 @@
*/
@Getter
@Setter
@ToString
@ToString(callSuper = true)
public class CrawlResult {


public static Status SUCCESS = new Status(200, "success");
public static Status NOT_FOUND = new Status(494, "not found");


private Status status;


/**
* The crawled URL
*/
@@ -33,4 +42,22 @@ public class CrawlResult {
*/
private Map<String, List<String>> result;



public void setStatus(int code, String msg) {
this.status = new Status(code, msg);
}



@Getter
@Setter
@ToString
@AllArgsConstructor
static class Status {
private int code;

private String msg;
}

}
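A small sketch of the two ways a caller can record a status. Note that Status itself is package-private, so code outside com.quick.hui.crawler.core.entity can pass the predefined constants through but cannot declare variables of type Status:

```java
import com.quick.hui.crawler.core.entity.CrawlResult;

public class ResultStatusDemo {

    public static void main(String[] args) {
        CrawlResult result = new CrawlResult();

        // the convenience overload builds the Status internally
        result.setStatus(494, "not found");

        // the Lombok-generated setter accepts the predefined constants;
        // the Status type is never named here, so this compiles across packages
        result.setStatus(CrawlResult.SUCCESS);

        System.out.println(result);
    }
}
```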
core/src/main/java/com/quick/hui/crawler/core/job/SimpleCrawlJob.java
@@ -1,17 +1,17 @@
package com.quick.hui.crawler.core.job;

import com.quick.hui.crawler.core.entity.CrawlHttpConf;
import com.quick.hui.crawler.core.entity.CrawlMeta;
import com.quick.hui.crawler.core.entity.CrawlResult;
import com.quick.hui.crawler.core.utils.HttpUtils;
import lombok.Getter;
import lombok.Setter;
import org.apache.http.HttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -32,57 +32,31 @@ public class SimpleCrawlJob extends AbstractJob {
private CrawlMeta crawlMeta;


/**
* HTTP configuration
*/
private CrawlHttpConf httpConf = new CrawlHttpConf();


/**
* Stores the crawl result
*/
private CrawlResult crawlResult;
private CrawlResult crawlResult = new CrawlResult();


/**
* Fetch the page
*/
public void doFetchPage() throws Exception {

URL url = new URL(crawlMeta.getUrl());
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
BufferedReader in = null;

StringBuilder result = new StringBuilder();

try {
// set common request properties
connection.setRequestProperty("accept", "*/*");
connection.setRequestProperty("connection", "Keep-Alive");
connection.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
// establish the actual connection
connection.connect();


Map<String, List<String>> map = connection.getHeaderFields();
// iterate over all response header fields
for (String key : map.keySet()) {
System.out.println(key + "--->" + map.get(key));
}

// use a BufferedReader to read the URL's response
in = new BufferedReader(new InputStreamReader(
connection.getInputStream()));
String line;
while ((line = in.readLine()) != null) {
result.append(line);
}
} finally { // close the input stream in a finally block
try {
if (in != null) {
in.close();
}
} catch (Exception e2) {
e2.printStackTrace();
}
HttpResponse response = HttpUtils.request(crawlMeta, httpConf);
String res = EntityUtils.toString(response.getEntity());
if (response.getStatusLine().getStatusCode() == 200) { // request succeeded
doParse(res);
} else {
this.crawlResult = new CrawlResult();
this.crawlResult.setStatus(response.getStatusLine().getStatusCode(), response.getStatusLine().getReasonPhrase());
this.crawlResult.setUrl(crawlMeta.getUrl());
}


doParse(result.toString());
}


Expand All @@ -105,5 +79,6 @@ private void doParse(String html) {
this.crawlResult.setHtmlDoc(doc);
this.crawlResult.setUrl(crawlMeta.getUrl());
this.crawlResult.setResult(map);
this.crawlResult.setStatus(CrawlResult.SUCCESS);
}
}
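Putting the pieces together, a hypothetical end-to-end run might look like the sketch below. It assumes CrawlMeta has a url setter and that SimpleCrawlJob exposes Lombok-generated accessors for crawlMeta and crawlResult; those declarations sit outside this diff's hunks.

```java
import com.quick.hui.crawler.core.entity.CrawlMeta;
import com.quick.hui.crawler.core.entity.CrawlResult;
import com.quick.hui.crawler.core.job.SimpleCrawlJob;

public class CrawlDemo {

    public static void main(String[] args) throws Exception {
        // assumption: CrawlMeta offers setUrl (defined outside this commit)
        CrawlMeta meta = new CrawlMeta();
        meta.setUrl("https://example.com/");

        SimpleCrawlJob job = new SimpleCrawlJob();
        job.setCrawlMeta(meta); // assumption: Lombok @Setter on the field

        job.doFetchPage();

        // on a 200 the result carries the parsed doc; otherwise just status + url
        CrawlResult result = job.getCrawlResult();
        System.out.println(result.getUrl());
    }
}
```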
94 changes: 94 additions & 0 deletions core/src/main/java/com/quick/hui/crawler/core/utils/HttpUtils.java
@@ -0,0 +1,94 @@
package com.quick.hui.crawler.core.utils;

import com.quick.hui.crawler.core.entity.CrawlHttpConf;
import com.quick.hui.crawler.core.entity.CrawlMeta;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.ssl.SSLContextBuilder;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
* Created by yihui on 2017/6/27.
*/
public class HttpUtils {

public static HttpResponse request(CrawlMeta crawlMeta, CrawlHttpConf httpConf) throws Exception {
switch (httpConf.getMethod()) {
case GET:
return doGet(crawlMeta, httpConf);
case POST:
return doPost(crawlMeta, httpConf);
default:
return null;
}
}


private static HttpResponse doGet(CrawlMeta crawlMeta, CrawlHttpConf httpConf) throws Exception {
// HttpClient httpClient = HttpClients.createDefault();
SSLContextBuilder builder = new SSLContextBuilder();
// trust all certificates; skip identity verification
builder.loadTrustMaterial(null, (x509Certificates, s) -> true);
HttpClient httpClient = HttpClientBuilder.create().setSslcontext(builder.build()).build();

// set request parameters
StringBuilder param = new StringBuilder(crawlMeta.getUrl()).append("?");
for (Map.Entry<String, Object> entry : httpConf.getRequestParams().entrySet()) {
param.append(entry.getKey())
.append("=")
.append(entry.getValue())
.append("&");
}

HttpGet httpGet = new HttpGet(param.substring(0, param.length() - 1)); // strip the trailing '&' (or the '?' when there are no params)

// set request headers
for (Map.Entry<String, String> head : httpConf.getRequestHeaders().entrySet()) {
httpGet.addHeader(head.getKey(), head.getValue());
}


// execute the request
return httpClient.execute(httpGet);
}


private static HttpResponse doPost(CrawlMeta crawlMeta, CrawlHttpConf httpConf) throws Exception {
// HttpClient httpClient = HttpClients.createDefault();
SSLContextBuilder builder = new SSLContextBuilder();
// trust all certificates; skip identity verification
builder.loadTrustMaterial(null, (x509Certificates, s) -> true);
HttpClient httpClient = HttpClientBuilder.create().setSslcontext(builder.build()).build();

HttpPost httpPost = new HttpPost(crawlMeta.getUrl());


// build a list of NameValuePairs holding the parameters to send
List<NameValuePair> params = new ArrayList<>();
for (Map.Entry<String, Object> param : httpConf.getRequestParams().entrySet()) {
params.add(new BasicNameValuePair(param.getKey(), param.getValue().toString()));
}

httpPost.setEntity(new UrlEncodedFormEntity(params, HTTP.UTF_8));


// set request headers
for (Map.Entry<String, String> head : httpConf.getRequestHeaders().entrySet()) {
httpPost.addHeader(head.getKey(), head.getValue());
}

return httpClient.execute(httpPost);
}


}
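HttpUtils can also be driven directly, without a job wrapping it; a minimal sketch (again assuming CrawlMeta exposes a url setter):

```java
import com.quick.hui.crawler.core.entity.CrawlHttpConf;
import com.quick.hui.crawler.core.entity.CrawlMeta;
import com.quick.hui.crawler.core.utils.HttpUtils;
import org.apache.http.HttpResponse;
import org.apache.http.util.EntityUtils;

public class HttpUtilsDemo {

    public static void main(String[] args) throws Exception {
        CrawlMeta meta = new CrawlMeta();
        meta.setUrl("https://example.com/"); // assumed setter

        // default conf: GET with the built-in browser-like headers
        CrawlHttpConf conf = new CrawlHttpConf();

        HttpResponse response = HttpUtils.request(meta, conf);
        System.out.println(response.getStatusLine().getStatusCode());
        System.out.println(EntityUtils.toString(response.getEntity()));
    }
}
```

Worth flagging: loadTrustMaterial(null, (certs, s) -> true) disables certificate validation entirely, which is convenient for crawling sites with self-signed certificates but unsafe for anything security-sensitive. Also note that request(...) returns null for OPTIONS and PUT, since only GET and POST are implemented so far.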
