-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
yihui
committed
Jun 27, 2017
1 parent
c7c40dd
commit a0843cd
Showing
6 changed files
with
242 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# quick-crawler | ||
> java实现的爬虫框架 | ||
从头开始, 一步一步的实现一个可用的爬虫框架,每个地方加一个里程碑的tag,主要用于记录这个工程的诞生过程 | ||
|
||
|
||
|
||
## tag 记录列表 | ||
|
||
### 1. [v0.001](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.001) | ||
|
||
> 实现了一个最简单,最基础的爬虫, 处于能用的阶段 | ||
|
||
### 2. [v0.002](https://github.com/liuyueyi/quick-crawler/releases/tag/v0.002) | ||
|
||
> 利用HttpClient来替代jdk的http请求;新增http参数配置 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
66 changes: 66 additions & 0 deletions
66
core/src/main/java/com/quick/hui/crawler/core/entity/CrawlHttpConf.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package com.quick.hui.crawler.core.entity; | ||
|
||
import lombok.Getter; | ||
import lombok.Setter; | ||
import lombok.ToString; | ||
|
||
import java.util.Collections; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
/** | ||
* http的相关配置 | ||
* | ||
* 1. 请求参数头 | ||
* 2. 返回的各项设置 | ||
* | ||
* Created by yihui on 2017/6/27. | ||
*/ | ||
@ToString | ||
public class CrawlHttpConf { | ||
|
||
private static Map<String, String> DEFAULT_HEADERS; | ||
|
||
static { | ||
DEFAULT_HEADERS = new HashMap<>(); | ||
DEFAULT_HEADERS.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); | ||
DEFAULT_HEADERS.put("connection", "Keep-Alive"); | ||
DEFAULT_HEADERS.put("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"); | ||
} | ||
|
||
|
||
public enum HttpMethod { | ||
GET, | ||
POST, | ||
OPTIONS, | ||
PUT; | ||
} | ||
|
||
|
||
@Getter | ||
@Setter | ||
private HttpMethod method = HttpMethod.GET; | ||
|
||
|
||
/** | ||
* 请求头 | ||
*/ | ||
@Setter | ||
private Map<String, String> requestHeaders; | ||
|
||
|
||
/** | ||
* 请求参数 | ||
*/ | ||
@Setter | ||
private Map<String, Object> requestParams; | ||
|
||
|
||
public Map<String, String> getRequestHeaders() { | ||
return requestHeaders == null ? DEFAULT_HEADERS : requestHeaders; | ||
} | ||
|
||
public Map<String, Object> getRequestParams() { | ||
return requestParams == null ? Collections.emptyMap() : requestParams; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
94 changes: 94 additions & 0 deletions
94
core/src/main/java/com/quick/hui/crawler/core/utils/HttpUtils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
package com.quick.hui.crawler.core.utils;

import com.quick.hui.crawler.core.entity.CrawlHttpConf;
import com.quick.hui.crawler.core.entity.CrawlMeta;

import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.ssl.SSLContextBuilder;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;

import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
|
||
/** | ||
* Created by yihui on 2017/6/27. | ||
*/ | ||
public class HttpUtils { | ||
|
||
public static HttpResponse request(CrawlMeta crawlMeta, CrawlHttpConf httpConf) throws Exception { | ||
switch (httpConf.getMethod()) { | ||
case GET: | ||
return doGet(crawlMeta, httpConf); | ||
case POST: | ||
return doPost(crawlMeta, httpConf); | ||
default: | ||
return null; | ||
} | ||
} | ||
|
||
|
||
private static HttpResponse doGet(CrawlMeta crawlMeta, CrawlHttpConf httpConf) throws Exception { | ||
// HttpClient httpClient = HttpClients.createDefault(); | ||
SSLContextBuilder builder = new SSLContextBuilder(); | ||
// 全部信任 不做身份鉴定 | ||
builder.loadTrustMaterial(null, (x509Certificates, s) -> true); | ||
HttpClient httpClient = HttpClientBuilder.create().setSslcontext(builder.build()).build(); | ||
|
||
// 设置请求参数 | ||
StringBuilder param = new StringBuilder(crawlMeta.getUrl()).append("?"); | ||
for (Map.Entry<String, Object> entry : httpConf.getRequestParams().entrySet()) { | ||
param.append(entry.getKey()) | ||
.append("=") | ||
.append(entry.getValue()) | ||
.append("&"); | ||
} | ||
|
||
HttpGet httpGet = new HttpGet(param.substring(0, param.length() - 1)); // 过滤掉最后一个无效字符 | ||
|
||
// 设置请求头 | ||
for (Map.Entry<String, String> head : httpConf.getRequestHeaders().entrySet()) { | ||
httpGet.addHeader(head.getKey(), head.getValue()); | ||
} | ||
|
||
|
||
// 执行网络请求 | ||
return httpClient.execute(httpGet); | ||
} | ||
|
||
|
||
private static HttpResponse doPost(CrawlMeta crawlMeta, CrawlHttpConf httpConf) throws Exception { | ||
// HttpClient httpClient = HttpClients.createDefault(); | ||
SSLContextBuilder builder = new SSLContextBuilder(); | ||
// 全部信任 不做身份鉴定 | ||
builder.loadTrustMaterial(null, (x509Certificates, s) -> true); | ||
HttpClient httpClient = HttpClientBuilder.create().setSslcontext(builder.build()).build(); | ||
|
||
HttpPost httpPost = new HttpPost(crawlMeta.getUrl()); | ||
|
||
|
||
// 建立一个NameValuePair数组,用于存储欲传送的参数 | ||
List<NameValuePair> params = new ArrayList<>(); | ||
for (Map.Entry<String, Object> param : httpConf.getRequestParams().entrySet()) { | ||
params.add(new BasicNameValuePair(param.getKey(), param.getValue().toString())); | ||
} | ||
|
||
httpPost.setEntity(new UrlEncodedFormEntity(params, HTTP.UTF_8)); | ||
|
||
|
||
// 设置请求头 | ||
for (Map.Entry<String, String> head : httpConf.getRequestHeaders().entrySet()) { | ||
httpPost.addHeader(head.getKey(), head.getValue()); | ||
} | ||
|
||
return httpClient.execute(httpPost); | ||
} | ||
|
||
|
||
} |