Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
zhegexiaohuozi committed Jan 11, 2017
2 parents d708349 + 1a63bc4 commit e1cbc25
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 15 deletions.
2 changes: 1 addition & 1 deletion demo/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>SeimiCrawler</artifactId>
<version>1.3.0</version>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>net.paoding</groupId>
Expand Down
2 changes: 1 addition & 1 deletion project/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
</parent>
<groupId>cn.wanghaomiao</groupId>
<artifactId>SeimiCrawler</artifactId>
<version>1.3.0</version>
<version>1.3.1</version>
<modelVersion>4.0.0</modelVersion>
<packaging>jar</packaging>
<name>SeimiCrawler</name>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import cn.wanghaomiao.seimi.annotation.Queue;
import cn.wanghaomiao.seimi.core.SeimiQueue;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.utils.GenericUtils;
import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -68,7 +69,7 @@ public long len(String crawlerName) {
/**
 * Reports whether the given request is already present in the processed set
 * of its crawler. The set is obtained from getProcessedSet using the
 * request's crawler name, and membership is tested with the request's
 * signature string.
 */
@Override
public boolean isProcessed(Request req) {
ConcurrentSkipListSet<String> set = getProcessedSet(req.getCrawlerName());
// NOTE(review): this diff view appears to list the line removed by the commit
// (MD5 of the URL only) directly above its replacement (signature computed by
// GenericUtils.signRequest) - confirm against the real file.
String sign = DigestUtils.md5Hex(req.getUrl());
String sign = GenericUtils.signRequest(req);
return set.contains(sign);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import cn.wanghaomiao.seimi.annotation.Queue;
import cn.wanghaomiao.seimi.core.SeimiQueue;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.utils.GenericUtils;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -155,7 +156,7 @@ public boolean isProcessed(Request req) {
boolean res = false;
try {
jedis = getWClient();
String sign = DigestUtils.md5Hex(req.getUrl());
String sign = GenericUtils.signRequest(req);
res = jedis.sismember(setNamePrefix +req.getCrawlerName(),sign);
}catch (Exception e){
logger.warn(e.getMessage());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,16 @@
import cn.wanghaomiao.seimi.struct.Request;
import com.alibaba.fastjson.JSON;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.springframework.util.CollectionUtils;

import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
Expand All @@ -43,39 +49,49 @@ public static RequestBuilder getHttpRequestBuilder(Request request, CrawlerModel
}
String seimiAgentUrl = "http://" + crawler.seimiAgentHost() + (crawler.seimiAgentPort() != 80 ? (":" + crawler.seimiAgentPort()) : "") + "/doload";
requestBuilder = RequestBuilder.post().setUri(seimiAgentUrl);
List<NameValuePair> nameValuePairList = new LinkedList<>();
requestBuilder.addParameter("url", request.getUrl());
if (StringUtils.isNotBlank(crawler.proxy())) {
requestBuilder.addParameter("proxy", crawler.proxy());
nameValuePairList.add(new BasicNameValuePair("proxy", crawler.proxy()));
}
if (request.getSeimiAgentRenderTime() > 0) {
requestBuilder.addParameter("renderTime", String.valueOf(request.getSeimiAgentRenderTime()));
nameValuePairList.add(new BasicNameValuePair("renderTime", String.valueOf(request.getSeimiAgentRenderTime())));
}
if (StringUtils.isNotBlank(request.getSeimiAgentScript())) {
requestBuilder.addParameter("script", request.getSeimiAgentScript());
nameValuePairList.add(new BasicNameValuePair("script", request.getSeimiAgentScript()));
}
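// If the request itself specifies whether SeimiAgent should use cookies, that per-request setting takes precedence; otherwise the global setting is used.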
if ((request.isSeimiAgentUseCookie() == null && crawlerModel.isUseCookie()) || (request.isSeimiAgentUseCookie() != null && request.isSeimiAgentUseCookie())) {
requestBuilder.addParameter("useCookie", "1");
nameValuePairList.add(new BasicNameValuePair("useCookie", "1"));
}
if (request.getParams() != null && request.getParams().size() > 0) {
requestBuilder.addParameter("postParam", JSON.toJSONString(request.getParams()));
nameValuePairList.add(new BasicNameValuePair("postParam", JSON.toJSONString(request.getParams())));
}
if (request.getSeimiAgentContentType().val() > SeimiAgentContentType.HTML.val()) {
requestBuilder.addParameter("contentType", request.getSeimiAgentContentType().typeVal());
nameValuePairList.add(new BasicNameValuePair("contentType", request.getSeimiAgentContentType().typeVal()));
}
requestBuilder.setEntity(new UrlEncodedFormEntity(nameValuePairList, Charset.forName("utf8")));
} else {
if (HttpMethod.POST.equals(request.getHttpMethod())) {
requestBuilder = RequestBuilder.post().setUri(request.getUrl());
if (request.getParams() != null) {
List<NameValuePair> nameValuePairList = new LinkedList<>();
for (Map.Entry<String, String> entry : request.getParams().entrySet()) {
nameValuePairList.add(new BasicNameValuePair(entry.getKey(),entry.getValue()));
}
requestBuilder.setEntity(new UrlEncodedFormEntity(nameValuePairList, Charset.forName("utf8")));
}
} else {
requestBuilder = RequestBuilder.get().setUri(request.getUrl());
if (request.getParams() != null) {
for (Map.Entry<String, String> entry : request.getParams().entrySet()) {
requestBuilder.addParameter(entry.getKey(), entry.getValue());
}
}
}
RequestConfig config = RequestConfig.custom().setProxy(crawlerModel.getProxy()).setCircularRedirectsAllowed(true).build();

if (request.getParams() != null) {
for (Map.Entry<String, String> entry : request.getParams().entrySet()) {
requestBuilder.addParameter(entry.getKey(), entry.getValue());
}
}

requestBuilder.setConfig(config).setHeader("User-Agent", crawlerModel.isUseCookie() ? crawlerModel.getCurrentUA() : crawler.getUserAgent());
requestBuilder.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
requestBuilder.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6");
Expand Down
24 changes: 24 additions & 0 deletions project/src/main/java/cn/wanghaomiao/seimi/utils/GenericUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,20 @@
package cn.wanghaomiao.seimi.utils;

import cn.wanghaomiao.seimi.core.CastToNumber;
import cn.wanghaomiao.seimi.struct.Request;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.codec.digest.DigestUtils;

import java.lang.reflect.Array;
import java.lang.reflect.GenericArrayType;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.math.BigDecimal;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
Expand Down Expand Up @@ -137,4 +144,21 @@ public static boolean isNumber(Class cls){
/**
 * Converts the text {@code val} into a value for the class {@code cls} by
 * looking up the entry stored under {@code cls} in {@code numberClass} and
 * delegating to that entry's {@code castTo} method.
 *
 * NOTE(review): {@code numberClass} is declared outside this view. If it holds
 * no entry for {@code cls}, the lookup presumably yields null and this call
 * fails with a NullPointerException - confirm, and check {@code isNumber(cls)}
 * first if so.
 *
 * @param cls class used as the lookup key for the converter
 * @param val text to convert
 * @return whatever the looked-up converter's {@code castTo} returns for {@code val}
 */
public static Object castToNumber(Class cls,String val){
return numberClass.get(cls).castTo(val);
}

/**
 * Renders a parameter map as a JSON object string whose keys are emitted in
 * ascending natural {@link String} order.
 *
 * @param params parameters to serialize; may be {@code null}
 * @return the JSON text, or an empty string when {@code params} is {@code null}
 */
public static String sortParams(Map<String,String> params){
    if (params == null){
        return "";
    }
    // Sort a copy of the keys so the caller's map is left untouched.
    List<String> orderedKeys = new LinkedList<>(params.keySet());
    Collections.sort(orderedKeys);
    // An insertion-ordered backing map makes the JSON follow the sorted key order.
    Map<String,Object> ordered = new LinkedHashMap<String,Object>();
    for (String key : orderedKeys){
        ordered.put(key, params.get(key));
    }
    return new JSONObject(ordered).toJSONString();
}

/**
 * Computes the signature string for a request: the MD5 hex digest of the
 * request URL immediately followed by the output of
 * {@link #sortParams(Map)} for the request's parameters.
 *
 * @param request request to sign
 * @return hexadecimal MD5 digest of the URL plus the sorted-parameter text
 */
public static String signRequest(Request request){
    String url = request.getUrl();
    String paramsPart = sortParams(request.getParams());
    String material = url + paramsPart;
    return DigestUtils.md5Hex(material);
}
}

0 comments on commit e1cbc25

Please sign in to comment.