diff --git a/demo/pom.xml b/demo/pom.xml
index 986f31e..15e59e4 100644
--- a/demo/pom.xml
+++ b/demo/pom.xml
@@ -34,7 +34,7 @@
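<groupId>cn.wanghaomiao</groupId>
<artifactId>SeimiCrawler</artifactId>
- <version>1.3.0</version>
+ <version>1.3.1</version>
<groupId>net.paoding</groupId>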
diff --git a/project/pom.xml b/project/pom.xml
index d91bc45..d30c866 100644
--- a/project/pom.xml
+++ b/project/pom.xml
@@ -7,7 +7,7 @@
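<groupId>cn.wanghaomiao</groupId>
<artifactId>SeimiCrawler</artifactId>
- <version>1.3.0</version>
+ <version>1.3.1</version>
<modelVersion>4.0.0</modelVersion>
<packaging>jar</packaging>
<name>SeimiCrawler</name>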
diff --git a/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultLocalQueue.java b/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultLocalQueue.java
index 8d49c6d..a1f99d0 100644
--- a/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultLocalQueue.java
+++ b/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultLocalQueue.java
@@ -18,6 +18,7 @@
import cn.wanghaomiao.seimi.annotation.Queue;
import cn.wanghaomiao.seimi.core.SeimiQueue;
import cn.wanghaomiao.seimi.struct.Request;
+import cn.wanghaomiao.seimi.utils.GenericUtils;
import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -68,7 +69,7 @@ public long len(String crawlerName) {
@Override
public boolean isProcessed(Request req) {
ConcurrentSkipListSet set = getProcessedSet(req.getCrawlerName());
- String sign = DigestUtils.md5Hex(req.getUrl());
+ String sign = GenericUtils.signRequest(req); // MD5 of URL + key-sorted params, no longer the URL alone. NOTE(review): whatever adds entries to this set must sign the same way — confirm.
return set.contains(sign);
}
diff --git a/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultRedisQueue.java b/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultRedisQueue.java
index e6b20b4..a714b36 100644
--- a/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultRedisQueue.java
+++ b/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultRedisQueue.java
@@ -18,6 +18,7 @@
import cn.wanghaomiao.seimi.annotation.Queue;
import cn.wanghaomiao.seimi.core.SeimiQueue;
import cn.wanghaomiao.seimi.struct.Request;
+import cn.wanghaomiao.seimi.utils.GenericUtils;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
@@ -155,7 +156,7 @@ public boolean isProcessed(Request req) {
boolean res = false;
try {
jedis = getWClient();
- String sign = DigestUtils.md5Hex(req.getUrl());
+ String sign = GenericUtils.signRequest(req);
res = jedis.sismember(setNamePrefix +req.getCrawlerName(),sign);
}catch (Exception e){
logger.warn(e.getMessage());
diff --git a/project/src/main/java/cn/wanghaomiao/seimi/http/hc/HcRequestGenerator.java b/project/src/main/java/cn/wanghaomiao/seimi/http/hc/HcRequestGenerator.java
index 318999e..169582d 100644
--- a/project/src/main/java/cn/wanghaomiao/seimi/http/hc/HcRequestGenerator.java
+++ b/project/src/main/java/cn/wanghaomiao/seimi/http/hc/HcRequestGenerator.java
@@ -23,10 +23,16 @@
import cn.wanghaomiao.seimi.struct.Request;
import com.alibaba.fastjson.JSON;
import org.apache.commons.lang3.StringUtils;
+import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.RequestBuilder;
+import org.apache.http.message.BasicNameValuePair;
import org.springframework.util.CollectionUtils;
+import java.nio.charset.Charset;
+import java.util.LinkedList;
+import java.util.List;
import java.util.Map;
/**
@@ -43,39 +49,49 @@ public static RequestBuilder getHttpRequestBuilder(Request request, CrawlerModel
}
String seimiAgentUrl = "http://" + crawler.seimiAgentHost() + (crawler.seimiAgentPort() != 80 ? (":" + crawler.seimiAgentPort()) : "") + "/doload";
requestBuilder = RequestBuilder.post().setUri(seimiAgentUrl);
+ List nameValuePairList = new LinkedList<>();
requestBuilder.addParameter("url", request.getUrl());
if (StringUtils.isNotBlank(crawler.proxy())) {
- requestBuilder.addParameter("proxy", crawler.proxy());
+ nameValuePairList.add(new BasicNameValuePair("proxy", crawler.proxy()));
}
if (request.getSeimiAgentRenderTime() > 0) {
- requestBuilder.addParameter("renderTime", String.valueOf(request.getSeimiAgentRenderTime()));
+ nameValuePairList.add(new BasicNameValuePair("renderTime", String.valueOf(request.getSeimiAgentRenderTime())));
}
if (StringUtils.isNotBlank(request.getSeimiAgentScript())) {
- requestBuilder.addParameter("script", request.getSeimiAgentScript());
+ nameValuePairList.add(new BasicNameValuePair("script", request.getSeimiAgentScript()));
}
//如果针对SeimiAgent的请求设置是否使用cookie,以针对请求的设置为准,默认使用全局设置
if ((request.isSeimiAgentUseCookie() == null && crawlerModel.isUseCookie()) || (request.isSeimiAgentUseCookie() != null && request.isSeimiAgentUseCookie())) {
- requestBuilder.addParameter("useCookie", "1");
+ nameValuePairList.add(new BasicNameValuePair("useCookie", "1"));
}
if (request.getParams() != null && request.getParams().size() > 0) {
- requestBuilder.addParameter("postParam", JSON.toJSONString(request.getParams()));
+ nameValuePairList.add(new BasicNameValuePair("postParam", JSON.toJSONString(request.getParams())));
}
if (request.getSeimiAgentContentType().val() > SeimiAgentContentType.HTML.val()) {
- requestBuilder.addParameter("contentType", request.getSeimiAgentContentType().typeVal());
+ nameValuePairList.add(new BasicNameValuePair("contentType", request.getSeimiAgentContentType().typeVal()));
}
+ requestBuilder.setEntity(new UrlEncodedFormEntity(nameValuePairList, Charset.forName("utf8")));
} else {
if (HttpMethod.POST.equals(request.getHttpMethod())) {
requestBuilder = RequestBuilder.post().setUri(request.getUrl());
+ if (request.getParams() != null) {
+ List nameValuePairList = new LinkedList<>();
+ for (Map.Entry entry : request.getParams().entrySet()) {
+ nameValuePairList.add(new BasicNameValuePair(entry.getKey(),entry.getValue()));
+ }
+ requestBuilder.setEntity(new UrlEncodedFormEntity(nameValuePairList, Charset.forName("utf8")));
+ }
} else {
requestBuilder = RequestBuilder.get().setUri(request.getUrl());
+ if (request.getParams() != null) {
+ for (Map.Entry entry : request.getParams().entrySet()) {
+ requestBuilder.addParameter(entry.getKey(), entry.getValue());
+ }
+ }
}
RequestConfig config = RequestConfig.custom().setProxy(crawlerModel.getProxy()).setCircularRedirectsAllowed(true).build();
- if (request.getParams() != null) {
- for (Map.Entry entry : request.getParams().entrySet()) {
- requestBuilder.addParameter(entry.getKey(), entry.getValue());
- }
- }
+
requestBuilder.setConfig(config).setHeader("User-Agent", crawlerModel.isUseCookie() ? crawlerModel.getCurrentUA() : crawler.getUserAgent());
requestBuilder.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
requestBuilder.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6");
diff --git a/project/src/main/java/cn/wanghaomiao/seimi/utils/GenericUtils.java b/project/src/main/java/cn/wanghaomiao/seimi/utils/GenericUtils.java
index 6aaf8c4..2a51284 100644
--- a/project/src/main/java/cn/wanghaomiao/seimi/utils/GenericUtils.java
+++ b/project/src/main/java/cn/wanghaomiao/seimi/utils/GenericUtils.java
@@ -16,13 +16,20 @@
package cn.wanghaomiao.seimi.utils;
import cn.wanghaomiao.seimi.core.CastToNumber;
+import cn.wanghaomiao.seimi.struct.Request;
+import com.alibaba.fastjson.JSONObject;
+import org.apache.commons.codec.digest.DigestUtils;
import java.lang.reflect.Array;
import java.lang.reflect.GenericArrayType;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.math.BigDecimal;
+import java.util.Collections;
import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
import java.util.Map;
/**
@@ -137,4 +144,39 @@ public static boolean isNumber(Class cls){
public static Object castToNumber(Class cls,String val){
return numberClass.get(cls).castTo(val);
}
+
+    /**
+     * Renders request parameters as a JSON object string with the keys inserted
+     * in ascending natural order, so that equal parameter maps are intended to
+     * yield the same text whatever their own iteration order.
+     *
+     * @param params request parameters; may be {@code null}
+     * @return the key-sorted JSON text, or an empty string when {@code params}
+     *         is {@code null} or has no entries
+     */
+    public static String sortParams(Map<String, String> params){
+        // An empty map describes the same request as a missing one, so both must
+        // add nothing to the signature (an empty map used to contribute "{}").
+        if (params == null || params.isEmpty()){
+            return "";
+        }
+        List<String> keys = new LinkedList<>(params.keySet());
+        Collections.sort(keys);
+        // Insertion-ordered backing map, so the sorted order is what gets stored.
+        JSONObject data = new JSONObject(new LinkedHashMap<String, Object>());
+        for (String k : keys){
+            data.put(k, params.get(k));
+        }
+        return data.toJSONString();
+    }
+
+    /**
+     * Builds the signature used to recognise an already-processed request.
+     *
+     * @param request request to sign; its URL and parameters are read
+     * @return MD5 hex digest of the URL followed by the key-sorted parameters
+     */
+    public static String signRequest(Request request){
+        return DigestUtils.md5Hex(request.getUrl() + sortParams(request.getParams()));
+    }
}