diff --git a/demo/pom.xml b/demo/pom.xml index 986f31e..15e59e4 100644 --- a/demo/pom.xml +++ b/demo/pom.xml @@ -34,7 +34,7 @@ cn.wanghaomiao SeimiCrawler - 1.3.0 + 1.3.1 net.paoding diff --git a/project/pom.xml b/project/pom.xml index d91bc45..d30c866 100644 --- a/project/pom.xml +++ b/project/pom.xml @@ -7,7 +7,7 @@ cn.wanghaomiao SeimiCrawler - 1.3.0 + 1.3.1 4.0.0 jar SeimiCrawler diff --git a/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultLocalQueue.java b/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultLocalQueue.java index 8d49c6d..a1f99d0 100644 --- a/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultLocalQueue.java +++ b/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultLocalQueue.java @@ -18,6 +18,7 @@ import cn.wanghaomiao.seimi.annotation.Queue; import cn.wanghaomiao.seimi.core.SeimiQueue; import cn.wanghaomiao.seimi.struct.Request; +import cn.wanghaomiao.seimi.utils.GenericUtils; import org.apache.commons.codec.digest.DigestUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -68,7 +69,7 @@ public long len(String crawlerName) { @Override public boolean isProcessed(Request req) { ConcurrentSkipListSet set = getProcessedSet(req.getCrawlerName()); - String sign = DigestUtils.md5Hex(req.getUrl()); + String sign = GenericUtils.signRequest(req); return set.contains(sign); } diff --git a/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultRedisQueue.java b/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultRedisQueue.java index e6b20b4..a714b36 100644 --- a/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultRedisQueue.java +++ b/project/src/main/java/cn/wanghaomiao/seimi/def/DefaultRedisQueue.java @@ -18,6 +18,7 @@ import cn.wanghaomiao.seimi.annotation.Queue; import cn.wanghaomiao.seimi.core.SeimiQueue; import cn.wanghaomiao.seimi.struct.Request; +import cn.wanghaomiao.seimi.utils.GenericUtils; import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; @@ -155,7 +156,7 @@ public boolean isProcessed(Request req) { boolean res = false; try { jedis = getWClient(); - String sign = DigestUtils.md5Hex(req.getUrl()); + String sign = GenericUtils.signRequest(req); res = jedis.sismember(setNamePrefix +req.getCrawlerName(),sign); }catch (Exception e){ logger.warn(e.getMessage()); diff --git a/project/src/main/java/cn/wanghaomiao/seimi/http/hc/HcRequestGenerator.java b/project/src/main/java/cn/wanghaomiao/seimi/http/hc/HcRequestGenerator.java index 318999e..169582d 100644 --- a/project/src/main/java/cn/wanghaomiao/seimi/http/hc/HcRequestGenerator.java +++ b/project/src/main/java/cn/wanghaomiao/seimi/http/hc/HcRequestGenerator.java @@ -23,10 +23,16 @@ import cn.wanghaomiao.seimi.struct.Request; import com.alibaba.fastjson.JSON; import org.apache.commons.lang3.StringUtils; +import org.apache.http.NameValuePair; import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.message.BasicNameValuePair; import org.springframework.util.CollectionUtils; +import java.nio.charset.Charset; +import java.util.LinkedList; +import java.util.List; import java.util.Map; /** @@ -43,39 +49,49 @@ public static RequestBuilder getHttpRequestBuilder(Request request, CrawlerModel } String seimiAgentUrl = "http://" + crawler.seimiAgentHost() + (crawler.seimiAgentPort() != 80 ? (":" + crawler.seimiAgentPort()) : "") + "/doload"; requestBuilder = RequestBuilder.post().setUri(seimiAgentUrl); + List nameValuePairList = new LinkedList<>(); requestBuilder.addParameter("url", request.getUrl()); if (StringUtils.isNotBlank(crawler.proxy())) { - requestBuilder.addParameter("proxy", crawler.proxy()); + nameValuePairList.add(new BasicNameValuePair("proxy", crawler.proxy())); } if (request.getSeimiAgentRenderTime() > 0) { - requestBuilder.addParameter("renderTime", String.valueOf(request.getSeimiAgentRenderTime())); + nameValuePairList.add(new BasicNameValuePair("renderTime", String.valueOf(request.getSeimiAgentRenderTime()))); } if (StringUtils.isNotBlank(request.getSeimiAgentScript())) { - requestBuilder.addParameter("script", request.getSeimiAgentScript()); + nameValuePairList.add(new BasicNameValuePair("script", request.getSeimiAgentScript())); } //如果针对SeimiAgent的请求设置是否使用cookie,以针对请求的设置为准,默认使用全局设置 if ((request.isSeimiAgentUseCookie() == null && crawlerModel.isUseCookie()) || (request.isSeimiAgentUseCookie() != null && request.isSeimiAgentUseCookie())) { - requestBuilder.addParameter("useCookie", "1"); + nameValuePairList.add(new BasicNameValuePair("useCookie", "1")); } if (request.getParams() != null && request.getParams().size() > 0) { - requestBuilder.addParameter("postParam", JSON.toJSONString(request.getParams())); + nameValuePairList.add(new BasicNameValuePair("postParam", JSON.toJSONString(request.getParams()))); } if (request.getSeimiAgentContentType().val() > SeimiAgentContentType.HTML.val()) { - requestBuilder.addParameter("contentType", request.getSeimiAgentContentType().typeVal()); + nameValuePairList.add(new BasicNameValuePair("contentType", request.getSeimiAgentContentType().typeVal())); } + requestBuilder.setEntity(new UrlEncodedFormEntity(nameValuePairList, Charset.forName("utf8"))); } else { if (HttpMethod.POST.equals(request.getHttpMethod())) { requestBuilder = RequestBuilder.post().setUri(request.getUrl()); + if (request.getParams() != null) { + List nameValuePairList = new LinkedList<>(); + for (Map.Entry entry : request.getParams().entrySet()) { + nameValuePairList.add(new BasicNameValuePair(entry.getKey(),entry.getValue())); + } + requestBuilder.setEntity(new UrlEncodedFormEntity(nameValuePairList, Charset.forName("utf8"))); + } } else { requestBuilder = RequestBuilder.get().setUri(request.getUrl()); + if (request.getParams() != null) { + for (Map.Entry entry : request.getParams().entrySet()) { + requestBuilder.addParameter(entry.getKey(), entry.getValue()); + } + } } RequestConfig config = RequestConfig.custom().setProxy(crawlerModel.getProxy()).setCircularRedirectsAllowed(true).build(); - if (request.getParams() != null) { - for (Map.Entry entry : request.getParams().entrySet()) { - requestBuilder.addParameter(entry.getKey(), entry.getValue()); - } - } + requestBuilder.setConfig(config).setHeader("User-Agent", crawlerModel.isUseCookie() ? crawlerModel.getCurrentUA() : crawler.getUserAgent()); requestBuilder.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); requestBuilder.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6"); diff --git a/project/src/main/java/cn/wanghaomiao/seimi/utils/GenericUtils.java b/project/src/main/java/cn/wanghaomiao/seimi/utils/GenericUtils.java index 6aaf8c4..2a51284 100644 --- a/project/src/main/java/cn/wanghaomiao/seimi/utils/GenericUtils.java +++ b/project/src/main/java/cn/wanghaomiao/seimi/utils/GenericUtils.java @@ -16,13 +16,20 @@ package cn.wanghaomiao.seimi.utils; import cn.wanghaomiao.seimi.core.CastToNumber; +import cn.wanghaomiao.seimi.struct.Request; +import com.alibaba.fastjson.JSONObject; +import org.apache.commons.codec.digest.DigestUtils; import java.lang.reflect.Array; import java.lang.reflect.GenericArrayType; import java.lang.reflect.ParameterizedType; import java.lang.reflect.Type; import java.math.BigDecimal; +import java.util.Collections; import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; import java.util.Map; /** @@ -137,4 +144,21 @@ public static boolean isNumber(Class cls){ public static Object castToNumber(Class cls,String val){ return numberClass.get(cls).castTo(val); } + + public static String sortParams(Map params){ + if (params == null){ + return ""; + } + JSONObject data = new JSONObject(new LinkedHashMap()); + List keys = new LinkedList<>(params.keySet()); + Collections.sort(keys); + for (String k :keys){ + data.put(k,params.get(k)); + } + return data.toJSONString(); + } + + public static String signRequest(Request request){ + return DigestUtils.md5Hex(request.getUrl()+sortParams(request.getParams())); + } }