diff --git a/engine/src/main/java/org/archive/crawler/frontier/HostnameQueueAssignmentPolicyWithLimits.java b/engine/src/main/java/org/archive/crawler/frontier/HostnameQueueAssignmentPolicyWithLimits.java
new file mode 100644
index 000000000..c65d38d9a
--- /dev/null
+++ b/engine/src/main/java/org/archive/crawler/frontier/HostnameQueueAssignmentPolicyWithLimits.java
@@ -0,0 +1,92 @@
+package org.archive.crawler.frontier;
+
+import org.archive.crawler.spring.SurtPrefixesSheetAssociation;
+import org.archive.net.UURI;
+
+/**
+ * A variation on {@link HostnameQueueAssignmentPolicy} that allows the
+ * operator (per sheet) to specify the maximum number of domains and sub-domains
+ * to use for the queue name.
+ */
+public class HostnameQueueAssignmentPolicyWithLimits extends HostnameQueueAssignmentPolicy {
+    private static final long serialVersionUID = 3L;
+
+    /** Name of the keyed property that holds the limit. */
+    public static final String LIMIT = "limit";
+
+    // Default limit to -1 (no limit enforced)
+    {
+        setLimit(-1);
+    }
+
+    /**
+     * Set the maximum number of domains and sub-domains to include in the queue
+     * name. A value of zero or less means no limit is enforced.
+     * <p>
+     * E.g. if limit is set to 2 then the following assignments are made:
+     * <pre>{@code
+     * example.com               -> example.com
+     * www.example.com           -> example.com
+     * subdomain.example.com     -> example.com
+     * www.subdomain.example.com -> example.com
+     * otherdomain.com           -> otherdomain.com
+     * }</pre>
+     * <p>
+     * <b>Note:</b> No accommodation is made for TLDs, like {@code .co.uk}, that
+     * always use two levels. Operators should use
+     * {@link SurtPrefixesSheetAssociation} sheets to apply these limits
+     * appropriately if crawling a mixture of TLDs with and without the mandatory
+     * second level or only apply the limit on specific domains.
+     *
+     * @param limit The limit on number of domains to use in assigning a queue name
+     *              to a URI.
+     */
+    public void setLimit(int limit) {
+        kp.put(LIMIT, limit);
+    }
+
+    /**
+     * @return the configured limit; zero or less means no limit is enforced
+     */
+    public int getLimit() {
+        return (Integer) kp.get(LIMIT);
+    }
+
+    /**
+     * Takes the key computed by the parent policy and trims it to the
+     * configured limit.
+     */
+    @Override
+    protected String getCoreKey(UURI basis) {
+        return getLimitedHostname(super.getCoreKey(basis), getLimit());
+    }
+
+    /**
+     * Keeps only the last {@code limit} dot-separated labels of a hostname.
+     * Whatever follows the final dot stays attached to the last label.
+     *
+     * @param hostname the key to trim; may be {@code null}
+     * @param limit    maximum number of labels to keep; zero or less keeps all
+     * @return the trimmed hostname, or {@code hostname} itself (possibly
+     *         {@code null}) when no trimming applies
+     */
+    protected String getLimitedHostname(String hostname, int limit) {
+        // NOTE(review): the parent key is presumed to be null for URIs without
+        // a usable authority -- confirm. Pass it through instead of throwing.
+        if (hostname == null || limit <= 0) {
+            return hostname;
+        }
+
+        String[] domains = hostname.split("\\.");
+        if (limit >= domains.length) {
+            return hostname;
+        }
+        // More domains are present than allowed: keep the rightmost ones.
+        StringBuilder limitedHostname = new StringBuilder();
+        for (int i = domains.length - limit; i < domains.length - 1; i++) {
+            limitedHostname.append(domains[i]).append('.');
+        }
+        limitedHostname.append(domains[domains.length - 1]);
+        return limitedHostname.toString();
+    }
+}
diff --git a/engine/src/main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicyWithLimits.java b/engine/src/main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicyWithLimits.java
new file mode 100644
index 000000000..67f62513c
--- /dev/null
+++ b/engine/src/main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicyWithLimits.java
@@ -0,0 +1,99 @@
+package org.archive.crawler.frontier;
+
+import org.archive.crawler.spring.SurtPrefixesSheetAssociation;
+import org.archive.net.UURI;
+
+/**
+ * A variation on {@link SurtAuthorityQueueAssignmentPolicy} that allows the
+ * operator (per sheet) to specify the maximum number of SURT segments to use
+ * for the queue name.
+ */
+public class SurtAuthorityQueueAssignmentPolicyWithLimits extends SurtAuthorityQueueAssignmentPolicy {
+    private static final long serialVersionUID = 3L;
+
+    /** Name of the keyed property that holds the limit. */
+    public static final String LIMIT = "limit";
+
+    // Default limit to -1 (no limit enforced)
+    {
+        setLimit(-1);
+    }
+
+    /**
+     * Set the maximum number of SURT segments to include in the queue name. A
+     * value of zero or less means no limit is enforced.
+     * <p>
+     * E.g. if limit is set to 2 then the following assignments are made:
+     * <pre>{@code
+     * com,example,               -> com,example,
+     * com,example,www,           -> com,example,
+     * com,example,subdomain,     -> com,example,
+     * com,example,subdomain,www, -> com,example,
+     * com,otherdomain,           -> com,otherdomain,
+     * }</pre>
+     * <p>
+     * <b>Note:</b> No accommodation is made for TLDs, like {@code .co.uk}, that
+     * always use two levels. Operators should use
+     * {@link SurtPrefixesSheetAssociation} sheets to apply these limits
+     * appropriately if crawling a mixture of TLDs with and without the mandatory
+     * second level or only apply the limit on specific domains.
+     *
+     * @param limit The limit on number of SURT segments to use in assigning a
+     *              queue name to a URI.
+     */
+    public void setLimit(int limit) {
+        kp.put(LIMIT, limit);
+    }
+
+    /**
+     * @return the configured limit; zero or less means no limit is enforced
+     */
+    public int getLimit() {
+        return (Integer) kp.get(LIMIT);
+    }
+
+    /**
+     * Takes the key computed by the parent policy and trims it to the
+     * configured limit.
+     */
+    @Override
+    protected String getCoreKey(UURI basis) {
+        return getLimitedSurtAuthority(super.getCoreKey(basis), getLimit());
+    }
+
+    /**
+     * Keeps only the first {@code limit} comma-separated segments of a SURT
+     * authority. A suffix starting at the first {@code '#'} is not counted as a
+     * segment and is re-attached after trimming.
+     *
+     * @param surt  the key to trim; may be {@code null}
+     * @param limit maximum number of segments to keep; zero or less keeps all
+     * @return the trimmed key, or {@code surt} itself (possibly {@code null})
+     *         when no trimming applies
+     */
+    protected String getLimitedSurtAuthority(String surt, int limit) {
+        // Pass a missing key through untouched instead of throwing.
+        if (surt == null || limit <= 0) {
+            return surt;
+        }
+        // Split off the '#...' suffix so it never counts towards the limit.
+        String domainPart = surt;
+        String portPart = "";
+        int indexOfHash = surt.indexOf('#');
+        if (indexOfHash > -1) {
+            domainPart = surt.substring(0, indexOfHash);
+            portPart = surt.substring(indexOfHash);
+        }
+        String[] segments = domainPart.split(",");
+        if (limit >= segments.length) {
+            return surt;
+        }
+        // More segments are present than allowed: keep the leftmost ones.
+        StringBuilder limitedSurt = new StringBuilder();
+        for (int i = 0; i < limit; i++) {
+            limitedSurt.append(segments[i]).append(',');
+        }
+        limitedSurt.append(portPart);
+        return limitedSurt.toString();
+    }
+}