diff --git a/engine/src/main/java/org/archive/crawler/frontier/HostnameQueueAssignmentPolicyWithLimits.java b/engine/src/main/java/org/archive/crawler/frontier/HostnameQueueAssignmentPolicyWithLimits.java new file mode 100644 index 000000000..c65d38d9a --- /dev/null +++ b/engine/src/main/java/org/archive/crawler/frontier/HostnameQueueAssignmentPolicyWithLimits.java @@ -0,0 +1,75 @@ +package org.archive.crawler.frontier; + +import org.archive.crawler.spring.SurtPrefixesSheetAssociation; +import org.archive.net.UURI; + +/** + * A variation on @link {@link HostnameQueueAssignmentPolicy} that allows the + * operator (per sheet) to specify the maximum number of domains and sub-domains + * to use for the queue name. + * + */ +public class HostnameQueueAssignmentPolicyWithLimits extends HostnameQueueAssignmentPolicy { + private static final long serialVersionUID = 3L; + + public static final String LIMIT = "limit"; + + // Default limit to -1 (no limit enforced) + { + setLimit(-1); + } + + /** + * Set the maximum number of domains and sub-domains to include in the queue + * name. + *
+ * E.g. if limit is set to {@code 2} then the following assignments are
+ * made:
+ * example.com -> example.com
+ * www.example.com -> example.com
+ * subdomain.example.com -> example.com
+ * www.subdomain.example.com -> example.com
+ * otherdomain.com -> otherdomain.com
+ *
+ * Note: No accommodation is made for TLDs, like {@code .co.uk}, that always
+ * use two levels. Operators should use
+ * {@link SurtPrefixesSheetAssociation} sheets to apply these limits
+ * appropriately if crawling a mixture of TLDs with and without the mandatory
+ * second level or only apply the limit on specific domains.
+ *
+ * @param limit The limit on number of domains to use in assigning a queue name
+ * to a URI.
+ */
+ public void setLimit(int limit) {
+     // Box explicitly; the value is kept under the LIMIT key of kp, which is
+     // where getLimit() and getCoreKey() read it back from.
+     final Integer boxedLimit = Integer.valueOf(limit);
+     kp.put(LIMIT, boxedLimit);
+ }
+
+ /**
+  * Returns the limit currently stored under the {@link #LIMIT} key.
+  *
+  * @return the stored limit; a value of zero or less means the queue name
+  *         is not shortened
+  */
+ public int getLimit() {
+     final Integer stored = (Integer) kp.get(LIMIT);
+     return stored.intValue();
+ }
+
+ /**
+  * Computes the queue key for a URI by taking the key produced by the
+  * superclass and shortening it to the configured number of domain levels.
+  *
+  * @param basis the URI a queue name is being assigned to
+  * @return the superclass key after it has been passed through
+  *         {@link #getLimitedHostname(String, int)}
+  */
+ @Override
+ protected String getCoreKey(UURI basis) {
+     // Read the limit first, then ask the superclass for the full key.
+     final int maxLevels = ((Integer) kp.get(LIMIT)).intValue();
+     final String fullKey = super.getCoreKey(basis);
+     return getLimitedHostname(fullKey, maxLevels);
+ }
+
+ /**
+  * Shortens a host-based queue key to at most {@code limit} dot-separated
+  * labels, keeping the right-most ones (e.g. {@code www.example.com} with a
+  * limit of 2 becomes {@code example.com}). Anything that follows the last
+  * dot stays attached to the final label.
+  * <p>
+  * The key is returned unchanged when it is {@code null}, when
+  * {@code limit} is zero or negative, when it already has no more than
+  * {@code limit} labels, or when it is a dotted-decimal IPv4 address
+  * (cutting an address down to its last octets would merge unrelated hosts
+  * into one queue).
+  *
+  * @param hostname the full key, possibly {@code null}
+  * @param limit    maximum number of labels to keep; zero or less disables
+  *                 shortening
+  * @return the shortened key, or {@code hostname} itself when no shortening
+  *         applies
+  */
+ protected String getLimitedHostname(String hostname, int limit) {
+     // Guard null too: previously a null key passed through untouched while
+     // the limit was disabled but threw as soon as a limit was configured.
+     if (hostname == null || limit <= 0) {
+         return hostname;
+     }
+
+     String[] domains = hostname.split("\\.");
+     if (limit >= domains.length || isIpv4Literal(domains)) {
+         return hostname;
+     }
+     // More domains are present than allowed; keep only the last 'limit'.
+     int firstKept = domains.length - limit;
+     StringBuilder limitedHostname = new StringBuilder();
+     for (int i = firstKept; i < domains.length; i++) {
+         if (i > firstKept) {
+             limitedHostname.append('.');
+         }
+         limitedHostname.append(domains[i]);
+     }
+     return limitedHostname.toString();
+ }
+
+ /**
+  * Tells whether the given labels spell a dotted-decimal IPv4 address:
+  * exactly four labels of one to three ASCII digits each. A trailing
+  * {@code #...} suffix on the last label is ignored.
+  */
+ private static boolean isIpv4Literal(String[] labels) {
+     if (labels.length != 4) {
+         return false;
+     }
+     for (int i = 0; i < labels.length; i++) {
+         String label = labels[i];
+         if (i == labels.length - 1) {
+             // NOTE(review): presumably '#' introduces a port here, as it does
+             // in the SURT-based sibling policy; confirm against the superclass.
+             int hash = label.indexOf('#');
+             if (hash >= 0) {
+                 label = label.substring(0, hash);
+             }
+         }
+         if (label.isEmpty() || label.length() > 3) {
+             return false;
+         }
+         for (int c = 0; c < label.length(); c++) {
+             char ch = label.charAt(c);
+             if (ch < '0' || ch > '9') {
+                 return false;
+             }
+         }
+     }
+     return true;
+ }
+}
diff --git a/engine/src/main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicyWithLimits.java b/engine/src/main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicyWithLimits.java
new file mode 100644
index 000000000..67f62513c
--- /dev/null
+++ b/engine/src/main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicyWithLimits.java
@@ -0,0 +1,80 @@
+package org.archive.crawler.frontier;
+
+import org.archive.crawler.spring.SurtPrefixesSheetAssociation;
+import org.archive.net.UURI;
+
+/**
+ * A variation on {@link SurtAuthorityQueueAssignmentPolicy} that allows
+ * the operator (per sheet) to specify the maximum number of SURT segments
+ * to use for the queue name.
+ */
+public class SurtAuthorityQueueAssignmentPolicyWithLimits extends SurtAuthorityQueueAssignmentPolicy {
+    private static final long serialVersionUID = 3L;
+
+    /** Key under which the segment limit is stored in {@code kp}. */
+    public static final String LIMIT = "limit";
+
+    // Default limit to -1 (no limit enforced)
+    {
+        setLimit(-1);
+    }
+
+    /**
+     * Set the maximum number of SURT segments to include in the queue name.
+     * <p>
+     * E.g. if limit is set to {@code 2} then the following assignments are
+     * made:
+     * <pre>
+     * com,example,               -&gt; com,example,
+     * com,example,www,           -&gt; com,example,
+     * com,example,subdomain,     -&gt; com,example,
+     * com,example,subdomain,www, -&gt; com,example,
+     * com,otherdomain,           -&gt; com,otherdomain,
+     * </pre>
+     * Note: No accommodation is made for TLDs, like {@code .co.uk}, that
+     * always use two levels. Operators should use
+     * {@link SurtPrefixesSheetAssociation} sheets to apply these limits
+     * appropriately if crawling a mixture of TLDs with and without the
+     * mandatory second level or only apply the limit on specific domains.
+     *
+     * @param limit The maximum number of SURT segments to use in assigning a
+     *              queue name to a URI; zero or a negative value disables
+     *              the limit.
+     */
+    public void setLimit(int limit) {
+        kp.put(LIMIT, limit);
+    }
+
+    /**
+     * Returns the limit currently stored under the {@link #LIMIT} key.
+     *
+     * @return the stored limit; zero or a negative value means no limit is
+     *         enforced
+     */
+    public int getLimit() {
+        return (Integer) kp.get(LIMIT);
+    }
+
+    /**
+     * Computes the queue key for a URI by taking the key produced by the
+     * superclass and shortening it to the configured number of SURT segments.
+     *
+     * @param basis the URI a queue name is being assigned to
+     * @return the superclass key after it has been passed through
+     *         {@link #getLimitedSurtAuthority(String, int)}
+     */
+    @Override
+    protected String getCoreKey(UURI basis) {
+        // Go through the accessor so the limit is looked up in one place only.
+        int limit = getLimit();
+        return getLimitedSurtAuthority(super.getCoreKey(basis), limit);
+    }
+
+    /**
+     * Shortens a SURT authority key to its first {@code limit}
+     * comma-separated segments. A part starting at the first {@code '#'}
+     * (the port part) is split off before counting and re-appended to the
+     * shortened key.
+     * <p>
+     * The key is returned unchanged when it is {@code null}, when
+     * {@code limit} is zero or negative, or when it has no more than
+     * {@code limit} segments.
+     *
+     * @param surt  the full key, possibly {@code null}
+     * @param limit maximum number of segments to keep; zero or less disables
+     *              shortening
+     * @return the shortened key, or {@code surt} itself when no shortening
+     *         applies
+     */
+    protected String getLimitedSurtAuthority(String surt, int limit) {
+        // Guard null too: previously a null key passed through untouched while
+        // the limit was disabled but threw as soon as a limit was configured.
+        if (surt == null || limit <= 0) {
+            return surt;
+        }
+        // Separate the port part so it is neither counted nor dropped.
+        String domainPart = surt;
+        String portPart = "";
+        int indexOfHash = surt.indexOf('#');
+        if (indexOfHash > -1) {
+            domainPart = surt.substring(0, indexOfHash);
+            portPart = surt.substring(indexOfHash);
+        }
+        String[] segments = domainPart.split(",");
+        if (limit >= segments.length) {
+            return surt;
+        }
+        // More segments are present than allowed; keep only the first 'limit',
+        // each followed by its comma as in the unshortened form.
+        StringBuilder limitedSurt = new StringBuilder();
+        for (int i = 0; i < limit; i++) {
+            limitedSurt.append(segments[i]).append(',');
+        }
+        return limitedSurt.append(portPart).toString();
+    }
+}