-
Notifications
You must be signed in to change notification settings - Fork 763
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #598 from kris-sigur/hostname-queues-with-limits
Hostname based queue assignment variants that optionally limit queue name length
- Loading branch information
Showing
2 changed files
with
155 additions
and
0 deletions.
There are no files selected for viewing
75 changes: 75 additions & 0 deletions
75
...e/src/main/java/org/archive/crawler/frontier/HostnameQueueAssignmentPolicyWithLimits.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
package org.archive.crawler.frontier; | ||
|
||
import org.archive.crawler.spring.SurtPrefixesSheetAssociation; | ||
import org.archive.net.UURI; | ||
|
||
/** | ||
* A variation on @link {@link HostnameQueueAssignmentPolicy} that allows the | ||
* operator (per sheet) to specify the maximum number of domains and sub-domains | ||
* to use for the queue name. | ||
* | ||
*/ | ||
public class HostnameQueueAssignmentPolicyWithLimits extends HostnameQueueAssignmentPolicy { | ||
private static final long serialVersionUID = 3L; | ||
|
||
public static final String LIMIT = "limit"; | ||
|
||
// Default limit to -1 (no limit enforced) | ||
{ | ||
setLimit(-1); | ||
} | ||
|
||
/** | ||
* Set the maximum number of domains and sub-domains to include in the queue | ||
* name. | ||
* <p> | ||
* E.g. if limit is set to <code>2</code> than the following assignments are | ||
* made: <br/> | ||
* <code>example.com -> example.com</code> <br/> | ||
* <code>www.example.com -> example.com</code> <br/> | ||
* <code>subdomain.example.com -> example.com</code> <br/> | ||
* <code>www.subdomain.example.com -> example.com</code> <br/> | ||
* <code>otherdomain.com -> otherdomain.com</code> <br/> | ||
* <p> | ||
* <strong>Note:</strong> No accommodation is made for TLDs, like | ||
* <code>.co.uk</code> that always use two levels. Operators should use use | ||
* {@link SurtPrefixesSheetAssociation} sheets to apply these limits | ||
* appropriately if crawling a mixture of TLDs with and without the mandatory | ||
* second level or only apply the limit on specific domains. | ||
* | ||
* @param limit The limit on number of domains to use in assigning a queue name | ||
* to a URI. | ||
*/ | ||
public void setLimit(int limit) { | ||
kp.put(LIMIT, limit); | ||
} | ||
|
||
public int getLimit() { | ||
return (Integer) kp.get(LIMIT); | ||
} | ||
|
||
@Override | ||
protected String getCoreKey(UURI basis) { | ||
int limit = (Integer) kp.get(LIMIT); | ||
return getLimitedHostname(super.getCoreKey(basis), limit); | ||
} | ||
|
||
protected String getLimitedHostname(String hostname, int limit) { | ||
if (limit <= 0) { | ||
return hostname; | ||
} | ||
|
||
String[] domains = hostname.split("\\."); | ||
if (limit >= domains.length) { | ||
return hostname; | ||
} | ||
// More domains are present than allowed. | ||
StringBuilder limitedHostname = new StringBuilder(); | ||
for (int i = domains.length - limit; i < domains.length - 1; i++) { | ||
limitedHostname.append(domains[i]); | ||
limitedHostname.append("."); | ||
} | ||
limitedHostname.append(domains[domains.length - 1]); | ||
return limitedHostname.toString(); | ||
} | ||
} |
80 changes: 80 additions & 0 deletions
80
.../main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicyWithLimits.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
package org.archive.crawler.frontier; | ||
|
||
import org.archive.crawler.spring.SurtPrefixesSheetAssociation; | ||
import org.archive.net.UURI; | ||
|
||
/** | ||
* A variation on @link {@link SurtAuthorityQueueAssignmentPolicy} that allows | ||
* the operator (per sheet) to specify the maximum number of SURT segments | ||
* to use for the queue name. | ||
* | ||
*/ | ||
public class SurtAuthorityQueueAssignmentPolicyWithLimits extends SurtAuthorityQueueAssignmentPolicy { | ||
private static final long serialVersionUID = 3L; | ||
|
||
public static final String LIMIT = "limit"; | ||
|
||
// Default limit to -1 (no limit enforced) | ||
{ | ||
setLimit(-1); | ||
} | ||
|
||
/** | ||
* Set the maximum number of surt segments to include in the queue name. | ||
* <p> | ||
* E.g. if limit is set to <code>2</code> than the following assignments are | ||
* made: <br/> | ||
* <code>com,example, -> com,example,</code> <br/> | ||
* <code>com,example,www, -> com,example,</code> <br/> | ||
* <code>com,example,subdomain, -> com,example,</code> <br/> | ||
* <code>com,example,subdomain,www, -> com,example,</code> <br/> | ||
* <code>com,otherdomain, -> com,otherdomain,</code> <br/> | ||
* <p> | ||
* <strong>Note:</strong> No accommodation is made for TLDs, like | ||
* <code>.co.uk</code> that always use two levels. Operators should use use | ||
* {@link SurtPrefixesSheetAssociation} sheets to apply these limits | ||
* appropriately if crawling a mixture of TLDs with and without the mandatory | ||
* second level or only apply the limit on specific domains. | ||
* | ||
* @param limit The limit on number of domains to use in assigning a queue name | ||
* to a URI. | ||
*/ | ||
public void setLimit(int limit) { | ||
kp.put(LIMIT, limit); | ||
} | ||
|
||
public int getLimit() { | ||
return (Integer) kp.get(LIMIT); | ||
} | ||
|
||
@Override | ||
protected String getCoreKey(UURI basis) { | ||
int limit = (Integer) kp.get(LIMIT); | ||
return getLimitedSurtAuthority(super.getCoreKey(basis), limit); | ||
} | ||
|
||
protected String getLimitedSurtAuthority(String surt, int limit) { | ||
if (limit <= 0) { | ||
return surt; | ||
} | ||
String domainPart = surt; | ||
String portPart = ""; | ||
int indexOfHash = surt.indexOf('#'); | ||
if (indexOfHash > -1) { | ||
domainPart = surt.substring(0, indexOfHash); | ||
portPart = surt.substring(indexOfHash); | ||
} | ||
String[] segments = domainPart.split(","); | ||
if (limit >= segments.length) { | ||
return surt; | ||
} | ||
// More domains are present than allowed. | ||
StringBuilder limitedSurt = new StringBuilder(); | ||
for (int i = 0; i < limit; i++) { | ||
limitedSurt.append(segments[i]); | ||
limitedSurt.append(","); | ||
} | ||
limitedSurt.append(portPart); | ||
return limitedSurt.toString(); | ||
} | ||
} |