Skip to content

Commit

Permalink
Merge pull request #597 from kris-sigur/extractorhttp-implicit
Browse files Browse the repository at this point in the history
Add more general support for inferred path discovery
  • Loading branch information
kris-sigur authored Aug 8, 2024
2 parents 0faf338 + 778a57b commit cd3a424
Showing 1 changed file with 27 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@

package org.archive.modules.extractor;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.CrawlURI.FetchType;
Expand All @@ -39,15 +42,33 @@ public ExtractorHTTP() {
}

/** should all HTTP URIs be used to infer a link to the site's root? */
protected boolean inferRootPage = false;
protected boolean inferRootPage = false;
/**
 * Reports whether a link to the site's root ({@code "/"}) should be inferred
 * from HTTP URIs.
 *
 * @return the current value of the {@code inferRootPage} setting
 * @deprecated Use {@link #getInferPaths()} instead; the inferred-path list accepts
 *     arbitrary paths and can be overridden with sheets.
 */
@Deprecated
public boolean getInferRootPage() {
    return inferRootPage;
}
/**
 * Sets whether a link to the site's root ({@code "/"}) should be inferred
 * from HTTP URIs.
 *
 * @param inferRootPage {@code true} to infer a root-page link, {@code false} otherwise
 * @deprecated Use {@link #setInferPaths(List)} instead; the inferred-path list accepts
 *     arbitrary paths and can be overridden with sheets.
 */
@Deprecated
public void setInferRootPage(boolean inferRootPage) {
    this.inferRootPage = inferRootPage;
}


{
setInferPaths(new ArrayList<>());
}
/**
 * Returns the paths for which an inferred link is added during extraction.
 *
 * @return the value stored under the {@code "inferPaths"} key of {@code kp}
 */
@SuppressWarnings("unchecked")
public List<String> getInferPaths() {
    // kp hands back an untyped value; it is only ever written by
    // setInferPaths(List<String>) in this class, hence the unchecked cast.
    final Object configured = kp.get("inferPaths");
    return (List<String>) configured;
}
/**
 * Stores the paths for which an inferred link should be added during extraction.
 *
 * @param inferPaths the paths to store under the {@code "inferPaths"} key of {@code kp}
 */
public void setInferPaths(List<String> inferPaths) {
    final String key = "inferPaths";
    kp.put(key, inferPaths);
}

@Override
protected boolean shouldProcess(CrawlURI uri) {
if (uri.getFetchStatus() <= 0) {
Expand All @@ -67,9 +88,12 @@ protected void extract(CrawlURI curi) {

// try /favicon.ico for every HTTP(S) URI
addOutlink(curi, "/favicon.ico", LinkContext.INFERRED_MISC, Hop.INFERRED);
if(getInferRootPage()) {
if (getInferRootPage()) {
addOutlink(curi, "/", LinkContext.INFERRED_MISC, Hop.INFERRED);
}
for (String inferPath : getInferPaths()) {
addOutlink(curi, inferPath, LinkContext.INFERRED_MISC, Hop.INFERRED);
}
}

protected void addRefreshHeaderLink(CrawlURI curi, String headerKey) {
Expand Down

0 comments on commit cd3a424

Please sign in to comment.