spider: prune_xpath parameter added (#684)
* Added prune_xpath to spider

* Updated prune_xpath handling in favor of params

* Moved prune code to process_links
felipehertzer authored Aug 30, 2024
1 parent ba43965 commit 7836a76
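
A minimal usage sketch of the new parameter; the homepage URL and the XPath expression below are illustrative, not taken from the commit:

    from trafilatura.spider import focused_crawler

    # Prune navigation and footer elements before link extraction,
    # so URLs found only inside them never enter the crawl frontier.
    to_visit, known_links = focused_crawler(
        "https://www.example.org",      # illustrative homepage
        max_seen_urls=10,
        prune_xpath="//nav|//footer",   # illustrative XPath expression
    )
    print(len(to_visit), len(known_links))
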
Showing 2 changed files with 37 additions and 4 deletions.
tests/spider_tests.py: 18 additions & 0 deletions

@@ -220,6 +220,24 @@ def test_crawl_page():
    ]
    assert params.i == 1 and params.is_on and params.known_num == 3

    # prune path
    spider.URL_STORE = UrlStore(compressed=False, strict=False)
    spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])
    params = spider.CrawlParameters(base_url, prune_xpath="//a")
    params = spider.crawl_page(params)
    todo = spider.URL_STORE.find_unvisited_urls(base_url)

    assert len(todo) == 0 and params.i == 1

    # prune path with initial page
    spider.URL_STORE = UrlStore(compressed=False, strict=False)
    spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])
    params = spider.CrawlParameters(base_url, prune_xpath="//a")
    params = spider.crawl_page(params, initial=True)
    todo = spider.URL_STORE.find_unvisited_urls(base_url)

    assert len(todo) == 0 and params.i == 1

    # initial page
    spider.URL_STORE = UrlStore(compressed=False, strict=False)
    spider.URL_STORE.add_urls(["https://httpbun.com/html"])
trafilatura/spider.py: 19 additions & 4 deletions

@@ -24,7 +24,9 @@
except ImportError:
    pass

from .core import baseline
from lxml.etree import XPath, tostring

from .core import baseline, prune_unwanted_nodes
from .downloads import Response, fetch_response, fetch_url
from .settings import DEFAULT_CONFIG
from .utils import LANGID_FLAG, decode_file, load_html
@@ -41,13 +43,14 @@

class CrawlParameters:
"Store necessary information to manage a focused crawl."
__slots__ = ["start", "base", "lang", "rules", "ref", "i", "known_num", "is_on"]
__slots__ = ["start", "base", "lang", "rules", "ref", "i", "known_num", "is_on", "prune_xpath"]

    def __init__(
        self,
        start: str,
        lang: Optional[str] = None,
        rules: Optional[RobotFileParser] = None,
        prune_xpath: Optional[str] = None,
    ) -> None:
        self.start: str = start
        self.base: str = self._get_base_url(start)
@@ -57,6 +60,7 @@ def __init__(
        self.i: int = 0
        self.known_num: int = 0
        self.is_on: bool = True
        self.prune_xpath: Optional[str] = prune_xpath

    def _get_base_url(self, start: str) -> str:
        "Set reference domain for the crawl."
@@ -200,6 +204,13 @@ def process_links(
    if not is_target_language(htmlstring, params.lang):
        return

    if htmlstring and params.prune_xpath is not None:
        if isinstance(params.prune_xpath, str):
            params.prune_xpath = [params.prune_xpath]
        tree = load_html(htmlstring)
        tree = prune_unwanted_nodes(tree, [XPath(x) for x in params.prune_xpath])
        htmlstring = tostring(tree).decode()

    links, links_priority = [], []
    for link in extract_links(
        pagecontent=htmlstring,
@@ -227,6 +238,7 @@ def process_response(
        return
    # add final document URL to known_links
    URL_STORE.add_urls([response.url], visited=True)

    # convert urllib3 response to string and proceed to link extraction
    process_links(decode_file(response.data), params, params.base)

@@ -237,10 +249,11 @@ def init_crawl(
    rules: Optional[RobotFileParser] = None,
    todo: Optional[List[str]] = None,
    known: Optional[List[str]] = None,
    prune_xpath: Optional[str] = None,
) -> CrawlParameters:
    """Initialize crawl by setting variables, copying values to the
    URL store and retrieving the initial page if the crawl starts."""
    params = CrawlParameters(start, lang, rules)
    params = CrawlParameters(start, lang, rules, prune_xpath)

    # todo: just known or also visited?
    URL_STORE.add_urls(urls=known or [], visited=True)
@@ -297,6 +310,7 @@ def focused_crawler(
    lang: Optional[str] = None,
    config: ConfigParser = DEFAULT_CONFIG,
    rules: Optional[RobotFileParser] = None,
    prune_xpath: Optional[str] = None,
) -> Tuple[List[str], List[str]]:
"""Basic crawler targeting pages of interest within a website.
Expand All @@ -309,13 +323,14 @@ def focused_crawler(
        lang: try to target links according to language heuristics.
        config: use a different configuration (configparser format).
        rules: provide politeness rules (urllib.robotparser.RobotFileParser() format).
        prune_xpath: remove unwanted elements from the HTML pages using XPath.
    Returns:
        List of pages to visit, deque format, possibly empty if there are no further pages to visit.
        Set of known links.
    """
    params = init_crawl(homepage, lang, rules, todo, known_links)
    params = init_crawl(homepage, lang, rules, todo, known_links, prune_xpath)

    sleep_time = URL_STORE.get_crawl_delay(
        params.base, default=config.getfloat("DEFAULT", "SLEEP_TIME")
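
For illustration, the pruning step added to process_links above is roughly equivalent to the following standalone lxml sketch; the helper name and the sample HTML are illustrative and not part of the commit:

    from lxml import html
    from lxml.etree import XPath, tostring

    def prune_html(htmlstring, prune_xpath):
        "Drop all nodes matching the XPath expression before links are extracted (illustrative helper)."
        tree = html.fromstring(htmlstring)
        for node in XPath(prune_xpath)(tree):
            parent = node.getparent()
            if parent is not None:
                parent.remove(node)
        return tostring(tree).decode()

    sample = "<html><body><nav><a href='/menu'>x</a></nav><p><a href='/article'>y</a></p></body></html>"
    print(prune_html(sample, "//nav"))
    # the <nav> block and its link are gone; only the in-text link remains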
