spider: prune_xpath parameter added (#684)
* Added prune_xpath to spider

* Updated prune_xpath handling in favor of params

* Moved prune code to process_links
felipehertzer authored Aug 30, 2024
1 parent ba43965 commit 7836a76
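
A minimal usage sketch of the new parameter; the homepage URL and the XPath expression below are illustrative, not taken from the commit:

    from trafilatura.spider import focused_crawler

    # Prune navigation and footer elements before link extraction,
    # so URLs found only inside them never enter the crawl frontier.
    to_visit, known_links = focused_crawler(
        "https://www.example.org",      # illustrative homepage
        max_seen_urls=10,
        prune_xpath="//nav|//footer",   # illustrative XPath expression
    )
    print(len(to_visit), len(known_links))
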
Showing 2 changed files with 37 additions and 4 deletions.
tests/spider_tests.py: 18 additions & 0 deletions

@@ -220,6 +220,24 @@ def test_crawl_page():
    ]
    assert params.i == 1 and params.is_on and params.known_num == 3

    # prune path
    spider.URL_STORE = UrlStore(compressed=False, strict=False)
    spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])
    params = spider.CrawlParameters(base_url, prune_xpath="//a")
    params = spider.crawl_page(params)
    todo = spider.URL_STORE.find_unvisited_urls(base_url)

    assert len(todo) == 0 and params.i == 1

    # prune path with initial page
    spider.URL_STORE = UrlStore(compressed=False, strict=False)
    spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])
    params = spider.CrawlParameters(base_url, prune_xpath="//a")
    params = spider.crawl_page(params, initial=True)
    todo = spider.URL_STORE.find_unvisited_urls(base_url)

    assert len(todo) == 0 and params.i == 1

    # initial page
    spider.URL_STORE = UrlStore(compressed=False, strict=False)
    spider.URL_STORE.add_urls(["https://httpbun.com/html"])
trafilatura/spider.py: 19 additions & 4 deletions

@@ -24,7 +24,9 @@
except ImportError:
    pass

from .core import baseline
from lxml.etree import XPath, tostring

from .core import baseline, prune_unwanted_nodes
from .downloads import Response, fetch_response, fetch_url
from .settings import DEFAULT_CONFIG
from .utils import LANGID_FLAG, decode_file, load_html
@@ -41,13 +43,14 @@

class CrawlParameters:
"Store necessary information to manage a focused crawl."
__slots__ = ["start", "base", "lang", "rules", "ref", "i", "known_num", "is_on"]
__slots__ = ["start", "base", "lang", "rules", "ref", "i", "known_num", "is_on", "prune_xpath"]

    def __init__(
        self,
        start: str,
        lang: Optional[str] = None,
        rules: Optional[RobotFileParser] = None,
        prune_xpath: Optional[str] = None,
    ) -> None:
        self.start: str = start
        self.base: str = self._get_base_url(start)
@@ -57,6 +60,7 @@ def __init__(
        self.i: int = 0
        self.known_num: int = 0
        self.is_on: bool = True
        self.prune_xpath: Optional[str] = prune_xpath

    def _get_base_url(self, start: str) -> str:
        "Set reference domain for the crawl."
@@ -200,6 +204,13 @@ def process_links(
    if not is_target_language(htmlstring, params.lang):
        return

    if htmlstring and params.prune_xpath is not None:
        if isinstance(params.prune_xpath, str):
            params.prune_xpath = [params.prune_xpath]
        tree = load_html(htmlstring)
        tree = prune_unwanted_nodes(tree, [XPath(x) for x in params.prune_xpath])
        htmlstring = tostring(tree).decode()

    links, links_priority = [], []
    for link in extract_links(
        pagecontent=htmlstring,
@@ -227,6 +238,7 @@ def process_response(
        return
    # add final document URL to known_links
    URL_STORE.add_urls([response.url], visited=True)

    # convert urllib3 response to string and proceed to link extraction
    process_links(decode_file(response.data), params, params.base)

@@ -237,10 +249,11 @@ def init_crawl(
    rules: Optional[RobotFileParser] = None,
    todo: Optional[List[str]] = None,
    known: Optional[List[str]] = None,
    prune_xpath: Optional[str] = None,
) -> CrawlParameters:
    """Initialize crawl by setting variables, copying values to the
    URL store and retrieving the initial page if the crawl starts."""
    params = CrawlParameters(start, lang, rules)
    params = CrawlParameters(start, lang, rules, prune_xpath)

    # todo: just known or also visited?
    URL_STORE.add_urls(urls=known or [], visited=True)
@@ -297,6 +310,7 @@ def focused_crawler(
    lang: Optional[str] = None,
    config: ConfigParser = DEFAULT_CONFIG,
    rules: Optional[RobotFileParser] = None,
    prune_xpath: Optional[str] = None,
) -> Tuple[List[str], List[str]]:
"""Basic crawler targeting pages of interest within a website.
Expand All @@ -309,13 +323,14 @@ def focused_crawler(
        lang: try to target links according to language heuristics.
        config: use a different configuration (configparser format).
        rules: provide politeness rules (urllib.robotparser.RobotFileParser() format).
        prune_xpath: remove unwanted elements from the HTML pages using XPath.
    Returns:
        List of pages to visit, deque format, possibly empty if there are no further pages to visit.
        Set of known links.
    """
    params = init_crawl(homepage, lang, rules, todo, known_links)
    params = init_crawl(homepage, lang, rules, todo, known_links, prune_xpath)

    sleep_time = URL_STORE.get_crawl_delay(
        params.base, default=config.getfloat("DEFAULT", "SLEEP_TIME")
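
For illustration, the pruning step added to process_links above is roughly equivalent to the following standalone lxml sketch; the helper name and the sample HTML are illustrative and not part of the commit:

    from lxml import html
    from lxml.etree import XPath, tostring

    def prune_html(htmlstring, prune_xpath):
        "Drop all nodes matching the XPath expression before links are extracted (illustrative helper)."
        tree = html.fromstring(htmlstring)
        for node in XPath(prune_xpath)(tree):
            parent = node.getparent()
            if parent is not None:
                parent.remove(node)
        return tostring(tree).decode()

    sample = "<html><body><nav><a href='/menu'>x</a></nav><p><a href='/article'>y</a></p></body></html>"
    print(prune_html(sample, "//nav"))
    # the <nav> block and its link are gone; only the in-text link remains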
