Added prune xpath to spider #684

Merged: 3 commits, Aug 30, 2024
18 changes: 18 additions & 0 deletions tests/spider_tests.py
@@ -220,6 +220,24 @@ def test_crawl_page():
]
assert params.i == 1 and params.is_on and params.known_num == 3

# prune path
spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])
params = spider.CrawlParameters(base_url, prune_xpath="//a")
params = spider.crawl_page(params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)

assert len(todo) == 0 and params.i == 1

# prune path with initial page
spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])
params = spider.CrawlParameters(base_url, prune_xpath="//a")
params = spider.crawl_page(params, initial=True)
todo = spider.URL_STORE.find_unvisited_urls(base_url)

assert len(todo) == 0 and params.i == 1

# initial page
spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(["https://httpbun.com/html"])
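A minimal stand-alone sketch of what the two new prune-xpath test cases above exercise. It assumes that base_url resolves to "https://httpbun.com" and that UrlStore is imported from courlan as elsewhere in the test module; neither detail is shown in this diff.

# hypothetical example mirroring the "prune path" test cases above
from courlan import UrlStore          # assumed import, as in the test module
from trafilatura import spider

base_url = "https://httpbun.com"      # assumed value of base_url in the tests

spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])

# pruning "//a" removes every anchor element before link extraction,
# so the crawl visits the page but queues no further URLs
params = spider.CrawlParameters(base_url, prune_xpath="//a")
params = spider.crawl_page(params)

assert not spider.URL_STORE.find_unvisited_urls(base_url)
assert params.i == 1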
23 changes: 19 additions & 4 deletions trafilatura/spider.py
@@ -24,7 +24,9 @@
except ImportError:
pass

from .core import baseline
from lxml.etree import XPath, tostring

from .core import baseline, prune_unwanted_nodes
from .downloads import Response, fetch_response, fetch_url
from .settings import DEFAULT_CONFIG
from .utils import LANGID_FLAG, decode_file, load_html
@@ -41,13 +43,14 @@

class CrawlParameters:
"Store necessary information to manage a focused crawl."
__slots__ = ["start", "base", "lang", "rules", "ref", "i", "known_num", "is_on"]
__slots__ = ["start", "base", "lang", "rules", "ref", "i", "known_num", "is_on", "prune_xpath"]

def __init__(
self,
start: str,
lang: Optional[str] = None,
rules: Optional[RobotFileParser] = None,
prune_xpath: Optional[str] = None,
) -> None:
self.start: str = start
self.base: str = self._get_base_url(start)
@@ -57,6 +60,7 @@ def __init__(
self.i: int = 0
self.known_num: int = 0
self.is_on: bool = True
self.prune_xpath: Optional[str] = prune_xpath

def _get_base_url(self, start: str) -> str:
"Set reference domain for the crawl."
@@ -200,6 +204,13 @@ def process_links(
if not is_target_language(htmlstring, params.lang):
return

if htmlstring and params.prune_xpath is not None:
if isinstance(params.prune_xpath, str):
params.prune_xpath = [params.prune_xpath]
tree = load_html(htmlstring)
tree = prune_unwanted_nodes(tree, [XPath(x) for x in params.prune_xpath])
htmlstring = tostring(tree).decode()

links, links_priority = [], []
for link in extract_links(
pagecontent=htmlstring,
Expand Down Expand Up @@ -227,6 +238,7 @@ def process_response(
return
# add final document URL to known_links
URL_STORE.add_urls([response.url], visited=True)

# convert urllib3 response to string and proceed to link extraction
process_links(decode_file(response.data), params, params.base)

@@ -237,10 +249,11 @@ def init_crawl(
rules: Optional[RobotFileParser] = None,
todo: Optional[List[str]] = None,
known: Optional[List[str]] = None,
prune_xpath: Optional[str] = None,
) -> CrawlParameters:
"""Initialize crawl by setting variables, copying values to the
URL store and retrieving the initial page if the crawl starts."""
params = CrawlParameters(start, lang, rules)
params = CrawlParameters(start, lang, rules, prune_xpath)

# todo: just known or also visited?
URL_STORE.add_urls(urls=known or [], visited=True)
@@ -297,6 +310,7 @@ def focused_crawler(
lang: Optional[str] = None,
config: ConfigParser = DEFAULT_CONFIG,
rules: Optional[RobotFileParser] = None,
prune_xpath: Optional[str] = None,
) -> Tuple[List[str], List[str]]:
"""Basic crawler targeting pages of interest within a website.

@@ -309,13 +323,14 @@
lang: try to target links according to language heuristics.
config: use a different configuration (configparser format).
rules: provide politeness rules (urllib.robotparser.RobotFileParser() format).
prune_xpath: remove unwanted elements from the HTML pages using XPath.

Returns:
List of pages to visit, deque format, possibly empty if there are no further pages to visit.
Set of known links.

"""
params = init_crawl(homepage, lang, rules, todo, known_links)
params = init_crawl(homepage, lang, rules, todo, known_links, prune_xpath)

sleep_time = URL_STORE.get_crawl_delay(
params.base, default=config.getfloat("DEFAULT", "SLEEP_TIME")
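Overall, the pull request threads a new prune_xpath argument from focused_crawler() through init_crawl() into CrawlParameters, and process_links() applies it via prune_unwanted_nodes() before links are extracted. A minimal usage sketch based only on the signatures shown above; the homepage and the XPath expression are illustrative placeholders, not values taken from the pull request.

from trafilatura.spider import focused_crawler

# illustrative values only: any start page and any XPath matching unwanted markup
to_visit, known_links = focused_crawler(
    "https://www.example.org",
    prune_xpath="//nav",   # drop navigation menus before extracting links
)
print(len(to_visit), len(known_links))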