Skip to content

Commit

Permalink
added possibility to prune xPaths (#414)
Browse files Browse the repository at this point in the history
* added possibility to prune xPaths

* user should not have to specify no_fallback when custom pruning x_paths | fixed test

* extensive prune xpath testing
  • Loading branch information
HeLehm authored Oct 4, 2023
1 parent 0fbc1cb commit 7dd7347
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 4 deletions.
32 changes: 31 additions & 1 deletion tests/filters_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from lxml import etree, html

import trafilatura.filters
from trafilatura import extract
from trafilatura import extract, bare_extraction
from trafilatura.core import Extractor
from trafilatura.filters import (check_html_lang, duplicate_test,
language_filter)
Expand Down Expand Up @@ -147,6 +147,36 @@ def test_lrucache():
assert lru_test.get('tralala') == -1


def test_prune_xpath():
    '''Check that nodes matching prune_xpath are dropped before extraction.

    Only extract() is exercised here, once with a single XPath string and
    once with a list of XPath strings.
    '''
    paragraphs = '<p>abc</p>' * 50
    heading_1 = '<h1>ABC</h1>'
    heading_2 = '<h2>42</h2>'

    def build_tree(*headings):
        # parse a fresh tree on every call so that no assertion receives
        # a tree that was already handed to an earlier extract() call
        markup = '<html><body>' + ''.join(headings) + paragraphs + '</body></html>'
        return html.fromstring(markup)

    # pruning with a single expression, then with a list of expressions
    result = extract(build_tree(), prune_xpath='//p')
    assert result == ''
    result = extract(build_tree(heading_1), prune_xpath='//p')
    assert result == 'ABC'
    result = extract(build_tree(heading_1), prune_xpath=['//p', '//h1'])
    assert result == ''
    result = extract(build_tree(heading_1, heading_2), prune_xpath=['//p', '//h1'])
    assert result == '42'

    # sanity check: without pruning every document yields some text
    assert extract(build_tree()) != ''
    assert extract(build_tree(heading_1)) != ''
    assert extract(build_tree(heading_1, heading_2)) != ''


if __name__ == '__main__':
    # when executed as a script, run the tests listed below in order
    for run_test in (test_filters, test_lrucache, test_prune_xpath):
        run_test()
20 changes: 17 additions & 3 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,8 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
date_extraction_params=None,
only_with_metadata=False, with_metadata=False,
max_tree_size=None, url_blacklist=None, author_blacklist=None,
as_dict=True, config=DEFAULT_CONFIG):
as_dict=True, prune_xpath=None,
config=DEFAULT_CONFIG):
"""Internal function for text extraction returning bare Python variables.
Args:
Expand All @@ -869,6 +870,8 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
as_dict: Legacy option, return a dictionary instead of a class with attributes.
prune_xpath: Provide an XPath expression to prune the tree before extraction.
Can be a str or a list of str.
config: Directly provide a configparser configuration.
Returns:
Expand Down Expand Up @@ -930,6 +933,13 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
include_comments, include_formatting, include_links,
include_images, include_tables, deduplicate,
target_language)

# prune all xpath expressions that user specified
# no backup as this is under full control of the user
if prune_xpath is not None:
if isinstance(prune_xpath, str):
prune_xpath = [prune_xpath]
tree = prune_unwanted_nodes(tree, prune_xpath)

# backup (or not) for further processing
tree_backup_1 = deepcopy(tree) if no_fallback is False else None
Expand Down Expand Up @@ -1022,7 +1032,8 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
date_extraction_params=None,
only_with_metadata=False, with_metadata=False,
max_tree_size=None, url_blacklist=None, author_blacklist=None,
settingsfile=None, config=DEFAULT_CONFIG,
settingsfile=None, prune_xpath=None,
config=DEFAULT_CONFIG,
**kwargs):
"""Main function exposed by the package:
Wrapper for text extraction and conversion to chosen output format.
Expand Down Expand Up @@ -1052,6 +1063,8 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
settingsfile: Use a configuration file to override the standard settings.
prune_xpath: Provide an XPath expression to prune the tree before extraction.
Can be a str or a list of str.
config: Directly provide a configparser configuration.
Returns:
Expand Down Expand Up @@ -1088,7 +1101,8 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
only_with_metadata=only_with_metadata, with_metadata=with_metadata,
max_tree_size=max_tree_size, url_blacklist=url_blacklist,
author_blacklist=author_blacklist,
as_dict=False, config=config,
as_dict=False, prune_xpath=prune_xpath,
config=config,
)
except RuntimeError:
LOGGER.error('Processing timeout for %s', url)
Expand Down

0 comments on commit 7dd7347

Please sign in to comment.