added possibility to prune xPaths (#414)

* added possibility to prune XPaths * user should not have to specify no_fallback when custom pruning XPaths | fixed test * extensive prune XPath testing
adbar · Oct 4, 2023 · 7dd7347 · 7dd7347
1 parent 0fbc1cb
commit 7dd7347
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 4 deletions.
diff --git a/tests/filters_tests.py b/tests/filters_tests.py
@@ -14,7 +14,7 @@
 from lxml import etree, html
 
 import trafilatura.filters
-from trafilatura import extract
+from trafilatura import extract, bare_extraction
 from trafilatura.core import Extractor
 from trafilatura.filters import (check_html_lang, duplicate_test,
                                  language_filter)
@@ -147,6 +147,36 @@ def test_lrucache():
     assert lru_test.get('tralala') == -1
 
 
+def test_prune_xpath():
+    '''Test XPath pruning via the prune_xpath parameter of extract() (str or list of str).'''
+    # helpers building fresh example HTML trees (50 <p> elements, optionally preceded by <h1>/<h2>)
+    def doc():
+        my_p = '<p>abc</p>'
+        return html.fromstring('<html><body>' + my_p*50 + '</body></html>')
+
+    def doc2():
+        my_p = '<p>abc</p>'
+        my_h1 = '<h1>ABC</h1>'
+        return html.fromstring('<html><body>' + my_h1 + my_p*50 + '</body></html>')
+
+    def doc3():
+        my_p = '<p>abc</p>'
+        my_h1 = '<h1>ABC</h1>'
+        my_h2 = '<h2>42</h2>'
+        return html.fromstring('<html><body>' + my_h1 + my_h2 + my_p*50 + '</body></html>')
+
+    # pruned elements must not contribute to the extracted text
+    assert extract(doc(), prune_xpath='//p') == ''
+    assert extract(doc2(), prune_xpath='//p') == 'ABC'
+    assert extract(doc2(), prune_xpath=['//p', '//h1']) == ''
+    assert extract(doc3(), prune_xpath=['//p', '//h1']) == '42'
+    # sanity check: without pruning, each document yields non-empty output
+    assert extract(doc()) != ''
+    assert extract(doc2()) != ''
+    assert extract(doc3()) != ''
+
+
 if __name__ == '__main__':
     test_filters()
     test_lrucache()
+    test_prune_xpath()
diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -842,7 +842,8 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
                     date_extraction_params=None,
                     only_with_metadata=False, with_metadata=False,
                     max_tree_size=None, url_blacklist=None, author_blacklist=None,
-                    as_dict=True, config=DEFAULT_CONFIG):
+                    as_dict=True, prune_xpath=None,
+                    config=DEFAULT_CONFIG):
     """Internal function for text extraction returning bare Python variables.
 
     Args:
@@ -869,6 +870,8 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
         url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
         author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
         as_dict: Legacy option, return a dictionary instead of a class with attributes.
+        prune_xpath: Provide an XPath expression to prune the tree before extraction.
+            Can be a str or a list of str.
         config: Directly provide a configparser configuration.
 
     Returns:
@@ -930,6 +933,13 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
                             include_comments, include_formatting, include_links,
                             include_images, include_tables, deduplicate,
                             target_language)
+
+        # prune all nodes matching the XPath expressions specified by the user
+        # no backup as this is under full control of the user
+        if prune_xpath is not None:
+            if isinstance(prune_xpath, str):
+                prune_xpath = [prune_xpath]
+            tree = prune_unwanted_nodes(tree, prune_xpath)
 
         # backup (or not) for further processing
         tree_backup_1 = deepcopy(tree) if no_fallback is False else None
@@ -1022,7 +1032,8 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
             date_extraction_params=None,
             only_with_metadata=False, with_metadata=False,
             max_tree_size=None, url_blacklist=None, author_blacklist=None,
-            settingsfile=None, config=DEFAULT_CONFIG,
+            settingsfile=None, prune_xpath=None,
+            config=DEFAULT_CONFIG,
             **kwargs):
     """Main function exposed by the package:
        Wrapper for text extraction and conversion to chosen output format.
@@ -1052,6 +1063,8 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
         url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
         author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
         settingsfile: Use a configuration file to override the standard settings.
+        prune_xpath: Provide an XPath expression to prune the tree before extraction.
+            Can be a str or a list of str.
         config: Directly provide a configparser configuration.
 
     Returns:
@@ -1088,7 +1101,8 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
             only_with_metadata=only_with_metadata, with_metadata=with_metadata,
             max_tree_size=max_tree_size, url_blacklist=url_blacklist,
             author_blacklist=author_blacklist,
-            as_dict=False, config=config,
+            as_dict=False, prune_xpath=prune_xpath,
+            config=config,
         )
     except RuntimeError:
         LOGGER.error('Processing timeout for %s', url)