From c642ab65e7385cf49804f458e4d1eb8902981878 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Wed, 31 Jul 2024 16:46:57 +0200 Subject: [PATCH] table fix: enforce maximum number of header columns (#665) * table fix: maximum number of header columns * fix syntax --- tests/unit_tests.py | 4 ++++ trafilatura/xml.py | 21 +++++++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index e880bad1..0c21e5c1 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1118,6 +1118,10 @@ def test_table_processing(): result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) assert result is not None + htmlstring = '
ab
cde
' + result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) + assert result is not None + # wrong span info htmlstring = '
ab
cde
' result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) diff --git a/trafilatura/xml.py b/trafilatura/xml.py index 9b413c16..513dc727 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -299,34 +299,35 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str: def process_element(element: _Element, returnlist: List[str], include_formatting: bool) -> None: - # Process children recursively - if element.text is not None: + "Recursively convert a LXML element and its children to a flattened string representation." + if element.text: # this is the text that comes before the first child returnlist.append(replace_element_text(element, include_formatting)) for child in element: process_element(child, returnlist, include_formatting) - if element.text is None and element.tail is None: - if element.tag == 'graphic': + if not element.text and not element.tail: + if element.tag == "graphic": # add source, default to '' text = f'{element.get("title", "")} {element.get("alt", "")}' - returnlist.extend(['![', text.strip(), ']', '(', element.get('src', ''), ')']) + returnlist.append(f'![{text.strip()}]({element.get("src", "")})') # newlines for textless elements elif element.tag in NEWLINE_ELEMS: # add line after table head if element.tag == "row": cell_count = len(element.xpath(".//cell")) - max_span = int(element.get("colspan") or element.get("span", 1)) + # restrict columns to a maximum of 1000 + max_span = min(int(element.get("colspan") or element.get("span", 1)), 1000) # row ended so draw extra empty cells to match max_span - if 0 < max_span < 1000 and cell_count < max_span: + if cell_count < max_span: returnlist.append(f'{"|" * (max_span - cell_count)}\n') # if this is a head row, draw the separator below if element.xpath("./cell[@role='head']"): returnlist.append(f'\n{"---|" * max_span}\n') else: - returnlist.append('\n') - elif element.tag != 'cell': + returnlist.append("\n") + elif element.tag != "cell": # cells still need to append vertical bars # but nothing more to do with other textless elements return @@ -343,7 +344,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting returnlist.append(" ") # this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS - if element.tail is not None: + if element.tail: returnlist.append(element.tail)