From c642ab65e7385cf49804f458e4d1eb8902981878 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <adbar@users.noreply.github.com>
Date: Wed, 31 Jul 2024 16:46:57 +0200
Subject: [PATCH] table fix: enforce maximum number of header columns (#665)

* table fix: maximum number of header columns

* fix syntax
---
 tests/unit_tests.py |  4 ++++
 trafilatura/xml.py  | 21 +++++++++++----------
 2 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index e880bad1..0c21e5c1 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -1118,6 +1118,10 @@ def test_table_processing():
     result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
     assert result is not None
 
+    htmlstring = '<html><body><article><table><tr><th colspan="9007199254740991">a</th><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
+    result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+    assert result is not None
+
     # wrong span info
     htmlstring = '<html><body><article><table><tr><td span="-1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
diff --git a/trafilatura/xml.py b/trafilatura/xml.py
index 9b413c16..513dc727 100644
--- a/trafilatura/xml.py
+++ b/trafilatura/xml.py
@@ -299,34 +299,35 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
 
 
 def process_element(element: _Element, returnlist: List[str], include_formatting: bool) -> None:
-    # Process children recursively
-    if element.text is not None:
+    "Recursively convert an LXML element and its children to a flattened string representation."
+    if element.text:
         # this is the text that comes before the first child
         returnlist.append(replace_element_text(element, include_formatting))
 
     for child in element:
         process_element(child, returnlist, include_formatting)
 
-    if element.text is None and element.tail is None:
-        if element.tag == 'graphic':
+    if not element.text and not element.tail:
+        if element.tag == "graphic":
             # add source, default to ''
             text = f'{element.get("title", "")} {element.get("alt", "")}'
-            returnlist.extend(['![', text.strip(), ']', '(', element.get('src', ''), ')'])
+            returnlist.append(f'![{text.strip()}]({element.get("src", "")})')
         # newlines for textless elements
         elif element.tag in NEWLINE_ELEMS:
             # add line after table head
             if element.tag == "row":
                 cell_count = len(element.xpath(".//cell"))
-                max_span = int(element.get("colspan") or element.get("span", 1))
+                # restrict columns to a maximum of 1000
+                max_span = min(int(element.get("colspan") or element.get("span", 1)), 1000)
                 # row ended so draw extra empty cells to match max_span
-                if 0 < max_span < 1000 and cell_count < max_span:
+                if cell_count < max_span:
                     returnlist.append(f'{"|" * (max_span - cell_count)}\n')
                 # if this is a head row, draw the separator below
                 if element.xpath("./cell[@role='head']"):
                     returnlist.append(f'\n{"---|" * max_span}\n')
             else:
-                returnlist.append('\n')
-        elif element.tag != 'cell':
+                returnlist.append("\n")
+        elif element.tag != "cell":
             # cells still need to append vertical bars
             # but nothing more to do with other textless elements
             return
@@ -343,7 +344,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
         returnlist.append(" ")
 
     # this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
-    if element.tail is not None:
+    if element.tail:
         returnlist.append(element.tail)