Skip to content

Commit

Permalink
table fix: enforce maximum number of header columns (#665)
Browse files Browse the repository at this point in the history
* table fix: maximum number of header columns

* fix syntax
  • Loading branch information
adbar authored Jul 31, 2024
1 parent c60395c commit c642ab6
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 10 deletions.
4 changes: 4 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,10 @@ def test_table_processing():
result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result is not None

+ htmlstring = '<html><body><article><table><tr><th colspan="9007199254740991">a</th><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
+ result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+ assert result is not None

# wrong span info
htmlstring = '<html><body><article><table><tr><td span="-1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
Expand Down
21 changes: 11 additions & 10 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,34 +299,35 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:


def process_element(element: _Element, returnlist: List[str], include_formatting: bool) -> None:
- # Process children recursively
- if element.text is not None:
+ "Recursively convert a LXML element and its children to a flattened string representation."
+ if element.text:
# this is the text that comes before the first child
returnlist.append(replace_element_text(element, include_formatting))

for child in element:
process_element(child, returnlist, include_formatting)

- if element.text is None and element.tail is None:
- if element.tag == 'graphic':
+ if not element.text and not element.tail:
+ if element.tag == "graphic":
# add source, default to ''
text = f'{element.get("title", "")} {element.get("alt", "")}'
- returnlist.extend(['![', text.strip(), ']', '(', element.get('src', ''), ')'])
+ returnlist.append(f'![{text.strip()}]({element.get("src", "")})')
# newlines for textless elements
elif element.tag in NEWLINE_ELEMS:
# add line after table head
if element.tag == "row":
cell_count = len(element.xpath(".//cell"))
- max_span = int(element.get("colspan") or element.get("span", 1))
+ # restrict columns to a maximum of 1000
+ max_span = min(int(element.get("colspan") or element.get("span", 1)), 1000)
# row ended so draw extra empty cells to match max_span
- if 0 < max_span < 1000 and cell_count < max_span:
+ if cell_count < max_span:
returnlist.append(f'{"|" * (max_span - cell_count)}\n')
# if this is a head row, draw the separator below
if element.xpath("./cell[@role='head']"):
returnlist.append(f'\n{"---|" * max_span}\n')
else:
- returnlist.append('\n')
- elif element.tag != 'cell':
+ returnlist.append("\n")
+ elif element.tag != "cell":
# cells still need to append vertical bars
# but nothing more to do with other textless elements
return
Expand All @@ -343,7 +344,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(" ")

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
- if element.tail is not None:
+ if element.tail:
returnlist.append(element.tail)


Expand Down

0 comments on commit c642ab6

Please sign in to comment.