Skip to content

Commit

Permalink
improve support for code elements (#401)
Browse files Browse the repository at this point in the history
* pip code snippets support

Add support for inline and multi-line code snippets as rendered on pip (PyPI) project pages
Example: https://pypi.org/project/openai-function-call/

* hljs code snippets support

Add support for inline and multi-line code snippets highlighted by hljs (highlight.js)
Example: https://medium.com/@jxnlco/seamless-integration-with-openai-and-pydantic-a-powerful-duo-for-output-parsing-fcb1e616167b

* medium ssr code snippets support

When Medium pages are fetched without JavaScript enabled, code snippets are structured as `<pre><span>code</span></pre>`
Example: https://medium.com/@jxnlco/seamless-integration-with-openai-and-pydantic-a-powerful-duo-for-output-parsing-fcb1e616167b

* switch line break to <lb/>

Use `<lb/>` instead of `\n` for code snippets

* set child as done

* keep code formatting

* increase test coverage

* move code parts and adapt tests

* fix merge conflict and clean up

* fix XPath issue

* cleanup

---------

Co-authored-by: Adrien Barbaresi <[email protected]>
Co-authored-by: Adrien Barbaresi <[email protected]>
  • Loading branch information
3 people authored Sep 5, 2023
1 parent 982ec1d commit f101371
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 28 deletions.
35 changes: 26 additions & 9 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,7 +1060,6 @@ def test_code_blocks():
<span class="hljs-keyword">highlighted</span> more <span class="hljs-keyword">code</span>
</code></pre>
</div>'''
''
testresult = extract(highlightjs, config=ZERO_CONFIG, output_format='xml')
assert '<code>code\nhighlighted more code\n</code>' in testresult and 'quote' not in testresult
github = '''<div class="highlight highlight-source-shell notranslate position-relative overflow-auto" dir="auto"><pre>$ pip install PyGithub</pre><div class="zeroclipboard-container position-absolute right-0 top-0">
Expand Down Expand Up @@ -1090,14 +1089,32 @@ def test_code_blocks():
</div>'''
testresult = extract(w3schools, config=ZERO_CONFIG, output_format='xml')
expected = '''<code>
class Person:
def __init__(self, name, age):
self.name = name
self.age = age
p1 = Person("John",
36)
print(p1.name)
print(p1.age) </code>'''
class Person:<lb/> def __init__(self, name, age):<lb/>
self.name = name<lb/> self.age = age<lb/><lb/>p1 = Person("John",
36)<lb/>
<lb/>print(p1.name)<lb/>print(p1.age) </code>'''
assert expected in testresult and 'quote' not in testresult
pip = '''<div><p>Code:</p>
<pre lang="python3"><span class="kn">import</span> <span class="nn">openai</span>
<span class="kn">from</span> <span class="nn">openai_function_call</span> <span class="kn">import</span> <span class="n">openai_function</span></pre></div>'''
expected = '''<code>import openai
from openai_function_call import openai_function</code>'''
testresult = extract(pip, config=ZERO_CONFIG, output_format='xml')
assert expected in testresult and 'quote' not in testresult
medium_js = '''<div><p>Code:</p>
<pre class="lw lx ly lz ma nq nr ns bo nt ba bj"><span id="fe48" class="nu mo ev nr b bf nv nw l nx ny" data-selectable-paragraph=""><span class="hljs-keyword">import</span> openai_function<br><br><span class="hljs-meta">@openai_function</span></span></pre>'''
expected = '''<code>import openai_function<lb/><lb/>@openai_function</code>'''
testresult = extract(medium_js, config=ZERO_CONFIG, output_format='xml')
assert expected in testresult and 'quote' not in testresult
medium_ssr = '''<div><p>Code:</p>
<pre class="lw lx ly lz ma nq nr ns bo nt ba bj"><span id="fe48" class="nu mo ev nr b bf nv nw l nx ny">import openai_function<br><br>@openai_functiondef sum(a:int, b:int):<br/> &quot;&quot;&quot;Sum description adds a + b&quot;&quot;&quot;</span></pre>'''
expected = '<code>import openai_function<lb/><lb/>@openai_functiondef sum(a:int, b:int):<lb/> """Sum description adds a + b"""</code>'
testresult = extract(medium_ssr, config=ZERO_CONFIG, output_format='xml')
assert expected in testresult and 'quote' not in testresult
code_el = '''<div><p>Code:</p>
<pre><code><span>my code</span></code></pre>'''
expected = '''<code>my code</code>'''
testresult = extract(code_el, config=ZERO_CONFIG, output_format='xml')
assert expected in testresult and 'quote' not in testresult


Expand Down
29 changes: 15 additions & 14 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,33 +208,33 @@ def handle_lists(element, options):
return None


def get_code_block_element(element):
def is_code_block_element(element):
# pip
if element.get('lang') is not None or element.tag == 'code':
return True
# GitHub
parent = element.getparent()
if parent is not None and 'highlight' in parent.get('class', default=''):
return element
return True
# highlightjs
code = element.find('code')
if code is not None and len(element.getchildren()) == 1:
return code
return None
return True
return False


def handle_code_blocks(element, code):
processed_element = Element('code')
def handle_code_blocks(element):
processed_element = deepcopy(element)
for child in element.iter('*'):
if child.tag == 'lb':
child.text = '\n'
child.tag = 'done'
processed_element.text = ''.join(code.itertext())
processed_element.tag = 'code'
return processed_element


def handle_quotes(element, options):
'''Process quotes elements'''
code = get_code_block_element(element)
if code is not None:
return handle_code_blocks(element, code)
if is_code_block_element(element):
return handle_code_blocks(element)

processed_element = Element(element.tag)
for child in element.iter('*'):
Expand All @@ -254,7 +254,7 @@ def handle_other_elements(element, potential_tags, options):
'''Handle diverse or unknown elements in the scope of relevant tags'''
# handle w3schools code
if element.tag == 'div' and 'w3-code' in element.get('class', default=''):
return handle_code_blocks(element, element)
return handle_code_blocks(element)
# delete unwanted
if element.tag not in potential_tags:
if element.tag != 'done':
Expand Down Expand Up @@ -801,7 +801,8 @@ def determine_returnstring(document, output_format, include_formatting, tei_vali
for element in document.body.iter('*'):
if element.tag != 'graphic' and len(element) == 0 and not element.text and not element.tail:
parent = element.getparent()
if parent is not None:
# do not remove elements inside <code> to preserve formatting
if parent is not None and parent.tag != 'code':
parent.remove(element)
# build output trees
strip_double_tags(document.body)
Expand Down
26 changes: 23 additions & 3 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,9 +288,29 @@ def convert_tags(tree, options, url=None):
elif elem.tag in ('br', 'hr'):
elem.tag = 'lb'
# wbr
# blockquote, pre, q → quote
# pre
#elif elem.tag == 'pre':
# else:
# elem.tag = 'quote'
# blockquote, q → quote
elif elem.tag in ('blockquote', 'pre', 'q'):
elem.tag = 'quote'
code_flag = False
if elem.tag == 'pre':
# detect if there could be code inside
children = elem.getchildren()
# pre with a single span is more likely to be code
if len(children) == 1 and children[0].tag == 'span':
code_flag = True
# find hljs elements to detect if it's code
code_elems = elem.xpath(".//span[starts-with(@class,'hljs')]")
if code_elems:
code_flag = True
for subelem in code_elems:
subelem.attrib.clear()
if code_flag:
elem.tag = 'code'
else:
elem.tag = 'quote'
# del | s | strike → <del rend="overstrike">
elif elem.tag in ('del', 's', 'strike'):
elem.tag = 'del'
Expand Down Expand Up @@ -332,7 +352,7 @@ def handle_textnode(element, options, comments_fix=True, preserve_spaces=False):
element.tail = trim(element.tail)
# filter content
# or not re.search(r'\w', element.text): # text_content()?
if not element.text or textfilter(element) is True:
if not element.text or textfilter(element) is True:
return None
if options.dedup and duplicate_test(element, options.config) is True:
return None
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,8 @@ def line_processing(line):
'''Remove HTML space entities, then discard incompatible unicode
and invalid XML characters on line level'''
# spacing HTML entities: https://www.w3.org/MarkUp/html-spec/html-spec_13.html
line = line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0')
# unique code spaces
line = line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0').replace(';cs;', ' ')
# remove newlines that are not related to punctuation or markup
# remove non-printable chars and normalize space characters (including Unicode spaces)
line = trim(remove_control_characters(LINES_TRIMMING.sub(r' ', line)))
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def remove_empty_elements(tree):
if len(element) == 0 and text_chars_test(element.text) is False and text_chars_test(element.tail) is False:
parent = element.getparent()
# not root element or element which is naturally empty
if parent is not None and element.tag != "graphic":
# do not remove elements inside <code> to preserve formatting
if parent is not None and element.tag != "graphic" and parent.tag != 'code':
element.getparent().remove(element)
return tree

Expand Down

0 comments on commit f101371

Please sign in to comment.