improve support for code elements (#401)

* pip code snippets support
  Add support for inline and multi-line code snippets from pip (PyPI) pages.
  Example: https://pypi.org/project/openai-function-call/
* hljs code snippets support
  Add support for inline and multi-line code snippets highlighted by hljs (highlight.js).
  Example: https://medium.com/@jxnlco/seamless-integration-with-openai-and-pydantic-a-powerful-duo-for-output-parsing-fcb1e616167b
* medium ssr code snippets support
  When Medium pages are fetched without JavaScript enabled (server-side rendering), code snippets are structured as <pre><span>code</span></pre>.
  Example: https://medium.com/@jxnlco/seamless-integration-with-openai-and-pydantic-a-powerful-duo-for-output-parsing-fcb1e616167b
* switch line break to <lb/>
  Use `<lb/>` instead of `\n` for line breaks in code snippets.
* set child as done
* keep code formatting
* increase test coverage
* move code parts and adapt tests
* fix merge conflict and clean up
* fix XPath issue
* cleanup

---------

Co-authored-by: Adrien Barbaresi <[email protected]>
Co-authored-by: Adrien Barbaresi <[email protected]>
adbar · Sep 5, 2023 · f101371 · f101371
1 parent 982ec1d
commit f101371
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 28 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -1060,7 +1060,6 @@ def test_code_blocks():
 <span class="hljs-keyword">highlighted</span> more <span class="hljs-keyword">code</span>
 </code></pre>
 </div>'''
-    ''
     testresult = extract(highlightjs, config=ZERO_CONFIG, output_format='xml')
     assert '<code>code\nhighlighted more code\n</code>' in testresult and 'quote' not in testresult
     github = '''<div class="highlight highlight-source-shell notranslate position-relative overflow-auto" dir="auto"><pre>$ pip install PyGithub</pre><div class="zeroclipboard-container position-absolute right-0 top-0">
@@ -1090,14 +1089,32 @@ def test_code_blocks():
 </div>'''
     testresult = extract(w3schools, config=ZERO_CONFIG, output_format='xml')
     expected = '''<code>
-class Person:
-def __init__(self, name, age):
-self.name = name
-self.age = age
-p1 = Person("John",
-36)
-print(p1.name)
-print(p1.age) </code>'''
+class Person:<lb/> def __init__(self, name, age):<lb/>
+self.name = name<lb/> self.age = age<lb/><lb/>p1 = Person("John",
+36)<lb/>
+<lb/>print(p1.name)<lb/>print(p1.age) </code>'''
+    assert expected in testresult and 'quote' not in testresult
+    pip = '''<div><p>Code:</p>
+    <pre lang="python3"><span class="kn">import</span> <span class="nn">openai</span>
+    <span class="kn">from</span> <span class="nn">openai_function_call</span> <span class="kn">import</span> <span class="n">openai_function</span></pre></div>'''
+    expected = '''<code>import openai
+from openai_function_call import openai_function</code>'''
+    testresult = extract(pip, config=ZERO_CONFIG, output_format='xml')
+    assert expected in testresult and 'quote' not in testresult
+    medium_js = '''<div><p>Code:</p>
+    <pre class="lw lx ly lz ma nq nr ns bo nt ba bj"><span id="fe48" class="nu mo ev nr b bf nv nw l nx ny" data-selectable-paragraph=""><span class="hljs-keyword">import</span> openai_function<br><br><span class="hljs-meta">@openai_function</span></span></pre>'''
+    expected = '''<code>import openai_function<lb/><lb/>@openai_function</code>'''
+    testresult = extract(medium_js, config=ZERO_CONFIG, output_format='xml')
+    assert expected in testresult and 'quote' not in testresult
+    medium_ssr = '''<div><p>Code:</p>
+    <pre class="lw lx ly lz ma nq nr ns bo nt ba bj"><span id="fe48" class="nu mo ev nr b bf nv nw l nx ny">import openai_function<br><br>@openai_functiondef sum(a:int, b:int):<br/>  &quot;&quot;&quot;Sum description adds a + b&quot;&quot;&quot;</span></pre>'''
+    expected = '<code>import openai_function<lb/><lb/>@openai_functiondef sum(a:int, b:int):<lb/> """Sum description adds a + b"""</code>'
+    testresult = extract(medium_ssr, config=ZERO_CONFIG, output_format='xml')
+    assert expected in testresult and 'quote' not in testresult
+    code_el = '''<div><p>Code:</p>
+    <pre><code><span>my code</span></code></pre>'''
+    expected = '''<code>my code</code>'''
+    testresult = extract(code_el, config=ZERO_CONFIG, output_format='xml')
     assert expected in testresult and 'quote' not in testresult
 
 

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -208,33 +208,33 @@ def handle_lists(element, options):
     return None
 
 
-def get_code_block_element(element):
+def is_code_block_element(element):
+    # pip
+    if element.get('lang') is not None or element.tag == 'code':
+        return True
     # GitHub
     parent = element.getparent()
     if parent is not None and 'highlight' in parent.get('class', default=''):
-        return element
+        return True
     # highlightjs
     code = element.find('code')
     if code is not None and len(element.getchildren()) == 1:
-        return code
-    return None
+        return True
+    return False
 
 
-def handle_code_blocks(element, code):
-    processed_element = Element('code')
+def handle_code_blocks(element):
+    processed_element = deepcopy(element)
     for child in element.iter('*'):
-        if child.tag == 'lb':
-            child.text = '\n'
         child.tag = 'done'
-    processed_element.text = ''.join(code.itertext())
+    processed_element.tag = 'code'
     return processed_element
 
 
 def handle_quotes(element, options):
     '''Process quotes elements'''
-    code = get_code_block_element(element)
-    if code is not None:
-        return handle_code_blocks(element, code)
+    if is_code_block_element(element):
+        return handle_code_blocks(element)
 
     processed_element = Element(element.tag)
     for child in element.iter('*'):
@@ -254,7 +254,7 @@ def handle_other_elements(element, potential_tags, options):
     '''Handle diverse or unknown elements in the scope of relevant tags'''
     # handle w3schools code
     if element.tag == 'div' and 'w3-code' in element.get('class', default=''):
-        return handle_code_blocks(element, element)
+        return handle_code_blocks(element)
     # delete unwanted
     if element.tag not in potential_tags:
         if element.tag != 'done':
@@ -801,7 +801,8 @@ def determine_returnstring(document, output_format, include_formatting, tei_vali
         for element in document.body.iter('*'):
             if element.tag != 'graphic' and len(element) == 0 and not element.text and not element.tail:
                 parent = element.getparent()
-                if parent is not None:
+                # do not remove elements inside <code> to preserve formatting
+                if parent is not None and parent.tag != 'code':
                     parent.remove(element)
         # build output trees
         strip_double_tags(document.body)

diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
@@ -288,9 +288,29 @@ def convert_tags(tree, options, url=None):
         elif elem.tag in ('br', 'hr'):
             elem.tag = 'lb'
         # wbr
-        # blockquote, pre, q → quote
+        # pre
+        #elif elem.tag == 'pre':
+        #    else:
+        #        elem.tag = 'quote'
+        # blockquote, q → quote
         elif elem.tag in ('blockquote', 'pre', 'q'):
-            elem.tag = 'quote'
+            code_flag = False
+            if elem.tag == 'pre':
+                # detect if there could be code inside
+                children = elem.getchildren()
+                # pre with a single span is more likely to be code
+                if len(children) == 1 and children[0].tag == 'span':
+                    code_flag = True
+            # find hljs elements to detect if it's code
+            code_elems = elem.xpath(".//span[starts-with(@class,'hljs')]")
+            if code_elems:
+                code_flag = True
+                for subelem in code_elems:
+                    subelem.attrib.clear()
+            if code_flag:
+                elem.tag = 'code'
+            else:
+                elem.tag = 'quote'
         # del | s | strike → <del rend="overstrike">
         elif elem.tag in ('del', 's', 'strike'):
             elem.tag = 'del'
@@ -332,7 +352,7 @@ def handle_textnode(element, options, comments_fix=True, preserve_spaces=False):
             element.tail = trim(element.tail)
     # filter content
     # or not re.search(r'\w', element.text):  # text_content()?
-    if not element.text or textfilter(element) is True:  
+    if not element.text or textfilter(element) is True:
         return None
     if options.dedup and duplicate_test(element, options.config) is True:
         return None

diff --git a/trafilatura/utils.py b/trafilatura/utils.py
@@ -261,7 +261,8 @@ def line_processing(line):
     '''Remove HTML space entities, then discard incompatible unicode
        and invalid XML characters on line level'''
     # spacing HTML entities: https://www.w3.org/MarkUp/html-spec/html-spec_13.html
-    line = line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0')
+    # unique code spaces
+    line = line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0').replace(';cs;', ' ')
     # remove newlines that are not related to punctuation or markup
     # remove non-printable chars and normalize space characters (including Unicode spaces)
     line = trim(remove_control_characters(LINES_TRIMMING.sub(r' ', line)))

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -83,7 +83,8 @@ def remove_empty_elements(tree):
         if len(element) == 0 and text_chars_test(element.text) is False and text_chars_test(element.tail) is False:
             parent = element.getparent()
             # not root element or element which is naturally empty
-            if parent is not None and element.tag != "graphic":
+            # do not remove elements inside <code> to preserve formatting
+            if parent is not None and element.tag != "graphic" and parent.tag != 'code':
                 element.getparent().remove(element)
     return tree