
Merge branch 'adbar:master' into master
felipehertzer authored Jun 27, 2023
2 parents f4367de + 2f1fd35 commit b500c69
Showing 11 changed files with 215 additions and 76 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
@@ -24,9 +24,9 @@ jobs:
        - os: ubuntu-20.04
          python-version: 3.6
        - os: macos-latest
-         python-version: 3.7
+         python-version: 3.8
        - os: windows-latest
-         python-version: 3.7
+         python-version: 3.8
        - os: ubuntu-latest
          python-version: 3.9
        - os: ubuntu-latest
2 changes: 1 addition & 1 deletion docs/usage-cli.rst
@@ -77,7 +77,7 @@ Further options:
  ``--formatting``
      Keep structural elements related to formatting (``<b>``/``<strong>``, ``<i>``/``<emph>`` etc.)
  ``--links``
-     Keep link targets (in ``href="..."``)
+     Keep link targets (in ``href="..."``), converting relative URLs to absolute where possible
  ``--images``
      Keep track of images along with their targets (``<img>`` attributes: alt, src, title)

5 changes: 4 additions & 1 deletion docs/usage-python.rst
@@ -77,9 +77,12 @@ The variables from the example above can be used further:
      # source URL provided for inclusion in metadata
      >>> extract(downloaded, output_format='xml', url=url)
-     # links preserved in XML
+     # links preserved in XML, converting relative links to absolute where possible
      >>> extract(downloaded, output_format='xml', include_links=True)
+     # source URL must be provided to convert relative links to absolute with TXT output
+     >>> extract(downloaded, include_links=True, url=url)
Choice of HTML elements
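
A minimal standalone sketch of the documented behaviour (illustrative only, not part of the diff; the sample document, base URL and zero-threshold config mirror the unit tests further down):

# With include_links=True, passing url= lets the extractor rewrite
# relative link targets to absolute ones.
from trafilatura import extract
from trafilatura.settings import use_config

config = use_config()
config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = "0"  # accept this deliberately tiny demo document
config["DEFAULT"]["MIN_OUTPUT_SIZE"] = "0"

doc = ('<html><body><p><a href="testlink.html">Test link text.</a> '
       'This part of the text has to be long enough.</p></body></html>')

# Without a base URL the target stays relative: [Test link text.](testlink.html)
print(extract(doc, include_links=True, no_fallback=True, config=config))
# With url= it should come out absolute: [Test link text.](https://www.example.com/testlink.html)
print(extract(doc, url="https://www.example.com/", include_links=True, no_fallback=True, config=config))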
122 changes: 74 additions & 48 deletions tests/cli_tests.py
@@ -19,6 +19,7 @@

from trafilatura import cli, cli_utils, settings, spider
from trafilatura.downloads import add_to_compressed_dict, fetch_url
from trafilatura.filters import LANGID_FLAG


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -248,45 +249,6 @@ def test_cli_pipeline():
# Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
os.environ['PYTHONIOENCODING'] = "utf-8"

# Crawling
testargs = ['', '--crawl', '']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

testargs = ['', '--crawl', ' ']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

testargs = ['', '--crawl', 'https://httpbun.org/html']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbun.org/html\n'

spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
# possibly a bug on Github actions, should be 2 URLs
assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
args.crawl = 'https://httpbun.org/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
## should be 6 (5 URLs as output), possibly a bug on Actions CI/CD
assert len(f.getvalue().split('\n')) in (2, 6)
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# test URL listing
testargs = ['', '--list']
with patch.object(sys, 'argv', testargs):
@@ -375,15 +337,6 @@ def test_cli_pipeline():
print(result)
assert '[link](testlink.html)' in result and 'test.jpg' in result

# Exploration (Sitemap + Crawl)
testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip() == 'https://httpbun.org/html'


def test_input_filtering():
'''test internal functions to filter urls'''
@@ -446,11 +399,84 @@ def test_input_filtering():
assert url_store.get_known_domains() == ["https://test.info"]


def test_crawling():
"Test crawling and exploration functions."

testargs = ['', '--crawl', '']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

testargs = ['', '--crawl', ' ']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

testargs = ['', '--crawl', 'https://httpbun.org/html']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbun.org/html\n'

spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
# possibly a bug on Github actions, should be 2 URLs
assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
args.crawl = 'https://httpbun.org/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
## should be 6 (5 URLs as output), possibly a bug on Actions CI/CD
assert len(f.getvalue().split('\n')) in (2, 6)
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# Exploration (Sitemap + Crawl)
testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip() == 'https://httpbun.org/html'


def test_probing():
"Test webpage probing functions."
url = 'https://httpbun.org/html'
testargs = ['', '--probe', url, '--target-language', 'de']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
if LANGID_FLAG:
assert f.getvalue().strip() == ''
else:
assert f.getvalue().strip() == url
args.target_language = 'en'
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip() == url


if __name__ == '__main__':
test_parser()
test_climain()
test_input_type()
test_input_filtering()
test_sysoutput()
test_cli_pipeline()
test_crawling()
test_download()
test_probing()
53 changes: 53 additions & 0 deletions tests/unit_tests.py
@@ -351,6 +351,8 @@ def test_html2txt():
mydoc = "<html><body>Here is the body text</body></html>"
assert html2txt(mydoc) == "Here is the body text"
assert html2txt(html.fromstring(mydoc)) == "Here is the body text"
assert html2txt("") == ""
assert html2txt("123") == ""


def test_external():
@@ -429,6 +431,8 @@ def test_links():
mydoc = html.fromstring('<html><body><p><a href="testlink.html">Test link text.</a> This part of the text has to be long enough.</p></body></html>')
assert 'testlink.html' not in extract(mydoc)
assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
# relative link conversion
assert '[Test link text.](https://www.example.com/testlink.html) This part of the text has to be long enough.' in extract(mydoc, url='https://www.example.com/', include_links=True, no_fallback=True, config=ZERO_CONFIG)
# link without target
mydoc = html.fromstring('<html><body><p><a>Test link text.</a> This part of the text has to be long enough.</p></body></html>')
assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
@@ -1050,6 +1054,54 @@ def test_list_processing():
assert target_element.tail == 'tail'


def test_code_blocks():
highlightjs = '''<div class="s-prose js-post-body" itemprop="text">
<p>Code:</p>
<pre class="lang-sql s-code-block"><code class="hljs language-sql">code\n
<span class="hljs-keyword">highlighted</span> more <span class="hljs-keyword">code</span>
</code></pre>
</div>'''
testresult = extract(highlightjs, config=ZERO_CONFIG, output_format='xml')
assert '<code>code\nhighlighted more code\n</code>' in testresult and 'quote' not in testresult
github = '''<div class="highlight highlight-source-shell notranslate position-relative overflow-auto" dir="auto"><pre>$ pip install PyGithub</pre><div class="zeroclipboard-container position-absolute right-0 top-0">
<clipboard-copy aria-label="Copy" class="ClipboardButton btn js-clipboard-copy m-2 p-0 tooltipped-no-delay" data-copy-feedback="Copied!" data-tooltip-direction="w" value="$ pip install PyGithub" tabindex="0" role="button" style="display: inherit;">
<svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copy js-clipboard-copy-icon m-2">
<path d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z"></path><path d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path>
</svg>
<svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-check js-clipboard-check-icon color-fg-success d-none m-2">
<path d="M13.78 4.22a.75.75 0 0 1 0 1.06l-7.25 7.25a.75.75 0 0 1-1.06 0L2.22 9.28a.751.751 0 0 1 .018-1.042.751.751 0 0 1 1.042-.018L6 10.94l6.72-6.72a.75.75 0 0 1 1.06 0Z"></path>
</svg>
</clipboard-copy>
</div></div>
'''
testresult = extract(github, config=ZERO_CONFIG, output_format='xml')
assert '<code>$ pip install PyGithub</code>' in testresult and 'quote' not in testresult
inline_code = '<div><p>paragraph</p><p>here is <code>some</code> code</p></div>'
testresult = extract(inline_code, config=ZERO_CONFIG, output_format='xml')
assert '<code>some</code>' in testresult and 'quote' not in testresult
w3schools = '''<div class="w3-example"><h3>Example</h3>
<p>Create a class named Person, use the __init__() function to assign values
for name and age:</p>
<div class="w3-code notranslate pythonHigh"><span class="pythoncolor" style="color:black"><span class="pythonnumbercolor" style="color:red">
</span> <span class="pythonkeywordcolor" style="color:mediumblue">class</span> Person:<br>&nbsp; <span class="pythonkeywordcolor" style="color:mediumblue">def</span> __init__(self, name, age):<br>&nbsp;&nbsp;&nbsp; <span class="pythonnumbercolor" style="color:red">
</span> self.name = name<br>&nbsp;&nbsp;&nbsp; self.age = age<br><br>p1 = Person(<span class="pythonstringcolor" style="color:brown">"John"</span>, <span class="pythonnumbercolor" style="color:red">
</span> <span class="pythonnumbercolor" style="color:red">36</span>)<br><span class="pythonnumbercolor" style="color:red">
</span> <br><span class="pythonkeywordcolor" style="color:mediumblue">print</span>(p1.name)<br><span class="pythonkeywordcolor" style="color:mediumblue">print</span>(p1.age) </span></div>
</div>'''
testresult = extract(w3schools, config=ZERO_CONFIG, output_format='xml')
expected = '''<code>
class Person:
def __init__(self, name, age):
self.name = name
self.age = age
p1 = Person("John",
36)
print(p1.name)
print(p1.age) </code>'''
assert expected in testresult and 'quote' not in testresult


if __name__ == '__main__':
test_trim()
test_input()
@@ -1066,3 +1118,4 @@ def test_list_processing():
test_tei()
test_table_processing()
test_list_processing()
test_code_blocks()
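
A quick illustrative run of the behaviour exercised by test_code_blocks above (a sketch, not part of the diff; it reuses the same zero-threshold settings the test suite relies on):

# Inline <code> elements survive extraction when XML output is requested.
from trafilatura import extract
from trafilatura.settings import use_config

config = use_config()
config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = "0"  # the demo snippet is deliberately tiny
config["DEFAULT"]["MIN_OUTPUT_SIZE"] = "0"

inline_code = "<div><p>paragraph</p><p>here is <code>some</code> code</p></div>"
result = extract(inline_code, config=config, output_format="xml")
print(result)  # the XML output should contain <code>some</code>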
11 changes: 9 additions & 2 deletions trafilatura/cli.py
@@ -15,7 +15,7 @@

from . import __version__
from .cli_utils import (load_blacklist, load_input_dict,
-                         build_exploration_dict, cli_crawler,
+                         build_exploration_dict, cli_crawler, probe_homepage,
file_processing_pipeline, url_processing_pipeline,
examine, write_result)
from .feeds import find_feed_urls
@@ -104,6 +104,9 @@ def parse_args(args):
group3_ex.add_argument("--explore",
help="explore the given websites (combination of sitemap and crawl)",
nargs='?', const=True, default=False)
group3_ex.add_argument("--probe",
help="probe for extractable content (works best with target language)",
nargs='?', const=True, default=False)
group3.add_argument('--archived',
help='try to fetch URLs from the Internet Archive if downloads fail',
action="store_true")
@@ -279,7 +282,7 @@ def process_args(args):

# processing according to mutually exclusive options
# read url list from input file
-    if args.input_file and all([not args.crawl, not args.explore, not args.feed, not args.sitemap]):
+    if args.input_file and all([not args.crawl, not args.explore, not args.feed, not args.probe, not args.sitemap]):
url_store = load_input_dict(args)
error_caught = url_processing_pipeline(args, url_store)

@@ -314,6 +317,10 @@ def process_args(args):
elif args.crawl:
cli_crawler(args)

# probe and print only
elif args.probe:
probe_homepage(args)

# read files from an input directory
elif args.input_dir:
file_processing_pipeline(args)
15 changes: 14 additions & 1 deletion trafilatura/cli_utils.py
@@ -22,8 +22,9 @@

from trafilatura import spider

-from .core import extract
+from .core import extract, html2txt
 from .downloads import add_to_compressed_dict, buffered_downloads, load_download_buffer
+from .filters import LANGID_FLAG, language_classifier
from .hashing import generate_hash_filename
from .utils import uniquify_list, URL_BLACKLIST_REGEX
from .settings import (use_config, FILENAME_LEN,
@@ -56,6 +57,8 @@ def load_input_urls(args):
input_urls = [args.crawl]
elif args.explore:
input_urls = [args.explore]
elif args.probe:
input_urls = [args.probe]
elif args.feed:
input_urls = [args.feed]
elif args.sitemap:
@@ -285,6 +288,16 @@ def cli_crawler(args, n=30, url_store=None):
#return todo, known_links


def probe_homepage(args):
"Probe websites for extractable content and print the fitting ones."
input_urls = load_input_urls(args)
for url, result in buffered_downloads(input_urls, args.parallel):
if result is not None:
result = html2txt(result)
if result and (not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language):
print(url, flush=True)


def url_processing_pipeline(args, url_store):
'''Aggregated functions to show a list and download and process an input list'''
# print list without further processing
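
For orientation, a rough single-URL approximation of the probing logic added above (a sketch, not the actual code path; it skips the parallel download buffer used by probe_homepage, and the target language is an assumed example value):

# Download a homepage, reduce it to text, and report the URL only if
# extractable content in the requested language came back.
from trafilatura import fetch_url, html2txt
from trafilatura.filters import LANGID_FLAG, language_classifier

url = "https://httpbun.org/html"  # same test target as in cli_tests.py above
target_language = "en"            # what --target-language would supply

downloaded = fetch_url(url)
if downloaded is not None:
    text = html2txt(downloaded)
    if text and (not LANGID_FLAG or language_classifier(text, "") == target_language):
        print(url, flush=True)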
