
Merge branch 'adbar:master' into master
felipehertzer authored Jun 27, 2023
2 parents f4367de + 2f1fd35 commit b500c69
Showing 11 changed files with 215 additions and 76 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
@@ -24,9 +24,9 @@ jobs:
        - os: ubuntu-20.04
          python-version: 3.6
        - os: macos-latest
-         python-version: 3.7
+         python-version: 3.8
        - os: windows-latest
-         python-version: 3.7
+         python-version: 3.8
        - os: ubuntu-latest
          python-version: 3.9
        - os: ubuntu-latest
2 changes: 1 addition & 1 deletion docs/usage-cli.rst
@@ -77,7 +77,7 @@ Further options:
  ``--formatting``
      Keep structural elements related to formatting (``<b>``/``<strong>``, ``<i>``/``<emph>`` etc.)
  ``--links``
-     Keep link targets (in ``href="..."``)
+     Keep link targets (in ``href="..."``), converting relative URLs to absolute where possible
  ``--images``
      Keep track of images along with their targets (``<img>`` attributes: alt, src, title)

5 changes: 4 additions & 1 deletion docs/usage-python.rst
@@ -77,9 +77,12 @@ The variables from the example above can be used further:
      # source URL provided for inclusion in metadata
      >>> extract(downloaded, output_format='xml', url=url)
-     # links preserved in XML
+     # links preserved in XML, converting relative links to absolute where possible
      >>> extract(downloaded, output_format='xml', include_links=True)
+     # source URL must be provided to convert relative links to absolute with TXT output
+     >>> extract(downloaded, include_links=True, url=url)
Choice of HTML elements
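
A minimal standalone sketch of the documented behaviour (illustrative only, not part of the diff; the sample document, base URL and zero-threshold config mirror the unit tests further down):

# With include_links=True, passing url= lets the extractor rewrite
# relative link targets to absolute ones.
from trafilatura import extract
from trafilatura.settings import use_config

config = use_config()
config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = "0"  # accept this deliberately tiny demo document
config["DEFAULT"]["MIN_OUTPUT_SIZE"] = "0"

doc = ('<html><body><p><a href="testlink.html">Test link text.</a> '
       'This part of the text has to be long enough.</p></body></html>')

# Without a base URL the target stays relative: [Test link text.](testlink.html)
print(extract(doc, include_links=True, no_fallback=True, config=config))
# With url= it should come out absolute: [Test link text.](https://www.example.com/testlink.html)
print(extract(doc, url="https://www.example.com/", include_links=True, no_fallback=True, config=config))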
122 changes: 74 additions & 48 deletions tests/cli_tests.py
@@ -19,6 +19,7 @@

from trafilatura import cli, cli_utils, settings, spider
from trafilatura.downloads import add_to_compressed_dict, fetch_url
from trafilatura.filters import LANGID_FLAG


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -248,45 +249,6 @@ def test_cli_pipeline():
# Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
os.environ['PYTHONIOENCODING'] = "utf-8"

# Crawling
testargs = ['', '--crawl', '']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

testargs = ['', '--crawl', ' ']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

testargs = ['', '--crawl', 'https://httpbun.org/html']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbun.org/html\n'

spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
# possibly a bug on Github actions, should be 2 URLs
assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
args.crawl = 'https://httpbun.org/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
## should be 6 (5 URLs as output), possibly a bug on Actions CI/CD
assert len(f.getvalue().split('\n')) in (2, 6)
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# test URL listing
testargs = ['', '--list']
with patch.object(sys, 'argv', testargs):
@@ -375,15 +337,6 @@ def test_cli_pipeline():
print(result)
assert '[link](testlink.html)' in result and 'test.jpg' in result

# Exploration (Sitemap + Crawl)
testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip() == 'https://httpbun.org/html'


def test_input_filtering():
'''test internal functions to filter urls'''
@@ -446,11 +399,84 @@ def test_input_filtering():
assert url_store.get_known_domains() == ["https://test.info"]


def test_crawling():
"Test crawling and exploration functions."

testargs = ['', '--crawl', '']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

testargs = ['', '--crawl', ' ']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

testargs = ['', '--crawl', 'https://httpbun.org/html']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbun.org/html\n'

spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
# possibly a bug on Github actions, should be 2 URLs
assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
args.crawl = 'https://httpbun.org/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
## should be 6 (5 URLs as output), possibly a bug on Actions CI/CD
assert len(f.getvalue().split('\n')) in (2, 6)
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# Exploration (Sitemap + Crawl)
testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip() == 'https://httpbun.org/html'


def test_probing():
"Test webpage probing functions."
url = 'https://httpbun.org/html'
testargs = ['', '--probe', url, '--target-language', 'de']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
if LANGID_FLAG:
assert f.getvalue().strip() == ''
else:
assert f.getvalue().strip() == url
args.target_language = 'en'
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert f.getvalue().strip() == url


if __name__ == '__main__':
test_parser()
test_climain()
test_input_type()
test_input_filtering()
test_sysoutput()
test_cli_pipeline()
test_crawling()
test_download()
test_probing()
53 changes: 53 additions & 0 deletions tests/unit_tests.py
@@ -351,6 +351,8 @@ def test_html2txt():
mydoc = "<html><body>Here is the body text</body></html>"
assert html2txt(mydoc) == "Here is the body text"
assert html2txt(html.fromstring(mydoc)) == "Here is the body text"
assert html2txt("") == ""
assert html2txt("123") == ""


def test_external():
@@ -429,6 +431,8 @@ def test_links():
mydoc = html.fromstring('<html><body><p><a href="testlink.html">Test link text.</a> This part of the text has to be long enough.</p></body></html>')
assert 'testlink.html' not in extract(mydoc)
assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
# relative link conversion
assert '[Test link text.](https://www.example.com/testlink.html) This part of the text has to be long enough.' in extract(mydoc, url='https://www.example.com/', include_links=True, no_fallback=True, config=ZERO_CONFIG)
# link without target
mydoc = html.fromstring('<html><body><p><a>Test link text.</a> This part of the text has to be long enough.</p></body></html>')
assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
@@ -1050,6 +1054,54 @@ def test_list_processing():
assert target_element.tail == 'tail'


def test_code_blocks():
highlightjs = '''<div class="s-prose js-post-body" itemprop="text">
<p>Code:</p>
<pre class="lang-sql s-code-block"><code class="hljs language-sql">code\n
<span class="hljs-keyword">highlighted</span> more <span class="hljs-keyword">code</span>
</code></pre>
</div>'''
testresult = extract(highlightjs, config=ZERO_CONFIG, output_format='xml')
assert '<code>code\nhighlighted more code\n</code>' in testresult and 'quote' not in testresult
github = '''<div class="highlight highlight-source-shell notranslate position-relative overflow-auto" dir="auto"><pre>$ pip install PyGithub</pre><div class="zeroclipboard-container position-absolute right-0 top-0">
<clipboard-copy aria-label="Copy" class="ClipboardButton btn js-clipboard-copy m-2 p-0 tooltipped-no-delay" data-copy-feedback="Copied!" data-tooltip-direction="w" value="$ pip install PyGithub" tabindex="0" role="button" style="display: inherit;">
<svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copy js-clipboard-copy-icon m-2">
<path d="M0 6.75C0 5.784.784 5 1.75 5h1.5a.75.75 0 0 1 0 1.5h-1.5a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-1.5a.75.75 0 0 1 1.5 0v1.5A1.75 1.75 0 0 1 9.25 16h-7.5A1.75 1.75 0 0 1 0 14.25Z"></path><path d="M5 1.75C5 .784 5.784 0 6.75 0h7.5C15.216 0 16 .784 16 1.75v7.5A1.75 1.75 0 0 1 14.25 11h-7.5A1.75 1.75 0 0 1 5 9.25Zm1.75-.25a.25.25 0 0 0-.25.25v7.5c0 .138.112.25.25.25h7.5a.25.25 0 0 0 .25-.25v-7.5a.25.25 0 0 0-.25-.25Z"></path>
</svg>
<svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-check js-clipboard-check-icon color-fg-success d-none m-2">
<path d="M13.78 4.22a.75.75 0 0 1 0 1.06l-7.25 7.25a.75.75 0 0 1-1.06 0L2.22 9.28a.751.751 0 0 1 .018-1.042.751.751 0 0 1 1.042-.018L6 10.94l6.72-6.72a.75.75 0 0 1 1.06 0Z"></path>
</svg>
</clipboard-copy>
</div></div>
'''
testresult = extract(github, config=ZERO_CONFIG, output_format='xml')
assert '<code>$ pip install PyGithub</code>' in testresult and 'quote' not in testresult
inline_code = '<div><p>paragraph</p><p>here is <code>some</code> code</p></div>'
testresult = extract(inline_code, config=ZERO_CONFIG, output_format='xml')
assert '<code>some</code>' in testresult and 'quote' not in testresult
w3schools = '''<div class="w3-example"><h3>Example</h3>
<p>Create a class named Person, use the __init__() function to assign values
for name and age:</p>
<div class="w3-code notranslate pythonHigh"><span class="pythoncolor" style="color:black"><span class="pythonnumbercolor" style="color:red">
</span> <span class="pythonkeywordcolor" style="color:mediumblue">class</span> Person:<br>&nbsp; <span class="pythonkeywordcolor" style="color:mediumblue">def</span> __init__(self, name, age):<br>&nbsp;&nbsp;&nbsp; <span class="pythonnumbercolor" style="color:red">
</span> self.name = name<br>&nbsp;&nbsp;&nbsp; self.age = age<br><br>p1 = Person(<span class="pythonstringcolor" style="color:brown">"John"</span>, <span class="pythonnumbercolor" style="color:red">
</span> <span class="pythonnumbercolor" style="color:red">36</span>)<br><span class="pythonnumbercolor" style="color:red">
</span> <br><span class="pythonkeywordcolor" style="color:mediumblue">print</span>(p1.name)<br><span class="pythonkeywordcolor" style="color:mediumblue">print</span>(p1.age) </span></div>
</div>'''
testresult = extract(w3schools, config=ZERO_CONFIG, output_format='xml')
expected = '''<code>
class Person:
def __init__(self, name, age):
self.name = name
self.age = age
p1 = Person("John",
36)
print(p1.name)
print(p1.age) </code>'''
assert expected in testresult and 'quote' not in testresult


if __name__ == '__main__':
test_trim()
test_input()
@@ -1066,3 +1118,4 @@ def test_list_processing():
test_tei()
test_table_processing()
test_list_processing()
test_code_blocks()
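
A quick illustrative run of the behaviour exercised by test_code_blocks above (a sketch, not part of the diff; it reuses the same zero-threshold settings the test suite relies on):

# Inline <code> elements survive extraction when XML output is requested.
from trafilatura import extract
from trafilatura.settings import use_config

config = use_config()
config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = "0"  # the demo snippet is deliberately tiny
config["DEFAULT"]["MIN_OUTPUT_SIZE"] = "0"

inline_code = "<div><p>paragraph</p><p>here is <code>some</code> code</p></div>"
result = extract(inline_code, config=config, output_format="xml")
print(result)  # the XML output should contain <code>some</code>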
11 changes: 9 additions & 2 deletions trafilatura/cli.py
@@ -15,7 +15,7 @@

from . import __version__
from .cli_utils import (load_blacklist, load_input_dict,
-                         build_exploration_dict, cli_crawler,
+                         build_exploration_dict, cli_crawler, probe_homepage,
file_processing_pipeline, url_processing_pipeline,
examine, write_result)
from .feeds import find_feed_urls
@@ -104,6 +104,9 @@ def parse_args(args):
group3_ex.add_argument("--explore",
help="explore the given websites (combination of sitemap and crawl)",
nargs='?', const=True, default=False)
group3_ex.add_argument("--probe",
help="probe for extractable content (works best with target language)",
nargs='?', const=True, default=False)
group3.add_argument('--archived',
help='try to fetch URLs from the Internet Archive if downloads fail',
action="store_true")
@@ -279,7 +282,7 @@ def process_args(args):

# processing according to mutually exclusive options
# read url list from input file
-    if args.input_file and all([not args.crawl, not args.explore, not args.feed, not args.sitemap]):
+    if args.input_file and all([not args.crawl, not args.explore, not args.feed, not args.probe, not args.sitemap]):
url_store = load_input_dict(args)
error_caught = url_processing_pipeline(args, url_store)

@@ -314,6 +317,10 @@ def process_args(args):
elif args.crawl:
cli_crawler(args)

# probe and print only
elif args.probe:
probe_homepage(args)

# read files from an input directory
elif args.input_dir:
file_processing_pipeline(args)
15 changes: 14 additions & 1 deletion trafilatura/cli_utils.py
@@ -22,8 +22,9 @@

from trafilatura import spider

-from .core import extract
+from .core import extract, html2txt
 from .downloads import add_to_compressed_dict, buffered_downloads, load_download_buffer
+from .filters import LANGID_FLAG, language_classifier
from .hashing import generate_hash_filename
from .utils import uniquify_list, URL_BLACKLIST_REGEX
from .settings import (use_config, FILENAME_LEN,
@@ -56,6 +57,8 @@ def load_input_urls(args):
input_urls = [args.crawl]
elif args.explore:
input_urls = [args.explore]
elif args.probe:
input_urls = [args.probe]
elif args.feed:
input_urls = [args.feed]
elif args.sitemap:
@@ -285,6 +288,16 @@ def cli_crawler(args, n=30, url_store=None):
#return todo, known_links


def probe_homepage(args):
"Probe websites for extractable content and print the fitting ones."
input_urls = load_input_urls(args)
for url, result in buffered_downloads(input_urls, args.parallel):
if result is not None:
result = html2txt(result)
if result and (not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language):
print(url, flush=True)


def url_processing_pipeline(args, url_store):
'''Aggregated functions to show a list and download and process an input list'''
# print list without further processing
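
For orientation, a rough single-URL approximation of the probing logic added above (a sketch, not the actual code path; it skips the parallel download buffer used by probe_homepage, and the target language is an assumed example value):

# Download a homepage, reduce it to text, and report the URL only if
# extractable content in the requested language came back.
from trafilatura import fetch_url, html2txt
from trafilatura.filters import LANGID_FLAG, language_classifier

url = "https://httpbun.org/html"  # same test target as in cli_tests.py above
target_language = "en"            # what --target-language would supply

downloaded = fetch_url(url)
if downloaded is not None:
    text = html2txt(downloaded)
    if text and (not LANGID_FLAG or language_classifier(text, "") == target_language):
        print(url, flush=True)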
