diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2c377e44..cb2bb177 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -24,9 +24,9 @@ jobs:
           - os: ubuntu-20.04
             python-version: 3.6
           - os: macos-latest
-            python-version: 3.7
+            python-version: 3.8
           - os: windows-latest
-            python-version: 3.7
+            python-version: 3.8
           - os: ubuntu-latest
             python-version: 3.9
           - os: ubuntu-latest
diff --git a/docs/usage-cli.rst b/docs/usage-cli.rst
index b978f195..9da31317 100644
--- a/docs/usage-cli.rst
+++ b/docs/usage-cli.rst
@@ -77,7 +77,7 @@ Further options:
 ``--formatting``
     Keep structural elements related to formatting (````/````, ````/```` etc.)
 ``--links``
-    Keep link targets (in ``href="..."``)
+    Keep link targets (in ``href="..."``), converting relative URLs to absolute where possible
 ``--images``
     Keep track of images along with their targets (```` attributes: alt, src, title)
diff --git a/docs/usage-python.rst b/docs/usage-python.rst
index 06ea68e2..869595bc 100644
--- a/docs/usage-python.rst
+++ b/docs/usage-python.rst
@@ -77,9 +77,12 @@ The variables from the example above can be used further:
     # source URL provided for inclusion in metadata
     >>> extract(downloaded, output_format='xml', url=url)
 
-    # links preserved in XML
+    # links preserved in XML, converting relative links to absolute where possible
     >>> extract(downloaded, output_format='xml', include_links=True)
 
+    # source URL must be provided to convert relative links to absolute with TXT output
+    >>> extract(downloaded, include_links=True, url=url)
+
 
 Choice of HTML elements
diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index b5b90078..35c859ac 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -19,6 +19,7 @@
 from trafilatura import cli, cli_utils, settings, spider
 from trafilatura.downloads import add_to_compressed_dict, fetch_url
+from trafilatura.filters import LANGID_FLAG
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -248,45 +249,6 @@ def test_cli_pipeline():
     # Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
     os.environ['PYTHONIOENCODING'] = "utf-8"
 
-    # Crawling
-    testargs = ['', '--crawl', '']
-    with patch.object(sys, 'argv', testargs):
-        args = cli.parse_args(testargs)
-    cli_utils.cli_crawler(args)
-
-    testargs = ['', '--crawl', ' ']
-    with patch.object(sys, 'argv', testargs):
-        args = cli.parse_args(testargs)
-    cli_utils.cli_crawler(args)
-
-    testargs = ['', '--crawl', 'https://httpbun.org/html']
-    with patch.object(sys, 'argv', testargs):
-        args = cli.parse_args(testargs)
-    f = io.StringIO()
-    with redirect_stdout(f):
-        cli_utils.cli_crawler(args)
-    assert f.getvalue() == 'https://httpbun.org/html\n'
-
-    spider.URL_STORE = UrlStore(compressed=False, strict=False)
-    # links permitted
-    testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
-    with patch.object(sys, 'argv', testargs):
-        args = cli.parse_args(testargs)
-    f = io.StringIO()
-    with redirect_stdout(f):
-        cli_utils.cli_crawler(args)
-    # possibly a bug on Github actions, should be 2 URLs
-    assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
-    spider.URL_STORE = UrlStore(compressed=False, strict=False)
-    # 0 links permitted
-    args.crawl = 'https://httpbun.org/links/4/4'
-    f = io.StringIO()
-    with redirect_stdout(f):
-        cli_utils.cli_crawler(args, n=0)
-    ## should be 6 (5 URLs as output), possibly a bug on Actions CI/CD
-    assert len(f.getvalue().split('\n')) in (2, 6)
-    spider.URL_STORE = UrlStore(compressed=False, strict=False)
-
     # test URL listing
     testargs = ['', '--list']
     with patch.object(sys, 'argv', testargs):
@@ -375,15 +337,6 @@ def test_cli_pipeline():
     print(result)
     assert '[link](testlink.html)' in result and 'test.jpg' in result
 
-    # Exploration (Sitemap + Crawl)
-    testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
-    with patch.object(sys, 'argv', testargs):
-        args = cli.parse_args(testargs)
-    f = io.StringIO()
-    with redirect_stdout(f):
-        cli.process_args(args)
-    assert f.getvalue().strip() == 'https://httpbun.org/html'
-
 
 def test_input_filtering():
     '''test internal functions to filter urls'''
@@ -446,6 +399,77 @@ def test_input_filtering():
     assert url_store.get_known_domains() == ["https://test.info"]
 
 
+def test_crawling():
+    "Test crawling and exploration functions."
+
+    testargs = ['', '--crawl', '']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    cli_utils.cli_crawler(args)
+
+    testargs = ['', '--crawl', ' ']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    cli_utils.cli_crawler(args)
+
+    testargs = ['', '--crawl', 'https://httpbun.org/html']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli_utils.cli_crawler(args)
+    assert f.getvalue() == 'https://httpbun.org/html\n'
+
+    spider.URL_STORE = UrlStore(compressed=False, strict=False)
+    # links permitted
+    testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli_utils.cli_crawler(args)
+    # possibly a bug on GitHub Actions, should be 2 URLs
+    assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
+    spider.URL_STORE = UrlStore(compressed=False, strict=False)
+    # 0 links permitted
+    args.crawl = 'https://httpbun.org/links/4/4'
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli_utils.cli_crawler(args, n=0)
+    ## should be 6 (5 URLs as output), possibly a bug on Actions CI/CD
+    assert len(f.getvalue().split('\n')) in (2, 6)
+    spider.URL_STORE = UrlStore(compressed=False, strict=False)
+
+    # Exploration (Sitemap + Crawl)
+    testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli.process_args(args)
+    assert f.getvalue().strip() == 'https://httpbun.org/html'
+
+
+def test_probing():
+    "Test webpage probing functions."
+    url = 'https://httpbun.org/html'
+    testargs = ['', '--probe', url, '--target-language', 'de']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli.process_args(args)
+    if LANGID_FLAG:
+        assert f.getvalue().strip() == ''
+    else:
+        assert f.getvalue().strip() == url
+    args.target_language = 'en'
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli.process_args(args)
+    assert f.getvalue().strip() == url
+
+
 if __name__ == '__main__':
     test_parser()
     test_climain()
@@ -453,4 +477,6 @@ def test_input_filtering():
     test_input_filtering()
     test_sysoutput()
     test_cli_pipeline()
+    test_crawling()
     test_download()
+    test_probing()
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 14101dda..e80d7dd8 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -351,6 +351,8 @@ def test_html2txt():
     mydoc = "Here is the body text"
     assert html2txt(mydoc) == "Here is the body text"
     assert html2txt(html.fromstring(mydoc)) == "Here is the body text"
+    assert html2txt("") == ""
+    assert html2txt("123") == ""
 
 
 def test_external():
@@ -429,6 +431,8 @@ def test_links():
     mydoc = html.fromstring('Test link text. This part of the text has to be long enough.')
     assert 'testlink.html' not in extract(mydoc)
     assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
+    # relative link conversion
+    assert '[Test link text.](https://www.example.com/testlink.html) This part of the text has to be long enough.' in extract(mydoc, url='https://www.example.com/', include_links=True, no_fallback=True, config=ZERO_CONFIG)
     # link without target
     mydoc = html.fromstring('Test link text. This part of the text has to be long enough.')
     assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
@@ -1050,6 +1054,54 @@ def test_list_processing():
     assert target_element.tail == 'tail'
 
 
+def test_code_blocks():
+    highlightjs = '''
+

Code:

+
code\n
+highlighted more code
+
+
''' + '' + testresult = extract(highlightjs, config=ZERO_CONFIG, output_format='xml') + assert 'code\nhighlighted more code\n' in testresult and 'quote' not in testresult + github = '''
$ pip install PyGithub
+ + + + +
+ ''' + testresult = extract(github, config=ZERO_CONFIG, output_format='xml') + assert '$ pip install PyGithub' in testresult and 'quote' not in testresult + inline_code = '

paragraph

here is some code

' + testresult = extract(inline_code, config=ZERO_CONFIG, output_format='xml') + assert 'some' in testresult and 'quote' not in testresult + w3schools = '''

Example

+

Create a class named Person, use the __init__() function to assign values +for name and age:

+
+ class Person:
  def __init__(self, name, age):
    + self.name = name
    self.age = age

p1 = Person("John", + 36)
+
print(p1.name)
print(p1.age)
+
'''
+    testresult = extract(w3schools, config=ZERO_CONFIG, output_format='xml')
+    expected = '''
+class Person:
+def __init__(self, name, age):
+self.name = name
+self.age = age
+p1 = Person("John",
+36)
+print(p1.name)
+print(p1.age) '''
+    assert expected in testresult and 'quote' not in testresult
+
+
 if __name__ == '__main__':
     test_trim()
     test_input()
@@ -1066,3 +1118,4 @@ def test_list_processing():
     test_tei()
     test_table_processing()
     test_list_processing()
+    test_code_blocks()
diff --git a/trafilatura/cli.py b/trafilatura/cli.py
index a66e1c6b..61093f15 100644
--- a/trafilatura/cli.py
+++ b/trafilatura/cli.py
@@ -15,7 +15,7 @@
 from . import __version__
 from .cli_utils import (load_blacklist, load_input_dict,
-                        build_exploration_dict, cli_crawler,
+                        build_exploration_dict, cli_crawler, probe_homepage,
                         file_processing_pipeline, url_processing_pipeline,
                         examine, write_result)
 from .feeds import find_feed_urls
@@ -104,6 +104,9 @@ def parse_args(args):
     group3_ex.add_argument("--explore",
                            help="explore the given websites (combination of sitemap and crawl)",
                            nargs='?', const=True, default=False)
+    group3_ex.add_argument("--probe",
+                           help="probe for extractable content (works best with target language)",
+                           nargs='?', const=True, default=False)
     group3.add_argument('--archived',
                         help='try to fetch URLs from the Internet Archive if downloads fail',
                         action="store_true")
@@ -279,7 +282,7 @@ def process_args(args):
     # processing according to mutually exclusive options
     # read url list from input file
-    if args.input_file and all([not args.crawl, not args.explore, not args.feed, not args.sitemap]):
+    if args.input_file and all([not args.crawl, not args.explore, not args.feed, not args.probe, not args.sitemap]):
         url_store = load_input_dict(args)
         error_caught = url_processing_pipeline(args, url_store)
@@ -314,6 +317,10 @@ def process_args(args):
     elif args.crawl:
         cli_crawler(args)
 
+    # probe and print only
+    elif args.probe:
+        probe_homepage(args)
+
     # read files from an input directory
     elif args.input_dir:
         file_processing_pipeline(args)
diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index 468b261b..04e3257f 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -22,8 +22,9 @@
 from trafilatura import spider
 
-from .core import extract
+from .core import extract, html2txt
 from .downloads import add_to_compressed_dict, buffered_downloads, load_download_buffer
+from .filters import LANGID_FLAG, language_classifier
 from .hashing import generate_hash_filename
 from .utils import uniquify_list, URL_BLACKLIST_REGEX
 from .settings import (use_config, FILENAME_LEN,
@@ -56,6 +57,8 @@ def load_input_urls(args):
         input_urls = [args.crawl]
     elif args.explore:
         input_urls = [args.explore]
+    elif args.probe:
+        input_urls = [args.probe]
     elif args.feed:
         input_urls = [args.feed]
     elif args.sitemap:
@@ -285,6 +288,16 @@ def cli_crawler(args, n=30, url_store=None):
     #return todo, known_links
 
 
+def probe_homepage(args):
+    "Probe websites for extractable content and print the URLs that qualify."
+    input_urls = load_input_urls(args)
+    for url, result in buffered_downloads(input_urls, args.parallel):
+        if result is not None:
+            result = html2txt(result)
+            if result and (not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language):
+                print(url, flush=True)
+
+
 def url_processing_pipeline(args, url_store):
     '''Aggregated functions to show a list and download and process an input list'''
     # print list without further processing
diff --git a/trafilatura/core.py b/trafilatura/core.py
index 0afdf95c..2976c578 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -213,8 +213,34 @@ def handle_lists(element, options):
     return None
 
 
+def get_code_block_element(element):
+    # GitHub
+    parent = element.getparent()
+    if parent is not None and 'highlight' in parent.get('class', default=''):
+        return element
+    # highlightjs
+    code = element.find('code')
+    if code is not None and len(element.getchildren()) == 1:
+        return code
+    return None
+
+
+def handle_code_blocks(element, code):
+    processed_element = Element('code')
+    for child in element.iter('*'):
+        if child.tag == 'lb':
+            child.text = '\n'
+        child.tag = 'done'
+    processed_element.text = ''.join(code.itertext())
+    return processed_element
+
+
 def handle_quotes(element, options):
     '''Process quotes elements'''
+    code = get_code_block_element(element)
+    if code is not None:
+        return handle_code_blocks(element, code)
+
     processed_element = Element(element.tag)
     for child in element.iter('*'):
         processed_child = process_node(child, options)  # handle_textnode(child, comments_fix=True)
@@ -231,6 +257,9 @@
 def handle_other_elements(element, potential_tags, options):
     '''Handle diverse or unknown elements in the scope of relevant tags'''
+    # handle w3schools code
+    if element.tag == 'div' and 'w3-code' in element.get('class', default=''):
+        return handle_code_blocks(element, element)
     # delete unwanted
     if element.tag not in potential_tags:
         if element.tag != 'done':
@@ -460,7 +489,7 @@ def recover_wild_text(tree, result_body, options, potential_tags=TAG_CATALOG):
     '''Look for all previously unconsidered wild elements, including outside of the determined
        frame and throughout the document to recover potentially missing text parts'''
     LOGGER.debug('Recovering wild text elements')
-    search_expr = './/blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table'
+    search_expr = './/blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table|.//div[contains(@class, \'w3-code\')]'
     if options.recall is True:
         potential_tags.update(['div', 'lb'])
         search_expr += '|.//div|.//lb|.//list'
@@ -760,10 +789,12 @@ def html2txt(content):
         content: HTML document as string or LXML element.
 
     Returns:
-        The extracted text in the form of a string.
+        The extracted text as a string, or an empty string if extraction fails.
 
     """
     tree = load_html(content)
+    if tree is None:
+        return ''
     return ' '.join(tree.text_content().split()).strip()
 
 
@@ -913,7 +944,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
         cleaned_tree_backup = deepcopy(cleaned_tree)
 
     # convert tags, the rest does not work without conversion
-    cleaned_tree = convert_tags(cleaned_tree, options)
+    cleaned_tree = convert_tags(cleaned_tree, options, url or document.url)
 
     # comments first, then remove
     if include_comments is True:
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index daa107f5..fe35d3bc 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -32,7 +32,7 @@
 from . import __version__
 from .settings import DEFAULT_CONFIG
-from .utils import decode_response, uniquify_list, URL_BLACKLIST_REGEX
+from .utils import decode_response, make_chunks, uniquify_list, URL_BLACKLIST_REGEX
 
 
 NUM_CONNECTIONS = 50
@@ -250,12 +250,12 @@ def load_download_buffer(url_store, sleep_time=5):
 
 def buffered_downloads(bufferlist, download_threads, decode=True):
     '''Download queue consumer, single- or multi-threaded.'''
-    # start several threads
     with ThreadPoolExecutor(max_workers=download_threads) as executor:
-        future_to_url = {executor.submit(fetch_url, url, decode): url for url in bufferlist}
-        for future in as_completed(future_to_url):
-            # url and download result
-            yield future_to_url[future], future.result()
+        for chunk in make_chunks(bufferlist, 10000):
+            future_to_url = {executor.submit(fetch_url, url, decode): url for url in chunk}
+            for future in as_completed(future_to_url):
+                # url and download result
+                yield future_to_url[future], future.result()
 
 
 def _send_pycurl_request(url, no_ssl, config):
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index b2dc5cb4..6d86e7f9 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -11,6 +11,7 @@
 from collections import defaultdict
 from copy import deepcopy
 
+from courlan.urlutils import get_base_url, fix_relative_urls
 from lxml.etree import strip_tags
 from lxml.html.clean import Cleaner
 
@@ -219,7 +220,7 @@ def delete_by_link_density(subtree, tagname, backtracking=False, favor_precision
     return subtree
 
 
-def convert_tags(tree, options):
+def convert_tags(tree, options, url=None):
     '''Simplify markup and convert relevant HTML tags to an XML standard'''
     # delete links for faster processing
     if options.links is False:
@@ -233,12 +234,17 @@ def convert_tags(tree, options):
             # strip the rest
             strip_tags(tree, 'a')
     else:
+        # get base URL for converting relative URLs
+        base_url = url and get_base_url(url)
         for elem in tree.iter('a', 'ref'):
             elem.tag = 'ref'
             # replace href attribute and delete the rest
             target = elem.get('href')  # defaults to None
             elem.attrib.clear()
             if target is not None:
+                # convert relative URLs
+                if base_url is not None:
+                    target = fix_relative_urls(base_url, target)
                 elem.set('target', target)
     # include_formatting
     if options.formatting is False:
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index 58f3b728..fd73fbf4 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -17,9 +17,10 @@
     brotli = None
 
 from difflib import SequenceMatcher
-from gzip import decompress
 from functools import lru_cache
+from gzip import decompress
 from html import unescape
+from itertools import islice
 from unicodedata import normalize
 
 # CChardet is faster and can be more accurate
@@ -30,7 +31,6 @@
 from charset_normalizer import from_bytes
 
 from lxml.html import HtmlElement, HTMLParser, fromstring
-# from lxml.html.soupparser import fromstring as fromsoup
 
 # response types
 from urllib3.response import HTTPResponse
@@ -212,20 +212,13 @@ def load_html(htmlobject):
             tree = fromstring(htmlobject, parser=HTML_PARSER)
         except ValueError:
             # "Unicode strings with encoding declaration are not supported."
-            fallback_parse = True
             tree = fromstring_bytes(htmlobject)
+            fallback_parse = True
         except Exception as err:
             LOGGER.error('lxml parsing failed: %s', err)
     # second pass: try passing bytes to LXML
-    if (tree is None or len(tree) < 2) and fallback_parse is False:
+    if (tree is None or len(tree) < 1) and not fallback_parse:
         tree = fromstring_bytes(htmlobject)
-    # more robust option: try BeautifulSoup?
-    #if tree is None or not isinstance(tree, HtmlElement):
-    #    if isinstance(htmlobject, (bytes, str)):
-    #        try:
-    #            tree = fromsoup(htmlobject)
-    #        except Exception as err:
-    #            LOGGER.error('BS parser error: %s', err)
     # rejection test: is it (well-formed) HTML at all?
     # log parsing errors
     if tree is not None and check_flag is True and len(tree) < 2:
@@ -381,3 +374,10 @@ def is_similar_domain(reference, new_string, threshold=0.5):
     if SequenceMatcher(None, reference, new_string).ratio() < threshold:
         return False
     return True
+
+
+def make_chunks(data, size):
+    "Chunk data into smaller pieces."
+    iterator = iter(data)
+    for _ in range(0, len(data), size):
+        yield list(islice(iterator, size))
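
The make_chunks() helper above is what lets buffered_downloads() in downloads.py submit work to the thread pool in slices of 10,000 URLs instead of creating one future per input URL up front. A self-contained copy, run with a deliberately small size so the chunking is visible:

from itertools import islice

def make_chunks(data, size):
    "Chunk data into smaller pieces."
    iterator = iter(data)
    for _ in range(0, len(data), size):
        yield list(islice(iterator, size))

# three slices: [[0, 1, 2], [3, 4, 5], [6]]
print(list(make_chunks(list(range(7)), 3)))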
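
The relative-link conversion wired into convert_tags() and exposed through extract(..., include_links=True, url=...) rests on two helpers from courlan, which trafilatura already depends on. A minimal sketch of what they do, with made-up URLs:

from courlan.urlutils import get_base_url, fix_relative_urls

# derive the base from the page that was downloaded
base = get_base_url('https://www.example.org/blog/post.html')  # 'https://www.example.org'
# relative targets are resolved against that base, absolute ones pass through unchanged
print(fix_relative_urls(base, 'testlink.html'))
print(fix_relative_urls(base, 'https://other.example/page'))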
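
The new --probe option (e.g. trafilatura --probe "https://www.example.org" --target-language de) downloads each input URL, converts it with html2txt() and prints the URL only if some text comes back and, where language detection is available and a target language is set, if the detected language matches. A rough equivalent using only the public Python API, with a placeholder URL:

from trafilatura import fetch_url, html2txt

url = 'https://www.example.org'
downloaded = fetch_url(url)
# html2txt() now returns an empty string instead of failing on unparsable input,
# so a plain truthiness check is enough here
if downloaded and html2txt(downloaded):
    print(url)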
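
The code-block handling added to core.py recognises GitHub-style wrappers (a pre inside an element whose class contains "highlight"), highlight.js-style pre/code pairs and w3schools "w3-code" divs, and renders them as a single code element rather than a quote. A made-up minimal example of the GitHub-style case; the exact output depends on the configuration, see the new test_code_blocks() for the settings used in the test suite:

from trafilatura import extract

doc = ('<html><body><article><p>Some introductory sentence that is long enough '
       'to be kept by the extractor.</p>'
       '<div class="highlight"><pre>$ pip install trafilatura</pre></div>'
       '</article></body></html>')
# with XML output the snippet should appear inside a <code> element, not a <quote>
print(extract(doc, output_format='xml', no_fallback=True))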