diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2c377e44..cb2bb177 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -24,9 +24,9 @@ jobs:
           - os: ubuntu-20.04
             python-version: 3.6
           - os: macos-latest
-            python-version: 3.7
+            python-version: 3.8
           - os: windows-latest
-            python-version: 3.7
+            python-version: 3.8
           - os: ubuntu-latest
             python-version: 3.9
           - os: ubuntu-latest
diff --git a/docs/usage-cli.rst b/docs/usage-cli.rst
index b978f195..9da31317 100644
--- a/docs/usage-cli.rst
+++ b/docs/usage-cli.rst
@@ -77,7 +77,7 @@ Further options:
 ``--formatting``
     Keep structural elements related to formatting (````/````, ````/```` etc.)
 ``--links``
-    Keep link targets (in ``href="..."``)
+    Keep link targets (in ``href="..."``), converting relative URLs to absolute where possible
 ``--images``
     Keep track of images along with their targets (```` attributes: alt, src, title)
diff --git a/docs/usage-python.rst b/docs/usage-python.rst
index 06ea68e2..869595bc 100644
--- a/docs/usage-python.rst
+++ b/docs/usage-python.rst
@@ -77,9 +77,12 @@ The variables from the example above can be used further:
     # source URL provided for inclusion in metadata
     >>> extract(downloaded, output_format='xml', url=url)
 
-    # links preserved in XML
+    # links preserved in XML, converting relative links to absolute where possible
     >>> extract(downloaded, output_format='xml', include_links=True)
 
+    # source URL must be provided to convert relative links to absolute with TXT output
+    >>> extract(downloaded, include_links=True, url=url)
+
 
 Choice of HTML elements
diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index b5b90078..35c859ac 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -19,6 +19,7 @@
 from trafilatura import cli, cli_utils, settings, spider
 from trafilatura.downloads import add_to_compressed_dict, fetch_url
+from trafilatura.filters import LANGID_FLAG
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -248,45 +249,6 @@ def test_cli_pipeline():
     # Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
     os.environ['PYTHONIOENCODING'] = "utf-8"
 
-    # Crawling
-    testargs = ['', '--crawl', '']
-    with patch.object(sys, 'argv', testargs):
-        args = cli.parse_args(testargs)
-    cli_utils.cli_crawler(args)
-
-    testargs = ['', '--crawl', ' ']
-    with patch.object(sys, 'argv', testargs):
-        args = cli.parse_args(testargs)
-    cli_utils.cli_crawler(args)
-
-    testargs = ['', '--crawl', 'https://httpbun.org/html']
-    with patch.object(sys, 'argv', testargs):
-        args = cli.parse_args(testargs)
-    f = io.StringIO()
-    with redirect_stdout(f):
-        cli_utils.cli_crawler(args)
-    assert f.getvalue() == 'https://httpbun.org/html\n'
-
-    spider.URL_STORE = UrlStore(compressed=False, strict=False)
-    # links permitted
-    testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
-    with patch.object(sys, 'argv', testargs):
-        args = cli.parse_args(testargs)
-    f = io.StringIO()
-    with redirect_stdout(f):
-        cli_utils.cli_crawler(args)
-    # possibly a bug on Github actions, should be 2 URLs
-    assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
-    spider.URL_STORE = UrlStore(compressed=False, strict=False)
-    # 0 links permitted
-    args.crawl = 'https://httpbun.org/links/4/4'
-    f = io.StringIO()
-    with redirect_stdout(f):
-        cli_utils.cli_crawler(args, n=0)
-    ## should be 6 (5 URLs as output), possibly a bug on Actions CI/CD
-    assert len(f.getvalue().split('\n')) in (2, 6)
-    spider.URL_STORE = UrlStore(compressed=False, strict=False)
-
     # test URL listing
     testargs = ['', '--list']
     with patch.object(sys, 'argv', testargs):
@@ -375,15 +337,6 @@ def test_cli_pipeline():
     print(result)
     assert '[link](testlink.html)' in result and 'test.jpg' in result
 
-    # Exploration (Sitemap + Crawl)
-    testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
-    with patch.object(sys, 'argv', testargs):
-        args = cli.parse_args(testargs)
-    f = io.StringIO()
-    with redirect_stdout(f):
-        cli.process_args(args)
-    assert f.getvalue().strip() == 'https://httpbun.org/html'
-
 
 def test_input_filtering():
     '''test internal functions to filter urls'''
@@ -446,6 +399,77 @@ def test_input_filtering():
     assert url_store.get_known_domains() == ["https://test.info"]
 
 
+def test_crawling():
+    "Test crawling and exploration functions."
+
+    testargs = ['', '--crawl', '']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    cli_utils.cli_crawler(args)
+
+    testargs = ['', '--crawl', ' ']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    cli_utils.cli_crawler(args)
+
+    testargs = ['', '--crawl', 'https://httpbun.org/html']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli_utils.cli_crawler(args)
+    assert f.getvalue() == 'https://httpbun.org/html\n'
+
+    spider.URL_STORE = UrlStore(compressed=False, strict=False)
+    # links permitted
+    testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli_utils.cli_crawler(args)
+    # possibly a bug on GitHub Actions, should be 2 URLs
+    assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
+    spider.URL_STORE = UrlStore(compressed=False, strict=False)
+    # 0 links permitted
+    args.crawl = 'https://httpbun.org/links/4/4'
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli_utils.cli_crawler(args, n=0)
+    ## should be 6 (5 URLs as output), possibly a bug on Actions CI/CD
+    assert len(f.getvalue().split('\n')) in (2, 6)
+    spider.URL_STORE = UrlStore(compressed=False, strict=False)
+
+    # Exploration (Sitemap + Crawl)
+    testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli.process_args(args)
+    assert f.getvalue().strip() == 'https://httpbun.org/html'
+
+
+def test_probing():
+    "Test webpage probing functions."
+    url = 'https://httpbun.org/html'
+    testargs = ['', '--probe', url, '--target-language', 'de']
+    with patch.object(sys, 'argv', testargs):
+        args = cli.parse_args(testargs)
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli.process_args(args)
+    if LANGID_FLAG:
+        assert f.getvalue().strip() == ''
+    else:
+        assert f.getvalue().strip() == url
+    args.target_language = 'en'
+    f = io.StringIO()
+    with redirect_stdout(f):
+        cli.process_args(args)
+    assert f.getvalue().strip() == url
+
+
 if __name__ == '__main__':
     test_parser()
     test_climain()
@@ -453,4 +477,6 @@ def test_input_filtering():
     test_input_filtering()
     test_sysoutput()
     test_cli_pipeline()
+    test_crawling()
     test_download()
+    test_probing()
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 14101dda..e80d7dd8 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -351,6 +351,8 @@ def test_html2txt():
     mydoc = "Here is the body text"
     assert html2txt(mydoc) == "Here is the body text"
     assert html2txt(html.fromstring(mydoc)) == "Here is the body text"
+    assert html2txt("") == ""
+    assert html2txt("123") == ""
 
 
 def test_external():
@@ -429,6 +431,8 @@ def test_links():
     mydoc = html.fromstring('Test link text. This part of the text has to be long enough.')
     assert 'testlink.html' not in extract(mydoc)
     assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
+    # relative link conversion
+    assert '[Test link text.](https://www.example.com/testlink.html) This part of the text has to be long enough.' in extract(mydoc, url='https://www.example.com/', include_links=True, no_fallback=True, config=ZERO_CONFIG)
     # link without target
     mydoc = html.fromstring('Test link text. This part of the text has to be long enough.')
     assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
@@ -1050,6 +1054,54 @@ def test_list_processing():
     assert target_element.tail == 'tail'
 
 
+def test_code_blocks():
+    highlightjs = '''
+

Code:

+
code\n
+highlighted more code
+
+
''' + '' + testresult = extract(highlightjs, config=ZERO_CONFIG, output_format='xml') + assert 'code\nhighlighted more code\n' in testresult and 'quote' not in testresult + github = '''
$ pip install PyGithub
+ + + + +
+ ''' + testresult = extract(github, config=ZERO_CONFIG, output_format='xml') + assert '$ pip install PyGithub' in testresult and 'quote' not in testresult + inline_code = '

paragraph

here is some code

' + testresult = extract(inline_code, config=ZERO_CONFIG, output_format='xml') + assert 'some' in testresult and 'quote' not in testresult + w3schools = '''

Example

+

Create a class named Person, use the __init__() function to assign values +for name and age:

+
+ class Person:
  def __init__(self, name, age):
    + self.name = name
    self.age = age

p1 = Person("John", + 36)
+
print(p1.name)
print(p1.age)
+
'''
+    testresult = extract(w3schools, config=ZERO_CONFIG, output_format='xml')
+    expected = '''
+class Person:
+def __init__(self, name, age):
+self.name = name
+self.age = age
+p1 = Person("John",
+36)
+print(p1.name)
+print(p1.age) '''
+    assert expected in testresult and 'quote' not in testresult
+
+
 if __name__ == '__main__':
     test_trim()
     test_input()
@@ -1066,3 +1118,4 @@ def test_list_processing():
     test_tei()
     test_table_processing()
     test_list_processing()
+    test_code_blocks()
diff --git a/trafilatura/cli.py b/trafilatura/cli.py
index a66e1c6b..61093f15 100644
--- a/trafilatura/cli.py
+++ b/trafilatura/cli.py
@@ -15,7 +15,7 @@
 from . import __version__
 from .cli_utils import (load_blacklist, load_input_dict,
-                        build_exploration_dict, cli_crawler,
+                        build_exploration_dict, cli_crawler, probe_homepage,
                         file_processing_pipeline, url_processing_pipeline,
                         examine, write_result)
 from .feeds import find_feed_urls
@@ -104,6 +104,9 @@ def parse_args(args):
     group3_ex.add_argument("--explore",
                            help="explore the given websites (combination of sitemap and crawl)",
                            nargs='?', const=True, default=False)
+    group3_ex.add_argument("--probe",
+                           help="probe for extractable content (works best with target language)",
+                           nargs='?', const=True, default=False)
     group3.add_argument('--archived',
                         help='try to fetch URLs from the Internet Archive if downloads fail',
                         action="store_true")
@@ -279,7 +282,7 @@ def process_args(args):
     # processing according to mutually exclusive options
     # read url list from input file
-    if args.input_file and all([not args.crawl, not args.explore, not args.feed, not args.sitemap]):
+    if args.input_file and all([not args.crawl, not args.explore, not args.feed, not args.probe, not args.sitemap]):
         url_store = load_input_dict(args)
         error_caught = url_processing_pipeline(args, url_store)
@@ -314,6 +317,10 @@ def process_args(args):
     elif args.crawl:
         cli_crawler(args)
 
+    # probe and print only
+    elif args.probe:
+        probe_homepage(args)
+
     # read files from an input directory
     elif args.input_dir:
         file_processing_pipeline(args)
diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index 468b261b..04e3257f 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -22,8 +22,9 @@
 from trafilatura import spider
 
-from .core import extract
+from .core import extract, html2txt
 from .downloads import add_to_compressed_dict, buffered_downloads, load_download_buffer
+from .filters import LANGID_FLAG, language_classifier
 from .hashing import generate_hash_filename
 from .utils import uniquify_list, URL_BLACKLIST_REGEX
 from .settings import (use_config, FILENAME_LEN,
@@ -56,6 +57,8 @@ def load_input_urls(args):
         input_urls = [args.crawl]
     elif args.explore:
         input_urls = [args.explore]
+    elif args.probe:
+        input_urls = [args.probe]
     elif args.feed:
         input_urls = [args.feed]
     elif args.sitemap:
@@ -285,6 +288,16 @@ def cli_crawler(args, n=30, url_store=None):
     #return todo, known_links
 
 
+def probe_homepage(args):
+    "Probe websites for extractable content and print the URLs that qualify."
+    input_urls = load_input_urls(args)
+    for url, result in buffered_downloads(input_urls, args.parallel):
+        if result is not None:
+            result = html2txt(result)
+            if result and (not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language):
+                print(url, flush=True)
+
+
 def url_processing_pipeline(args, url_store):
     '''Aggregated functions to show a list and download and process an input list'''
     # print list without further processing
diff --git a/trafilatura/core.py b/trafilatura/core.py
index 0afdf95c..2976c578 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -213,8 +213,34 @@ def handle_lists(element, options):
     return None
 
 
+def get_code_block_element(element):
+    # GitHub
+    parent = element.getparent()
+    if parent is not None and 'highlight' in parent.get('class', default=''):
+        return element
+    # highlightjs
+    code = element.find('code')
+    if code is not None and len(element.getchildren()) == 1:
+        return code
+    return None
+
+
+def handle_code_blocks(element, code):
+    processed_element = Element('code')
+    for child in element.iter('*'):
+        if child.tag == 'lb':
+            child.text = '\n'
+        child.tag = 'done'
+    processed_element.text = ''.join(code.itertext())
+    return processed_element
+
+
 def handle_quotes(element, options):
     '''Process quotes elements'''
+    code = get_code_block_element(element)
+    if code is not None:
+        return handle_code_blocks(element, code)
+
     processed_element = Element(element.tag)
     for child in element.iter('*'):
         processed_child = process_node(child, options)  # handle_textnode(child, comments_fix=True)
@@ -231,6 +257,9 @@
 def handle_other_elements(element, potential_tags, options):
     '''Handle diverse or unknown elements in the scope of relevant tags'''
+    # handle w3schools code
+    if element.tag == 'div' and 'w3-code' in element.get('class', default=''):
+        return handle_code_blocks(element, element)
     # delete unwanted
     if element.tag not in potential_tags:
         if element.tag != 'done':
@@ -460,7 +489,7 @@ def recover_wild_text(tree, result_body, options, potential_tags=TAG_CATALOG):
     '''Look for all previously unconsidered wild elements, including outside of the determined
        frame and throughout the document to recover potentially missing text parts'''
     LOGGER.debug('Recovering wild text elements')
-    search_expr = './/blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table'
+    search_expr = './/blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table|.//div[contains(@class, \'w3-code\')]'
     if options.recall is True:
         potential_tags.update(['div', 'lb'])
         search_expr += '|.//div|.//lb|.//list'
@@ -760,10 +789,12 @@ def html2txt(content):
         content: HTML document as string or LXML element.
 
     Returns:
-        The extracted text in the form of a string.
+        The extracted text as a string, or an empty string if extraction fails.
 
     """
     tree = load_html(content)
+    if tree is None:
+        return ''
     return ' '.join(tree.text_content().split()).strip()
 
 
@@ -913,7 +944,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
         cleaned_tree_backup = deepcopy(cleaned_tree)
 
     # convert tags, the rest does not work without conversion
-    cleaned_tree = convert_tags(cleaned_tree, options)
+    cleaned_tree = convert_tags(cleaned_tree, options, url or document.url)
 
     # comments first, then remove
     if include_comments is True:
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index daa107f5..fe35d3bc 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -32,7 +32,7 @@
 from . import __version__
 from .settings import DEFAULT_CONFIG
-from .utils import decode_response, uniquify_list, URL_BLACKLIST_REGEX
+from .utils import decode_response, make_chunks, uniquify_list, URL_BLACKLIST_REGEX
 
 
 NUM_CONNECTIONS = 50
@@ -250,12 +250,12 @@ def load_download_buffer(url_store, sleep_time=5):
 
 def buffered_downloads(bufferlist, download_threads, decode=True):
     '''Download queue consumer, single- or multi-threaded.'''
-    # start several threads
     with ThreadPoolExecutor(max_workers=download_threads) as executor:
-        future_to_url = {executor.submit(fetch_url, url, decode): url for url in bufferlist}
-        for future in as_completed(future_to_url):
-            # url and download result
-            yield future_to_url[future], future.result()
+        for chunk in make_chunks(bufferlist, 10000):
+            future_to_url = {executor.submit(fetch_url, url, decode): url for url in chunk}
+            for future in as_completed(future_to_url):
+                # url and download result
+                yield future_to_url[future], future.result()
 
 
 def _send_pycurl_request(url, no_ssl, config):
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index b2dc5cb4..6d86e7f9 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -11,6 +11,7 @@
 from collections import defaultdict
 from copy import deepcopy
 
+from courlan.urlutils import get_base_url, fix_relative_urls
 from lxml.etree import strip_tags
 from lxml.html.clean import Cleaner
 
@@ -219,7 +220,7 @@ def delete_by_link_density(subtree, tagname, backtracking=False, favor_precision
     return subtree
 
 
-def convert_tags(tree, options):
+def convert_tags(tree, options, url=None):
     '''Simplify markup and convert relevant HTML tags to an XML standard'''
     # delete links for faster processing
     if options.links is False:
@@ -233,12 +234,17 @@ def convert_tags(tree, options):
             # strip the rest
             strip_tags(tree, 'a')
     else:
+        # get base URL for converting relative URLs
+        base_url = url and get_base_url(url)
         for elem in tree.iter('a', 'ref'):
             elem.tag = 'ref'
             # replace href attribute and delete the rest
             target = elem.get('href')  # defaults to None
             elem.attrib.clear()
             if target is not None:
+                # convert relative URLs
+                if base_url is not None:
+                    target = fix_relative_urls(base_url, target)
                 elem.set('target', target)
     # include_formatting
     if options.formatting is False:
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index 58f3b728..fd73fbf4 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -17,9 +17,10 @@
     brotli = None
 
 from difflib import SequenceMatcher
-from gzip import decompress
 from functools import lru_cache
+from gzip import decompress
 from html import unescape
+from itertools import islice
 from unicodedata import normalize
 
 # CChardet is faster and can be more accurate
@@ -30,7 +31,6 @@
 from charset_normalizer import from_bytes
 
 from lxml.html import HtmlElement, HTMLParser, fromstring
-# from lxml.html.soupparser import fromstring as fromsoup
 
 # response types
 from urllib3.response import HTTPResponse
@@ -212,20 +212,13 @@ def load_html(htmlobject):
             tree = fromstring(htmlobject, parser=HTML_PARSER)
         except ValueError:
             # "Unicode strings with encoding declaration are not supported."
-            fallback_parse = True
             tree = fromstring_bytes(htmlobject)
+            fallback_parse = True
         except Exception as err:
             LOGGER.error('lxml parsing failed: %s', err)
     # second pass: try passing bytes to LXML
-    if (tree is None or len(tree) < 2) and fallback_parse is False:
+    if (tree is None or len(tree) < 1) and not fallback_parse:
         tree = fromstring_bytes(htmlobject)
-    # more robust option: try BeautifulSoup?
-    #if tree is None or not isinstance(tree, HtmlElement):
-    #    if isinstance(htmlobject, (bytes, str)):
-    #        try:
-    #            tree = fromsoup(htmlobject)
-    #        except Exception as err:
-    #            LOGGER.error('BS parser error: %s', err)
     # rejection test: is it (well-formed) HTML at all?
     # log parsing errors
     if tree is not None and check_flag is True and len(tree) < 2:
@@ -381,3 +374,10 @@ def is_similar_domain(reference, new_string, threshold=0.5):
     if SequenceMatcher(None, reference, new_string).ratio() < threshold:
         return False
     return True
+
+
+def make_chunks(data, size):
+    "Chunk data into smaller pieces."
+    iterator = iter(data)
+    for _ in range(0, len(data), size):
+        yield list(islice(iterator, size))
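
The make_chunks() helper above is what lets buffered_downloads() in downloads.py submit work to the thread pool in slices of 10,000 URLs instead of creating one future per input URL up front. A self-contained copy, run with a deliberately small size so the chunking is visible:

from itertools import islice

def make_chunks(data, size):
    "Chunk data into smaller pieces."
    iterator = iter(data)
    for _ in range(0, len(data), size):
        yield list(islice(iterator, size))

# three slices: [[0, 1, 2], [3, 4, 5], [6]]
print(list(make_chunks(list(range(7)), 3)))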
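
The relative-link conversion wired into convert_tags() and exposed through extract(..., include_links=True, url=...) rests on two helpers from courlan, which trafilatura already depends on. A minimal sketch of what they do, with made-up URLs:

from courlan.urlutils import get_base_url, fix_relative_urls

# derive the base from the page that was downloaded
base = get_base_url('https://www.example.org/blog/post.html')  # 'https://www.example.org'
# relative targets are resolved against that base, absolute ones pass through unchanged
print(fix_relative_urls(base, 'testlink.html'))
print(fix_relative_urls(base, 'https://other.example/page'))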
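
The new --probe option (e.g. trafilatura --probe "https://www.example.org" --target-language de) downloads each input URL, converts it with html2txt() and prints the URL only if some text comes back and, where language detection is available and a target language is set, if the detected language matches. A rough equivalent using only the public Python API, with a placeholder URL:

from trafilatura import fetch_url, html2txt

url = 'https://www.example.org'
downloaded = fetch_url(url)
# html2txt() now returns an empty string instead of failing on unparsable input,
# so a plain truthiness check is enough here
if downloaded and html2txt(downloaded):
    print(url)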
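
The code-block handling added to core.py recognises GitHub-style wrappers (a pre inside an element whose class contains "highlight"), highlight.js-style pre/code pairs and w3schools "w3-code" divs, and renders them as a single code element rather than a quote. A made-up minimal example of the GitHub-style case; the exact output depends on the configuration, see the new test_code_blocks() for the settings used in the test suite:

from trafilatura import extract

doc = ('<html><body><article><p>Some introductory sentence that is long enough '
       'to be kept by the extractor.</p>'
       '<div class="highlight"><pre>$ pip install trafilatura</pre></div>'
       '</article></body></html>')
# with XML output the snippet should appear inside a <code> element, not a <quote>
print(extract(doc, output_format='xml', no_fallback=True))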