Skip to content

Commit

Permalink
Merge branch 'master' into latest_courlan
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar authored Aug 7, 2023
2 parents 7077420 + d78fbb5 commit 5b2ee02
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 23 deletions.
14 changes: 10 additions & 4 deletions trafilatura/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,11 +325,15 @@ def cli_crawler(args, n=30, url_store=None):
def probe_homepage(args):
"Probe websites for extractable content and print the fitting ones."
input_urls = load_input_urls(args)
config = use_config(filename=args.config_file)
min_length = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')

for url, result in buffered_downloads(input_urls, args.parallel):
if result is not None:
result = html2txt(result)
if result and (not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language):
print(url, flush=True)
if result and len(result) > min_length and any(c.isalpha() for c in result):
if not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language:
print(url, flush=True)


def url_processing_pipeline(args, url_store):
Expand Down Expand Up @@ -366,14 +370,16 @@ def file_processing_pipeline(args):
filecounter = None
processing_cores = args.parallel or FILE_PROCESSING_CORES
config = use_config(filename=args.config_file)
timeout = config.getint('DEFAULT', 'EXTRACTION_TIMEOUT') or None

# max_tasks_per_child available in Python 3.11+
# max_tasks_per_child available in Python >= 3.11
with ProcessPoolExecutor(max_workers=processing_cores) as executor:
# chunk input: https://github.com/python/cpython/issues/74028
for filebatch in make_chunks(generate_filelist(args.input_dir), MAX_FILES_PER_DIRECTORY):
if filecounter is None and len(filebatch) >= MAX_FILES_PER_DIRECTORY:
filecounter = 0
worker = partial(file_processing, args=args, counter=filecounter, config=config)
executor.map(worker, filebatch, chunksize=10)
executor.map(worker, filebatch, chunksize=10, timeout=timeout)
# update counter
if filecounter is not None:
filecounter += len(filebatch)
Expand Down
18 changes: 0 additions & 18 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,6 @@

from copy import deepcopy

# SIGALRM isn't present on Windows, detect it
try:
from signal import signal, alarm, SIGALRM
HAS_SIGNAL = True
except ImportError:
HAS_SIGNAL = False

from lxml.etree import Element, SubElement, strip_elements, strip_tags
from lxml.html import tostring

Expand Down Expand Up @@ -1078,13 +1071,6 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
# configuration init
config = use_config(settingsfile, config)

# put timeout signal in place
if HAS_SIGNAL is True:
timeout = config.getint('DEFAULT', 'EXTRACTION_TIMEOUT')
if timeout > 0:
signal(SIGALRM, timeout_handler)
alarm(timeout)

# extraction
try:
document = bare_extraction(
Expand All @@ -1105,10 +1091,6 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
LOGGER.error('Processing timeout for %s', url)
document = None

# deactivate alarm signal
if HAS_SIGNAL is True and timeout > 0:
alarm(0)

# post-processing
if document is None:
return None
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/settings.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ MIN_EXTRACTED_COMM_SIZE = 1
MIN_OUTPUT_SIZE = 1
MIN_OUTPUT_COMM_SIZE = 1

# Set to 0 to disable signal
# CLI file processing only, set to 0 to disable
EXTRACTION_TIMEOUT = 30

# Deduplication
Expand Down

0 comments on commit 5b2ee02

Please sign in to comment.