diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index 533b9eaf..2be18268 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -325,11 +325,15 @@ def cli_crawler(args, n=30, url_store=None):
 def probe_homepage(args):
     "Probe websites for extractable content and print the fitting ones."
     input_urls = load_input_urls(args)
+    config = use_config(filename=args.config_file)
+    min_length = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')
+
     for url, result in buffered_downloads(input_urls, args.parallel):
         if result is not None:
             result = html2txt(result)
-            if result and (not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language):
-                print(url, flush=True)
+            if result and len(result) > min_length and any(c.isalpha() for c in result):
+                if not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language:
+                    print(url, flush=True)
 
 
 def url_processing_pipeline(args, url_store):
@@ -366,14 +370,16 @@ def file_processing_pipeline(args):
     filecounter = None
     processing_cores = args.parallel or FILE_PROCESSING_CORES
     config = use_config(filename=args.config_file)
+    timeout = config.getint('DEFAULT', 'EXTRACTION_TIMEOUT') or None
 
-    # max_tasks_per_child available in Python 3.11+
+    # max_tasks_per_child available in Python >= 3.11
     with ProcessPoolExecutor(max_workers=processing_cores) as executor:
+        # chunk input: https://github.com/python/cpython/issues/74028
         for filebatch in make_chunks(generate_filelist(args.input_dir), MAX_FILES_PER_DIRECTORY):
             if filecounter is None and len(filebatch) >= MAX_FILES_PER_DIRECTORY:
                 filecounter = 0
             worker = partial(file_processing, args=args, counter=filecounter, config=config)
-            executor.map(worker, filebatch, chunksize=10)
+            executor.map(worker, filebatch, chunksize=10, timeout=timeout)
             # update counter
             if filecounter is not None:
                 filecounter += len(filebatch)
diff --git a/trafilatura/core.py b/trafilatura/core.py
index 2976c578..4233d6ab 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -14,13 +14,6 @@
 
 from copy import deepcopy
 
-# SIGALRM isn't present on Windows, detect it
-try:
-    from signal import signal, alarm, SIGALRM
-    HAS_SIGNAL = True
-except ImportError:
-    HAS_SIGNAL = False
-
 from lxml.etree import Element, SubElement, strip_elements, strip_tags
 from lxml.html import tostring
 
@@ -1078,13 +1071,6 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
     # configuration init
     config = use_config(settingsfile, config)
 
-    # put timeout signal in place
-    if HAS_SIGNAL is True:
-        timeout = config.getint('DEFAULT', 'EXTRACTION_TIMEOUT')
-        if timeout > 0:
-            signal(SIGALRM, timeout_handler)
-            alarm(timeout)
-
     # extraction
     try:
         document = bare_extraction(
@@ -1105,10 +1091,6 @@
         LOGGER.error('Processing timeout for %s', url)
         document = None
 
-    # deactivate alarm signal
-    if HAS_SIGNAL is True and timeout > 0:
-        alarm(0)
-
     # post-processing
     if document is None:
         return None
diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg
index 65967765..29573a8c 100644
--- a/trafilatura/settings.cfg
+++ b/trafilatura/settings.cfg
@@ -19,7 +19,7 @@ MIN_EXTRACTED_COMM_SIZE = 1
 MIN_OUTPUT_SIZE = 1
 MIN_OUTPUT_COMM_SIZE = 1
 
-# Set to 0 to disable signal
+# CLI file processing only, set to 0 to disable
 EXTRACTION_TIMEOUT = 30
 
 # Deduplication
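
Note on the replacement technique (not part of the patch above): the removed `SIGALRM` handling only ever worked on Unix, since Windows lacks that signal, whereas the `timeout` parameter of `concurrent.futures.Executor.map()` is cross-platform. One stdlib subtlety worth keeping in mind: this timeout is only enforced while the iterator returned by `map()` is consumed, and it is measured from the original call to `map()`, at which point `concurrent.futures.TimeoutError` is raised. A minimal, self-contained sketch of the mechanism; `slow_worker` and its inputs are hypothetical stand-ins, not trafilatura code:

```python
# Sketch of an executor-level timeout; slow_worker and its
# inputs are hypothetical illustration, not trafilatura code.
import time
from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeout

def slow_worker(seconds: int) -> int:
    # stand-in for a potentially slow per-file extraction job
    time.sleep(seconds)
    return seconds

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as executor:
        results = executor.map(slow_worker, [1, 1, 10], timeout=3)
        try:
            # the timeout fires on iteration, measured from the map() call
            for value in results:
                print(value)
        except FuturesTimeout:
            print("a job exceeded the 3-second budget")
```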