Skip to content

Commit

Permalink
Merge branch 'master' into latest_courlan
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar authored Aug 7, 2023
2 parents 7077420 + d78fbb5 commit 5b2ee02
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 23 deletions.
14 changes: 10 additions & 4 deletions trafilatura/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,11 +325,15 @@ def cli_crawler(args, n=30, url_store=None):
def probe_homepage(args):
"Probe websites for extractable content and print the fitting ones."
input_urls = load_input_urls(args)
config = use_config(filename=args.config_file)
min_length = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')

for url, result in buffered_downloads(input_urls, args.parallel):
if result is not None:
result = html2txt(result)
if result and (not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language):
print(url, flush=True)
if result and len(result) > min_length and any(c.isalpha() for c in result):
if not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language:
print(url, flush=True)


def url_processing_pipeline(args, url_store):
Expand Down Expand Up @@ -366,14 +370,16 @@ def file_processing_pipeline(args):
filecounter = None
processing_cores = args.parallel or FILE_PROCESSING_CORES
config = use_config(filename=args.config_file)
timeout = config.getint('DEFAULT', 'EXTRACTION_TIMEOUT') or None

# max_tasks_per_child available in Python 3.11+
# max_tasks_per_child available in Python >= 3.11
with ProcessPoolExecutor(max_workers=processing_cores) as executor:
# chunk input: https://github.com/python/cpython/issues/74028
for filebatch in make_chunks(generate_filelist(args.input_dir), MAX_FILES_PER_DIRECTORY):
if filecounter is None and len(filebatch) >= MAX_FILES_PER_DIRECTORY:
filecounter = 0
worker = partial(file_processing, args=args, counter=filecounter, config=config)
executor.map(worker, filebatch, chunksize=10)
executor.map(worker, filebatch, chunksize=10, timeout=timeout)
# update counter
if filecounter is not None:
filecounter += len(filebatch)
Expand Down
18 changes: 0 additions & 18 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,6 @@

from copy import deepcopy

# SIGALRM isn't present on Windows, detect it
try:
from signal import signal, alarm, SIGALRM
HAS_SIGNAL = True
except ImportError:
HAS_SIGNAL = False

from lxml.etree import Element, SubElement, strip_elements, strip_tags
from lxml.html import tostring

Expand Down Expand Up @@ -1078,13 +1071,6 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
# configuration init
config = use_config(settingsfile, config)

# put timeout signal in place
if HAS_SIGNAL is True:
timeout = config.getint('DEFAULT', 'EXTRACTION_TIMEOUT')
if timeout > 0:
signal(SIGALRM, timeout_handler)
alarm(timeout)

# extraction
try:
document = bare_extraction(
Expand All @@ -1105,10 +1091,6 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
LOGGER.error('Processing timeout for %s', url)
document = None

# deactivate alarm signal
if HAS_SIGNAL is True and timeout > 0:
alarm(0)

# post-processing
if document is None:
return None
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/settings.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ MIN_EXTRACTED_COMM_SIZE = 1
MIN_OUTPUT_SIZE = 1
MIN_OUTPUT_COMM_SIZE = 1

# Set to 0 to disable signal
# CLI file processing only, set to 0 to disable
EXTRACTION_TIMEOUT = 30

# Deduplication
Expand Down

0 comments on commit 5b2ee02

Please sign in to comment.