Skip to content

Commit

Permalink
add checks to probing mode (#392)
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar authored Jul 11, 2023
1 parent f2e17de commit d78fbb5
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions trafilatura/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,11 +325,15 @@ def cli_crawler(args, n=30, url_store=None):
 def probe_homepage(args):
     "Probe websites for extractable content and print the fitting ones."
     input_urls = load_input_urls(args)
+    config = use_config(filename=args.config_file)
+    min_length = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')
+
     for url, result in buffered_downloads(input_urls, args.parallel):
         if result is not None:
             result = html2txt(result)
-            if result and (not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language):
-                print(url, flush=True)
+            if result and len(result) > min_length and any(c.isalpha() for c in result):
+                if not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language:
+                    print(url, flush=True)


 def url_processing_pipeline(args, url_store):
Expand Down

0 comments on commit d78fbb5

Please sign in to comment.