From a882c61cbcd80f7ea68650716df93a51ca186b50 Mon Sep 17 00:00:00 2001 From: Denis Moklaf Date: Wed, 2 Oct 2024 17:07:49 +0200 Subject: [PATCH] fix: set options.source before raising error on empty doc tree --- trafilatura/core.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/trafilatura/core.py b/trafilatura/core.py index a853cca9..f84a9283 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -147,11 +147,6 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False, # load data try: - tree = load_html(filecontent) - if tree is None: - LOGGER.error('empty HTML tree: %s', url) - raise ValueError - # regroup extraction options if not options or not isinstance(options, Extractor): options = Extractor( @@ -164,6 +159,12 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False, author_blacklist=author_blacklist, url_blacklist=url_blacklist, date_params=date_extraction_params ) + + # load the HTML tree + tree = load_html(filecontent) + if tree is None: + LOGGER.error('empty HTML tree: %s', url) + raise ValueError # quick and dirty HTML lang check if options.lang and (options.fast or not LANGID_FLAG):