diff --git a/HISTORY.md b/HISTORY.md index a8005e3f..0f51515b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,20 @@ ## History / Changelog +### 1.7.0 + +Extraction: +- improved `html2txt()` function + +Downloads: +- add advanced `fetch_response()` function +→ pending deprecation for `fetch_url(decode=False)` + +Maintenance: +- support for LXML v5+ (#484 by @knit-bee, #485) +- update [htmldate](https://github.com/adbar/htmldate/releases/tag/v1.7.0) + + ### 1.6.4 Maintenance: diff --git a/README.rst b/README.rst index a82de3c6..ba584a29 100644 --- a/README.rst +++ b/README.rst @@ -87,7 +87,7 @@ Evaluation and alternatives Trafilatura consistently outperforms other open-source libraries in text extraction benchmarks, showcasing its efficiency and accuracy in extracting web content. The extractor tries to strike a balance between limiting noise and including all valid parts. -For more detailed results see the `benchmark `_. The results can be reproduced, see the `evaluation readme _` for instructions. +For more detailed results see the `benchmark `_. The results can be reproduced, see the `evaluation readme `_ for instructions. =============================== ========= ========== ========= ========= ====== 750 documents, 2236 text & 2250 boilerplate segments (2022-05-18), Python 3.8 diff --git a/docs/usage-python.rst b/docs/usage-python.rst index a2492e42..7040fd86 100644 --- a/docs/usage-python.rst +++ b/docs/usage-python.rst @@ -313,18 +313,18 @@ The function ``bare_extraction`` can be used to bypass output conversion, it ret Raw HTTP response objects ^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``fetch_url()`` function can pass a urllib3 response object straight to the extraction by setting the optional ``decode`` argument to ``False``. +The ``fetch_response()`` function can pass a response object straight to the extraction. This can be useful to get the final redirection URL with ``response.url`` and then pass it directly as a URL argument to the extraction function: .. 
code-block:: python # necessary components - >>> from trafilatura import fetch_url, bare_extraction + >>> from trafilatura import fetch_response, bare_extraction # load an example - >>> response = fetch_url("https://www.example.org", decode=False) + >>> response = fetch_response("https://www.example.org") # perform extract() or bare_extraction() on Trafilatura's response object - >>> bare_extraction(response, url=response.url) # here is the redirection URL + >>> bare_extraction(response.data, url=response.url) # here is the redirection URL LXML objects diff --git a/setup.py b/setup.py index 54e93ffa..7dbc52ad 100644 --- a/setup.py +++ b/setup.py @@ -30,8 +30,8 @@ def get_long_description(): "all": [ "brotli", "cchardet >= 2.1.7; python_version < '3.11'", # build issue - "faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build - "htmldate[speed] >= 1.6.0", + "faust-cchardet >= 2.1.19; python_version >= '3.11'", + "htmldate[speed] >= 1.7.0", "py3langid >= 0.2.2", "pycurl >= 7.45.2", ], @@ -112,7 +112,7 @@ def get_long_description(): "charset_normalizer >= 3.0.1; python_version < '3.7'", "charset_normalizer >= 3.2.0; python_version >= '3.7'", "courlan >= 0.9.5", - "htmldate >= 1.6.1", + "htmldate >= 1.7.0", "importlib_metadata; python_version < '3.8'", "justext >= 3.0.0", # see tests on Github Actions diff --git a/trafilatura/__init__.py b/trafilatura/__init__.py index 190b9385..5ba69a39 100644 --- a/trafilatura/__init__.py +++ b/trafilatura/__init__.py @@ -9,7 +9,7 @@ __author__ = 'Adrien Barbaresi and contributors' __license__ = 'GNU GPL v3+' __copyright__ = 'Copyright 2019-2024, Adrien Barbaresi' -__version__ = '1.6.4' +__version__ = '1.7.0' import logging