From ff38644e41341c20f8f13e970b0e94f83fa4a0ef Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Wed, 20 Mar 2024 16:24:17 +0100 Subject: [PATCH] prepare version 1.8.0 (#527) --- HISTORY.md | 21 +++++++++++++++++++++ setup.py | 10 +++++----- trafilatura/__init__.py | 2 +- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 0f51515b..a7c1809f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,27 @@ ## History / Changelog +### 1.8.0 + +Extraction: +- Better precision by @felipehertzer (#509, #520) +- Code formatting in TXT/Markdown output added (#498) +- Improved CSV output (#496) +- LXML: compile XPath expressions (#504) +- Overall speedup about +5% + +Downloads and Navigation: +- More robust scans with `is_live_page()` (#501) +- Better sitemap start and safeguards (#503, #506) +- Fix for headers in response object (#513) + +Maintenance: +- License changed to Apache 2.0 +- `Response` class: convenience functions added (#497) +- `lxml.html.Cleaner` removed (#491) +- CLI fixes: parallel cores and processing (#524) + + ### 1.7.0 Extraction: diff --git a/setup.py b/setup.py index cd591cca..63b7fdc9 100644 --- a/setup.py +++ b/setup.py @@ -31,9 +31,9 @@ def get_long_description(): "brotli", "cchardet >= 2.1.7; python_version < '3.11'", # build issue "faust-cchardet >= 2.1.19; python_version >= '3.11'", - "htmldate[speed] >= 1.7.0", + "htmldate[speed] >= 1.8.0", "py3langid >= 0.2.2", - "pycurl >= 7.45.2", + "pycurl >= 7.45.3", ], "gui": [ "Gooey >= 1.0.1", @@ -43,7 +43,7 @@ def get_long_description(): setup( name="trafilatura", version=get_version("trafilatura"), - description="Python package and command-line tool designed to gather text on the Web. It includes discovery, extraction and text processing components. Its main applications are web crawling, downloads, scraping, and extraction of main texts, metadata and comments.", + description="Python package and command-line tool designed to gather text on the Web, includes all necessary discovery and text processing components to perform web crawling, downloads, scraping, and extraction of main texts, metadata and comments.", long_description=get_long_description(), classifiers=[ # As from https://pypi.python.org/pypi?%3Aaction=list_classifiers @@ -111,8 +111,8 @@ def get_long_description(): "certifi", "charset_normalizer >= 3.0.1; python_version < '3.7'", "charset_normalizer >= 3.2.0; python_version >= '3.7'", - "courlan >= 0.9.5", - "htmldate >= 1.7.0", + "courlan >= 1.0.0", + "htmldate >= 1.8.0", "importlib_metadata; python_version < '3.8'", "justext >= 3.0.0", # see tests on Github Actions diff --git a/trafilatura/__init__.py b/trafilatura/__init__.py index 8341593b..39cb6a20 100644 --- a/trafilatura/__init__.py +++ b/trafilatura/__init__.py @@ -9,7 +9,7 @@ __author__ = 'Adrien Barbaresi and contributors' __license__ = "Apache-2.0" __copyright__ = 'Copyright 2019-2024, Adrien Barbaresi' -__version__ = '1.7.0' +__version__ = '1.8.0' import logging