maintenance: deprecate 3.6 & 3.7 and simplify code base (#709)
* maintenance: deprecate 3.6 & 3.7 and simplify code base

* update setup

* fix tests by removing use of urllib.robotparser

* sort imports
adbar authored Oct 4, 2024
1 parent 35ec481 commit f2ca512
Showing 9 changed files with 34 additions and 69 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/tests.yml
@@ -26,10 +26,6 @@ jobs:
         PROXY_TEST: "true"
       include:
         # custom python versions
-        - os: ubuntu-20.04
-          python-version: 3.6
-        - os: ubuntu-20.04
-          python-version: 3.7
         - os: ubuntu-20.04
           python-version: 3.8
         - os: macos-13
17 changes: 6 additions & 11 deletions setup.py
@@ -28,7 +28,7 @@ def get_long_description():
     "brotli",
     "cchardet >= 2.1.7; python_version < '3.11'", # build issue
     "faust-cchardet >= 2.1.19; python_version >= '3.11'",
-    "htmldate[speed] >= 1.8.1",
+    "htmldate[speed] >= 1.9.0",
     "py3langid >= 0.2.2",
     "pycurl >= 7.45.3",
     "urllib3[socks]",
@@ -60,8 +60,6 @@ def get_long_description():
     "Operating System :: POSIX",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.6",
-    "Programming Language :: Python :: 3.7",
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
@@ -106,20 +104,17 @@ def get_long_description():
         ]
     },
     include_package_data=True,
-    python_requires=">=3.6",
+    python_requires=">=3.8",
     install_requires=[
         "certifi",
-        "charset_normalizer >= 3.0.1; python_version < '3.7'",
-        "charset_normalizer >= 3.2.0; python_version >= '3.7'",
-        "courlan >= 1.2.0",
-        "htmldate >= 1.8.1",
-        "importlib_metadata; python_version < '3.8'",
+        "charset_normalizer >= 3.2.0",
+        "courlan >= 1.3.1",
+        "htmldate >= 1.9.0",
         "justext >= 3.0.1",
         # see tests on Github Actions
         "lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
         "lxml >= 5.2.2 ; platform_system != 'Darwin' or python_version > '3.8'",
-        "urllib3 >= 1.26, < 2; python_version < '3.7'",
-        "urllib3 >= 1.26, < 3; python_version >= '3.7'",
+        "urllib3 >= 1.26, < 3",
     ],
     extras_require=extras,
     entry_points={
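Note on the simplified requirements above: the remaining "; python_version ..." suffixes are PEP 508 environment markers, which pip evaluates against the running interpreter. A minimal sketch of that evaluation, using the third-party packaging library (the library choice is an assumption for illustration, not part of this commit):

from packaging.markers import Marker

# pip applies the same logic when deciding whether to install a pinned variant
marker = Marker("python_version < '3.11'")
print(marker.evaluate())  # True on Python 3.10, False on 3.11 and later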
22 changes: 5 additions & 17 deletions trafilatura/cli.py
@@ -6,11 +6,7 @@
 import logging
 import sys
 
-try: # Python 3.8+
-    from importlib.metadata import version
-except ImportError:
-    from importlib_metadata import version
-
+from importlib.metadata import version
 from platform import python_version
 from typing import Any
 
@@ -21,18 +17,10 @@
 from .settings import PARALLEL_CORES, SUPPORTED_FMT_CLI
 
 # fix output encoding on some systems
-try:
-    # > Python 3.7
-    if sys.stdout.encoding != 'UTF-8':
-        sys.stdout.reconfigure(encoding='utf-8')
-    if sys.stderr.encoding != 'UTF-8':
-        sys.stderr.reconfigure(encoding='utf-8')
-except AttributeError:
-    import codecs
-    if sys.stdout.encoding != 'UTF-8':
-        sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
-    if sys.stderr.encoding != 'UTF-8':
-        sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer, 'strict')
+if sys.stdout.encoding != 'UTF-8':
+    sys.stdout.reconfigure(encoding='utf-8')
+if sys.stderr.encoding != 'UTF-8':
+    sys.stderr.reconfigure(encoding='utf-8')
 
 
 def add_args(parser: Any) -> Any:
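The codecs-based fallback above could be dropped because TextIOWrapper.reconfigure() has existed since Python 3.7, so a floor of 3.8 guarantees it. A self-contained sketch of the retained approach:

import sys

# reconfigure() (Python 3.7+) swaps the stream encoding in place,
# replacing the older codecs.getwriter() wrapper trick
if sys.stdout.encoding != 'UTF-8':
    sys.stdout.reconfigure(encoding='utf-8')
if sys.stderr.encoding != 'UTF-8':
    sys.stderr.reconfigure(encoding='utf-8')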
3 changes: 2 additions & 1 deletion trafilatura/deduplication.py
@@ -1,6 +1,7 @@
"Code parts dedicated to duplicate removal and text similarity."

# 3.7+: from __future__ import annotations # 3.11+: from typing import Self
# from __future__ import annotations
# 3.11+: from typing import Self

import re
import string
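For context on the split comment: from __future__ import annotations postpones annotation evaluation on all supported versions, while typing.Self (3.11+) names the current class directly. A hypothetical sketch, not code from this module:

from __future__ import annotations


class Simhash:
    # with the future import, the class name is usable in its own annotations;
    # on Python 3.11+, "from typing import Self" and "-> Self" would do the same
    def copy(self) -> Simhash:  # hypothetical method, for illustration only
        return self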
13 changes: 5 additions & 8 deletions trafilatura/downloads.py
@@ -10,9 +10,11 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from configparser import ConfigParser
 from functools import partial
+from importlib.metadata import version
 from io import BytesIO
 from time import sleep
-from typing import Any, ByteString, Dict, Generator, List, Optional, Set, Tuple, Union
+from typing import (Any, ByteString, Dict, Generator, List, Optional, Set,
+                    Tuple, Union)
 
 import certifi
 import urllib3
@@ -21,8 +23,8 @@
 from courlan.network import redirection_test
 
 from .settings import DEFAULT_CONFIG, Extractor
-from .utils import URL_BLACKLIST_REGEX, decode_file, is_acceptable_length, make_chunks
-
+from .utils import (URL_BLACKLIST_REGEX, decode_file, is_acceptable_length,
+                    make_chunks)
 
 try:
     from urllib3.contrib.socks import SOCKSProxyManager
@@ -43,11 +45,6 @@
 except ImportError:
     HAS_PYCURL = False
 
-try: # Python 3.8+
-    from importlib.metadata import version
-except ImportError:
-    from importlib_metadata import version
-
 
 LOGGER = logging.getLogger(__name__)
 
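With 3.8 as the minimum version, importlib.metadata is always available in the standard library, so the importlib_metadata backport and the try/except import above disappear here and in trafilatura/xml.py. A minimal usage sketch:

from importlib.metadata import PackageNotFoundError, version

try:
    print(version("trafilatura"))  # version string of the installed package
except PackageNotFoundError:
    print("package not installed")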
3 changes: 1 addition & 2 deletions trafilatura/settings.py
@@ -76,7 +76,6 @@ class Extractor:
         'date_params',
         'author_blacklist', 'url_blacklist'
     ]
-    # consider dataclasses for Python 3.7+
     def __init__(self, *, config=DEFAULT_CONFIG, output_format="txt",
                  fast=False, precision=False, recall=False,
                  comments=True, formatting=False, links=False, images=False,
@@ -144,7 +143,7 @@ def set_date_params(extensive: bool = True):
 }
 
 
-class Document: # consider dataclasses for Python 3.7+
+class Document:
     "Defines a class to store all necessary data and metadata fields for extracted information."
     __slots__ = [
         'title', 'author', 'url', 'hostname', 'description', 'sitename',
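The dropped "consider dataclasses" notes referred to replacing these hand-written __slots__ classes. A hypothetical sketch of that alternative (the class name and fields here are invented; note that dataclass slots support needs Python 3.10, still above the new 3.8 floor):

from dataclasses import dataclass, field


@dataclass
class DocumentSketch:  # hypothetical stand-in, not the real Document class
    title: str = ""
    author: str = ""
    categories: list = field(default_factory=list)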
17 changes: 9 additions & 8 deletions trafilatura/sitemaps.py
@@ -22,9 +22,6 @@
 from .downloads import fetch_url, is_live_page
 from .settings import MAX_LINKS, MAX_SITEMAPS_SEEN
 
-# import urllib.robotparser # Python >= 3.8
-# ROBOT_PARSER = urllib.robotparser.RobotFileParser()
-
 
 LOGGER = logging.getLogger(__name__)
 
@@ -277,13 +274,14 @@ def find_robots_sitemaps(baseurl: str) -> List[str]:
     return extract_robots_sitemaps(robotstxt, baseurl)
 
 
-def extract_robots_sitemaps(robotstxt: str, baseurl: str) -> List[str]:
+def extract_robots_sitemaps(robotstxt: Optional[str], baseurl: str) -> List[str]:
     "Read a robots.txt file and find sitemap links."
     # sanity check on length (cause: redirections)
     if robotstxt is None or len(robotstxt) > 10000:
         return []
-    sitemapurls = []
-    # source: https://github.com/python/cpython/blob/3.8/Lib/urllib/robotparser.py
+
+    candidates = []
+    # source: https://github.com/python/cpython/blob/3.12/Lib/urllib/robotparser.py
     for line in robotstxt.splitlines():
         # remove optional comment and strip line
         i = line.find("#")
@@ -297,7 +295,10 @@ def extract_robots_sitemaps(robotstxt: str, baseurl: str) -> List[str]:
             line[0] = line[0].strip().lower()
             if line[0] == "sitemap":
                 # urllib.parse.unquote(line[1].strip())
-                candidate = fix_relative_urls(baseurl, line[1].strip())
-                sitemapurls.append(candidate)
+                candidates.append(line[1].strip())
+
+    candidates = list(dict.fromkeys(candidates))
+    sitemapurls = [fix_relative_urls(baseurl, u) for u in candidates if u]
+
     LOGGER.debug("%s sitemaps found in robots.txt", len(sitemapurls))
     return sitemapurls
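The rewrite collects raw candidate URLs first, de-duplicates them, then resolves them against the base URL. The de-duplication idiom relies on dict preserving insertion order (guaranteed since Python 3.7):

# order-preserving de-duplication, as used above
candidates = ["/sitemap.xml", "/news.xml", "/sitemap.xml"]
assert list(dict.fromkeys(candidates)) == ["/sitemap.xml", "/news.xml"]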
18 changes: 5 additions & 13 deletions trafilatura/utils.py
@@ -361,19 +361,11 @@ def is_image_file(imagesrc):
 
 
 def make_chunks(iterable, n):
-    """
-    Chunk data into smaller pieces.
-    https://docs.python.org/3/library/itertools.html
-    """
-    it = iter(iterable)
-    while True:
-        chunk = tuple(islice(it, n))
-        if not chunk:
-            return
-        yield chunk
-    # Python 3.8+ with walrus operator
-    # while batch := tuple(islice(it, n)):
-    #     yield batch
+    "Chunk data into smaller pieces."
+    # 3.12+: https://docs.python.org/3/library/itertools.html#itertools.batched
+    iterator = iter(iterable)
+    while batch := tuple(islice(iterator, n)):
+        yield batch


def is_acceptable_length(my_len, options) -> bool:
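The rewritten make_chunks uses the walrus operator (3.8+): the loop ends as soon as islice() returns an empty tuple. A self-contained demo, including the itertools.batched equivalent the new comment points to:

from itertools import islice


def make_chunks(iterable, n):
    "Chunk data into smaller pieces."
    iterator = iter(iterable)
    while batch := tuple(islice(iterator, n)):
        yield batch


assert list(make_chunks(range(7), 3)) == [(0, 1, 2), (3, 4, 5), (6,)]
# on Python 3.12+, itertools.batched(range(7), 3) yields the same tuples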
6 changes: 1 addition & 5 deletions trafilatura/xml.py
@@ -7,16 +7,12 @@
 import logging
 
 from html import unescape
+from importlib.metadata import version
 from io import StringIO
 from json import dumps as json_dumps
 from pathlib import Path
 from typing import List, Optional
 
-try: # Python 3.8+
-    from importlib.metadata import version
-except ImportError:
-    from importlib_metadata import version
-
 from lxml.etree import (_Element, Element, SubElement, XMLParser,
                         fromstring, tostring, DTD)
 
