maintenance: deprecate 3.6 & 3.7 and simplify code base (#709)
* maintenance: deprecate 3.6 & 3.7 and simplify code base

* update setup

* fix tests by removing use of urllib.robotparser

* sort imports
adbar authored Oct 4, 2024
1 parent 35ec481 commit f2ca512
Showing 9 changed files with 34 additions and 69 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/tests.yml
@@ -26,10 +26,6 @@ jobs:
         PROXY_TEST: "true"
       include:
         # custom python versions
-        - os: ubuntu-20.04
-          python-version: 3.6
-        - os: ubuntu-20.04
-          python-version: 3.7
         - os: ubuntu-20.04
           python-version: 3.8
         - os: macos-13
17 changes: 6 additions & 11 deletions setup.py
@@ -28,7 +28,7 @@ def get_long_description():
     "brotli",
     "cchardet >= 2.1.7; python_version < '3.11'", # build issue
     "faust-cchardet >= 2.1.19; python_version >= '3.11'",
-    "htmldate[speed] >= 1.8.1",
+    "htmldate[speed] >= 1.9.0",
     "py3langid >= 0.2.2",
     "pycurl >= 7.45.3",
     "urllib3[socks]",
@@ -60,8 +60,6 @@ def get_long_description():
     "Operating System :: POSIX",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.6",
-    "Programming Language :: Python :: 3.7",
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
@@ -106,20 +104,17 @@ def get_long_description():
         ]
     },
     include_package_data=True,
-    python_requires=">=3.6",
+    python_requires=">=3.8",
     install_requires=[
         "certifi",
-        "charset_normalizer >= 3.0.1; python_version < '3.7'",
-        "charset_normalizer >= 3.2.0; python_version >= '3.7'",
-        "courlan >= 1.2.0",
-        "htmldate >= 1.8.1",
-        "importlib_metadata; python_version < '3.8'",
+        "charset_normalizer >= 3.2.0",
+        "courlan >= 1.3.1",
+        "htmldate >= 1.9.0",
         "justext >= 3.0.1",
         # see tests on Github Actions
         "lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
         "lxml >= 5.2.2 ; platform_system != 'Darwin' or python_version > '3.8'",
-        "urllib3 >= 1.26, < 2; python_version < '3.7'",
-        "urllib3 >= 1.26, < 3; python_version >= '3.7'",
+        "urllib3 >= 1.26, < 3",
     ],
     extras_require=extras,
     entry_points={
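Note on the simplified requirements above: the remaining "; python_version ..." suffixes are PEP 508 environment markers, which pip evaluates against the running interpreter. A minimal sketch of that evaluation, using the third-party packaging library (the library choice is an assumption for illustration, not part of this commit):

from packaging.markers import Marker

# pip applies the same logic when deciding whether to install a pinned variant
marker = Marker("python_version < '3.11'")
print(marker.evaluate())  # True on Python 3.10, False on 3.11 and later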
22 changes: 5 additions & 17 deletions trafilatura/cli.py
@@ -6,11 +6,7 @@
 import logging
 import sys
 
-try: # Python 3.8+
-    from importlib.metadata import version
-except ImportError:
-    from importlib_metadata import version
-
+from importlib.metadata import version
 from platform import python_version
 from typing import Any
 
@@ -21,18 +17,10 @@
 from .settings import PARALLEL_CORES, SUPPORTED_FMT_CLI
 
 # fix output encoding on some systems
-try:
-    # > Python 3.7
-    if sys.stdout.encoding != 'UTF-8':
-        sys.stdout.reconfigure(encoding='utf-8')
-    if sys.stderr.encoding != 'UTF-8':
-        sys.stderr.reconfigure(encoding='utf-8')
-except AttributeError:
-    import codecs
-    if sys.stdout.encoding != 'UTF-8':
-        sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
-    if sys.stderr.encoding != 'UTF-8':
-        sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer, 'strict')
+if sys.stdout.encoding != 'UTF-8':
+    sys.stdout.reconfigure(encoding='utf-8')
+if sys.stderr.encoding != 'UTF-8':
+    sys.stderr.reconfigure(encoding='utf-8')
 
 
 def add_args(parser: Any) -> Any:
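The codecs-based fallback above could be dropped because TextIOWrapper.reconfigure() has existed since Python 3.7, so a floor of 3.8 guarantees it. A self-contained sketch of the retained approach:

import sys

# reconfigure() (Python 3.7+) swaps the stream encoding in place,
# replacing the older codecs.getwriter() wrapper trick
if sys.stdout.encoding != 'UTF-8':
    sys.stdout.reconfigure(encoding='utf-8')
if sys.stderr.encoding != 'UTF-8':
    sys.stderr.reconfigure(encoding='utf-8')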
3 changes: 2 additions & 1 deletion trafilatura/deduplication.py
@@ -1,6 +1,7 @@
"Code parts dedicated to duplicate removal and text similarity."

# 3.7+: from __future__ import annotations # 3.11+: from typing import Self
# from __future__ import annotations
# 3.11+: from typing import Self

import re
import string
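For context on the split comment: from __future__ import annotations postpones annotation evaluation on all supported versions, while typing.Self (3.11+) names the current class directly. A hypothetical sketch, not code from this module:

from __future__ import annotations


class Simhash:
    # with the future import, the class name is usable in its own annotations;
    # on Python 3.11+, "from typing import Self" and "-> Self" would do the same
    def copy(self) -> Simhash:  # hypothetical method, for illustration only
        return self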
13 changes: 5 additions & 8 deletions trafilatura/downloads.py
@@ -10,9 +10,11 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from configparser import ConfigParser
 from functools import partial
+from importlib.metadata import version
 from io import BytesIO
 from time import sleep
-from typing import Any, ByteString, Dict, Generator, List, Optional, Set, Tuple, Union
+from typing import (Any, ByteString, Dict, Generator, List, Optional, Set,
+                    Tuple, Union)
 
 import certifi
 import urllib3
@@ -21,8 +23,8 @@
 from courlan.network import redirection_test
 
 from .settings import DEFAULT_CONFIG, Extractor
-from .utils import URL_BLACKLIST_REGEX, decode_file, is_acceptable_length, make_chunks
-
+from .utils import (URL_BLACKLIST_REGEX, decode_file, is_acceptable_length,
+                    make_chunks)
 
 try:
     from urllib3.contrib.socks import SOCKSProxyManager
@@ -43,11 +45,6 @@
 except ImportError:
     HAS_PYCURL = False
 
-try: # Python 3.8+
-    from importlib.metadata import version
-except ImportError:
-    from importlib_metadata import version
-
 
 LOGGER = logging.getLogger(__name__)
 
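With 3.8 as the minimum version, importlib.metadata is always available in the standard library, so the importlib_metadata backport and the try/except import above disappear here and in trafilatura/xml.py. A minimal usage sketch:

from importlib.metadata import PackageNotFoundError, version

try:
    print(version("trafilatura"))  # version string of the installed package
except PackageNotFoundError:
    print("package not installed")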
3 changes: 1 addition & 2 deletions trafilatura/settings.py
@@ -76,7 +76,6 @@ class Extractor:
         'date_params',
         'author_blacklist', 'url_blacklist'
     ]
-    # consider dataclasses for Python 3.7+
     def __init__(self, *, config=DEFAULT_CONFIG, output_format="txt",
                  fast=False, precision=False, recall=False,
                  comments=True, formatting=False, links=False, images=False,
@@ -144,7 +143,7 @@ def set_date_params(extensive: bool = True):
 }
 
 
-class Document: # consider dataclasses for Python 3.7+
+class Document:
     "Defines a class to store all necessary data and metadata fields for extracted information."
     __slots__ = [
         'title', 'author', 'url', 'hostname', 'description', 'sitename',
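The dropped "consider dataclasses" notes referred to replacing these hand-written __slots__ classes. A hypothetical sketch of that alternative (the class name and fields here are invented; note that dataclass slots support needs Python 3.10, still above the new 3.8 floor):

from dataclasses import dataclass, field


@dataclass
class DocumentSketch:  # hypothetical stand-in, not the real Document class
    title: str = ""
    author: str = ""
    categories: list = field(default_factory=list)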
17 changes: 9 additions & 8 deletions trafilatura/sitemaps.py
@@ -22,9 +22,6 @@
 from .downloads import fetch_url, is_live_page
 from .settings import MAX_LINKS, MAX_SITEMAPS_SEEN
 
-# import urllib.robotparser # Python >= 3.8
-# ROBOT_PARSER = urllib.robotparser.RobotFileParser()
-
 
 LOGGER = logging.getLogger(__name__)
 
@@ -277,13 +274,14 @@ def find_robots_sitemaps(baseurl: str) -> List[str]:
     return extract_robots_sitemaps(robotstxt, baseurl)
 
 
-def extract_robots_sitemaps(robotstxt: str, baseurl: str) -> List[str]:
+def extract_robots_sitemaps(robotstxt: Optional[str], baseurl: str) -> List[str]:
     "Read a robots.txt file and find sitemap links."
     # sanity check on length (cause: redirections)
     if robotstxt is None or len(robotstxt) > 10000:
         return []
-    sitemapurls = []
-    # source: https://github.com/python/cpython/blob/3.8/Lib/urllib/robotparser.py
+
+    candidates = []
+    # source: https://github.com/python/cpython/blob/3.12/Lib/urllib/robotparser.py
     for line in robotstxt.splitlines():
         # remove optional comment and strip line
         i = line.find("#")
@@ -297,7 +295,10 @@ def extract_robots_sitemaps(robotstxt: str, baseurl: str) -> List[str]:
             line[0] = line[0].strip().lower()
             if line[0] == "sitemap":
                 # urllib.parse.unquote(line[1].strip())
-                candidate = fix_relative_urls(baseurl, line[1].strip())
-                sitemapurls.append(candidate)
+                candidates.append(line[1].strip())
+
+    candidates = list(dict.fromkeys(candidates))
+    sitemapurls = [fix_relative_urls(baseurl, u) for u in candidates if u]
+
     LOGGER.debug("%s sitemaps found in robots.txt", len(sitemapurls))
     return sitemapurls
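The rewrite collects raw candidate URLs first, de-duplicates them, then resolves them against the base URL. The de-duplication idiom relies on dict preserving insertion order (guaranteed since Python 3.7):

# order-preserving de-duplication, as used above
candidates = ["/sitemap.xml", "/news.xml", "/sitemap.xml"]
assert list(dict.fromkeys(candidates)) == ["/sitemap.xml", "/news.xml"]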
18 changes: 5 additions & 13 deletions trafilatura/utils.py
@@ -361,19 +361,11 @@ def is_image_file(imagesrc):
 
 
 def make_chunks(iterable, n):
-    """
-    Chunk data into smaller pieces.
-    https://docs.python.org/3/library/itertools.html
-    """
-    it = iter(iterable)
-    while True:
-        chunk = tuple(islice(it, n))
-        if not chunk:
-            return
-        yield chunk
-    # Python 3.8+ with walrus operator
-    # while batch := tuple(islice(it, n)):
-    #     yield batch
+    "Chunk data into smaller pieces."
+    # 3.12+: https://docs.python.org/3/library/itertools.html#itertools.batched
+    iterator = iter(iterable)
+    while batch := tuple(islice(iterator, n)):
+        yield batch


def is_acceptable_length(my_len, options) -> bool:
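The rewritten make_chunks uses the walrus operator (3.8+): the loop ends as soon as islice() returns an empty tuple. A self-contained demo, including the itertools.batched equivalent the new comment points to:

from itertools import islice


def make_chunks(iterable, n):
    "Chunk data into smaller pieces."
    iterator = iter(iterable)
    while batch := tuple(islice(iterator, n)):
        yield batch


assert list(make_chunks(range(7), 3)) == [(0, 1, 2), (3, 4, 5), (6,)]
# on Python 3.12+, itertools.batched(range(7), 3) yields the same tuples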
6 changes: 1 addition & 5 deletions trafilatura/xml.py
@@ -7,16 +7,12 @@
 import logging
 
 from html import unescape
+from importlib.metadata import version
 from io import StringIO
 from json import dumps as json_dumps
 from pathlib import Path
 from typing import List, Optional
 
-try: # Python 3.8+
-    from importlib.metadata import version
-except ImportError:
-    from importlib_metadata import version
-
 from lxml.etree import (_Element, Element, SubElement, XMLParser,
                         fromstring, tostring, DTD)
 
