From b30c0cd2c96e57cc273ffe29c0313487b364f15a Mon Sep 17 00:00:00 2001 From: John Parton Date: Sat, 9 Sep 2023 06:36:37 -0500 Subject: [PATCH] Remove chardet/charset-normalizer. (#7589) Add fallback_charset_resolver ClientSession parameter. (#7561) Co-authored-by: Sam Bull (cherry picked from commit 675579699422680607108a7dd68c85ec5284220c) --------- Co-authored-by: Sam Bull --- CHANGES/7561.feature | 2 ++ CONTRIBUTORS.txt | 1 + aiohttp/client.py | 26 +++++++++++++++++ aiohttp/client_reqrep.py | 55 ++++++++++++++++++----------------- docs/client_advanced.rst | 30 +++++++++++++++++++ docs/client_reference.rst | 51 +++++++++++++++----------------- docs/index.rst | 8 ----- setup.cfg | 1 + tests/test_client_response.py | 45 +++++++--------------------- 9 files changed, 121 insertions(+), 98 deletions(-) create mode 100644 CHANGES/7561.feature diff --git a/CHANGES/7561.feature b/CHANGES/7561.feature new file mode 100644 index 00000000000..a57914ff2a3 --- /dev/null +++ b/CHANGES/7561.feature @@ -0,0 +1,2 @@ +Replace automatic character set detection with a `fallback_charset_resolver` parameter +in `ClientSession` to allow user-supplied character set detection functions. 
diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index c1d93268978..f8a8df5e347 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -163,6 +163,7 @@ Jesus Cea Jian Zeng Jinkyu Yi Joel Watts +John Parton Jon Nabozny Jonas Krüger Svensson Jonas Obrist diff --git a/aiohttp/client.py b/aiohttp/client.py index 0d0f4c16c0c..4f56f61727b 100644 --- a/aiohttp/client.py +++ b/aiohttp/client.py @@ -88,6 +88,11 @@ from .tracing import Trace, TraceConfig from .typedefs import Final, JSONEncoder, LooseCookies, LooseHeaders, StrOrURL +try: + import cchardet as chardet +except ImportError: # pragma: no cover + import charset_normalizer as chardet # type: ignore[no-redef] + __all__ = ( # client_exceptions "ClientConnectionError", @@ -159,6 +164,22 @@ class ClientTimeout: DEFAULT_TIMEOUT: Final[ClientTimeout] = ClientTimeout(total=5 * 60) _RetType = TypeVar("_RetType") +_CharsetResolver = Callable[[ClientResponse, bytes], str] + + +def _default_fallback_charset_resolver(response: ClientResponse, body: bytes) -> str: + + ret: str = chardet.detect(body)["encoding"] or "utf-8" + + if ret != "utf-8": + warnings.warn( + "Automatic charset detection will be removed in 3.9, see: " + "https://docs.aiohttp.org/en/stable/client_advanced.html#character-set-detection", # noqa: E501 + DeprecationWarning, + stacklevel=3, + ) + + return ret class ClientSession: @@ -220,6 +241,9 @@ def __init__( requote_redirect_url: bool = True, trace_configs: Optional[List[TraceConfig]] = None, read_bufsize: int = 2**16, + fallback_charset_resolver: _CharsetResolver = ( + _default_fallback_charset_resolver + ), ) -> None: if loop is None: if connector is not None: @@ -313,6 +337,8 @@ def __init__( for trace_config in self._trace_configs: trace_config.freeze() + self._resolve_charset = fallback_charset_resolver + def __init_subclass__(cls: Type["ClientSession"]) -> None: warnings.warn( "Inheritance class {} from ClientSession " diff --git a/aiohttp/client_reqrep.py b/aiohttp/client_reqrep.py index 
28b8a28d0d8..987d68f9034 100644 --- a/aiohttp/client_reqrep.py +++ b/aiohttp/client_reqrep.py @@ -1,5 +1,6 @@ import asyncio import codecs +import contextlib import functools import io import re @@ -12,6 +13,7 @@ from typing import ( TYPE_CHECKING, Any, + Callable, Dict, Iterable, List, @@ -66,11 +68,6 @@ ssl = None # type: ignore[assignment] SSLContext = object # type: ignore[misc,assignment] -try: - import cchardet as chardet -except ImportError: # pragma: no cover - import charset_normalizer as chardet # type: ignore[no-redef] - __all__ = ("ClientRequest", "ClientResponse", "RequestInfo", "Fingerprint") @@ -722,8 +719,8 @@ class ClientResponse(HeadersMixin): _raw_headers: RawHeaders = None # type: ignore[assignment] # Response raw headers _connection = None # current connection - _source_traceback = None - # setted up by ClientRequest after ClientResponse object creation + _source_traceback: Optional[traceback.StackSummary] = None + # set up by ClientRequest after ClientResponse object creation # post-init stage allows to not change ctor signature _closed = True # to allow __del__ for non-initialized properly response _released = False @@ -760,6 +757,15 @@ def __init__( self._loop = loop # store a reference to session #1985 self._session: Optional[ClientSession] = session + # Save reference to _resolve_charset, so that get_encoding() will still + # work after the response has finished reading the body. + if session is None: + # TODO: Fix session=None in tests (see ClientRequest.__init__). 
+ self._resolve_charset: Callable[ + ["ClientResponse", bytes], str + ] = lambda *_: "utf-8" + else: + self._resolve_charset = session._resolve_charset if loop.get_debug(): self._source_traceback = traceback.extract_stack(sys._getframe(1)) @@ -1053,27 +1059,22 @@ def get_encoding(self) -> str: encoding = mimetype.parameters.get("charset") if encoding: - try: - codecs.lookup(encoding) - except LookupError: - encoding = None - if not encoding: - if mimetype.type == "application" and ( - mimetype.subtype == "json" or mimetype.subtype == "rdap" - ): - # RFC 7159 states that the default encoding is UTF-8. - # RFC 7483 defines application/rdap+json - encoding = "utf-8" - elif self._body is None: - raise RuntimeError( - "Cannot guess the encoding of " "a not yet read body" - ) - else: - encoding = chardet.detect(self._body)["encoding"] - if not encoding: - encoding = "utf-8" + with contextlib.suppress(LookupError): + return codecs.lookup(encoding).name + + if mimetype.type == "application" and ( + mimetype.subtype == "json" or mimetype.subtype == "rdap" + ): + # RFC 7159 states that the default encoding is UTF-8. + # RFC 7483 defines application/rdap+json + return "utf-8" + + if self._body is None: + raise RuntimeError( + "Cannot compute fallback encoding of a not yet read body" + ) - return encoding + return self._resolve_charset(self, self._body) async def text(self, encoding: Optional[str] = None, errors: str = "strict") -> str: """Read response payload and decode.""" diff --git a/docs/client_advanced.rst b/docs/client_advanced.rst index 43d7dd251ef..e8f016a96f0 100644 --- a/docs/client_advanced.rst +++ b/docs/client_advanced.rst @@ -640,3 +640,33 @@ are changed so that aiohttp itself can wait on the underlying connection to close. Please follow issue `#1925 `_ for the progress on this. 
+ + +Character Set Detection +----------------------- + +If you encounter an 'Automatic charset detection will be removed' warning +when using :meth:`ClientResponse.text()` this may be because the response +does not include the charset needed to decode the body. + +If you know the correct encoding for a request, you can simply specify +the encoding as a parameter (e.g. ``resp.text("windows-1252")``). + +Alternatively, :class:`ClientSession` accepts a ``fallback_charset_resolver`` parameter which +can be used to reintroduce charset guessing functionality. When a charset is not found +in the Content-Type header, this function will be called to get the charset encoding. For +example, this can be used with the ``chardetng_py`` library:: + + from chardetng_py import detect + + def charset_resolver(resp: ClientResponse, body: bytes) -> str: + tld = resp.url.host.rsplit(".", maxsplit=1)[-1] + return detect(body, allow_utf8=True, tld=tld) + + ClientSession(fallback_charset_resolver=charset_resolver) + +Or, if ``chardetng_py`` doesn't work for you, then ``charset-normalizer`` is another option:: + + from charset_normalizer import detect + + ClientSession(fallback_charset_resolver=lambda r, b: detect(b)["encoding"] or "utf-8") diff --git a/docs/client_reference.rst b/docs/client_reference.rst index 8d9abe37eb0..bb2f7e23032 100644 --- a/docs/client_reference.rst +++ b/docs/client_reference.rst @@ -51,7 +51,8 @@ The client session supports the context manager protocol for self closing. read_bufsize=2**16, \ requote_redirect_url=False, \ trust_env=False, \ - trace_configs=None) + trace_configs=None, \ + fallback_charset_resolver=_chardet_resolver) The class for creating client sessions and making requests. @@ -200,6 +201,18 @@ The client session supports the context manager protocol for self closing. disabling. See :ref:`aiohttp-client-tracing-reference` for more information.
+ :param Callable[[ClientResponse,bytes],str] fallback_charset_resolver: + A :term:`callable` that accepts a :class:`ClientResponse` and the + :class:`bytes` contents, and returns a :class:`str` which will be used as + the encoding parameter to :meth:`bytes.decode()`. + + This function will be called when the charset is not known (e.g. not specified in the + Content-Type header). The default function in 3.8.6 calls ``cchardet`` + or ``charset-normalizer``. In 3.9+ this will be replaced with a function that + simply defaults to ``utf-8``. + + .. versionadded:: 3.8.6 + .. attribute:: closed ``True`` if the session has been closed, ``False`` otherwise. @@ -1400,12 +1413,8 @@ Response object Read response's body and return decoded :class:`str` using specified *encoding* parameter. - If *encoding* is ``None`` content encoding is autocalculated - using ``Content-Type`` HTTP header and *charset-normalizer* tool if the - header is not provided by server. - - :term:`cchardet` is used with fallback to :term:`charset-normalizer` if - *cchardet* is not available. + If *encoding* is ``None`` content encoding is determined from the + Content-Type header, or using the ``fallback_charset_resolver`` function. Close underlying connection if data reading gets an error, release connection otherwise. @@ -1414,10 +1423,7 @@ Response object ``None`` for encoding autodetection (default). - :return str: decoded *BODY* - :raise LookupError: if the encoding detected by cchardet is - unknown by Python (e.g. VISCII). .. note:: @@ -1430,18 +1436,15 @@ Response object await resp.text('ISO-8859-1') - .. comethod:: json(*, encoding=None, loads=json.loads, \ + .. method:: json(*, encoding=None, loads=json.loads, \ content_type='application/json') + :async: Read response's body as *JSON*, return :class:`dict` using specified *encoding* and *loader*. If data is not still available - a ``read`` call will be done.
- If *encoding* is ``None`` content encoding is autocalculated - using :term:`cchardet` or :term:`charset-normalizer` as fallback if - *cchardet* is not available. - - if response's `content-type` does not match `content_type` parameter + If response's `content-type` does not match `content_type` parameter :exc:`aiohttp.ContentTypeError` get raised. To disable content type check pass ``None`` value. @@ -1473,17 +1476,9 @@ Response object .. method:: get_encoding() - Automatically detect content encoding using ``charset`` info in - ``Content-Type`` HTTP header. If this info is not exists or there - are no appropriate codecs for encoding then :term:`cchardet` / - :term:`charset-normalizer` is used. - - Beware that it is not always safe to use the result of this function to - decode a response. Some encodings detected by cchardet are not known by - Python (e.g. VISCII). *charset-normalizer* is not concerned by that issue. - - :raise RuntimeError: if called before the body has been read, - for :term:`cchardet` usage + Retrieve content encoding using ``charset`` info in ``Content-Type`` HTTP header. + If no charset is present or the charset is not understood by Python, the + ``fallback_charset_resolver`` function associated with the ``ClientSession`` is called. .. versionadded:: 3.0 diff --git a/docs/index.rst b/docs/index.rst index a171dc1f48b..94cebd01f7d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -162,14 +162,6 @@ Dependencies - *charset-normalizer* - *multidict* - *yarl* -- *Optional* :term:`cchardet` as faster replacement for - :term:`charset-normalizer`. - - Install it explicitly via: - - .. code-block:: bash - - $ pip install cchardet - *Optional* :term:`aiodns` for fast DNS resolving. The library is highly recommended. diff --git a/setup.cfg b/setup.cfg index 6d50d321811..12cd4124742 100644 --- a/setup.cfg +++ b/setup.cfg @@ -150,6 +150,7 @@ filterwarnings = # can be dropped with the next release of `certify`, specifically # `certify > 2022.06.15`. 
ignore:path is deprecated. Use files.. instead. Refer to https.//importlib-resources.readthedocs.io/en/latest/using.html#migrating-from-legacy for migration advice.:DeprecationWarning:certifi.core + ignore:Automatic charset detection will be removed in 3.9:DeprecationWarning junit_suite_name = aiohttp_test_suite norecursedirs = dist docs build .tox .eggs minversion = 3.8.2 diff --git a/tests/test_client_response.py b/tests/test_client_response.py index f8bee42be49..fa472e791ff 100644 --- a/tests/test_client_response.py +++ b/tests/test_client_response.py @@ -2,6 +2,7 @@ import gc import sys +from typing import Any from unittest import mock import pytest @@ -440,7 +441,11 @@ def side_effect(*args, **kwargs): assert not response.get_encoding.called -async def test_text_detect_encoding(loop, session) -> None: +@pytest.mark.parametrize("content_type", ("text/plain", "text/plain;charset=invalid")) +async def test_text_charset_resolver( + content_type: str, loop: Any, session: Any +) -> None: + session._resolve_charset = lambda r, b: "cp1251" response = ClientResponse( "get", URL("http://def-cl-resp.org"), @@ -458,7 +463,7 @@ def side_effect(*args, **kwargs): fut.set_result('{"тест": "пройден"}'.encode("cp1251")) return fut - response._headers = {"Content-Type": "text/plain"} + response._headers = {"Content-Type": content_type} content = response.content = mock.Mock() content.read.side_effect = side_effect @@ -466,35 +471,7 @@ def side_effect(*args, **kwargs): res = await response.text() assert res == '{"тест": "пройден"}' assert response._connection is None - - -async def test_text_detect_encoding_if_invalid_charset(loop, session) -> None: - response = ClientResponse( - "get", - URL("http://def-cl-resp.org"), - request_info=mock.Mock(), - writer=mock.Mock(), - continue100=None, - timer=TimerNoop(), - traces=[], - loop=loop, - session=session, - ) - - def side_effect(*args, **kwargs): - fut = loop.create_future() - fut.set_result('{"тест": "пройден"}'.encode("cp1251")) - 
return fut - - response._headers = {"Content-Type": "text/plain;charset=invalid"} - content = response.content = mock.Mock() - content.read.side_effect = side_effect - - await response.read() - res = await response.text() - assert res == '{"тест": "пройден"}' - assert response._connection is None - assert response.get_encoding().lower() in ("windows-1251", "maccyrillic") + assert response.get_encoding() == "cp1251" async def test_get_encoding_body_none(loop, session) -> None: @@ -521,7 +498,7 @@ def side_effect(*args, **kwargs): with pytest.raises( RuntimeError, - match="^Cannot guess the encoding of a not yet read body$", + match="^Cannot compute fallback encoding of a not yet read body$", ): response.get_encoding() assert response.closed @@ -742,9 +719,7 @@ def test_get_encoding_unknown(loop, session) -> None: ) response._headers = {"Content-Type": "application/json"} - with mock.patch("aiohttp.client_reqrep.chardet") as m_chardet: - m_chardet.detect.return_value = {"encoding": None} - assert response.get_encoding() == "utf-8" + assert response.get_encoding() == "utf-8" def test_raise_for_status_2xx() -> None: