From a65f86ff4df9d3ec421f46d76eb23619ac06f970 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta <1731933+elacuesta@users.noreply.github.com>
Date: Thu, 24 Aug 2023 15:17:58 -0300
Subject: [PATCH] Do not fail on exceptions when getting referer header (#225)

* Get header values safely

* Do not log referer header for responses
---
 scrapy_playwright/_utils.py       | 14 ++++++++++++--
 scrapy_playwright/handler.py      | 31 +++++++++++++++++--------------
 tests/tests_asyncio/test_utils.py | 25 ++++++++++++++++++++++++-
 3 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/scrapy_playwright/_utils.py b/scrapy_playwright/_utils.py
index 53f3cb2..9e054a5 100644
--- a/scrapy_playwright/_utils.py
+++ b/scrapy_playwright/_utils.py
@@ -1,7 +1,7 @@
 import logging
-from typing import Awaitable, Iterator, Tuple
+from typing import Awaitable, Iterator, Optional, Tuple, Union
 
-from playwright.async_api import Error, Page
+from playwright.async_api import Error, Page, Request, Response
 from scrapy import Spider
 from scrapy.http.headers import Headers
 from scrapy.utils.python import to_unicode
@@ -79,3 +79,13 @@ async def _get_page_content(
             )
             return await page.content()
         raise
+
+
+async def _get_header_value(
+    resource: Union[Request, Response],
+    header_name: str,
+) -> Optional[str]:
+    try:
+        return await resource.header_value(header_name)
+    except Exception:
+        return None
diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
index 8b4458b..d269273 100644
--- a/scrapy_playwright/handler.py
+++ b/scrapy_playwright/handler.py
@@ -32,6 +32,7 @@
 from scrapy_playwright.page import PageMethod
 from scrapy_playwright._utils import (
     _encode_body,
+    _get_header_value,
     _get_page_content,
     _is_safe_close_error,
     _maybe_await,
@@ -632,19 +633,22 @@ async def _maybe_execute_page_init_callback(
 
 def _make_request_logger(context_name: str, spider: Spider) -> Callable:
     async def _log_request(request: PlaywrightRequest) -> None:
-        referrer = await request.header_value("referer")
+        log_args = [context_name, request.method.upper(), request.url, request.resource_type]
+        referrer = await _get_header_value(request, "referer")
+        if referrer:
+            log_args.append(referrer)
+            log_msg = "[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)"
+        else:
+            log_msg = "[Context=%s] Request: <%s %s> (resource type: %s)"
         logger.debug(
-            "[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)",
-            context_name,
-            request.method.upper(),
-            request.url,
-            request.resource_type,
-            referrer,
+            log_msg,
+            *log_args,
             extra={
                 "spider": spider,
                 "context_name": context_name,
                 "playwright_request_url": request.url,
                 "playwright_request_method": request.method,
+                "playwright_resource_type": request.resource_type,
             },
         )
 
@@ -653,16 +657,15 @@ async def _log_request(request: PlaywrightRequest) -> None:
 
 def _make_response_logger(context_name: str, spider: Spider) -> Callable:
     async def _log_response(response: PlaywrightResponse) -> None:
-        referrer = await response.header_value("referer")
-        log_args = [context_name, response.status, response.url, referrer]
-        if 300 <= response.status < 400:
-            location = await response.header_value("location")
+        log_args = [context_name, response.status, response.url]
+        location = await _get_header_value(response, "location")
+        if location:
             log_args.append(location)
-            msg = "[Context=%s] Response: <%i %s> (referrer: %s, location: %s)"
+            log_msg = "[Context=%s] Response: <%i %s> (location: %s)"
         else:
-            msg = "[Context=%s] Response: <%i %s> (referrer: %s)"
+            log_msg = "[Context=%s] Response: <%i %s>"
         logger.debug(
-            msg,
+            log_msg,
             *log_args,
             extra={
                 "spider": spider,
diff --git a/tests/tests_asyncio/test_utils.py b/tests/tests_asyncio/test_utils.py
index 5ecfdee..e6f88d2 100644
--- a/tests/tests_asyncio/test_utils.py
+++ b/tests/tests_asyncio/test_utils.py
@@ -6,7 +6,12 @@
 from playwright.async_api import Error as PlaywrightError
 from scrapy import Spider
 from scrapy.http.headers import Headers
-from scrapy_playwright._utils import _get_page_content, _NAVIGATION_ERROR_MSG, _encode_body
+from scrapy_playwright._utils import (
+    _NAVIGATION_ERROR_MSG,
+    _encode_body,
+    _get_header_value,
+    _get_page_content,
+)
 
 
 class TestPageContent(IsolatedAsyncioTestCase):
@@ -119,3 +124,21 @@ async def test_encode_mismatch(self):
         )
         assert encoding == "gb18030"
         assert body == text.encode(encoding)
+
+
+class TestHeaderValue(IsolatedAsyncioTestCase):
+    @pytest.mark.asyncio
+    async def test_get_header_ok(self):
+        async def _identity(x):
+            return x
+
+        resource = AsyncMock()
+        resource.header_value = _identity
+        assert "asdf" == await _get_header_value(resource, "asdf")
+        assert "qwerty" == await _get_header_value(resource, "qwerty")
+
+    async def test_get_header_exception(self):
+        resource = AsyncMock()
+        resource.header_value.side_effect = Exception("nope")
+        assert await _get_header_value(resource, "asdf") is None
+        assert await _get_header_value(resource, "qwerty") is None