Skip to content

Commit

Permalink
Do not fail on exceptions when getting referer header (#225)
Browse files Browse the repository at this point in the history
* Get header values safely

* Do not log referer header for responses
  • Loading branch information
elacuesta authored Aug 24, 2023
1 parent 396bd74 commit a65f86f
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 17 deletions.
14 changes: 12 additions & 2 deletions scrapy_playwright/_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from typing import Awaitable, Iterator, Tuple
from typing import Awaitable, Iterator, Optional, Tuple, Union

from playwright.async_api import Error, Page
from playwright.async_api import Error, Page, Request, Response
from scrapy import Spider
from scrapy.http.headers import Headers
from scrapy.utils.python import to_unicode
Expand Down Expand Up @@ -79,3 +79,13 @@ async def _get_page_content(
)
return await page.content()
raise


async def _get_header_value(
    resource: Union[Request, Response],
    header_name: str,
) -> Optional[str]:
    """Return the value of ``header_name`` for ``resource``, or ``None`` on failure.

    Awaits ``resource.header_value(header_name)`` and returns its result
    unchanged. The lookup is deliberately best-effort: any ``Exception``
    raised while fetching the header is suppressed and ``None`` is returned
    instead, so callers never have to guard the call themselves.

    Args:
        resource: The Playwright request or response to read the header from.
        header_name: Name of the header to look up.

    Returns:
        Whatever ``resource.header_value`` returns, or ``None`` if the lookup
        raised an exception.
    """
    try:
        return await resource.header_value(header_name)
    except Exception:
        # Intentionally broad: a failed header lookup must not propagate.
        return None
31 changes: 17 additions & 14 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from scrapy_playwright.page import PageMethod
from scrapy_playwright._utils import (
_encode_body,
_get_header_value,
_get_page_content,
_is_safe_close_error,
_maybe_await,
Expand Down Expand Up @@ -632,19 +633,22 @@ async def _maybe_execute_page_init_callback(

def _make_request_logger(context_name: str, spider: Spider) -> Callable:
async def _log_request(request: PlaywrightRequest) -> None:
referrer = await request.header_value("referer")
log_args = [context_name, request.method.upper(), request.url, request.resource_type]
referrer = await _get_header_value(request, "referer")
if referrer:
log_args.append(referrer)
log_msg = "[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)"
else:
log_msg = "[Context=%s] Request: <%s %s> (resource type: %s)"
logger.debug(
"[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)",
context_name,
request.method.upper(),
request.url,
request.resource_type,
referrer,
log_msg,
*log_args,
extra={
"spider": spider,
"context_name": context_name,
"playwright_request_url": request.url,
"playwright_request_method": request.method,
"playwright_resource_type": request.resource_type,
},
)

Expand All @@ -653,16 +657,15 @@ async def _log_request(request: PlaywrightRequest) -> None:

def _make_response_logger(context_name: str, spider: Spider) -> Callable:
async def _log_response(response: PlaywrightResponse) -> None:
referrer = await response.header_value("referer")
log_args = [context_name, response.status, response.url, referrer]
if 300 <= response.status < 400:
location = await response.header_value("location")
log_args = [context_name, response.status, response.url]
location = await _get_header_value(response, "location")
if location:
log_args.append(location)
msg = "[Context=%s] Response: <%i %s> (referrer: %s, location: %s)"
log_msg = "[Context=%s] Response: <%i %s> (location: %s)"
else:
msg = "[Context=%s] Response: <%i %s> (referrer: %s)"
log_msg = "[Context=%s] Response: <%i %s>"
logger.debug(
msg,
log_msg,
*log_args,
extra={
"spider": spider,
Expand Down
25 changes: 24 additions & 1 deletion tests/tests_asyncio/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@
from playwright.async_api import Error as PlaywrightError
from scrapy import Spider
from scrapy.http.headers import Headers
from scrapy_playwright._utils import _get_page_content, _NAVIGATION_ERROR_MSG, _encode_body
from scrapy_playwright._utils import (
_NAVIGATION_ERROR_MSG,
_encode_body,
_get_header_value,
_get_page_content,
)


class TestPageContent(IsolatedAsyncioTestCase):
Expand Down Expand Up @@ -119,3 +124,21 @@ async def test_encode_mismatch(self):
)
assert encoding == "gb18030"
assert body == text.encode(encoding)


class TestHeaderValue(IsolatedAsyncioTestCase):
    """Tests for ``_get_header_value``: the pass-through and the exception paths."""

    @pytest.mark.asyncio
    async def test_get_header_ok(self):
        # Stand-in for ``header_value`` that echoes the requested header name,
        # so the returned value proves the name was forwarded untouched.
        async def _identity(x):
            return x

        resource = AsyncMock()
        resource.header_value = _identity
        assert "asdf" == await _get_header_value(resource, "asdf")
        assert "qwerty" == await _get_header_value(resource, "qwerty")

    async def test_get_header_exception(self):
        # Every call to ``header_value`` raises; the helper must return None
        # instead of letting the exception escape.
        resource = AsyncMock()
        resource.header_value.side_effect = Exception("nope")
        assert await _get_header_value(resource, "asdf") is None
        assert await _get_header_value(resource, "qwerty") is None

0 comments on commit a65f86f

Please sign in to comment.