Skip to content

Commit

Permalink
Fix header processing (#226)
Browse files: browse the repository at this point in the history
* Scrapy headers were being updated for every Playwright request;
  they should only be updated for the Playwright request that
  corresponds to the original Scrapy one
* Referer header from the Playwright request was not added to the
  output of scrapy_playwright.headers.use_scrapy_headers
  • Loading branch information
elacuesta authored Aug 28, 2023
1 parent 05648c6 commit a0fc5c6
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
13 changes: 7 additions & 6 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,13 +494,8 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
self.browser_type_name, playwright_request, headers
)
)
# the request that reaches the callback should contain the final headers
headers.clear()
headers.update(final_headers)
del final_headers

# if the request is triggered by scrapy, not playwright
original_playwright_method: str = playwright_request.method
# if the current request corresponds to the original scrapy one
if (
playwright_request.url.rstrip("/") == url.rstrip("/")
and playwright_request.is_navigation_request()
Expand All @@ -509,7 +504,13 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
overrides["method"] = method
if body:
overrides["post_data"] = body.decode(encoding)
# the request that reaches the callback should contain the final headers
headers.clear()
headers.update(final_headers)

del final_headers

original_playwright_method: str = playwright_request.method
try:
await route.continue_(**overrides)
if overrides.get("method"):
Expand Down
7 changes: 6 additions & 1 deletion scrapy_playwright/headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,14 @@ async def use_scrapy_headers(
scrapy_headers_str.setdefault("user-agent", playwright_headers.get("user-agent"))

if playwright_request.is_navigation_request():
# if referer header is set via playwright_page_goto_kwargs
if referer := playwright_headers.get("referer"):
scrapy_headers_str.setdefault("referer", referer)

# otherwise it fails with playwright.helper.Error: NS_ERROR_NET_RESET
if browser_type == "firefox":
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc

return scrapy_headers_str

# override user agent, for consistency with other requests
Expand Down

0 comments on commit a0fc5c6

Please sign in to comment.