Merge remote-tracking branch 'origin/main' into close-inactive-contexts

scrapy-plugins · Aug 29, 2023 · 12eb537 · 12eb537
2 parents a609ae5 + f1004bd
commit 12eb537
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 9 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.30
+current_version = 0.0.31
 commit = True
 tag = True
 

diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,6 +1,12 @@
 # scrapy-playwright changelog
 
 
+### [v0.0.31](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.31) (2023-08-28)
+
+* Do not fail when getting referer header for debug log messages (#225)
+* Do not override headers with values from asset requests (#226)
+
+
 ### [v0.0.30](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.30) (2023-08-17)
 
 * Fix page_init_callback duplication (#222)

diff --git a/scrapy_playwright/__init__.py b/scrapy_playwright/__init__.py
@@ -1 +1 @@
-__version__ = "0.0.30"
+__version__ = "0.0.31"
diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
@@ -524,13 +524,8 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
                         self.browser_type_name, playwright_request, headers
                     )
                 )
-            # the request that reaches the callback should contain the final headers
-            headers.clear()
-            headers.update(final_headers)
-            del final_headers
 
-            # if the request is triggered by scrapy, not playwright
-            original_playwright_method: str = playwright_request.method
+            # if the current request corresponds to the original scrapy one
             if (
                 playwright_request.url.rstrip("/") == url.rstrip("/")
                 and playwright_request.is_navigation_request()
@@ -539,7 +534,13 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
                     overrides["method"] = method
                 if body:
                     overrides["post_data"] = body.decode(encoding)
+                # the request that reaches the callback should contain the final headers
+                headers.clear()
+                headers.update(final_headers)
 
+            del final_headers
+
+            original_playwright_method: str = playwright_request.method
             try:
                 await route.continue_(**overrides)
                 if overrides.get("method"):

diff --git a/scrapy_playwright/headers.py b/scrapy_playwright/headers.py
@@ -23,9 +23,14 @@ async def use_scrapy_headers(
     scrapy_headers_str.setdefault("user-agent", playwright_headers.get("user-agent"))
 
     if playwright_request.is_navigation_request():
+        # if referer header is set via playwright_page_goto_kwargs
+        if referer := playwright_headers.get("referer"):
+            scrapy_headers_str.setdefault("referer", referer)
+
+        # otherwise it fails with playwright.helper.Error: NS_ERROR_NET_RESET
         if browser_type == "firefox":
-            # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
             scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc
+
         return scrapy_headers_str
 
     # override user agent, for consistency with other requests