From 664bd160e35d2a49793ab77ffa06ac5555d36476 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 16 Jul 2024 15:50:52 -0300 Subject: [PATCH] Retry to create page on browser crash --- scrapy_playwright/handler.py | 39 ++++++++++-- tests/tests_asyncio/test_browser.py | 92 +++++++++++++++++++++++++++-- 2 files changed, 122 insertions(+), 9 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index e9a2f22..a1b562a 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -6,6 +6,7 @@ from time import time from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union +from playwright._impl._errors import TargetClosedError from playwright.async_api import ( BrowserContext, BrowserType, @@ -91,6 +92,7 @@ class Config: startup_context_kwargs: dict navigation_timeout: Optional[float] restart_disconnected_browser: bool + target_closed_max_retries: int = 3 @classmethod def from_settings(cls, settings: Settings) -> "Config": @@ -190,6 +192,7 @@ async def _maybe_launch_browser(self) -> None: logger.info("Launching browser %s", self.browser_type.name) self.browser = await self.browser_type.launch(**self.config.launch_options) logger.info("Browser %s launched", self.browser_type.name) + self.stats.inc_value("playwright/browser_count") self.browser.on("disconnected", self._browser_disconnected_callback) async def _maybe_connect_remote_devtools(self) -> None: @@ -200,6 +203,7 @@ async def _maybe_connect_remote_devtools(self) -> None: self.config.cdp_url, **self.config.cdp_kwargs ) logger.info("Connected using CDP: %s", self.config.cdp_url) + self.stats.inc_value("playwright/browser_count") self.browser.on("disconnected", self._browser_disconnected_callback) async def _maybe_connect_remote(self) -> None: @@ -210,6 +214,7 @@ async def _maybe_connect_remote(self) -> None: self.config.connect_url, **self.config.connect_kwargs ) logger.info("Connected to remote Playwright") + self.stats.inc_value("playwright/browser_count") self.browser.on("disconnected", self._browser_disconnected_callback) async def _create_browser_context( @@ -337,7 +342,8 @@ def close(self) -> Deferred: _ThreadedLoopAdapter.stop() async def _close(self) -> None: - await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()]) + with suppress(TargetClosedError): + await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()]) self.context_wrappers.clear() if hasattr(self, "browser"): logger.info("Closing browser") @@ -353,8 +359,28 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: return super().download_request(request, spider) async def _download_request(self, request: Request, spider: Spider) -> Response: + counter = 0 + while True: + try: + return await self._download_request_with_retry(request=request, spider=spider) + except TargetClosedError as ex: + counter += 1 + if counter > self.config.target_closed_max_retries: + raise ex + logger.debug( + "Target closed, retrying to create page for %s", + request, + extra={ + "spider": spider, + "scrapy_request_url": request.url, + "scrapy_request_method": request.method, + "exception": ex, + }, + ) + + async def _download_request_with_retry(self, request: Request, spider: Spider) -> Response: page = request.meta.get("playwright_page") - if not isinstance(page, Page): + if not isinstance(page, Page) or page.is_closed(): page = await self._create_page(request=request, spider=spider) context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME) @@ -613,9 +639,12 @@ def _increment_response_stats(self, response: PlaywrightResponse) -> None: self.stats.inc_value(f"{stats_prefix}/method/{response.request.method}") async def _browser_disconnected_callback(self) -> None: - await asyncio.gather( - *[ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values()] - ) + close_context_coros = [ + ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values() + ] + self.context_wrappers.clear() + with suppress(TargetClosedError): + await asyncio.gather(*close_context_coros) logger.debug("Browser disconnected") if self.config.restart_disconnected_browser: del self.browser diff --git a/tests/tests_asyncio/test_browser.py b/tests/tests_asyncio/test_browser.py index 78e0b26..82f98d4 100644 --- a/tests/tests_asyncio/test_browser.py +++ b/tests/tests_asyncio/test_browser.py @@ -1,16 +1,21 @@ import asyncio import logging +import os import random import re +import signal import subprocess import time import uuid from contextlib import asynccontextmanager from pathlib import Path +from threading import Thread from typing import Tuple from unittest import IsolatedAsyncioTestCase +import psutil import pytest +from playwright._impl._errors import TargetClosedError from playwright.async_api import async_playwright from scrapy import Request, Spider @@ -77,7 +82,13 @@ async def remote_chromium(with_devtools_protocol: bool = True): proc.communicate() -class TestRemoteBrowser(IsolatedAsyncioTestCase): +def kill_chrome(): + for proc in psutil.process_iter(["pid", "name"]): + if proc.info["name"] == "chrome": + os.kill(proc.info["pid"], signal.SIGKILL) + + +class TestBrowserRemoteChromium(IsolatedAsyncioTestCase): @pytest.fixture(autouse=True) def inject_fixtures(self, caplog): caplog.set_level(logging.DEBUG) @@ -87,6 +98,7 @@ def inject_fixtures(self, caplog): async def test_connect_devtools(self): async with remote_chromium(with_devtools_protocol=True) as devtools_url: settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": "chromium", "PLAYWRIGHT_CDP_URL": devtools_url, "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True}, } @@ -105,6 +117,7 @@ async def test_connect_devtools(self): async def test_connect(self): async with remote_chromium(with_devtools_protocol=False) as browser_url: settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": "chromium", "PLAYWRIGHT_CONNECT_URL": browser_url, "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True}, } @@ -130,16 +143,16 @@ async def test_connect(self): ) in self._caplog.record_tuples -class TestBrowserReconnect(IsolatedAsyncioTestCase): +class TestBrowserReconnectChromium(IsolatedAsyncioTestCase): @pytest.fixture(autouse=True) def inject_fixtures(self, caplog): caplog.set_level(logging.DEBUG) self._caplog = caplog @allow_windows - async def test_restart_browser(self): + async def test_browser_closed_restart(self): spider = Spider("foo") - async with make_handler() as handler: + async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler: with StaticMockServer() as server: req1 = Request( server.urljoin("/index.html"), @@ -172,3 +185,74 @@ async def test_restart_browser(self): ) == 2 # one at the beginning, one after calling Browser.close() manually ) + + @allow_windows + async def test_browser_crashed_restart(self): + spider = Spider("foo") + async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler: + with StaticMockServer() as server: + req1 = Request( + server.urljoin("/index.html"), + meta={"playwright": True, "playwright_include_page": True}, + ) + resp1 = await handler._download_request(req1, spider) + thread = Thread(target=kill_chrome, daemon=True) + thread.start() + req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True}) + req3 = Request(server.urljoin("/lorem_ipsum.html"), meta={"playwright": True}) + req4 = Request(server.urljoin("/scroll.html"), meta={"playwright": True}) + resp2 = await handler._download_request(req2, spider) + resp3 = await handler._download_request(req3, spider) + resp4 = await handler._download_request(req4, spider) + thread.join() + assert_correct_response(resp1, req1) + assert_correct_response(resp2, req2) + assert_correct_response(resp3, req3) + assert_correct_response(resp4, req4) + assert ( + self._caplog.record_tuples.count( + ( + "scrapy-playwright", + logging.DEBUG, + "Browser disconnected", + ) + ) + == 2 # one mid-crawl after killing the browser process, one at the end + ) + assert ( + self._caplog.record_tuples.count( + ( + "scrapy-playwright", + logging.INFO, + "Launching browser chromium", + ) + ) + == 2 # one at the beginning, one after killing the broser process + ) + + @allow_windows + async def test_browser_crashed_do_not_restart(self): + spider = Spider("foo") + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": "chromium", + "PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER": False, + } + async with make_handler(settings_dict=settings_dict) as handler: + with StaticMockServer() as server: + await asyncio.sleep(1) # allow time for the browser to fully launch + req1 = Request( + server.urljoin("/index.html"), + meta={"playwright": True, "playwright_include_page": True}, + ) + resp1 = await handler._download_request(req1, spider) + assert_correct_response(resp1, req1) + thread = Thread(target=kill_chrome, daemon=True) + thread.start() + req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True}) + req3 = Request(server.urljoin("/lorem_ipsum.html"), meta={"playwright": True}) + req4 = Request(server.urljoin("/scroll.html"), meta={"playwright": True}) + with pytest.raises(TargetClosedError): + await handler._download_request(req2, spider) + await handler._download_request(req3, spider) + await handler._download_request(req4, spider) + thread.join()