Skip to content

Commit

Permalink
Retry to create page on browser crash
Browse files Browse the repository at this point in the history
  • Loading branch information
elacuesta committed Jul 16, 2024
1 parent a816f86 commit 664bd16
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 9 deletions.
39 changes: 34 additions & 5 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from time import time
from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union

from playwright._impl._errors import TargetClosedError
from playwright.async_api import (
BrowserContext,
BrowserType,
Expand Down Expand Up @@ -91,6 +92,7 @@ class Config:
startup_context_kwargs: dict
navigation_timeout: Optional[float]
restart_disconnected_browser: bool
target_closed_max_retries: int = 3

@classmethod
def from_settings(cls, settings: Settings) -> "Config":
Expand Down Expand Up @@ -190,6 +192,7 @@ async def _maybe_launch_browser(self) -> None:
logger.info("Launching browser %s", self.browser_type.name)
self.browser = await self.browser_type.launch(**self.config.launch_options)
logger.info("Browser %s launched", self.browser_type.name)
self.stats.inc_value("playwright/browser_count")
self.browser.on("disconnected", self._browser_disconnected_callback)

async def _maybe_connect_remote_devtools(self) -> None:
Expand All @@ -200,6 +203,7 @@ async def _maybe_connect_remote_devtools(self) -> None:
self.config.cdp_url, **self.config.cdp_kwargs
)
logger.info("Connected using CDP: %s", self.config.cdp_url)
self.stats.inc_value("playwright/browser_count")
self.browser.on("disconnected", self._browser_disconnected_callback)

async def _maybe_connect_remote(self) -> None:
Expand All @@ -210,6 +214,7 @@ async def _maybe_connect_remote(self) -> None:
self.config.connect_url, **self.config.connect_kwargs
)
logger.info("Connected to remote Playwright")
self.stats.inc_value("playwright/browser_count")
self.browser.on("disconnected", self._browser_disconnected_callback)

async def _create_browser_context(
Expand Down Expand Up @@ -337,7 +342,8 @@ def close(self) -> Deferred:
_ThreadedLoopAdapter.stop()

async def _close(self) -> None:
await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()])
with suppress(TargetClosedError):
await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()])
self.context_wrappers.clear()
if hasattr(self, "browser"):
logger.info("Closing browser")
Expand All @@ -353,8 +359,28 @@ def download_request(self, request: Request, spider: Spider) -> Deferred:
return super().download_request(request, spider)

async def _download_request(self, request: Request, spider: Spider) -> Response:
    """Download *request*, retrying when the browser target was closed.

    Delegates the actual work to ``_download_request_with_retry`` and, on
    ``TargetClosedError`` (e.g. a crashed/killed browser), retries up to
    ``config.target_closed_max_retries`` times before re-raising.
    """
    # Retry budget: decremented on each TargetClosedError.
    attempts_left = self.config.target_closed_max_retries
    while True:
        try:
            return await self._download_request_with_retry(request=request, spider=spider)
        except TargetClosedError as ex:
            if attempts_left <= 0:
                # Budget exhausted: surface the error to the caller.
                raise ex
            attempts_left -= 1
            logger.debug(
                "Target closed, retrying to create page for %s",
                request,
                extra={
                    "spider": spider,
                    "scrapy_request_url": request.url,
                    "scrapy_request_method": request.method,
                    "exception": ex,
                },
            )

async def _download_request_with_retry(self, request: Request, spider: Spider) -> Response:
page = request.meta.get("playwright_page")
if not isinstance(page, Page):
if not isinstance(page, Page) or page.is_closed():
page = await self._create_page(request=request, spider=spider)
context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)

Expand Down Expand Up @@ -613,9 +639,12 @@ def _increment_response_stats(self, response: PlaywrightResponse) -> None:
self.stats.inc_value(f"{stats_prefix}/method/{response.request.method}")

async def _browser_disconnected_callback(self) -> None:
await asyncio.gather(
*[ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values()]
)
close_context_coros = [
ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values()
]
self.context_wrappers.clear()
with suppress(TargetClosedError):
await asyncio.gather(*close_context_coros)
logger.debug("Browser disconnected")
if self.config.restart_disconnected_browser:
del self.browser
Expand Down
92 changes: 88 additions & 4 deletions tests/tests_asyncio/test_browser.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
import asyncio
import logging
import os
import random
import re
import signal
import subprocess
import time
import uuid
from contextlib import asynccontextmanager
from pathlib import Path
from threading import Thread
from typing import Tuple
from unittest import IsolatedAsyncioTestCase

import psutil
import pytest
from playwright._impl._errors import TargetClosedError
from playwright.async_api import async_playwright
from scrapy import Request, Spider

Expand Down Expand Up @@ -77,7 +82,13 @@ async def remote_chromium(with_devtools_protocol: bool = True):
proc.communicate()


class TestRemoteBrowser(IsolatedAsyncioTestCase):
def kill_chrome():
    """Forcefully terminate (SIGKILL) every running process named "chrome"."""
    chrome_pids = [
        proc.info["pid"]
        for proc in psutil.process_iter(["pid", "name"])
        if proc.info["name"] == "chrome"
    ]
    for pid in chrome_pids:
        os.kill(pid, signal.SIGKILL)


class TestBrowserRemoteChromium(IsolatedAsyncioTestCase):
@pytest.fixture(autouse=True)
def inject_fixtures(self, caplog):
caplog.set_level(logging.DEBUG)
Expand All @@ -87,6 +98,7 @@ def inject_fixtures(self, caplog):
async def test_connect_devtools(self):
async with remote_chromium(with_devtools_protocol=True) as devtools_url:
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": "chromium",
"PLAYWRIGHT_CDP_URL": devtools_url,
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
}
Expand All @@ -105,6 +117,7 @@ async def test_connect_devtools(self):
async def test_connect(self):
async with remote_chromium(with_devtools_protocol=False) as browser_url:
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": "chromium",
"PLAYWRIGHT_CONNECT_URL": browser_url,
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
}
Expand All @@ -130,16 +143,16 @@ async def test_connect(self):
) in self._caplog.record_tuples


class TestBrowserReconnect(IsolatedAsyncioTestCase):
class TestBrowserReconnectChromium(IsolatedAsyncioTestCase):
@pytest.fixture(autouse=True)
def inject_fixtures(self, caplog):
caplog.set_level(logging.DEBUG)
self._caplog = caplog

@allow_windows
async def test_restart_browser(self):
async def test_browser_closed_restart(self):
spider = Spider("foo")
async with make_handler() as handler:
async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler:
with StaticMockServer() as server:
req1 = Request(
server.urljoin("/index.html"),
Expand Down Expand Up @@ -172,3 +185,74 @@ async def test_restart_browser(self):
)
== 2 # one at the beginning, one after calling Browser.close() manually
)

@allow_windows
async def test_browser_crashed_restart(self):
    """Killing the browser process mid-crawl should trigger a transparent
    relaunch: requests issued after the crash still produce correct
    responses, and the handler logs one extra disconnect + one extra launch.
    """
    spider = Spider("foo")
    async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler:
        with StaticMockServer() as server:
            req1 = Request(
                server.urljoin("/index.html"),
                meta={"playwright": True, "playwright_include_page": True},
            )
            resp1 = await handler._download_request(req1, spider)
            # Kill the browser process from a separate thread while the
            # follow-up requests are being processed.
            thread = Thread(target=kill_chrome, daemon=True)
            thread.start()
            req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True})
            req3 = Request(server.urljoin("/lorem_ipsum.html"), meta={"playwright": True})
            req4 = Request(server.urljoin("/scroll.html"), meta={"playwright": True})
            resp2 = await handler._download_request(req2, spider)
            resp3 = await handler._download_request(req3, spider)
            resp4 = await handler._download_request(req4, spider)
            thread.join()
            assert_correct_response(resp1, req1)
            assert_correct_response(resp2, req2)
            assert_correct_response(resp3, req3)
            assert_correct_response(resp4, req4)
            assert (
                self._caplog.record_tuples.count(
                    (
                        "scrapy-playwright",
                        logging.DEBUG,
                        "Browser disconnected",
                    )
                )
                == 2  # one mid-crawl after killing the browser process, one at the end
            )
            assert (
                self._caplog.record_tuples.count(
                    (
                        "scrapy-playwright",
                        logging.INFO,
                        "Launching browser chromium",
                    )
                )
                == 2  # one at the beginning, one after killing the browser process
            )

@allow_windows
async def test_browser_crashed_do_not_restart(self):
    """With PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER disabled, requests made
    after the browser process is killed should fail with TargetClosedError
    instead of triggering a relaunch.
    """
    spider = Spider("foo")
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": "chromium",
        "PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER": False,
    }
    async with make_handler(settings_dict=settings_dict) as handler:
        with StaticMockServer() as server:
            await asyncio.sleep(1)  # allow time for the browser to fully launch
            req1 = Request(
                server.urljoin("/index.html"),
                meta={"playwright": True, "playwright_include_page": True},
            )
            resp1 = await handler._download_request(req1, spider)
            assert_correct_response(resp1, req1)
            thread = Thread(target=kill_chrome, daemon=True)
            thread.start()
            req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True})
            req3 = Request(server.urljoin("/lorem_ipsum.html"), meta={"playwright": True})
            req4 = Request(server.urljoin("/scroll.html"), meta={"playwright": True})
            # NOTE(review): all three awaits share one pytest.raises block, so
            # whichever call raises first ends the block and the remaining
            # requests are never issued — presumably intentional ("at least
            # one request fails"), but confirm; wrapping each call separately
            # would assert that every post-crash request fails.
            with pytest.raises(TargetClosedError):
                await handler._download_request(req2, spider)
                await handler._download_request(req3, spider)
                await handler._download_request(req4, spider)
            thread.join()

0 comments on commit 664bd16

Please sign in to comment.