Skip to content

Commit

Permalink
Handle downloads transparently
Browse files Browse the repository at this point in the history
  • Loading branch information
elacuesta committed Sep 5, 2023
1 parent e253ebf commit fd80403
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 9 deletions.
30 changes: 30 additions & 0 deletions examples/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from scrapy import Spider, Request


class DownloadSpider(Spider):
    """Example spider: fetch a regular page and a PDF through Playwright.

    Responses whose meta carries ``playwright_suggested_filename`` are written
    to a file of that name; every response yields a small summary item.
    """

    name = "download"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def start_requests(self):
        """Yield one Playwright-enabled request per target URL."""
        targets = (
            "https://example.org",
            "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        )
        for target in targets:
            # A fresh meta dict per request, so requests never share state.
            yield Request(url=target, meta={"playwright": True})

    def parse(self, response):
        """Save the body to disk when a filename was suggested, then yield a summary."""
        filename = response.meta.get("playwright_suggested_filename")
        if filename:
            with open(filename, "wb") as fp:
                fp.write(response.body)
        summary = {
            "url": response.url,
            "response_cls": response.__class__.__name__,
            "first_bytes": response.body[:60],
            "filename": filename,
        }
        yield summary
60 changes: 51 additions & 9 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
from contextlib import suppress
from dataclasses import dataclass
from ipaddress import ip_address
from tempfile import NamedTemporaryFile
from time import time
from typing import Awaitable, Callable, Dict, Optional, Type, TypeVar, Union
from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union

from playwright.async_api import (
BrowserContext,
BrowserType,
Download,
Error as PlaywrightError,
Page,
PlaywrightContextManager,
Expand Down Expand Up @@ -349,13 +351,9 @@ async def _download_request_with_page(
if request.meta.get("playwright_include_page"):
request.meta["playwright_page"] = page

context_name = request.meta.get("playwright_context")
page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {}
page_goto_kwargs.pop("url", None)

start_time = time()
response = await page.goto(url=request.url, **page_goto_kwargs)
if response is not None:
response, download = await self._get_response_and_download(request=request, page=page)
if isinstance(response, PlaywrightResponse):
await _set_redirect_meta(request=request, response=response)
headers = Headers(await response.all_headers())
headers.pop("Content-Encoding", None)
Expand All @@ -366,7 +364,7 @@ async def _download_request_with_page(
request,
extra={
"spider": spider,
"context_name": context_name,
"context_name": request.meta.get("playwright_context"),
"scrapy_request_url": request.url,
"scrapy_request_method": request.method,
},
Expand All @@ -377,7 +375,7 @@ async def _download_request_with_page(
body_str = await _get_page_content(
page=page,
spider=spider,
context_name=context_name,
context_name=request.meta.get("playwright_context"),
scrapy_request_url=request.url,
scrapy_request_method=request.method,
)
Expand All @@ -394,6 +392,18 @@ async def _download_request_with_page(
await page.close()
self.stats.inc_value("playwright/page_count/closed")

if download:
self.stats.inc_value("playwright/response_count/download")
request.meta["playwright_suggested_filename"] = download.get("suggested_filename")
respcls = responsetypes.from_args(url=download["url"], body=download["bytes"])
return respcls(
url=download["url"],
status=200,
body=download["bytes"],
request=request,
flags=["playwright"],
)

body, encoding = _encode_body(headers=headers, text=body_str)
respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
return respcls(
Expand All @@ -407,6 +417,38 @@ async def _download_request_with_page(
ip_address=server_ip_address,
)

async def _get_response_and_download(
    self, request: Request, page: Page
) -> Tuple[Optional[PlaywrightResponse], dict]:
    """Navigate ``page`` to ``request.url`` and capture a download if one starts.

    A "download" listener is attached to the page for the duration of the
    navigation. When ``page.goto`` fails with ``net::ERR_ABORTED`` the method
    waits for that listener to finish processing the download.

    Returns:
        A ``(response, download)`` tuple. ``response`` is whatever
        ``page.goto`` returned, or ``None`` if the navigation was aborted.
        ``download`` is an empty dict when no download was handled, otherwise
        it holds the keys ``"bytes"``, ``"url"`` and ``"suggested_filename"``.

    Raises:
        PlaywrightError: if ``page.goto`` fails with anything other than
            ``net::ERR_ABORTED``.
        RuntimeError: if the download reports a failure.
        Exception: any other error raised while saving or reading the download.
    """
    response: Optional[PlaywrightResponse] = None
    download: dict = {}  # updated in-place in _handle_download
    download_error: Optional[Exception] = None
    download_ready = asyncio.Event()

    async def _handle_download(dwnld: Download) -> None:
        nonlocal download_error
        self.stats.inc_value("playwright/request_count/download")
        try:
            if failure := await dwnld.failure():
                raise RuntimeError(failure)
            with NamedTemporaryFile() as temp_file:
                await dwnld.save_as(temp_file.name)
                temp_file.seek(0)
                download["bytes"] = temp_file.read()
            download["url"] = dwnld.url
            download["suggested_filename"] = dwnld.suggested_filename
        except Exception as ex:
            # An exception raised from an event handler never reaches the
            # coroutine awaiting the event: keep it so it can be re-raised there.
            download_error = ex
        finally:
            # Always wake the waiter, otherwise a failed download blocks forever.
            download_ready.set()

    # Work on a copy so the dict stored in request.meta is not modified.
    page_goto_kwargs = dict(request.meta.get("playwright_page_goto_kwargs") or {})
    page_goto_kwargs.pop("url", None)

    page.on("download", _handle_download)
    try:
        response = await page.goto(url=request.url, **page_goto_kwargs)
    except PlaywrightError as ex:
        if "net::ERR_ABORTED" not in str(ex):
            raise
        await download_ready.wait()  # TODO: add timeout
        if download_error is not None:
            raise download_error
    finally:
        # Detach the listener on every exit path, including navigation errors.
        page.remove_listener("download", _handle_download)

    return response, download

async def _apply_page_methods(self, page: Page, request: Request, spider: Spider) -> None:
context_name = request.meta.get("playwright_context")
page_methods = request.meta.get("playwright_page_methods") or ()
Expand Down

0 comments on commit fd80403

Please sign in to comment.