Skip to content

Commit

Permalink
Fix JSON handling with Parsel
Browse files Browse the repository at this point in the history
  • Loading branch information
janbuchar committed Sep 2, 2024
1 parent 2be7454 commit a626d3f
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/crawlee/parsel_crawler/_parsel_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,12 @@ async def _handle_blocked_request(
if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')

parsel = crawling_context.selector

matched_selectors = [
selector
for selector in RETRY_CSS_SELECTORS
if crawling_context.selector.css(selector).get() is not None
if parsel.type in ('html', 'xml') and parsel.css(selector).get() is not None
]

if matched_selectors:
Expand Down
49 changes: 49 additions & 0 deletions tests/unit/parsel_crawler/test_parsel_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,21 @@ async def server() -> AsyncGenerator[respx.MockRouter, None]:
</html>""",
)

mock.get('/json', name='json_endpoint').return_value = Response(
200,
text="""{
"hello": "world"
}""",
)

mock.get('/xml', name='xml_endpoint').return_value = Response(
200,
text="""
<?xml version="1.0"?>
<hello>world</hello>
""",
)

generic_response = Response(
200,
text="""<html>
Expand Down Expand Up @@ -225,3 +240,37 @@ def test_import_error_handled() -> None:
"To import anything from this subpackage, you need to install the 'parsel' extra."
"For example, if you use pip, run `pip install 'crawlee[parsel]'`."
)


async def test_json(server: respx.MockRouter) -> None:
    """Crawl the mocked JSON endpoint and query its body with a JMESPath expression.

    The handler forwards the extracted values to an async mock so that they can
    be inspected once the crawler has finished.
    """
    # Created up front; the request handler below closes over it.
    handler = mock.AsyncMock()
    crawler = ParselCrawler(request_provider=RequestList(['https://test.io/json']))

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        await handler(context.selector.jmespath('hello').getall())

    await crawler.run()

    # The endpoint must have been requested and the handler must have run.
    assert server['json_endpoint'].called
    assert handler.called

    # Index 0 of ``call_args`` holds the positional arguments of the last call.
    positional_args = handler.call_args[0]
    assert positional_args[0] == ['world']


async def test_xml(server: respx.MockRouter) -> None:
    """Crawl the mocked XML endpoint and select the ``hello`` element via CSS.

    The handler forwards the serialized matches to an async mock so that they
    can be inspected once the crawler has finished.
    """
    # Created up front; the request handler below closes over it.
    handler = mock.AsyncMock()
    crawler = ParselCrawler(request_provider=RequestList(['https://test.io/xml']))

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        await handler(context.selector.css('hello').getall())

    await crawler.run()

    # The endpoint must have been requested and the handler must have run.
    assert server['xml_endpoint'].called
    assert handler.called

    # Index 0 of ``call_args`` holds the positional arguments of the last call.
    positional_args = handler.call_args[0]
    assert positional_args[0] == ['<hello>world</hello>']

0 comments on commit a626d3f

Please sign in to comment.