Skip to content

Commit

Permalink
Merge pull request #99 from Nykakin/processor-changes
Browse files: browse the repository at this point in the history
Processor changes
  • Loading branch information
kmike authored Aug 26, 2024
2 parents b9daff6 + 12cb8ef commit a926028
Show file tree
Hide file tree
Showing 4 changed files with 189 additions and 29 deletions.
114 changes: 104 additions & 10 deletions tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,22 @@
from zyte_parsers import Gtin as zp_Gtin
from zyte_parsers import extract_breadcrumbs

from zyte_common_items import AggregateRating, BasePage, Breadcrumb, Gtin, ProductPage
from zyte_common_items import (
AggregateRating,
BasePage,
Brand,
Breadcrumb,
Gtin,
Image,
ProductPage,
)
from zyte_common_items.processors import (
_format_price,
brand_processor,
breadcrumbs_processor,
gtin_processor,
images_processor,
price_processor,
rating_processor,
)

Expand Down Expand Up @@ -125,16 +135,18 @@ def breadcrumbs(self):
"input_value,expected_value",
[
(None, None),
("", ""),
("foo", "foo"),
("", None),
(" ", None),
("foo", Brand(name="foo")),
(" foo ", Brand(name="foo")),
(Selector(text="<html></html>"), None),
(SelectorList([]), None),
(fromstring("<p>foo</p>"), "foo"),
(fromstring("<img alt='foo'>"), "foo"),
(fromstring("<p><img alt='foo'></p>"), "foo"),
(fromstring("<p><p><img alt='foo'></p></p>"), "foo"),
(Selector(text="<p>foo</p>"), "foo"),
(SelectorList([Selector(text="<p>foo</p>")]), "foo"),
(fromstring("<p>foo</p>"), Brand(name="foo")),
(fromstring("<img alt='foo'>"), Brand(name="foo")),
(fromstring("<p><img alt='foo'></p>"), Brand(name="foo")),
(fromstring("<p><p><img alt='foo'></p></p>"), Brand(name="foo")),
(Selector(text="<p>foo</p>"), Brand(name="foo")),
(SelectorList([Selector(text="<p>foo</p>")]), Brand(name="foo")),
],
)
def test_brand(input_value, expected_value):
Expand All @@ -158,7 +170,7 @@ def brand(self):
body="<html><body><img alt='foo'></body></html>".encode(),
)
page = MyProductPage(response=response)
assert page.brand == "foo"
assert page.brand == Brand(name="foo")


@pytest.mark.parametrize(
Expand Down Expand Up @@ -321,3 +333,85 @@ def aggregateRating(self):
assert page.aggregateRating == AggregateRating(
ratingValue=3.8, bestRating=10, reviewCount=5
)


# (input_value, expected_value) pairs fed to ``images_processor``.
_IMAGES_PROCESSOR_CASES = [
    # Values that are passed through untouched.
    (None, None),
    ([], []),
    # A bare URL string becomes a single-element list.
    (
        "https://www.url.com/img.jpg",
        [Image(url="https://www.url.com/img.jpg")],
    ),
    # Image objects are kept as they are.
    (
        [
            Image(url="https://www.url.com/img1.jpg"),
            Image(url="https://www.url.com/img2.jpg"),
        ],
        [
            Image(url="https://www.url.com/img1.jpg"),
            Image(url="https://www.url.com/img2.jpg"),
        ],
    ),
    # URL strings inside a list are wrapped into Image objects.
    (
        ["https://www.url.com/img1.jpg", "https://www.url.com/img2.jpg"],
        [
            Image(url="https://www.url.com/img1.jpg"),
            Image(url="https://www.url.com/img2.jpg"),
        ],
    ),
    # Mappings contribute their "url" value.
    (
        [
            {"url": "https://www.url.com/img1.jpg"},
            {"url": "https://www.url.com/img2.jpg"},
        ],
        [
            Image(url="https://www.url.com/img1.jpg"),
            Image(url="https://www.url.com/img2.jpg"),
        ],
    ),
]


@pytest.mark.parametrize("input_value,expected_value", _IMAGES_PROCESSOR_CASES)
def test_images(input_value, expected_value):
    """Check the output of ``images_processor`` when attached to a page field."""

    class _PageWithImages(BasePage):
        @field(out=[images_processor])
        def images(self):
            return input_value

    processed = _PageWithImages(base_url).images  # type: ignore[arg-type]
    assert processed == expected_value


def test_images_page():
    """Check that ``ProductPage.images`` yields ``Image`` objects from raw URLs.

    The field below returns plain strings and declares no processor of its
    own, so the conversion has to come from the page class itself.
    """

    class _ProductPageWithImages(ProductPage):
        @field
        def images(self):
            # NOTE(review): real <img> tags carry their URL in ``src``; the
            # ``href`` attribute here only needs to match the markup below.
            return self.css("img::attr(href)").getall()

    markup = "<html><body><img href='https://www.url.com/img.jpg'></body></html>"
    http_response = HttpResponse(
        url="http://www.example.com/",
        body=markup.encode(),
    )

    product_page = _ProductPageWithImages(response=http_response)
    expected_images = [Image(url="https://www.url.com/img.jpg")]
    assert product_page.images == expected_images


# (input_value, expected_value) pairs fed to ``price_processor``.
_PRICE_PROCESSOR_CASES = [
    # Numbers are rendered with exactly two decimals.
    (100, "100.00"),
    # None and empty containers come back unchanged.
    (None, None),
    ([], []),
    ({}, {}),
    (22.9, "22.90"),
    (22.0, "22.00"),
    # Strings are never reformatted, even when they look numeric.
    ("22.9", "22.9"),
    ("Do not apply to strings...", "Do not apply to strings..."),
]


@pytest.mark.parametrize("input_value,expected_value", _PRICE_PROCESSOR_CASES)
def test_prices(input_value, expected_value):
    """Check the output of ``price_processor`` for non-node inputs."""

    class _PageWithPrice(BasePage):
        @field(out=[price_processor])
        def price(self):
            return input_value

    processed = _PageWithPrice(base_url).price  # type: ignore[arg-type]
    assert processed == expected_value
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ commands = mypy zyte_common_items tests
[testenv:twinecheck]
basepython = python3
deps =
twine==4.0.2
twine==5.1.1
build==0.10.0
commands =
python -m build --sdist
Expand Down
3 changes: 3 additions & 0 deletions zyte_common_items/pages/product.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
description_html_processor,
description_processor,
gtin_processor,
images_processor,
price_processor,
rating_processor,
simple_price_processor,
Expand Down Expand Up @@ -46,6 +47,7 @@ class Processors(BasePage.Processors):
gtin = [gtin_processor]
price = [price_processor]
regularPrice = [simple_price_processor]
images = [images_processor]


class ProductPage(
Expand All @@ -62,6 +64,7 @@ class Processors(Page.Processors):
gtin = [gtin_processor]
price = [price_processor]
regularPrice = [simple_price_processor]
images = [images_processor]


@attrs.define
Expand Down
99 changes: 81 additions & 18 deletions zyte_common_items/processors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections.abc import Iterable
from collections.abc import Iterable, Mapping
from functools import wraps
from numbers import Real
from typing import Any, Callable, List, Optional, Union

from clear_html import clean_node, cleaned_node_to_html, cleaned_node_to_text
Expand All @@ -21,8 +22,10 @@
from .components import (
AggregateRating,
BaseMetadata,
Brand,
Breadcrumb,
Gtin,
Image,
ProbabilityRequest,
Request,
)
Expand Down Expand Up @@ -104,50 +107,79 @@ def _from_zp_breadcrumb(value: zp_Breadcrumb) -> Breadcrumb:
return results


@only_handle_nodes
def brand_processor(value: Union[Selector, HtmlElement], page: Any) -> Any:
def brand_processor(value: Any, page: Any) -> Any:
"""Convert the data into a brand name if possible.
Supported inputs are :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`.
Other inputs are returned as is.
If inputs are either :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList` or :class:`~lxml.html.HtmlElement`, attempts
to extract brand data from it.
If value is a string, uses it to create a :class:`~zyte_common_items.Brand` instance.
Other inputs are returned unchanged.
"""
return extract_brand_name(value, search_depth=2)
value = _handle_selectorlist(value)

if isinstance(value, str):
value = value.strip()
return Brand(name=value) if value else None

@only_handle_nodes
def price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any:
if isinstance(value, (Selector, SelectorList, HtmlElement)):
if brand_name := extract_brand_name(value, search_depth=2):
return Brand(name=brand_name)
else:
return None

return value


def price_processor(value: Any, page: Any) -> Any:
"""Convert the data into a price string if possible.
Uses the price-parser_ library.
Supported inputs are :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`.
:class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values.
Other inputs are returned as is.
Puts the parsed Price object into ``page._parsed_price``.
.. _price-parser: https://github.com/scrapinghub/price-parser
"""
price = extract_price(value)
page._parsed_price = price
return _format_price(price)
value = _handle_selectorlist(value)

if isinstance(value, Real):
return f"{value:.2f}"
elif isinstance(value, (Selector, HtmlElement)):
price = extract_price(value)
page._parsed_price = price
return _format_price(price)
else:
return value

@only_handle_nodes
def simple_price_processor(value: Union[Selector, HtmlElement], page: Any) -> Any:

def simple_price_processor(value: Any, page: Any) -> Any:
"""Convert the data into a price string if possible.
Uses the price-parser_ library.
Supported inputs are :class:`~parsel.selector.Selector`,
:class:`~parsel.selector.SelectorList` and :class:`~lxml.html.HtmlElement`.
:class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement` and numeric values.
Other inputs are returned as is.
.. _price-parser: https://github.com/scrapinghub/price-parser
"""
price = extract_price(value)
return _format_price(price)
value = _handle_selectorlist(value)

if isinstance(value, Real):
return f"{value:.2f}"
elif isinstance(value, (Selector, HtmlElement)):
price = extract_price(value)
return _format_price(price)
else:
return value


@only_handle_nodes
Expand Down Expand Up @@ -330,6 +362,37 @@ def aggregateRating(self):
return value


def images_processor(value: Any, page: Any) -> Any:
    """Convert the data into a list of :class:`~zyte_common_items.Image`
    objects if possible.

    If the input is a string, it's used as a url for returning image object.

    If the input is a single mapping, it is handled like a one-item list of
    mappings.

    If input is either an iterable of strings or mappings with "url" key, they are
    used to populate image objects. :class:`~zyte_common_items.Image` items
    are kept as they are; mappings without a truthy "url" value and items of
    any other type are dropped.

    Other inputs are returned unchanged.

    *page* is accepted to match the processor call signature; it is not used.
    """

    if isinstance(value, str):
        return [Image(url=value)]

    if isinstance(value, Mapping):
        # A mapping is itself iterable, so without this step the loop below
        # would walk its *keys* and build e.g. ``Image(url="url")``.
        value = [value]

    if isinstance(value, Iterable):
        results: List[Any] = []
        for item in value:
            if isinstance(item, Mapping):
                if url := item.get("url"):
                    results.append(Image(url=url))
            elif isinstance(item, str):
                results.append(Image(url=item))
            elif isinstance(item, Image):
                results.append(item)

        return results

    return value


def probability_request_list_processor(
request_list: List[Request],
) -> List[ProbabilityRequest]:
Expand Down

0 comments on commit a926028

Please sign in to comment.