
Merge pull request #17 from zytedata/pipeline
create DuplicateUrlDiscarderPipeline
BurnzZ authored Jul 19, 2024
2 parents 260b145 + c562bc6 commit d0bdb6d
Showing 10 changed files with 267 additions and 12 deletions.
53 changes: 41 additions & 12 deletions README.rst
@@ -22,7 +22,7 @@ duplicate-url-discarder
:target: https://duplicate-url-discarder.readthedocs.io/en/stable/?badge=stable
:alt: Documentation Status

``duplicate-url-discarder`` contains a Scrapy fingerprinter that uses
**duplicate-url-discarder** contains a Scrapy fingerprinter that uses
customizable URL processors to canonicalize URLs before fingerprinting.
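
One way to enable it is through Scrapy's add-on mechanism; a minimal sketch, with
the priority value mirroring the test suite in this change:

.. code-block:: python

    # settings.py -- a minimal sketch; the add-on also registers the new item pipeline
    ADDONS = {
        "duplicate_url_discarder.Addon": 600,
    }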

Quick Start
@@ -87,7 +87,7 @@ canonical form.
URL Processors
==============

``duplicate-url-discarder`` utilizes *URL processors* to make canonical
**duplicate-url-discarder** utilizes *URL processors* to make canonical
versions of URLs. The processors are configured with *URL rules*. Each URL rule
specifies a URL pattern to which the processor applies, and the specific
processor arguments to use.
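
As an illustrative sketch (the ``queryRemoval`` processor name, the rule fields,
and the example domain below are assumptions rather than part of this change), a
rules file is plain JSON and can be generated from Python:

.. code-block:: python

    import json

    # Hypothetical rule: strip the "utm_source" query parameter on example.com.
    rules = [
        {
            "args": ["utm_source"],
            "order": 100,
            "processor": "queryRemoval",
            "urlPattern": {"include": ["example.com"]},
        }
    ]

    with open("custom_rules1.json", "w") as f:
        json.dump(rules, f, indent=2)
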
@@ -146,20 +146,49 @@ non-universal rules that match the URL, the universal ones are applied.
Configuration
=============

``duplicate-url-discarder`` uses the following Scrapy settings:
**duplicate-url-discarder** uses the following Scrapy settings:

* ``DUD_LOAD_RULE_PATHS``: a list of file paths (``str`` or ``pathlib.Path``)
  pointing to JSON files with the URL rules to apply:

  .. code-block:: python

      DUD_LOAD_RULE_PATHS = [
          "/home/user/project/custom_rules1.json",
      ]

  The default value of this setting is empty. However, if the package
  `duplicate-url-discarder-rules`_ is installed and ``DUD_LOAD_RULE_PATHS``
  has been left empty, the rules in said package are automatically used.

* ``DUD_ATTRIBUTES_PER_ITEM``: a mapping of an item type *(or its import path)*
  to a list of attribute names present in instances of that type.

  For example:

  .. code-block:: python

      DUD_ATTRIBUTES_PER_ITEM = {
          "zyte_common_items.Product": [
              "canonicalUrl",
              "brand",
              "name",
              "gtin",
              "mpn",
              "productId",
              "sku",
              "color",
              "size",
              "style",
          ],
          # Other than strings representing import paths, types are supported as well.
          dict: ["name"],
      }
  This setting lets DUD select which attributes to use to derive a signature for
  an item. The signature is then used to compare the identities of different
  items: for instance, ``duplicate_url_discarder.DuplicateUrlDiscarderPipeline``
  uses it to detect and drop duplicate items that were extracted, as sketched
  below.
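
A small sketch of the signature idea, reusing the ``dict: ["name"]`` mapping from
the example above:

.. code-block:: python

    # With the pipeline enabled and this mapping, two dict items are treated as
    # duplicates whenever their "name" values match, regardless of other fields.
    DUD_ATTRIBUTES_PER_ITEM = {dict: ["name"]}

    item_a = {"name": "AAA", "value": 0}  # kept
    item_b = {"name": "AAA", "value": 2}  # would be dropped as a duplicate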

.. _scrapy-zyte-api: https://github.com/scrapy-plugins/scrapy-zyte-api
.. _duplicate-url-discarder-rules: https://github.com/zytedata/duplicate-url-discarder-rules
1 change: 1 addition & 0 deletions duplicate_url_discarder/__init__.py
@@ -2,3 +2,4 @@

from ._addon import Addon
from ._fingerprinter import Fingerprinter
from .pipelines import DuplicateUrlDiscarderPipeline
24 changes: 24 additions & 0 deletions duplicate_url_discarder/_addon.py
@@ -1,4 +1,22 @@
from scrapy.settings import BaseSettings
from scrapy.utils.misc import load_object

from duplicate_url_discarder.pipelines import DuplicateUrlDiscarderPipeline


def _setdefault(settings, setting, cls, pos) -> None:
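    """Add ``cls`` to the dict-like ``setting`` with value ``pos``, unless the
    class (or an import path pointing to it) is already a key."""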
setting_value = settings[setting]
if not setting_value:
settings[setting] = {cls: pos}
return None
if cls in setting_value:
return None
for cls_or_path in setting_value:
if isinstance(cls_or_path, str):
_cls = load_object(cls_or_path)
if _cls == cls:
return None
settings[setting][cls] = pos


class Addon:
@@ -14,3 +14,32 @@ def update_settings(self, settings: BaseSettings) -> None:
current_fpr,
"addon",
)
_setdefault(
settings,
"ITEM_PIPELINES",
DuplicateUrlDiscarderPipeline,
100,
)
42 changes: 42 additions & 0 deletions duplicate_url_discarder/pipelines.py
@@ -0,0 +1,42 @@
from typing import Any, Dict, Set, Tuple, TypeVar

from itemadapter import ItemAdapter
from scrapy import Spider
from scrapy.crawler import Crawler
from scrapy.exceptions import DropItem, NotConfigured

from .utils import item_signature, load_keys_from_path

T = TypeVar("T")


class DuplicateUrlDiscarderPipeline:
def __init__(self, crawler: Crawler):
#: A mapping of item class to a list of attribute names which will be used to
#: compute the item signature.
self._attributes_per_item: Dict[Any, Tuple[str, ...]] = load_keys_from_path(
crawler.settings.getdict("DUD_ATTRIBUTES_PER_ITEM", {})
)

#: Record of all the item signatures that were previously processed and seen.
self._seen_item_signatures: Set[int] = set()

@classmethod
def from_crawler(cls, crawler: Crawler):
if not crawler.settings.getdict("DUD_ATTRIBUTES_PER_ITEM"):
raise NotConfigured(
"Set DUD_ATTRIBUTES_PER_ITEM to enable DuplicateUrlDiscarderPipeline"
)
return cls(crawler)

def process_item(self, item: T, spider: Spider) -> T:
item_attributes = self._attributes_per_item.get(type(item))
if not item_attributes:
return item

signature = item_signature(ItemAdapter(item), item_attributes)
if signature in self._seen_item_signatures:
raise DropItem(f"Dropping item that was already seen before:\n{item}")

self._seen_item_signatures.add(signature)
return item
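
If the add-on is not used, the pipeline can also be enabled by hand; a sketch, with
the priority mirroring the add-on default above and a placeholder attribute mapping
(DUD_ATTRIBUTES_PER_ITEM must be non-empty, otherwise the pipeline raises NotConfigured):

    # settings.py -- enabling the pipeline without the add-on (a sketch)
    ITEM_PIPELINES = {
        "duplicate_url_discarder.pipelines.DuplicateUrlDiscarderPipeline": 100,
    }
    DUD_ATTRIBUTES_PER_ITEM = {dict: ["name"]}  # placeholder; list your own item types
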
25 changes: 25 additions & 0 deletions duplicate_url_discarder/utils.py
@@ -0,0 +1,25 @@
from typing import Any, Dict, Iterable, TypeVar

from itemadapter import ItemAdapter
from scrapy.utils.misc import load_object

T = TypeVar("T")


def load_keys_from_path(mapping: Dict[Any, T]) -> Dict[Any, T]:
"""Given a mapping, convert the keys that represent import paths
into their respective objects.
"""
new_mapping = {}
for cls_or_path, values in mapping.items():
cls = load_object(cls_or_path) if isinstance(cls_or_path, str) else cls_or_path
new_mapping[cls] = values
return new_mapping


def item_signature(item: ItemAdapter, item_attributes: Iterable[str]) -> int:
try:
values = [f"{attrib}:{item.get(attrib)}" for attrib in item_attributes]
except AttributeError:
raise ValueError(f"Got type {type(item)} but expected ItemAdapter.")
return hash("|".join(values))
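
For reference, the signature is a hash of "attribute:value" pairs joined with "|";
a usage sketch mirroring the tests further down:

    from itemadapter import ItemAdapter

    from duplicate_url_discarder.utils import item_signature

    adapter = ItemAdapter({"name": "fake_item"})
    assert item_signature(adapter, ["name"]) == hash("name:fake_item")
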
1 change: 1 addition & 0 deletions pyproject.toml
@@ -50,6 +50,7 @@ multi_line_output = 3
module = [
"scrapy.*",
"url_matcher.*",
"pytest_twisted.*"
]
ignore_missing_imports = true

2 changes: 2 additions & 0 deletions tests/test_addon.py
@@ -7,6 +7,7 @@
from scrapy.utils.test import get_crawler

from duplicate_url_discarder import Addon
from duplicate_url_discarder.pipelines import DuplicateUrlDiscarderPipeline


def test_addon():
@@ -24,6 +25,7 @@ def test_addon():
crawler.settings["DUD_FALLBACK_REQUEST_FINGERPRINTER_CLASS"]
== _SCRAPY_DEFAULT_REQUEST_FINGEPRINTER_CLASS
)
assert crawler.settings["ITEM_PIPELINES"] == {DuplicateUrlDiscarderPipeline: 100}


def test_addon_fallback():
70 changes: 70 additions & 0 deletions tests/test_pipelines.py
@@ -0,0 +1,70 @@
from dataclasses import dataclass

from pytest_twisted import ensureDeferred
from scrapy import Spider
from scrapy.crawler import Crawler
from scrapy.utils.test import get_crawler


@ensureDeferred
async def test_duplicate_url_discarder_pipeline_no_addon(caplog) -> None:
caplog.set_level("INFO")

class FakeSpider(Spider):
name = "fake_spider"
start_urls = ["data:,"]

crawler: Crawler = get_crawler(FakeSpider, {})
await crawler.crawl()
    assert any(
        "Enabled item pipelines:\n[]" in record.message for record in caplog.records
    )


@ensureDeferred
async def test_duplicate_url_discarder_pipeline_with_addon(caplog) -> None:
caplog.set_level("INFO")

@dataclass
class FakeItem:
name: str
value: int

class FakeSpider(Spider):
name = "fake_spider"
start_urls = ["data:,"]

def parse(self, response):
yield FakeItem(name="AAA", value=0)
yield FakeItem(name="BBB", value=1)
yield FakeItem(name="AAA", value=2) # dropped

# These aren't declared in DUD_ATTRIBUTES_PER_ITEM, so they aren't dropped
yield {"name": "CCC", "value": 3}
yield {"name": "CCC", "value": 4}

settings = {
"DUD_ATTRIBUTES_PER_ITEM": {FakeItem: ["name"]},
"ADDONS": {
"duplicate_url_discarder.Addon": 600,
},
}
crawler: Crawler = get_crawler(FakeSpider, settings)
await crawler.crawl()

expected_text = "Enabled item pipelines:\n[<class 'duplicate_url_discarder.pipelines.DuplicateUrlDiscarderPipeline'>]"
messages = [record.message for record in caplog.records]
assert any(True for record in caplog.records if expected_text in messages)

assert crawler.stats.get_value("item_scraped_count") == 4
assert crawler.stats.get_value("item_dropped_count") == 1

dropped_messages = [
record.message
for record in caplog.records
if "Dropping item that was already seen before" in record.message
]
assert len(dropped_messages) == 1
assert "FakeItem(name='AAA', value=2)" in dropped_messages[0]
60 changes: 60 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,60 @@
from dataclasses import dataclass

import pytest
from itemadapter import ItemAdapter

from duplicate_url_discarder.utils import item_signature, load_keys_from_path


class Object1:
pass


class Object2:
pass


def test_load_keys_from_path() -> None:
assert load_keys_from_path({}) == {}

mapping = {"tests.test_utils.Object1": "object1", Object2: "object2"}
assert load_keys_from_path(mapping) == {Object1: "object1", Object2: "object2"}

with pytest.raises(
ValueError, match="Error loading object 'does-not-exist': not a full path"
):
load_keys_from_path({"does-not-exist": True})


def test_item_signature() -> None:
@dataclass
class FakeItem:
name: str

value = "fake_item"
item = FakeItem(value)
adapter = ItemAdapter(item)

assert item_signature(adapter, ["name"]) == hash("name:fake_item")

exception_text = (
"Got type <class 'tests.test_utils.test_item_signature.<locals>.FakeItem'> "
"but expected ItemAdapter."
)
with pytest.raises(ValueError, match=exception_text):
item_signature(item, ["name"]) # type: ignore


@pytest.mark.xfail(
reason="unsupported edge case due to setup of attribute name and values"
)
def test_item_signature_edge_case():
item_attributes = ["a", "b"]

adapter = ItemAdapter({"a": "a|b:b", "b": "b"})
sig_1 = item_signature(adapter, item_attributes)

adapter = ItemAdapter({"a": "a", "b": "b|b:b"})
sig_2 = item_signature(adapter, item_attributes)

assert sig_1 != sig_2
1 change: 1 addition & 0 deletions tox.ini
@@ -5,6 +5,7 @@ envlist = py,pre-commit,mypy,docs,twinecheck,rules
deps =
pytest
pytest-cov
pytest-twisted
commands =
py.test \
--cov-report=term --cov-report=html --cov-report= --cov-report=xml \
