
Merge pull request #17 from zytedata/pipeline
create DuplicateUrlDiscarderPipeline
BurnzZ authored Jul 19, 2024
2 parents 260b145 + c562bc6 commit d0bdb6d
Showing 10 changed files with 267 additions and 12 deletions.
53 changes: 41 additions & 12 deletions README.rst
@@ -22,7 +22,7 @@ duplicate-url-discarder
:target: https://duplicate-url-discarder.readthedocs.io/en/stable/?badge=stable
:alt: Documentation Status

``duplicate-url-discarder`` contains a Scrapy fingerprinter that uses
**duplicate-url-discarder** contains a Scrapy fingerprinter that uses
customizable URL processors to canonicalize URLs before fingerprinting.
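
One way to enable it is through Scrapy's add-on mechanism; a minimal sketch, with
the priority value mirroring the test suite in this change:

.. code-block:: python

    # settings.py -- a minimal sketch; the add-on also registers the new item pipeline
    ADDONS = {
        "duplicate_url_discarder.Addon": 600,
    }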

Quick Start
@@ -87,7 +87,7 @@ canonical form.
URL Processors
==============

``duplicate-url-discarder`` utilizes *URL processors* to make canonical
**duplicate-url-discarder** utilizes *URL processors* to make canonical
versions of URLs. The processors are configured with *URL rules*. Each URL rule
specifies a URL pattern to which the processor applies, and the specific
processor arguments to use.
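
As an illustrative sketch (the ``queryRemoval`` processor name, the rule fields,
and the example domain below are assumptions rather than part of this change), a
rules file is plain JSON and can be generated from Python:

.. code-block:: python

    import json

    # Hypothetical rule: strip the "utm_source" query parameter on example.com.
    rules = [
        {
            "args": ["utm_source"],
            "order": 100,
            "processor": "queryRemoval",
            "urlPattern": {"include": ["example.com"]},
        }
    ]

    with open("custom_rules1.json", "w") as f:
        json.dump(rules, f, indent=2)
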
@@ -146,20 +146,49 @@ non-universal rules that match the URL, the universal ones are applied.
Configuration
=============

``duplicate-url-discarder`` uses the following Scrapy settings:
**duplicate-url-discarder** uses the following Scrapy settings:

* ``DUD_LOAD_RULE_PATHS``: a list of file paths (``str`` or ``pathlib.Path``)
  pointing to JSON files with the URL rules to apply:

  .. code-block:: python

      DUD_LOAD_RULE_PATHS = [
          "/home/user/project/custom_rules1.json",
      ]

  The default value of this setting is empty. However, if the package
  `duplicate-url-discarder-rules`_ is installed and ``DUD_LOAD_RULE_PATHS``
  has been left empty, the rules in said package are automatically used.

* ``DUD_ATTRIBUTES_PER_ITEM``: a mapping of an item type *(or its import path)*
  to a list of attribute names present in instances of that type.

  For example:

  .. code-block:: python

      DUD_ATTRIBUTES_PER_ITEM = {
          "zyte_common_items.Product": [
              "canonicalUrl",
              "brand",
              "name",
              "gtin",
              "mpn",
              "productId",
              "sku",
              "color",
              "size",
              "style",
          ],
          # Other than strings representing import paths, types are supported as well.
          dict: ["name"],
      }
  This setting lets DUD select which attributes to use to derive a signature for
  an item. The signature is then used to compare the identities of different
  items: for instance, ``duplicate_url_discarder.DuplicateUrlDiscarderPipeline``
  uses it to detect and drop duplicate items that were extracted, as sketched
  below.
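
A small sketch of the signature idea, reusing the ``dict: ["name"]`` mapping from
the example above:

.. code-block:: python

    # With the pipeline enabled and this mapping, two dict items are treated as
    # duplicates whenever their "name" values match, regardless of other fields.
    DUD_ATTRIBUTES_PER_ITEM = {dict: ["name"]}

    item_a = {"name": "AAA", "value": 0}  # kept
    item_b = {"name": "AAA", "value": 2}  # would be dropped as a duplicate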

.. _scrapy-zyte-api: https://github.com/scrapy-plugins/scrapy-zyte-api
.. _duplicate-url-discarder-rules: https://github.com/zytedata/duplicate-url-discarder-rules
1 change: 1 addition & 0 deletions duplicate_url_discarder/__init__.py
@@ -2,3 +2,4 @@

from ._addon import Addon
from ._fingerprinter import Fingerprinter
from .pipelines import DuplicateUrlDiscarderPipeline
24 changes: 24 additions & 0 deletions duplicate_url_discarder/_addon.py
@@ -1,4 +1,22 @@
from scrapy.settings import BaseSettings
from scrapy.utils.misc import load_object

from duplicate_url_discarder.pipelines import DuplicateUrlDiscarderPipeline


def _setdefault(settings, setting, cls, pos) -> None:
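    """Add ``cls`` to the dict-like ``setting`` with value ``pos``, unless the
    class (or an import path pointing to it) is already a key."""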
setting_value = settings[setting]
if not setting_value:
settings[setting] = {cls: pos}
return None
if cls in setting_value:
return None
for cls_or_path in setting_value:
if isinstance(cls_or_path, str):
_cls = load_object(cls_or_path)
if _cls == cls:
return None
settings[setting][cls] = pos


class Addon:
@@ -14,3 +14,32 @@ def update_settings(self, settings: BaseSettings) -> None:
current_fpr,
"addon",
)
_setdefault(
settings,
"ITEM_PIPELINES",
DuplicateUrlDiscarderPipeline,
100,
)
42 changes: 42 additions & 0 deletions duplicate_url_discarder/pipelines.py
@@ -0,0 +1,42 @@
from typing import Any, Dict, Set, Tuple, TypeVar

from itemadapter import ItemAdapter
from scrapy import Spider
from scrapy.crawler import Crawler
from scrapy.exceptions import DropItem, NotConfigured

from .utils import item_signature, load_keys_from_path

T = TypeVar("T")


class DuplicateUrlDiscarderPipeline:
def __init__(self, crawler: Crawler):
#: A mapping of item class to a list of attribute names which will be used to
#: compute the item signature.
self._attributes_per_item: Dict[Any, Tuple[str, ...]] = load_keys_from_path(
crawler.settings.getdict("DUD_ATTRIBUTES_PER_ITEM", {})
)

#: Record of all the item signatures that were previously processed and seen.
self._seen_item_signatures: Set[int] = set()

@classmethod
def from_crawler(cls, crawler: Crawler):
if not crawler.settings.getdict("DUD_ATTRIBUTES_PER_ITEM"):
raise NotConfigured(
"Set DUD_ATTRIBUTES_PER_ITEM to enable DuplicateUrlDiscarderPipeline"
)
return cls(crawler)

def process_item(self, item: T, spider: Spider) -> T:
item_attributes = self._attributes_per_item.get(type(item))
if not item_attributes:
return item

signature = item_signature(ItemAdapter(item), item_attributes)
if signature in self._seen_item_signatures:
raise DropItem(f"Dropping item that was already seen before:\n{item}")

self._seen_item_signatures.add(signature)
return item
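
If the add-on is not used, the pipeline can also be enabled by hand; a sketch, with
the priority mirroring the add-on default above and a placeholder attribute mapping
(DUD_ATTRIBUTES_PER_ITEM must be non-empty, otherwise the pipeline raises NotConfigured):

    # settings.py -- enabling the pipeline without the add-on (a sketch)
    ITEM_PIPELINES = {
        "duplicate_url_discarder.pipelines.DuplicateUrlDiscarderPipeline": 100,
    }
    DUD_ATTRIBUTES_PER_ITEM = {dict: ["name"]}  # placeholder; list your own item types
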
25 changes: 25 additions & 0 deletions duplicate_url_discarder/utils.py
@@ -0,0 +1,25 @@
from typing import Any, Dict, Iterable, TypeVar

from itemadapter import ItemAdapter
from scrapy.utils.misc import load_object

T = TypeVar("T")


def load_keys_from_path(mapping: Dict[Any, T]) -> Dict[Any, T]:
"""Given a mapping, convert the keys that represent import paths
into their respective objects.
"""
new_mapping = {}
for cls_or_path, values in mapping.items():
cls = load_object(cls_or_path) if isinstance(cls_or_path, str) else cls_or_path
new_mapping[cls] = values
return new_mapping


def item_signature(item: ItemAdapter, item_attributes: Iterable[str]) -> int:
try:
values = [f"{attrib}:{item.get(attrib)}" for attrib in item_attributes]
except AttributeError:
raise ValueError(f"Got type {type(item)} but expected ItemAdapter.")
return hash("|".join(values))
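
For reference, the signature is a hash of "attribute:value" pairs joined with "|";
a usage sketch mirroring the tests further down:

    from itemadapter import ItemAdapter

    from duplicate_url_discarder.utils import item_signature

    adapter = ItemAdapter({"name": "fake_item"})
    assert item_signature(adapter, ["name"]) == hash("name:fake_item")
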
1 change: 1 addition & 0 deletions pyproject.toml
@@ -50,6 +50,7 @@ multi_line_output = 3
module = [
"scrapy.*",
"url_matcher.*",
"pytest_twisted.*"
]
ignore_missing_imports = true

2 changes: 2 additions & 0 deletions tests/test_addon.py
@@ -7,6 +7,7 @@
from scrapy.utils.test import get_crawler

from duplicate_url_discarder import Addon
from duplicate_url_discarder.pipelines import DuplicateUrlDiscarderPipeline


def test_addon():
@@ -24,6 +25,7 @@ def test_addon():
crawler.settings["DUD_FALLBACK_REQUEST_FINGERPRINTER_CLASS"]
== _SCRAPY_DEFAULT_REQUEST_FINGEPRINTER_CLASS
)
assert crawler.settings["ITEM_PIPELINES"] == {DuplicateUrlDiscarderPipeline: 100}


def test_addon_fallback():
70 changes: 70 additions & 0 deletions tests/test_pipelines.py
@@ -0,0 +1,70 @@
from dataclasses import dataclass

from pytest_twisted import ensureDeferred
from scrapy import Spider
from scrapy.crawler import Crawler
from scrapy.utils.test import get_crawler


@ensureDeferred
async def test_duplicate_url_discarder_pipeline_no_addon(caplog) -> None:
caplog.set_level("INFO")

class FakeSpider(Spider):
name = "fake_spider"
start_urls = ["data:,"]

crawler: Crawler = get_crawler(FakeSpider, {})
await crawler.crawl()
    assert any(
        "Enabled item pipelines:\n[]" in record.message for record in caplog.records
    )


@ensureDeferred
async def test_duplicate_url_discarder_pipeline_with_addon(caplog) -> None:
caplog.set_level("INFO")

@dataclass
class FakeItem:
name: str
value: int

class FakeSpider(Spider):
name = "fake_spider"
start_urls = ["data:,"]

def parse(self, response):
yield FakeItem(name="AAA", value=0)
yield FakeItem(name="BBB", value=1)
yield FakeItem(name="AAA", value=2) # dropped

# These aren't declared in DUD_ATTRIBUTES_PER_ITEM, so they aren't dropped
yield {"name": "CCC", "value": 3}
yield {"name": "CCC", "value": 4}

settings = {
"DUD_ATTRIBUTES_PER_ITEM": {FakeItem: ["name"]},
"ADDONS": {
"duplicate_url_discarder.Addon": 600,
},
}
crawler: Crawler = get_crawler(FakeSpider, settings)
await crawler.crawl()

expected_text = "Enabled item pipelines:\n[<class 'duplicate_url_discarder.pipelines.DuplicateUrlDiscarderPipeline'>]"
messages = [record.message for record in caplog.records]
assert any(True for record in caplog.records if expected_text in messages)

assert crawler.stats.get_value("item_scraped_count") == 4
assert crawler.stats.get_value("item_dropped_count") == 1

dropped_messages = [
record.message
for record in caplog.records
if "Dropping item that was already seen before" in record.message
]
assert len(dropped_messages) == 1
assert "FakeItem(name='AAA', value=2)" in dropped_messages[0]
60 changes: 60 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,60 @@
from dataclasses import dataclass

import pytest
from itemadapter import ItemAdapter

from duplicate_url_discarder.utils import item_signature, load_keys_from_path


class Object1:
pass


class Object2:
pass


def test_load_keys_from_path() -> None:
assert load_keys_from_path({}) == {}

mapping = {"tests.test_utils.Object1": "object1", Object2: "object2"}
assert load_keys_from_path(mapping) == {Object1: "object1", Object2: "object2"}

with pytest.raises(
ValueError, match="Error loading object 'does-not-exist': not a full path"
):
load_keys_from_path({"does-not-exist": True})


def test_item_signature() -> None:
@dataclass
class FakeItem:
name: str

value = "fake_item"
item = FakeItem(value)
adapter = ItemAdapter(item)

assert item_signature(adapter, ["name"]) == hash("name:fake_item")

exception_text = (
"Got type <class 'tests.test_utils.test_item_signature.<locals>.FakeItem'> "
"but expected ItemAdapter."
)
with pytest.raises(ValueError, match=exception_text):
item_signature(item, ["name"]) # type: ignore


@pytest.mark.xfail(
reason="unsupported edge case due to setup of attribute name and values"
)
def test_item_signature_edge_case():
item_attributes = ["a", "b"]

adapter = ItemAdapter({"a": "a|b:b", "b": "b"})
sig_1 = item_signature(adapter, item_attributes)

adapter = ItemAdapter({"a": "a", "b": "b|b:b"})
sig_2 = item_signature(adapter, item_attributes)

assert sig_1 != sig_2
1 change: 1 addition & 0 deletions tox.ini
@@ -5,6 +5,7 @@ envlist = py,pre-commit,mypy,docs,twinecheck,rules
deps =
pytest
pytest-cov
pytest-twisted
commands =
py.test \
--cov-report=term --cov-report=html --cov-report= --cov-report=xml \
