-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #17 from zytedata/pipeline
create DuplicateUrlDiscarderPipeline
- Loading branch information
Showing
10 changed files
with
267 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from typing import Any, Dict, Set, Tuple, TypeVar | ||
|
||
from itemadapter import ItemAdapter | ||
from scrapy import Spider | ||
from scrapy.crawler import Crawler | ||
from scrapy.exceptions import DropItem, NotConfigured | ||
|
||
from .utils import item_signature, load_keys_from_path | ||
|
||
T = TypeVar("T") | ||
|
||
|
||
class DuplicateUrlDiscarderPipeline:
    """Item pipeline that drops items already seen earlier in the same crawl.

    An item's identity is a signature computed from a configured subset of its
    attributes (see the ``DUD_ATTRIBUTES_PER_ITEM`` setting). Item classes not
    listed in that setting pass through untouched. The pipeline is enabled
    only when the setting is non-empty.
    """

    def __init__(self, crawler: Crawler):
        #: A mapping of item class to a list of attribute names which will be
        #: used to compute the item signature.
        self._attributes_per_item: Dict[Any, Tuple[str, ...]] = load_keys_from_path(
            crawler.settings.getdict("DUD_ATTRIBUTES_PER_ITEM", {})
        )

        #: Record of all the item signatures that were previously processed
        #: and seen. NOTE(review): grows unboundedly over the crawl's lifetime.
        self._seen_item_signatures: Set[int] = set()

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the pipeline from the crawler settings.

        Raises NotConfigured when ``DUD_ATTRIBUTES_PER_ITEM`` is empty or
        missing, which tells Scrapy to skip this pipeline entirely.
        """
        if not crawler.settings.getdict("DUD_ATTRIBUTES_PER_ITEM"):
            raise NotConfigured(
                "Set DUD_ATTRIBUTES_PER_ITEM to enable DuplicateUrlDiscarderPipeline"
            )
        return cls(crawler)

    def process_item(self, item: T, spider: Spider) -> T:
        """Return *item* unchanged, or raise DropItem if an item of the same
        class with the same signature was processed before."""
        attributes = self._attributes_per_item.get(type(item))
        if not attributes:
            # This item class is not configured for deduplication.
            return item

        signature = item_signature(ItemAdapter(item), attributes)
        if signature in self._seen_item_signatures:
            raise DropItem(f"Dropping item that was already seen before:\n{item}")

        self._seen_item_signatures.add(signature)
        return item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from typing import Any, Dict, Iterable, TypeVar | ||
|
||
from itemadapter import ItemAdapter | ||
from scrapy.utils.misc import load_object | ||
|
||
T = TypeVar("T") | ||
|
||
|
||
def load_keys_from_path(mapping: Dict[Any, T]) -> Dict[Any, T]:
    """Given a mapping, convert the keys that represent import paths
    into their respective objects.

    Non-string keys are assumed to already be objects and are kept as-is;
    values are never touched.
    """
    return {
        (load_object(key) if isinstance(key, str) else key): value
        for key, value in mapping.items()
    }
|
||
|
||
def item_signature(item: ItemAdapter, item_attributes: Iterable[str]) -> int:
    """Return a signature hash for *item* built from *item_attributes*.

    Each attribute is rendered as ``"<name>:<value>"`` and the pieces are
    joined with ``"|"`` before hashing. Because the delimiters can also
    appear inside attribute values, distinct items can in rare cases
    collide (this limitation is covered by an xfail test).

    Raises ValueError when *item* does not support ``get`` — i.e. it was
    not wrapped in an ItemAdapter (or adapter-like mapping).
    """
    try:
        values = [f"{attrib}:{item.get(attrib)}" for attrib in item_attributes]
    except AttributeError as err:
        # Chain explicitly so the original failure stays visible (B904).
        raise ValueError(f"Got type {type(item)} but expected ItemAdapter.") from err
    return hash("|".join(values))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
from dataclasses import dataclass | ||
|
||
from pytest_twisted import ensureDeferred | ||
from scrapy import Spider | ||
from scrapy.crawler import Crawler | ||
from scrapy.utils.test import get_crawler | ||
|
||
|
||
@ensureDeferred
async def test_duplicate_url_discarder_pipeline_no_addon(caplog) -> None:
    """Without the addon configured, no item pipelines should be enabled."""
    caplog.set_level("INFO")

    class FakeSpider(Spider):
        name = "fake_spider"
        start_urls = ["data:,"]

    crawler: Crawler = get_crawler(FakeSpider, {})
    await crawler.crawl()

    assert any(
        "Enabled item pipelines:\n[]" in record.message for record in caplog.records
    )
|
||
|
||
@ensureDeferred
async def test_duplicate_url_discarder_pipeline_with_addon(caplog) -> None:
    """With the addon enabled, duplicate items of classes declared in
    DUD_ATTRIBUTES_PER_ITEM are dropped; undeclared item types pass through."""
    caplog.set_level("INFO")

    @dataclass
    class FakeItem:
        name: str
        value: int

    class FakeSpider(Spider):
        name = "fake_spider"
        start_urls = ["data:,"]

        def parse(self, response):
            yield FakeItem(name="AAA", value=0)
            yield FakeItem(name="BBB", value=1)
            yield FakeItem(name="AAA", value=2)  # dropped

            # These aren't declared in DUD_ATTRIBUTES_PER_ITEM, so they aren't dropped
            yield {"name": "CCC", "value": 3}
            yield {"name": "CCC", "value": 4}

    settings = {
        "DUD_ATTRIBUTES_PER_ITEM": {FakeItem: ["name"]},
        "ADDONS": {
            "duplicate_url_discarder.Addon": 600,
        },
    }
    crawler: Crawler = get_crawler(FakeSpider, settings)
    await crawler.crawl()

    expected_text = "Enabled item pipelines:\n[<class 'duplicate_url_discarder.pipelines.DuplicateUrlDiscarderPipeline'>]"
    messages = [record.message for record in caplog.records]
    # Fixed: the old assertion tested ``expected_text in messages`` (exact
    # list membership) inside a generator that never used its loop variable;
    # the intent is a substring check against each log message.
    assert any(expected_text in message for message in messages)

    assert crawler.stats.get_value("item_scraped_count") == 4
    assert crawler.stats.get_value("item_dropped_count") == 1

    dropped_messages = [
        record.message
        for record in caplog.records
        if "Dropping item that was already seen before" in record.message
    ]
    assert len(dropped_messages) == 1
    assert "FakeItem(name='AAA', value=2)" in dropped_messages[0]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from dataclasses import dataclass | ||
|
||
import pytest | ||
from itemadapter import ItemAdapter | ||
|
||
from duplicate_url_discarder.utils import item_signature, load_keys_from_path | ||
|
||
|
||
class Object1:
    """Empty marker class, used as a dotted-import-path target in tests."""
|
||
|
||
class Object2:
    """Empty marker class, used as an already-loaded (non-string) key in tests."""
|
||
|
||
def test_load_keys_from_path() -> None:
    """String keys are resolved via their dotted import path, object keys are
    kept untouched, and a malformed path propagates load_object's ValueError."""
    assert load_keys_from_path({}) == {}

    loaded = load_keys_from_path(
        {"tests.test_utils.Object1": "object1", Object2: "object2"}
    )
    assert loaded == {Object1: "object1", Object2: "object2"}

    with pytest.raises(
        ValueError, match="Error loading object 'does-not-exist': not a full path"
    ):
        load_keys_from_path({"does-not-exist": True})
|
||
|
||
def test_item_signature() -> None:
    """The signature is the hash of joined "name:value" pairs; passing the raw
    item instead of an ItemAdapter raises ValueError."""

    @dataclass
    class FakeItem:
        name: str

    item = FakeItem("fake_item")
    adapter = ItemAdapter(item)
    assert item_signature(adapter, ["name"]) == hash("name:fake_item")

    expected_message = (
        "Got type <class 'tests.test_utils.test_item_signature.<locals>.FakeItem'> "
        "but expected ItemAdapter."
    )
    with pytest.raises(ValueError, match=expected_message):
        item_signature(item, ["name"])  # type: ignore
|
||
|
||
@pytest.mark.xfail(
    reason="unsupported edge case due to setup of attribute name and values"
)
def test_item_signature_edge_case():
    """Two distinct items whose values embed the "|" and ":" delimiters can
    serialize to the same joined string, so their signatures collide."""
    attributes = ["a", "b"]

    sig_1 = item_signature(ItemAdapter({"a": "a|b:b", "b": "b"}), attributes)
    sig_2 = item_signature(ItemAdapter({"a": "a", "b": "b|b:b"}), attributes)

    assert sig_1 != sig_2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters