Skip to content

Commit

Permalink
Merge pull request #18 from zytedata/normalize-processor
Browse files Browse the repository at this point in the history
Normalizer processor
  • Loading branch information
BurnzZ authored Jul 19, 2024
2 parents d261402 + c17c04a commit 260b145
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 8 deletions.
2 changes: 2 additions & 0 deletions duplicate_url_discarder/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

from ..rule import UrlRule
from .base import UrlProcessorBase
from .normalize import NormalizerProcessor
from .query_removal import QueryRemovalProcessor
from .query_removal_except import QueryRemovalExceptProcessor

# Maps the ``"processor"`` name used in rule files (e.g. ``"processor": "normalizer"``)
# to the class implementing it.
_PROCESSOR_CLASSES: Dict[str, Type[UrlProcessorBase]] = {
    "queryRemoval": QueryRemovalProcessor,
    "queryRemovalExcept": QueryRemovalExceptProcessor,
    "normalizer": NormalizerProcessor,
}


Expand Down
6 changes: 3 additions & 3 deletions duplicate_url_discarder/processors/base.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from abc import ABC, abstractmethod
from typing import Any, Tuple
from typing import Any, Optional, Tuple


class UrlProcessorBase(ABC):
def __init__(self, args: Tuple[Any, ...]):
self.args: Tuple[Any, ...] = args
def __init__(self, args: Optional[Tuple[Any, ...]] = None):
self.args: Tuple[Any, ...] = args or ()
self.validate_args()

def validate_args(self) -> None: # noqa: B027
Expand Down
23 changes: 23 additions & 0 deletions duplicate_url_discarder/processors/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import re

from scrapy.utils.url import parse_url

from .base import UrlProcessorBase


class NormalizerProcessor(UrlProcessorBase):
    """URL processor that canonicalizes superficial URL variations.

    It takes no configuration: passing any non-empty ``args`` is an error.
    """

    def validate_args(self) -> None:
        """Raise ``TypeError`` if any args were supplied."""
        if self.args:
            raise TypeError(f"normalizeUrl doesn't accept args, got: {self.args}")

    def process(self, input_url: str) -> str:
        """Normalizes the input URL by removing the following:
        * 'www.' prefixes including ones with numerical characters, e.g. 'www2.'
        * trailing slashes
        """
        parsed = parse_url(input_url)
        bare_netloc = re.sub(r"^www\d*\.", "", parsed.netloc)
        bare_path = parsed.path.rstrip("/")
        normalized = parsed._replace(netloc=bare_netloc, path=bare_path)
        return normalized.geturl()
42 changes: 42 additions & 0 deletions tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from url_matcher import Patterns

from duplicate_url_discarder.processors import (
NormalizerProcessor,
QueryRemovalExceptProcessor,
QueryRemovalProcessor,
get_processor,
Expand Down Expand Up @@ -95,3 +96,44 @@ def test_query_removal_except_validate_args():
QueryRemovalExceptProcessor(("a", None, ""))
QueryRemovalExceptProcessor(("",))
QueryRemovalExceptProcessor(())


@pytest.mark.parametrize(
    ["input_url", "expected"],
    [
        # No www-prefix: URL is left untouched.
        ("https://example.com", "https://example.com"),
        ("https://example.com?arg=1#frag", "https://example.com?arg=1#frag"),
        # Plain and numbered www-prefixes are stripped from the netloc.
        ("https://www.example.com", "https://example.com"),
        ("https://www.example.com?arg=1#frag", "https://example.com?arg=1#frag"),
        ("https://www2.example.com", "https://example.com"),
        # Only the leading www-part goes; other subdomains survive.
        ("https://www.us.example.com?arg=1#frag", "https://us.example.com?arg=1#frag"),
        ("https://www2.uk.example.com", "https://uk.example.com"),
    ],
)
def test_normalize_processor_www_prefixes(input_url, expected):
    processor = NormalizerProcessor(None)
    assert processor.process(input_url) == expected


@pytest.mark.parametrize(
    ["input_url", "expected"],
    [
        # No trailing slash: URL is left untouched.
        ("https://example.com", "https://example.com"),
        # One or more trailing slashes on the path are removed.
        ("https://example.com/", "https://example.com"),
        ("https://example.com//", "https://example.com"),
        # Query and fragment are preserved while the slashes go.
        ("https://example.com?arg=1#frag", "https://example.com?arg=1#frag"),
        ("https://example.com/?arg=1#frag", "https://example.com?arg=1#frag"),
        ("https://example.com//?arg=1#frag", "https://example.com?arg=1#frag"),
        ("https://us.example.com/?arg=1#frag", "https://us.example.com?arg=1#frag"),
        ("https://us.example.com//?arg=1#frag", "https://us.example.com?arg=1#frag"),
    ],
)
def test_normalize_processor_trailing_slashes(input_url, expected):
    processor = NormalizerProcessor(None)
    assert processor.process(input_url) == expected


def test_normalize_processor_validate_args():
    # Any non-empty args tuple is rejected at construction time.
    with pytest.raises(TypeError, match="normalizeUrl doesn't accept args, got: "):
        NormalizerProcessor(("",))

    # Both an empty tuple and None construct without error.
    NormalizerProcessor(())
    NormalizerProcessor(None)
20 changes: 15 additions & 5 deletions tests/test_url_canonicalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,28 @@ def test_url_canonicalizer_load(tmp_path):
"processor": "queryRemoval",
"urlPattern": {"include": []},
},
{
"args": [],
"order": 2,
"processor": "normalizer",
"urlPattern": {"include": ["bar.example"]},
},
]
)
)
url_canonicalizer = UrlCanonicalizer([str(empty_path), rules_path])
assert len(url_canonicalizer.processors) == 2
assert len(url_canonicalizer.processors) == 3
assert (
url_canonicalizer.process_url("http://foo.example/?foo=1&bbn=1&PHPSESSIONID=1")
== "http://foo.example/?foo=1&bbn=1"
url_canonicalizer.process_url(
"http://www.foo.example/?foo=1&bbn=1&PHPSESSIONID=1"
)
== "http://www.foo.example/?foo=1&bbn=1"
)
assert (
url_canonicalizer.process_url("http://bar.example/?foo=1&bbn=1&PHPSESSIONID=1")
== "http://bar.example/?foo=1&PHPSESSIONID=1"
url_canonicalizer.process_url(
"http://www2.bar.example/?foo=1&bbn=1&PHPSESSIONID=1"
)
== "http://bar.example?foo=1&PHPSESSIONID=1"
)


Expand Down

0 comments on commit 260b145

Please sign in to comment.