Skip to content

Commit

Permalink
Merge pull request #18 from zytedata/normalize-processor
Browse files Browse the repository at this point in the history
Normalizer processor
  • Loading branch information
BurnzZ authored Jul 19, 2024
2 parents d261402 + c17c04a commit 260b145
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 8 deletions.
2 changes: 2 additions & 0 deletions duplicate_url_discarder/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

from ..rule import UrlRule
from .base import UrlProcessorBase
from .normalize import NormalizerProcessor
from .query_removal import QueryRemovalProcessor
from .query_removal_except import QueryRemovalExceptProcessor

# Maps the ``"processor"`` name used in rule files (e.g. ``"processor": "normalizer"``)
# to the class implementing it.
_PROCESSOR_CLASSES: Dict[str, Type[UrlProcessorBase]] = {
    "queryRemoval": QueryRemovalProcessor,
    "queryRemovalExcept": QueryRemovalExceptProcessor,
    "normalizer": NormalizerProcessor,
}


Expand Down
6 changes: 3 additions & 3 deletions duplicate_url_discarder/processors/base.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from abc import ABC, abstractmethod
from typing import Any, Tuple
from typing import Any, Optional, Tuple


class UrlProcessorBase(ABC):
def __init__(self, args: Tuple[Any, ...]):
self.args: Tuple[Any, ...] = args
def __init__(self, args: Optional[Tuple[Any, ...]] = None):
self.args: Tuple[Any, ...] = args or ()
self.validate_args()

def validate_args(self) -> None: # noqa: B027
Expand Down
23 changes: 23 additions & 0 deletions duplicate_url_discarder/processors/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import re

from scrapy.utils.url import parse_url

from .base import UrlProcessorBase


class NormalizerProcessor(UrlProcessorBase):
    """URL processor that canonicalizes superficial URL variations.

    It takes no configuration: passing any non-empty ``args`` is an error.
    """

    def validate_args(self) -> None:
        """Raise ``TypeError`` if any args were supplied."""
        if self.args:
            raise TypeError(f"normalizeUrl doesn't accept args, got: {self.args}")

    def process(self, input_url: str) -> str:
        """Normalizes the input URL by removing the following:
        * 'www.' prefixes including ones with numerical characters, e.g. 'www2.'
        * trailing slashes
        """
        parsed = parse_url(input_url)
        bare_netloc = re.sub(r"^www\d*\.", "", parsed.netloc)
        bare_path = parsed.path.rstrip("/")
        normalized = parsed._replace(netloc=bare_netloc, path=bare_path)
        return normalized.geturl()
42 changes: 42 additions & 0 deletions tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from url_matcher import Patterns

from duplicate_url_discarder.processors import (
NormalizerProcessor,
QueryRemovalExceptProcessor,
QueryRemovalProcessor,
get_processor,
Expand Down Expand Up @@ -95,3 +96,44 @@ def test_query_removal_except_validate_args():
QueryRemovalExceptProcessor(("a", None, ""))
QueryRemovalExceptProcessor(("",))
QueryRemovalExceptProcessor(())


@pytest.mark.parametrize(
    ["input_url", "expected"],
    [
        # No www-prefix: URL is left untouched.
        ("https://example.com", "https://example.com"),
        ("https://example.com?arg=1#frag", "https://example.com?arg=1#frag"),
        # Plain and numbered www-prefixes are stripped from the netloc.
        ("https://www.example.com", "https://example.com"),
        ("https://www.example.com?arg=1#frag", "https://example.com?arg=1#frag"),
        ("https://www2.example.com", "https://example.com"),
        # Only the leading www-part goes; other subdomains survive.
        ("https://www.us.example.com?arg=1#frag", "https://us.example.com?arg=1#frag"),
        ("https://www2.uk.example.com", "https://uk.example.com"),
    ],
)
def test_normalize_processor_www_prefixes(input_url, expected):
    processor = NormalizerProcessor(None)
    assert processor.process(input_url) == expected


@pytest.mark.parametrize(
    ["input_url", "expected"],
    [
        # No trailing slash: URL is left untouched.
        ("https://example.com", "https://example.com"),
        # One or more trailing slashes on the path are removed.
        ("https://example.com/", "https://example.com"),
        ("https://example.com//", "https://example.com"),
        # Query and fragment are preserved while the slashes go.
        ("https://example.com?arg=1#frag", "https://example.com?arg=1#frag"),
        ("https://example.com/?arg=1#frag", "https://example.com?arg=1#frag"),
        ("https://example.com//?arg=1#frag", "https://example.com?arg=1#frag"),
        ("https://us.example.com/?arg=1#frag", "https://us.example.com?arg=1#frag"),
        ("https://us.example.com//?arg=1#frag", "https://us.example.com?arg=1#frag"),
    ],
)
def test_normalize_processor_trailing_slashes(input_url, expected):
    processor = NormalizerProcessor(None)
    assert processor.process(input_url) == expected


def test_normalize_processor_validate_args():
    # Any non-empty args tuple is rejected at construction time.
    with pytest.raises(TypeError, match="normalizeUrl doesn't accept args, got: "):
        NormalizerProcessor(("",))

    # Both an empty tuple and None construct without error.
    NormalizerProcessor(())
    NormalizerProcessor(None)
20 changes: 15 additions & 5 deletions tests/test_url_canonicalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,28 @@ def test_url_canonicalizer_load(tmp_path):
"processor": "queryRemoval",
"urlPattern": {"include": []},
},
{
"args": [],
"order": 2,
"processor": "normalizer",
"urlPattern": {"include": ["bar.example"]},
},
]
)
)
url_canonicalizer = UrlCanonicalizer([str(empty_path), rules_path])
assert len(url_canonicalizer.processors) == 2
assert len(url_canonicalizer.processors) == 3
assert (
url_canonicalizer.process_url("http://foo.example/?foo=1&bbn=1&PHPSESSIONID=1")
== "http://foo.example/?foo=1&bbn=1"
url_canonicalizer.process_url(
"http://www.foo.example/?foo=1&bbn=1&PHPSESSIONID=1"
)
== "http://www.foo.example/?foo=1&bbn=1"
)
assert (
url_canonicalizer.process_url("http://bar.example/?foo=1&bbn=1&PHPSESSIONID=1")
== "http://bar.example/?foo=1&PHPSESSIONID=1"
url_canonicalizer.process_url(
"http://www2.bar.example/?foo=1&bbn=1&PHPSESSIONID=1"
)
== "http://bar.example?foo=1&PHPSESSIONID=1"
)


Expand Down

0 comments on commit 260b145

Please sign in to comment.