
Merge pull request #16 from zytedata/release-prep
Prepare for initial release with rules
BurnzZ authored Jul 8, 2024
2 parents e25c27f + 9b895a9 commit d261402
Showing 7 changed files with 83 additions and 10 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
@@ -5,6 +5,7 @@ name: tox

on:
  pull_request:
    branches: [ main ]
  push:
    branches: [ main ]

@@ -17,6 +18,7 @@ jobs:
      matrix:
        include:
          - python-version: "3.8"
            toxenv: rules
          - python-version: "3.9"
          - python-version: "3.10"
          - python-version: "3.11"
2 changes: 1 addition & 1 deletion CHANGES.rst
@@ -1,7 +1,7 @@
Changes
=======

0.1.0 (YYYY-MM-DD)
0.1.0 (2024-07-08)
------------------

* Initial version.
17 changes: 16 additions & 1 deletion README.rst
@@ -35,6 +35,16 @@ Installation
    pip install duplicate-url-discarder

Alternatively, you can also install the predefined rules from
`duplicate-url-discarder-rules`_ via:

.. code-block::

    pip install duplicate-url-discarder[rules]

If such rules are installed, they will automatically be used when the
``DUD_LOAD_RULE_PATHS`` setting is left empty (see `configuration`_).

Requires **Python 3.8+**.

Using
@@ -131,6 +141,8 @@ All non-universal rules (ones that have non-empty include pattern) that match
a request URL are applied according to their order field. If there are no
non-universal rules that match the URL, the universal ones are applied.
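
For illustration, a minimal, hypothetical rules file containing one rule of each
kind might look as follows (the field names mirror the rule format used in this
project's tests; ``example.com``, ``utm_source``, and the exact include-pattern
syntax are made-up placeholders, not prescribed values):

.. code-block:: python

    import json
    from pathlib import Path

    rules = [
        {
            # Non-universal rule: non-empty include pattern, so it applies only
            # to URLs matching example.com and takes precedence there.
            "args": ["utm_source"],
            "order": 1,
            "processor": "queryRemoval",
            "urlPattern": {"include": ["example.com"]},
        },
        {
            # Universal rule: empty include pattern, applied only to URLs that
            # no non-universal rule matches.
            "args": ["PHPSESSIONID"],
            "order": 2,
            "processor": "queryRemoval",
            "urlPattern": {"include": []},
        },
    ]
    Path("custom_rules1.json").write_text(json.dumps(rules, indent=2))

The resulting file can then be listed in ``DUD_LOAD_RULE_PATHS`` as shown in the
`configuration`_ section.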

.. _configuration:

Configuration
=============

@@ -145,6 +157,9 @@ Configuration
"/home/user/project/custom_rules1.json",
]
The default value of this setting is empty.
The default value of this setting is empty. However, if the
`duplicate-url-discarder-rules`_ package is installed and ``DUD_LOAD_RULE_PATHS``
is left empty, the rules in that package are used automatically.
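
As a minimal sketch (assuming the optional `duplicate-url-discarder-rules`_ package
is installed; it exposes ``RULE_PATHS``, which the fingerprinter imports as shown
further below), leaving the setting empty behaves the same as pointing it at the
packaged rules explicitly:

.. code-block:: python

    # settings.py -- explicit form, for illustration only; leaving
    # DUD_LOAD_RULE_PATHS unset has the same effect when the rules
    # package is installed.
    from duplicate_url_discarder_rules import RULE_PATHS

    DUD_LOAD_RULE_PATHS = RULE_PATHS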

.. _scrapy-zyte-api: https://github.com/scrapy-plugins/scrapy-zyte-api
.. _duplicate-url-discarder-rules: https://github.com/zytedata/duplicate-url-discarder-rules
19 changes: 16 additions & 3 deletions duplicate_url_discarder/_fingerprinter.py
@@ -2,7 +2,7 @@

import logging
import os
from typing import TYPE_CHECKING, List, Union
from typing import TYPE_CHECKING, Sequence, Union

from scrapy import Request
from scrapy.crawler import Crawler
@@ -20,15 +20,28 @@

logger = logging.getLogger(__name__)

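# Optional dependency: when duplicate-url-discarder-rules is installed, its
# packaged RULE_PATHS serve as a fallback for an empty DUD_LOAD_RULE_PATHS setting.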
try:
    from importlib.metadata import version

    from duplicate_url_discarder_rules import RULE_PATHS as default_rule_paths
except ImportError:
    default_rule_paths = None


class Fingerprinter:
    def __init__(self, crawler: Crawler):
        self.crawler: Crawler = crawler
        rule_paths: List[Union[str, os.PathLike]] = self.crawler.settings.getlist(
        rule_paths: Sequence[Union[str, os.PathLike]] = self.crawler.settings.getlist(
            "DUD_LOAD_RULE_PATHS"
        )
        if not rule_paths:
            logger.warning("DUD_LOAD_RULE_PATHS is not set or is empty.")
            msg = "DUD_LOAD_RULE_PATHS is not set or is empty."
            if default_rule_paths:
                rule_paths = default_rule_paths
                v = version("duplicate-url-discarder-rules")
                msg += f" Using RULE_PATHS from duplicate-url-discarder-rules=={v} instead."
            logger.warning(msg)

        self._fallback_request_fingerprinter: RequestFingerprinterProtocol = (
            create_instance(
                load_object(
7 changes: 5 additions & 2 deletions pyproject.toml
@@ -24,15 +24,18 @@ classifiers = [
]
requires-python = ">=3.8"
dependencies = [
"Scrapy >= 2.7.0",
"Scrapy >= 2.11.0",
"url-matcher >= 0.5.0",
"w3lib >= 1.22.0",
"w3lib >= 2.0.1",
]
dynamic = ["version"]

[project.urls]
Source = "https://github.com/zytedata/duplicate-url-discarder"

[project.optional-dependencies]
rules = ["duplicate-url-discarder-rules"]

[tool.setuptools.dynamic]
version = {attr = "duplicate_url_discarder.__version__"}

33 changes: 33 additions & 0 deletions tests/test_fingerprinter.py
@@ -95,3 +95,36 @@ def get_stat(stat: str) -> Any:
        )
    )
    assert get_stat("url_modified") == 3


try:
    from duplicate_url_discarder_rules import RULE_PATHS as default_rule_paths
except ImportError:
    default_rule_paths = None


def test_default_rules(tmp_path):
    fingerprinter = get_fingerprinter({})
    if default_rule_paths:
        assert len(fingerprinter.url_canonicalizer.processors) > 0
    else:
        assert len(fingerprinter.url_canonicalizer.processors) == 0

    # Regardless of the presence of the ``duplicate_url_discarder_rules`` package,
    # as long as the ``DUD_LOAD_RULE_PATHS`` setting is set, the rules it points to
    # will be used.

    rules_path = Path(tmp_path) / "single_rule.json"
    rules_path.write_text(
        json.dumps(
            [
                {
                    "args": ["PHPSESSIONID"],
                    "order": 1,
                    "processor": "queryRemoval",
                    "urlPattern": {"include": []},
                },
            ]
        )
    )
    fingerprinter = get_fingerprinter({"DUD_LOAD_RULE_PATHS": [str(rules_path)]})
    assert len(fingerprinter.url_canonicalizer.processors) == 1
13 changes: 10 additions & 3 deletions tox.ini
@@ -1,5 +1,5 @@
[tox]
envlist = py,pre-commit,mypy,docs,twinecheck
envlist = py,pre-commit,mypy,docs,twinecheck,rules

[testenv]
deps =
@@ -16,9 +16,15 @@ commands =
basepython = python3.8
deps =
    {[testenv]deps}
    Scrapy==2.7.0
    Scrapy==2.11.0
    url-matcher==0.5.0
    w3lib==1.22.0
    w3lib==2.0.1

[testenv:rules]
basepython = python3.8
deps =
    {[testenv:pinned]deps}
    duplicate-url-discarder-rules

[testenv:pre-commit]
deps =
@@ -27,6 +33,7 @@ commands = pre-commit run --all-files --show-diff-on-failure

[testenv:mypy]
deps =
    duplicate-url-discarder-rules
    mypy==1.9.0
    pytest
commands = mypy {posargs:duplicate_url_discarder tests}
