From d11eab3773832d635fdad8341031fc899812288b Mon Sep 17 00:00:00 2001 From: Nikolay Panov Date: Sun, 25 Nov 2018 11:42:11 -0800 Subject: [PATCH] Core refactoring and cleanup. --- .gitignore | 41 +--- .travis.yml | 15 +- MANIFEST.in | 1 - Makefile | 11 ++ README.md | 45 +++-- pyproject.toml | 25 +++ setup.cfg | 2 - setup.py | 29 --- tests/__init__.py | 0 tests/test_deconstruct_url.py | 32 +++ tests/test_generic_url_cleanup.py | 20 ++ tests/test_normalize_fragment.py | 20 ++ tests/test_normalize_host.py | 19 ++ tests/test_normalize_path.py | 39 ++++ tests/test_normalize_port.py | 13 ++ tests/test_normalize_query.py | 21 ++ tests/test_normalize_scheme.py | 13 ++ tests/test_normalize_userinfo.py | 19 ++ tests/test_provide_url_scheme.py | 20 ++ tests/test_reconstruct_url.py | 38 ++++ tests/test_url_normalize.py | 194 +++++++------------ tox.ini | 36 ++-- url_normalize/__init__.py | 48 ++++- url_normalize/tools.py | 100 ++++++++++ url_normalize/url_normalize.py | 311 ++++++++++++++++++------------ 25 files changed, 752 insertions(+), 360 deletions(-) delete mode 100644 MANIFEST.in create mode 100644 Makefile create mode 100644 pyproject.toml delete mode 100644 setup.cfg delete mode 100644 setup.py create mode 100644 tests/__init__.py create mode 100644 tests/test_deconstruct_url.py create mode 100644 tests/test_generic_url_cleanup.py create mode 100644 tests/test_normalize_fragment.py create mode 100644 tests/test_normalize_host.py create mode 100644 tests/test_normalize_path.py create mode 100644 tests/test_normalize_port.py create mode 100644 tests/test_normalize_query.py create mode 100644 tests/test_normalize_scheme.py create mode 100644 tests/test_normalize_userinfo.py create mode 100644 tests/test_provide_url_scheme.py create mode 100644 tests/test_reconstruct_url.py mode change 100755 => 100644 tests/test_url_normalize.py create mode 100644 url_normalize/tools.py mode change 100755 => 100644 url_normalize/url_normalize.py diff --git a/.gitignore b/.gitignore 
index 8f3f281..8ea4eca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,38 +1,9 @@ -*.py[cod] - -# C extensions -*.so - -# Packages -*.egg -*.eggs -*.egg-info -dist -build -eggs -parts -bin -var -sdist -develop-eggs -.installed.cfg -lib -lib64 - -# Installer logs -pip-log.txt - -# Unit test / coverage reports .coverage +.*cache .tox -nosetests.xml - -# Translations -*.mo - -# Mr Developer -.mr.developer.cfg -.project -.pydevproject .vscode -.cache +dist +*.lock +__pycache__ +*.pyc +*.egg-info diff --git a/.travis.yml b/.travis.yml index a2e4c62..ae0f0ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,11 @@ language: python +sudo: required +dist: xenial python: - - "3.6" + - "2.7" + - "3.7" install: - - "pip install coverage" - - "pip install coveralls" -script: - - "coverage run --source=url_normalize setup.py test" -after_success: - coveralls + - "pip install coveralls poetry" + - "poetry install -v" +script: "pytest" +after_success: coveralls diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index bb3ec5f..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include README.md diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f20d6d8 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +tox: + @tox + +test: + @py.test + +build: + @poetry build + +publish: + @poetry publish diff --git a/README.md b/README.md index 25d6a40..1699c1c 100644 --- a/README.md +++ b/README.md @@ -5,21 +5,23 @@ url-normalize [![Coverage Status](https://coveralls.io/repos/github/niksite/url-normalize/badge.svg?branch=master)](https://coveralls.io/github/niksite/url-normalize?branch=master) URI Normalization function: - * Take care of IDN domains. - * Always provide the URI scheme in lowercase characters. - * Always provide the host, if any, in lowercase characters. - * Only perform percent-encoding where it is essential. - * Always use uppercase A-through-F characters when percent-encoding. - * Prevent dot-segments appearing in non-relative URI paths. 
- * For schemes that define a default authority, use an empty authority if the default is desired. - * For schemes that define an empty path to be equivalent to a path of "/", use "/". - * For schemes that define a port, use an empty port if the default is desired - * All portions of the URI must be utf-8 encoded NFC from Unicode strings + +* Take care of IDN domains. +* Always provide the URI scheme in lowercase characters. +* Always provide the host, if any, in lowercase characters. +* Only perform percent-encoding where it is essential. +* Always use uppercase A-through-F characters when percent-encoding. +* Prevent dot-segments appearing in non-relative URI paths. +* For schemes that define a default authority, use an empty authority if the default is desired. +* For schemes that define an empty path to be equivalent to a path of "/", use "/". +* For schemes that define a port, use an empty port if the default is desired +* All portions of the URI must be utf-8 encoded NFC from Unicode strings Inspired by Sam Ruby's urlnorm.py: http://intertwingly.net/blog/2004/08/04/Urlnorm Example: -``` + +```sh $ pip install url-normalize Collecting url-normalize ... @@ -30,17 +32,20 @@ Python 3.6.1 (default, Jul 8 2017, 05:00:20) Type "help", "copyright", "credits" or "license" for more information. > from url_normalize import url_normalize > print(url_normalize('www.foo.com:80/foo')) -> http://www.foo.com/foo +> https://www.foo.com/foo ``` History: - * 07 Jul 2017: Python 2/3 compatibility. - * 05 Jan 2016: Python 3 compatibility - * 29 Dec 2015: PEP8, setup.py - * 10 Mar 2010: support for shebang (#!) urls - * 28 Feb 2010: using 'http' schema by default when appropriate - * 28 Feb 2010: added handling of IDN domains - * 28 Feb 2010: code pep8-zation - * 27 Feb 2010: forked from Sam Ruby's urlnorm.py + +* 1.4.0: A bit of code refactoring and cleanup +* 1.3.2: Support empty string and double slash urls (//domain.tld) +* 1.3.1: Same code support both Python 3 and Python 2. 
+* 1.3: Python 3 compatibility +* 1.2: PEP8, setup.py +* 1.1.2: support for shebang (#!) urls +* 1.1.1: using 'http' schema by default when appropriate +* 1.1: added handling of IDN domains +* 1.0: code pep8-zation +* 0.1: forked from Sam Ruby's urlnorm.py License: "Python" (PSF) License diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8ed8581 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[tool.poetry] +name = "url-normalize" +version = "1.4.0" +description = "URL normalization for Python" +authors = ["Nikolay Panov "] +license = "PSF" +readme = "README.md" +repository = "https://github.com/niksite/url-normalize" +homepage = "https://github.com/niksite/url-normalize" +keywords = ['url', 'normalization', 'normalize'] + +[tool.poetry.dependencies] +python = "~2.7 || ^3.6" +six = "^1.11" + +[tool.poetry.dev-dependencies] +pytest = "^3.0" +pytest-cov = "^2.6" +tox = "^3.5" +pytest-flakes = "^4.0" +pytest-socket = "^0.3.1" + +[build-system] +requires = ["poetry>=0.12"] +build-backend = "poetry.masonry.api" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index b7e4789..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[aliases] -test=pytest diff --git a/setup.py b/setup.py deleted file mode 100644 index cafd0a7..0000000 --- a/setup.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function -from setuptools import setup - -setup( - name="url-normalize", - version="1.3.3", - author="Nikolay Panov", - author_email="github@niksite.ru", - description="URL normalization for Python", - long_description=open("README.md").read(), - license="Python", - url="https://github.com/niksite/url-normalize", - packages=['url_normalize'], - classifiers=[ - "Environment :: Web Environment", - "Intended Audience :: Developers", - "Operating System :: OS Independent", - "Topic :: Text Processing :: Indexing", - "Topic :: Utilities", - "Topic :: Internet", - "Topic :: Software Development :: Libraries 
:: Python Modules", - "Programming Language :: Python", - "Programming Language :: Python :: 3" - ], - install_requires=['future'], - setup_requires=['pytest-runner'], - tests_require=['pytest'], -) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_deconstruct_url.py b/tests/test_deconstruct_url.py new file mode 100644 index 0000000..94f008b --- /dev/null +++ b/tests/test_deconstruct_url.py @@ -0,0 +1,32 @@ +"""Deconstruct url tests.""" +from url_normalize.tools import deconstruct_url, URL + +EXPECTED_DATA = { + "http://site.com": URL( + fragment="", + host="site.com", + path="", + port="", + query="", + scheme="http", + userinfo="", + ), + "http://user@www.example.com:8080/path/index.html?param=val#fragment": URL( + fragment="fragment", + host="www.example.com", + path="/path/index.html", + port="8080", + query="param=val", + scheme="http", + userinfo="user@", + ), +} + + +def test_deconstruct_url_result_is_expected(): + """Assert we got expected results from the deconstruct_url function.""" + for url, expected in EXPECTED_DATA.items(): + + result = deconstruct_url(url) + + assert result == expected, url diff --git a/tests/test_generic_url_cleanup.py b/tests/test_generic_url_cleanup.py new file mode 100644 index 0000000..a6ef393 --- /dev/null +++ b/tests/test_generic_url_cleanup.py @@ -0,0 +1,20 @@ +"""Tests for generic_url_cleanup function.""" +from url_normalize.url_normalize import generic_url_cleanup + +EXPECTED_DATA = { + "//site/#!fragment": "//site/?_escaped_fragment_=fragment", + "//site/?utm_source=some source¶m=value": "//site/?param=value", + "//site/?utm_source=some source": "//site/", + "//site/?param=value&utm_source=some source": "//site/?param=value", + "//site/page": "//site/page", + "//site/?& ": "//site/", +} + + +def test_generic_url_cleanup_result_is_expected(): + """Assert we got expected results from the generic_url_cleanup function.""" + for url, expected in 
EXPECTED_DATA.items(): + + result = generic_url_cleanup(url) + + assert result == expected, url diff --git a/tests/test_normalize_fragment.py b/tests/test_normalize_fragment.py new file mode 100644 index 0000000..9607c5b --- /dev/null +++ b/tests/test_normalize_fragment.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +"""Tests for normalize_fragment function.""" +from url_normalize.url_normalize import normalize_fragment + +EXPECTED_DATA = { + "": "", + "fragment": "fragment", + "пример": "%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80", + "!fragment": "%21fragment", + "~fragment": "~fragment", +} + + +def test_normalize_fragment_result_is_expected(): + """Assert we got expected results from the normalize_fragment function.""" + for url, expected in EXPECTED_DATA.items(): + + result = normalize_fragment(url) + + assert result == expected, url diff --git a/tests/test_normalize_host.py b/tests/test_normalize_host.py new file mode 100644 index 0000000..09a8756 --- /dev/null +++ b/tests/test_normalize_host.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +"""Tests for normalize_host function.""" +from url_normalize.url_normalize import normalize_host + +EXPECTED_DATA = { + "site.com": "site.com", + "SITE.COM": "site.com", + "site.com.": "site.com", + "пример.испытание": "xn--e1afmkfd.xn--80akhbyknj4f", +} + + +def test_normalize_host_result_is_expected(): + """Assert we got expected results from the normalize_host function.""" + for url, expected in EXPECTED_DATA.items(): + + result = normalize_host(url) + + assert result == expected, url diff --git a/tests/test_normalize_path.py b/tests/test_normalize_path.py new file mode 100644 index 0000000..b2b72a5 --- /dev/null +++ b/tests/test_normalize_path.py @@ -0,0 +1,39 @@ +"""Tests for normalize_path function.""" +from url_normalize.url_normalize import normalize_path + +EXPECTED_DATA = { + "": "/", + "/": "/", + "..": "/", + "/foo/bar/.": "/foo/bar/", + "/foo/bar/./": "/foo/bar/", + "/foo/bar/..": "/foo/", + "/foo/bar/../": "/foo/", + 
"/foo/bar/../baz": "/foo/baz", + "/foo/bar/../..": "/", + "/foo/bar/../../": "/", + "/foo/bar/../../baz": "/baz", + "/foo/bar/../../../baz": "/baz", + "/foo/bar/../../../../baz": "/baz", + "/./foo": "/foo", + "/../foo": "/foo", + "/foo.": "/foo.", + "/.foo": "/.foo", + "/foo..": "/foo..", + "/..foo": "/..foo", + "/./../foo": "/foo", + "/./foo/.": "/foo/", + "/foo/./bar": "/foo/bar", + "/foo/../bar": "/bar", + "/foo//": "/foo/", + "/foo///bar//": "/foo/bar/", +} + + +def test_normalize_host_result_is_expected(): + """Assert we got expected results from the normalize_path function.""" + for url, expected in EXPECTED_DATA.items(): + + result = normalize_path(url, "http") + + assert result == expected, url diff --git a/tests/test_normalize_port.py b/tests/test_normalize_port.py new file mode 100644 index 0000000..78eeb8e --- /dev/null +++ b/tests/test_normalize_port.py @@ -0,0 +1,13 @@ +"""Tests for normalize_port function.""" +from url_normalize.url_normalize import normalize_port + +EXPECTED_DATA = {"8080": "8080", "": "", "80": "", "string": "string"} + + +def test_normalize_port_result_is_expected(): + """Assert we got expected results from the normalize_port function.""" + for url, expected in EXPECTED_DATA.items(): + + result = normalize_port(url, "http") + + assert result == expected, url diff --git a/tests/test_normalize_query.py b/tests/test_normalize_query.py new file mode 100644 index 0000000..f963ef0 --- /dev/null +++ b/tests/test_normalize_query.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +"""Tests for normalize_query function.""" + +from url_normalize.url_normalize import normalize_query + +EXPECTED_DATA = { + "": "", + "param1=val1¶m2=val2": "param1=val1¶m2=val2", + "Ç=Ç": "%C3%87=%C3%87", + "%C3%87=%C3%87": "%C3%87=%C3%87", + "q=C%CC%A7": "q=%C3%87", +} + + +def test_normalize_query_result_is_expected(): + """Assert we got expected results from the normalize_query function.""" + for url, expected in EXPECTED_DATA.items(): + + result = 
normalize_query(url) + + assert result == expected, url diff --git a/tests/test_normalize_scheme.py b/tests/test_normalize_scheme.py new file mode 100644 index 0000000..b615e3a --- /dev/null +++ b/tests/test_normalize_scheme.py @@ -0,0 +1,13 @@ +"""Tests for normalize_scheme function.""" +from url_normalize.url_normalize import normalize_scheme + +EXPECTED_DATA = {"http": "http", "HTTP": "http"} + + +def test_normalize_scheme_result_is_expected(): + """Assert we got expected results from the normalize_scheme function.""" + for url, expected in EXPECTED_DATA.items(): + + result = normalize_scheme(url) + + assert result == expected, url diff --git a/tests/test_normalize_userinfo.py b/tests/test_normalize_userinfo.py new file mode 100644 index 0000000..05e9b28 --- /dev/null +++ b/tests/test_normalize_userinfo.py @@ -0,0 +1,19 @@ +"""Tests for normalize_userinfo function.""" +from url_normalize.url_normalize import normalize_userinfo + +EXPECTED_DATA = { + ":@": "", + "": "", + "@": "", + "user:password@": "user:password@", + "user@": "user@", +} + + +def test_normalize_userinfo_result_is_expected(): + """Assert we got expected results from the normalize_userinfo function.""" + for url, expected in EXPECTED_DATA.items(): + + result = normalize_userinfo(url) + + assert result == expected, url diff --git a/tests/test_provide_url_scheme.py b/tests/test_provide_url_scheme.py new file mode 100644 index 0000000..abde7de --- /dev/null +++ b/tests/test_provide_url_scheme.py @@ -0,0 +1,20 @@ +"""Tests for provide_url_scheme function.""" +from url_normalize.url_normalize import provide_url_scheme + +EXPECTED_DATA = { + "": "", + "-": "-", + "/file/path": "/file/path", + "//site/path": "https://site/path", + "ftp://site/": "ftp://site/", + "site/page": "https://site/page", +} + + +def test_provide_url_scheme_result_is_expected(): + """Assert we got expected results from the provide_url_scheme function.""" + for url, expected in EXPECTED_DATA.items(): + + result = 
provide_url_scheme(url) + + assert result == expected, url diff --git a/tests/test_reconstruct_url.py b/tests/test_reconstruct_url.py new file mode 100644 index 0000000..bfaa0fc --- /dev/null +++ b/tests/test_reconstruct_url.py @@ -0,0 +1,38 @@ +"""Reconstruct url tests.""" +from url_normalize.tools import reconstruct_url, URL + +EXPECTED_DATA = ( + ( + URL( + fragment="", + host="site.com", + path="", + port="", + query="", + scheme="http", + userinfo="", + ), + "http://site.com", + ), + ( + URL( + fragment="fragment", + host="www.example.com", + path="/path/index.html", + port="8080", + query="param=val", + scheme="http", + userinfo="user@", + ), + "http://user@www.example.com:8080/path/index.html?param=val#fragment", + ), +) + + +def test_deconstruct_url_result_is_expected(): + """Assert we got expected results from the deconstruct_url function.""" + for url, expected in EXPECTED_DATA: + + result = reconstruct_url(url) + + assert result == expected, url diff --git a/tests/test_url_normalize.py b/tests/test_url_normalize.py old mode 100755 new mode 100644 index a9f2016..a98fb99 --- a/tests/test_url_normalize.py +++ b/tests/test_url_normalize.py @@ -1,126 +1,82 @@ # -*- coding: utf-8 -*- -"""URI normalizator tests.""" -from __future__ import unicode_literals - +"""Integrations tests.""" from url_normalize import url_normalize + EXPECTED_RESULTS = { - '': - '', # empty string - '/foo/bar/.': - '/foo/bar/', - '/foo/bar/./': - '/foo/bar/', - '/foo/bar/..': - '/foo/', - '/foo/bar/../': - '/foo/', - '/foo/bar/../baz': - '/foo/baz', - '/foo/bar/../..': - '/', - '/foo/bar/../../': - '/', - '/foo/bar/../../baz': - '/baz', - '/foo/bar/../../../baz': - '/baz', # was: '/../baz', - '/foo/bar/../../../../baz': - '/baz', - '/./foo': - '/foo', - '/../foo': - '/foo', # was: '/../foo', - '/foo.': - '/foo.', - '/.foo': - '/.foo', - '/foo..': - '/foo..', - '/..foo': - '/..foo', - '/./../foo': - '/foo', # was: '/../foo', - '/./foo/.': - '/foo/', - '/foo/./bar': - '/foo/bar', - 
'/foo/../bar': - '/bar', - '/foo//': - '/foo/', - '/foo///bar//': - '/foo/bar/', - '//www.foo.com/': - 'https://www.foo.com/', - 'http://www.foo.com:80/foo': - 'http://www.foo.com/foo', - 'http://www.foo.com:8000/foo': - 'http://www.foo.com:8000/foo', - 'http://www.foo.com./foo/bar.html': - 'http://www.foo.com/foo/bar.html', - 'http://www.foo.com.:81/foo': - 'http://www.foo.com:81/foo', - 'http://www.foo.com/%7ebar': - 'http://www.foo.com/~bar', - 'http://www.foo.com/%7Ebar': - 'http://www.foo.com/~bar', - 'ftp://user:pass@ftp.foo.net/foo/bar': - 'ftp://user:pass@ftp.foo.net/foo/bar', - 'http://USER:pass@www.Example.COM/foo/bar': - 'http://USER:pass@www.example.com/foo/bar', - 'http://www.example.com./': - 'http://www.example.com/', - '-': - '-', - 'пример.испытание/Служебная:Search/Test': - 'http://xn--e1afmkfd.xn--80akhbyknj4f/' - '%D0%A1%D0%BB%D1%83%D0%B6%D0%B5%D0%B1%' - 'D0%BD%D0%B0%D1%8F:Search/Test', - 'http://lifehacker.com/#!5753509/' - 'hello-world-this-is-the-new-lifehacker': - 'http://lifehacker.com/?_escaped_fragment' - '_=5753509/hello-world-this-is-the-new-lifehacker', + "/../foo": "/foo", # was: '/../foo', + "/./../foo": "/foo", # was: '/../foo', + "/./foo": "/foo", + "/./foo/.": "/foo/", + "//www.foo.com/": "https://www.foo.com/", + "/foo/../bar": "/bar", + "/foo/./bar": "/foo/bar", + "/foo//": "/foo/", + "/foo///bar//": "/foo/bar/", + "/foo/bar/..": "/foo/", + "/foo/bar/../..": "/", + "/foo/bar/../../../../baz": "/baz", + "/foo/bar/../../../baz": "/baz", # was: '/../baz', + "/foo/bar/../../": "/", + "/foo/bar/../../baz": "/baz", + "/foo/bar/../": "/foo/", + "/foo/bar/../baz": "/foo/baz", + "/foo/bar/.": "/foo/bar/", + "/foo/bar/./": "/foo/bar/", + "http://:@example.com/": "http://example.com/", + "http://@example.com/": "http://example.com/", + "http://127.0.0.1:80/": "http://127.0.0.1/", + "http://example.com:081/": "http://example.com:81/", + "http://example.com:80/": "http://example.com/", + "http://example.com": "http://example.com/", + 
"http://example.com/?b&a": "http://example.com/?a&b", + "http://example.com/?q=%5c": "http://example.com/?q=%5C", + "http://example.com/?q=%C7": "http://example.com/?q=%EF%BF%BD", + "http://example.com/?q=C%CC%A7": "http://example.com/?q=%C3%87", + "http://EXAMPLE.COM/": "http://example.com/", + "http://example.com/%7Ejane": "http://example.com/~jane", + "http://example.com/a/../a/b": "http://example.com/a/b", + "http://example.com/a/./b": "http://example.com/a/b", + "http://lifehacker.com/#!5753509/hello-world-this-is-the-new-lifehacker": "http://lifehacker.com/?_escaped_fragment_=5753509/hello-world-this-is-the-new-lifehacker", + "http://USER:pass@www.Example.COM/foo/bar": "http://USER:pass@www.example.com/foo/bar", + "http://www.example.com./": "http://www.example.com/", + "http://www.foo.com:80/foo": "http://www.foo.com/foo", + "http://www.foo.com.:81/foo": "http://www.foo.com:81/foo", + "http://www.foo.com./foo/bar.html": "http://www.foo.com/foo/bar.html", + "http://www.foo.com/%7Ebar": "http://www.foo.com/~bar", + "http://www.foo.com/%7ebar": "http://www.foo.com/~bar", + "пример.испытание/Служебная:Search/Test": "https://xn--e1afmkfd.xn--80akhbyknj4f/%D0%A1%D0%BB%D1%83%D0%B6%D0%B5%D0%B1%D0%BD%D0%B0%D1%8F:Search/Test", } -EXPECTED_CHANGES = [ - (False, "http://:@example.com/"), - (False, "http://@example.com/"), - (False, "http://example.com"), - (False, "HTTP://example.com/"), - (False, "http://EXAMPLE.COM/"), - (False, "http://example.com/%7Ejane"), - (False, "http://example.com/?q=%C7"), - (False, "http://example.com/?q=%5c"), - (False, "http://example.com/?q=C%CC%A7"), - (False, "http://example.com/a/../a/b"), - (False, "http://example.com/a/./b"), - (False, "http://example.com:80/"), - (True, "http://example.com/"), - (True, "http://example.com/?q=%C3%87"), - (True, "http://example.com/?q=%E2%85%A0"), - (True, "http://example.com/?q=%5C"), - (True, "http://example.com/~jane"), - (True, "http://example.com/a/b"), - (True, "http://example.com:8080/"), - 
(True, "http://user:password@example.com/"), +NO_CHANGES_EXPECTED = ( + "-", + "", + "/..foo", + "/.foo", + "/foo..", + "/foo.", + "ftp://user:pass@ftp.foo.net/foo/bar", + "http://127.0.0.1/", + "http://example.com:8080/", + "http://example.com/?a&b", + "http://example.com/?q=%5C", + "http://example.com/?q=%C3%87", + "http://example.com/?q=%E2%85%A0", + "http://example.com/", + "http://example.com/~jane", + "http://example.com/a/b", + "http://user:password@example.com/", + "http://www.foo.com:8000/foo", # from rfc2396bis - (True, "ftp://ftp.is.co.za/rfc/rfc1808.txt"), - (True, "http://www.ietf.org/rfc/rfc2396.txt"), - (True, "ldap://[2001:db8::7]/c=GB?objectClass?one"), - (True, "mailto:John.Doe@example.com"), - (True, "news:comp.infosystems.www.servers.unix"), - (True, "tel:+1-816-555-1212"), - (True, "telnet://192.0.2.16:80/"), - (True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"), - # other - (True, "http://127.0.0.1/"), - (False, "http://127.0.0.1:80/"), - (True, "http://www.w3.org/2000/01/rdf-schema#"), - (False, "http://example.com:081/"), - (True, "http://example.com/?a&b"), - (False, "http://example.com/?b&a"), -] + "ftp://ftp.is.co.za/rfc/rfc1808.txt", + "http://www.ietf.org/rfc/rfc2396.txt", + "ldap://[2001:db8::7]/c=GB?objectClass?one", + "mailto:John.Doe@example.com", + "news:comp.infosystems.www.servers.unix", + "tel:+1-816-555-1212", + "telnet://192.0.2.16:80/", + "urn:oasis:names:specification:docbook:dtd:xml:4.1.2", +) def test_url_normalize_changes(): @@ -128,11 +84,11 @@ def test_url_normalize_changes(): http://www.intertwingly.net/wiki/pie/PaceCanonicalIds """ - for (expected, value) in EXPECTED_CHANGES: - assert expected == (url_normalize(value) == value) + for value in NO_CHANGES_EXPECTED: + assert url_normalize(value) == value def test_url_normalize_results(): """Assert url_normalize return expected results.""" for value, expected in EXPECTED_RESULTS.items(): - assert expected == url_normalize(value) + assert expected == 
url_normalize(value), value diff --git a/tox.ini b/tox.ini index 7ff4fa1..c388cfb 100644 --- a/tox.ini +++ b/tox.ini @@ -1,21 +1,25 @@ [tox] -envlist=py27, py36 +skipsdist = True +envlist = py27, py37 [testenv] -deps= - pytest - coverage - pytest-cov -setenv= - PYTHONWARNINGS=all +whitelist_externals = poetry +skip_install = true +commands = + poetry install -v + poetry run pytest -[testenv:py27] -commands=pytest url_normalize +[pytest] +addopts = + --cov-fail-under=99 + --cov-report=term-missing:skip-covered + --cov=url_normalize + --disable-socket + --flakes + -v +python_files = tests.py test_*.py *_tests.py -[testenv:py36] -commands=pytest url_normalize - -[testenv:cov] -usedevelop=true -basepython=python3.6 -commands=pytest --cov=url_normalize --cov-report term +[flake8] +max-line-length = 80 +select = C,E,F,W,B,B950 +ignore = E501 diff --git a/url_normalize/__init__.py b/url_normalize/__init__.py index b4debc5..a59dfe8 100644 --- a/url_normalize/__init__.py +++ b/url_normalize/__init__.py @@ -1,9 +1,43 @@ -"""URI normalizator.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) -from future import standard_library +# -*- coding: utf-8 -*- +""" +URI normalizator. -standard_library.install_aliases() +URI Normalization function: + * Take care of IDN domains. + * Always provide the URI scheme in lowercase characters. + * Always provide the host, if any, in lowercase characters. + * Only perform percent-encoding where it is essential. + * Always use uppercase A-through-F characters when percent-encoding. + * Prevent dot-segments appearing in non-relative URI paths. + * For schemes that define a default authority, use an empty authority if the + default is desired. + * For schemes that define an empty path to be equivalent to a path of "/", + use "/". 
+ * For schemes that define a port, use an empty port if the default is desired + * All portions of the URI must be utf-8 encoded NFC from Unicode strings -# pylint: disable=C0413 -from .url_normalize import url_normalize # NOQA +Inspired by Sam Ruby's urlnorm.py: + http://intertwingly.net/blog/2004/08/04/Urlnorm +This fork author: Nikolay Panov () + +History: + * 1.4.0: A bit of code refactoring and cleanup + * 1.3.2: Support empty string and double slash urls (//domain.tld) + * 1.3.1: Same code support both Python 3 and Python 2. + * 1.3: Python 3 compatibility + * 1.2: PEP8, setup.py + * 1.1.2: support for shebang (#!) urls + * 1.1.1: using 'http' schema by default when appropriate + * 1.1: added handling of IDN domains + * 1.0: code pep8-zation + * 0.1: forked from Sam Ruby's urlnorm.py +""" + +from __future__ import absolute_import + +from .url_normalize import url_normalize + +__license__ = "Python" +__version__ = "1.4.0" + +__all__ = ["url_normalize"] diff --git a/url_normalize/tools.py b/url_normalize/tools.py new file mode 100644 index 0000000..4828e82 --- /dev/null +++ b/url_normalize/tools.py @@ -0,0 +1,100 @@ +"""Url normalize tools (py27/py37 compatible).""" +import re +import unicodedata +from collections import namedtuple + +import six +from six.moves.urllib.parse import quote as quote_orig +from six.moves.urllib.parse import unquote as unquote_orig +from six.moves.urllib.parse import urlsplit, urlunsplit + +URL = namedtuple( + "URL", ["scheme", "userinfo", "host", "port", "path", "query", "fragment"] +) + + +def deconstruct_url(url): + """Tranform the url into URL structure. 
+ + Params: + url : string : the URL + + Returns: + URL + + """ + scheme, auth, path, query, fragment = urlsplit(url.strip()) + (userinfo, host, port) = re.search("([^@]*@)?([^:]*):?(.*)", auth).groups() + return URL( + fragment=fragment, + host=host, + path=path, + port=port, + query=query, + scheme=scheme, + userinfo=userinfo or "", + ) + + +def reconstruct_url(url): + """Reconstruct string url from URL. + + Params: + url : URL object instance + + Returns: + string : reconstructed url string + + """ + auth = (url.userinfo or "") + url.host + if url.port: + auth += ":" + url.port + return urlunsplit((url.scheme, auth, url.path, url.query, url.fragment)) + + +def force_unicode(string, charset="utf-8"): + """Convert string to unicode if it is not yet unicode. + + Params: + string : string/unicode : an input string + charset : string : optional : output encoding + + Returns: + unicode + + """ + if isinstance(string, six.text_type): # Always True on Py3 + return string + return string.decode(charset, "replace") # Py2 only + + +def unquote(string, charset="utf-8"): + """Unquote and normalize unicode string. + + Params: + string : string to be unquoted + charset : string : optional : output encoding + + Returns: + string : an unquoted and normalized string + + """ + string = unquote_orig(string) + string = force_unicode(string, charset) + string = unicodedata.normalize("NFC", string).encode(charset) + return string + + +def quote(string, safe="/"): + """Quote string. + + Params: + string : string to be quoted + safe : string of safe characters + + Returns: + string : quoted string + + """ + string = quote_orig(string, safe) + return string diff --git a/url_normalize/url_normalize.py b/url_normalize/url_normalize.py old mode 100755 new mode 100644 index b4e2280..44aee13 --- a/url_normalize/url_normalize.py +++ b/url_normalize/url_normalize.py @@ -1,139 +1,153 @@ # -*- coding: utf-8 -*- -"""URI normalizator. - -URI Normalization function: - * Take care of IDN domains. 
- * Always provide the URI scheme in lowercase characters. - * Always provide the host, if any, in lowercase characters. - * Only perform percent-encoding where it is essential. - * Always use uppercase A-through-F characters when percent-encoding. - * Prevent dot-segments appearing in non-relative URI paths. - * For schemes that define a default authority, use an empty authority if the - default is desired. - * For schemes that define an empty path to be equivalent to a path of "/", - use "/". - * For schemes that define a port, use an empty port if the default is desired - * All portions of the URI must be utf-8 encoded NFC from Unicode strings - -Inspired by Sam Ruby's urlnorm.py: - http://intertwingly.net/blog/2004/08/04/Urlnorm -This fork author: Nikolay Panov () - -History: - * 28 Oct 2018: Support empty string and double slash urls (//domain.tld/foo.html) - * 07 Jul 2017: Same code support both Python 3 and Python 2. - * 05 Jan 2016: Python 3 compatibility, please use version 1.2 on python 2 - * 29 Dec 2015: PEP8, setup.py - * 10 Mar 2010: support for shebang (#!) urls - * 28 Feb 2010: using 'http' schema by default when appropriate - * 28 Feb 2010: added handling of IDN domains - * 28 Feb 2010: code pep8-zation - * 27 Feb 2010: forked from Sam Ruby's urlnorm.py -""" -from __future__ import unicode_literals - +"""URL normalize main module.""" import re -import unicodedata -from urllib.parse import quote, unquote, urlsplit, urlunsplit -__license__ = "Python" -__version__ = "1.3.4" +from .tools import deconstruct_url, force_unicode, quote, reconstruct_url, unquote +DEFAULT_PORT = { + "ftp": "21", + "gopher": "70", + "http": "80", + "https": "443", + "news": "119", + "nntp": "119", + "snews": "563", + "snntp": "563", + "telnet": "23", + "ws": "80", + "wss": "443", +} +DEFAULT_SCHEME = "https" -def _clean(string, charset='utf-8'): - """Unquote and normalize unicode string. + +def provide_url_scheme(url): + """Make sure we have valid url scheme. 
Params: - charset : string : optional : output encoding + url : string : the URL Returns: - string : an unquoted and normalized string + string : updated url with validated/attached scheme """ - string = unquote(string) - return unicodedata.normalize('NFC', string).encode(charset) + has_scheme = ":" in url[:7] + is_default_scheme = url.startswith("//") + is_file_path = url == "-" or (url.startswith("/") and not is_default_scheme) + if not url or has_scheme or is_file_path: + return url + if is_default_scheme: + return DEFAULT_SCHEME + ":" + url + return DEFAULT_SCHEME + "://" + url -DEFAULT_PORT = { - 'ftp': 21, - 'telnet': 23, - 'http': 80, - 'ws': 80, - 'gopher': 70, - 'news': 119, - 'nntp': 119, - 'prospero': 191, - 'https': 443, - 'wss': 443, - 'snews': 563, - 'snntp': 563, -} +def generic_url_cleanup(url): + """Cleanup the URL from unnecessary data and convert to final form. -def url_normalize(url, charset='utf-8'): - """URI normalization routine. + Converts shebang urls to final form, removed unnecessary data from the url. - Sometimes you get an URL by a user that just isn't a real - URL because it contains unsafe characters like ' ' and so on. This - function can fix some of the problems in a similar way browsers - handle data entered by the user: + Params: + url : string : the URL + + Returns: + string : update url + + """ + url = url.replace("#!", "?_escaped_fragment_=") + url = re.sub(r"utm_source=[^&]+&?", "", url) + url = url.rstrip("&? ") + return url - >>> url_normalize(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)') - 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29' + +def normalize_scheme(scheme): + """Normalize scheme part of the url. Params: - charset : string : The target charset for the URL if the url was - given as unicode string. + scheme : string : url scheme, e.g., 'https' + + Returns: + string : normalized scheme data. 
+ """ + return scheme.lower() - # invalid empty / null url - if url is None or len(url) == 0: - return url - # if there is no scheme use http as default scheme - if url[0] not in ['/', '-'] and ':' not in url[:7]: - url = 'https://' + url +def normalize_userinfo(userinfo): + """Normalize userinfo part of the url. - # protocol indeferent url (http|https), prepend https - if len(url) > 2 and url[0] == '/' and url[1] == '/' and ':' not in url[:7]: - url = 'https:' + url + Params: + userinfo : string : url userinfo, e.g., 'user@' - # shebang urls support - url = url.replace('#!', '?_escaped_fragment_=') + Returns: + string : normalized userinfo data. - # remove feedburner's crap - url = re.sub(r'\?utm_source=feedburner.+$', '', url) + """ + if userinfo in ["@", ":@"]: + return "" + return userinfo - # splitting url to useful parts - scheme, auth, path, query, fragment = urlsplit(url.strip()) - (userinfo, host, port) = re.search('([^@]*@)?([^:]*):?(.*)', auth).groups() - # Always provide the URI scheme in lowercase characters. - scheme = scheme.lower() +def normalize_host(host, charset="utf-8"): + """Normalize host part of the url. - # Always provide the host, if any, in lowercase characters. - host = host.lower() - if host and host[-1] == '.': - host = host[:-1] + Lowercase and strip of final dot. + Also, take care about IDN domains. - # take care about IDN domains + Params: + host : string : url host, e.g., 'site.com' + + Returns: + string : normalized host data. + + """ + host = force_unicode(host, charset) + host = host.lower() + host = host.strip(".") host = host.encode("idna").decode(charset) + return host + + +def normalize_port(port, scheme): + """Normalize port part of the url. + + Remove mention of default port number + + Params: + port : string : url port, e.g., '8080' + scheme : string : url scheme, e.g., 'http' + + Returns: + string : normalized port data. 
+ + """ + if not port.isdigit(): + return port + port = str(int(port)) + if DEFAULT_PORT[scheme] == port: + return "" + return port + +def normalize_path(path, scheme): + """Normalize path part of the url. + + Remove mention of default path number + + Params: + path : string : url path, e.g., '/section/page.html' + scheme : string : url scheme, e.g., 'http' + + Returns: + string : normalized path data. + + """ # Only perform percent-encoding where it is essential. # Always use uppercase A-through-F characters when percent-encoding. # All portions of the URI must be utf-8 encoded NFC from Unicode strings - path = quote(_clean(path), "~:/?#[]@!$&'()*+,;=") - fragment = quote(_clean(fragment), "~") - - # note care must be taken to only encode & and = characters as values - query = "&".join( - sorted(["=".join( - [quote(_clean(t), "~:/?#[]@!$'()*+,;=") - for t in q.split("=", 1)]) for q in query.split("&")])) - + path = quote(unquote(path), "~:/?#[]@!$&'()*+,;=") # Prevent dot-segments appearing in non-relative URI paths. if scheme in ["", "http", "https", "ftp", "file"]: output, part = [], None - for part in path.split('/'): + for part in path.split("/"): if part == "": if not output: output.append(part) @@ -146,31 +160,80 @@ def url_normalize(url, charset='utf-8'): output.append(part) if part in ["", ".", ".."]: output.append("") - path = '/'.join(output) - - # For schemes that define a default authority, use an empty authority if - # the default is desired. - if userinfo in ["@", ":@"]: - userinfo = "" - + path = "/".join(output) # For schemes that define an empty path to be equivalent to a path of "/", # use "/". - if path == "" and scheme in ["http", "https", "ftp", "file"]: + if not path and scheme in ["http", "https", "ftp", "file"]: path = "/" + return path + + +def normalize_fragment(fragment): + """Normalize fragment part of the url. + + Params: + fragment : string : url fragment, e.g., 'fragment' + + Returns: + string : normalized fragment data. 
def normalize_fragment(fragment):
    """Normalize fragment part of the url.

    Only perform percent-encoding where it is essential; '~' is kept
    literal.

    Params:
        fragment : string : url fragment, e.g., 'fragment'

    Returns:
        string : normalized fragment data.

    """
    return quote(unquote(fragment), "~")


def normalize_query(query):
    """Normalize query part of the url.

    Sorts the query parameters and re-encodes names and values, only
    performing percent-encoding where it is essential.

    Params:
        query : string : url query, e.g., 'param1=val1&param2=val2'

    Returns:
        string : normalized query data.

    """
    # note: care must be taken to only encode & and = characters as values
    safe_chars = "~:/?#[]@!$'()*+,;="
    params = [
        "=".join(quote(unquote(part), safe_chars) for part in param.split("=", 1))
        for param in query.split("&")
    ]
    return "&".join(sorted(params))


def url_normalize(url, charset="utf-8"):
    """URI normalization routine.

    Sometimes you get an URL by a user that just isn't a real
    URL because it contains unsafe characters like ' ' and so on.
    This function can fix some of the problems in a similar way
    browsers handle data entered by the user:

    >>> url_normalize('http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
    'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    Params:
        url : string : the URL to normalize
        charset : string : optional
            The target charset for the URL if the url was given as unicode string.

    Returns:
        string : normalized url; falsy input is returned unchanged.

    """
    if not url:
        return url
    url = provide_url_scheme(url)
    url = generic_url_cleanup(url)
    url_elements = deconstruct_url(url)
    url_elements = url_elements._replace(
        scheme=normalize_scheme(url_elements.scheme),
        userinfo=normalize_userinfo(url_elements.userinfo),
        host=normalize_host(url_elements.host, charset),
        query=normalize_query(url_elements.query),
        fragment=normalize_fragment(url_elements.fragment),
    )
    # port and path normalization depend on the already-normalized scheme
    url_elements = url_elements._replace(
        port=normalize_port(url_elements.port, url_elements.scheme),
        path=normalize_path(url_elements.path, url_elements.scheme),
    )
    url = reconstruct_url(url_elements)
    return url