Skip to content

Commit

Permalink
metadata: more robust URL extraction (#710)
Browse files Browse the repository at this point in the history
* metadata: more robust URL extraction

* simplify code

* remove superfluous function
  • Loading branch information
adbar authored Oct 4, 2024
1 parent b6f93c9 commit 35ec481
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 19 deletions.
3 changes: 2 additions & 1 deletion tests/metadata_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,9 @@ def test_url():
'<html><head><meta name="twitter:url" content="https://example.org"/></head><body></body></html>',
'<html><head><link rel="alternate" hreflang="x-default" href="https://example.org"/></head><body></body></html>',
'<html><head><link rel="canonical" href="/article/medical-record"/></head><body></body></html>'
'<html><head><base href="https://example.org" target="_blank"/></head><body></body></html>',
]
default_urls = [None, None, None, None, "https://example.org"]
default_urls = [None, None, None, None, "https://example.org", None]
expected_url = 'https://example.org'

for doc, default_url in zip(htmldocs, default_urls):
Expand Down
36 changes: 18 additions & 18 deletions trafilatura/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,12 @@

OG_AUTHOR = {"og:author", "og:article:author"}

# ElementPath expressions used to locate the page URL in the document head,
# listed in order of priority: the first selector yielding a usable "href" wins.
# Every selector requires the "href" attribute to be present: find() only
# returns the first matching element, so without this predicate an element
# lacking "href" (e.g. <base target="_blank"/>) would mask a later sibling
# that actually carries the URL.
URL_SELECTORS = [
    './/head//link[@rel="canonical"][@href]',
    './/head//base[@href]',
    './/head//link[@rel="alternate"][@hreflang="x-default"][@href]',
]


def normalize_tags(tags: str) -> str:
"""Remove special characters of tags"""
Expand Down Expand Up @@ -382,21 +388,13 @@ def extract_author(tree: HtmlElement) -> Optional[str]:

def extract_url(tree: HtmlElement, default_url: Optional[str] = None) -> Optional[str]:
"""Extract the URL from the canonical link"""
url = None
# https://www.tutorialrepublic.com/html-reference/html-base-tag.php
# try canonical link first
element = tree.find('.//head//link[@rel="canonical"][@href]')
if element is not None:
url = element.attrib["href"]
# try default language link
else:
element = tree.find('.//head//link[@rel="alternate"][@hreflang="x-default"]')
if element is not None:
LOGGER.debug(
tostring(element, pretty_print=False, encoding="unicode").strip()
)
url = element.attrib["href"]
# add domain name if it's missing
for selector in URL_SELECTORS:
element = tree.find(selector)
url = element.attrib.get("href") if element is not None else None
if url:
break

# fix relative URLs
if url and url.startswith("/"):
for element in tree.iterfind(".//head//meta[@content]"):
attrtype = element.get("name") or element.get("property") or ""
Expand All @@ -406,10 +404,12 @@ def extract_url(tree: HtmlElement, default_url: Optional[str] = None) -> Optiona
# prepend URL
url = base_url + url
break
# sanity check: don't return invalid URLs
if url is not None:

# do not return invalid URLs
if url:
validation_result, parsed_url = validate_url(url)
url = None if validation_result is False else normalize_url(parsed_url)
url = normalize_url(parsed_url) if validation_result else None

return url or default_url


Expand Down

0 comments on commit 35ec481

Please sign in to comment.