metadata: more robust URL extraction (#710)

* metadata: more robust URL extraction
* simplify code
* remove superfluous function
adbar · Oct 4, 2024 · 35ec481 · 35ec481
1 parent b6f93c9
commit 35ec481
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 19 deletions.
diff --git a/tests/metadata_tests.py b/tests/metadata_tests.py
@@ -193,8 +193,9 @@ def test_url():
         '<html><head><meta name="twitter:url" content="https://example.org"/></head><body></body></html>',
         '<html><head><link rel="alternate" hreflang="x-default" href="https://example.org"/></head><body></body></html>',
         '<html><head><link rel="canonical" href="/article/medical-record"/></head><body></body></html>'
+        '<html><head><base href="https://example.org" target="_blank"/></head><body></body></html>',
     ]
-    default_urls = [None, None, None, None, "https://example.org"]
+    default_urls = [None, None, None, None, "https://example.org", None]
     expected_url = 'https://example.org'
 
     for doc, default_url in zip(htmldocs, default_urls):

diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py
@@ -149,6 +149,12 @@
 
 OG_AUTHOR = {"og:author", "og:article:author"}
 
+URL_SELECTORS = [
+    './/head//link[@rel="canonical"]',
+    './/head//base',
+    './/head//link[@rel="alternate"][@hreflang="x-default"]'
+]
+
 
 def normalize_tags(tags: str) -> str:
     """Remove special characters of tags"""
@@ -382,21 +388,13 @@ def extract_author(tree: HtmlElement) -> Optional[str]:
 
 def extract_url(tree: HtmlElement, default_url: Optional[str] = None) -> Optional[str]:
     """Extract the URL from the canonical link"""
-    url = None
-    # https://www.tutorialrepublic.com/html-reference/html-base-tag.php
-    # try canonical link first
-    element = tree.find('.//head//link[@rel="canonical"][@href]')
-    if element is not None:
-        url = element.attrib["href"]
-    # try default language link
-    else:
-        element = tree.find('.//head//link[@rel="alternate"][@hreflang="x-default"]')
-        if element is not None:
-            LOGGER.debug(
-                tostring(element, pretty_print=False, encoding="unicode").strip()
-            )
-            url = element.attrib["href"]
-    # add domain name if it's missing
+    for selector in URL_SELECTORS:
+        element = tree.find(selector)
+        url = element.attrib.get("href") if element is not None else None
+        if url:
+            break
+
+    # fix relative URLs
     if url and url.startswith("/"):
         for element in tree.iterfind(".//head//meta[@content]"):
             attrtype = element.get("name") or element.get("property") or ""
@@ -406,10 +404,12 @@ def extract_url(tree: HtmlElement, default_url: Optional[str] = None) -> Optiona
                     # prepend URL
                     url = base_url + url
                     break
-    # sanity check: don't return invalid URLs
-    if url is not None:
+
+    # do not return invalid URLs
+    if url:
         validation_result, parsed_url = validate_url(url)
-        url = None if validation_result is False else normalize_url(parsed_url)
+        url = normalize_url(parsed_url) if validation_result else None
+
     return url or default_url