Skip to content

Commit

Permalink
re-order conditions
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed Nov 8, 2023
1 parent fcb6a97 commit 140c121
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 2 deletions.
2 changes: 1 addition & 1 deletion trafilatura/feeds.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -38,7 +38,7 @@ def handle_link_list(linklist, domainname, baseurl, target_lang=None, external=F
# control output for validity
checked = check_url(link, language=target_lang)
if checked is not None:
- if not external and not is_similar_domain(domainname, checked[1]) and not "feed" in link:
+ if not external and not "feed" in link and not is_similar_domain(domainname, checked[1]):
LOGGER.warning('Rejected, diverging domain names: %s %s', domainname, checked[1])
else:
output_links.append(checked[0])
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/sitemaps.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -79,7 +79,8 @@ def handle_link(self, link: str) -> None:

# don't take links from another domain and make an exception for main platforms
# also bypass: subdomains vs. domains
- if not self.external and not is_similar_domain(self.domain, newdomain) and not WHITELISTED_PLATFORMS.search(newdomain):
+ if not self.external and not WHITELISTED_PLATFORMS.search(newdomain) and \
+ not is_similar_domain(self.domain, newdomain):
LOGGER.warning('link discarded, diverging domain names: %s %s', self.domain, newdomain)
return

Expand Down

0 comments on commit 140c121

Please sign in to comment.