Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feeds: review code #443

Merged
merged 5 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
290 changes: 196 additions & 94 deletions tests/feeds_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,174 +7,276 @@
import sys
from unittest.mock import patch

from trafilatura import cli, feeds
from courlan import get_hostinfo
from trafilatura.cli import main
from trafilatura.feeds import (
FeedParameters,
determine_feed,
extract_links,
find_feed_urls,
handle_link_list,
)

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

TEST_DIR = os.path.abspath(os.path.dirname(__file__))
RESOURCES_DIR = os.path.join(TEST_DIR, 'resources')
RESOURCES_DIR = os.path.join(TEST_DIR, "resources")

XMLDECL = '<?xml version="1.0" encoding="utf-8"?>\n'


def test_atom_extraction():
'''Test link extraction from an Atom feed'''
assert feeds.extract_links(None, 'example.org', 'https://example.org', '') == []
assert len(feeds.extract_links('<html></html>', 'example.org', 'https://example.org', '')) == 0
filepath = os.path.join(RESOURCES_DIR, 'feed1.atom')
with open(filepath) as f:
"""Test link extraction from an Atom feed"""
params = FeedParameters("https://example.org", "example.org", "")
assert not extract_links(None, params)
assert len(extract_links("<html></html>", params)) == 0

filepath = os.path.join(RESOURCES_DIR, "feed1.atom")
with open(filepath, "r", encoding="utf-8") as f:
teststring = f.read()
assert len(feeds.extract_links(teststring, 'example.org', 'https://example.org', '')) > 0
assert len(extract_links(teststring, params)) > 0

params = FeedParameters("https://www.dwds.de", "dwds.de", "")
assert (
len(
feeds.extract_links(
extract_links(
f'{XMLDECL}<link type="application/atom+xml" rel="self" href="https://www.dwds.de/api/feed/themenglossar/Corona"/>',
'dwds.de',
'https://www.dwds.de',
'',
params,
)
)
== 0
)

params = FeedParameters("http://example.org", "example.org", "http://example.org")
assert (
len(
feeds.extract_links(
extract_links(
f'{XMLDECL}<link rel="self" href="http://example.org/article1/"/>',
'example.org',
'http://example.org/',
'http://example.org',
params,
)
)
== 0
)

params = FeedParameters("https://example.org", "example.org", "")
assert (
len(
feeds.extract_links(
extract_links(
f'{XMLDECL}<link type="application/atom+xml" rel="self" href="123://api.exe"/>',
'example.org',
'https://example.org',
'',
params,
)
)
== 0
)
assert feeds.extract_links(
f'{XMLDECL}<link href="http://example.org/article1/"rest"/>',
'example.org',
'http://example.org/',
'http://example.org',
) == ['http://example.org/article1/'] # TODO: remove slash?

params = FeedParameters("http://example.org/", "example.org", "http://example.org")
assert extract_links(
f'{XMLDECL}<link href="http://example.org/article1/"rest"/>', params
) == [
"http://example.org/article1/"
] # TODO: remove slash?


def test_rss_extraction():
'''Test link extraction from a RSS feed'''
"""Test link extraction from a RSS feed"""
params = FeedParameters("http://example.org/", "example.org", "")
assert (
len(
feeds.extract_links(
f'{XMLDECL}<link>http://example.org/article1/</link>',
'example.org',
'http://example.org/',
'',
)
extract_links(f"{XMLDECL}<link>http://example.org/article1/</link>", params)
)
== 1
)
# CDATA
assert feeds.extract_links(
f'{XMLDECL}<link><![CDATA[http://example.org/article1/]]></link>',
'example.org',
'http://example.org/',
'',
) == ['http://example.org/article1/'] # TODO: remove slash?
assert extract_links(
f"{XMLDECL}<link><![CDATA[http://example.org/article1/]]></link>", params
) == [
"http://example.org/article1/"
] # TODO: remove slash?

# spaces
assert len(feeds.extract_links(XMLDECL + '<link>\r\n https://www.ak-kurier.de/akkurier/www/artikel/108815-sinfonisches-blasorchester-spielt-1500-euro-fuer-kinder-in-drk-krankenhaus-kirchen-ein </link>', 'ak-kurier.de', 'https://www.ak-kurier.de/', '')) == 1
params = FeedParameters("https://www.ak-kurier.de/", "ak-kurier.de", "")
assert (
len(
feeds.extract_links(
f'{XMLDECL}<link>http://example.org/</link>',
'example.org',
'http://example.org',
'http://example.org',
extract_links(
XMLDECL
+ "<link>\r\n https://www.ak-kurier.de/akkurier/www/artikel/108815-sinfonisches-blasorchester-spielt-1500-euro-fuer-kinder-in-drk-krankenhaus-kirchen-ein </link>",
params,
)
)
== 0
== 1
)

params = FeedParameters("http://example.org", "example.org", "http://example.org")
assert len(extract_links(f"{XMLDECL}<link>http://example.org/</link>", params)) == 0

params = FeedParameters("http://example.org", "example.org", "")
assert len(extract_links(f"{XMLDECL}<link>https://example.org</link>", params)) == 0

params = FeedParameters("https://www.dwds.de", "dwds.de", "https://www.dwds.de")
assert extract_links(
f"{XMLDECL}<link>/api/feed/themenglossar/Corona</link>", params
) == ["https://www.dwds.de/api/feed/themenglossar/Corona"]

params = FeedParameters("https://example.org", "example.org", "")
filepath = os.path.join(RESOURCES_DIR, "feed2.rss")
with open(filepath, "r", encoding="utf-8") as f:
teststring = f.read()
assert len(extract_links(teststring, params)) > 0


def test_json_extraction():
"""Test link extraction from a JSON feed"""
# find link
params = FeedParameters("https://www.jsonfeed.org", "jsonfeed.org", "")
assert (
len(
feeds.extract_links(
f'{XMLDECL}<link>https://example.org</link>',
'example.org',
'http://example.org/',
'',
determine_feed(
'<html><meta><link rel="alternate" type="application/json" title="JSON Feed" href="https://www.jsonfeed.org/feed.json" />></meta><body/></html>',
params,
)
)
== 0
== 1
)
assert feeds.extract_links(
f'{XMLDECL}<link>/api/feed/themenglossar/Corona</link>',
'www.dwds.de',
'https://www.dwds.de',
'https://www.dwds.de',
) == ['https://www.dwds.de/api/feed/themenglossar/Corona']
filepath = os.path.join(RESOURCES_DIR, 'feed2.rss')
with open(filepath) as f:
teststring = f.read()
assert len(feeds.extract_links(teststring, 'example.com', 'https://example.org', '')) > 0


def test_json_extraction():
'''Test link extraction from a JSON feed'''
# find link
assert len(feeds.determine_feed('<html><meta><link rel="alternate" type="application/json" title="JSON Feed" href="https://www.jsonfeed.org/feed.json" />></meta><body/></html>', 'jsonfeed.org', 'https://www.jsonfeed.org')) == 1
# extract data
filepath = os.path.join(RESOURCES_DIR, 'feed.json')
with open(filepath) as f:
assert not extract_links("{/}", params)

filepath = os.path.join(RESOURCES_DIR, "feed.json")
with open(filepath, "r", encoding="utf-8") as f:
teststring = f.read()
links = feeds.extract_links(teststring, 'npr.org', 'https://npr.org', '')
params = FeedParameters("https://npr.org", "npr.org", "")
links = extract_links(teststring, params)
assert len(links) == 25

# id as a backup
links = feeds.extract_links(r'{"version":"https:\/\/jsonfeed.org\/version\/1","items":[{"id":"https://www.example.org/1","title":"Test"}]}', 'example.org', 'https://example.org', '')
params = FeedParameters("https://example.org", "example.org", "")
links = extract_links(
r'{"version":"https:\/\/jsonfeed.org\/version\/1","items":[{"id":"https://www.example.org/1","title":"Test"}]}',
params,
)
assert len(links) == 1


def test_feeds_helpers():
'''Test helper functions for feed extraction'''
"""Test helper functions for feed extraction"""
params = FeedParameters("https://example.org", "example.org", "https://example.org")
domainname, baseurl = get_hostinfo("https://example.org")
assert domainname == params.domain and baseurl == params.base

# nothing useful
assert len(feeds.determine_feed('', 'example.org', 'https://example.org')) == 0
assert len(feeds.determine_feed('<html><meta><link rel="alternate" type="application/rss+xml" title="Feed"/></meta><body/></html>', 'example.org', 'https://example.org')) == 0
assert len(determine_feed("", params)) == 0
assert (
len(
determine_feed(
'<html><meta><link rel="alternate" type="application/rss+xml" title="Feed"/></meta><body/></html>',
params,
)
)
== 0
)
# useful
assert len(feeds.determine_feed('<html><meta><link rel="alternate" type="application/rss+xml" title="Feed" href="https://example.org/blog/feed/"/></meta><body/></html>', 'example.org', 'https://example.org')) == 1
assert len(feeds.determine_feed('<html><meta><link rel="alternate" type="application/atom+xml" title="Feed" href="https://example.org/blog/feed/"/></meta><body/></html>', 'example.org', 'https://example.org')) == 1
assert len(feeds.determine_feed('<html><meta><link rel="alternate" title="Feed" href="https://example.org/blog/feed/" type="application/atom+xml"/></meta><body/></html>', 'example.org', 'https://example.org')) == 1
assert len(feeds.determine_feed('<html><meta><link rel="alternate" title="Feed" href="https://example.org/blog/atom/"/></meta><body/></html>', 'example.org', 'https://example.org')) == 1
assert len(feeds.determine_feed('<html><meta><link rel="alternate" href="https://www.theguardian.com/international/rss" title="RSS" type="application/rss+xml"></meta><body/></html>', 'example.org', 'https://example.org')) == 1
assert (
len(
determine_feed(
'<html><meta><link rel="alternate" type="application/rss+xml" title="Feed" href="https://example.org/blog/feed/"/></meta><body/></html>',
params,
)
)
== 1
)
assert (
len(
determine_feed(
'<html><meta><link rel="alternate" type="application/atom+xml" title="Feed" href="https://example.org/blog/feed/"/></meta><body/></html>',
params,
)
)
== 1
)
assert (
len(
determine_feed(
'<html><meta><link rel="alternate" title="Feed" href="https://example.org/blog/feed/" type="application/atom+xml"/></meta><body/></html>',
params,
)
)
== 1
)
assert (
len(
determine_feed(
'<html><meta><link rel="alternate" title="Feed" href="https://example.org/blog/atom/"/></meta><body/></html>',
params,
)
)
== 1
)
assert (
len(
determine_feed(
'<html><meta><link rel="alternate" href="https://www.theguardian.com/international/rss" title="RSS" type="application/rss+xml"></meta><body/></html>',
params,
)
)
== 1
)
# no comments wanted
assert len(feeds.determine_feed('<html><meta><link rel="alternate" type="application/rss+xml" title="Feed" href="https://example.org/blog/comments-feed/"/></meta><body/></html>', 'example.org', 'https://example.org')) == 0
assert (
len(
determine_feed(
'<html><meta><link rel="alternate" type="application/rss+xml" title="Feed" href="https://example.org/blog/comments-feed/"/></meta><body/></html>',
params,
)
)
== 0
)

# invalid links
assert len(feeds.determine_feed('<html><meta><link rel="alternate" href="12345tralala" title="RSS" type="application/rss+xml"></meta><body/></html>', 'example.org', 'https://example.org')) == 0
params = FeedParameters("example.org", "example.org", "https://example.org") # fix
assert (
len(
determine_feed(
'<html><meta><link rel="alternate" href="12345tralala" title="RSS" type="application/rss+xml"></meta><body/></html>',
params,
)
)
== 0
)

# detecting in <a>-elements
assert feeds.determine_feed('<html><body><a href="https://example.org/feed.xml"><body/></html>', 'example.org', 'https://example.org') == ['https://example.org/feed.xml']
assert feeds.determine_feed('<html><body><a href="https://example.org/feed.atom"><body/></html>', 'example.org', 'https://example.org') == ['https://example.org/feed.atom']
assert feeds.determine_feed('<html><body><a href="https://example.org/rss"><body/></html>', 'example.org', 'https://example.org') == ['https://example.org/rss']
params = FeedParameters("https://example.org", "example.org", "https://example.org")
assert determine_feed(
'<html><body><a href="https://example.org/feed.xml"><body/></html>', params
) == ["https://example.org/feed.xml"]
assert determine_feed(
'<html><body><a href="https://example.org/feed.atom"><body/></html>', params
) == ["https://example.org/feed.atom"]
assert determine_feed(
'<html><body><a href="https://example.org/rss"><body/></html>', params
) == ["https://example.org/rss"]
# feed discovery
assert feeds.find_feed_urls('http://') == []
assert feeds.find_feed_urls('https://httpbun.org/status/404') == []
assert not find_feed_urls("http://")
assert not find_feed_urls("https://httpbun.org/status/404")
# Feedburner/Google links
assert feeds.handle_link_list(['https://feedproxy.google.com/ABCD'], 'example.org', 'https://example.org') == ['https://feedproxy.google.com/ABCD']
assert handle_link_list(["https://feedproxy.google.com/ABCD"], params) == [
"https://feedproxy.google.com/ABCD"
]
# override failed checks
assert feeds.handle_link_list(['https://feedburner.com/kat/1'], 'example.org', 'https://example.org') == ['https://feedburner.com/kat/1']
assert handle_link_list(["https://feedburner.com/kat/1"], params) == [
"https://feedburner.com/kat/1"
]
# diverging domain names
assert feeds.handle_link_list(['https://www.software.info/1'], 'example.org', 'https://example.org') == []
assert not handle_link_list(["https://www.software.info/1"], params)


def test_cli_behavior():
'''Test command-line interface with respect to feeds'''
testargs = ['', '--list', '--feed', 'https://httpbun.org/xml']
with patch.object(sys, 'argv', testargs):
assert cli.main() is None
"""Test command-line interface with respect to feeds"""
testargs = ["", "--list", "--feed", "https://httpbun.org/xml"]
with patch.object(sys, "argv", testargs):
assert main() is None


if __name__ == '__main__':
if __name__ == "__main__":
test_atom_extraction()
test_rss_extraction()
test_json_extraction()
Expand Down
Loading
Loading