From b8608a27f698b2083fa1337a84e6e297cb194577 Mon Sep 17 00:00:00 2001 From: RMeissnerCC <50482279+RMeissnerCC@users.noreply.github.com> Date: Mon, 9 Nov 2020 09:40:17 +0100 Subject: [PATCH] Trackers and analytics (#10) * - Easyprivacy lists from Github * - Added AntiAdBlock and EasyListGermany lists - Created method to loop over multiple urls for one key * - Combining EasyPrivacy lists into more general classes * - Resolved bug: missing key in EasyPrivacy - Deduplicated tag lists * - Added Fanboy annoyance, notification and social media lists as well as cookie and adult lists * - Mocked setup in manager unit tests --- src/features/html_based.py | 143 +++++++++++++++++++++++++++----- src/features/metadata_base.py | 17 +++- src/manager.py | 27 ++++-- tests/unit/manager_test.py | 3 +- tests/unit/metadatabase_test.py | 19 +++++ 5 files changed, 179 insertions(+), 30 deletions(-) diff --git a/src/features/html_based.py b/src/features/html_based.py index 922a667..31ace65 100644 --- a/src/features/html_based.py +++ b/src/features/html_based.py @@ -2,15 +2,41 @@ class Advertisement(MetadataBase): - url: str = "https://easylist.to/easylist/easylist.txt" + urls = [ + "https://easylist.to/easylist/easylist.txt", + "https://easylist.to/easylist/easylist_adservers.txt", + "https://easylist.to/easylist/easylist_adservers_popup.txt", + "https://easylist.to/easylist/easylist_allowlist.txt", + "https://easylist.to/easylist/easylist_allowlist_dimensions.txt", + "https://easylist.to/easylist/easylist_allowlist_general_hide.txt", + "https://easylist.to/easylist/easylist_allowlist_popup.txt", + "https://easylist.to/easylist/easylist_general_block.txt", + "https://easylist.to/easylist/easylist_general_block_dimensions.txt", + "https://easylist.to/easylist/easylist_general_block_popup.txt", + "https://easylist.to/easylist/easylist_general_hide.txt", + "https://easylist.to/easylist/easylist_specific_block.txt", + "https://easylist.to/easylist/easylist_specific_block_popup.txt", + "https://easylist.to/easylist/easylist_specific_hide.txt", + "https://easylist.to/easylist/easylist_thirdparty.txt", + "https://easylist.to/easylist/easylist_thirdparty_popup.txt", + ] key: str = "ads" - comment_symbol = "!" -class Tracker(MetadataBase): - url: str = "https://easylist.to/easylist/easyprivacy.txt" - key: str = "tracker" - comment_symbol = "!" +class EasyPrivacy(MetadataBase): + urls: list = [ + "https://easylist.to/easylist/easyprivacy.txt", + "https://github.com/easylist/easylist/blob/master/easyprivacy/easyprivacy_general.txt", + "https://github.com/easylist/easylist/blob/master/easyprivacy/easyprivacy_allowlist.txt", + "https://github.com/easylist/easylist/blob/master/easyprivacy/easyprivacy_allowlist_international.txt", + "https://github.com/easylist/easylist/blob/master/easyprivacy/easyprivacy_specific.txt", + "https://github.com/easylist/easylist/blob/master/easyprivacy/easyprivacy_specific_international.txt", + "https://github.com/easylist/easylist/blob/master/easyprivacy/easyprivacy_trackingservers.txt", + "https://github.com/easylist/easylist/blob/master/easyprivacy/easyprivacy_trackingservers_international.txt", + "https://github.com/easylist/easylist/blob/master/easyprivacy/easyprivacy_thirdparty.txt", + "https://github.com/easylist/easylist/blob/master/easyprivacy/easyprivacy_thirdparty_international.txt", + ] + key = "easyprivacy" class IETracker(MetadataBase): @@ -20,27 +46,61 @@ class IETracker(MetadataBase): class Cookies(MetadataBase): - url: str = "https://easylist-downloads.adblockplus.org/easylist-cookie.txt" + urls = [ + "https://easylist-downloads.adblockplus.org/easylist-cookie.txt", + "https://github.com/easylist/easylist/blob/master/easylist_cookie/easylist_cookie_allowlist.txt", + "https://github.com/easylist/easylist/blob/master/easylist_cookie/easylist_cookie_allowlist_general_hide.txt", + "https://github.com/easylist/easylist/blob/master/easylist_cookie/easylist_cookie_general_block.txt", + "https://github.com/easylist/easylist/blob/master/easylist_cookie/easylist_cookie_general_hide.txt", + "https://github.com/easylist/easylist/blob/master/easylist_cookie/easylist_cookie_international_specific_block.txt", + "https://github.com/easylist/easylist/blob/master/easylist_cookie/easylist_cookie_international_specific_hide.txt", + "https://github.com/easylist/easylist/blob/master/easylist_cookie/easylist_cookie_specific_block.txt", + "https://github.com/easylist/easylist/blob/master/easylist_cookie/easylist_cookie_specific_hide.txt", + "https://github.com/easylist/easylist/blob/master/easylist_cookie/easylist_cookie_thirdparty.txt", + ] key: str = "cookies" - comment_symbol = "!" - - -class EasylistGermany(MetadataBase): - url: str = "https://easylist.to/easylistgermany/easylistgermany.txt" - key: str = "easylist_germany" - comment_symbol = "!" class FanboyAnnoyance(MetadataBase): - url: str = "https://easylist.to/easylist/fanboy-annoyance.txt" + urls = [ + "https://easylist.to/easylist/fanboy-annoyance.txt", + "https://easylist.to/easylist/fanboy_annoyance_allowlist.txt", + "https://easylist.to/easylist/fanboy_annoyance_allowlist_general_hide.txt", + "https://easylist.to/easylist/fanboy_annoyance_general_block.txt", + "https://easylist.to/easylist/fanboy_annoyance_general_hide.txt", + "https://easylist.to/easylist/fanboy_annoyance_international.txt", + "https://easylist.to/easylist/fanboy_annoyance_specific_block.txt", + "https://easylist.to/easylist/fanboy_annoyance_thirdparty.txt", + ] key: str = "fanboy_annoyance" - comment_symbol = "!" + + +class FanboyNotification(MetadataBase): + urls = [ + "https://easylist.to/easylist/fanboy_notifications_allowlist.txt", + "https://easylist.to/easylist/fanboy_notifications_allowlist_general_hide.txt", + "https://easylist.to/easylist/fanboy_notifications_general_block.txt", + "https://easylist.to/easylist/fanboy_notifications_general_hide.txt", + "https://easylist.to/easylist/fanboy_notifications_specific_block.txt", + "https://easylist.to/easylist/fanboy_notifications_specific_hide.txt", + "https://easylist.to/easylist/fanboy_notifications_thirdparty.txt", + ] + key: str = "fanboy_notification" class FanboySocialMedia(MetadataBase): - url: str = "https://easylist.to/easylist/fanboy-social.txt" + urls = [ + "https://easylist.to/easylist/fanboy-social.txt", + "https://easylist.to/easylist/fanboy_social_allowlist.txt", + "https://easylist.to/easylist/fanboy_social_allowlist_general_hide.txt", + "https://easylist.to/easylist/fanboy_social_general_block.txt", + "https://easylist.to/easylist/fanboy_social_general_hide.txt", + "https://easylist.to/easylist/fanboy_social_international.txt", + "https://easylist.to/easylist/fanboy_social_specific_block.txt", + "https://easylist.to/easylist/fanboy_social_specific_hide.txt", + "https://easylist.to/easylist/fanboy_social_thirdparty.txt", + ] key: str = "fanboy_social" - comment_symbol = "!" class AntiAdBlock(MetadataBase): @@ -48,7 +108,52 @@ class AntiAdBlock(MetadataBase): "https://easylist-downloads.adblockplus.org/antiadblockfilters.txt" ) key: str = "anti_adblock" - comment_symbol = "!" + + +class AntiAdBlockGerman(MetadataBase): + url: str = "https://github.com/easylist/antiadblockfilters/blob/master/antiadblockfilters/antiadblock_german.txt" + key: str = "antiadblock_german" + + +class AntiAdBlockEnglish(MetadataBase): + url: str = "https://github.com/easylist/antiadblockfilters/blob/master/antiadblockfilters/antiadblock_english.txt" + key: str = "antiadblock_english" + + +class EasylistGermany(MetadataBase): + urls: list = [ + "https://easylist.to/easylistgermany/easylistgermany.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_adservers.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_adservers_popup.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_allowlist.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_allowlist_dimensions.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_allowlist_general_hide.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_allowlist_popup.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_general_block.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_general_block_popup.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_general_hide.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_specific_block.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_specific_block_popup.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_specific_hide.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_thirdparty.txt", + "https://github.com/easylist/easylistgermany/blob/master/easylistgermany/easylistgermany_thirdparty_popup.txt", + ] + key: str = "easylist_germany" + + +class EasylistAdult(MetadataBase): + urls: list = [ + "https://github.com/easylist/easylist/blob/master/easylist_adult/adult_adservers.txt", + "https://github.com/easylist/easylist/blob/master/easylist_adult/adult_adservers_popup.txt", + "https://github.com/easylist/easylist/blob/master/easylist_adult/adult_allowlist.txt", + "https://github.com/easylist/easylist/blob/master/easylist_adult/adult_allowlist_popup.txt", + "https://github.com/easylist/easylist/blob/master/easylist_adult/adult_specific_block.txt", + "https://github.com/easylist/easylist/blob/master/easylist_adult/adult_specific_block_popup.txt", + "https://github.com/easylist/easylist/blob/master/easylist_adult/adult_specific_hide.txt", + "https://github.com/easylist/easylist/blob/master/easylist_adult/adult_thirdparty.txt", + "https://github.com/easylist/easylist/blob/master/easylist_adult/adult_thirdparty_popup.txt", + ] + key: str = "easylist_adult" class Paywalls(MetadataBase): diff --git a/src/features/metadata_base.py b/src/features/metadata_base.py index 39a298b..7e0264b 100644 --- a/src/features/metadata_base.py +++ b/src/features/metadata_base.py @@ -1,4 +1,5 @@ import re +from collections import OrderedDict import adblockparser import requests @@ -13,6 +14,7 @@ class MetadataBase: tag_list_expires: int = 0 key: str = "" url: str = "" + urls: list = [] comment_symbol: str = "" evaluate_header: bool = False @@ -85,6 +87,13 @@ def _start(self, html_content: str, header: dict) -> dict: values = self._work_html_content(html_content) return {"values": values} + def _download_multiple_tag_lists(self): + complete_tag_list = [] + for url in self.urls: + self.url = url + self._download_tag_list() + complete_tag_list.append(self.tag_list) + def _download_tag_list(self) -> None: result = requests.get(self.url) if result.status_code == 200: @@ -119,6 +128,8 @@ def _extract_date_from_list(self): def _prepare_tag_list(self) -> None: self.tag_list = [i for i in self.tag_list if i != ""] + self.tag_list = list(OrderedDict.fromkeys(self.tag_list)) + if self.comment_symbol != "": self.tag_list = [ x @@ -128,7 +139,11 @@ def _prepare_tag_list(self) -> None: def setup(self) -> None: """Child function.""" - if self.url != "": + if self.urls: + self._download_multiple_tag_lists() + elif self.url != "": self._download_tag_list() + + if self.tag_list: self._extract_date_from_list() self._prepare_tag_list() diff --git a/src/manager.py b/src/manager.py index dcc8251..2f30ce1 100644 --- a/src/manager.py +++ b/src/manager.py @@ -14,15 +14,20 @@ from features.html_based import ( Advertisement, AntiAdBlock, + AntiAdBlockEnglish, + AntiAdBlockGerman, ContentSecurityPolicy, Cookies, + EasylistAdult, EasylistGermany, + EasyPrivacy, FanboyAnnoyance, + FanboyNotification, FanboySocialMedia, IETracker, IFrameEmbeddable, Paywalls, - Tracker, + PopUp, ) from features.metadata_base import MetadataBase from lib.config import ( @@ -66,18 +71,22 @@ def _create_extractors(self): extractors = [ Advertisement, - Tracker, - IFrameEmbeddable, - ContentSecurityPolicy, + EasyPrivacy, + ExtractLinks, + IETracker, Cookies, - AntiAdBlock, - EasylistGermany, FanboyAnnoyance, + FanboyNotification, FanboySocialMedia, - ContentSecurityPolicy, + AntiAdBlock, + AntiAdBlockGerman, + AntiAdBlockEnglish, + EasylistGermany, + EasylistAdult, Paywalls, - IETracker, - ExtractLinks, + ContentSecurityPolicy, + IFrameEmbeddable, + PopUp, ] for extractor in extractors: diff --git a/tests/unit/manager_test.py b/tests/unit/manager_test.py index 41c4c16..cc99be8 100644 --- a/tests/unit/manager_test.py +++ b/tests/unit/manager_test.py @@ -10,6 +10,7 @@ def manager(mocker): Manager._create_logger = mocker.MagicMock() Manager._create_api = mocker.MagicMock() Manager._logger = mocker.MagicMock() + Manager.setup = mocker.MagicMock() with mock.patch("manager.Manager.run"): manager = Manager() @@ -26,7 +27,7 @@ def test_init(manager: Manager, mocker): assert manager._create_logger.call_count == 1 assert manager._create_api.call_count == 1 assert run_spy.call_count == 0 - assert len(manager.metadata_extractors) == 13 + assert len(manager.metadata_extractors) == 17 assert manager.run_loop diff --git a/tests/unit/metadatabase_test.py b/tests/unit/metadatabase_test.py index 5d81d07..235b79c 100644 --- a/tests/unit/metadatabase_test.py +++ b/tests/unit/metadatabase_test.py @@ -97,6 +97,7 @@ def test_under_start(metadatabase: MetadataBase, mocker): def test_setup(metadatabase: MetadataBase, mocker): metadatabase._download_tag_list = mocker.MagicMock() + metadatabase._download_multiple_tag_lists = mocker.MagicMock() extract_date_from_list_spy = mocker.spy( metadatabase, "_extract_date_from_list" ) @@ -104,12 +105,30 @@ def test_setup(metadatabase: MetadataBase, mocker): metadatabase.setup() assert metadatabase._download_tag_list.call_count == 0 + assert metadatabase._download_multiple_tag_lists.call_count == 0 assert extract_date_from_list_spy.call_count == 0 assert prepare_tag_spy.call_count == 0 metadatabase.url = "hello" metadatabase.setup() assert metadatabase._download_tag_list.call_count == 1 + assert metadatabase._download_multiple_tag_lists.call_count == 0 + assert extract_date_from_list_spy.call_count == 0 + assert prepare_tag_spy.call_count == 0 + + metadatabase.url = "" + metadatabase.urls = ["Hello1", "Hello2"] + metadatabase.setup() + assert metadatabase._download_tag_list.call_count == 1 + assert metadatabase._download_multiple_tag_lists.call_count == 1 + assert extract_date_from_list_spy.call_count == 0 + assert prepare_tag_spy.call_count == 0 + + metadatabase.tag_list = ["!Hello"] + metadatabase.urls = [] + metadatabase.setup() + assert metadatabase._download_tag_list.call_count == 1 + assert metadatabase._download_multiple_tag_lists.call_count == 1 assert extract_date_from_list_spy.call_count == 1 assert prepare_tag_spy.call_count == 1