Skip to content

Commit

Permalink
Trackers and analytics (#10)
Browse files Browse the repository at this point in the history
* - EasyPrivacy lists from GitHub

* - Added AntiAdBlock and EasyListGermany lists
- Created method to loop over multiple urls for one key

* - Combining EasyPrivacy lists into more general classes

* - Resolved bug: missing key in EasyPrivacy
- Deduplicated tag lists

* - Added Fanboy annoyance, notification and social media lists as well as cookie and adult lists

* - Mocked setup in manager unit tests
  • Loading branch information
RobertMeissner authored Nov 9, 2020
1 parent 3aadde1 commit b8608a2
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 30 deletions.
143 changes: 124 additions & 19 deletions src/features/html_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,41 @@


# Advertisement extractor configuration.
# Declares a single `url` (the combined EasyList file) and a `urls` list with
# the combined file plus the individual EasyList sub-lists, all on easylist.to.
# `key` is "ads"; `comment_symbol` is "!".
class Advertisement(MetadataBase):
url: str = "https://easylist.to/easylist/easylist.txt"
urls = [
"https://easylist.to/easylist/easylist.txt",
"https://easylist.to/easylist/easylist_adservers.txt",
"https://easylist.to/easylist/easylist_adservers_popup.txt",
"https://easylist.to/easylist/easylist_allowlist.txt",
"https://easylist.to/easylist/easylist_allowlist_dimensions.txt",
"https://easylist.to/easylist/easylist_allowlist_general_hide.txt",
"https://easylist.to/easylist/easylist_allowlist_popup.txt",
"https://easylist.to/easylist/easylist_general_block.txt",
"https://easylist.to/easylist/easylist_general_block_dimensions.txt",
"https://easylist.to/easylist/easylist_general_block_popup.txt",
"https://easylist.to/easylist/easylist_general_hide.txt",
"https://easylist.to/easylist/easylist_specific_block.txt",
"https://easylist.to/easylist/easylist_specific_block_popup.txt",
"https://easylist.to/easylist/easylist_specific_hide.txt",
"https://easylist.to/easylist/easylist_thirdparty.txt",
"https://easylist.to/easylist/easylist_thirdparty_popup.txt",
]
key: str = "ads"
comment_symbol = "!"


class Tracker(MetadataBase):
url: str = "https://easylist.to/easylist/easyprivacy.txt"
key: str = "tracker"
comment_symbol = "!"
class EasyPrivacy(MetadataBase):
    """Extractor configuration for the EasyPrivacy lists.

    The combined list is fetched from easylist.to. The individual sub-lists
    are fetched from the easylist GitHub repository through
    ``raw.githubusercontent.com``: the ``github.com/.../blob/...`` page URLs
    return GitHub's HTML file viewer instead of the plain-text list.
    """

    # NOTE(review): no comment_symbol is set here, unlike Advertisement or
    # Cookies -- confirm that "!" comment lines do not need to be stripped.
    urls: list = [
        "https://easylist.to/easylist/easyprivacy.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easyprivacy/easyprivacy_general.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easyprivacy/easyprivacy_allowlist.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easyprivacy/easyprivacy_allowlist_international.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easyprivacy/easyprivacy_specific.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easyprivacy/easyprivacy_specific_international.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easyprivacy/easyprivacy_trackingservers.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easyprivacy/easyprivacy_trackingservers_international.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easyprivacy/easyprivacy_thirdparty.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easyprivacy/easyprivacy_thirdparty_international.txt",
    ]
    key = "easyprivacy"


class IETracker(MetadataBase):
Expand All @@ -20,35 +46,114 @@ class IETracker(MetadataBase):


class Cookies(MetadataBase):
    """Extractor configuration for the EasyList cookie lists.

    The combined list is fetched from easylist-downloads.adblockplus.org.
    The individual sub-lists are fetched from the easylist GitHub repository
    through ``raw.githubusercontent.com``: the ``github.com/.../blob/...``
    page URLs return GitHub's HTML file viewer instead of the plain-text list.
    """

    url: str = "https://easylist-downloads.adblockplus.org/easylist-cookie.txt"
    urls = [
        "https://easylist-downloads.adblockplus.org/easylist-cookie.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_cookie/easylist_cookie_allowlist.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_cookie/easylist_cookie_allowlist_general_hide.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_cookie/easylist_cookie_general_block.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_cookie/easylist_cookie_general_hide.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_cookie/easylist_cookie_international_specific_block.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_cookie/easylist_cookie_international_specific_hide.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_cookie/easylist_cookie_specific_block.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_cookie/easylist_cookie_specific_hide.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_cookie/easylist_cookie_thirdparty.txt",
    ]
    key: str = "cookies"
    comment_symbol = "!"


class EasylistGermany(MetadataBase):
url: str = "https://easylist.to/easylistgermany/easylistgermany.txt"
key: str = "easylist_germany"
comment_symbol = "!"


# Fanboy annoyance extractor configuration.
# Declares a single `url` (the combined fanboy-annoyance file) and a `urls`
# list with the combined file plus its sub-lists, all on easylist.to.
# `key` is "fanboy_annoyance"; `comment_symbol` is "!".
class FanboyAnnoyance(MetadataBase):
url: str = "https://easylist.to/easylist/fanboy-annoyance.txt"
urls = [
"https://easylist.to/easylist/fanboy-annoyance.txt",
"https://easylist.to/easylist/fanboy_annoyance_allowlist.txt",
"https://easylist.to/easylist/fanboy_annoyance_allowlist_general_hide.txt",
"https://easylist.to/easylist/fanboy_annoyance_general_block.txt",
"https://easylist.to/easylist/fanboy_annoyance_general_hide.txt",
"https://easylist.to/easylist/fanboy_annoyance_international.txt",
"https://easylist.to/easylist/fanboy_annoyance_specific_block.txt",
"https://easylist.to/easylist/fanboy_annoyance_thirdparty.txt",
]
key: str = "fanboy_annoyance"
comment_symbol = "!"


# Fanboy notification extractor configuration.
# Declares the easylist.to URLs of the fanboy_notifications sub-lists and the
# key "fanboy_notification".
# NOTE(review): no comment_symbol is set here, unlike FanboyAnnoyance and
# FanboySocialMedia -- confirm this is intended.
class FanboyNotification(MetadataBase):
urls = [
"https://easylist.to/easylist/fanboy_notifications_allowlist.txt",
"https://easylist.to/easylist/fanboy_notifications_allowlist_general_hide.txt",
"https://easylist.to/easylist/fanboy_notifications_general_block.txt",
"https://easylist.to/easylist/fanboy_notifications_general_hide.txt",
"https://easylist.to/easylist/fanboy_notifications_specific_block.txt",
"https://easylist.to/easylist/fanboy_notifications_specific_hide.txt",
"https://easylist.to/easylist/fanboy_notifications_thirdparty.txt",
]
key: str = "fanboy_notification"


# Fanboy social media extractor configuration.
# Declares a single `url` (the combined fanboy-social file) and a `urls` list
# with the combined file plus its sub-lists, all on easylist.to.
# `key` is "fanboy_social"; `comment_symbol` is "!".
class FanboySocialMedia(MetadataBase):
url: str = "https://easylist.to/easylist/fanboy-social.txt"
urls = [
"https://easylist.to/easylist/fanboy-social.txt",
"https://easylist.to/easylist/fanboy_social_allowlist.txt",
"https://easylist.to/easylist/fanboy_social_allowlist_general_hide.txt",
"https://easylist.to/easylist/fanboy_social_general_block.txt",
"https://easylist.to/easylist/fanboy_social_general_hide.txt",
"https://easylist.to/easylist/fanboy_social_international.txt",
"https://easylist.to/easylist/fanboy_social_specific_block.txt",
"https://easylist.to/easylist/fanboy_social_specific_hide.txt",
"https://easylist.to/easylist/fanboy_social_thirdparty.txt",
]
key: str = "fanboy_social"
comment_symbol = "!"


# Anti-adblock extractor configuration.
# Declares a single `url` pointing at the antiadblockfilters list on
# easylist-downloads.adblockplus.org. `key` is "anti_adblock";
# `comment_symbol` is "!".
class AntiAdBlock(MetadataBase):
url: str = (
"https://easylist-downloads.adblockplus.org/antiadblockfilters.txt"
)
key: str = "anti_adblock"
comment_symbol = "!"


class AntiAdBlockGerman(MetadataBase):
    """Extractor configuration for the German anti-adblock list.

    The list is fetched through ``raw.githubusercontent.com``: the
    ``github.com/.../blob/...`` page URL returns GitHub's HTML file viewer
    instead of the plain-text list.
    """

    url: str = "https://raw.githubusercontent.com/easylist/antiadblockfilters/master/antiadblockfilters/antiadblock_german.txt"
    key: str = "antiadblock_german"


class AntiAdBlockEnglish(MetadataBase):
    """Extractor configuration for the English anti-adblock list.

    The list is fetched through ``raw.githubusercontent.com``: the
    ``github.com/.../blob/...`` page URL returns GitHub's HTML file viewer
    instead of the plain-text list.
    """

    url: str = "https://raw.githubusercontent.com/easylist/antiadblockfilters/master/antiadblockfilters/antiadblock_english.txt"
    key: str = "antiadblock_english"


class EasylistGermany(MetadataBase):
    """Extractor configuration for the EasyList Germany lists.

    The combined list is fetched from easylist.to. The individual sub-lists
    are fetched from the easylistgermany GitHub repository through
    ``raw.githubusercontent.com``: the ``github.com/.../blob/...`` page URLs
    return GitHub's HTML file viewer instead of the plain-text list.
    """

    urls: list = [
        "https://easylist.to/easylistgermany/easylistgermany.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_adservers.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_adservers_popup.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_allowlist.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_allowlist_dimensions.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_allowlist_general_hide.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_allowlist_popup.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_general_block.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_general_block_popup.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_general_hide.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_specific_block.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_specific_block_popup.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_specific_hide.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_thirdparty.txt",
        "https://raw.githubusercontent.com/easylist/easylistgermany/master/easylistgermany/easylistgermany_thirdparty_popup.txt",
    ]
    key: str = "easylist_germany"


class EasylistAdult(MetadataBase):
    """Extractor configuration for the EasyList adult lists.

    The sub-lists are fetched from the easylist GitHub repository through
    ``raw.githubusercontent.com``: the ``github.com/.../blob/...`` page URLs
    return GitHub's HTML file viewer instead of the plain-text list.
    """

    urls: list = [
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_adult/adult_adservers.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_adult/adult_adservers_popup.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_adult/adult_allowlist.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_adult/adult_allowlist_popup.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_adult/adult_specific_block.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_adult/adult_specific_block_popup.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_adult/adult_specific_hide.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_adult/adult_thirdparty.txt",
        "https://raw.githubusercontent.com/easylist/easylist/master/easylist_adult/adult_thirdparty_popup.txt",
    ]
    key: str = "easylist_adult"


class Paywalls(MetadataBase):
Expand Down
17 changes: 16 additions & 1 deletion src/features/metadata_base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from collections import OrderedDict

import adblockparser
import requests
Expand All @@ -13,6 +14,7 @@ class MetadataBase:
tag_list_expires: int = 0
key: str = ""
url: str = ""
urls: list = []
comment_symbol: str = ""
evaluate_header: bool = False

Expand Down Expand Up @@ -85,6 +87,13 @@ def _start(self, html_content: str, header: dict) -> dict:
values = self._work_html_content(html_content)
return {"values": values}

def _download_multiple_tag_lists(self):
    """Download every list in ``self.urls`` and merge them into ``self.tag_list``.

    Each URL is assigned to ``self.url`` in turn and fetched with
    ``_download_tag_list``, which is expected to store its result in
    ``self.tag_list``. The per-URL results are concatenated, in URL order,
    into one flat list that replaces ``self.tag_list``. ``self.url`` is left
    pointing at the last URL.
    """
    complete_tag_list = []
    for url in self.urls:
        self.url = url
        # Start from an empty list so only entries produced for this URL are
        # collected, even if the download leaves ``self.tag_list`` untouched.
        self.tag_list = []
        self._download_tag_list()
        # extend (not append) keeps the merged result a flat list of entries.
        complete_tag_list.extend(self.tag_list)
    # Publish the merged result; previously it was built and then discarded.
    self.tag_list = complete_tag_list

def _download_tag_list(self) -> None:
result = requests.get(self.url)
if result.status_code == 200:
Expand Down Expand Up @@ -119,6 +128,8 @@ def _extract_date_from_list(self):
def _prepare_tag_list(self) -> None:
self.tag_list = [i for i in self.tag_list if i != ""]

self.tag_list = list(OrderedDict.fromkeys(self.tag_list))

if self.comment_symbol != "":
self.tag_list = [
x
Expand All @@ -128,7 +139,11 @@ def _prepare_tag_list(self) -> None:

def setup(self) -> None:
"""Child function."""
if self.url != "":
if self.urls:
self._download_multiple_tag_lists()
elif self.url != "":
self._download_tag_list()

if self.tag_list:
self._extract_date_from_list()
self._prepare_tag_list()
27 changes: 18 additions & 9 deletions src/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,20 @@
from features.html_based import (
Advertisement,
AntiAdBlock,
AntiAdBlockEnglish,
AntiAdBlockGerman,
ContentSecurityPolicy,
Cookies,
EasylistAdult,
EasylistGermany,
EasyPrivacy,
FanboyAnnoyance,
FanboyNotification,
FanboySocialMedia,
IETracker,
IFrameEmbeddable,
Paywalls,
Tracker,
PopUp,
)
from features.metadata_base import MetadataBase
from lib.config import (
Expand Down Expand Up @@ -66,18 +71,22 @@ def _create_extractors(self):

extractors = [
Advertisement,
Tracker,
IFrameEmbeddable,
ContentSecurityPolicy,
EasyPrivacy,
ExtractLinks,
IETracker,
Cookies,
AntiAdBlock,
EasylistGermany,
FanboyAnnoyance,
FanboyNotification,
FanboySocialMedia,
ContentSecurityPolicy,
AntiAdBlock,
AntiAdBlockGerman,
AntiAdBlockEnglish,
EasylistGermany,
EasylistAdult,
Paywalls,
IETracker,
ExtractLinks,
ContentSecurityPolicy,
IFrameEmbeddable,
PopUp,
]

for extractor in extractors:
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def manager(mocker):
Manager._create_logger = mocker.MagicMock()
Manager._create_api = mocker.MagicMock()
Manager._logger = mocker.MagicMock()
Manager.setup = mocker.MagicMock()

with mock.patch("manager.Manager.run"):
manager = Manager()
Expand All @@ -26,7 +27,7 @@ def test_init(manager: Manager, mocker):
assert manager._create_logger.call_count == 1
assert manager._create_api.call_count == 1
assert run_spy.call_count == 0
assert len(manager.metadata_extractors) == 13
assert len(manager.metadata_extractors) == 17
assert manager.run_loop


Expand Down
19 changes: 19 additions & 0 deletions tests/unit/metadatabase_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,19 +97,38 @@ def test_under_start(metadatabase: MetadataBase, mocker):

def test_setup(metadatabase: MetadataBase, mocker):
metadatabase._download_tag_list = mocker.MagicMock()
metadatabase._download_multiple_tag_lists = mocker.MagicMock()
extract_date_from_list_spy = mocker.spy(
metadatabase, "_extract_date_from_list"
)
prepare_tag_spy = mocker.spy(metadatabase, "_prepare_tag_list")

metadatabase.setup()
assert metadatabase._download_tag_list.call_count == 0
assert metadatabase._download_multiple_tag_lists.call_count == 0
assert extract_date_from_list_spy.call_count == 0
assert prepare_tag_spy.call_count == 0

metadatabase.url = "hello"
metadatabase.setup()
assert metadatabase._download_tag_list.call_count == 1
assert metadatabase._download_multiple_tag_lists.call_count == 0
assert extract_date_from_list_spy.call_count == 0
assert prepare_tag_spy.call_count == 0

metadatabase.url = ""
metadatabase.urls = ["Hello1", "Hello2"]
metadatabase.setup()
assert metadatabase._download_tag_list.call_count == 1
assert metadatabase._download_multiple_tag_lists.call_count == 1
assert extract_date_from_list_spy.call_count == 0
assert prepare_tag_spy.call_count == 0

metadatabase.tag_list = ["!Hello"]
metadatabase.urls = []
metadatabase.setup()
assert metadatabase._download_tag_list.call_count == 1
assert metadatabase._download_multiple_tag_lists.call_count == 1
assert extract_date_from_list_spy.call_count == 1
assert prepare_tag_spy.call_count == 1

Expand Down

0 comments on commit b8608a2

Please sign in to comment.