From 3aadde10c4e930d0b0493d20aef323e45ec631e6 Mon Sep 17 00:00:00 2001
From: RMeissnerCC <50482279+RMeissnerCC@users.noreply.github.com>
Date: Fri, 6 Nov 2020 09:08:43 +0100
Subject: [PATCH] Adblock list filters (#9)

* - Implemented popup check
  - WIP: Migrated Adpy to Python 3.9, refactoring and cleanup

* - WIP: Unit tests

* - Replaced adpy with adblockparser
  - Refactored the list regex into MetadataBase

* - Cleanup
  - Refactored names to conform to pylint

* - Cleanup
  - Updated the stop call to target the actual docker container
---
 poetry.lock                                 | 14 +++++-
 pyproject.toml                              |  1 +
 requirements.txt                            |  3 ++
 .../{ExtractLinks.py => extract_links.py}   | 10 ++--
 src/features/html_based.py                  | 15 +++++-
 .../{MetadataBase.py => metadata_base.py}   | 28 ++++++++++-
 src/manager.py                              |  5 +-
 tests/integration/api_integration_test.py   | 30 ++++++++++--
 tests/unit/metadatabase_test.py             | 49 +++++++++++++++++--
 9 files changed, 133 insertions(+), 22 deletions(-)
 rename src/features/{ExtractLinks.py => extract_links.py} (93%)
 rename src/features/{MetadataBase.py => metadata_base.py} (75%)

diff --git a/poetry.lock b/poetry.lock
index f8b3ed8..8686744 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,3 +1,11 @@
+[[package]]
+name = "adblockparser"
+version = "0.7"
+description = "Parser for Adblock Plus rules"
+category = "main"
+optional = false
+python-versions = "*"
+
 [[package]]
 name = "appdirs"
 version = "1.4.4"
@@ -476,9 +484,13 @@ python-versions = "*"
 
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9.0rc1"
-content-hash = "91cfc65995f0404f7ec5a67428ee9881b4ab930c19bf887241e821377f430a64"
+content-hash = "38ace6f5b9d97750fffdf6c2b5464e10fc78b938814f22c14862de9941c9123e"
 
 [metadata.files]
+adblockparser = [
+    {file = "adblockparser-0.7-py2.py3-none-any.whl", hash = "sha256:90662a6397d441898e8757553997e0a6cd09c8e56fee4e4ac61b32bda65bad3e"},
+    {file = "adblockparser-0.7.tar.gz", hash = "sha256:7a3407ddc31a29e42732bbcb04f3677c6959bffa1ea9d712afd498e0b4d09b22"},
+]
 appdirs = [
     {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"},
     {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"},
diff --git a/pyproject.toml b/pyproject.toml
index 78f5e53..afaad11 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ fastapi = "^0.61.1"
 uvicorn = "^0.12.2"
 requests = "^2.24.0"
 bs4 = "^0.0.1"
+adblockparser = "^0.7"
 
 [tool.poetry.dev-dependencies]
 pylint = "^2.6.0"
diff --git a/requirements.txt b/requirements.txt
index 452df81..5c33d36 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,6 @@
+adblockparser==0.7 \
+    --hash=sha256:90662a6397d441898e8757553997e0a6cd09c8e56fee4e4ac61b32bda65bad3e \
+    --hash=sha256:7a3407ddc31a29e42732bbcb04f3677c6959bffa1ea9d712afd498e0b4d09b22
 beautifulsoup4==4.9.3 \
     --hash=sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35 \
     --hash=sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666 \
diff --git a/src/features/ExtractLinks.py b/src/features/extract_links.py
similarity index 93%
rename from src/features/ExtractLinks.py
rename to src/features/extract_links.py
index 6f096e1..e0b42d2 100644
--- a/src/features/ExtractLinks.py
+++ b/src/features/extract_links.py
@@ -2,7 +2,7 @@
 
 from bs4 import BeautifulSoup
 
-from features.MetadataBase import MetadataBase
+from features.metadata_base import MetadataBase
 
 
 class ExtractLinks(MetadataBase):
@@ -202,10 +202,6 @@ class ExtractLinks(MetadataBase):
         "\u202e",
     ]
 
-    @staticmethod
-    def __extract_raw_links(soup: BeautifulSoup) -> list:
-        return list({a["href"] for a in soup.find_all(href=True)})
-
     @staticmethod
     def __extract_extensions(links: list):
         file_extensions = [os.path.splitext(link)[-1] for link in links]
@@ -226,9 +222,9 @@ def __extract_malicious_extensions(self, extensions: list) -> list:
         ]
 
     def _start(self, html_content: str, header: dict) -> dict:
-        soup = BeautifulSoup(html_content, "html.parser")
+        soup = self._create_html_soup(html_content)
 
-        raw_links = self.__extract_raw_links(soup)
+        raw_links = self._extract_raw_links(soup)
         image_links = self.__extract_images(soup)
         extensions = self.__extract_extensions(raw_links)
         malicious_extensions = self.__extract_malicious_extensions(extensions)
diff --git a/src/features/html_based.py b/src/features/html_based.py
index fbd24f6..922a667 100644
--- a/src/features/html_based.py
+++ b/src/features/html_based.py
@@ -1,4 +1,4 @@
-from features.MetadataBase import MetadataBase
+from features.metadata_base import MetadataBase
 
 
 class Advertisement(MetadataBase):
@@ -66,3 +66,16 @@ class IFrameEmbeddable(MetadataBase):
     tag_list = ["X-Frame-Options"]
     key: str = "iframe_embeddable"
     evaluate_header = True
+
+
+class PopUp(MetadataBase):
+    tag_list = [
+        "popup",
+        "popuptext",
+        "modal",
+        "modal fade",
+        "modal-dialog",
+        "interstitial",
+        "Interstitial",
+    ]
+    key = "popup"
diff --git a/src/features/MetadataBase.py b/src/features/metadata_base.py
similarity index 75%
rename from src/features/MetadataBase.py
rename to src/features/metadata_base.py
index 8e1429f..39a298b 100644
--- a/src/features/MetadataBase.py
+++ b/src/features/metadata_base.py
@@ -1,6 +1,8 @@
 import re
 
+import adblockparser
 import requests
+from bs4 import BeautifulSoup
 
 from lib.timing import get_utc_now
 
@@ -47,9 +49,31 @@ def _work_header(self, header):
         values = [header[ele] for ele in self.tag_list if ele in header]
         return values
 
+    @staticmethod
+    def _create_html_soup(html_content: str) -> BeautifulSoup:
+        return BeautifulSoup(html_content, "html.parser")
+
+    @staticmethod
+    def _extract_raw_links(soup: BeautifulSoup) -> list:
+        return list({a["href"] for a in soup.find_all(href=True)})
+
     def _work_html_content(self, html_content):
         if self.tag_list:
-            values = [ele for ele in self.tag_list if ele in html_content]
+            if self.url.find("easylist") >= 0:
+                # TODO: These raw_links can be stored for further analysis in other Features -> how to store?
+                soup = self._create_html_soup(html_content)
+                raw_links = self._extract_raw_links(soup)
+
+                rules = adblockparser.AdblockRules(self.tag_list)
+                values = []
+                for url in raw_links:
+                    is_blocked = rules.should_block(url)
+                    if is_blocked:
+                        values.append(url)
+            else:
+                values = [
+                    ele for ele in self.tag_list if html_content.find(ele) >= 0
+                ]
         else:
             values = []
         return values
@@ -64,7 +88,7 @@ def _start(self, html_content: str, header: dict) -> dict:
     def _download_tag_list(self) -> None:
         result = requests.get(self.url)
         if result.status_code == 200:
-            self.tag_list = result.text.split("\n")
+            self.tag_list = result.text.splitlines()
         else:
             self._logger.warning(
                 f"Downloading tag list from '{self.url}' yielded status code '{result.status_code}'."
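
Note on the easylist branch added to `_work_html_content` above: when the feature's `url` contains "easylist", the downloaded `tag_list` is treated as Adblock Plus rules and each link pulled out of the page is kept only if adblockparser would block it; every other feature (for example the new `PopUp` class) keeps the plain substring scan over `html_content`. A minimal standalone sketch of that flow, with illustrative rule strings and URLs rather than a real easylist download:

    import adblockparser

    # Adblock Plus syntax; "!" lines are comments and are skipped by the parser.
    raw_rules = [
        "! illustrative rules, not a real easylist",
        "||ads.example.com^",
        "/banner/*/img^",
    ]
    rules = adblockparser.AdblockRules(raw_rules)

    # Stand-in for the links _extract_raw_links would collect from the soup.
    raw_links = [
        "http://ads.example.com/pixel.gif",
        "http://example.com/banner/foo/img",
        "http://example.com/about.html",
    ]

    # Mirrors the easylist branch: keep only the links the rules would block.
    values = [link for link in raw_links if rules.should_block(link)]
    assert values == raw_links[:2]

For full-size lists, AdblockRules backed by the stdlib re module can be slow; adblockparser can optionally use the pyre2 package (use_re2=True) to speed up matching, which may become relevant once features point their `url` at a real easylist file.
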
diff --git a/src/manager.py b/src/manager.py
index 60f999b..dcc8251 100644
--- a/src/manager.py
+++ b/src/manager.py
@@ -10,7 +10,7 @@
 
 from app.api import app
 from app.communication import ProcessToDaemonCommunication
-from features.ExtractLinks import ExtractLinks
+from features.extract_links import ExtractLinks
 from features.html_based import (
     Advertisement,
     AntiAdBlock,
@@ -24,7 +24,7 @@
     Paywalls,
     Tracker,
 )
-from features.MetadataBase import MetadataBase
+from features.metadata_base import MetadataBase
 from lib.config import (
     LOGFILE_MANAGER,
     MESSAGE_CONTENT,
@@ -105,7 +105,6 @@ def _create_logger(self):
         log_15_mb_limit = 1024 * 1024 * 15
         backup_count = 10000
 
-        print(data_path)
         if not os.path.exists(data_path):
             os.mkdir(data_path, mode=0o755)
 
diff --git a/tests/integration/api_integration_test.py b/tests/integration/api_integration_test.py
index fbb1f8d..3f44a53 100644
--- a/tests/integration/api_integration_test.py
+++ b/tests/integration/api_integration_test.py
@@ -5,6 +5,7 @@
 from json import JSONDecodeError
 from pathlib import Path
 
+import pytest
 import requests
 
 DOCKER_TEST_URL = "http://0.0.0.0:5057/"
@@ -21,12 +22,28 @@ def _build_and_run_docker():
     )
     os.chdir(new_dir)
 
+    print("Exporting requirements")
+    process = subprocess.call(
+        ["poetry export -o requirements.txt"], shell=True
+    )
+    print(f"process after exporting requirements: {process}")
+
     print("building docker")
     process = subprocess.call(
         ["docker build -t oeh-search-meta:latest ."], shell=True
     )
     print(f"process after building docker: {process}")
+
+    # stop any old containers built from this image prior to launch
+
+    subprocess.call(
+        [
+            "docker stop $(docker ps -a -q --filter ancestor=oeh-search-meta:latest --format='{{.ID}}')"
+        ],
+        shell=True,
+    )
+
     process = subprocess.Popen(
         ["docker-compose -f docker-compose.yml up"], shell=True
     )
@@ -40,11 +57,18 @@ def _build_and_run_docker():
 
 def _stop_docker():
     process = subprocess.call(
-        ["docker stop oeh-search-meta_extractor_1"], shell=True
+        [
+            "docker stop $(docker ps -a -q --filter ancestor=oeh-search-meta:latest --format='{{.ID}}')"
+        ],
+        shell=True,
     )
+    print(f"process after docker stop: {process}")
 
 
+@pytest.mark.skip(
+    reason="This test takes a lot of time, depending on payload etc. Execute manually."
+)
 def test_api_extract_meta():
     url = DOCKER_TEST_URL + "extract_meta"
 
@@ -101,7 +125,7 @@ def test_api_extract_meta():
     _build_and_run_docker()
 
     response = requests.request(
-        "POST", url, headers=DOCKER_TEST_HEADERS, data=payload, timeout=20
+        "POST", url, headers=DOCKER_TEST_HEADERS, data=payload, timeout=60
     )
 
     try:
@@ -122,7 +146,7 @@ def test_ping_container():
     _build_and_run_docker()
 
     response = requests.request(
-        "GET", url, headers=DOCKER_TEST_HEADERS, timeout=20
+        "GET", url, headers=DOCKER_TEST_HEADERS, timeout=60
     )
 
     data = json.loads(response.text)
diff --git a/tests/unit/metadatabase_test.py b/tests/unit/metadatabase_test.py
index 4fe955d..5d81d07 100644
--- a/tests/unit/metadatabase_test.py
+++ b/tests/unit/metadatabase_test.py
@@ -1,6 +1,7 @@
+import adblockparser
 import pytest
 
-from features.MetadataBase import MetadataBase
+from features.metadata_base import MetadataBase
 
 
 @pytest.fixture
@@ -34,7 +35,7 @@ def test_start(metadatabase: MetadataBase, mocker):
     html_content = "html_content"
     header = "header"
     metadatabase.key = "test_key"
-    _start_spy = mocker.spy(metadatabase, "_start")
+    start_spy = mocker.spy(metadatabase, "_start")
     values = metadatabase.start(html_content=html_content, header=header)
 
     values_has_only_one_key = len(values.keys()) == 1
@@ -48,14 +49,14 @@ def test_start(metadatabase: MetadataBase, mocker):
     if "tag_list_last_modified" in values[metadatabase.key].keys():
         assert values[metadatabase.key]["tag_list_last_modified"] == ""
         assert values[metadatabase.key]["tag_list_expires"] == 0
-    assert _start_spy.call_count == 1
-    assert _start_spy.call_args_list[0][1] == {
+    assert start_spy.call_count == 1
+    assert start_spy.call_args_list[0][1] == {
         html_content: html_content,
         header: header,
     }
 
     _ = metadatabase.start(html_content=html_content)
-    assert _start_spy.call_args_list[1][1] == {
+    assert start_spy.call_args_list[1][1] == {
         html_content: html_content,
         header: {},
     }
@@ -111,3 +112,41 @@ def test_setup(metadatabase: MetadataBase, mocker):
     assert metadatabase._download_tag_list.call_count == 1
     assert extract_date_from_list_spy.call_count == 1
     assert prepare_tag_spy.call_count == 1
+
+
+"""
+--------------------------------------------------------------------------------
+"""
+
+
+def _create_sample_easylist() -> list:
+    easylist = [
+        "! *** easylist:easylist/easylist_general_block.txt ***",
+        r"/adv_horiz.",
+        r"@@||imx.to/dropzone.js$script",
+        r"||testimx.to/dropzone.js$script",
+        r"||testimx2.to/dropzone.js",
+    ]
+
+    return easylist
+
+
+def _create_sample_urls() -> list:
+    url_to_be_blocked = [
+        ("https://www.dictionary.com/", False),
+        ("/adv_horiz.", True),
+        ("imx.to/dropzone.js", False),
+        ("testimx.to/dropzone.js", False),
+        ("testimx2.to/dropzone.js", True),
+    ]
+    return url_to_be_blocked
+
+
+def test_easylist_filter():
+    urls_to_be_blocked = _create_sample_urls()
+
+    rules = adblockparser.AdblockRules(_create_sample_easylist())
+
+    for url, to_be_blocked in urls_to_be_blocked:
+        result = rules.should_block(url)
+        assert result == to_be_blocked
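
A remark on the expectations in `_create_sample_urls`: adblockparser skips rules whose options cannot be evaluated with the options passed to `should_block` (the default `skip_unsupported_rules=True` behaviour). That is why neither `||testimx.to/dropzone.js$script` nor the exception `@@||imx.to/dropzone.js$script` fires in `test_easylist_filter`, while the option-free `||testimx2.to/dropzone.js` does. A short sketch of how supplying the option changes the outcome, reusing the sample rules from the test:

    import adblockparser

    rules = adblockparser.AdblockRules(
        [
            r"||testimx.to/dropzone.js$script",
            r"||testimx2.to/dropzone.js",
        ]
    )

    # Without options, the $script rule cannot be evaluated and is skipped.
    assert rules.should_block("testimx.to/dropzone.js") is False
    # The option-free rule still matches.
    assert rules.should_block("testimx2.to/dropzone.js") is True

    # With the script option supplied, the $script rule participates in matching.
    assert rules.should_block("testimx.to/dropzone.js", {"script": True}) is True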