Adblock list filters (#9)
* - Implemented popup check
- WIP: Migrated Adpy to Python 3.9, refactoring and cleanup

* - WIP: Unit tests

* - Replacing adpy with adblockparser
- Refactoring list regex into MetadataBase

* - Cleanup
- Refactoring names to conform to pylint conventions

* - Cleanup
- Updated the docker container stop call to use the actual container name
RobertMeissner authored Nov 6, 2020
1 parent b4d308f commit 3aadde1
Showing 9 changed files with 133 additions and 22 deletions.
14 changes: 13 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -10,6 +10,7 @@ fastapi = "^0.61.1"
uvicorn = "^0.12.2"
requests = "^2.24.0"
bs4 = "^0.0.1"
adblockparser = "^0.7"

[tool.poetry.dev-dependencies]
pylint = "^2.6.0"
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
@@ -1,3 +1,6 @@
adblockparser==0.7 \
--hash=sha256:90662a6397d441898e8757553997e0a6cd09c8e56fee4e4ac61b32bda65bad3e \
--hash=sha256:7a3407ddc31a29e42732bbcb04f3677c6959bffa1ea9d712afd498e0b4d09b22
beautifulsoup4==4.9.3 \
--hash=sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35 \
--hash=sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666 \
Expand Down
10 changes: 3 additions & 7 deletions src/features/ExtractLinks.py → src/features/extract_links.py
@@ -2,7 +2,7 @@

from bs4 import BeautifulSoup

from features.MetadataBase import MetadataBase
from features.metadata_base import MetadataBase


class ExtractLinks(MetadataBase):
@@ -202,10 +202,6 @@ class ExtractLinks(MetadataBase):
"\u202e",
]

@staticmethod
def __extract_raw_links(soup: BeautifulSoup) -> list:
return list({a["href"] for a in soup.find_all(href=True)})

@staticmethod
def __extract_extensions(links: list):
file_extensions = [os.path.splitext(link)[-1] for link in links]
@@ -226,9 +222,9 @@ def __extract_malicious_extensions(self, extensions: list) -> list:
]

def _start(self, html_content: str, header: dict) -> dict:
soup = BeautifulSoup(html_content, "html.parser")
soup = self._create_html_soup(html_content)

raw_links = self.__extract_raw_links(soup)
raw_links = self._extract_raw_links(soup)
image_links = self.__extract_images(soup)
extensions = self.__extract_extensions(raw_links)
malicious_extensions = self.__extract_malicious_extensions(extensions)
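For context, the raw-link helper that moved into the base class is a set comprehension over every href attribute; a minimal sketch of its behaviour (the sample HTML is made up):

from bs4 import BeautifulSoup

html = '<a href="https://example.com/">a</a><a href="/adv_horiz.">b</a><img src="x.png">'
soup = BeautifulSoup(html, "html.parser")

# Mirrors MetadataBase._extract_raw_links: collect all href values,
# deduplicate via a set comprehension, and return them as a list.
raw_links = list({tag["href"] for tag in soup.find_all(href=True)})
# -> ['https://example.com/', '/adv_horiz.'] (set order is not guaranteed)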
15 changes: 14 additions & 1 deletion src/features/html_based.py
@@ -1,4 +1,4 @@
from features.MetadataBase import MetadataBase
from features.metadata_base import MetadataBase


class Advertisement(MetadataBase):
@@ -66,3 +66,16 @@ class IFrameEmbeddable(MetadataBase):
tag_list = ["X-Frame-Options"]
key: str = "iframe_embeddable"
evaluate_header = True


class PopUp(MetadataBase):
tag_list = [
"popup",
"popuptext",
"modal",
"modal fade",
"modal-dialog",
"interstitial",
"Interstitial",
]
key = "popup"
28 changes: 26 additions & 2 deletions src/features/MetadataBase.py → src/features/metadata_base.py
@@ -1,6 +1,8 @@
import re

import adblockparser
import requests
from bs4 import BeautifulSoup

from lib.timing import get_utc_now

@@ -47,9 +49,31 @@ def _work_header(self, header):
values = [header[ele] for ele in self.tag_list if ele in header]
return values

@staticmethod
def _create_html_soup(html_content: str) -> BeautifulSoup:
return BeautifulSoup(html_content, "html.parser")

@staticmethod
def _extract_raw_links(soup: BeautifulSoup) -> list:
return list({a["href"] for a in soup.find_all(href=True)})

def _work_html_content(self, html_content):
if self.tag_list:
values = [ele for ele in self.tag_list if ele in html_content]
if self.url.find("easylist") >= 0:
# TODO: These raw_links can be stored for further analysis in other Features -> how to store?
soup = self._create_html_soup(html_content)
raw_links = self._extract_raw_links(soup)

rules = adblockparser.AdblockRules(self.tag_list)
values = []
for url in raw_links:
is_blocked = rules.should_block(url)
if is_blocked:
values.append(url)
else:
values = [
ele for ele in self.tag_list if html_content.find(ele) >= 0
]
else:
values = []
return values
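The easylist branch above leans on adblockparser's two-call API, AdblockRules plus should_block, as documented by the library; a minimal sketch with made-up rules:

import adblockparser

# Rules use Adblock Plus filter syntax, e.g. lines from a downloaded easylist.
rules = adblockparser.AdblockRules(["||ads.example.com^", "/adv_horiz."])

rules.should_block("http://ads.example.com/banner.png")  # True
rules.should_block("http://example.com/index.html")      # False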
@@ -64,7 +88,7 @@ def _start(self, html_content: str, header: dict) -> dict:
def _download_tag_list(self) -> None:
result = requests.get(self.url)
if result.status_code == 200:
self.tag_list = result.text.split("\n")
self.tag_list = result.text.splitlines()
else:
self._logger.warning(
f"Downloading tag list from '{self.url}' yielded status code '{result.status_code}'."
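A note on the splitlines change above: unlike split("\n"), splitlines handles \r\n endings and leaves no trailing empty entry, which matters for filter lists served with Windows line endings:

text = "rule-one\r\nrule-two\n"

text.split("\n")   # ['rule-one\r', 'rule-two', ''] - stray \r and empty tail
text.splitlines()  # ['rule-one', 'rule-two']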
5 changes: 2 additions & 3 deletions src/manager.py
@@ -10,7 +10,7 @@

from app.api import app
from app.communication import ProcessToDaemonCommunication
from features.ExtractLinks import ExtractLinks
from features.extract_links import ExtractLinks
from features.html_based import (
Advertisement,
AntiAdBlock,
@@ -24,7 +24,7 @@
Paywalls,
Tracker,
)
from features.MetadataBase import MetadataBase
from features.metadata_base import MetadataBase
from lib.config import (
LOGFILE_MANAGER,
MESSAGE_CONTENT,
@@ -105,7 +105,6 @@ def _create_logger(self):
log_15_mb_limit = 1024 * 1024 * 15
backup_count = 10000

print(data_path)
if not os.path.exists(data_path):
os.mkdir(data_path, mode=0o755)

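The constants in _create_logger point at a size-rotated log file; a hedged sketch of what that wiring typically looks like (handler choice and log path are assumptions, not shown in this diff):

import logging
from logging.handlers import RotatingFileHandler

# Assumed wiring: the diff shows only the limits, not the handler itself.
handler = RotatingFileHandler(
    "manager.log",              # hypothetical log path
    maxBytes=1024 * 1024 * 15,  # log_15_mb_limit
    backupCount=10000,          # backup_count
)
logging.getLogger("manager").addHandler(handler)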
30 changes: 27 additions & 3 deletions tests/integration/api_integration_test.py
@@ -5,6 +5,7 @@
from json import JSONDecodeError
from pathlib import Path

import pytest
import requests

DOCKER_TEST_URL = "http://0.0.0.0:5057/"
@@ -21,12 +22,28 @@ def _build_and_run_docker():
)
os.chdir(new_dir)

print("Exporting requirements")
process = subprocess.call(
["poetry export -o requirements.txt"], shell=True
)
print(f"process after exporting requirements: {process}")

print("building docker")
process = subprocess.call(
["docker build -t oeh-search-meta:latest ."], shell=True
)

print(f"process after building docker: {process}")

# stopping any old container of the same name prior to launch

subprocess.call(
[
"docker stop $(docker ps -a -q --filter ancestor=oeh-search-meta:latest --format='{{.ID}}')"
],
shell=True,
)

process = subprocess.Popen(
["docker-compose -f docker-compose.yml up"], shell=True
)
@@ -40,11 +57,18 @@

def _stop_docker():
process = subprocess.call(
["docker stop oeh-search-meta_extractor_1"], shell=True
[
"docker stop $(docker ps -a -q --filter ancestor=oeh-search-meta:latest --format='{{.ID}}')"
],
shell=True,
)

print(f"process after docker stop: {process}")


@pytest.mark.skip(
reason="This test takes a lot of time, depending on payload etc. Execute manually."
)
def test_api_extract_meta():
url = DOCKER_TEST_URL + "extract_meta"

@@ -101,7 +125,7 @@ def test_api_extract_meta():
_build_and_run_docker()

response = requests.request(
"POST", url, headers=DOCKER_TEST_HEADERS, data=payload, timeout=20
"POST", url, headers=DOCKER_TEST_HEADERS, data=payload, timeout=60
)

try:
@@ -122,7 +146,7 @@ def test_ping_container():
_build_and_run_docker()

response = requests.request(
"GET", url, headers=DOCKER_TEST_HEADERS, timeout=20
"GET", url, headers=DOCKER_TEST_HEADERS, timeout=60
)

data = json.loads(response.text)
49 changes: 44 additions & 5 deletions tests/unit/metadatabase_test.py
@@ -1,6 +1,7 @@
import adblockparser
import pytest

from features.MetadataBase import MetadataBase
from features.metadata_base import MetadataBase


@pytest.fixture
@@ -34,7 +35,7 @@ def test_start(metadatabase: MetadataBase, mocker):
html_content = "html_content"
header = "header"
metadatabase.key = "test_key"
_start_spy = mocker.spy(metadatabase, "_start")
start_spy = mocker.spy(metadatabase, "_start")
values = metadatabase.start(html_content=html_content, header=header)

values_has_only_one_key = len(values.keys()) == 1
@@ -48,14 +49,14 @@ def test_start(metadatabase: MetadataBase, mocker):
if "tag_list_last_modified" in values[metadatabase.key].keys():
assert values[metadatabase.key]["tag_list_last_modified"] == ""
assert values[metadatabase.key]["tag_list_expires"] == 0
assert _start_spy.call_count == 1
assert _start_spy.call_args_list[0][1] == {
assert start_spy.call_count == 1
assert start_spy.call_args_list[0][1] == {
html_content: html_content,
header: header,
}

_ = metadatabase.start(html_content=html_content)
assert _start_spy.call_args_list[1][1] == {
assert start_spy.call_args_list[1][1] == {
html_content: html_content,
header: {},
}
@@ -111,3 +112,41 @@ def test_setup(metadatabase: MetadataBase, mocker):
assert metadatabase._download_tag_list.call_count == 1
assert extract_date_from_list_spy.call_count == 1
assert prepare_tag_spy.call_count == 1


"""
--------------------------------------------------------------------------------
"""


def _create_sample_easylist() -> list:
easylist = [
"! *** easylist:easylist/easylist_general_block.txt ***",
r"/adv_horiz.",
r"@@||imx.to/dropzone.js$script",
r"||testimx.to/dropzone.js$script",
r"||testimx2.to/dropzone.js",
]

return easylist


def _create_sample_urls() -> list:
url_to_be_blocked = [
("https://www.dictionary.com/", False),
("/adv_horiz.", True),
("imx.to/dropzone.js", False),
("testimx.to/dropzone.js", False),
("testimx2.to/dropzone.js", True),
]
return url_to_be_blocked


def test_easylist_filter():
urls_to_be_blocked = _create_sample_urls()

rules = adblockparser.AdblockRules(_create_sample_easylist())

for url, to_be_blocked in urls_to_be_blocked:
result = rules.should_block(url) # "http://ads.example.com"
assert result == to_be_blocked
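Two of the expectations above hinge on adblockparser's option handling: @@|| marks an exception rule, and a rule carrying $script is only consulted when that option is passed to should_block. A sketch of the option-aware call, following the library's documented interface:

import adblockparser

rules = adblockparser.AdblockRules([r"||testimx.to/dropzone.js$script"])

# Without options the $script rule is skipped, so nothing blocks the URL.
rules.should_block("http://testimx.to/dropzone.js")  # False

# Supplying the option activates the rule.
rules.should_block("http://testimx.to/dropzone.js", {"script": True})  # True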
