Skip to content

Commit

Permalink
Handing docker over (#12)
Browse files Browse the repository at this point in the history
* - WIP: Improving API to be understandable from SwaggerUI

* - Improving API to be understandable from SwaggerUI

* - Improving API to be understandable from SwaggerUI

* - Improved API description

* - Updated documentation
  • Loading branch information
RobertMeissner authored Nov 10, 2020
1 parent 864dcd8 commit 2eec371
Show file tree
Hide file tree
Showing 9 changed files with 186 additions and 69 deletions.
24 changes: 20 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,34 @@
# Install

You will need poetry and python3.9:
```shell script
sudo apt-get install python3.9
python3.9 -m pip install poetry
```

# Launching the container

1. Start the container by executing `run.sh` from the main folder, not from `src`
2. The container can be reached on `http://0.0.0.0:5057`

# Accessing the Swagger UI

The Swagger UI of FastApi can be accessed by:
- `http://0.0.0.0:5057/redoc`
- or alternatively `http://0.0.0.0:5057/docs`

# Testing REST

1. Start the container
2. Execute:
```shell script
curl --location --request POST '0.0.0.0:5057/extract_meta' \
--header 'Content-Type: application/json' \
--data-raw '{"url": "here", "html": "cool_content123", "headers": ""}'
```
3. You should get
```shell script
{"url":"here","meta":{...}}
```

# Pre commit
Expand Down
6 changes: 5 additions & 1 deletion dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@ RUN adduser -D extractor

WORKDIR /home/extractor

RUN apk add --update --no-cache --virtual .build-deps g++ python3-dev libxml2 libxml2-dev
RUN apk add libxslt-dev

COPY requirements.txt requirements.txt
# --no-cache-dir keeps the image small; the .build-deps virtual package
# installed above is removed once the wheels are built.
RUN pip install --no-cache-dir -r requirements.txt
RUN apk del .build-deps

COPY src/ .

Expand Down
123 changes: 111 additions & 12 deletions src/app/api.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,137 @@
from typing import Optional

from fastapi import FastAPI
from pydantic import BaseModel
from pydantic import BaseModel, Field

from app.communication import ProcessToDaemonCommunication
from lib.config import MESSAGE_CONTENT, MESSAGE_URL
from lib.config import MESSAGE_HEADERS, MESSAGE_HTML, MESSAGE_URL
from lib.timing import get_utc_now


class MetadataTags(BaseModel):
    """Extraction result for a single metadata category."""

    values: list = Field(
        default=[], description="Raw values found by the metadata extractors."
    )
    probability: float = Field(
        default=-1,
        description="The calculated probability that this metadatum is present in the website.",
    )
    # FIX: the field defaults to None, so the annotation must be
    # Optional[bool] rather than bool (pydantic v1 silently widened the
    # type; being explicit keeps the schema honest).
    decision: Optional[bool] = Field(
        default=None,
        description="A user friendly decision whether or not this metadatum is present in the website.",
    )


class ListTags(BaseModel):
    """Selection flags for the individual metadata extractors.

    Used as ``Input.allow_list``; per that field's description, any key
    set to True selects the corresponding metadatum for extraction.
    All flags default to False.
    """

    advertisement: bool = False
    easy_privacy: bool = False
    extracted_links: bool = False
    extract_from_files: bool = False
    internet_explorer_tracker: bool = False
    cookies: bool = False
    fanboy_annoyance: bool = False
    fanboy_notification: bool = False
    fanboy_social_media: bool = False
    anti_adblock: bool = False
    easylist_germany: bool = False
    easylist_adult: bool = False
    paywall: bool = False
    content_security_policy: bool = False
    iframe_embeddable: bool = False
    pop_up: bool = False


class ExtractorTags(BaseModel):
    """Per-extractor results returned in ``Output.meta``.

    Attribute names mirror the flags in ListTags. Each attribute is None
    until ``_convert_dict_to_output_model`` assigns a MetadataTags value
    for that key.
    """

    advertisement: Optional[MetadataTags]
    easy_privacy: Optional[MetadataTags]
    extracted_links: Optional[MetadataTags]
    extract_from_files: Optional[MetadataTags]
    internet_explorer_tracker: Optional[MetadataTags]
    cookies: Optional[MetadataTags]
    fanboy_annoyance: Optional[MetadataTags]
    fanboy_notification: Optional[MetadataTags]
    fanboy_social_media: Optional[MetadataTags]
    anti_adblock: Optional[MetadataTags]
    easylist_germany: Optional[MetadataTags]
    easylist_adult: Optional[MetadataTags]
    paywall: Optional[MetadataTags]
    content_security_policy: Optional[MetadataTags]
    iframe_embeddable: Optional[MetadataTags]
    pop_up: Optional[MetadataTags]


class Input(BaseModel):
    """Request body for ``POST /extract_meta``.

    FIX: removed leftover pre-refactor field lines (``url: str`` and
    ``content: dict``) that survived from the old model definition;
    ``content`` would otherwise remain an unintended required field.
    """

    url: str = Field(..., description="The base url of the scraped website.")
    html: str = Field(
        ..., description="Everything scraped from the website as text."
    )
    headers: str = Field(
        ..., description="The response header interpretable as dict."
    )
    allow_list: Optional[ListTags] = Field(
        default=None,
        description="A list of key:bool pairs. "
        "Any metadata key == True will be extracted. "
        "If this list is not given, all values will be extracted.",
    )


class Output(BaseModel):
    """Response body for ``POST /extract_meta``.

    FIX: removed leftover pre-refactor field lines (``url: str`` and
    ``meta: dict``); annotated None-defaulted fields as Optional to
    match their defaults.
    """

    url: str = Field(..., description="The base url of the scraped website.")
    meta: Optional[ExtractorTags] = Field(
        default=None,
        description="The extracted metadata.",
    )
    exception: Optional[str] = Field(
        default=None,
        description="A description of the exception which caused the extraction to fail.",
    )


# FIX: removed the leftover bare `app = FastAPI()` line that preceded
# the configured constructor call in the diff.
app = FastAPI(title="Metadata Extractor", version="0.1")
# Bare annotation only documents the attribute's intended type; the
# queue itself is presumably attached at startup elsewhere — TODO confirm.
app.api_queue: ProcessToDaemonCommunication


def _convert_dict_to_output_model(meta: dict) -> ExtractorTags:
    """Map a raw metadata dict onto the typed ExtractorTags model.

    Only keys that are ExtractorTags fields and that carry a "values"
    entry are copied; all other fields keep their default (None).
    """
    out = ExtractorTags()
    for key in ExtractorTags.__fields__.keys():
        if key in meta.keys() and "values" in meta[key]:
            out.__setattr__(
                key,
                MetadataTags(
                    # BUG FIX: the keyword must be "values" (the model
                    # field name). The old "value=" keyword was silently
                    # ignored by pydantic, leaving the default empty list.
                    values=meta[key]["values"],
                    probability=-1,
                    decision=False,
                ),
            )
    return out


@app.post("/extract_meta", response_model=Output)
def extract_meta(input_data: Input):
    """Extract metadata for the given scraped website content.

    Sends the payload to the extraction daemon via the process queue and
    waits for its reply; returns an Output with either the extracted
    metadata or an exception message when no response arrives.
    """
    starting_extraction = get_utc_now()
    uuid = app.api_queue.send_message(
        {
            MESSAGE_URL: input_data.url,
            MESSAGE_HTML: input_data.html,
            MESSAGE_HEADERS: input_data.headers,
        }
    )

    meta_data: dict = app.api_queue.get_message(uuid)

    if meta_data:
        meta_data.update(
            {"time_until_complete": get_utc_now() - starting_extraction}
        )
        # BUG FIX: the converted model was computed and then discarded,
        # with the raw dict passed to Output instead. Use the typed
        # conversion result, as response_model=Output expects.
        extractor_tags = _convert_dict_to_output_model(meta_data)

        out = Output(url=input_data.url, meta=extractor_tags)
    else:
        message = "No response from metadata extractor."
        out = Output(url=input_data.url, exception=message)

    return out


Expand Down
2 changes: 1 addition & 1 deletion src/app/communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def _receive_message(self) -> bool:
def get_message(self, uuid: UUID) -> Optional[dict]:
tries = 1 # TODO: possible growing dict with each failed attempt
self._receive_message()
while uuid not in self._request_queue.keys() and tries <= 25:
while uuid not in self._request_queue.keys() and tries <= 60:
print(f"waited {tries} times for {uuid}")
self._receive_message()
tries += 1
Expand Down
74 changes: 37 additions & 37 deletions src/features/html_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,21 @@
class Advertisement(MetadataBase):
    # Filter lists consulted for this extractor.
    # FIX: removed the leftover pre-diff easylist.to sublist URLs that
    # duplicated the new github.com entries below.
    # NOTE(review): the github.com/.../blob/... URLs point at HTML pages,
    # not raw list files — raw.githubusercontent.com may be intended;
    # verify how MetadataBase fetches these.
    urls = [
        "https://easylist.to/easylist/easylist.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_adservers.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_adservers_popup.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_allowlist.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_allowlist_dimensions.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_allowlist_general_hide.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_allowlist_popup.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_general_block.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_general_block_dimensions.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_general_block_popup.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_general_hide.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_specific_block.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_specific_block_popup.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_specific_hide.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_thirdparty.txt",
        "https://github.com/easylist/easylist/blob/master/easylist/easylist_thirdparty_popup.txt",
    ]


Expand Down Expand Up @@ -61,39 +61,39 @@ class Cookies(MetadataBase):
class FanboyAnnoyance(MetadataBase):
    # FIX: removed the leftover pre-diff easylist.to sublist URLs that
    # duplicated the new github.com entries below.
    urls = [
        "https://easylist.to/easylist/fanboy-annoyance.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_annoyance_allowlist.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_annoyance_allowlist_general_hide.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_annoyance_general_block.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_annoyance_general_hide.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_annoyance_international.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_annoyance_specific_block.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_annoyance_thirdparty.txt",
    ]


class FanboyNotification(MetadataBase):
    # FIX: removed the leftover pre-diff easylist.to sublist URLs that
    # duplicated the new github.com entries below.
    urls = [
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_notifications_allowlist.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_notifications_allowlist_general_hide.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_notifications_general_block.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_notifications_general_hide.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_notifications_specific_block.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_notifications_specific_hide.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_notifications_thirdparty.txt",
    ]


class FanboySocialMedia(MetadataBase):
    # FIX: removed the leftover pre-diff easylist.to sublist URLs that
    # duplicated the new github.com entries below.
    urls = [
        "https://easylist.to/easylist/fanboy-social.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_social_allowlist.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_social_allowlist_general_hide.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_social_general_block.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_social_general_hide.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_social_international.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_social_specific_block.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_social_specific_hide.txt",
        "https://github.com/easylist/easylist/blob/master/fanboy-addon/fanboy_social_thirdparty.txt",
    ]


Expand Down
2 changes: 1 addition & 1 deletion src/features/metadata_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def start(self, html_content: str = "", header=None) -> dict:
def _work_header(self, header):
if len(self.tag_list) == 1:
values = (
header[self.tag_list[0]] if self.tag_list[0] in header else ""
header[self.tag_list[0]] if self.tag_list[0] in header else []
)
else:
values = [header[ele] for ele in self.tag_list if ele in header]
Expand Down
5 changes: 2 additions & 3 deletions src/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,10 +199,9 @@ def handle_content(self, request):
self._logger.debug(f"request: {request}")
for uuid, message in request.items():
self._logger.debug(f"message: {message}")
content = message[MESSAGE_CONTENT]
# TODO A lot of information needs to be known here
html_content = content[MESSAGE_HTML]
header_content = self._preprocess_header(content[MESSAGE_HEADERS])
html_content = message[MESSAGE_HTML]
header_content = self._preprocess_header(message[MESSAGE_HEADERS])

starting_extraction = get_utc_now()

Expand Down
15 changes: 8 additions & 7 deletions tests/integration/api_integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,16 @@ def test_api_extract_meta():
url = DOCKER_TEST_URL + "extract_meta"

payload = (
'{"url": "here", "content": '
'{"html": "<OAI-PMH xmlns=\\"http://www.openarchives.org/OAI/2.0/\\" '
'{"url": "here",'
'"html": "<OAI-PMH xmlns=\\"http://www.openarchives.org/OAI/2.0/\\" '
'xmlns:xsi=\\"http://www.w3.org/2001/XMLSchema-instance\\" '
'xsi:schemaLocation=\\"http://www.openarchives.org/OAI/2.0/ '
'http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd\\">'
'<responseDate>2020-10-23T13:58:02Z</responseDate><request verb=\\"GetRecord\\" '
'identifier=\\"4757e9ca-8829-4377-b0dd-680f1b2b4595\\" metadataPrefix=\\"lom\\">'
"https://cloud.schulcampus-rlp.de/edu-sharing</request><GetRecord><record><header>"
"<identifier>4757e9ca-8829-4377-b0dd-680f1b2b4595</identifier><datestamp>2020-10-23T13:58:02Z"
'</datestamp></header><metadata><lom xmlns=\\"http://ltsc.ieee.org/xsd/LOM\\" xmlns:xsi=\\'
'</datestamp></header><metadata><lom xmlns=\\"http://ltsc.ieee.org/xsd/LOM\\" xmlns:xsi=\\"'
'"http://www.w3.org/2001/XMLSchema-instance\\" xsi:schemaLocation=\\"http://ltsc.ieee.org/xsd/LOM '
'http://ltsc.ieee.org/xsd/lomv1.0/lom.xsd\\">\\n <general>\\n <identifier>\\n '
"<catalog>local</catalog>\\n <entry>4757e9ca-8829-4377-b0dd-680f1b2b4595</entry>\\n "
Expand Down Expand Up @@ -116,10 +116,11 @@ def test_api_extract_meta():
"https://cloud.schulcampus-rlp.de/edu-sharing/preview?nodeId=4757e9ca-8829-4377-b0dd-680f1b2b4595&amp;"
"storeProtocol=workspace&amp;storeId=SpacesStore&amp;dontcache=1603461482271</string>\\n "
"</description>\\n </resource>\\n </relation>\\n</lom></metadata></record></GetRecord>"
"</OAI-PMH>\", \"headers\":\"{b'Date': [b'Fri, 23 Oct 2020 13:58:01 GMT'], b'Server': [b'Apache'], "
"b'Access-Control-Expose-Headers': [b'X-Edu-Scope'], b'Cache-Control': [b'no-cache'], "
"b'Expires': [b'0'], b'Content-Type': [b'application/xml'], b'Vary': [b'Accept-Encoding'],"
" b'X-Content-Type-Options': [b'nosniff'], b'X-Frame-Options': [b'sameorigin']}\"}}"
"</OAI-PMH>', "
'"headers":"{b"Date": [b"Fri, 23 Oct 2020 13:58:01 GMT"], b"Server": [b"Apache"], '
'b"Access-Control-Expose-Headers": [b"X-Edu-Scope"], b"Cache-Control": [b"no-cache"], '
'b"Expires": [b"0"], b"Content-Type": [b"application/xml"], b"Vary": [b"Accept-Encoding"],'
' b"X-Content-Type-Options": [b"nosniff"], b"X-Frame-Options": [b"sameorigin"]}\'}}'
)

_build_and_run_docker()
Expand Down
4 changes: 1 addition & 3 deletions tests/unit/manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,7 @@ def test_handle_content(manager: Manager, mocker):
assert manager._preprocess_header.call_count == 0
assert manager._extract_meta_data.call_count == 0

request = {
"some_uuid": {"content": {"html": empty_html, "headers": empty_header}}
}
request = {"some_uuid": {"html": empty_html, "headers": empty_header}}

manager.manager_to_api_queue = mocker.MagicMock()
manager.handle_content(request)
Expand Down

0 comments on commit 2eec371

Please sign in to comment.