Skip to content

Commit

Permalink
Extracting text and images from Word (.docx) documents (#11)
Browse files Browse the repository at this point in the history
* - WIP: Loading and extracting .docx documents

* - WIP: Loading and extracting .docx documents
- Extracting raw text

* - Extracting images from .docx
  • Loading branch information
RobertMeissner authored Nov 10, 2020
1 parent 10c8887 commit 864dcd8
Show file tree
Hide file tree
Showing 6 changed files with 184 additions and 2 deletions.
54 changes: 53 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ uvicorn = "^0.12.2"
requests = "^2.24.0"
bs4 = "^0.0.1"
adblockparser = "^0.7"
lxml = "^4.6.1"

[tool.poetry.dev-dependencies]
pylint = "^2.6.0"
Expand Down
37 changes: 37 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,43 @@ h11==0.11.0 \
idna==2.10; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" \
--hash=sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0 \
--hash=sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6
lxml==4.6.1; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0") \
--hash=sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51 \
--hash=sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a \
--hash=sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f \
--hash=sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5 \
--hash=sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4 \
--hash=sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b \
--hash=sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b \
--hash=sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d \
--hash=sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3 \
--hash=sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360 \
--hash=sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810 \
--hash=sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f \
--hash=sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a \
--hash=sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1 \
--hash=sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856 \
--hash=sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8 \
--hash=sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c \
--hash=sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5 \
--hash=sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d \
--hash=sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f \
--hash=sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1 \
--hash=sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311 \
--hash=sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a \
--hash=sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9 \
--hash=sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891 \
--hash=sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf \
--hash=sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b \
--hash=sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9 \
--hash=sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc \
--hash=sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f \
--hash=sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301 \
--hash=sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b \
--hash=sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174 \
--hash=sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd \
--hash=sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230 \
--hash=sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367
pydantic==1.7.2; python_version >= "3.6" \
--hash=sha256:dfaa6ed1d509b5aef4142084206584280bb6e9014f01df931ec6febdad5b200a \
--hash=sha256:2182ba2a9290964b278bcc07a8d24207de709125d520efec9ad6fa6f92ee058d \
Expand Down
90 changes: 90 additions & 0 deletions src/features/extract_from_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import base64
import os
import zipfile
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from features.metadata_base import MetadataBase


class ExtractFromFiles(MetadataBase):
    """Find .docx links in HTML content and extract their raw text and images."""

    def _load_docx(self, docx, filename):
        """Download the document at URL *docx* and store it locally as *filename*.

        Logs a warning (and skips writing) when the download does not succeed.
        """
        result = requests.get(docx)
        if result.status_code == 200:
            # NOTE(review): splitting the raw response text into lines looks
            # copy-pasted from a tag-list feature — confirm this is intended
            # for binary .docx payloads.
            self.tag_list = result.text.splitlines()
            # Use a context manager so the file handle is always closed,
            # and only persist the payload on a successful download.
            with open(filename, "wb") as file:
                file.write(result.content)
        else:
            self._logger.warning(
                f"Downloading tag list from '{docx}' yielded status code '{result.status_code}'."
            )

    @staticmethod
    def _extract_docx(filename) -> dict:
        """Extract raw text and base64-encoded images from a .docx archive.

        Returns a dict with keys "content" (list of text pieces) and
        "images" (archive member name -> base64-encoded image data).
        """
        extracted_content = []
        images = {}

        # A .docx file is a zip archive; close it deterministically when done.
        with zipfile.ZipFile(filename) as document:
            for xml_file in document.filelist:
                if xml_file.filename.find(".xml") >= 0:
                    xml_content = document.read(xml_file, pwd=None).decode()
                    soup = BeautifulSoup(xml_content, "xml")

                    # Only the main document body and the first header/footer
                    # parts are searched for text runs.
                    body = None
                    if xml_file.filename == "word/document.xml":
                        body = soup.document.body
                    elif xml_file.filename == "word/footer1.xml":
                        body = soup.ftr
                    elif xml_file.filename == "word/header1.xml":
                        body = soup.hdr

                    text_pieces = []
                    if body:
                        # <w:t> elements carry the visible text of the document.
                        text_pieces = [tag.string for tag in body.find_all("t")]

                    extracted_content += text_pieces
                elif xml_file.filename.find("media") >= 0:
                    # Embedded images live under word/media/; store base64-encoded.
                    image = document.read(xml_file, pwd=None)
                    images.update({xml_file.filename: base64.b64encode(image).decode()})

        return {"content": extracted_content, "images": images}

    def _work_docx(self, docx_files):
        """Extract content from every .docx URL in *docx_files*, keyed by filename."""
        values = {}

        for file in docx_files:
            filename = os.path.basename(urlparse(file).path)
            # TODO(review): download and post-extraction cleanup are disabled
            # (WIP) — _extract_docx currently expects the file to already
            # exist locally.
            # self._load_docx(file, filename)
            content = self._extract_docx(filename)
            values.update({filename: content})
            # os.remove(filename)

        return values

    def _start(self, html_content: str, header: dict) -> dict:
        """Return extracted contents for all .docx files linked in *html_content*."""
        soup = self._create_html_soup(html_content)
        raw_links = self._extract_raw_links(soup)

        # Keep only links whose path extension is exactly ".docx".
        docx_files = [
            link for link in raw_links if os.path.splitext(link)[-1] == ".docx"
        ]

        return self._work_docx(docx_files=docx_files)
2 changes: 2 additions & 0 deletions src/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from app.api import app
from app.communication import ProcessToDaemonCommunication
from features.extract_from_document import ExtractFromFiles
from features.extract_links import ExtractLinks
from features.html_based import (
Advertisement,
Expand Down Expand Up @@ -71,6 +72,7 @@ def _create_extractors(self):
Advertisement,
EasyPrivacy,
ExtractLinks,
ExtractFromFiles,
IETracker,
Cookies,
FanboyAnnoyance,
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_init(manager: Manager, mocker):
assert manager._create_logger.call_count == 1
assert manager._create_api.call_count == 1
assert run_spy.call_count == 0
assert len(manager.metadata_extractors) == 15
assert len(manager.metadata_extractors) == 16
assert manager.run_loop


Expand Down

0 comments on commit 864dcd8

Please sign in to comment.