From 864dcd825491fe4c473cefbedd9c519e9558795e Mon Sep 17 00:00:00 2001 From: RMeissnerCC <50482279+RMeissnerCC@users.noreply.github.com> Date: Tue, 10 Nov 2020 07:16:40 +0100 Subject: [PATCH] Extracting from word (#11) * - WIP: Loading and extracting .docx documents * - WIP: Loading and extracting .docx documents - Extracting raw text * - Extracting images from .docx --- poetry.lock | 54 +++++++++++++++- pyproject.toml | 1 + requirements.txt | 37 +++++++++++ src/features/extract_from_document.py | 90 +++++++++++++++++++++++++++ src/manager.py | 2 + tests/unit/manager_test.py | 2 +- 6 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 src/features/extract_from_document.py diff --git a/poetry.lock b/poetry.lock index 8686744..106e2bf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -216,6 +216,20 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +[[package]] +name = "lxml" +version = "4.6.1" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["beautifulsoup4"] +source = ["Cython (>=0.29.7)"] + [[package]] name = "mccabe" version = "0.6.1" @@ -484,7 +498,7 @@ python-versions = "*" [metadata] lock-version = "1.1" python-versions = "^3.9.0rc1" -content-hash = "38ace6f5b9d97750fffdf6c2b5464e10fc78b938814f22c14862de9941c9123e" +content-hash = "7db72fab0e4c85beac22a13b9a6fec7ae187db67fa944d06ca9d66c951ee931b" [metadata.files] adblockparser = [ @@ -626,6 +640,44 @@ lazy-object-proxy = [ {file = "lazy_object_proxy-1.4.3-cp38-cp38-win32.whl", hash = "sha256:5541cada25cd173702dbd99f8e22434105456314462326f06dba3e180f203dfd"}, {file = "lazy_object_proxy-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:59f79fef100b09564bc2df42ea2d8d21a64fdcda64979c0fa3db7bdaabaf6239"}, ] +lxml = [ + {file = "lxml-4.6.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51"}, + {file = "lxml-4.6.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a"}, + {file = "lxml-4.6.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f"}, + {file = "lxml-4.6.1-cp27-cp27m-win32.whl", hash = "sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5"}, + {file = "lxml-4.6.1-cp27-cp27m-win_amd64.whl", hash = "sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4"}, + {file = "lxml-4.6.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b"}, + {file = "lxml-4.6.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b"}, + {file = "lxml-4.6.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d"}, + {file = "lxml-4.6.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3"}, + {file = "lxml-4.6.1-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360"}, + {file = "lxml-4.6.1-cp35-cp35m-win32.whl", hash = "sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810"}, + {file = "lxml-4.6.1-cp35-cp35m-win_amd64.whl", hash = "sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f"}, + {file = "lxml-4.6.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a"}, + {file = "lxml-4.6.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1"}, + {file = "lxml-4.6.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856"}, + {file = "lxml-4.6.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8"}, + {file = "lxml-4.6.1-cp36-cp36m-win32.whl", hash = "sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c"}, + {file = "lxml-4.6.1-cp36-cp36m-win_amd64.whl", hash = "sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5"}, + {file = "lxml-4.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d"}, + {file = "lxml-4.6.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f"}, + {file = "lxml-4.6.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1"}, + {file = "lxml-4.6.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311"}, + {file = "lxml-4.6.1-cp37-cp37m-win32.whl", hash = "sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a"}, + {file = "lxml-4.6.1-cp37-cp37m-win_amd64.whl", hash = "sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9"}, + {file = "lxml-4.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891"}, + {file = "lxml-4.6.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf"}, + {file = "lxml-4.6.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b"}, + {file = "lxml-4.6.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9"}, + {file = "lxml-4.6.1-cp38-cp38-win32.whl", hash = "sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc"}, + {file = "lxml-4.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f"}, + {file = "lxml-4.6.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301"}, + {file = "lxml-4.6.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b"}, + {file = "lxml-4.6.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174"}, + {file = "lxml-4.6.1-cp39-cp39-win32.whl", hash = "sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd"}, + {file = "lxml-4.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230"}, + {file = "lxml-4.6.1.tar.gz", hash = "sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367"}, +] mccabe = [ {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, diff --git a/pyproject.toml b/pyproject.toml index afaad11..6521ad2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ uvicorn = "^0.12.2" requests = "^2.24.0" bs4 = "^0.0.1" adblockparser = "^0.7" +lxml = "^4.6.1" [tool.poetry.dev-dependencies] pylint = "^2.6.0" diff --git a/requirements.txt b/requirements.txt index 5c33d36..650d195 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,6 +25,43 @@ h11==0.11.0 \ idna==2.10; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" \ --hash=sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0 \ --hash=sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6 +lxml==4.6.1; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0") \ + --hash=sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51 \ + --hash=sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a \ + --hash=sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f \ + --hash=sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5 \ + --hash=sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4 \ + --hash=sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b \ + --hash=sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b \ + --hash=sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d \ + --hash=sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3 \ + --hash=sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360 \ + --hash=sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810 \ + --hash=sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f \ + --hash=sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a \ + --hash=sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1 \ + --hash=sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856 \ + --hash=sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8 \ + --hash=sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c \ + --hash=sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5 \ + --hash=sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d \ + --hash=sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f \ + --hash=sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1 \ + --hash=sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311 \ + --hash=sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a \ + --hash=sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9 \ + --hash=sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891 \ + --hash=sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf \ + --hash=sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b \ + --hash=sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9 \ + --hash=sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc \ + --hash=sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f \ + --hash=sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301 \ + --hash=sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b \ + --hash=sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174 \ + --hash=sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd \ + --hash=sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230 \ + --hash=sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367 pydantic==1.7.2; python_version >= "3.6" \ --hash=sha256:dfaa6ed1d509b5aef4142084206584280bb6e9014f01df931ec6febdad5b200a \ --hash=sha256:2182ba2a9290964b278bcc07a8d24207de709125d520efec9ad6fa6f92ee058d \ diff --git a/src/features/extract_from_document.py b/src/features/extract_from_document.py new file mode 100644 index 0000000..fe29b69 --- /dev/null +++ b/src/features/extract_from_document.py @@ -0,0 +1,90 @@ +import base64 +import os +import zipfile +from urllib.parse import urlparse + +import requests +from bs4 import BeautifulSoup + +from features.metadata_base import MetadataBase + + +class ExtractFromFiles(MetadataBase): + def _load_docx(self, docx, filename): + result = requests.get(docx) + if result.status_code == 200: + self.tag_list = result.text.splitlines() + else: + self._logger.warning( + f"Downloading tag list from '{docx}' yielded status code '{result.status_code}'." + ) + + open(filename, "wb").write(result.content) + + @staticmethod + def _extract_docx(filename) -> dict: + document = zipfile.ZipFile(filename) + + xml_files = document.filelist + + extracted_content = [] + images = {} + + for xml_file in xml_files: + if xml_file.filename.find(".xml") >= 0: + content = document.read(xml_file, pwd=None).decode() + soup = BeautifulSoup(content, "xml") + + body = None + if xml_file.filename == "word/document.xml": + body = soup.document.body + elif xml_file.filename == "word/footer1.xml": + body = soup.ftr + elif xml_file.filename == "word/header1.xml": + body = soup.hdr + + text_pieces = [] + if body: + text_pieces = [tag.string for tag in body.find_all("t")] + + extracted_content += text_pieces + elif xml_file.filename.find("media") >= 0: + image = document.read(xml_file, pwd=None) + image = base64.b64encode(image).decode() + + images.update({xml_file.filename: image}) + + content = {"content": extracted_content, "images": images} + + return content + + def _work_docx(self, docx_files): + values = {} + + for file in docx_files: + filename = os.path.basename(urlparse(file).path) + # self._load_docx(file, filename) + content = self._extract_docx(filename) + values.update({filename: content}) + # os.remove(filename) + + return values + + def _start(self, html_content: str, header: dict) -> dict: + soup = self._create_html_soup(html_content) + + raw_links = self._extract_raw_links(soup) + + file_extensions = [os.path.splitext(link)[-1] for link in raw_links] + + docx_files = [ + file + for file, extension in zip(raw_links, file_extensions) + if extension == ".docx" + ] + + values = self._work_docx(docx_files=docx_files) + + content = {**values} + + return content diff --git a/src/manager.py b/src/manager.py index 635f1d3..93913db 100644 --- a/src/manager.py +++ b/src/manager.py @@ -10,6 +10,7 @@ from app.api import app from app.communication import ProcessToDaemonCommunication +from features.extract_from_document import ExtractFromFiles from features.extract_links import ExtractLinks from features.html_based import ( Advertisement, @@ -71,6 +72,7 @@ def _create_extractors(self): Advertisement, EasyPrivacy, ExtractLinks, + ExtractFromFiles, IETracker, Cookies, FanboyAnnoyance, diff --git a/tests/unit/manager_test.py b/tests/unit/manager_test.py index 6ebeed5..4f0f08d 100644 --- a/tests/unit/manager_test.py +++ b/tests/unit/manager_test.py @@ -27,7 +27,7 @@ def test_init(manager: Manager, mocker): assert manager._create_logger.call_count == 1 assert manager._create_api.call_count == 1 assert run_spy.call_count == 0 - assert len(manager.metadata_extractors) == 15 + assert len(manager.metadata_extractors) == 16 assert manager.run_loop