From 5f677e10f0b1446f2e09cd1ff4c2a39d30db3967 Mon Sep 17 00:00:00 2001 From: Uli Date: Sat, 11 Jan 2020 13:29:57 +0100 Subject: [PATCH] feat(): add ms-office formats --- src/documents/models.py | 32 ++++++++++++++++-------- src/documents/views.py | 14 +++++++---- src/paperless_tika/parsers.py | 4 ++- src/paperless_tika/signals.py | 2 +- src/paperless_tika/tests/test_signals.py | 6 ++--- 5 files changed, 38 insertions(+), 20 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index a71edf0f6..0c8824f34 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -197,9 +197,16 @@ class Document(models.Model): TYPE_ODS = "ods" TYPE_ODT = "odt" TYPE_ODP = "odp" + TYPE_XLS = "xls" + TYPE_XLSX = "xlsx" + TYPE_DOC = "doc" + TYPE_DOCX = "docx" + TYPE_PPT = "ppt" + TYPE_PPTX = "pptx" TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF, TYPE_TXT, TYPE_CSV, TYPE_MD, TYPE_ODS, TYPE_ODT, - TYPE_ODP) + TYPE_ODP, TYPE_XLS, TYPE_XLSX, TYPE_DOC, TYPE_DOCX, + TYPE_PPT, TYPE_PPTX) STORAGE_TYPE_UNENCRYPTED = "unencrypted" STORAGE_TYPE_GPG = "gpg" @@ -371,53 +378,58 @@ class FileInfo: non_separated_word=r"([\w,. ]|([^\s]-))" ) ) - - formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv|ods|odt|odp" + # TODO: When one is missing here no test fails! Write one + OFFICE_FORMATS = "ods|odt|odp|xlsx?|docx?|pptx?" + TEXT_FORMATS = "te?xt|md|csv" + IMAGE_FORMATS = "jpe?g|png|gif|tiff?" 
+ FORMATS = "pdf|{}|{}|{}".format( + IMAGE_FORMATS, TEXT_FORMATS, OFFICE_FORMATS + ) REGEXES = OrderedDict([ ("created-correspondent-title-tags", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<correspondent>.*) - " r"(?P<title>.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>{})$".format(formats), + r"\.(?P<extension>{})$".format(FORMATS), flags=re.IGNORECASE )), ("created-title-tags", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<title>.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>{})$".format(formats), + r"\.(?P<extension>{})$".format(FORMATS), flags=re.IGNORECASE )), ("created-correspondent-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<correspondent>.*) - " r"(?P<title>.*)" - r"\.(?P<extension>{})$".format(formats), + r"\.(?P<extension>{})$".format(FORMATS), flags=re.IGNORECASE )), ("created-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<title>.*)" - r"\.(?P<extension>{})$".format(formats), + r"\.(?P<extension>{})$".format(FORMATS), flags=re.IGNORECASE )), ("correspondent-title-tags", re.compile( r"(?P<correspondent>.*) - " r"(?P<title>.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>{})$".format(formats), + r"\.(?P<extension>{})$".format(FORMATS), flags=re.IGNORECASE )), ("correspondent-title", re.compile( r"(?P<correspondent>.*) - " r"(?P<title>.*)?" 
- r"\.(?P<extension>{})$".format(formats), + r"\.(?P<extension>{})$".format(FORMATS), flags=re.IGNORECASE )), ("title", re.compile( r"(?P<title>.*)" - r"\.(?P<extension>{})$".format(formats), + r"\.(?P<extension>{})$".format(FORMATS), flags=re.IGNORECASE )) ]) diff --git a/src/documents/views.py b/src/documents/views.py index 5b742ab22..8ee4f5425 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -54,11 +54,15 @@ def render_to_response(self, context, **response_kwargs): Document.TYPE_CSV: "text/csv", Document.TYPE_MD: "text/markdown", Document.TYPE_TXT: "text/plain", - Document.TYPE_ODS: - "application/vnd.oasis.opendocument.spreadsheet", - Document.TYPE_ODT: "application/vnd.oasis.opendocument.text", - Document.TYPE_ODP: - "application/vnd.oasis.opendocument.presentation" + Document.TYPE_ODS: "application/vnd.oasis.opendocument.spreadsheet", # NOQA: E501 + Document.TYPE_ODT: "application/vnd.oasis.opendocument.text", # NOQA: E501 + Document.TYPE_ODP: "application/vnd.oasis.opendocument.presentation", # NOQA: E501 + Document.TYPE_DOC: "application/msword", # NOQA: E501 + Document.TYPE_DOCX: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", # NOQA: E501 + Document.TYPE_XLS: "application/vnd.ms-excel", # NOQA: E501 + Document.TYPE_XLSX: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # NOQA: E501 + Document.TYPE_PPT: "application/vnd.ms-powerpoint", # NOQA: E501 + Document.TYPE_PPTX: "application/vnd.openxmlformats-officedocument.presentationml.presentation", # NOQA: E501 } if self.kwargs["kind"] == "thumb": diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 03e4bdbc3..3c6632bd4 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -12,7 +12,9 @@ class TikaDocumentParser(DocumentParser): """ - This parser uses Apache Tika to try and get some text out of + This parser uses Apache-Tika to try and get some text out of office + formats, whether it's an 
open-office (ODS, ODT, ODP), or ms-office + format (XLS, XLSX, DOC, DOCX, PPT, PPTX) """ CONVERT = settings.CONVERT_BINARY diff --git a/src/paperless_tika/signals.py b/src/paperless_tika/signals.py index 2a7b06c65..67687ea7b 100644 --- a/src/paperless_tika/signals.py +++ b/src/paperless_tika/signals.py @@ -5,7 +5,7 @@ class ConsumerDeclaration: - MATCHING_FILES = re.compile(r"^.*\.(ods|odt|odp)$") + MATCHING_FILES = re.compile(r"^.*\.(ods|odt|odp|xlsx?|docx?|pptx?)$") @classmethod def handle(cls, sender, **kwargs): diff --git a/src/paperless_tika/tests/test_signals.py b/src/paperless_tika/tests/test_signals.py index 1d5e0975a..aef3306f8 100644 --- a/src/paperless_tika/tests/test_signals.py +++ b/src/paperless_tika/tests/test_signals.py @@ -12,9 +12,9 @@ def test_test_handles_various_file_names_true(self): "A document with a . in it", "Doc with -- in it" ) suffixes = ( - "ods", "odt", "odp", - "ODS", "ODT", "ODP", - "oDs", "oDt", "oDp" + "ods", "odt", "odp", "xls", "xlsx", "doc", "docx", "ppt", "pptx", + "ODS", "ODT", "ODP", "XLS", "XLSX", "DOC", "DOCX", "PPT", "PPTX", + "oDs", "oDt", "oDp", "xLs", "xLsX", "dOc", "dOcX", "pPt", "pPtX", ) for prefix in prefixes: