the-paperless-project · Tooa · Jan 11, 2020 · Jan 7, 2020 · Jan 11, 2020
diff --git a/Dockerfile b/Dockerfile
@@ -55,6 +55,11 @@ RUN apk add --no-cache \
 # Setup entrypoint
     chmod 755 /sbin/docker-entrypoint.sh
 
+# WORKAROUND for missing fonts in container
+RUN apk --no-cache add msttcorefonts-installer fontconfig && \
+    update-ms-fonts && \
+    fc-cache -f
+
 WORKDIR /usr/src/paperless/src
 # Mount volumes and set Entrypoint
 VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"]

diff --git a/Pipfile b/Pipfile
@@ -38,6 +38,7 @@ psycopg2 = "*"
 djangoql = "*"
 whitenoise = "*"
 brotli = "*"
+tika = "*"
 
 [dev-packages]
 ipython = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/docker-compose.yml.example b/docker-compose.yml.example
@@ -29,13 +29,23 @@ services:
             - PAPERLESS_OCR_LANGUAGES=
         command: ["gunicorn", "-b", "0.0.0.0:8000"]
 
+    tika-server:
+      image: logicalspark/docker-tikaserver:latest
+      healthcheck:
+          test: ["CMD", "curl" , "-f", "http://localhost:9998"]
+          interval: 30s
+          timeout: 10s
+          retries: 5
+
     consumer:
         build: ./
         # uncomment the following line to start automatically on system boot
         # restart: always
         depends_on:
             webserver:
                 condition: service_healthy
+            tika-server:
+                condition: service_healthy
         volumes:
             - data:/usr/src/paperless/data
             - media:/usr/src/paperless/media
@@ -49,6 +59,8 @@ services:
             # want to export your documents.
             # - /path/to/another/arbitrary/place:/export
         env_file: docker-compose.env
+        environment:
+            - TIKA_CLIENT_ONLY=True
         command: ["document_consumer"]
 
 volumes:

diff --git a/docs/utilities.rst b/docs/utilities.rst
@@ -239,10 +239,10 @@ Basic Syntax
 
 Again we'll use the ``manage.py`` script, passing ``change_storage_type``:
 
-.. code:: bash
+.. code:: console
 
     $ /path/to/paperless/src/manage.py change_storage_type --help
-		usage: manage.py change_storage_type [-h] [--version] [-v {0,1,2,3}]
+    usage: manage.py change_storage_type [-h] [--version] [-v {0,1,2,3}]
                                      [--settings SETTINGS]
                                      [--pythonpath PYTHONPATH] [--traceback]
                                      [--no-color] [--passphrase PASSPHRASE]

diff --git a/requirements.txt b/requirements.txt
@@ -78,3 +78,4 @@ virtualenv==16.7.2
 wcwidth==0.1.7
 whitenoise==4.1.3
 zipp==0.5.2
+tika==1.23
diff --git a/src/documents/models.py b/src/documents/models.py
@@ -194,8 +194,19 @@ class Document(models.Model):
     TYPE_TXT = "txt"
     TYPE_CSV = "csv"
     TYPE_MD = "md"
+    TYPE_ODS = "ods"
+    TYPE_ODT = "odt"
+    TYPE_ODP = "odp"
+    TYPE_XLS = "xls"
+    TYPE_XLSX = "xlsx"
+    TYPE_DOC = "doc"
+    TYPE_DOCX = "docx"
+    TYPE_PPT = "ppt"
+    TYPE_PPTX = "pptx"
     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
-             TYPE_TXT, TYPE_CSV, TYPE_MD)
+             TYPE_TXT, TYPE_CSV, TYPE_MD, TYPE_ODS, TYPE_ODT,
+             TYPE_ODP, TYPE_XLS, TYPE_XLSX, TYPE_DOC, TYPE_DOCX,
+             TYPE_PPT, TYPE_PPTX)
 
     STORAGE_TYPE_UNENCRYPTED = "unencrypted"
     STORAGE_TYPE_GPG = "gpg"
@@ -367,53 +378,58 @@ class FileInfo:
             non_separated_word=r"([\w,. ]|([^\s]-))"
         )
     )
-
-    formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
+    # TODO: No test fails when a format is missing here; write one.
+    OFFICE_FORMATS = "ods|odt|odp|xlsx?|docx?|pptx?"
+    TEXT_FORMATS = "te?xt|md|csv"
+    IMAGE_FORMATS = "jpe?g|png|gif|tiff?"
+    FORMATS = "pdf|{}|{}|{}".format(
+        IMAGE_FORMATS, TEXT_FORMATS, OFFICE_FORMATS
+    )
     REGEXES = OrderedDict([
         ("created-correspondent-title-tags", re.compile(
             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
             r"(?P<correspondent>.*) - "
             r"(?P<title>.*) - "
             r"(?P<tags>[a-z0-9\-,]*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"\.(?P<extension>{})$".format(FORMATS),
             flags=re.IGNORECASE
         )),
         ("created-title-tags", re.compile(
             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
             r"(?P<title>.*) - "
             r"(?P<tags>[a-z0-9\-,]*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"\.(?P<extension>{})$".format(FORMATS),
             flags=re.IGNORECASE
         )),
         ("created-correspondent-title", re.compile(
             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
             r"(?P<correspondent>.*) - "
             r"(?P<title>.*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"\.(?P<extension>{})$".format(FORMATS),
             flags=re.IGNORECASE
         )),
         ("created-title", re.compile(
             r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
             r"(?P<title>.*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"\.(?P<extension>{})$".format(FORMATS),
             flags=re.IGNORECASE
         )),
         ("correspondent-title-tags", re.compile(
             r"(?P<correspondent>.*) - "
             r"(?P<title>.*) - "
             r"(?P<tags>[a-z0-9\-,]*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"\.(?P<extension>{})$".format(FORMATS),
             flags=re.IGNORECASE
         )),
         ("correspondent-title", re.compile(
             r"(?P<correspondent>.*) - "
             r"(?P<title>.*)?"
-            r"\.(?P<extension>{})$".format(formats),
+            r"\.(?P<extension>{})$".format(FORMATS),
             flags=re.IGNORECASE
         )),
         ("title", re.compile(
             r"(?P<title>.*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"\.(?P<extension>{})$".format(FORMATS),
             flags=re.IGNORECASE
         ))
     ])

diff --git a/src/documents/views.py b/src/documents/views.py
@@ -53,7 +53,16 @@ def render_to_response(self, context, **response_kwargs):
             Document.TYPE_TIF: "image/tiff",
             Document.TYPE_CSV: "text/csv",
             Document.TYPE_MD:  "text/markdown",
-            Document.TYPE_TXT: "text/plain"
+            Document.TYPE_TXT: "text/plain",
+            Document.TYPE_ODS: "application/vnd.oasis.opendocument.spreadsheet",  # NOQA: E501
+            Document.TYPE_ODT: "application/vnd.oasis.opendocument.text",  # NOQA: E501
+            Document.TYPE_ODP: "application/vnd.oasis.opendocument.presentation",  # NOQA: E501
+            Document.TYPE_DOC: "application/msword",  # NOQA: E501
+            Document.TYPE_DOCX: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # NOQA: E501
+            Document.TYPE_XLS: "application/vnd.ms-excel",  # NOQA: E501
+            Document.TYPE_XLSX: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # NOQA: E501
+            Document.TYPE_PPT: "application/vnd.ms-powerpoint",  # NOQA: E501
+            Document.TYPE_PPTX: "application/vnd.openxmlformats-officedocument.presentationml.presentation",  # NOQA: E501
         }
 
         if self.kwargs["kind"] == "thumb":

diff --git a/src/paperless/settings.py b/src/paperless/settings.py
@@ -60,7 +60,7 @@ def __get_boolean(key, default="NO"):
     ALLOWED_HOSTS = _allowed_hosts.split(",")
 
 FORCE_SCRIPT_NAME = os.getenv("PAPERLESS_FORCE_SCRIPT_NAME")
-    
+
 # Application definition
 
 INSTALLED_APPS = [
@@ -80,6 +80,7 @@ def __get_boolean(key, default="NO"):
     "reminders.apps.RemindersConfig",
     "paperless_tesseract.apps.PaperlessTesseractConfig",
     "paperless_text.apps.PaperlessTextConfig",
+    "paperless_tika.apps.PaperlessTikaConfig",
 
     "django.contrib.admin",
 
@@ -109,7 +110,7 @@ def __get_boolean(key, default="NO"):
 STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'
 
 # We allow CORS from localhost:8080
-CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "localhost:8080").split(","))
+CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080").split(","))
 
 # If auth is disabled, we just use our "bypass" authentication middleware
 if bool(os.getenv("PAPERLESS_DISABLE_LOGIN", "false").lower() in ("yes", "y", "1", "t", "true")):

diff --git a/src/paperless_tika/__init__.py b/src/paperless_tika/__init__.py
diff --git a/src/paperless_tika/apps.py b/src/paperless_tika/apps.py
@@ -0,0 +1,16 @@
+from django.apps import AppConfig
+
+
+class PaperlessTikaConfig(AppConfig):
+    """App config that hooks the Tika parser into the document consumer."""
+
+    name = "paperless_tika"
+
+    def ready(self):
+        """Connect ConsumerDeclaration.handle to the declaration signal."""
+        # Kept local to ready(); presumably these need the app registry loaded.
+        from documents.signals import document_consumer_declaration
+        from .signals import ConsumerDeclaration
+
+        document_consumer_declaration.connect(ConsumerDeclaration.handle)
+        super().ready()
diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py
@@ -0,0 +1,112 @@
+import os
+import subprocess
+from requests.exceptions import ConnectionError
+
+import tika
+from tika import parser
+from django.conf import settings
+
+from documents.parsers import DocumentParser, ParseError
+from paperless_tesseract.parsers import strip_excess_whitespace
+
+
+class TikaDocumentParser(DocumentParser):
+    """
+    This parser uses Apache-Tika to try and get some text out of office
+    formats, whether it's a open-office (ODS, ODT, ODP), or ms-office
+    format (XLS, XLSX, DOC, DOCX, PPT, PPTX)
+    """
+
+    CONVERT = settings.CONVERT_BINARY
+
+    def __init__(self, path):
+        super().__init__(path)
+        self._text = None  # cache so Tika is queried once per document
+
+    def get_thumbnail(self):
+        """
+        Render the first lines of the extracted text onto a 500x647 page
+        with the convert binary and return the path of the resulting PNG.
+
+        Raises ParseError if text extraction or a convert call fails.
+        """
+        # The below is heavily cribbed from https://askubuntu.com/a/590951
+        import shlex
+
+        bg_color = "white"
+        text_color = "black"
+        psize = [500, 647]  # page size in pixels
+        n_lines = 50  # number of lines to show
+        out_path = os.path.join(self.tempdir, "convert.png")
+        temp_bg = os.path.join(self.tempdir, "bg.png")
+        temp_txlayer = os.path.join(self.tempdir, "tx.png")
+        picsize = "x".join(str(n) for n in psize)
+        txsize = "x".join(str(n - 8) for n in psize)
+
+        # Only the head of the document fits on the page; capping it also
+        # keeps the command line short.
+        excerpt = "\n".join(self.get_text().splitlines()[:n_lines])
+        # run_command() hands one string to the shell, so document text must
+        # be quoted or it could break out of the command. The padding spaces
+        # match what the unquoted command used to produce.
+        caption = shlex.quote("caption: {} ".format(excerpt))
+
+        # Text layer: the caption on a transparent canvas with a 4px border.
+        run_command(
+            self.CONVERT, "-background none", "-fill", text_color,
+            "-pointsize", "12", "-border 4 -bordercolor none",
+            "-size ", txsize, caption, temp_txlayer
+        )
+
+        # Background: a filled rectangle with rounded corners.
+        work_size = ",".join(str(n - 1) for n in psize)
+        radius = str(round(psize[0] / 10))
+        rounded = ",".join([radius, radius])
+        run_command(
+            self.CONVERT, "-size ", picsize, ' xc:none -draw ',
+            '"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ',  # NOQA: E501
+            temp_bg
+        )
+
+        # Stack the text layer on top of the background.
+        run_command(
+            self.CONVERT, temp_bg, temp_txlayer,
+            "-background None -layers merge ", out_path
+        )
+        return out_path
+
+    def get_text(self):
+        """
+        Return the document's text as extracted by the Tika server, with
+        excess whitespace stripped. The result is cached on the instance.
+
+        Raises ParseError if the Tika server cannot be reached.
+        """
+        if self._text is not None:
+            return self._text
+
+        # Honour tika-python's own TIKA_SERVER_ENDPOINT; default as before.
+        endpoint = os.getenv("TIKA_SERVER_ENDPOINT", "http://tika-server:9998")
+        try:
+            # Workaround for tika-python#273
+            result = parser.from_file(self.document_path, "all", endpoint)
+        except ConnectionError as e:
+            raise ParseError(e) from e
+
+        # tika-python can return None as content (e.g. nothing extractable).
+        content = result.get("content") or ""
+        # Strip out excess white space to allow matching to go smoother
+        self._text = strip_excess_whitespace(content)
+        return self._text
+
+
+def run_command(*args):
+    """Join args with spaces and run them via the shell; callers must quote
+    untrusted values. Raises ParseError on a non-zero exit status."""
+    env = os.environ.copy()
+    for var, value in (("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
+                       ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR)):
+        if value:
+            env[var] = value
+    if subprocess.Popen(" ".join(args), env=env, shell=True).wait() != 0:
+        raise ParseError("Convert failed at {}".format(args))
diff --git a/src/paperless_tika/signals.py b/src/paperless_tika/signals.py
@@ -0,0 +1,23 @@
+import re
+
+from .parsers import TikaDocumentParser
+
+
+class ConsumerDeclaration:
+    """Offers TikaDocumentParser for OpenDocument and MS Office files."""
+
+    # Applied to the lower-cased name, so matching is case-insensitive.
+    MATCHING_FILES = re.compile(r"^.*\.(ods|odt|odp|xlsx?|docx?|pptx?)$")
+
+    @classmethod
+    def handle(cls, sender, **kwargs):
+        """Signal receiver: hand back the callable that tests a file name."""
+        return cls.test
+
+    @classmethod
+    def test(cls, doc):
+        """Return a parser/weight dict if *doc* matches, otherwise None."""
+        claimed = cls.MATCHING_FILES.match(doc.lower()) is not None
+        if not claimed:
+            return None
+        return {"parser": TikaDocumentParser, "weight": 10}
diff --git a/src/paperless_tika/tests/__init__.py b/src/paperless_tika/tests/__init__.py
diff --git a/src/paperless_tika/tests/test_signals.py b/src/paperless_tika/tests/test_signals.py
@@ -0,0 +1,39 @@
+from django.test import TestCase
+
+from ..signals import ConsumerDeclaration
+
+
+class SignalsTestCase(TestCase):
+    """Checks which file names ConsumerDeclaration.test() claims."""
+
+    def test_test_handles_various_file_names_true(self):
+
+        # Every office extension, in lower, upper and mixed case.
+        stems = (
+            "doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags",
+            "A document with a . in it", "Doc with -- in it"
+        )
+        extensions = (
+            "ods", "odt", "odp", "xls", "xlsx", "doc", "docx", "ppt", "pptx",
+            "ODS", "ODT", "ODP", "XLS", "XLSX", "DOC", "DOCX", "PPT", "PPTX",
+            "oDs", "oDt", "oDp", "xLs", "xLsX", "dOc", "dOcX", "pPt", "pPtX",
+        )
+        names = [
+            "{}.{}".format(stem, ext) for stem in stems for ext in extensions
+        ]
+
+        for name in names:
+            self.assertTrue(ConsumerDeclaration.test(name))
+
+    def test_test_handles_various_file_names_false(self):
+
+        # Non-office extensions, plus names with no extension at all.
+        extensions = (
+            "pdf", "jpg", "jpeg", "gif", "png", "tiff", "tif", "pnm",
+            "bmp", "txt", "markdown", "",
+        )
+        names = ["doc.{}".format(ext) for ext in extensions]
+        names += ["", "doc"]
+
+        for name in names:
+            self.assertFalse(ConsumerDeclaration.test(name))