Skip to content
This repository has been archived by the owner on Feb 19, 2021. It is now read-only.

Commit

Permalink
Merge pull request #197 from danielquinn/pluggable-consumers
Browse files Browse the repository at this point in the history
Pluggable consumers
  • Loading branch information
danielquinn authored Mar 25, 2017
2 parents 0f7bfc5 + 7611c2b commit b7cb708
Show file tree
Hide file tree
Showing 21 changed files with 455 additions and 294 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ matrix:
env: TOXENV=py34
- python: 3.5
env: TOXENV=py35
- python: 3.5
- python: 3.6
env: TOXENV=py36
- python: 3.6
env: TOXENV=pep8

install:
Expand Down
8 changes: 8 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ Changelog
* 0.3.6
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
correspondent or the tags for a document.
* The ``content`` field is now optional, to allow for the edge case of a
purely graphical document.
* You can no longer add documents via the admin. This never worked in the
first place, so all I've done here is remove the link to the broken form.
* The consumer code has been heavily refactored to support a pluggable
interface. Install a paperless consumer via pip and tell paperless about
it with an environment variable, and you're good to go. Proper
documentation is on its way.

* 0.3.5
* A serious facelift for the documents listing page wherein we drop the
Expand Down
1 change: 1 addition & 0 deletions src/documents/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def has_add_permission(self, request):

def created_(self, obj):
    # Render the document's creation timestamp as a bare YYYY-MM-DD date
    # for the admin list display; the trailing underscore avoids clashing
    # with the underlying ``created`` model field name.
    return obj.created.date().strftime("%Y-%m-%d")
created_.short_description = "Created"  # column header shown in the admin

def thumbnail(self, obj):
png_img = self._html_tag(
Expand Down
260 changes: 47 additions & 213 deletions src/documents/consumer.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,21 @@
import datetime
import hashlib
import logging
import os
import re
import uuid
import shutil
import hashlib
import logging
import datetime
import tempfile
import itertools
import subprocess
from multiprocessing.pool import Pool

import pyocr
import langdetect
from PIL import Image

from django.conf import settings
from django.utils import timezone
from paperless.db import GnuPG
from pyocr.tesseract import TesseractError
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError

from .models import Tag, Document, FileInfo
from .models import Document, FileInfo, Tag
from .parsers import ParseError
from .signals import (
document_consumption_started,
document_consumption_finished
document_consumer_declaration,
document_consumption_finished,
document_consumption_started
)
from .languages import ISO639


class OCRError(Exception):
pass


class ConsumerError(Exception):
Expand All @@ -47,13 +33,7 @@ class Consumer(object):
"""

SCRATCH = settings.SCRATCH_DIR
CONVERT = settings.CONVERT_BINARY
UNPAPER = settings.UNPAPER_BINARY
CONSUME = settings.CONSUMPTION_DIR
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300

DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

def __init__(self):

Expand All @@ -78,6 +58,16 @@ def __init__(self):
raise ConsumerError(
"Consumption directory {} does not exist".format(self.CONSUME))

self.parsers = []
for response in document_consumer_declaration.send(self):
self.parsers.append(response[1])

if not self.parsers:
raise ConsumerError(
"No parsers could be found, not even the default. "
"This is a problem."
)

def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
Expand Down Expand Up @@ -109,6 +99,13 @@ def consume(self):
self._ignore.append(doc)
continue

parser_class = self._get_parser_class(doc)
if not parser_class:
self.log(
"info", "No parsers could be found for {}".format(doc))
self._ignore.append(doc)
continue

self.logging_group = uuid.uuid4()

self.log("info", "Consuming {}".format(doc))
Expand All @@ -119,25 +116,26 @@ def consume(self):
logging_group=self.logging_group
)

tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
imgs = self._get_greyscale(tempdir, doc)
thumbnail = self._get_thumbnail(tempdir, doc)
parsed_document = parser_class(doc)
thumbnail = parsed_document.get_thumbnail()

try:

document = self._store(self._get_ocr(imgs), doc, thumbnail)

except OCRError as e:
document = self._store(
parsed_document.get_text(),
doc,
thumbnail
)
except ParseError as e:

self._ignore.append(doc)
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
self._cleanup_tempdir(tempdir)
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
parsed_document.cleanup()

continue

else:

self._cleanup_tempdir(tempdir)
parsed_document.cleanup()
self._cleanup_doc(doc)

self.log(
Expand All @@ -151,142 +149,20 @@ def consume(self):
logging_group=self.logging_group
)

def _get_greyscale(self, tempdir, doc):
"""
Greyscale images are easier for Tesseract to OCR
"""

self.log("info", "Generating greyscale image from {}".format(doc))

# Convert PDF to multiple PNMs
pnm = os.path.join(tempdir, "convert-%04d.pnm")
run_convert(
self.CONVERT,
"-density", str(self.DENSITY),
"-depth", "8",
"-type", "grayscale",
doc, pnm,
)

# Get a list of converted images
pnms = []
for f in os.listdir(tempdir):
if f.endswith(".pnm"):
pnms.append(os.path.join(tempdir, f))

# Run unpaper in parallel on converted images
with Pool(processes=self.THREADS) as pool:
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))

# Return list of converted images, processed with unpaper
pnms = []
for f in os.listdir(tempdir):
if f.endswith(".unpaper.pnm"):
pnms.append(os.path.join(tempdir, f))

return sorted(filter(lambda __: os.path.isfile(__), pnms))

def _get_thumbnail(self, tempdir, doc):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""

self.log("info", "Generating the thumbnail")

run_convert(
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
doc, os.path.join(tempdir, "convert-%04d.png")
)

return os.path.join(tempdir, "convert-0000.png")

def _guess_language(self, text):
    """
    Best-effort language detection over OCR'd text.

    Returns the langdetect language code, or None when detection fails;
    failure is logged but never allowed to abort consumption.
    """
    try:
        detected = langdetect.detect(text)
        self.log("debug", "Language detected: {}".format(detected))
        return detected
    except Exception as e:
        # Deliberately broad: any detection error just means "unknown".
        self.log("warning", "Language detection error: {}".format(e))
        return None

def _get_ocr(self, imgs):
    """
    Attempts to do the best job possible OCR'ing the document based on
    simple language detection trial & error.

    Strategy: OCR only the middle page in the default language, guess
    the language from that sample, then either re-OCR the whole document
    in the guessed language or fall back to the default, honouring the
    FORGIVING_OCR setting.

    Raises OCRError when no images are supplied, or when detection/OCR
    fails and FORGIVING_OCR is disabled.
    """

    if not imgs:
        raise OCRError("No images found")

    self.log("info", "OCRing the document")

    # Since the division gets rounded down by int, this calculation works
    # for every edge-case, i.e. 1
    middle = int(len(imgs) / 2)
    # Sample only the middle page first: much cheaper than OCRing the
    # whole document in a possibly-wrong language.
    raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)

    guessed_language = self._guess_language(raw_text)

    if not guessed_language or guessed_language not in ISO639:
        self.log("warning", "Language detection failed!")
        if settings.FORGIVING_OCR:
            self.log(
                "warning",
                "As FORGIVING_OCR is enabled, we're going to make the "
                "best with what we have."
            )
            # Keep the middle-page sample and OCR the remaining pages in
            # the default language rather than aborting.
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text
        raise OCRError("Language detection failed")

    if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
        # The guess matches the default language, so the sample is
        # already usable; just OCR the rest of the pages around it.
        raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
        return raw_text

    try:
        # Re-OCR everything in the detected language for best accuracy.
        return self._ocr(imgs, ISO639[guessed_language])
    except pyocr.pyocr.tesseract.TesseractError:
        # Tesseract lacks the guessed language pack.
        if settings.FORGIVING_OCR:
            self.log(
                "warning",
                "OCR for {} failed, but we're going to stick with what "
                "we've got since FORGIVING_OCR is enabled.".format(
                    guessed_language
                )
            )
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text
        raise OCRError(
            "The guessed language is not available in this instance of "
            "Tesseract."
        )

def _assemble_ocr_sections(self, imgs, middle, text):
def _get_parser_class(self, doc):
"""
Given a `middle` value and the text that middle page represents, we OCR
the remainder of the document and return the whole thing.
Determine the appropriate parser class based on the file
"""
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
return text

def _ocr(self, imgs, lang):
"""
Performs a single OCR attempt.
"""

if not imgs:
return ""

self.log("info", "Parsing for {}".format(lang))

with Pool(processes=self.THREADS) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
r = " ".join(r)
options = []
for parser in self.parsers:
result = parser(doc)
if result:
options.append(result)

# Strip out excess white space to allow matching to go smoother
return strip_excess_whitespace(r)
# Return the parser with the highest weight.
return sorted(
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]

def _store(self, text, doc, thumbnail):

Expand Down Expand Up @@ -332,10 +208,6 @@ def _store(self, text, doc, thumbnail):

return document

def _cleanup_tempdir(self, d):
    """Recursively remove a scratch directory once consumption is done."""
    message = "Deleting directory {}".format(d)
    self.log("debug", message)
    shutil.rmtree(d)

def _cleanup_doc(self, doc):
    """Delete the original file from the consumption directory."""
    message = "Deleting document {}".format(doc)
    self.log("debug", message)
    os.unlink(doc)
Expand All @@ -361,41 +233,3 @@ def _is_duplicate(doc):
with open(doc, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
return Document.objects.filter(checksum=checksum).exists()


def strip_excess_whitespace(text):
    """
    Normalise whitespace in OCR output so downstream matching is easier:
    collapse runs of horizontal whitespace to a single space, drop
    indentation that follows a line break, and trim any trailing
    whitespace from the very end of the string.
    """
    result = re.sub(r"([^\S\r\n]+)", " ", text)
    result = re.sub("([\n\r]+)([^\S\n\r]+)", '\\1', result)
    return re.sub("([^\S\n\r]+)$", '', result)


def image_to_string(args):
    """
    OCR a single page image located under the scratch directory.

    ``args`` is an (image-filename, language) pair, packed as one tuple
    so the function can be used with multiprocessing ``pool.map``. The
    page is auto-rotated first when the OCR tool supports orientation
    detection; rotation failures are ignored and the page is OCR'd as-is.
    """
    img, lang = args
    tool = pyocr.get_available_tools()[0]
    with Image.open(os.path.join(Consumer.SCRATCH, img)) as page:
        if tool.can_detect_orientation():
            try:
                orientation = tool.detect_orientation(page, lang=lang)
                page = page.rotate(orientation["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                # Orientation detection is best-effort only.
                pass
        return tool.image_to_string(page, lang=lang)


def run_unpaper(args):
    """
    Clean one PNM page with unpaper, writing ``<name>.unpaper.pnm`` next
    to the input. ``args`` is an (unpaper-binary, pnm-path) pair, packed
    as one tuple for use with ``pool.map``. Blocks until unpaper exits.
    """
    unpaper, pnm = args
    cleaned = pnm.replace(".pnm", ".unpaper.pnm")
    process = subprocess.Popen((unpaper, pnm, cleaned))
    process.wait()


def run_convert(*args):
    """
    Invoke ImageMagick's convert (argv passed through verbatim), applying
    the optional memory and tmpdir limits from the Django settings via
    the MAGICK_* environment variables. Blocks until convert exits.
    """
    env = os.environ.copy()

    memory_limit = settings.CONVERT_MEMORY_LIMIT
    if memory_limit:
        env["MAGICK_MEMORY_LIMIT"] = memory_limit

    magick_tmp = settings.CONVERT_TMPDIR
    if magick_tmp:
        env["MAGICK_TMPDIR"] = magick_tmp

    subprocess.Popen(args, env=env).wait()
11 changes: 10 additions & 1 deletion src/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,22 @@ class Document(models.Model):

correspondent = models.ForeignKey(
Correspondent, blank=True, null=True, related_name="documents")

title = models.CharField(max_length=128, blank=True, db_index=True)
content = models.TextField(db_index=True)

content = models.TextField(
db_index=True,
blank=True,
help_text="The raw, text-only data of the document. This field is "
"primarily used for searching."
)

file_type = models.CharField(
max_length=4,
editable=False,
choices=tuple([(t, t.upper()) for t in TYPES])
)

tags = models.ManyToManyField(
Tag, related_name="documents", blank=True)

Expand Down
Loading

0 comments on commit b7cb708

Please sign in to comment.