Skip to content
This repository has been archived by the owner on Feb 19, 2021. It is now read-only.

Commit

Permalink
Merge pull request #197 from danielquinn/pluggable-consumers
Browse files Browse the repository at this point in the history
Pluggable consumers
  • Loading branch information
danielquinn authored Mar 25, 2017
2 parents 0f7bfc5 + 7611c2b commit b7cb708
Show file tree
Hide file tree
Showing 21 changed files with 455 additions and 294 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ matrix:
env: TOXENV=py34
- python: 3.5
env: TOXENV=py35
- python: 3.5
- python: 3.6
env: TOXENV=py36
- python: 3.6
env: TOXENV=pep8

install:
Expand Down
8 changes: 8 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ Changelog
* 0.3.6
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
correspondent or the tags for a document.
* The ``content`` field is now optional, to allow for the edge case of a
purely graphical document.
* You can no longer add documents via the admin. This never worked in the
first place, so all I've done here is remove the link to the broken form.
* The consumer code has been heavily refactored to support a pluggable
interface. Install a paperless consumer via pip and tell paperless about
it with an environment variable, and you're good to go. Proper
documentation is on its way.

* 0.3.5
* A serious facelift for the documents listing page wherein we drop the
Expand Down
1 change: 1 addition & 0 deletions src/documents/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def has_add_permission(self, request):

def created_(self, obj):
    # Render the document's creation timestamp as a bare YYYY-MM-DD date
    # for the admin list display; the trailing underscore avoids clashing
    # with the underlying ``created`` model field name.
    return obj.created.date().strftime("%Y-%m-%d")
created_.short_description = "Created"  # column header shown in the admin

def thumbnail(self, obj):
png_img = self._html_tag(
Expand Down
260 changes: 47 additions & 213 deletions src/documents/consumer.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,21 @@
import datetime
import hashlib
import logging
import os
import re
import uuid
import shutil
import hashlib
import logging
import datetime
import tempfile
import itertools
import subprocess
from multiprocessing.pool import Pool

import pyocr
import langdetect
from PIL import Image

from django.conf import settings
from django.utils import timezone
from paperless.db import GnuPG
from pyocr.tesseract import TesseractError
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError

from .models import Tag, Document, FileInfo
from .models import Document, FileInfo, Tag
from .parsers import ParseError
from .signals import (
document_consumption_started,
document_consumption_finished
document_consumer_declaration,
document_consumption_finished,
document_consumption_started
)
from .languages import ISO639


class OCRError(Exception):
pass


class ConsumerError(Exception):
Expand All @@ -47,13 +33,7 @@ class Consumer(object):
"""

SCRATCH = settings.SCRATCH_DIR
CONVERT = settings.CONVERT_BINARY
UNPAPER = settings.UNPAPER_BINARY
CONSUME = settings.CONSUMPTION_DIR
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300

DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

def __init__(self):

Expand All @@ -78,6 +58,16 @@ def __init__(self):
raise ConsumerError(
"Consumption directory {} does not exist".format(self.CONSUME))

self.parsers = []
for response in document_consumer_declaration.send(self):
self.parsers.append(response[1])

if not self.parsers:
raise ConsumerError(
"No parsers could be found, not even the default. "
"This is a problem."
)

def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
Expand Down Expand Up @@ -109,6 +99,13 @@ def consume(self):
self._ignore.append(doc)
continue

parser_class = self._get_parser_class(doc)
if not parser_class:
self.log(
"info", "No parsers could be found for {}".format(doc))
self._ignore.append(doc)
continue

self.logging_group = uuid.uuid4()

self.log("info", "Consuming {}".format(doc))
Expand All @@ -119,25 +116,26 @@ def consume(self):
logging_group=self.logging_group
)

tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
imgs = self._get_greyscale(tempdir, doc)
thumbnail = self._get_thumbnail(tempdir, doc)
parsed_document = parser_class(doc)
thumbnail = parsed_document.get_thumbnail()

try:

document = self._store(self._get_ocr(imgs), doc, thumbnail)

except OCRError as e:
document = self._store(
parsed_document.get_text(),
doc,
thumbnail
)
except ParseError as e:

self._ignore.append(doc)
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
self._cleanup_tempdir(tempdir)
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
parsed_document.cleanup()

continue

else:

self._cleanup_tempdir(tempdir)
parsed_document.cleanup()
self._cleanup_doc(doc)

self.log(
Expand All @@ -151,142 +149,20 @@ def consume(self):
logging_group=self.logging_group
)

def _get_greyscale(self, tempdir, doc):
"""
Greyscale images are easier for Tesseract to OCR
"""

self.log("info", "Generating greyscale image from {}".format(doc))

# Convert PDF to multiple PNMs
pnm = os.path.join(tempdir, "convert-%04d.pnm")
run_convert(
self.CONVERT,
"-density", str(self.DENSITY),
"-depth", "8",
"-type", "grayscale",
doc, pnm,
)

# Get a list of converted images
pnms = []
for f in os.listdir(tempdir):
if f.endswith(".pnm"):
pnms.append(os.path.join(tempdir, f))

# Run unpaper in parallel on converted images
with Pool(processes=self.THREADS) as pool:
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))

# Return list of converted images, processed with unpaper
pnms = []
for f in os.listdir(tempdir):
if f.endswith(".unpaper.pnm"):
pnms.append(os.path.join(tempdir, f))

return sorted(filter(lambda __: os.path.isfile(__), pnms))

def _get_thumbnail(self, tempdir, doc):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""

self.log("info", "Generating the thumbnail")

run_convert(
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
doc, os.path.join(tempdir, "convert-%04d.png")
)

return os.path.join(tempdir, "convert-0000.png")

def _guess_language(self, text):
    """
    Best-effort language detection over OCR'd text.

    Returns the langdetect language code, or None when detection fails;
    failure is logged but never allowed to abort consumption.
    """
    try:
        detected = langdetect.detect(text)
        self.log("debug", "Language detected: {}".format(detected))
        return detected
    except Exception as e:
        # Deliberately broad: any detection error just means "unknown".
        self.log("warning", "Language detection error: {}".format(e))
        return None

def _get_ocr(self, imgs):
    """
    Attempts to do the best job possible OCR'ing the document based on
    simple language detection trial & error.

    Strategy: OCR only the middle page in the default language, guess
    the language from that sample, then either re-OCR the whole document
    in the guessed language or fall back to the default, honouring the
    FORGIVING_OCR setting.

    Raises OCRError when no images are supplied, or when detection/OCR
    fails and FORGIVING_OCR is disabled.
    """

    if not imgs:
        raise OCRError("No images found")

    self.log("info", "OCRing the document")

    # Since the division gets rounded down by int, this calculation works
    # for every edge-case, i.e. 1
    middle = int(len(imgs) / 2)
    # Sample only the middle page first: much cheaper than OCRing the
    # whole document in a possibly-wrong language.
    raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)

    guessed_language = self._guess_language(raw_text)

    if not guessed_language or guessed_language not in ISO639:
        self.log("warning", "Language detection failed!")
        if settings.FORGIVING_OCR:
            self.log(
                "warning",
                "As FORGIVING_OCR is enabled, we're going to make the "
                "best with what we have."
            )
            # Keep the middle-page sample and OCR the remaining pages in
            # the default language rather than aborting.
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text
        raise OCRError("Language detection failed")

    if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
        # The guess matches the default language, so the sample is
        # already usable; just OCR the rest of the pages around it.
        raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
        return raw_text

    try:
        # Re-OCR everything in the detected language for best accuracy.
        return self._ocr(imgs, ISO639[guessed_language])
    except pyocr.pyocr.tesseract.TesseractError:
        # Tesseract lacks the guessed language pack.
        if settings.FORGIVING_OCR:
            self.log(
                "warning",
                "OCR for {} failed, but we're going to stick with what "
                "we've got since FORGIVING_OCR is enabled.".format(
                    guessed_language
                )
            )
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text
        raise OCRError(
            "The guessed language is not available in this instance of "
            "Tesseract."
        )

def _assemble_ocr_sections(self, imgs, middle, text):
def _get_parser_class(self, doc):
"""
Given a `middle` value and the text that middle page represents, we OCR
the remainder of the document and return the whole thing.
Determine the appropriate parser class based on the file
"""
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
return text

def _ocr(self, imgs, lang):
"""
Performs a single OCR attempt.
"""

if not imgs:
return ""

self.log("info", "Parsing for {}".format(lang))

with Pool(processes=self.THREADS) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
r = " ".join(r)
options = []
for parser in self.parsers:
result = parser(doc)
if result:
options.append(result)

# Strip out excess white space to allow matching to go smoother
return strip_excess_whitespace(r)
# Return the parser with the highest weight.
return sorted(
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]

def _store(self, text, doc, thumbnail):

Expand Down Expand Up @@ -332,10 +208,6 @@ def _store(self, text, doc, thumbnail):

return document

def _cleanup_tempdir(self, d):
    """Recursively remove a scratch directory once consumption is done."""
    message = "Deleting directory {}".format(d)
    self.log("debug", message)
    shutil.rmtree(d)

def _cleanup_doc(self, doc):
    """Delete the original file from the consumption directory."""
    message = "Deleting document {}".format(doc)
    self.log("debug", message)
    os.unlink(doc)
Expand All @@ -361,41 +233,3 @@ def _is_duplicate(doc):
with open(doc, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
return Document.objects.filter(checksum=checksum).exists()


def strip_excess_whitespace(text):
    """
    Normalise whitespace in OCR output so downstream matching is easier:
    collapse runs of horizontal whitespace to a single space, drop
    indentation that follows a line break, and trim any trailing
    whitespace from the very end of the string.
    """
    result = re.sub(r"([^\S\r\n]+)", " ", text)
    result = re.sub("([\n\r]+)([^\S\n\r]+)", '\\1', result)
    return re.sub("([^\S\n\r]+)$", '', result)


def image_to_string(args):
    """
    OCR a single page image located under the scratch directory.

    ``args`` is an (image-filename, language) pair, packed as one tuple
    so the function can be used with multiprocessing ``pool.map``. The
    page is auto-rotated first when the OCR tool supports orientation
    detection; rotation failures are ignored and the page is OCR'd as-is.
    """
    img, lang = args
    tool = pyocr.get_available_tools()[0]
    with Image.open(os.path.join(Consumer.SCRATCH, img)) as page:
        if tool.can_detect_orientation():
            try:
                orientation = tool.detect_orientation(page, lang=lang)
                page = page.rotate(orientation["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                # Orientation detection is best-effort only.
                pass
        return tool.image_to_string(page, lang=lang)


def run_unpaper(args):
    """
    Clean one PNM page with unpaper, writing ``<name>.unpaper.pnm`` next
    to the input. ``args`` is an (unpaper-binary, pnm-path) pair, packed
    as one tuple for use with ``pool.map``. Blocks until unpaper exits.
    """
    unpaper, pnm = args
    cleaned = pnm.replace(".pnm", ".unpaper.pnm")
    process = subprocess.Popen((unpaper, pnm, cleaned))
    process.wait()


def run_convert(*args):
    """
    Invoke ImageMagick's convert (argv passed through verbatim), applying
    the optional memory and tmpdir limits from the Django settings via
    the MAGICK_* environment variables. Blocks until convert exits.
    """
    env = os.environ.copy()

    memory_limit = settings.CONVERT_MEMORY_LIMIT
    if memory_limit:
        env["MAGICK_MEMORY_LIMIT"] = memory_limit

    magick_tmp = settings.CONVERT_TMPDIR
    if magick_tmp:
        env["MAGICK_TMPDIR"] = magick_tmp

    subprocess.Popen(args, env=env).wait()
11 changes: 10 additions & 1 deletion src/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,22 @@ class Document(models.Model):

correspondent = models.ForeignKey(
Correspondent, blank=True, null=True, related_name="documents")

title = models.CharField(max_length=128, blank=True, db_index=True)
content = models.TextField(db_index=True)

content = models.TextField(
db_index=True,
blank=True,
help_text="The raw, text-only data of the document. This field is "
"primarily used for searching."
)

file_type = models.CharField(
max_length=4,
editable=False,
choices=tuple([(t, t.upper()) for t in TYPES])
)

tags = models.ManyToManyField(
Tag, related_name="documents", blank=True)

Expand Down
Loading

0 comments on commit b7cb708

Please sign in to comment.