Skip to content
This repository has been archived by the owner on Feb 19, 2021. It is now read-only.

Support for Office-Formats with Apache-Tika #600

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ RUN apk add --no-cache \
# Setup entrypoint
chmod 755 /sbin/docker-entrypoint.sh

# WORKAROUND for missing fonts in container
RUN apk --no-cache add msttcorefonts-installer fontconfig && \
update-ms-fonts && \
fc-cache -f

WORKDIR /usr/src/paperless/src
# Mount volumes and set Entrypoint
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"]
Expand Down
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ psycopg2 = "*"
djangoql = "*"
whitenoise = "*"
brotli = "*"
tika = "*"

[dev-packages]
ipython = "*"
566 changes: 278 additions & 288 deletions Pipfile.lock

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions docker-compose.yml.example
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,23 @@ services:
- PAPERLESS_OCR_LANGUAGES=
command: ["gunicorn", "-b", "0.0.0.0:8000"]

tika-server:
image: logicalspark/docker-tikaserver:latest
healthcheck:
test: ["CMD", "curl" , "-f", "http://localhost:9998"]
interval: 30s
timeout: 10s
retries: 5

consumer:
build: ./
# uncomment the following line to start automatically on system boot
# restart: always
depends_on:
webserver:
condition: service_healthy
tika-server:
condition: service_healthy
volumes:
- data:/usr/src/paperless/data
- media:/usr/src/paperless/media
Expand All @@ -49,6 +59,8 @@ services:
# want to export your documents.
# - /path/to/another/arbitrary/place:/export
env_file: docker-compose.env
environment:
- TIKA_CLIENT_ONLY=True
command: ["document_consumer"]

volumes:
Expand Down
4 changes: 2 additions & 2 deletions docs/utilities.rst
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,10 @@ Basic Syntax

Again we'll use the ``manage.py`` script, passing ``change_storage_type``:

.. code:: bash
.. code:: console

$ /path/to/paperless/src/manage.py change_storage_type --help
usage: manage.py change_storage_type [-h] [--version] [-v {0,1,2,3}]
usage: manage.py change_storage_type [-h] [--version] [-v {0,1,2,3}]
[--settings SETTINGS]
[--pythonpath PYTHONPATH] [--traceback]
[--no-color] [--passphrase PASSPHRASE]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,4 @@ virtualenv==16.7.2
wcwidth==0.1.7
whitenoise==4.1.3
zipp==0.5.2
tika==1.23
36 changes: 26 additions & 10 deletions src/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,19 @@ class Document(models.Model):
TYPE_TXT = "txt"
TYPE_CSV = "csv"
TYPE_MD = "md"
TYPE_ODS = "ods"
TYPE_ODT = "odt"
TYPE_ODP = "odp"
TYPE_XLS = "xls"
TYPE_XLSX = "xlsx"
TYPE_DOC = "doc"
TYPE_DOCX = "docx"
TYPE_PPT = "ppt"
TYPE_PPTX = "pptx"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
TYPE_TXT, TYPE_CSV, TYPE_MD)
TYPE_TXT, TYPE_CSV, TYPE_MD, TYPE_ODS, TYPE_ODT,
TYPE_ODP, TYPE_XLS, TYPE_XLSX, TYPE_DOC, TYPE_DOCX,
TYPE_PPT, TYPE_PPTX)

STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
Expand Down Expand Up @@ -367,53 +378,58 @@ class FileInfo:
non_separated_word=r"([\w,. ]|([^\s]-))"
)
)

formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
# TODO: No test fails when a format is missing here; write one.
OFFICE_FORMATS = "ods|odt|odp|xlsx?|docx?|pptx?"
TEXT_FORMATS = "te?xt|md|csv"
IMAGE_FORMATS = "jpe?g|png|gif|tiff?"
FORMATS = "pdf|{}|{}|{}".format(
IMAGE_FORMATS, TEXT_FORMATS, OFFICE_FORMATS
)
REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>{})$".format(formats),
r"\.(?P<extension>{})$".format(FORMATS),
flags=re.IGNORECASE
)),
("created-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>{})$".format(formats),
r"\.(?P<extension>{})$".format(FORMATS),
flags=re.IGNORECASE
)),
("created-correspondent-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*)"
r"\.(?P<extension>{})$".format(formats),
r"\.(?P<extension>{})$".format(FORMATS),
flags=re.IGNORECASE
)),
("created-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*)"
r"\.(?P<extension>{})$".format(formats),
r"\.(?P<extension>{})$".format(FORMATS),
flags=re.IGNORECASE
)),
("correspondent-title-tags", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>{})$".format(formats),
r"\.(?P<extension>{})$".format(FORMATS),
flags=re.IGNORECASE
)),
("correspondent-title", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*)?"
r"\.(?P<extension>{})$".format(formats),
r"\.(?P<extension>{})$".format(FORMATS),
flags=re.IGNORECASE
)),
("title", re.compile(
r"(?P<title>.*)"
r"\.(?P<extension>{})$".format(formats),
r"\.(?P<extension>{})$".format(FORMATS),
flags=re.IGNORECASE
))
])
Expand Down
11 changes: 10 additions & 1 deletion src/documents/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,16 @@ def render_to_response(self, context, **response_kwargs):
Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
Document.TYPE_TXT: "text/plain",
Document.TYPE_ODS: "application/vnd.oasis.opendocument.spreadsheet", # NOQA: E501
Document.TYPE_ODT: "application/vnd.oasis.opendocument.text", # NOQA: E501
Document.TYPE_ODP: "application/vnd.oasis.opendocument.presentation", # NOQA: E501
Document.TYPE_DOC: "application/msword", # NOQA: E501
Document.TYPE_DOCX: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", # NOQA: E501
Document.TYPE_XLS: "application/vnd.ms-excel", # NOQA: E501
Document.TYPE_XLSX: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # NOQA: E501
Document.TYPE_PPT: "application/vnd.ms-powerpoint", # NOQA: E501
Document.TYPE_PPTX: "application/vnd.openxmlformats-officedocument.presentationml.presentation", # NOQA: E501
}

if self.kwargs["kind"] == "thumb":
Expand Down
5 changes: 3 additions & 2 deletions src/paperless/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __get_boolean(key, default="NO"):
ALLOWED_HOSTS = _allowed_hosts.split(",")

FORCE_SCRIPT_NAME = os.getenv("PAPERLESS_FORCE_SCRIPT_NAME")

# Application definition

INSTALLED_APPS = [
Expand All @@ -80,6 +80,7 @@ def __get_boolean(key, default="NO"):
"reminders.apps.RemindersConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_tika.apps.PaperlessTikaConfig",

"django.contrib.admin",

Expand Down Expand Up @@ -109,7 +110,7 @@ def __get_boolean(key, default="NO"):
STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'

# We allow CORS from localhost:8080
CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "localhost:8080").split(","))
CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080").split(","))

# If auth is disabled, we just use our "bypass" authentication middleware
if bool(os.getenv("PAPERLESS_DISABLE_LOGIN", "false").lower() in ("yes", "y", "1", "t", "true")):
Expand Down
Empty file added src/paperless_tika/__init__.py
Empty file.
16 changes: 16 additions & 0 deletions src/paperless_tika/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from django.apps import AppConfig


class PaperlessTikaConfig(AppConfig):
    """Django application configuration for the ``paperless_tika`` app."""

    name = "paperless_tika"

    def ready(self):
        """
        Connect this app's ``ConsumerDeclaration.handle`` to the
        ``document_consumer_declaration`` signal, then run the base
        class ``ready``.
        """
        # Imported inside ready() rather than at module level, as in the
        # original -- presumably so nothing is imported before Django has
        # finished loading the apps (TODO confirm).
        from documents.signals import (
            document_consumer_declaration as declaration_signal,
        )

        from .signals import ConsumerDeclaration

        declaration_signal.connect(ConsumerDeclaration.handle)

        super().ready()
112 changes: 112 additions & 0 deletions src/paperless_tika/parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import os
import shlex
import subprocess

import tika
from django.conf import settings
from requests.exceptions import ConnectionError
from tika import parser

from documents.parsers import DocumentParser, ParseError
from paperless_tesseract.parsers import strip_excess_whitespace


class TikaDocumentParser(DocumentParser):
    """
    This parser uses Apache Tika to try and get some text out of office
    formats, whether it's an OpenDocument format (ODS, ODT, ODP) or an
    MS Office format (XLS, XLSX, DOC, DOCX, PPT, PPTX).
    """

    CONVERT = settings.CONVERT_BINARY

    # Address of the Tika server the document is sent to.  The host name
    # matches the ``tika-server`` service of the example docker-compose file.
    TIKA_ENDPOINT = "http://tika-server:9998"

    def __init__(self, path):
        super().__init__(path)
        # Cache for the extracted text; None means "not extracted yet".
        self._text = None

    def get_thumbnail(self):
        """
        Render the first lines of the extracted text onto a 500x647 px
        rounded white page and return the path of the resulting PNG.

        The image is built with the ``convert`` binary in three steps:
        a text layer, a background layer, and a merge of the two.

        Raises ParseError (via run_command) if a convert call fails, or
        (via get_text) if the text cannot be fetched.
        """
        # The below is heavily cribbed from https://askubuntu.com/a/590951

        bg_color = "white"  # bg color
        text_color = "black"  # text color
        psize = [500, 647]  # icon size
        n_lines = 50  # number of lines to show
        out_path = os.path.join(self.tempdir, "convert.png")

        temp_bg = os.path.join(self.tempdir, "bg.png")
        temp_txlayer = os.path.join(self.tempdir, "tx.png")
        picsize = "x".join([str(n) for n in psize])
        txsize = "x".join([str(n - 8) for n in psize])

        def create_bg():
            work_size = ",".join([str(n - 1) for n in psize])
            r = str(round(psize[0] / 10))
            rounded = ",".join([r, r])
            draw = "fill {} roundrectangle 0,0,{},{}".format(
                bg_color, work_size, rounded
            )
            run_command(
                self.CONVERT,
                "-size", picsize,
                "xc:none",
                "-draw", shlex.quote(draw),
                shlex.quote(temp_bg)
            )

        def read_text():
            # Only the top of the document fits on the thumbnail, so do not
            # push the whole document onto the command line.
            return "\n".join(self.get_text().splitlines()[:n_lines])

        def create_txlayer():
            # run_command() hands a single string to the shell, and the
            # caption is text taken from the document itself.  It must be
            # shell-quoted, otherwise a quote, "$" or backtick inside the
            # document breaks (or injects into) the command line.
            caption = shlex.quote("caption: {} ".format(read_text()))
            run_command(
                self.CONVERT,
                "-background none",
                "-fill",
                text_color,
                "-pointsize", "12",
                "-border 4 -bordercolor none",
                "-size", txsize,
                caption,
                shlex.quote(temp_txlayer)
            )

        create_txlayer()
        create_bg()
        run_command(
            self.CONVERT,
            shlex.quote(temp_bg),
            shlex.quote(temp_txlayer),
            "-background None -layers merge",
            shlex.quote(out_path)
        )

        return out_path

    def get_text(self):
        """
        Return the text Tika extracts from the document, with excess
        whitespace stripped.  The result is cached on the instance, so the
        Tika server is contacted at most once per parser.

        Raises ParseError if the connection to the Tika server fails.
        """

        if self._text is not None:
            return self._text

        try:
            # The original call passed "all" and the endpoint positionally
            # as a workaround for tika-python#273, which ties it to one
            # particular parameter order.  Keyword arguments do not depend
            # on the order.
            result = parser.from_file(
                self.document_path,
                service="all",
                serverEndpoint=self.TIKA_ENDPOINT
            )
        except ConnectionError as e:
            raise ParseError(e) from e

        # Tika may hand back no content at all (e.g. nothing extractable);
        # treat that as empty text instead of crashing on None.
        content = result.get("content") or ""

        # Strip out excess white space to allow matching to go smoother
        self._text = strip_excess_whitespace(content)
        return self._text


def run_command(*args):
    """
    Join ``args`` with single spaces and run the result through the shell.

    The child process gets a copy of the current environment.
    MAGICK_MEMORY_LIMIT and MAGICK_TMPDIR are added to it when the
    corresponding CONVERT_MEMORY_LIMIT / CONVERT_TMPDIR settings are set.

    Raises ParseError if the command exits with a non-zero status.
    """
    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    child_env = dict(os.environ)
    child_env.update({key: value for key, value in overrides if value})

    # The arguments are handed to the shell as one string, so callers are
    # responsible for any quoting their arguments need.
    command_line = " ".join(args)
    exit_code = subprocess.call(command_line, env=child_env, shell=True)
    if exit_code != 0:
        raise ParseError("Convert failed at {}".format(args))
23 changes: 23 additions & 0 deletions src/paperless_tika/signals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import re

from .parsers import TikaDocumentParser


class ConsumerDeclaration:
    """
    Declares TikaDocumentParser as the parser for office documents
    (ODS, ODT, ODP, XLS, XLSX, DOC, DOCX, PPT, PPTX).
    """

    # Applied to the lower-cased file name, so the match is effectively
    # case-insensitive.
    MATCHING_FILES = re.compile(r"^.*\.(ods|odt|odp|xlsx?|docx?|pptx?)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        """Signal receiver: return the callable that tests a file name."""
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return the parser declaration (parser class and weight) when the
        file name ``doc`` has a supported office extension, else None.
        """
        file_name = doc.lower()
        if cls.MATCHING_FILES.match(file_name) is None:
            return None

        declaration = {
            "parser": TikaDocumentParser,
            "weight": 10
        }
        return declaration
Empty file.
39 changes: 39 additions & 0 deletions src/paperless_tika/tests/test_signals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from django.test import TestCase

from ..signals import ConsumerDeclaration


class SignalsTestCase(TestCase):
    """Checks which file names ConsumerDeclaration.test() accepts."""

    def test_test_handles_various_file_names_true(self):
        """Every office extension is accepted, whatever its letter case."""

        prefixes = (
            "doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags",
            "A document with a . in it", "Doc with -- in it"
        )
        suffixes = (
            "ods", "odt", "odp", "xls", "xlsx", "doc", "docx", "ppt", "pptx",
            "ODS", "ODT", "ODP", "XLS", "XLSX", "DOC", "DOCX", "PPT", "PPTX",
            "oDs", "oDt", "oDp", "xLs", "xLsX", "dOc", "dOcX", "pPt", "pPtX",
        )

        file_names = [
            "{}.{}".format(prefix, suffix)
            for prefix in prefixes
            for suffix in suffixes
        ]
        for file_name in file_names:
            self.assertTrue(ConsumerDeclaration.test(file_name))

    def test_test_handles_various_file_names_false(self):
        """Non-office extensions and extension-less names are rejected."""

        prefixes = ("doc",)
        suffixes = (
            "pdf", "jpg", "jpeg", "gif", "png", "tiff", "tif", "pnm",
            "bmp", "txt", "markdown", "",
        )

        file_names = [
            "{}.{}".format(prefix, suffix)
            for prefix in prefixes
            for suffix in suffixes
        ]
        for file_name in file_names:
            self.assertFalse(ConsumerDeclaration.test(file_name))

        # Names without any extension at all.
        for file_name in ("", "doc"):
            self.assertFalse(ConsumerDeclaration.test(file_name))
Loading