Files
paperless-ngx/src/documents/management/commands/document_consumer.py

273 lines
7.7 KiB
Python
Raw Normal View History

2015-12-26 13:21:33 +00:00
import datetime
2015-12-20 19:23:33 +00:00
import glob
2016-01-21 12:50:22 -05:00
import langdetect
2015-12-20 19:23:33 +00:00
import os
import random
import re
import subprocess
import time
2015-12-20 19:23:33 +00:00
import pyocr
from PIL import Image
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import slugify
2015-12-26 13:21:33 +00:00
from django.utils import timezone
2015-12-20 19:23:33 +00:00
2016-01-23 02:58:03 +00:00
from paperless.db import GnuPG
2016-01-23 02:33:29 +00:00
from ...languages import ISO639
2016-01-28 07:23:11 +00:00
from ...models import Document, Sender, Tag
2015-12-20 19:23:33 +00:00
2016-01-23 02:33:29 +00:00
class OCRError(Exception):
    """
    Raised when a document cannot be OCR'd.

    Derives from ``Exception`` rather than ``BaseException`` so that generic
    ``except Exception`` handlers treat it like any other application error;
    ``BaseException`` is meant for interpreter-level exits such as
    ``KeyboardInterrupt`` and ``SystemExit``.
    """
2015-12-20 19:23:33 +00:00
class Command(BaseCommand):
    """
    Loop over every file found in CONSUMPTION_DIR and:
      1. Convert it to a greyscale png
      2. OCR the png(s) with the first tool pyocr reports as available
      3. Store the OCR'd text in the database
      4. Encrypt and store the document under MEDIA_ROOT
      5. Delete the document and image(s)
    """

    LOOP_TIME = 10  # Seconds

    CONVERT = settings.CONVERT_BINARY
    SCRATCH = settings.SCRATCH_DIR
    CONSUME = settings.CONSUMPTION_DIR

    # NOTE(review): raises IndexError at import time if pyocr finds no OCR
    # tool on this machine.
    OCR = pyocr.get_available_tools()[0]
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    MEDIA_DOCS = os.path.join(settings.MEDIA_ROOT, "documents")

    # "<anything>/<title>.<suffix>"
    PARSER_REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
    # "<anything>/<sender> - <title>.<suffix>"; anchored with "$" so the
    # suffix has to be at the end of the name, same as PARSER_REGEX_TITLE.
    PARSER_REGEX_SENDER_TITLE = re.compile(
        r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)

    def __init__(self, *args, **kwargs):
        """Initialise per-run state before delegating to BaseCommand."""
        self.verbosity = 0
        # path -> mtime seen on the previous pass (see _is_ready)
        self.stats = {}
        # Paths whose OCR failed; they are skipped for the rest of the run
        self._ignore = []

        super().__init__(*args, **kwargs)

    def handle(self, *args, **options):
        """
        Entry point: poll the consumption directory every LOOP_TIME seconds
        until interrupted with Ctrl-C.
        """
        self.verbosity = options["verbosity"]

        self._setup()

        try:
            while True:
                self.loop()
                time.sleep(self.LOOP_TIME)
                if self.verbosity > 1:
                    print(".")
        except KeyboardInterrupt:
            print("Exiting")

    def loop(self):
        """
        Run one pass over the consumption directory, consuming every
        supported file that has stopped changing.
        """
        for name in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, name)

            if not os.path.isfile(doc):
                continue

            if not self.PARSER_REGEX_TITLE.match(doc):
                continue

            if doc in self._ignore:
                continue

            # Skip files that may still be being written.  The original
            # condition was inverted: it consumed files the first time they
            # were seen and skipped them once they had become stable.
            if not self._is_ready(doc):
                continue

            self._render("Consuming {}".format(doc), 1)

            pngs = self._get_greyscale(doc)

            try:
                text = self._get_ocr(pngs)
            except OCRError:
                self._ignore.append(doc)
                self._render("OCR FAILURE: {}".format(doc), 0)
                # The source document stays where it is, but the scratch
                # images are of no further use; don't leak them.
                self._remove_files(pngs)
                continue

            self._store(text, doc)
            self._cleanup(pngs, doc)

    def _setup(self):
        """
        Validate the consumption directory and create the scratch and media
        directories if they are missing.

        Raises CommandError if CONSUMPTION_DIR is unset or does not exist.
        """
        if not self.CONSUME:
            raise CommandError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )

        if not os.path.exists(self.CONSUME):
            raise CommandError("Consumption directory {} does not exist".format(
                self.CONSUME))

        for d in (self.SCRATCH, self.MEDIA_DOCS):
            # Unlike the previous try/except FileExistsError, this still
            # raises if the path exists but is not a directory.
            os.makedirs(d, exist_ok=True)

    def _is_ready(self, doc):
        """
        Detect whether `doc` is ready to consume or if it's still being written
        to by the scanner.

        Returns True when the file's mtime equals the one recorded on the
        previous call for the same path (and forgets the record); otherwise
        records the current mtime and returns False.
        """
        t = os.stat(doc).st_mtime

        if self.stats.get(doc) == t:
            del self.stats[doc]
            return True

        self.stats[doc] = t

        return False

    def _get_greyscale(self, doc):
        """
        Convert `doc` into greyscale png file(s) in the scratch directory by
        running the CONVERT binary, and return the sorted list of files whose
        names start with the randomly chosen numeric prefix.

        The exit status of CONVERT is not checked, so the returned list is
        empty if the conversion produced nothing.
        """
        self._render(" Generating greyscale image", 2)

        i = random.randint(1000000, 9999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        subprocess.call((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", doc, png
        ))

        return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))

    def _get_ocr(self, pngs):
        """
        OCR `pngs` with the default language, guess the language of the
        result, and re-run the OCR in the guessed language if it differs.

        Returns the OCR'd text.  Raises OCRError when the language cannot be
        determined or the second OCR pass fails, unless
        settings.FORGIVING_OCR is set, in which case the text from the first
        pass is returned instead.
        """
        self._render(" OCRing the document", 2)

        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)

        try:
            guessed_language = langdetect.detect(raw_text)
        except langdetect.lang_detect_exception.LangDetectException:
            # langdetect raises when the text has no usable features (e.g. a
            # blank page).  Treat that like any other detection failure
            # instead of letting it abort the whole consumer loop.
            guessed_language = None
        else:
            self._render(
                " Language detected: {}".format(guessed_language), 2)

        if guessed_language is None or guessed_language not in ISO639:
            self._render("Language detection failed!", 0)
            if settings.FORGIVING_OCR:
                self._render(
                    "As FORGIVING_OCR is enabled, we're going to make the best "
                    "with what we have.",
                    1
                )
                return raw_text
            raise OCRError

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            return raw_text

        try:
            return self._ocr(pngs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self._render(
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    ),
                    0
                )
                return raw_text
            raise OCRError

    def _ocr(self, pngs, lang):
        """
        OCR every image in `pngs` using language `lang` and return the
        concatenated text with each run of whitespace collapsed to a single
        space.

        `pngs` are the paths returned by _get_greyscale, which already
        include the scratch directory, so they are opened as-is.
        """
        self._render(" Parsing for {}".format(lang), 2)

        pages = []
        for png in pngs:
            with Image.open(png) as f:
                self._render(" {}".format(f.filename), 3)
                pages.append(self.OCR.image_to_string(f, lang=lang))

        # Strip out excess white space to allow matching to go smoother
        return re.sub(r"\s+", " ", "".join(pages))

    def _store(self, text, doc):
        """
        Create the Document record for `doc` with `text` as its content,
        attach every Tag whose `matches()` accepts the lower-cased text, and
        write the GnuPG-processed file contents to the document's
        `source_path`.
        """
        sender, title, file_type = self._parse_file_name(doc)

        lower_text = text.lower()
        relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]

        # The file's mtime is used for both the created and modified stamps
        stats = os.stat(doc)
        timestamp = timezone.make_aware(
            datetime.datetime.fromtimestamp(stats.st_mtime))

        self._render(" Saving record to database", 2)

        document = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            file_type=file_type,
            created=timestamp,
            modified=timestamp
        )

        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
            self._render(" Tagging with {}".format(tag_names), 2)
            document.tags.add(*relevant_tags)

        with open(doc, "rb") as unencrypted, \
                open(document.source_path, "wb") as encrypted:
            self._render(" Encrypting", 3)
            encrypted.write(GnuPG.encrypted(unencrypted))

    def _parse_file_name(self, doc):
        """
        We use a crude naming convention to make handling the sender and title
        easier:
          "<sender> - <title>.<suffix>"

        Returns a (sender, title, file_type) tuple.  `sender` is a Sender
        (fetched or created by name) or None when the name has no
        "<sender> - " part.  `doc` must match PARSER_REGEX_TITLE.
        """
        # First we attempt "<sender> - <title>.<suffix>"
        m = self.PARSER_REGEX_SENDER_TITLE.match(doc)
        if m:
            sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
            sender, __ = Sender.objects.get_or_create(
                name=sender_name, defaults={"slug": slugify(sender_name)})
            return sender, title, file_type

        # That didn't work, so we assume sender is None
        m = self.PARSER_REGEX_TITLE.match(doc)
        return None, m.group(1), m.group(2)

    def _cleanup(self, pngs, doc):
        """
        Delete the scratch images in `pngs` and the consumed document `doc`.

        `pngs` is already the complete list of scratch files for the
        document, so it is deleted directly rather than re-derived by
        globbing on the first entry (which failed on an empty list).
        """
        self._remove_files(list(pngs) + [doc])
        self._render("", 2)

    def _remove_files(self, paths):
        """Delete every path in `paths`, announcing each at verbosity 2."""
        for f in paths:
            self._render(" Deleting {}".format(f), 2)
            os.unlink(f)

    def _render(self, text, verbosity):
        """Print `text` if the command's verbosity is at least `verbosity`."""
        if self.verbosity >= verbosity:
            print(text)