Files
paperless-ngx/src/documents/management/commands/document_consumer.py

200 lines
5.4 KiB
Python
Raw Normal View History

2015-12-26 13:21:33 +00:00
import datetime
2015-12-20 19:23:33 +00:00
import glob
2016-01-01 16:13:59 +00:00
import gnupg
2015-12-20 19:23:33 +00:00
import os
import random
import re
import subprocess
import time
2015-12-20 19:23:33 +00:00
import pyocr
from PIL import Image
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import slugify
2015-12-26 13:21:33 +00:00
from django.utils import timezone
2015-12-20 19:23:33 +00:00
from documents.models import Document, Sender
from paperless.db import GnuPG
2015-12-20 19:23:33 +00:00
class Command(BaseCommand):
    """
    Watch CONSUMPTION_DIR and turn every finished PDF into a Document.

    For each PDF whose modification time is unchanged between two polls:
      1. Convert it to greyscale png image(s)
      2. Run OCR over the image(s)
      3. Store the OCR'd text in the database
      4. Encrypt the PDF and store it at the new document's pdf_path
      5. Delete the pdf and the scratch image(s)
    """

    LOOP_TIME = 10  # Seconds to sleep between polls

    CONVERT = settings.CONVERT_BINARY
    SCRATCH = settings.SCRATCH_DIR
    CONSUME = settings.CONSUMPTION_DIR

    # First OCR tool reported by pyocr; an empty list raises IndexError as
    # soon as this class body is evaluated.
    OCR = pyocr.get_available_tools()[0]
    OCR_LANG = settings.TESSERACT_LANGUAGE

    MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")

    # "<anything>/<title>.pdf"
    PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
    # "<anything>/<sender> - <title>.pdf"
    PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")

    # Trailing "-<page>.png" suffix of a scratch image, used for page order.
    _PAGE_SUFFIX = re.compile(r"-(\d+)\.png$")

    def __init__(self, *args, **kwargs):
        """Initialise per-run state and hand the arguments to BaseCommand."""

        self.verbosity = 0
        # Maps a pdf path to the mtime seen on the previous poll.
        self.stats = {}
        self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

        super().__init__(*args, **kwargs)

    def handle(self, *args, **options):
        """
        Entry point: poll the consumption directory until interrupted.

        Raises CommandError if the consumption directory does not exist.
        A KeyboardInterrupt ends the loop and prints "Exiting".
        """

        self.verbosity = options["verbosity"]

        if not os.path.exists(self.CONSUME):
            raise CommandError("Consumption directory {} does not exist".format(
                self.CONSUME))

        self._setup()

        try:
            while True:
                self.loop()
                time.sleep(self.LOOP_TIME)
                if self.verbosity > 1:
                    print(".")
        except KeyboardInterrupt:
            print("Exiting")

    def loop(self):
        """
        Make one pass over the consumption directory and consume every
        ".pdf" file that `_is_ready` reports as stable.
        """

        for pdf in os.listdir(self.CONSUME):

            pdf = os.path.join(self.CONSUME, pdf)

            if not os.path.isfile(pdf):
                continue

            if not re.match(self.PARSER_REGEX_TITLE, pdf):
                continue

            # Skip files that are NOT ready yet: they may still be being
            # written, and will be picked up on a later pass once their
            # mtime has stopped changing.
            if not self._is_ready(pdf):
                continue

            self._render("Consuming {}".format(pdf), 1)

            pngs = self._get_greyscale(pdf)
            text = self._get_ocr(pngs)
            self._store(text, pdf)
            self._cleanup(pngs, pdf)

    def _setup(self):
        """
        Make sure the scratch and media directories exist.

        Raises FileExistsError if one of the paths exists but is not a
        directory, rather than silently carrying on without it.
        """
        for d in (self.SCRATCH, self.MEDIA_PDF):
            os.makedirs(d, exist_ok=True)

    def _is_ready(self, pdf):
        """
        Detect whether `pdf` is ready to consume or if it's still being written
        to by the scanner.

        Returns True when the file's mtime equals the one recorded on the
        previous call for the same path (the record is then dropped);
        otherwise records the current mtime and returns False.
        """

        t = os.stat(pdf).st_mtime

        if self.stats.get(pdf) == t:
            del self.stats[pdf]
            return True

        self.stats[pdf] = t

        return False

    def _page_order(self, png):
        """Sort key putting "N-2.png" before "N-10.png" (numeric page order)."""
        m = self._PAGE_SUFFIX.search(png)
        return (int(m.group(1)) if m else 0, png)

    def _get_greyscale(self, pdf):
        """
        Run the convert binary on `pdf` and return the paths of the resulting
        greyscale image(s) in the scratch directory, in page order.
        """

        self._render(" Generating greyscale image", 2)

        # Random numeric prefix so the output of this run can be found again.
        i = random.randint(1000000, 9999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        subprocess.Popen((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", pdf, png
        )).wait()

        # More than one file may share the prefix (presumably one per page,
        # named "<i>-<page>.png"), so collect them all and order them by
        # page number rather than lexicographically.
        return sorted(
            glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))),
            key=self._page_order)

    def _get_ocr(self, pngs):
        """
        OCR every image in `pngs` and return the concatenated text, each
        image's text followed by eight newlines.
        """

        self._render(" OCRing the PDF", 2)

        pages = []
        for png in pngs:
            with Image.open(os.path.join(self.SCRATCH, png)) as f:
                self._render(" {}".format(f.filename), 3)
                pages.append(
                    self.OCR.image_to_string(f, lang=self.OCR_LANG))

        return "".join(page + "\n\n\n\n\n\n\n\n" for page in pages)

    def _store(self, text, pdf):
        """
        Create the Document row for `pdf` with `text` as its content, then
        write the encrypted PDF to the document's `pdf_path`.

        Both `created` and `modified` are set from the file's mtime.
        """

        sender, title = self._parse_file_name(pdf)
        stats = os.stat(pdf)

        self._render(" Saving record to database", 2)

        file_time = timezone.make_aware(
            datetime.datetime.fromtimestamp(stats.st_mtime))

        doc = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            created=file_time,
            modified=file_time
        )

        with open(pdf, "rb") as unencrypted, \
                open(doc.pdf_path, "wb") as encrypted:
            self._render(" Encrypting", 3)
            encrypted.write(GnuPG.encrypted(unencrypted))

    def _parse_file_name(self, pdf):
        """
        We use a crude naming convention to make handling the sender and title
        easier:
          "sender - title.pdf"

        Returns a `(sender, title)` tuple; `sender` is None when the file
        name carries no " - " separated sender part.
        """

        # First we attempt "sender - title.pdf"
        m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf)
        if m:
            sender_name, title = m.group(1), m.group(2)
            sender, __ = Sender.objects.get_or_create(
                name=sender_name, defaults={"slug": slugify(sender_name)})
            return sender, title

        # That didn't work, so we assume sender is None
        m = re.match(self.PARSER_REGEX_TITLE, pdf)
        return None, m.group(1)

    def _cleanup(self, pngs, pdf):
        """
        Delete the scratch images in `pngs` and the consumed `pdf`.

        The image paths are used exactly as given, so this also works when
        `pngs` is empty or holds a single un-suffixed image.
        """

        for f in list(pngs) + [pdf]:
            self._render(" Deleting {}".format(f), 2)
            os.unlink(f)

        self._render("", 2)

    def _render(self, text, verbosity):
        """Print `text` if the command's verbosity is at least `verbosity`."""
        if self.verbosity >= verbosity:
            print(text)