Files
paperless-ngx/src/paperless_tesseract/tests/test_parser.py

914 lines
31 KiB
Python
Raw Normal View History

import shutil
import tempfile
2020-11-19 20:28:41 +01:00
import uuid
from pathlib import Path
2020-11-19 20:28:41 +01:00
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from ocrmypdf import SubprocessOutputError
from documents.parsers import ParseError
from documents.parsers import run_convert
2020-11-27 14:06:37 +01:00
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_tesseract.parsers import post_process_text
2020-11-19 20:28:41 +01:00
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
2022-11-21 14:56:14 -08:00
2020-11-29 19:58:48 +01:00
def assertContainsStrings(self, content, strings):
# Asserts that all strings appear in content, in the given order.
2021-02-21 00:18:34 +01:00
indices = []
for s in strings:
if s in content:
indices.append(content.index(s))
else:
self.fail(f"'{s}' is not in '{content}'")
2020-11-29 19:58:48 +01:00
self.assertListEqual(indices, sorted(indices))
def test_post_process_text(self):
2022-11-21 14:56:14 -08:00
text_cases = [
("simple string", "simple string"),
("simple newline\n testing string", "simple newline\ntesting string"),
(
"utf-8 строка с пробелами в конце ",
"utf-8 строка с пробелами в конце",
),
]
for source, result in text_cases:
actual_result = post_process_text(source)
2020-11-27 14:06:37 +01:00
self.assertEqual(
result,
actual_result,
Chore(deps-dev): Bump the development group with 3 updates (#6079) * Chore(deps-dev): Bump the development group with 3 updates Bumps the development group with 3 updates: [ruff](https://github.com/astral-sh/ruff), [pytest](https://github.com/pytest-dev/pytest) and [mkdocs-material](https://github.com/squidfunk/mkdocs-material). Updates `ruff` from 0.3.0 to 0.3.2 - [Release notes](https://github.com/astral-sh/ruff/releases) - [Changelog](https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md) - [Commits](https://github.com/astral-sh/ruff/compare/v0.3.0...v0.3.2) Updates `pytest` from 8.0.2 to 8.1.1 - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/8.0.2...8.1.1) Updates `mkdocs-material` from 9.5.12 to 9.5.13 - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.5.12...9.5.13) --- updated-dependencies: - dependency-name: ruff dependency-type: direct:development update-type: version-update:semver-patch dependency-group: development - dependency-name: pytest dependency-type: direct:development update-type: version-update:semver-minor dependency-group: development - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch dependency-group: development ... Signed-off-by: dependabot[bot] <support@github.com> * Updates pre-commit hook versions and runs it against all files --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Trenton H <797416+stumpylog@users.noreply.github.com>
2024-03-12 07:56:01 -07:00
f"strip_exceess_whitespace({source}) != '{result}', but '{actual_result}'",
2020-11-27 14:06:37 +01:00
)
2020-11-19 20:28:41 +01:00
def test_get_text_from_pdf(self):
2021-02-21 00:18:34 +01:00
parser = RasterisedDocumentParser(uuid.uuid4())
2022-02-27 15:26:41 +01:00
text = parser.extract_text(
None,
self.SAMPLE_FILES / "simple-digital.pdf",
2022-02-27 15:26:41 +01:00
)
2020-11-19 20:28:41 +01:00
2020-11-29 19:58:48 +01:00
self.assertContainsStrings(text.strip(), ["This is a test document."])
2020-11-19 20:28:41 +01:00
def test_get_page_count(self):
"""
GIVEN:
- PDF file with a single page
- PDF file with multiple pages
WHEN:
- The number of pages is requested
THEN:
- The method returns 1 as the expected number of pages
- The method returns the correct number of pages (6)
"""
parser = RasterisedDocumentParser(uuid.uuid4())
page_count = parser.get_page_count(
str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
self.assertEqual(page_count, 1)
page_count = parser.get_page_count(
str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertEqual(page_count, 6)
def test_get_page_count_password_protected(self):
"""
GIVEN:
- Password protected PDF file
WHEN:
- The number of pages is requested
THEN:
- The method returns None
"""
parser = RasterisedDocumentParser(uuid.uuid4())
with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
page_count = parser.get_page_count(
str(self.SAMPLE_FILES / "password-protected.pdf"),
"application/pdf",
)
self.assertEqual(page_count, None)
self.assertIn("Unable to determine PDF page count", cm.output[0])
2020-11-19 20:28:41 +01:00
def test_thumbnail(self):
2020-11-25 19:51:09 +01:00
parser = RasterisedDocumentParser(uuid.uuid4())
2022-02-27 15:26:41 +01:00
thumb = parser.get_thumbnail(
str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsFile(thumb)
2020-11-19 20:28:41 +01:00
2021-01-02 15:37:27 +01:00
@mock.patch("documents.parsers.run_convert")
2020-11-19 20:28:41 +01:00
def test_thumbnail_fallback(self, m):
def call_convert(input_file, output_file, **kwargs):
if ".pdf" in str(input_file):
2020-11-19 20:28:41 +01:00
raise ParseError("Does not compute.")
else:
run_convert(input_file=input_file, output_file=output_file, **kwargs)
m.side_effect = call_convert
2020-11-25 19:51:09 +01:00
parser = RasterisedDocumentParser(uuid.uuid4())
2022-02-27 15:26:41 +01:00
thumb = parser.get_thumbnail(
str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsFile(thumb)
2021-03-14 14:42:48 +01:00
def test_thumbnail_encrypted(self):
parser = RasterisedDocumentParser(uuid.uuid4())
2022-02-27 15:26:41 +01:00
thumb = parser.get_thumbnail(
str(self.SAMPLE_FILES / "encrypted.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsFile(thumb)
2020-11-26 00:08:23 +01:00
def test_get_dpi(self):
parser = RasterisedDocumentParser(None)
dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple-no-dpi.png"))
2020-11-26 00:08:23 +01:00
self.assertEqual(dpi, None)
dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple.png"))
2020-11-26 00:08:23 +01:00
self.assertEqual(dpi, 72)
def test_simple_digital(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
self.assertIsFile(parser.archive_path)
2020-11-26 00:08:23 +01:00
2020-11-29 19:58:48 +01:00
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
2020-11-26 00:08:23 +01:00
def test_with_form(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
self.assertIsFile(parser.archive_path)
2020-11-26 00:08:23 +01:00
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text(),
["Please enter your name in here:", "This is a PDF document with a form."],
)
2020-11-26 00:08:23 +01:00
@override_settings(OCR_MODE="redo")
def test_with_form_error(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
self.assertIsNone(parser.archive_path)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text(),
["Please enter your name in here:", "This is a PDF document with a form."],
)
2020-11-26 00:08:23 +01:00
2021-03-14 14:42:48 +01:00
@override_settings(OCR_MODE="skip")
def test_signed(self):
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "signed.pdf"), "application/pdf")
2021-03-14 14:42:48 +01:00
self.assertIsNone(parser.archive_path)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text(),
[
"This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable",
"automated testing of signed/encrypted PDFs",
],
)
2021-03-14 14:42:48 +01:00
2021-02-21 00:18:34 +01:00
@override_settings(OCR_MODE="skip")
def test_encrypted(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "encrypted.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
2021-02-21 00:18:34 +01:00
self.assertIsNone(parser.archive_path)
2021-03-14 14:42:48 +01:00
self.assertEqual(parser.get_text(), "")
2020-11-26 00:08:23 +01:00
@override_settings(OCR_MODE="redo")
def test_with_form_error_notext(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text(),
["Please enter your name in here:", "This is a PDF document with a form."],
)
2020-11-26 00:08:23 +01:00
@override_settings(OCR_MODE="force")
def test_with_form_force(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text(),
["Please enter your name in here:", "This is a PDF document with a form."],
)
2020-11-26 00:08:23 +01:00
def test_image_simple(self):
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.png"), "image/png")
2020-11-26 00:08:23 +01:00
self.assertIsFile(parser.archive_path)
2020-11-26 00:08:23 +01:00
2020-11-29 19:58:48 +01:00
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
2020-11-26 00:08:23 +01:00
2022-02-21 22:38:25 +01:00
def test_image_simple_alpha(self):
2020-11-26 00:08:23 +01:00
parser = RasterisedDocumentParser(None)
with tempfile.TemporaryDirectory() as tempdir:
# Copy sample file to temp directory, as the parsing changes the file
# and this makes it modified to Git
sample_file = self.SAMPLE_FILES / "simple-alpha.png"
dest_file = Path(tempdir) / "simple-alpha.png"
shutil.copy(sample_file, dest_file)
2020-11-26 00:08:23 +01:00
parser.parse(str(dest_file), "image/png")
2022-02-21 22:38:25 +01:00
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
2020-11-26 00:08:23 +01:00
def test_image_calc_a4_dpi(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
dpi = parser.calculate_a4_dpi(
str(self.SAMPLE_FILES / "simple-no-dpi.png"),
2022-02-27 15:26:41 +01:00
)
self.assertEqual(dpi, 62)
2020-11-26 00:08:23 +01:00
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
def test_image_dpi_fail(self, m):
m.return_value = None
2020-11-26 00:08:23 +01:00
parser = RasterisedDocumentParser(None)
def f():
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "simple-no-dpi.png"),
"image/png",
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
self.assertRaises(ParseError, f)
@override_settings(OCR_IMAGE_DPI=72, MAX_IMAGE_PIXELS=0)
2020-11-26 00:08:23 +01:00
def test_image_no_dpi_default(self):
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple-no-dpi.png"), "image/png")
2020-11-26 00:08:23 +01:00
self.assertIsFile(parser.archive_path)
2020-11-26 00:08:23 +01:00
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text().lower(),
["this is a test document."],
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
def test_multi_page(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsFile(parser.archive_path)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
@override_settings(OCR_PAGES=2, OCR_MODE="skip")
def test_multi_page_pages_skip(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsFile(parser.archive_path)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
def test_multi_page_pages_redo(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsFile(parser.archive_path)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
@override_settings(OCR_PAGES=2, OCR_MODE="force")
def test_multi_page_pages_force(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsFile(parser.archive_path)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
@override_settings(OCR_MODE="skip")
2020-11-26 00:08:23 +01:00
def test_multi_page_analog_pages_skip(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsFile(parser.archive_path)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
2022-02-27 15:26:41 +01:00
)
2020-11-26 00:08:23 +01:00
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
def test_multi_page_analog_pages_redo(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR of only pages 1 and 2 requested
- OCR mode set to redo
WHEN:
- Document is parsed
THEN:
- Text of page 1 and 2 extracted
- An archive file is created
"""
2020-11-26 00:08:23 +01:00
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsFile(parser.archive_path)
2020-11-29 19:58:48 +01:00
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
self.assertNotIn("page 3", parser.get_text().lower())
2020-11-26 00:08:23 +01:00
@override_settings(OCR_PAGES=1, OCR_MODE="force")
def test_multi_page_analog_pages_force(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR of only page 1 requested
- OCR mode set to force
WHEN:
- Document is parsed
THEN:
- Only text of page 1 is extracted
- An archive file is created
"""
2020-11-26 00:08:23 +01:00
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsFile(parser.archive_path)
2020-11-29 19:58:48 +01:00
self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
self.assertNotIn("page 2", parser.get_text().lower())
self.assertNotIn("page 3", parser.get_text().lower())
2020-12-01 14:30:13 +01:00
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_withtext(self):
"""
GIVEN:
- File with existing text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
2020-12-01 14:30:13 +01:00
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
2020-12-01 14:30:13 +01:00
self.assertIsNone(parser.archive_path)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
2022-02-27 15:26:41 +01:00
)
2020-12-01 14:30:13 +01:00
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_notext(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- An archive file is created with the OCRd text
"""
2020-12-01 14:30:13 +01:00
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
2022-02-27 15:26:41 +01:00
)
self.assertIsNotNone(parser.archive_path)
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
def test_skip_archive_never_withtext(self):
"""
GIVEN:
- File with existing text layer
- OCR_SKIP_ARCHIVE_FILE set to never
WHEN:
- Document is parsed
THEN:
- Text from text layer is extracted
- Archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
def test_skip_archive_never_withimages(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR_SKIP_ARCHIVE_FILE set to never
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- Archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withtext(self):
"""
GIVEN:
- File with existing text layer
- OCR_SKIP_ARCHIVE_FILE set to with_text
WHEN:
- Document is parsed
THEN:
- Text from text layer is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withimages(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR_SKIP_ARCHIVE_FILE set to with_text
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- Archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withtext(self):
"""
GIVEN:
- File with existing text layer
- OCR_SKIP_ARCHIVE_FILE set to always
WHEN:
- Document is parsed
THEN:
- Text from text layer is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withimages(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR_SKIP_ARCHIVE_FILE set to always
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
2021-02-21 00:18:34 +01:00
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
"""
GIVEN:
- File with some text contained in images and some in text layer
- OCR mode set to skip
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- An archive file is created with the OCRd text and the original text
"""
2021-02-21 00:18:34 +01:00
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
self.assertIsNotNone(parser.archive_path)
self.assertIsFile(parser.archive_path)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
)
2021-02-21 00:18:34 +01:00
with (parser.tempdir / "sidecar.txt").open() as f:
2021-02-21 00:18:34 +01:00
sidecar = f.read()
2021-02-28 13:01:26 +01:00
self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
2021-02-21 00:18:34 +01:00
2022-11-21 14:56:14 -08:00
@override_settings(OCR_MODE="redo")
def test_single_page_mixed(self):
"""
GIVEN:
- File with some text contained in images and some in text layer
- Text and images are mixed on the same page
- OCR mode set to redo
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- Full content of the file is parsed (not just the image text)
- An archive file is created with the OCRd text and the original text
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "single-page-mixed.pdf"),
2022-11-21 14:56:14 -08:00
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertIsFile(parser.archive_path)
2022-11-21 14:56:14 -08:00
self.assertContainsStrings(
parser.get_text().lower(),
[
"this is some normal text, present on page 1 of the document.",
"this is some text, but in an image, also on page 1.",
"this is further text on page 1.",
],
)
with (parser.tempdir / "sidecar.txt").open() as f:
2022-11-21 14:56:14 -08:00
sidecar = f.read().lower()
self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
self.assertNotIn(
"this is some normal text, present on page 1 of the document.",
sidecar,
)
2021-02-21 00:18:34 +01:00
@override_settings(OCR_MODE="skip_noarchive")
def test_multi_page_mixed_no_archive(self):
"""
GIVEN:
- File with some text contained in images and some in text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created as original file contains text
"""
2021-02-21 00:18:34 +01:00
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
parser.parse(
str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
"application/pdf",
2022-02-27 15:26:41 +01:00
)
2021-02-21 00:18:34 +01:00
self.assertIsNone(parser.archive_path)
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text().lower(),
["page 4", "page 5", "page 6"],
2022-02-27 15:26:41 +01:00
)
2021-02-21 00:18:34 +01:00
@override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
def test_rotate(self):
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "rotated.pdf"), "application/pdf")
2022-02-27 15:26:41 +01:00
self.assertContainsStrings(
parser.get_text(),
[
"This is the text that appears on the first page. Its a lot of text.",
"Even if the pages are rotated, OCRmyPDF still gets the job done.",
"This is a really weird file with lots of nonsense text.",
"If you read this, its your own fault. Also check your screen orientation.",
],
)
2021-02-21 00:18:34 +01:00
def test_multi_page_tiff(self):
"""
GIVEN:
- Multi-page TIFF image
WHEN:
- Image is parsed
THEN:
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.tiff"),
"image/tiff",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
def test_multi_page_tiff_alpha(self):
"""
GIVEN:
- Multi-page TIFF image
- Image include an alpha channel
WHEN:
- Image is parsed
THEN:
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
sample_file = self.SAMPLE_FILES / "multi-page-images-alpha.tiff"
with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name)
parser.parse(
tmp_file.name,
"image/tiff",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
def test_multi_page_tiff_alpha_srgb(self):
"""
GIVEN:
- Multi-page TIFF image
- Image include an alpha channel
- Image is srgb colorspace
WHEN:
- Image is parsed
THEN:
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
sample_file = str(
self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff",
)
with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name)
parser.parse(
tmp_file.name,
"image/tiff",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
2021-02-22 00:17:16 +01:00
def test_ocrmypdf_parameters(self):
parser = RasterisedDocumentParser(None)
2022-02-27 15:26:41 +01:00
params = parser.construct_ocrmypdf_parameters(
input_file="input.pdf",
output_file="output.pdf",
sidecar_file="sidecar.txt",
mime_type="application/pdf",
safe_fallback=False,
)
2021-02-22 00:17:16 +01:00
2022-02-27 15:26:41 +01:00
self.assertEqual(params["input_file"], "input.pdf")
self.assertEqual(params["output_file"], "output.pdf")
self.assertEqual(params["sidecar"], "sidecar.txt")
2021-02-22 00:17:16 +01:00
with override_settings(OCR_CLEAN="none"):
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
parser = RasterisedDocumentParser(None)
2021-02-22 00:17:16 +01:00
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("clean", params)
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean"):
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
parser = RasterisedDocumentParser(None)
2021-02-22 00:17:16 +01:00
params = parser.construct_ocrmypdf_parameters("", "", "", "")
2022-02-27 15:26:41 +01:00
self.assertTrue(params["clean"])
2021-02-22 00:17:16 +01:00
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
parser = RasterisedDocumentParser(None)
2021-02-22 00:17:16 +01:00
params = parser.construct_ocrmypdf_parameters("", "", "", "")
2022-02-27 15:26:41 +01:00
self.assertTrue(params["clean_final"])
2021-02-22 00:17:16 +01:00
self.assertNotIn("clean", params)
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
parser = RasterisedDocumentParser(None)
2021-02-22 00:17:16 +01:00
params = parser.construct_ocrmypdf_parameters("", "", "", "")
2022-02-27 15:26:41 +01:00
self.assertTrue(params["clean"])
2021-02-22 00:17:16 +01:00
self.assertNotIn("clean_final", params)
with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
parser = RasterisedDocumentParser(None)
2021-02-22 00:17:16 +01:00
params = parser.construct_ocrmypdf_parameters("", "", "", "")
2022-02-27 15:26:41 +01:00
self.assertTrue(params["deskew"])
2021-02-22 00:17:16 +01:00
with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
parser = RasterisedDocumentParser(None)
2021-02-22 00:17:16 +01:00
params = parser.construct_ocrmypdf_parameters("", "", "", "")
2022-02-27 15:26:41 +01:00
self.assertNotIn("deskew", params)
2021-02-22 00:17:16 +01:00
with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
parser = RasterisedDocumentParser(None)
2021-02-22 00:17:16 +01:00
params = parser.construct_ocrmypdf_parameters("", "", "", "")
2022-02-27 15:26:41 +01:00
self.assertNotIn("deskew", params)
with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertIn("max_image_mpixels", params)
self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("max_image_mpixels", params)
def test_rtl_language_detection(self):
"""
GIVEN:
- File with text in an RTL language
WHEN:
- Document is parsed
THEN:
- Text from the document is extracted
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "rtl-test.pdf"),
"application/pdf",
)
# Copied from the PDF to here. Don't even look at it
self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
@mock.patch("ocrmypdf.ocr")
def test_gs_rendering_error(self, m):
m.side_effect = SubprocessOutputError("Ghostscript PDF/A rendering failed")
parser = RasterisedDocumentParser(None)
self.assertRaises(
ParseError,
parser.parse,
str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).parent / "samples"
def test_bmp(self):
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.bmp"), "image/bmp")
self.assertIsFile(parser.archive_path)
2022-11-14 15:38:35 -08:00
self.assertIn("this is a test document", parser.get_text().lower())
def test_jpg(self):
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.jpg"), "image/jpeg")
self.assertIsFile(parser.archive_path)
2022-11-14 15:38:35 -08:00
self.assertIn("this is a test document", parser.get_text().lower())
def test_heic(self):
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.heic"), "image/heic")
self.assertIsFile(parser.archive_path)
self.assertIn("pizza", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=200)
def test_gif(self):
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.gif"), "image/gif")
self.assertIsFile(parser.archive_path)
2022-11-14 15:38:35 -08:00
self.assertIn("this is a test document", parser.get_text().lower())
def test_tiff(self):
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.tif"), "image/tiff")
self.assertIsFile(parser.archive_path)
2022-11-14 15:38:35 -08:00
self.assertIn("this is a test document", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=72)
def test_webp(self):
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "document.webp"),
"image/webp",
)
self.assertIsFile(parser.archive_path)
# Older tesseracts consistently mangle the space between "a webp",
# tesseract 5.3.0 seems to do a better job, so we're accepting both
self.assertRegex(
2022-11-14 15:38:35 -08:00
parser.get_text().lower(),
r"this is a ?webp document, created 11/14/2022.",
2022-11-14 15:38:35 -08:00
)