paperless-ngx/src/paperless_tesseract/tests/test_parser.py

import shutil
import tempfile
import uuid
from pathlib import Path
from unittest import mock

from django.test import TestCase
from django.test import override_settings
from ocrmypdf import SubprocessOutputError

from documents.parsers import ParseError
from documents.parsers import run_convert
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_tesseract.parsers import post_process_text


class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        """
        Assert that every entry of ``strings`` occurs in ``content``, in the
        given order.

        Each entry is searched for starting at the position where the previous
        entry matched, so an additional, earlier occurrence of a later entry
        does not cause a spurious ordering failure.

        :param content: the text to search
        :param strings: iterable of substrings expected to appear in order
        :raises AssertionError: (through ``self.fail``) if an entry is missing
            from ``content`` or only occurs before the preceding entry
        """
        search_from = 0
        for s in strings:
            if s not in content:
                self.fail(f"'{s}' is not in '{content}'")
            position = content.find(s, search_from)
            if position < 0:
                self.fail(
                    f"'{s}' does not appear in the expected order in '{content}'",
                )
            # Resume at the start of this match (not its end) so equal or
            # overlapping positions, which a comparison of first-occurrence
            # indices accepts, are still accepted here.
            search_from = position

    def test_post_process_text(self):
        """
        GIVEN:
            - Strings containing repeated spaces, an indented continuation
              line and trailing whitespace
        WHEN:
            - Each string is passed through post_process_text
        THEN:
            - The result equals the expected cleaned-up string
        """
        text_cases = [
            ("simple     string", "simple string"),
            ("simple    newline\n   testing string", "simple newline\ntesting string"),
            (
                "utf-8   строка с пробелами в конце  ",
                "utf-8 строка с пробелами в конце",
            ),
        ]

        for source, result in text_cases:
            # One sub-test per case, so a failing case does not hide the others
            with self.subTest(source=source):
                actual_result = post_process_text(source)
                self.assertEqual(
                    result,
                    actual_result,
                    f"post_process_text({source!r}) returned {actual_result!r}, "
                    f"expected {result!r}",
                )

    def test_get_text_from_pdf(self):
        """
        GIVEN:
            - The simple-digital.pdf sample file
        WHEN:
            - extract_text is called with None as its first argument and the
              sample path as its second
        THEN:
            - The stripped result contains the expected sentence
        """
        pdf_path = self.SAMPLE_FILES / "simple-digital.pdf"
        doc_parser = RasterisedDocumentParser(uuid.uuid4())

        extracted = doc_parser.extract_text(None, pdf_path)

        self.assertContainsStrings(extracted.strip(), ["This is a test document."])

    def test_get_page_count(self):
        """
        GIVEN:
            - A PDF sample with a single page
            - A PDF sample with multiple pages
        WHEN:
            - The page count of each file is requested from the same parser
        THEN:
            - 1 is returned for the single page file
            - 6 is returned for the multi page file
        """
        doc_parser = RasterisedDocumentParser(uuid.uuid4())
        expectations = (
            ("simple-digital.pdf", 1),
            ("multi-page-mixed.pdf", 6),
        )

        for file_name, expected_pages in expectations:
            page_count = doc_parser.get_page_count(
                str(self.SAMPLE_FILES / file_name),
                "application/pdf",
            )
            self.assertEqual(page_count, expected_pages)

    def test_get_page_count_password_protected(self):
        """
        GIVEN:
            - Password protected PDF file
        WHEN:
            - The number of pages is requested
        THEN:
            - The method returns None
            - A warning containing "Unable to determine PDF page count" is
              logged to paperless.parsing.tesseract
        """
        parser = RasterisedDocumentParser(uuid.uuid4())
        with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
            page_count = parser.get_page_count(
                str(self.SAMPLE_FILES / "password-protected.pdf"),
                "application/pdf",
            )

        # Checked after the context exits: if nothing was logged, assertLogs
        # reports that itself rather than cm.output[0] raising IndexError
        # inside the block.
        self.assertIsNone(page_count)
        self.assertIn("Unable to determine PDF page count", cm.output[0])

    def test_thumbnail(self):
        """
        GIVEN:
            - The simple-digital.pdf sample file
        WHEN:
            - A thumbnail is requested for it
        THEN:
            - The returned thumbnail path is an existing file
        """
        source_pdf = str(self.SAMPLE_FILES / "simple-digital.pdf")
        doc_parser = RasterisedDocumentParser(uuid.uuid4())

        thumbnail_path = doc_parser.get_thumbnail(source_pdf, "application/pdf")

        self.assertIsFile(thumbnail_path)

    @mock.patch("documents.parsers.run_convert")
    def test_thumbnail_fallback(self, m):
        """
        GIVEN:
            - documents.parsers.run_convert is patched so that any input whose
              path contains ".pdf" raises ParseError, while other inputs are
              passed on to the run_convert imported by this test module
        WHEN:
            - A thumbnail is requested for simple-digital.pdf
        THEN:
            - The returned thumbnail path is an existing file
        """

        def call_convert(input_file, output_file, **kwargs):
            # Guard clause: refuse PDF inputs, convert everything else
            if ".pdf" in str(input_file):
                raise ParseError("Does not compute.")
            run_convert(input_file=input_file, output_file=output_file, **kwargs)

        m.side_effect = call_convert

        source_pdf = str(self.SAMPLE_FILES / "simple-digital.pdf")
        doc_parser = RasterisedDocumentParser(uuid.uuid4())
        thumbnail_path = doc_parser.get_thumbnail(source_pdf, "application/pdf")

        self.assertIsFile(thumbnail_path)

    def test_thumbnail_encrypted(self):
        """
        GIVEN:
            - The encrypted.pdf sample file
        WHEN:
            - A thumbnail is requested for it
        THEN:
            - The returned thumbnail path is an existing file
        """
        source_pdf = str(self.SAMPLE_FILES / "encrypted.pdf")
        doc_parser = RasterisedDocumentParser(uuid.uuid4())

        thumbnail_path = doc_parser.get_thumbnail(source_pdf, "application/pdf")

        self.assertIsFile(thumbnail_path)

    def test_get_dpi(self):
        """
        GIVEN:
            - The simple-no-dpi.png sample image
            - The simple.png sample image
        WHEN:
            - The DPI of each image is requested
        THEN:
            - None is returned for simple-no-dpi.png
            - 72 is returned for simple.png
        """
        parser = RasterisedDocumentParser(None)

        dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple-no-dpi.png"))
        # Identity check: only None itself is acceptable here
        self.assertIsNone(dpi)

        dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple.png"))
        self.assertEqual(dpi, 72)

    def test_simple_digital(self):
        """
        GIVEN:
            - The simple-digital.pdf sample file
        WHEN:
            - The document is parsed
        THEN:
            - The archive path is an existing file
            - The text contains the expected sentence
        """
        source_pdf = self.SAMPLE_FILES / "simple-digital.pdf"
        doc_parser = RasterisedDocumentParser(None)

        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsFile(doc_parser.archive_path)
        parsed_text = doc_parser.get_text()
        self.assertContainsStrings(parsed_text, ["This is a test document."])

    def test_with_form(self):
        """
        GIVEN:
            - The with-form.pdf sample file
        WHEN:
            - The document is parsed
        THEN:
            - The archive path is an existing file
            - The text contains both expected sentences, in order
        """
        source_pdf = self.SAMPLE_FILES / "with-form.pdf"
        doc_parser = RasterisedDocumentParser(None)

        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsFile(doc_parser.archive_path)
        expected_in_order = [
            "Please enter your name in here:",
            "This is a PDF document with a form.",
        ]
        self.assertContainsStrings(doc_parser.get_text(), expected_in_order)

    @override_settings(OCR_MODE="redo")
    def test_with_form_error(self):
        """
        GIVEN:
            - The with-form.pdf sample file
            - OCR_MODE overridden to redo
        WHEN:
            - The document is parsed
        THEN:
            - The archive path is None
            - The text contains both expected sentences, in order
        """
        source_pdf = self.SAMPLE_FILES / "with-form.pdf"
        doc_parser = RasterisedDocumentParser(None)

        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNone(doc_parser.archive_path)
        expected_in_order = [
            "Please enter your name in here:",
            "This is a PDF document with a form.",
        ]
        self.assertContainsStrings(doc_parser.get_text(), expected_in_order)

    @override_settings(OCR_MODE="skip")
    def test_signed(self):
        """
        GIVEN:
            - The signed.pdf sample file
            - OCR_MODE overridden to skip
        WHEN:
            - The document is parsed
        THEN:
            - The archive path is None
            - The text contains both expected fragments, in order
        """
        source_pdf = self.SAMPLE_FILES / "signed.pdf"
        doc_parser = RasterisedDocumentParser(None)

        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNone(doc_parser.archive_path)
        expected_in_order = [
            "This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable",
            "automated testing of signed/encrypted PDFs",
        ]
        self.assertContainsStrings(doc_parser.get_text(), expected_in_order)

    @override_settings(OCR_MODE="skip")
    def test_encrypted(self):
        """
        GIVEN:
            - The encrypted.pdf sample file
            - OCR_MODE overridden to skip
        WHEN:
            - The document is parsed
        THEN:
            - The archive path is None
            - The text is the empty string
        """
        source_pdf = self.SAMPLE_FILES / "encrypted.pdf"
        doc_parser = RasterisedDocumentParser(None)

        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNone(doc_parser.archive_path)
        parsed_text = doc_parser.get_text()
        self.assertEqual(parsed_text, "")

    @override_settings(OCR_MODE="redo")
    def test_with_form_error_notext(self):
        """
        GIVEN:
            - The with-form.pdf sample file
            - OCR_MODE overridden to redo
        WHEN:
            - The document is parsed
        THEN:
            - The text contains both expected sentences, in order
        """
        source_pdf = self.SAMPLE_FILES / "with-form.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        expected_in_order = [
            "Please enter your name in here:",
            "This is a PDF document with a form.",
        ]
        self.assertContainsStrings(doc_parser.get_text(), expected_in_order)

    @override_settings(OCR_MODE="force")
    def test_with_form_force(self):
        """
        GIVEN:
            - The with-form.pdf sample file
            - OCR_MODE overridden to force
        WHEN:
            - The document is parsed
        THEN:
            - The text contains both expected sentences, in order
        """
        source_pdf = self.SAMPLE_FILES / "with-form.pdf"
        doc_parser = RasterisedDocumentParser(None)

        doc_parser.parse(str(source_pdf), "application/pdf")

        expected_in_order = [
            "Please enter your name in here:",
            "This is a PDF document with a form.",
        ]
        self.assertContainsStrings(doc_parser.get_text(), expected_in_order)

    def test_image_simple(self):
        """
        GIVEN:
            - The simple.png sample image
        WHEN:
            - The image is parsed
        THEN:
            - The archive path is an existing file
            - The text contains the expected sentence
        """
        source_image = self.SAMPLE_FILES / "simple.png"
        doc_parser = RasterisedDocumentParser(None)

        doc_parser.parse(str(source_image), "image/png")

        self.assertIsFile(doc_parser.archive_path)
        parsed_text = doc_parser.get_text()
        self.assertContainsStrings(parsed_text, ["This is a test document."])

    def test_image_simple_alpha(self):
        """
        GIVEN:
            - A copy of the simple-alpha.png sample image in a temporary
              directory
        WHEN:
            - The copy is parsed
        THEN:
            - The archive path is an existing file
            - The text contains the expected sentence
        """
        doc_parser = RasterisedDocumentParser(None)

        # Parsing changes the input file, which would show the sample as
        # modified in Git, so a throwaway copy is parsed instead
        with tempfile.TemporaryDirectory() as scratch_dir:
            working_copy = Path(scratch_dir) / "simple-alpha.png"
            shutil.copy(self.SAMPLE_FILES / "simple-alpha.png", working_copy)

            doc_parser.parse(str(working_copy), "image/png")

            self.assertIsFile(doc_parser.archive_path)
            parsed_text = doc_parser.get_text()
            self.assertContainsStrings(parsed_text, ["This is a test document."])

    def test_image_calc_a4_dpi(self):
        """
        GIVEN:
            - The simple-no-dpi.png sample image
        WHEN:
            - calculate_a4_dpi is called for it
        THEN:
            - 62 is returned
        """
        source_image = str(self.SAMPLE_FILES / "simple-no-dpi.png")
        doc_parser = RasterisedDocumentParser(None)

        calculated_dpi = doc_parser.calculate_a4_dpi(source_image)

        self.assertEqual(calculated_dpi, 62)

    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
    def test_image_dpi_fail(self, m):
        """
        GIVEN:
            - The simple-no-dpi.png sample image
            - calculate_a4_dpi patched to return None
        WHEN:
            - The image is parsed
        THEN:
            - ParseError is raised
        """
        m.return_value = None
        doc_parser = RasterisedDocumentParser(None)
        source_image = str(self.SAMPLE_FILES / "simple-no-dpi.png")

        with self.assertRaises(ParseError):
            doc_parser.parse(source_image, "image/png")

    @override_settings(OCR_IMAGE_DPI=72, MAX_IMAGE_PIXELS=0)
    def test_image_no_dpi_default(self):
        """
        GIVEN:
            - The simple-no-dpi.png sample image
            - OCR_IMAGE_DPI overridden to 72 and MAX_IMAGE_PIXELS to 0
        WHEN:
            - The image is parsed
        THEN:
            - The archive path is an existing file
            - The lower-cased text contains the expected sentence
        """
        source_image = self.SAMPLE_FILES / "simple-no-dpi.png"
        doc_parser = RasterisedDocumentParser(None)

        doc_parser.parse(str(source_image), "image/png")

        self.assertIsFile(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["this is a test document."])

    def test_multi_page(self):
        """
        GIVEN:
            - The multi-page-digital.pdf sample file
        WHEN:
            - The document is parsed
        THEN:
            - The archive path is an existing file
            - The lower-cased text contains "page 1", "page 2" and "page 3",
              in order
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-digital.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsFile(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="skip")
    def test_multi_page_pages_skip(self):
        """
        GIVEN:
            - The multi-page-digital.pdf sample file
            - OCR_PAGES overridden to 2 and OCR_MODE to skip
        WHEN:
            - The document is parsed
        THEN:
            - The archive path is an existing file
            - The lower-cased text contains "page 1", "page 2" and "page 3",
              in order
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-digital.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsFile(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
    def test_multi_page_pages_redo(self):
        """
        GIVEN:
            - The multi-page-digital.pdf sample file
            - OCR_PAGES overridden to 2 and OCR_MODE to redo
        WHEN:
            - The document is parsed
        THEN:
            - The archive path is an existing file
            - The lower-cased text contains "page 1", "page 2" and "page 3",
              in order
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-digital.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsFile(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="force")
    def test_multi_page_pages_force(self):
        """
        GIVEN:
            - The multi-page-digital.pdf sample file
            - OCR_PAGES overridden to 2 and OCR_MODE to force
        WHEN:
            - The document is parsed
        THEN:
            - The archive path is an existing file
            - The lower-cased text contains "page 1", "page 2" and "page 3",
              in order
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-digital.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsFile(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_MODE="skip")
    def test_multi_page_analog_pages_skip(self):
        """
        GIVEN:
            - The multi-page-images.pdf sample file
            - OCR_MODE overridden to skip
        WHEN:
            - The document is parsed
        THEN:
            - The archive path is an existing file
            - The lower-cased text contains "page 1", "page 2" and "page 3",
              in order
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-images.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsFile(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
    def test_multi_page_analog_pages_redo(self):
        """
        GIVEN:
            - A file whose text is contained in images, without a text layer
            - OCR restricted to pages 1 and 2
            - OCR mode redo
        WHEN:
            - The document is parsed
        THEN:
            - An archive file is created
            - The text of pages 1 and 2 is extracted
            - The text of page 3 is not extracted
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-images.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsFile(doc_parser.archive_path)
        expected_in_order = ["page 1", "page 2"]
        self.assertContainsStrings(doc_parser.get_text().lower(), expected_in_order)
        self.assertNotIn("page 3", doc_parser.get_text().lower())

    @override_settings(OCR_PAGES=1, OCR_MODE="force")
    def test_multi_page_analog_pages_force(self):
        """
        GIVEN:
            - A file whose text is contained in images, without a text layer
            - OCR restricted to page 1
            - OCR mode force
        WHEN:
            - The document is parsed
        THEN:
            - An archive file is created
            - Only the text of page 1 is extracted
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-images.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsFile(doc_parser.archive_path)
        self.assertContainsStrings(doc_parser.get_text().lower(), ["page 1"])
        # Pages beyond the first must not show up in the text
        for unexpected in ("page 2", "page 3"):
            self.assertNotIn(unexpected, doc_parser.get_text().lower())

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_withtext(self):
        """
        GIVEN:
            - A file with an existing text layer
            - OCR mode skip_noarchive
        WHEN:
            - The document is parsed
        THEN:
            - No archive file is created
            - The text of all three pages is extracted
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-digital.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNone(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_notext(self):
        """
        GIVEN:
            - A file whose text is contained in images, without a text layer
            - OCR mode skip_noarchive
        WHEN:
            - The document is parsed
        THEN:
            - The text from the images is extracted
            - An archive file with the OCR'd text is created
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-images.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

        self.assertIsNotNone(doc_parser.archive_path)

    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
    def test_skip_archive_never_withtext(self):
        """
        GIVEN:
            - A file with an existing text layer
            - OCR_SKIP_ARCHIVE_FILE is never
        WHEN:
            - The document is parsed
        THEN:
            - An archive file is created
            - The text from the text layer is extracted
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-digital.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNotNone(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
    def test_skip_archive_never_withimages(self):
        """
        GIVEN:
            - A file whose text is contained in images, without a text layer
            - OCR_SKIP_ARCHIVE_FILE is never
        WHEN:
            - The document is parsed
        THEN:
            - An archive file is created
            - The text from the images is extracted
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-images.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNotNone(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
    def test_skip_archive_withtext_withtext(self):
        """
        GIVEN:
            - A file with an existing text layer
            - OCR_SKIP_ARCHIVE_FILE is with_text
        WHEN:
            - The document is parsed
        THEN:
            - No archive file is created
            - The text from the text layer is extracted
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-digital.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNone(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
    def test_skip_archive_withtext_withimages(self):
        """
        GIVEN:
            - A file whose text is contained in images, without a text layer
            - OCR_SKIP_ARCHIVE_FILE is with_text
        WHEN:
            - The document is parsed
        THEN:
            - An archive file is created
            - The text from the images is extracted
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-images.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNotNone(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
    def test_skip_archive_always_withtext(self):
        """
        GIVEN:
            - A file with an existing text layer
            - OCR_SKIP_ARCHIVE_FILE is always
        WHEN:
            - The document is parsed
        THEN:
            - No archive file is created
            - The text from the text layer is extracted
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-digital.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNone(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
    def test_skip_archive_always_withimages(self):
        """
        GIVEN:
            - A file whose text is contained in images, without a text layer
            - OCR_SKIP_ARCHIVE_FILE is always
        WHEN:
            - The document is parsed
        THEN:
            - No archive file is created
            - The text from the images is extracted
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-images.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNone(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    @override_settings(OCR_MODE="skip")
    def test_multi_page_mixed(self):
        """
        GIVEN:
            - A file with some text contained in images and some in a text
              layer
            - OCR mode skip
        WHEN:
            - The document is parsed
        THEN:
            - An archive file with the OCR'd text and the original text is
              created
            - The text of all six pages is extracted
            - The sidecar file records that OCR was skipped on pages 4-6
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-mixed.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNotNone(doc_parser.archive_path)
        self.assertIsFile(doc_parser.archive_path)

        expected_in_order = [f"page {number}" for number in range(1, 7)]
        self.assertContainsStrings(doc_parser.get_text().lower(), expected_in_order)

        sidecar = (doc_parser.tempdir / "sidecar.txt").read_text()

        self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)

    @override_settings(OCR_MODE="redo")
    def test_single_page_mixed(self):
        """
        GIVEN:
            - A file with some text contained in images and some in a text
              layer
            - Text and images are mixed on the same page
            - OCR mode redo
        WHEN:
            - The document is parsed
        THEN:
            - An archive file with the OCR'd text and the original text is
              created
            - The full content of the file is parsed, not only the image text
            - The sidecar file contains the image text but not the normal text
        """
        normal_text = "this is some normal text, present on page 1 of the document."
        image_text = "this is some text, but in an image, also on page 1."

        source_pdf = self.SAMPLE_FILES / "single-page-mixed.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNotNone(doc_parser.archive_path)
        self.assertIsFile(doc_parser.archive_path)
        self.assertContainsStrings(
            doc_parser.get_text().lower(),
            [normal_text, image_text, "this is further text on page 1."],
        )

        sidecar = (doc_parser.tempdir / "sidecar.txt").read_text().lower()

        self.assertIn(image_text, sidecar)
        self.assertNotIn(normal_text, sidecar)

    @override_settings(OCR_MODE="skip_noarchive")
    def test_multi_page_mixed_no_archive(self):
        """
        GIVEN:
            - A file with some text contained in images and some in a text
              layer
            - OCR mode skip_noarchive
        WHEN:
            - The document is parsed
        THEN:
            - No archive file is created, as the original file contains text
            - The lower-cased text contains "page 4", "page 5" and "page 6",
              in order
        """
        source_pdf = self.SAMPLE_FILES / "multi-page-mixed.pdf"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertIsNone(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 4", "page 5", "page 6"])

    @override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
    def test_rotate(self):
        """
        GIVEN:
            - The rotated.pdf sample file
            - OCR_MODE overridden to skip and OCR_ROTATE_PAGES to True
        WHEN:
            - The document is parsed
        THEN:
            - The text contains the four expected sentences, in order
        """
        expected_in_order = [
            "This is the text that appears on the first page. It’s a lot of text.",
            "Even if the pages are rotated, OCRmyPDF still gets the job done.",
            "This is a really weird file with lots of nonsense text.",
            "If you read this, it’s your own fault. Also check your screen orientation.",
        ]
        source_pdf = self.SAMPLE_FILES / "rotated.pdf"

        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_pdf), "application/pdf")

        self.assertContainsStrings(doc_parser.get_text(), expected_in_order)

    def test_multi_page_tiff(self):
        """
        GIVEN:
            - A multi-page TIFF image
        WHEN:
            - The image is parsed
        THEN:
            - The archive path is an existing file
            - The text of all pages is extracted
        """
        source_image = self.SAMPLE_FILES / "multi-page-images.tiff"
        doc_parser = RasterisedDocumentParser(None)
        doc_parser.parse(str(source_image), "image/tiff")

        self.assertIsFile(doc_parser.archive_path)
        lowered_text = doc_parser.get_text().lower()
        self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    def test_multi_page_tiff_alpha(self):
        """
        GIVEN:
            - A multi-page TIFF image
            - The image includes an alpha channel
        WHEN:
            - A temporary copy of the image is parsed
        THEN:
            - The archive path is an existing file
            - The text of all pages is extracted
        """
        doc_parser = RasterisedDocumentParser(None)
        original_image = self.SAMPLE_FILES / "multi-page-images-alpha.tiff"

        with tempfile.NamedTemporaryFile() as working_copy:
            shutil.copy(original_image, working_copy.name)
            doc_parser.parse(working_copy.name, "image/tiff")

            self.assertIsFile(doc_parser.archive_path)
            lowered_text = doc_parser.get_text().lower()
            self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    def test_multi_page_tiff_alpha_srgb(self):
        """
        GIVEN:
            - A multi-page TIFF image
            - The image includes an alpha channel
            - The image is in the srgb colorspace
        WHEN:
            - A temporary copy of the image is parsed
        THEN:
            - The archive path is an existing file
            - The text of all pages is extracted
        """
        doc_parser = RasterisedDocumentParser(None)
        original_image = str(self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff")

        with tempfile.NamedTemporaryFile() as working_copy:
            shutil.copy(original_image, working_copy.name)
            doc_parser.parse(working_copy.name, "image/tiff")

            self.assertIsFile(doc_parser.archive_path)
            lowered_text = doc_parser.get_text().lower()
            self.assertContainsStrings(lowered_text, ["page 1", "page 2", "page 3"])

    def test_ocrmypdf_parameters(self):
        """
        GIVEN:
            - Explicit input, output and sidecar file arguments
            - Various overrides of OCR_CLEAN, OCR_DESKEW, OCR_MODE and
              OCR_MAX_IMAGE_PIXELS
        WHEN:
            - The OCRmyPDF parameters are constructed by a newly created parser
        THEN:
            - The file arguments appear under input_file, output_file and
              sidecar
            - The clean, clean_final, deskew and max_image_mpixels keys are
              present or absent as asserted for each override
        """

        def fresh_params():
            # New parser per call, so it is created while the surrounding
            # settings override is active
            return RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
                "",
                "",
                "",
                "",
            )

        explicit = RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
            input_file="input.pdf",
            output_file="output.pdf",
            sidecar_file="sidecar.txt",
            mime_type="application/pdf",
            safe_fallback=False,
        )
        for key, expected_value in (
            ("input_file", "input.pdf"),
            ("output_file", "output.pdf"),
            ("sidecar", "sidecar.txt"),
        ):
            self.assertEqual(explicit[key], expected_value)

        # --- OCR_CLEAN ---
        with override_settings(OCR_CLEAN="none"):
            params = fresh_params()
            self.assertNotIn("clean", params)
            self.assertNotIn("clean_final", params)

        with override_settings(OCR_CLEAN="clean"):
            params = fresh_params()
            self.assertTrue(params["clean"])
            self.assertNotIn("clean_final", params)

        with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
            params = fresh_params()
            self.assertTrue(params["clean_final"])
            self.assertNotIn("clean", params)

        with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
            params = fresh_params()
            self.assertTrue(params["clean"])
            self.assertNotIn("clean_final", params)

        # --- OCR_DESKEW ---
        with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
            params = fresh_params()
            self.assertTrue(params["deskew"])

        with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
            params = fresh_params()
            self.assertNotIn("deskew", params)

        with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
            params = fresh_params()
            self.assertNotIn("deskew", params)

        # --- OCR_MAX_IMAGE_PIXELS ---
        with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
            params = fresh_params()
            self.assertIn("max_image_mpixels", params)
            self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)

        with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
            params = fresh_params()
            self.assertNotIn("max_image_mpixels", params)

    def test_rtl_language_detection(self):
        """
        GIVEN:
            - A file with text in an RTL language
        WHEN:
            - The document is parsed
        THEN:
            - The text of the document is extracted
        """
        source_pdf = self.SAMPLE_FILES / "rtl-test.pdf"
        doc_parser = RasterisedDocumentParser(None)

        doc_parser.parse(str(source_pdf), "application/pdf")

        # The expected text was copied from the PDF as-is
        parsed_text = doc_parser.get_text()
        self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parsed_text)

    @mock.patch("ocrmypdf.ocr")
    def test_gs_rendering_error(self, m):
        """
        GIVEN:
            - ocrmypdf.ocr patched to raise SubprocessOutputError
        WHEN:
            - The simple-digital.pdf sample file is parsed
        THEN:
            - ParseError is raised
        """
        m.side_effect = SubprocessOutputError("Ghostscript PDF/A rendering failed")
        doc_parser = RasterisedDocumentParser(None)
        source_pdf = str(self.SAMPLE_FILES / "simple-digital.pdf")

        with self.assertRaises(ParseError):
            doc_parser.parse(source_pdf, "application/pdf")


class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """
    Parses one sample image per format (BMP, JPEG, HEIC, GIF, TIFF, WebP)
    and checks that an archive file is written and the expected text is
    present in the parser output.
    """

    SAMPLE_FILES = Path(__file__).parent / "samples"

    def _parse_sample(self, filename, mime_type):
        """
        Parse the named file from SAMPLE_FILES with a fresh parser, assert
        that the archive file exists, and return the extracted text
        lower-cased.
        """
        parser = RasterisedDocumentParser(None)
        parser.parse(str(self.SAMPLE_FILES / filename), mime_type)
        self.assertIsFile(parser.archive_path)
        return parser.get_text().lower()

    def test_bmp(self):
        text = self._parse_sample("simple.bmp", "image/bmp")
        self.assertIn("this is a test document", text)

    def test_jpg(self):
        text = self._parse_sample("simple.jpg", "image/jpeg")
        self.assertIn("this is a test document", text)

    def test_heic(self):
        text = self._parse_sample("simple.heic", "image/heic")
        self.assertIn("pizza", text)

    @override_settings(OCR_IMAGE_DPI=200)
    def test_gif(self):
        text = self._parse_sample("simple.gif", "image/gif")
        self.assertIn("this is a test document", text)

    def test_tiff(self):
        text = self._parse_sample("simple.tif", "image/tiff")
        self.assertIn("this is a test document", text)

    @override_settings(OCR_IMAGE_DPI=72)
    def test_webp(self):
        text = self._parse_sample("document.webp", "image/webp")
        # Older tesseract releases consistently drop the space in "a webp",
        # while tesseract 5.3.0 appears to keep it, so the pattern accepts
        # both spellings.
        self.assertRegex(
            text,
            r"this is a ?webp document, created 11/14/2022.",
        )
-												Changes the simple-alpha parsing test to use a tempdir so the original isn't modified in Git

											
										
										
											2022-06-13 17:14:38 -07:00
+								import shutil
 								import tempfile
-												testing the tesseract parser

											
										
										
											2020-11-19 20:28:41 +01:00
+								import uuid
-												Adds better handling for files with invalid utf8 content

											
										
										
											2023-05-12 14:21:32 -07:00
+								from pathlib import Path
-												testing the tesseract parser

											
										
										
											2020-11-19 20:28:41 +01:00
+								from unittest import mock
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								from django.test import TestCase
-												Fixes ruff not running isort against the codebase

											
										
										
											2023-04-20 08:10:17 -07:00
+								from django.test import override_settings
-												Fix: ghostscript rendering error doesnt trigger frontend failure message (#4092)

* Raise ParseError from gs rendering error

* catch all parser errors as generic exception

* Differentiate generic vs parse errors during consumption
											
										
										
											2023-08-31 19:49:00 -07:00
+								from ocrmypdf import SubprocessOutputError
-												Fixes ruff not running isort against the codebase

											
										
										
											2023-04-20 08:10:17 -07:00
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								from documents.parsers import ParseError
 								from documents.parsers import run_convert
-												test case fixes.

											
										
										
											2020-11-27 14:06:37 +01:00
+								from documents.tests.utils import DirectoriesMixin
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								from documents.tests.utils import FileSystemAssertsMixin
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								from paperless_tesseract.parsers import RasterisedDocumentParser
-												Fixes ruff not running isort against the codebase

											
										
										
											2023-04-20 08:10:17 -07:00
+								from paperless_tesseract.parsers import post_process_text
-												testing the tesseract parser

											
										
										
											2020-11-19 20:28:41 +01:00
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-												Adds better handling for files with invalid utf8 content

											
										
										
											2023-05-12 14:21:32 -07:00
+								    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
-												Adds a test to cover this edge case

											
										
										
											2022-11-21 14:56:14 -08:00
-												more tests!

											
										
										
											2020-11-29 19:58:48 +01:00
+								    def assertContainsStrings(self, content, strings):
 								        # Asserts that all strings appear in content, in the given order.
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
+								        indices = []
 								        for s in strings:
 								            if s in content:
 								                indices.append(content.index(s))
 								            else:
 								                self.fail(f"'{s}' is not in '{content}'")
-												more tests!

											
										
										
											2020-11-29 19:58:48 +01:00
+								        self.assertListEqual(indices, sorted(indices))
-												also apply \0 removal to sidecar contents

											
										
										
											2021-03-22 23:08:34 +01:00
+								    def test_post_process_text(self):
-												Adds a test to cover this edge case

											
										
										
											2022-11-21 14:56:14 -08:00
+								        text_cases = [
 								            ("simple     string", "simple string"),
 								            ("simple    newline\n   testing string", "simple newline\ntesting string"),
 								            (
 								                "utf-8   строка с пробелами в конце  ",
 								                "utf-8 строка с пробелами в конце",
 								            ),
 								        ]
 								        for source, result in text_cases:
-												also apply \0 removal to sidecar contents

											
										
										
											2021-03-22 23:08:34 +01:00
+								            actual_result = post_process_text(source)
-												test case fixes.

											
										
										
											2020-11-27 14:06:37 +01:00
+								            self.assertEqual(
 								                result,
 								                actual_result,
-												Chore(deps-dev): Bump the development group with 3 updates (#6079)

* Chore(deps-dev): Bump the development group with 3 updates

Bumps the development group with 3 updates: [ruff](https://github.com/astral-sh/ruff), [pytest](https://github.com/pytest-dev/pytest) and [mkdocs-material](https://github.com/squidfunk/mkdocs-material).


Updates `ruff` from 0.3.0 to 0.3.2
- [Release notes](https://github.com/astral-sh/ruff/releases)
- [Changelog](https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md)
- [Commits](https://github.com/astral-sh/ruff/compare/v0.3.0...v0.3.2)

Updates `pytest` from 8.0.2 to 8.1.1
- [Release notes](https://github.com/pytest-dev/pytest/releases)
- [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pytest-dev/pytest/compare/8.0.2...8.1.1)

Updates `mkdocs-material` from 9.5.12 to 9.5.13
- [Release notes](https://github.com/squidfunk/mkdocs-material/releases)
- [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG)
- [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.5.12...9.5.13)

---
updated-dependencies:
- dependency-name: ruff
  dependency-type: direct:development
  update-type: version-update:semver-patch
  dependency-group: development
- dependency-name: pytest
  dependency-type: direct:development
  update-type: version-update:semver-minor
  dependency-group: development
- dependency-name: mkdocs-material
  dependency-type: direct:development
  update-type: version-update:semver-patch
  dependency-group: development
...

Signed-off-by: dependabot[bot] <support@github.com>

* Updates pre-commit hook versions and runs it against all files

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Trenton H <797416+stumpylog@users.noreply.github.com>
											
										
										
											2024-03-12 07:56:01 -07:00
+								                f"strip_exceess_whitespace({source}) != '{result}', but '{actual_result}'",
-												test case fixes.

											
										
										
											2020-11-27 14:06:37 +01:00
+								            )
-												testing the tesseract parser

											
										
										
											2020-11-19 20:28:41 +01:00
 								    def test_get_text_from_pdf(self):
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
+								        parser = RasterisedDocumentParser(uuid.uuid4())
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        text = parser.extract_text(
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            None,
-												Adds better handling for files with invalid utf8 content

											
										
										
											2023-05-12 14:21:32 -07:00
+								            self.SAMPLE_FILES / "simple-digital.pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												testing the tesseract parser

											
										
										
											2020-11-19 20:28:41 +01:00
-												more tests!

											
										
										
											2020-11-29 19:58:48 +01:00
+								        self.assertContainsStrings(text.strip(), ["This is a test document."])
-												testing the tesseract parser

											
										
										
											2020-11-19 20:28:41 +01:00
-												Feature: page count (#7750)

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2024-09-25 17:22:12 +02:00
+								    def test_get_page_count(self):
 								        """
 								        GIVEN:
 								            - PDF file with a single page
 								            - PDF file with multiple pages
 								        WHEN:
 								            - The number of pages is requested
 								        THEN:
 								            - The method returns 1 as the expected number of pages
 								            - The method returns the correct number of pages (6)
 								        """
 								        parser = RasterisedDocumentParser(uuid.uuid4())
 								        page_count = parser.get_page_count(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "simple-digital.pdf"),
-												Feature: page count (#7750)

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2024-09-25 17:22:12 +02:00
+								            "application/pdf",
 								        )
 								        self.assertEqual(page_count, 1)
 								        page_count = parser.get_page_count(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
-												Feature: page count (#7750)

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2024-09-25 17:22:12 +02:00
+								            "application/pdf",
 								        )
 								        self.assertEqual(page_count, 6)
-												Fix: handle page count exception for pw-protected files (#8240)


											
										
										
											2024-11-10 03:33:47 -08:00
+								    def test_get_page_count_password_protected(self):
 								        """
 								        GIVEN:
 								            - Password protected PDF file
 								        WHEN:
 								            - The number of pages is requested
 								        THEN:
 								            - The method returns None
 								        """
 								        parser = RasterisedDocumentParser(uuid.uuid4())
 								        with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
 								            page_count = parser.get_page_count(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								                str(self.SAMPLE_FILES / "password-protected.pdf"),
-												Fix: handle page count exception for pw-protected files (#8240)


											
										
										
											2024-11-10 03:33:47 -08:00
+								                "application/pdf",
 								            )
 								            self.assertEqual(page_count, None)
 								            self.assertIn("Unable to determine PDF page count", cm.output[0])
-												testing the tesseract parser

											
										
										
											2020-11-19 20:28:41 +01:00
+								    def test_thumbnail(self):
-												fixed the test cases

											
										
										
											2020-11-25 19:51:09 +01:00
+								        parser = RasterisedDocumentParser(uuid.uuid4())
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        thumb = parser.get_thumbnail(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "simple-digital.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(thumb)
-												testing the tesseract parser

											
										
										
											2020-11-19 20:28:41 +01:00
-												fix bugs and test cases

											
										
										
											2021-01-02 15:37:27 +01:00
+								    @mock.patch("documents.parsers.run_convert")
-												testing the tesseract parser

											
										
										
											2020-11-19 20:28:41 +01:00
+								    def test_thumbnail_fallback(self, m):
 								        def call_convert(input_file, output_file, **kwargs):
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            if ".pdf" in str(input_file):
-												testing the tesseract parser

											
										
										
											2020-11-19 20:28:41 +01:00
+								                raise ParseError("Does not compute.")
 								            else:
 								                run_convert(input_file=input_file, output_file=output_file, **kwargs)
 								        m.side_effect = call_convert
-												fixed the test cases

											
										
										
											2020-11-25 19:51:09 +01:00
+								        parser = RasterisedDocumentParser(uuid.uuid4())
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        thumb = parser.get_thumbnail(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "simple-digital.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(thumb)
-												fixes #631

											
										
										
											2021-03-14 14:42:48 +01:00
 								    def test_thumbnail_encrypted(self):
 								        parser = RasterisedDocumentParser(uuid.uuid4())
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        thumb = parser.get_thumbnail(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "encrypted.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(thumb)
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    def test_get_dpi(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple-no-dpi.png"))
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
+								        self.assertEqual(dpi, None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple.png"))
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
+								        self.assertEqual(dpi, 72)
 								    def test_simple_digital(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "simple-digital.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												more tests!

											
										
										
											2020-11-29 19:58:48 +01:00
+								        self.assertContainsStrings(parser.get_text(), ["This is a test document."])
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    def test_with_form(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "with-form.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
 								            parser.get_text(),
 								            ["Please enter your name in here:", "This is a PDF document with a form."],
 								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    @override_settings(OCR_MODE="redo")
 								    def test_with_form_error(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "with-form.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								        self.assertIsNone(parser.archive_path)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
 								            parser.get_text(),
 								            ["Please enter your name in here:", "This is a PDF document with a form."],
 								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												fixes #631

											
										
										
											2021-03-14 14:42:48 +01:00
+								    @override_settings(OCR_MODE="skip")
 								    def test_signed(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        parser.parse(str(self.SAMPLE_FILES / "signed.pdf"), "application/pdf")
-												fixes #631

											
										
										
											2021-03-14 14:42:48 +01:00
 								        self.assertIsNone(parser.archive_path)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
 								            parser.get_text(),
 								            [
 								                "This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable",
 								                "automated testing of signed/encrypted PDFs",
 								            ],
 								        )
-												fixes #631

											
										
										
											2021-03-14 14:42:48 +01:00
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
+								    @override_settings(OCR_MODE="skip")
 								    def test_encrypted(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "encrypted.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
 								        self.assertIsNone(parser.archive_path)
-												fixes #631

											
										
										
											2021-03-14 14:42:48 +01:00
+								        self.assertEqual(parser.get_text(), "")
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
+								    @override_settings(OCR_MODE="redo")
 								    def test_with_form_error_notext(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "with-form.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
 								            parser.get_text(),
 								            ["Please enter your name in here:", "This is a PDF document with a form."],
 								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    @override_settings(OCR_MODE="force")
 								    def test_with_form_force(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "with-form.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
 								            parser.get_text(),
 								            ["Please enter your name in here:", "This is a PDF document with a form."],
 								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    def test_image_simple(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        parser.parse(str(self.SAMPLE_FILES / "simple.png"), "image/png")
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												more tests!

											
										
										
											2020-11-29 19:58:48 +01:00
+								        self.assertContainsStrings(parser.get_text(), ["This is a test document."])
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Modify test for PNG image with alpha

											
										
										
											2022-02-21 22:38:25 +01:00
+								    def test_image_simple_alpha(self):
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
+								        parser = RasterisedDocumentParser(None)
-												Changes the simple-alpha parsing test to use a tempdir so the original isn't modified in Git

											
										
										
											2022-06-13 17:14:38 -07:00
+								        with tempfile.TemporaryDirectory() as tempdir:
 								            # Copy sample file to temp directory, as the parsing changes the file
 								            # and this makes it modified to Git
-												Chore: switch from os.path to pathlib.Path (#9933)

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2025-06-18 19:16:59 +02:00
+								            sample_file = self.SAMPLE_FILES / "simple-alpha.png"
 								            dest_file = Path(tempdir) / "simple-alpha.png"
-												Changes the simple-alpha parsing test to use a tempdir so the original isn't modified in Git

											
										
										
											2022-06-13 17:14:38 -07:00
+								            shutil.copy(sample_file, dest_file)
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            parser.parse(str(dest_file), "image/png")
-												Modify test for PNG image with alpha

											
										
										
											2022-02-21 22:38:25 +01:00
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								            self.assertIsFile(parser.archive_path)
-												Changes the simple-alpha parsing test to use a tempdir so the original isn't modified in Git

											
										
										
											2022-06-13 17:14:38 -07:00
 								            self.assertContainsStrings(parser.get_text(), ["This is a test document."])
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												local import of ocrmypdf so that the webserver does not load that

											
										
										
											2021-02-15 12:18:10 +01:00
+								    def test_image_calc_a4_dpi(self):
-												fixes bauerj/paperless_app#23 and most of all other scanner apps out there.

											
										
										
											2020-12-12 18:25:15 +01:00
+								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        dpi = parser.calculate_a4_dpi(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "simple-no-dpi.png"),
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												fixes bauerj/paperless_app#23 and most of all other scanner apps out there.

											
										
										
											2020-12-12 18:25:15 +01:00
-												local import of ocrmypdf so that the webserver does not load that

											
										
										
											2021-02-15 12:18:10 +01:00
+								        self.assertEqual(dpi, 62)
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												fixes bauerj/paperless_app#23 and most of all other scanner apps out there.

											
										
										
											2020-12-12 18:25:15 +01:00
+								    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
 								    def test_image_dpi_fail(self, m):
 								        m.return_value = None
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
+								        parser = RasterisedDocumentParser(None)
 								        def f():
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								            parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								                str(self.SAMPLE_FILES / "simple-no-dpi.png"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								                "image/png",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								            )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								        self.assertRaises(ParseError, f)
-												Feature: Allow user to control PIL image pixel limit (#5997)


											
										
										
											2024-03-04 16:19:56 -08:00
+								    @override_settings(OCR_IMAGE_DPI=72, MAX_IMAGE_PIXELS=0)
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
+								    def test_image_no_dpi_default(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        parser.parse(str(self.SAMPLE_FILES / "simple-no-dpi.png"), "image/png")
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            parser.get_text().lower(),
 								            ["this is a test document."],
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    def test_multi_page(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    @override_settings(OCR_PAGES=2, OCR_MODE="skip")
 								    def test_multi_page_pages_skip(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
 								    def test_multi_page_pages_redo(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    @override_settings(OCR_PAGES=2, OCR_MODE="force")
 								    def test_multi_page_pages_force(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
-												Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

											
										
										
											2023-02-23 22:42:57 -05:00
+								    @override_settings(OCR_MODE="skip")
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
+								    def test_multi_page_analog_pages_skip(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
 								    def test_multi_page_analog_pages_redo(self):
-												Reverts the change around skip_noarchive to align with how it is documented to work

											
										
										
											2022-10-19 11:37:47 -07:00
+								        """
 								        GIVEN:
 								            - File with text contained in images but no text layer
 								            - OCR of only pages 1 and 2 requested
 								            - OCR mode set to redo
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text of page 1 and 2 extracted
 								            - An archive file is created
 								        """
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
+								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												more tests!

											
										
										
											2020-11-29 19:58:48 +01:00
+								        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
-												Adding more test coverage, in particular around Tika and its parser

											
										
										
											2023-02-02 12:46:49 -08:00
+								        self.assertNotIn("page 3", parser.get_text().lower())
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
 								    @override_settings(OCR_PAGES=1, OCR_MODE="force")
 								    def test_multi_page_analog_pages_force(self):
-												Reverts the change around skip_noarchive to align with how it is documented to work

											
										
										
											2022-10-19 11:37:47 -07:00
+								        """
 								        GIVEN:
 								            - File with text contained in images but no text layer
 								            - OCR of only page 1 requested
 								            - OCR mode set to force
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Only text of page 1 is extracted
 								            - An archive file is created
 								        """
-												more tests of the new parser

											
										
										
											2020-11-26 00:08:23 +01:00
+								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												more tests!

											
										
										
											2020-11-29 19:58:48 +01:00
+								        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
-												Adding more test coverage, in particular around Tika and its parser

											
										
										
											2023-02-02 12:46:49 -08:00
+								        self.assertNotIn("page 2", parser.get_text().lower())
 								        self.assertNotIn("page 3", parser.get_text().lower())
-												testing the new noarchive option.

											
										
										
											2020-12-01 14:30:13 +01:00
 								    @override_settings(OCR_MODE="skip_noarchive")
 								    def test_skip_noarchive_withtext(self):
-												Fixes the creation of an archive file, even if noarchive was specified

											
										
										
											2022-08-20 13:47:56 -07:00
+								        """
 								        GIVEN:
 								            - File with existing text layer
 								            - OCR mode set to skip_noarchive
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from images is extracted
 								            - No archive file is created
 								        """
-												testing the new noarchive option.

											
										
										
											2020-12-01 14:30:13 +01:00
+								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												testing the new noarchive option.

											
										
										
											2020-12-01 14:30:13 +01:00
+								        self.assertIsNone(parser.archive_path)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												testing the new noarchive option.

											
										
										
											2020-12-01 14:30:13 +01:00
 								    @override_settings(OCR_MODE="skip_noarchive")
 								    def test_skip_noarchive_notext(self):
-												Fixes the creation of an archive file, even if noarchive was specified

											
										
										
											2022-08-20 13:47:56 -07:00
+								        """
 								        GIVEN:
 								            - File with text contained in images but no text layer
 								            - OCR mode set to skip_noarchive
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from images is extracted
-												Reverts the change around skip_noarchive to align with how it is documented to work

											
										
										
											2022-10-19 11:37:47 -07:00
+								            - An archive file is created with the OCRd text
-												Fixes the creation of an archive file, even if noarchive was specified

											
										
										
											2022-08-20 13:47:56 -07:00
+								        """
-												testing the new noarchive option.

											
										
										
											2020-12-01 14:30:13 +01:00
+								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Fixes the creation of an archive file, even if noarchive was specified

											
										
										
											2022-08-20 13:47:56 -07:00
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												a couple fixes and more supported image files

											
										
										
											2020-12-02 17:39:49 +01:00
-												Reverts the change around skip_noarchive to align with how it is documented to work

											
										
										
											2022-10-19 11:37:47 -07:00
+								        self.assertIsNotNone(parser.archive_path)
-												Fixes the creation of an archive file, even if noarchive was specified

											
										
										
											2022-08-20 13:47:56 -07:00
-												Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

											
										
										
											2023-02-23 22:42:57 -05:00
+								    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
 								    def test_skip_archive_never_withtext(self):
 								        """
 								        GIVEN:
 								            - File with existing text layer
 								            - OCR_SKIP_ARCHIVE_FILE set to never
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from text layer is extracted
 								            - Archive file is created
 								        """
 								        parser = RasterisedDocumentParser(None)
 								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-												Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

											
										
										
											2023-02-23 22:42:57 -05:00
+								            "application/pdf",
 								        )
 								        self.assertIsNotNone(parser.archive_path)
 								        self.assertContainsStrings(
 								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
 								        )
 								    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
 								    def test_skip_archive_never_withimages(self):
 								        """
 								        GIVEN:
 								            - File with text contained in images but no text layer
 								            - OCR_SKIP_ARCHIVE_FILE set to never
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from images is extracted
 								            - Archive file is created
 								        """
 								        parser = RasterisedDocumentParser(None)
 								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-												Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

											
										
										
											2023-02-23 22:42:57 -05:00
+								            "application/pdf",
 								        )
 								        self.assertIsNotNone(parser.archive_path)
 								        self.assertContainsStrings(
 								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
 								        )
 								    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
 								    def test_skip_archive_withtext_withtext(self):
 								        """
 								        GIVEN:
 								            - File with existing text layer
 								            - OCR_SKIP_ARCHIVE_FILE set to with_text
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from text layer is extracted
 								            - No archive file is created
 								        """
 								        parser = RasterisedDocumentParser(None)
 								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-												Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

											
										
										
											2023-02-23 22:42:57 -05:00
+								            "application/pdf",
 								        )
 								        self.assertIsNone(parser.archive_path)
 								        self.assertContainsStrings(
 								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
 								        )
 								    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
 								    def test_skip_archive_withtext_withimages(self):
 								        """
 								        GIVEN:
 								            - File with text contained in images but no text layer
 								            - OCR_SKIP_ARCHIVE_FILE set to with_text
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from images is extracted
 								            - Archive file is created
 								        """
 								        parser = RasterisedDocumentParser(None)
 								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-												Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

											
										
										
											2023-02-23 22:42:57 -05:00
+								            "application/pdf",
 								        )
 								        self.assertIsNotNone(parser.archive_path)
 								        self.assertContainsStrings(
 								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
 								        )
 								    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
 								    def test_skip_archive_always_withtext(self):
 								        """
 								        GIVEN:
 								            - File with existing text layer
 								            - OCR_SKIP_ARCHIVE_FILE set to always
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from text layer is extracted
 								            - No archive file is created
 								        """
 								        parser = RasterisedDocumentParser(None)
 								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
-												Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

											
										
										
											2023-02-23 22:42:57 -05:00
+								            "application/pdf",
 								        )
 								        self.assertIsNone(parser.archive_path)
 								        self.assertContainsStrings(
 								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
 								        )
 								    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
 								    def test_skip_archive_always_withimages(self):
 								        """
 								        GIVEN:
 								            - File with text contained in images but no text layer
 								            - OCR_SKIP_ARCHIVE_FILE set to always
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from images is extracted
 								            - No archive file is created
 								        """
 								        parser = RasterisedDocumentParser(None)
 								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-images.pdf"),
-												Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

											
										
										
											2023-02-23 22:42:57 -05:00
+								            "application/pdf",
 								        )
 								        self.assertIsNone(parser.archive_path)
 								        self.assertContainsStrings(
 								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
 								        )
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
+								    @override_settings(OCR_MODE="skip")
 								    def test_multi_page_mixed(self):
-												Reverts the change around skip_noarchive to align with how it is documented to work

											
										
										
											2022-10-19 11:37:47 -07:00
+								        """
 								        GIVEN:
 								            - File with some text contained in images and some in text layer
 								            - OCR mode set to skip
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from images is extracted
 								            - An archive file is created with the OCRd text and the original text
 								        """
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
+								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												Reverts the change around skip_noarchive to align with how it is documented to work

											
										
										
											2022-10-19 11:37:47 -07:00
+								        self.assertIsNotNone(parser.archive_path)
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
 								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
 								        )
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
-												Chore: switch from os.path to pathlib.Path (#9933)

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2025-06-18 19:16:59 +02:00
+								        with (parser.tempdir / "sidecar.txt").open() as f:
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
+								            sidecar = f.read()
-												update dependencies

											
										
										
											2021-02-28 13:01:26 +01:00
+								        self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
-												Adds a test to cover this edge case

											
										
										
											2022-11-21 14:56:14 -08:00
+								    @override_settings(OCR_MODE="redo")
 								    def test_single_page_mixed(self):
 								        """
 								        GIVEN:
 								            - File with some text contained in images and some in text layer
 								            - Text and images are mixed on the same page
 								            - OCR mode set to redo
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from images is extracted
 								            - Full content of the file is parsed (not just the image text)
 								            - An archive file is created with the OCRd text and the original text
 								        """
 								        parser = RasterisedDocumentParser(None)
 								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "single-page-mixed.pdf"),
-												Adds a test to cover this edge case

											
										
										
											2022-11-21 14:56:14 -08:00
+								            "application/pdf",
 								        )
 								        self.assertIsNotNone(parser.archive_path)
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Adds a test to cover this edge case

											
										
										
											2022-11-21 14:56:14 -08:00
+								        self.assertContainsStrings(
 								            parser.get_text().lower(),
 								            [
 								                "this is some normal text, present on page 1 of the document.",
 								                "this is some text, but in an image, also on page 1.",
 								                "this is further text on page 1.",
 								            ],
 								        )
-												Chore: switch from os.path to pathlib.Path (#9933)

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2025-06-18 19:16:59 +02:00
+								        with (parser.tempdir / "sidecar.txt").open() as f:
-												Adds a test to cover this edge case

											
										
										
											2022-11-21 14:56:14 -08:00
+								            sidecar = f.read().lower()
 								        self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
 								        self.assertNotIn(
 								            "this is some normal text, present on page 1 of the document.",
 								            sidecar,
 								        )
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
+								    @override_settings(OCR_MODE="skip_noarchive")
 								    def test_multi_page_mixed_no_archive(self):
-												Fixes the creation of an archive file, even if noarchive was specified

											
										
										
											2022-08-20 13:47:56 -07:00
+								        """
 								        GIVEN:
 								            - File with some text contained in images and some in text layer
 								            - OCR mode set to skip_noarchive
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from images is extracted
-												Reverts the change around skip_noarchive to align with how it is documented to work

											
										
										
											2022-10-19 11:37:47 -07:00
+								            - No archive file is created as original file contains text
-												Fixes the creation of an archive file, even if noarchive was specified

											
										
										
											2022-08-20 13:47:56 -07:00
+								        """
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
+								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            "application/pdf",
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
+								        self.assertIsNone(parser.archive_path)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
-												Runs the pre-commit hooks over all the Python files

											
										
										
											2022-03-11 10:55:51 -08:00
+								            parser.get_text().lower(),
 								            ["page 4", "page 5", "page 6"],
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        )
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
 								    @override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
 								    def test_rotate(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        parser.parse(str(self.SAMPLE_FILES / "rotated.pdf"), "application/pdf")
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertContainsStrings(
 								            parser.get_text(),
 								            [
 								                "This is the text that appears on the first page. It’s a lot of text.",
 								                "Even if the pages are rotated, OCRmyPDF still gets the job done.",
 								                "This is a really weird file with lots of nonsense text.",
 								                "If you read this, it’s your own fault. Also check your screen orientation.",
 								            ],
 								        )
-												tests

											
										
										
											2021-02-21 00:18:34 +01:00
-												Adds testing coverage of multipage TIFF with alpha, without and with alpha/sRGB

											
										
										
											2023-01-03 08:21:23 -08:00
+								    def test_multi_page_tiff(self):
 								        """
 								        GIVEN:
 								            - Multi-page TIFF image
 								        WHEN:
 								            - Image is parsed
 								        THEN:
 								            - Text from all pages extracted
 								        """
 								        parser = RasterisedDocumentParser(None)
 								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "multi-page-images.tiff"),
-												Adds testing coverage of multipage TIFF with alpha, without and with alpha/sRGB

											
										
										
											2023-01-03 08:21:23 -08:00
+								            "image/tiff",
 								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Adds testing coverage of multipage TIFF with alpha, without and with alpha/sRGB

											
										
										
											2023-01-03 08:21:23 -08:00
+								        self.assertContainsStrings(
 								            parser.get_text().lower(),
 								            ["page 1", "page 2", "page 3"],
 								        )
 								    def test_multi_page_tiff_alpha(self):
 								        """
 								        GIVEN:
 								            - Multi-page TIFF image
 								            - Image include an alpha channel
 								        WHEN:
 								            - Image is parsed
 								        THEN:
 								            - Text from all pages extracted
 								        """
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#9933)

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2025-06-18 19:16:59 +02:00
+								        sample_file = self.SAMPLE_FILES / "multi-page-images-alpha.tiff"
-												Fixes some sample test files showing as modified after running tests

											
										
										
											2023-01-04 09:01:23 -08:00
+								        with tempfile.NamedTemporaryFile() as tmp_file:
 								            shutil.copy(sample_file, tmp_file.name)
 								            parser.parse(
 								                tmp_file.name,
 								                "image/tiff",
 								            )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								            self.assertIsFile(parser.archive_path)
-												Fixes some sample test files showing as modified after running tests

											
										
										
											2023-01-04 09:01:23 -08:00
+								            self.assertContainsStrings(
 								                parser.get_text().lower(),
 								                ["page 1", "page 2", "page 3"],
 								            )
-												Adds testing coverage of multipage TIFF with alpha, without and with alpha/sRGB

											
										
										
											2023-01-03 08:21:23 -08:00
 								    def test_multi_page_tiff_alpha_srgb(self):
 								        """
 								        GIVEN:
 								            - Multi-page TIFF image
 								            - Image include an alpha channel
 								            - Image is srgb colorspace
 								        WHEN:
 								            - Image is parsed
 								        THEN:
 								            - Text from all pages extracted
 								        """
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        sample_file = str(
 								            self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff",
 								        )
-												Fixes some sample test files showing as modified after running tests

											
										
										
											2023-01-04 09:01:23 -08:00
+								        with tempfile.NamedTemporaryFile() as tmp_file:
 								            shutil.copy(sample_file, tmp_file.name)
 								            parser.parse(
 								                tmp_file.name,
 								                "image/tiff",
 								            )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								            self.assertIsFile(parser.archive_path)
-												Fixes some sample test files showing as modified after running tests

											
										
										
											2023-01-04 09:01:23 -08:00
+								            self.assertContainsStrings(
 								                parser.get_text().lower(),
 								                ["page 1", "page 2", "page 3"],
 								            )
-												Adds testing coverage of multipage TIFF with alpha, without and with alpha/sRGB

											
										
										
											2023-01-03 08:21:23 -08:00
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								    def test_ocrmypdf_parameters(self):
 								        parser = RasterisedDocumentParser(None)
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        params = parser.construct_ocrmypdf_parameters(
 								            input_file="input.pdf",
 								            output_file="output.pdf",
 								            sidecar_file="sidecar.txt",
 								            mime_type="application/pdf",
 								            safe_fallback=False,
 								        )
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								        self.assertEqual(params["input_file"], "input.pdf")
 								        self.assertEqual(params["output_file"], "output.pdf")
 								        self.assertEqual(params["sidecar"], "sidecar.txt")
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
 								        with override_settings(OCR_CLEAN="none"):
-												Feature: Allow setting backend configuration settings via the UI (#5126)

* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrellla config view

* Revert "Umbrellla config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2023-12-29 15:42:56 -08:00
+								            parser = RasterisedDocumentParser(None)
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								            params = parser.construct_ocrmypdf_parameters("", "", "", "")
 								            self.assertNotIn("clean", params)
 								            self.assertNotIn("clean_final", params)
 								        with override_settings(OCR_CLEAN="clean"):
-												Feature: Allow setting backend configuration settings via the UI (#5126)

* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrellla config view

* Revert "Umbrellla config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2023-12-29 15:42:56 -08:00
+								            parser = RasterisedDocumentParser(None)
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								            self.assertTrue(params["clean"])
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								            self.assertNotIn("clean_final", params)
 								        with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
-												Feature: Allow setting backend configuration settings via the UI (#5126)

* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrellla config view

* Revert "Umbrellla config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2023-12-29 15:42:56 -08:00
+								            parser = RasterisedDocumentParser(None)
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								            self.assertTrue(params["clean_final"])
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								            self.assertNotIn("clean", params)
 								        with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
-												Feature: Allow setting backend configuration settings via the UI (#5126)

* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrellla config view

* Revert "Umbrellla config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2023-12-29 15:42:56 -08:00
+								            parser = RasterisedDocumentParser(None)
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								            self.assertTrue(params["clean"])
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								            self.assertNotIn("clean_final", params)
 								        with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
-												Feature: Allow setting backend configuration settings via the UI (#5126)

* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrellla config view

* Revert "Umbrellla config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2023-12-29 15:42:56 -08:00
+								            parser = RasterisedDocumentParser(None)
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								            self.assertTrue(params["deskew"])
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
 								        with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
-												Feature: Allow setting backend configuration settings via the UI (#5126)

* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrellla config view

* Revert "Umbrellla config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2023-12-29 15:42:56 -08:00
+								            parser = RasterisedDocumentParser(None)
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								            self.assertNotIn("deskew", params)
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
 								        with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
-												Feature: Allow setting backend configuration settings via the UI (#5126)

* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrellla config view

* Revert "Umbrellla config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2023-12-29 15:42:56 -08:00
+								            parser = RasterisedDocumentParser(None)
-												tests

											
										
										
											2021-02-22 00:17:16 +01:00
+								            params = parser.construct_ocrmypdf_parameters("", "", "", "")
-												Format Python code with black

											
										
										
											2022-02-27 15:26:41 +01:00
+								            self.assertNotIn("deskew", params)
-												Adding more test coverage, in particular around Tika and its parser

											
										
										
											2023-02-02 12:46:49 -08:00
+								        with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
-												Feature: Allow setting backend configuration settings via the UI (#5126)

* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrellla config view

* Revert "Umbrellla config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2023-12-29 15:42:56 -08:00
+								            parser = RasterisedDocumentParser(None)
-												Adding more test coverage, in particular around Tika and its parser

											
										
										
											2023-02-02 12:46:49 -08:00
+								            params = parser.construct_ocrmypdf_parameters("", "", "", "")
 								            self.assertIn("max_image_mpixels", params)
 								            self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
 								        with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
-												Feature: Allow setting backend configuration settings via the UI (#5126)

* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrellla config view

* Revert "Umbrellla config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2023-12-29 15:42:56 -08:00
+								            parser = RasterisedDocumentParser(None)
-												Adding more test coverage, in particular around Tika and its parser

											
										
										
											2023-02-02 12:46:49 -08:00
+								            params = parser.construct_ocrmypdf_parameters("", "", "", "")
 								            self.assertNotIn("max_image_mpixels", params)
-												In the case of an RTL language being extracted via pdfminer.six, fall back to forced OCR, which handles RTL text better

											
										
										
											2022-11-29 13:19:16 -08:00
+								    def test_rtl_language_detection(self):
 								        """
 								        GIVEN:
 								            - File with text in an RTL language
 								        WHEN:
 								            - Document is parsed
 								        THEN:
 								            - Text from the document is extracted
 								        """
 								        parser = RasterisedDocumentParser(None)
-												Try a new way of extracting text from a given PDF file

											
										
										
											2023-01-01 15:57:22 -08:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "rtl-test.pdf"),
-												Try a new way of extracting text from a given PDF file

											
										
										
											2023-01-01 15:57:22 -08:00
+								            "application/pdf",
 								        )
-												In the case of an RTL language being extracted via pdfminer.six, fall back to forced OCR, which handles RTL text better

											
										
										
											2022-11-29 13:19:16 -08:00
-												Try a new way of extracting text from a given PDF file

											
										
										
											2023-01-01 15:57:22 -08:00
+								        # Copied from the PDF to here.  Don't even look at it
 								        self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
-												In the case of an RTL language being extracted via pdfminer.six, fall back to forced OCR, which handles RTL text better

											
										
										
											2022-11-29 13:19:16 -08:00
-												Fix: ghostscript rendering error doesnt trigger frontend failure message (#4092)

* Raise ParseError from gs rendering error

* catch all parser errors as generic exception

* Differentiate generic vs parse errors during consumption
											
										
										
											2023-08-31 19:49:00 -07:00
+								    @mock.patch("ocrmypdf.ocr")
 								    def test_gs_rendering_error(self, m):
 								        m.side_effect = SubprocessOutputError("Ghostscript PDF/A rendering failed")
 								        parser = RasterisedDocumentParser(None)
 								        self.assertRaises(
 								            ParseError,
 								            parser.parse,
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "simple-digital.pdf"),
-												Fix: ghostscript rendering error doesnt trigger frontend failure message (#4092)

* Raise ParseError from gs rendering error

* catch all parser errors as generic exception

* Differentiate generic vs parse errors during consumption
											
										
										
											2023-08-31 19:49:00 -07:00
+								            "application/pdf",
 								        )
-												a couple fixes and more supported image files

											
										
										
											2020-12-02 17:39:49 +01:00
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-												Chore: switch from os.path to pathlib.Path (#9933)

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2025-06-18 19:16:59 +02:00
+								    SAMPLE_FILES = Path(__file__).parent / "samples"
-												a couple fixes and more supported image files

											
										
										
											2020-12-02 17:39:49 +01:00
 								    def test_bmp(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        parser.parse(str(self.SAMPLE_FILES / "simple.bmp"), "image/bmp")
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Allows parsing of WebP format images

											
										
										
											2022-11-14 15:38:35 -08:00
+								        self.assertIn("this is a test document", parser.get_text().lower())
-												a couple fixes and more supported image files

											
										
										
											2020-12-02 17:39:49 +01:00
 								    def test_jpg(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        parser.parse(str(self.SAMPLE_FILES / "simple.jpg"), "image/jpeg")
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Allows parsing of WebP format images

											
										
										
											2022-11-14 15:38:35 -08:00
+								        self.assertIn("this is a test document", parser.get_text().lower())
-												a couple fixes and more supported image files

											
										
										
											2020-12-02 17:39:49 +01:00
-												Enhancement: support heic images (#9771)


											
										
										
											2025-04-23 09:22:21 -07:00
+								    def test_heic(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        parser.parse(str(self.SAMPLE_FILES / "simple.heic"), "image/heic")
-												Enhancement: support heic images (#9771)


											
										
										
											2025-04-23 09:22:21 -07:00
+								        self.assertIsFile(parser.archive_path)
 								        self.assertIn("pizza", parser.get_text().lower())
-												a couple fixes and more supported image files

											
										
										
											2020-12-02 17:39:49 +01:00
+								    @override_settings(OCR_IMAGE_DPI=200)
 								    def test_gif(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        parser.parse(str(self.SAMPLE_FILES / "simple.gif"), "image/gif")
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Allows parsing of WebP format images

											
										
										
											2022-11-14 15:38:35 -08:00
+								        self.assertIn("this is a test document", parser.get_text().lower())
-												a couple fixes and more supported image files

											
										
										
											2020-12-02 17:39:49 +01:00
 								    def test_tiff(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								        parser.parse(str(self.SAMPLE_FILES / "simple.tif"), "image/tiff")
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Allows parsing of WebP format images

											
										
										
											2022-11-14 15:38:35 -08:00
+								        self.assertIn("this is a test document", parser.get_text().lower())
 								    @override_settings(OCR_IMAGE_DPI=72)
 								    def test_webp(self):
 								        parser = RasterisedDocumentParser(None)
-												Chore: switch from os.path to pathlib.Path (#9933)

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2025-06-18 19:16:59 +02:00
+								        parser.parse(
-												Chore: switch from os.path to pathlib.Path (#10539)


											
										
										
											2025-09-03 17:12:41 +02:00
+								            str(self.SAMPLE_FILES / "document.webp"),
-												Chore: switch from os.path to pathlib.Path (#9933)

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
											
										
										
											2025-06-18 19:16:59 +02:00
+								            "image/webp",
 								        )
-												Creates a mix-in for asserting file system states

											
										
										
											2023-02-19 18:00:45 -08:00
+								        self.assertIsFile(parser.archive_path)
-												Don't consider better OCR as failing

Tesseract 5.3.0 does a better job at OCR, and correctly
reads "a webp" instead of "awebp", this is good, so we
don't want the test to fail.

											
										
										
											2023-07-11 16:41:31 +02:00
+								        # Older tesseracts consistently mangle the space between "a webp",
 								        # tesseract 5.3.0 seems to do a better job, so we're accepting both
 								        self.assertRegex(
-												Allows parsing of WebP format images

											
										
										
											2022-11-14 15:38:35 -08:00
+								            parser.get_text().lower(),
-												Don't consider better OCR as failing

Tesseract 5.3.0 does a better job at OCR, and correctly
reads "a webp" instead of "awebp", this is good, so we
don't want the test to fail.

											
										
										
											2023-07-11 16:41:31 +02:00
+								            r"this is a ?webp document, created 11/14/2022.",
-												Allows parsing of WebP format images

											
										
										
											2022-11-14 15:38:35 -08:00
+								        )