"""paperless-ngx/src/documents/classifier.py

Trains scikit-learn models on documents whose tags, correspondents, document
types and storage paths use automatic (AUTO) matching, and predicts those
labels for new documents.
"""
from __future__ import annotations
import logging
import pickle
import re
import warnings
from functools import lru_cache
from hashlib import sha256
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Iterator
from datetime import datetime
from numpy import ndarray
from django.conf import settings
from django.core.cache import cache
from django.core.cache import caches
from documents.caching import CACHE_5_MINUTES
from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY
from documents.caching import StoredLRUCache
from documents.models import Document
from documents.models import MatchingModel
logger = logging.getLogger("paperless.classifier")
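
# Advanced text processing (stemming and stop-word removal) requires NLTK
# to be enabled and a language to be configured.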
ADVANCED_TEXT_PROCESSING_ENABLED = (
settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED
)
read_cache = caches["read-cache"]
RE_DIGIT = re.compile(r"\d")
RE_WORD = re.compile(r"\b[\w]+\b") # words that may contain digits
class IncompatibleClassifierVersionError(Exception):
def __init__(self, message: str, *args: object) -> None:
self.message: str = message
super().__init__(*args)
class ClassifierModelCorruptError(Exception):
pass
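
# maxsize=1 keeps at most one loaded classifier per process, so repeated
# calls reuse it instead of re-reading the model file each time.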
@lru_cache(maxsize=1)
def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
if not settings.MODEL_FILE.is_file():
logger.debug(
"Document classification model does not exist (yet), not "
"performing automatic matching.",
)
return None
classifier = DocumentClassifier()
try:
classifier.load()
logger.debug("classifier_id=%s", id(classifier))
logger.debug(
"classifier_data_vectorizer_hash=%s",
classifier.data_vectorizer_hash,
)
except IncompatibleClassifierVersionError as e:
logger.info(f"Classifier version incompatible: {e.message}, will re-train")
Path(settings.MODEL_FILE).unlink()
classifier = None
if raise_exception:
raise e
except ClassifierModelCorruptError as e:
# there's something wrong with the model file.
logger.exception(
"Unrecoverable error while loading document "
"classification model, deleting model file.",
)
        Path(settings.MODEL_FILE).unlink()
classifier = None
if raise_exception:
raise e
except OSError as e:
logger.exception("IO error while loading document classification model")
classifier = None
if raise_exception:
raise e
except Exception as e: # pragma: no cover
logger.exception("Unknown error while loading document classification model")
classifier = None
if raise_exception:
raise e
return classifier
class DocumentClassifier:
# v7 - Updated scikit-learn package version
# v8 - Added storage path classifier
# v9 - Changed from hashing to time/ids for re-train check
# v10 - Switch persistence to joblib with memory-mapping to reduce load-time memory spikes
FORMAT_VERSION = 10
def __init__(self) -> None:
# last time a document changed and therefore training might be required
self.last_doc_change_time: datetime | None = None
# Hash of primary keys of AUTO matching values last used in training
self.last_auto_type_hash: bytes | None = None
self.data_vectorizer = None
self.data_vectorizer_hash = None
self.tags_binarizer = None
self.tags_classifier = None
self.correspondent_classifier = None
self.document_type_classifier = None
self.storage_path_classifier = None
self._stemmer = None
        # 10,000 entries use roughly 200 to 500 KB per worker,
        # and the same again in the shared Redis cache.
        # Keep this cache small to minimize lookup and I/O latency.
if ADVANCED_TEXT_PROCESSING_ENABLED:
self._stem_cache = StoredLRUCache(
f"stem_cache_v{self.FORMAT_VERSION}",
capacity=10000,
)
self._stop_words = None
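
    # Hash of the fitted vectorizer; it feeds into the per-content
    # vectorization cache key (see _get_vectorizer_cache_key).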
def _update_data_vectorizer_hash(self):
self.data_vectorizer_hash = sha256(
pickle.dumps(self.data_vectorizer),
).hexdigest()
def load(self) -> None:
import joblib
from sklearn.exceptions import InconsistentVersionWarning
        # Record warnings so they can be inspected after loading
with warnings.catch_warnings(record=True) as w:
try:
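                # mmap_mode="r" memory-maps the persisted arrays instead of
                # copying them into RAM, reducing load-time memory spikes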
state = joblib.load(settings.MODEL_FILE, mmap_mode="r")
except Exception as err:
                # As a fallback, try to detect an old pickle-based model
                # file and mark it as incompatible
try:
with Path(settings.MODEL_FILE).open("rb") as f:
_ = pickle.load(f)
raise IncompatibleClassifierVersionError(
"Cannot load classifier, incompatible versions.",
) from err
except IncompatibleClassifierVersionError:
raise
except Exception:
# Not even a readable pickle header
raise ClassifierModelCorruptError from err
try:
if (
not isinstance(state, dict)
or state.get("format_version") != self.FORMAT_VERSION
):
raise IncompatibleClassifierVersionError(
"Cannot load classifier, incompatible versions.",
)
self.last_doc_change_time = state.get("last_doc_change_time")
self.last_auto_type_hash = state.get("last_auto_type_hash")
self.data_vectorizer = state.get("data_vectorizer")
self._update_data_vectorizer_hash()
self.tags_binarizer = state.get("tags_binarizer")
self.tags_classifier = state.get("tags_classifier")
self.correspondent_classifier = state.get("correspondent_classifier")
self.document_type_classifier = state.get("document_type_classifier")
self.storage_path_classifier = state.get("storage_path_classifier")
except IncompatibleClassifierVersionError:
raise
except Exception as err:
raise ClassifierModelCorruptError from err
# Check for the warning about unpickling from differing versions
# and consider it incompatible
sk_learn_warning_url = (
"https://scikit-learn.org/stable/"
"model_persistence.html"
"#security-maintainability-limitations"
)
for warning in w:
                # Not every estimator raises InconsistentVersionWarning yet;
                # MLPClassifier does, while others still emit a generic
                # UserWarning linking to the scikit-learn persistence docs
if issubclass(warning.category, InconsistentVersionWarning) or (
issubclass(warning.category, UserWarning)
and sk_learn_warning_url in str(warning.message)
):
raise IncompatibleClassifierVersionError("sklearn version update")
def save(self) -> None:
import joblib
target_file: Path = settings.MODEL_FILE
target_file_temp: Path = target_file.with_suffix(".joblib.part")
state = {
"format_version": self.FORMAT_VERSION,
"last_doc_change_time": self.last_doc_change_time,
"last_auto_type_hash": self.last_auto_type_hash,
"data_vectorizer": self.data_vectorizer,
"tags_binarizer": self.tags_binarizer,
"tags_classifier": self.tags_classifier,
"correspondent_classifier": self.correspondent_classifier,
"document_type_classifier": self.document_type_classifier,
"storage_path_classifier": self.storage_path_classifier,
}
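        # Write to a temporary file and rename it into place so an
        # interrupted save never leaves a truncated model file behind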
joblib.dump(state, target_file_temp, compress=3)
target_file_temp.rename(target_file)
def train(self) -> bool:
# Get non-inbox documents
docs_queryset = (
Document.objects.exclude(
tags__is_inbox_tag=True,
)
.select_related("document_type", "correspondent", "storage_path")
.prefetch_related("tags")
.order_by("pk")
)
        # No documents exist to train against
if docs_queryset.count() == 0:
raise ValueError("No training data available.")
labels_tags = []
labels_correspondent = []
labels_document_type = []
labels_storage_path = []
# Step 1: Extract and preprocess training data from the database.
logger.debug("Gathering data from database...")
hasher = sha256()
for doc in docs_queryset:
y = -1
dt = doc.document_type
if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
y = dt.pk
hasher.update(y.to_bytes(4, "little", signed=True))
labels_document_type.append(y)
y = -1
cor = doc.correspondent
if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
y = cor.pk
hasher.update(y.to_bytes(4, "little", signed=True))
labels_correspondent.append(y)
tags: list[int] = list(
doc.tags.filter(matching_algorithm=MatchingModel.MATCH_AUTO)
.order_by("pk")
.values_list("pk", flat=True),
)
for tag in tags:
hasher.update(tag.to_bytes(4, "little", signed=True))
labels_tags.append(tags)
y = -1
sp = doc.storage_path
if sp and sp.matching_algorithm == MatchingModel.MATCH_AUTO:
y = sp.pk
hasher.update(y.to_bytes(4, "little", signed=True))
labels_storage_path.append(y)
labels_tags_unique = {tag for tags in labels_tags for tag in tags}
num_tags = len(labels_tags_unique)
        # Check if retraining is actually required:
        # - a document has been updated since the classifier was trained, or
        # - new AUTO-matching tags, types, correspondents or storage paths exist
latest_doc_change = docs_queryset.latest("modified").modified
if (
self.last_doc_change_time is not None
and self.last_doc_change_time >= latest_doc_change
) and self.last_auto_type_hash == hasher.digest():
logger.info("No updates since last training")
# Set the classifier information into the cache
# Caching for 50 minutes, so slightly less than the normal retrain time
cache.set(
CLASSIFIER_MODIFIED_KEY,
self.last_doc_change_time,
CACHE_50_MINUTES,
)
cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
return False
# subtract 1 since -1 (null) is also part of the classes.
# union with {-1} accounts for cases where all documents have
# correspondents and types assigned, so -1 isn't part of labels_x, which
# it usually is.
num_correspondents: int = len(set(labels_correspondent) | {-1}) - 1
num_document_types: int = len(set(labels_document_type) | {-1}) - 1
num_storage_paths: int = len(set(labels_storage_path) | {-1}) - 1
        logger.debug(
            f"{docs_queryset.count()} documents, {num_tags} tag(s), {num_correspondents} correspondent(s), "
            f"{num_document_types} document type(s), {num_storage_paths} storage path(s)",
        )
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
# Step 2: vectorize data
logger.debug("Vectorizing data...")
def content_generator() -> Iterator[str]:
"""
            Generates the content of documents, one at a time
"""
for doc in docs_queryset:
yield self.preprocess_content(doc.content, shared_cache=False)
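
        # Count unigrams and bigrams; min_df=0.01 drops terms that appear in
        # fewer than 1% of documents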
self.data_vectorizer = CountVectorizer(
analyzer="word",
ngram_range=(1, 2),
min_df=0.01,
)
data_vectorized: ndarray = self.data_vectorizer.fit_transform(
content_generator(),
)
# See the notes here:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# This attribute isn't needed to function and can be large
self.data_vectorizer.stop_words_ = None
# Step 3: train the classifiers
if num_tags > 0:
logger.debug("Training tags classifier...")
if num_tags == 1:
                # Special case where only one tag uses auto matching:
                # fall back to binary classification.
labels_tags = [
label[0] if len(label) == 1 else -1 for label in labels_tags
]
self.tags_binarizer = LabelBinarizer()
labels_tags_vectorized: ndarray = self.tags_binarizer.fit_transform(
labels_tags,
).ravel()
else:
self.tags_binarizer = MultiLabelBinarizer()
labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)
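            # tol=0.01 relaxes MLPClassifier's convergence tolerance so
            # training stops earlier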
self.tags_classifier = MLPClassifier(tol=0.01)
self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
else:
self.tags_classifier = None
logger.debug("There are no tags. Not training tags classifier.")
if num_correspondents > 0:
logger.debug("Training correspondent classifier...")
self.correspondent_classifier = MLPClassifier(tol=0.01)
self.correspondent_classifier.fit(data_vectorized, labels_correspondent)
else:
self.correspondent_classifier = None
logger.debug(
"There are no correspondents. Not training correspondent classifier.",
)
if num_document_types > 0:
logger.debug("Training document type classifier...")
self.document_type_classifier = MLPClassifier(tol=0.01)
self.document_type_classifier.fit(data_vectorized, labels_document_type)
else:
self.document_type_classifier = None
logger.debug(
"There are no document types. Not training document type classifier.",
)
if num_storage_paths > 0:
logger.debug(
"Training storage paths classifier...",
)
self.storage_path_classifier = MLPClassifier(tol=0.01)
self.storage_path_classifier.fit(
data_vectorized,
labels_storage_path,
)
else:
self.storage_path_classifier = None
logger.debug(
"There are no storage paths. Not training storage path classifier.",
)
self.last_doc_change_time = latest_doc_change
self.last_auto_type_hash = hasher.digest()
self._update_data_vectorizer_hash()
# Set the classifier information into the cache
# Caching for 50 minutes, so slightly less than the normal retrain time
cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_50_MINUTES)
cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
return True
def _init_advanced_text_processing(self):
if self._stop_words is None or self._stemmer is None:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Not really hacky, since it isn't private and is documented, but
# set the search path for NLTK data to the single location it should be in
nltk.data.path = [settings.NLTK_DIR]
try:
                # Preload the corpus early, forcing the lazy loader to resolve it
                stopwords.ensure_loaded()
                # One-time setup: multiple threads sometimes load the corpus
                # concurrently, which isn't thread safe and raises an AttributeError
self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
self._stop_words = frozenset(stopwords.words(settings.NLTK_LANGUAGE))
except AttributeError:
logger.debug("Could not initialize NLTK for advanced text processing.")
return False
return True
def stem_and_skip_stop_words(self, words: list[str], *, shared_cache=True):
"""
Reduce a list of words to their stem. Stop words are converted to empty strings.
:param words: the list of words to stem
"""
def _stem_and_skip_stop_word(word: str):
"""
Reduce a given word to its stem. If it's a stop word, return an empty string.
E.g. "amazement", "amaze" and "amazed" all return "amaz".
"""
cached = self._stem_cache.get(word)
if cached is not None:
return cached
elif word in self._stop_words:
return ""
# Assumption: words that contain numbers are never stemmed
elif RE_DIGIT.search(word):
return word
else:
result = self._stemmer.stem(word)
self._stem_cache.set(word, result)
return result
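
        # Pull the shared stem cache from the backend before processing and
        # push updates back afterwards, so workers share stemming work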
if shared_cache:
self._stem_cache.load()
# Stem the words and skip stop words
result = " ".join(
filter(None, (_stem_and_skip_stop_word(w) for w in words)),
)
if shared_cache:
self._stem_cache.save()
return result
def preprocess_content(
self,
content: str,
*,
shared_cache=True,
) -> str:
"""
Process the contents of a document, distilling it down into
words which are meaningful to the content.
A stemmer cache is shared across workers with the parameter "shared_cache".
This is unnecessary when training the classifier.
"""
        # Lower-case the document, collapse whitespace,
        # and keep only letters and digits.
content = " ".join(match.group().lower() for match in RE_WORD.finditer(content))
if ADVANCED_TEXT_PROCESSING_ENABLED:
from nltk.tokenize import word_tokenize
if not self._init_advanced_text_processing():
return content
# Tokenize
# This splits the content into tokens, roughly words
words = word_tokenize(content, language=settings.NLTK_LANGUAGE)
# Stem the words and skip stop words
content = self.stem_and_skip_stop_words(words, shared_cache=shared_cache)
return content
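
    # The cache key mixes the content hash with the format version, the NLTK
    # settings and the vectorizer hash, so any change to preprocessing or to
    # the trained vectorizer invalidates previously cached vectors.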
def _get_vectorizer_cache_key(self, content: str):
hash = sha256(content.encode())
hash.update(
f"|{self.FORMAT_VERSION}|{settings.NLTK_LANGUAGE}|{settings.NLTK_ENABLED}|{self.data_vectorizer_hash}".encode(),
)
return f"vectorized_content_{hash.hexdigest()}"
def _vectorize(self, content: str):
key = self._get_vectorizer_cache_key(content)
serialized_result = read_cache.get(key)
if serialized_result is None:
result = self.data_vectorizer.transform([self.preprocess_content(content)])
read_cache.set(key, pickle.dumps(result), CACHE_5_MINUTES)
else:
read_cache.touch(key, CACHE_5_MINUTES)
result = pickle.loads(serialized_result)
return result
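
    # The classifiers are trained with -1 as the "no label" class, so a -1
    # prediction maps to None.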
def predict_correspondent(self, content: str) -> int | None:
if self.correspondent_classifier:
X = self._vectorize(content)
correspondent_id = self.correspondent_classifier.predict(X)
if correspondent_id != -1:
return correspondent_id
else:
return None
else:
return None
def predict_document_type(self, content: str) -> int | None:
if self.document_type_classifier:
X = self._vectorize(content)
document_type_id = self.document_type_classifier.predict(X)
if document_type_id != -1:
return document_type_id
else:
return None
else:
return None
def predict_tags(self, content: str) -> list[int]:
from sklearn.utils.multiclass import type_of_target
if self.tags_classifier:
X = self._vectorize(content)
y = self.tags_classifier.predict(X)
tags_ids = self.tags_binarizer.inverse_transform(y)[0]
if type_of_target(y).startswith("multilabel"):
# the usual case when there are multiple tags.
return list(tags_ids)
elif type_of_target(y) == "binary" and tags_ids != -1:
# This is for when we have binary classification with only one
# tag and the result is to assign this tag.
return [tags_ids]
else:
# Usually binary as well with -1 as the result, but we're
# going to catch everything else here as well.
return []
else:
return []
def predict_storage_path(self, content: str) -> int | None:
if self.storage_path_classifier:
X = self._vectorize(content)
storage_path_id = self.storage_path_classifier.predict(X)
if storage_path_id != -1:
return storage_path_id
else:
return None
else:
return None