Add document analysis: page count, text extraction & language detection
Some checks failed
Build and Push Docker Image / build (push) Failing after 11s

Introduces a document analyzer service that extracts page count (via pdf-parse),
text preview, and detected language (via franc) from uploaded files. Analysis runs
automatically on upload (configurable via SystemSettings) and can be triggered
retroactively for existing files. Results are displayed as badges in the FileViewer
and fed to AI screening for language-based filtering criteria.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt
2026-02-17 10:08:04 +01:00
parent 90f36ac9b2
commit e5b7cdf670
13 changed files with 565 additions and 10 deletions

View File

@@ -0,0 +1,5 @@
-- AlterTable
ALTER TABLE "ProjectFile" ADD COLUMN "textPreview" TEXT;
ALTER TABLE "ProjectFile" ADD COLUMN "detectedLang" TEXT;
ALTER TABLE "ProjectFile" ADD COLUMN "langConfidence" DOUBLE PRECISION;
ALTER TABLE "ProjectFile" ADD COLUMN "analyzedAt" TIMESTAMP(3);

View File

@@ -689,6 +689,12 @@ model ProjectFile {
size Int // bytes
pageCount Int? // Number of pages (PDFs, presentations, etc.)
// Document analysis (optional, populated by document-analyzer service)
textPreview String? @db.Text // First ~2000 chars of extracted text
detectedLang String? // ISO 639-3 code (e.g. 'eng', 'fra', 'und')
langConfidence Float? // 0.01.0 confidence
analyzedAt DateTime? // When analysis last ran
// MinIO location
bucket String
objectKey String