Add document analysis: page count, text extraction & language detection

Introduces a document analyzer service that extracts page count (via pdf-parse), a text preview, and detected language (via franc) from uploaded files. Analysis runs automatically on upload (configurable via SystemSettings) and can be triggered retroactively for existing files. Results are displayed as badges in the FileViewer and fed to AI screening for language-based filtering criteria.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 10:08:04 +01:00
parent 90f36ac9b2
commit e5b7cdf670
13 changed files with 565 additions and 10 deletions
--- a/prisma/migrations/20260217100000_add_document_analysis_fields/migration.sql
+++ b/prisma/migrations/20260217100000_add_document_analysis_fields/migration.sql
@@ -0,0 +1,5 @@
+-- AlterTable: add the nullable document-analysis columns to "ProjectFile" in one statement
+ALTER TABLE "ProjectFile" ADD COLUMN "textPreview" TEXT,
+ADD COLUMN "detectedLang" TEXT,
+ADD COLUMN "langConfidence" DOUBLE PRECISION,
+ADD COLUMN "analyzedAt" TIMESTAMP(3);
--- a/prisma/schema.prisma
+++ b/prisma/schema.prisma
@@ -689,6 +689,12 @@ model ProjectFile {
  size      Int // bytes
  pageCount Int? // Number of pages (PDFs, presentations, etc.)

+  // Document analysis (optional, populated by document-analyzer service)
+  textPreview    String?   @db.Text // First ~2000 chars of extracted text
+  detectedLang   String?            // ISO 639-3 code (e.g. 'eng', 'fra', 'und')
+  langConfidence Float?             // 0.0–1.0 confidence
+  analyzedAt     DateTime?          // When analysis last ran
+
  // MinIO location
  bucket    String
  objectKey String