Add document analysis: page count, text extraction & language detection
Build and Push Docker Image / build (push) Failing after 11s
Details
Build and Push Docker Image / build (push) Failing after 11s
Details
Introduces a document analyzer service that extracts page count (via pdf-parse), text preview, and detected language (via franc) from uploaded files. Analysis runs automatically on upload (configurable via SystemSettings) and can be triggered retroactively for existing files. Results are displayed as badges in the FileViewer and fed to AI screening for language-based filtering criteria. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
90f36ac9b2
commit
e5b7cdf670
|
|
@ -49,6 +49,7 @@
|
||||||
"cmdk": "^1.0.4",
|
"cmdk": "^1.0.4",
|
||||||
"csv-parse": "^6.1.0",
|
"csv-parse": "^6.1.0",
|
||||||
"date-fns": "^4.1.0",
|
"date-fns": "^4.1.0",
|
||||||
|
"franc": "^6.2.0",
|
||||||
"html2canvas": "^1.4.1",
|
"html2canvas": "^1.4.1",
|
||||||
"jspdf": "^4.1.0",
|
"jspdf": "^4.1.0",
|
||||||
"jspdf-autotable": "^5.0.7",
|
"jspdf-autotable": "^5.0.7",
|
||||||
|
|
@ -6147,6 +6148,16 @@
|
||||||
"react-dom": "^18 || ^19 || ^19.0.0-rc"
|
"react-dom": "^18 || ^19 || ^19.0.0-rc"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/collapse-white-space": {
|
||||||
|
"version": "2.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/collapse-white-space/-/collapse-white-space-2.1.0.tgz",
|
||||||
|
"integrity": "sha512-loKTxY1zCOuG4j9f6EPnuyyYkf58RnhhWTvRoZEokgB+WbdXehfjFviyOVYkqzEWz1Q5kRiZdBYS5SwxbQYwzw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/wooorm"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/color-convert": {
|
"node_modules/color-convert": {
|
||||||
"version": "2.0.1",
|
"version": "2.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
|
||||||
|
|
@ -7736,6 +7747,19 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/franc": {
|
||||||
|
"version": "6.2.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/franc/-/franc-6.2.0.tgz",
|
||||||
|
"integrity": "sha512-rcAewP7PSHvjq7Kgd7dhj82zE071kX5B4W1M4ewYMf/P+i6YsDQmj62Xz3VQm9zyUzUXwhIde/wHLGCMrM+yGg==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"trigram-utils": "^2.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/wooorm"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/fsevents": {
|
"node_modules/fsevents": {
|
||||||
"version": "2.3.2",
|
"version": "2.3.2",
|
||||||
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
||||||
|
|
@ -10441,6 +10465,16 @@
|
||||||
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
|
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/n-gram": {
|
||||||
|
"version": "2.0.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/n-gram/-/n-gram-2.0.2.tgz",
|
||||||
|
"integrity": "sha512-S24aGsn+HLBxUGVAUFOwGpKs7LBcG4RudKU//eWzt/mQ97/NMKQxDWHyHx63UNWk/OOdihgmzoETn1tf5nQDzQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/wooorm"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/nanoid": {
|
"node_modules/nanoid": {
|
||||||
"version": "3.3.11",
|
"version": "3.3.11",
|
||||||
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz",
|
"resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz",
|
||||||
|
|
@ -13110,6 +13144,20 @@
|
||||||
"integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
|
"integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/trigram-utils": {
|
||||||
|
"version": "2.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/trigram-utils/-/trigram-utils-2.0.1.tgz",
|
||||||
|
"integrity": "sha512-nfWIXHEaB+HdyslAfMxSqWKDdmqY9I32jS7GnqpdWQnLH89r6A5sdk3fDVYqGAZ0CrT8ovAFSAo6HRiWcWNIGQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"collapse-white-space": "^2.0.0",
|
||||||
|
"n-gram": "^2.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/wooorm"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/trim-lines": {
|
"node_modules/trim-lines": {
|
||||||
"version": "3.0.1",
|
"version": "3.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/trim-lines/-/trim-lines-3.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/trim-lines/-/trim-lines-3.0.1.tgz",
|
||||||
|
|
|
||||||
|
|
@ -62,6 +62,7 @@
|
||||||
"cmdk": "^1.0.4",
|
"cmdk": "^1.0.4",
|
||||||
"csv-parse": "^6.1.0",
|
"csv-parse": "^6.1.0",
|
||||||
"date-fns": "^4.1.0",
|
"date-fns": "^4.1.0",
|
||||||
|
"franc": "^6.2.0",
|
||||||
"html2canvas": "^1.4.1",
|
"html2canvas": "^1.4.1",
|
||||||
"jspdf": "^4.1.0",
|
"jspdf": "^4.1.0",
|
||||||
"jspdf-autotable": "^5.0.7",
|
"jspdf-autotable": "^5.0.7",
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
-- AlterTable
|
||||||
|
ALTER TABLE "ProjectFile" ADD COLUMN "textPreview" TEXT;
|
||||||
|
ALTER TABLE "ProjectFile" ADD COLUMN "detectedLang" TEXT;
|
||||||
|
ALTER TABLE "ProjectFile" ADD COLUMN "langConfidence" DOUBLE PRECISION;
|
||||||
|
ALTER TABLE "ProjectFile" ADD COLUMN "analyzedAt" TIMESTAMP(3);
|
||||||
|
|
@ -689,6 +689,12 @@ model ProjectFile {
|
||||||
size Int // bytes
|
size Int // bytes
|
||||||
pageCount Int? // Number of pages (PDFs, presentations, etc.)
|
pageCount Int? // Number of pages (PDFs, presentations, etc.)
|
||||||
|
|
||||||
|
// Document analysis (optional, populated by document-analyzer service)
|
||||||
|
textPreview String? @db.Text // First ~2000 chars of extracted text
|
||||||
|
detectedLang String? // ISO 639-3 code (e.g. 'eng', 'fra', 'und')
|
||||||
|
langConfidence Float? // 0.0–1.0 confidence
|
||||||
|
analyzedAt DateTime? // When analysis last ran
|
||||||
|
|
||||||
// MinIO location
|
// MinIO location
|
||||||
bucket String
|
bucket String
|
||||||
objectKey String
|
objectKey String
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,10 @@ import {
|
||||||
Heart,
|
Heart,
|
||||||
Crown,
|
Crown,
|
||||||
UserPlus,
|
UserPlus,
|
||||||
|
Loader2,
|
||||||
|
ScanSearch,
|
||||||
} from 'lucide-react'
|
} from 'lucide-react'
|
||||||
|
import { toast } from 'sonner'
|
||||||
import { formatDate, formatDateOnly } from '@/lib/utils'
|
import { formatDate, formatDateOnly } from '@/lib/utils'
|
||||||
|
|
||||||
interface PageProps {
|
interface PageProps {
|
||||||
|
|
@ -529,15 +532,20 @@ function ProjectDetailContent({ projectId }: { projectId: string }) {
|
||||||
<AnimatedCard index={4}>
|
<AnimatedCard index={4}>
|
||||||
<Card>
|
<Card>
|
||||||
<CardHeader>
|
<CardHeader>
|
||||||
<CardTitle className="flex items-center gap-2.5 text-lg">
|
<div className="flex items-center justify-between">
|
||||||
<div className="rounded-lg bg-rose-500/10 p-1.5">
|
<div>
|
||||||
<FileText className="h-4 w-4 text-rose-500" />
|
<CardTitle className="flex items-center gap-2.5 text-lg">
|
||||||
|
<div className="rounded-lg bg-rose-500/10 p-1.5">
|
||||||
|
<FileText className="h-4 w-4 text-rose-500" />
|
||||||
|
</div>
|
||||||
|
Files
|
||||||
|
</CardTitle>
|
||||||
|
<CardDescription>
|
||||||
|
Project documents and materials organized by competition round
|
||||||
|
</CardDescription>
|
||||||
</div>
|
</div>
|
||||||
Files
|
<AnalyzeDocumentsButton projectId={projectId} onComplete={() => utils.file.listByProject.invalidate({ projectId })} />
|
||||||
</CardTitle>
|
</div>
|
||||||
<CardDescription>
|
|
||||||
Project documents and materials organized by competition round
|
|
||||||
</CardDescription>
|
|
||||||
</CardHeader>
|
</CardHeader>
|
||||||
<CardContent className="space-y-6">
|
<CardContent className="space-y-6">
|
||||||
{/* Requirements organized by round */}
|
{/* Requirements organized by round */}
|
||||||
|
|
@ -664,6 +672,11 @@ function ProjectDetailContent({ projectId }: { projectId: string }) {
|
||||||
size: f.size,
|
size: f.size,
|
||||||
bucket: f.bucket,
|
bucket: f.bucket,
|
||||||
objectKey: f.objectKey,
|
objectKey: f.objectKey,
|
||||||
|
pageCount: f.pageCount,
|
||||||
|
textPreview: f.textPreview,
|
||||||
|
detectedLang: f.detectedLang,
|
||||||
|
langConfidence: f.langConfidence,
|
||||||
|
analyzedAt: f.analyzedAt ? String(f.analyzedAt) : null,
|
||||||
}))}
|
}))}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -847,6 +860,36 @@ function ProjectDetailSkeleton() {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Outline button that runs the `file.analyzeProjectFiles` mutation for one project.
 *
 * While the mutation is pending the button is disabled and shows a spinner.
 * On success a toast reports how many files were analyzed (and how many
 * failed, if any) and `onComplete` is invoked; on error the server message
 * (or a generic fallback) is shown in an error toast.
 */
function AnalyzeDocumentsButton({ projectId, onComplete }: { projectId: string; onComplete: () => void }) {
  const analysis = trpc.file.analyzeProjectFiles.useMutation({
    onSuccess: ({ analyzed, failed }) => {
      const plural = analyzed !== 1 ? 's' : ''
      const failureNote = failed > 0 ? ` (${failed} failed)` : ''
      toast.success(`Analyzed ${analyzed} file${plural}${failureNote}`)
      onComplete()
    },
    onError: (error) => {
      toast.error(error.message || 'Analysis failed')
    },
  })

  const busy = analysis.isPending
  const startAnalysis = () => analysis.mutate({ projectId })

  // Spinner while running, scan icon when idle.
  const icon = busy
    ? <Loader2 className="mr-2 h-4 w-4 animate-spin" />
    : <ScanSearch className="mr-2 h-4 w-4" />

  return (
    <Button variant="outline" size="sm" onClick={startAnalysis} disabled={busy}>
      {icon}
      {busy ? 'Analyzing...' : 'Analyze Documents'}
    </Button>
  )
}
|
||||||
|
|
||||||
export default function ProjectDetailPage({ params }: PageProps) {
|
export default function ProjectDetailPage({ params }: PageProps) {
|
||||||
const { id } = use(params)
|
const { id } = use(params)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1599,7 +1599,7 @@ function FilteringRulesSection({ roundId }: { roundId: string }) {
|
||||||
className="text-sm"
|
className="text-sm"
|
||||||
/>
|
/>
|
||||||
<p className="text-xs text-muted-foreground mt-1">
|
<p className="text-xs text-muted-foreground mt-1">
|
||||||
The AI has access to: category, country, region, founded year, ocean issue, tags, description, file details (type, page count, size), and team size.
|
The AI has access to: category, country, region, founded year, ocean issue, tags, description, file details (type, page count, size, detected language), and team size.
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -65,6 +65,12 @@ interface ProjectFile {
|
||||||
isLate?: boolean
|
isLate?: boolean
|
||||||
requirementId?: string | null
|
requirementId?: string | null
|
||||||
requirement?: FileRequirementInfo | null
|
requirement?: FileRequirementInfo | null
|
||||||
|
// Document analysis fields
|
||||||
|
pageCount?: number | null
|
||||||
|
textPreview?: string | null
|
||||||
|
detectedLang?: string | null
|
||||||
|
langConfidence?: number | null
|
||||||
|
analyzedAt?: Date | string | null
|
||||||
}
|
}
|
||||||
|
|
||||||
interface RoundGroup {
|
interface RoundGroup {
|
||||||
|
|
@ -270,6 +276,25 @@ function FileItem({ file }: { file: ProjectFile }) {
|
||||||
</Badge>
|
</Badge>
|
||||||
)}
|
)}
|
||||||
<span>{formatFileSize(file.size)}</span>
|
<span>{formatFileSize(file.size)}</span>
|
||||||
|
{file.pageCount != null && (
|
||||||
|
<Badge variant="outline" className="text-xs gap-1">
|
||||||
|
<FileText className="h-3 w-3" />
|
||||||
|
{file.pageCount} {file.pageCount === 1 ? 'page' : 'pages'}
|
||||||
|
</Badge>
|
||||||
|
)}
|
||||||
|
{file.detectedLang && file.detectedLang !== 'und' && (
|
||||||
|
<Badge
|
||||||
|
variant="outline"
|
||||||
|
className={cn('text-xs font-mono uppercase', {
|
||||||
|
'border-green-300 text-green-700 bg-green-50': file.langConfidence != null && file.langConfidence >= 0.8,
|
||||||
|
'border-amber-300 text-amber-700 bg-amber-50': file.langConfidence != null && file.langConfidence >= 0.4 && file.langConfidence < 0.8,
|
||||||
|
'border-red-300 text-red-700 bg-red-50': file.langConfidence != null && file.langConfidence < 0.4,
|
||||||
|
})}
|
||||||
|
title={`Language: ${file.detectedLang} (${Math.round((file.langConfidence ?? 0) * 100)}% confidence)`}
|
||||||
|
>
|
||||||
|
{file.detectedLang.toUpperCase()}
|
||||||
|
</Badge>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -421,6 +421,14 @@ export const applicantRouter = router({
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Auto-analyze document (fire-and-forget, delayed for presigned upload)
|
||||||
|
import('../services/document-analyzer').then(({ analyzeFileDelayed, isAutoAnalysisEnabled }) =>
|
||||||
|
isAutoAnalysisEnabled().then((enabled) => {
|
||||||
|
if (enabled) analyzeFileDelayed(file.id).catch((err) =>
|
||||||
|
console.warn('[DocAnalyzer] Post-upload analysis failed:', err))
|
||||||
|
})
|
||||||
|
).catch(() => {})
|
||||||
|
|
||||||
return file
|
return file
|
||||||
}),
|
}),
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -206,6 +206,14 @@ export const fileRouter = router({
|
||||||
userAgent: ctx.userAgent,
|
userAgent: ctx.userAgent,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// Auto-analyze document (fire-and-forget, delayed for presigned upload)
|
||||||
|
import('../services/document-analyzer').then(({ analyzeFileDelayed, isAutoAnalysisEnabled }) =>
|
||||||
|
isAutoAnalysisEnabled().then((enabled) => {
|
||||||
|
if (enabled) analyzeFileDelayed(file.id).catch((err) =>
|
||||||
|
console.warn('[DocAnalyzer] Post-upload analysis failed:', err))
|
||||||
|
})
|
||||||
|
).catch(() => {})
|
||||||
|
|
||||||
return {
|
return {
|
||||||
uploadUrl,
|
uploadUrl,
|
||||||
file,
|
file,
|
||||||
|
|
@ -1201,6 +1209,14 @@ export const fileRouter = router({
|
||||||
userAgent: ctx.userAgent,
|
userAgent: ctx.userAgent,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// Auto-analyze document (fire-and-forget, delayed for presigned upload)
|
||||||
|
import('../services/document-analyzer').then(({ analyzeFileDelayed, isAutoAnalysisEnabled }) =>
|
||||||
|
isAutoAnalysisEnabled().then((enabled) => {
|
||||||
|
if (enabled) analyzeFileDelayed(file.id).catch((err) =>
|
||||||
|
console.warn('[DocAnalyzer] Post-upload analysis failed:', err))
|
||||||
|
})
|
||||||
|
).catch(() => {})
|
||||||
|
|
||||||
return { uploadUrl, file }
|
return { uploadUrl, file }
|
||||||
}),
|
}),
|
||||||
|
|
||||||
|
|
@ -1510,6 +1526,14 @@ export const fileRouter = router({
|
||||||
ctx.prisma,
|
ctx.prisma,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Auto-analyze document (fire-and-forget, delayed for presigned upload)
|
||||||
|
import('../services/document-analyzer').then(({ analyzeFileDelayed, isAutoAnalysisEnabled }) =>
|
||||||
|
isAutoAnalysisEnabled().then((enabled) => {
|
||||||
|
if (enabled) analyzeFileDelayed(file.id).catch((err) =>
|
||||||
|
console.warn('[DocAnalyzer] Post-upload analysis failed:', err))
|
||||||
|
})
|
||||||
|
).catch(() => {})
|
||||||
|
|
||||||
return { uploadUrl, file }
|
return { uploadUrl, file }
|
||||||
}),
|
}),
|
||||||
|
|
||||||
|
|
@ -1545,4 +1569,25 @@ export const fileRouter = router({
|
||||||
)
|
)
|
||||||
return results
|
return results
|
||||||
}),
|
}),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyze all files for a specific project (page count, language, text preview).
|
||||||
|
* Retroactive: re-analyzes even previously analyzed files.
|
||||||
|
*/
|
||||||
|
analyzeProjectFiles: adminProcedure
|
||||||
|
.input(z.object({ projectId: z.string() }))
|
||||||
|
.mutation(async ({ input }) => {
|
||||||
|
const { analyzeProjectFiles } = await import('../services/document-analyzer')
|
||||||
|
return analyzeProjectFiles(input.projectId)
|
||||||
|
}),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Batch analyze all unanalyzed files across the platform.
|
||||||
|
* For retroactive analysis of files uploaded before this feature.
|
||||||
|
*/
|
||||||
|
analyzeAllFiles: adminProcedure
|
||||||
|
.mutation(async () => {
|
||||||
|
const { analyzeAllUnanalyzed } = await import('../services/document-analyzer')
|
||||||
|
return analyzeAllUnanalyzed()
|
||||||
|
}),
|
||||||
})
|
})
|
||||||
|
|
|
||||||
|
|
@ -69,6 +69,8 @@ export async function runFilteringJob(jobId: string, roundId: string, userId: st
|
||||||
mimeType: true,
|
mimeType: true,
|
||||||
size: true,
|
size: true,
|
||||||
pageCount: true,
|
pageCount: true,
|
||||||
|
detectedLang: true,
|
||||||
|
langConfidence: true,
|
||||||
objectKey: true,
|
objectKey: true,
|
||||||
roundId: true,
|
roundId: true,
|
||||||
createdAt: true,
|
createdAt: true,
|
||||||
|
|
|
||||||
|
|
@ -179,10 +179,11 @@ Return a JSON object with this exact structure:
|
||||||
- founded_year: when the company/initiative was founded (use for age checks)
|
- founded_year: when the company/initiative was founded (use for age checks)
|
||||||
- ocean_issue: the ocean conservation area
|
- ocean_issue: the ocean conservation area
|
||||||
- file_count, file_types: uploaded documents summary
|
- file_count, file_types: uploaded documents summary
|
||||||
- files[]: per-file details with file_type, page_count (if known), size_kb, round_name (which round the file was submitted for), and is_current_round flag
|
- files[]: per-file details with file_type, page_count (if known), size_kb, detected_lang (ISO 639-3 language code like 'eng', 'fra'), lang_confidence (0-1), round_name (which round the file was submitted for), and is_current_round flag
|
||||||
- description: project summary text
|
- description: project summary text
|
||||||
- tags: topic tags
|
- tags: topic tags
|
||||||
- If document content is provided (text_content field in files), use it for deeper analysis. Pay SPECIAL ATTENTION to files from the current round (is_current_round=true) as they are the most recent and relevant submissions.
|
- If document content is provided (text_content field in files), use it for deeper analysis. Pay SPECIAL ATTENTION to files from the current round (is_current_round=true) as they are the most recent and relevant submissions.
|
||||||
|
- If detected_lang is provided, use it to evaluate language requirements (e.g. 'eng' = English, 'fra' = French). lang_confidence indicates detection reliability.
|
||||||
|
|
||||||
## Guidelines
|
## Guidelines
|
||||||
- Evaluate ONLY against the provided criteria, not your own standards
|
- Evaluate ONLY against the provided criteria, not your own standards
|
||||||
|
|
|
||||||
|
|
@ -83,6 +83,8 @@ export interface AnonymizedFileInfo {
|
||||||
file_type: string // FileType enum value
|
file_type: string // FileType enum value
|
||||||
page_count: number | null // Number of pages if known
|
page_count: number | null // Number of pages if known
|
||||||
size_kb: number // File size in KB
|
size_kb: number // File size in KB
|
||||||
|
detected_lang?: string | null // ISO 639-3 language code (e.g. 'eng', 'fra')
|
||||||
|
lang_confidence?: number | null // 0.0–1.0 confidence score
|
||||||
round_name?: string | null // Which round the file was submitted for
|
round_name?: string | null // Which round the file was submitted for
|
||||||
is_current_round?: boolean // Whether this file belongs to the current filtering/evaluation round
|
is_current_round?: boolean // Whether this file belongs to the current filtering/evaluation round
|
||||||
text_content?: string // Extracted text content (when aiParseFiles is enabled)
|
text_content?: string // Extracted text content (when aiParseFiles is enabled)
|
||||||
|
|
@ -309,6 +311,8 @@ export function anonymizeProjectForAI(
|
||||||
file_type: f.fileType ?? 'OTHER',
|
file_type: f.fileType ?? 'OTHER',
|
||||||
page_count: f.pageCount ?? null,
|
page_count: f.pageCount ?? null,
|
||||||
size_kb: Math.round((f.size ?? 0) / 1024),
|
size_kb: Math.round((f.size ?? 0) / 1024),
|
||||||
|
...(f.detectedLang ? { detected_lang: f.detectedLang } : {}),
|
||||||
|
...(f.langConfidence != null ? { lang_confidence: f.langConfidence } : {}),
|
||||||
...(f.roundName ? { round_name: f.roundName } : {}),
|
...(f.roundName ? { round_name: f.roundName } : {}),
|
||||||
...(f.isCurrentRound !== undefined ? { is_current_round: f.isCurrentRound } : {}),
|
...(f.isCurrentRound !== undefined ? { is_current_round: f.isCurrentRound } : {}),
|
||||||
...(f.textContent ? { text_content: f.textContent } : {}),
|
...(f.textContent ? { text_content: f.textContent } : {}),
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,367 @@
|
||||||
|
/**
|
||||||
|
* Document Analyzer Service
|
||||||
|
*
|
||||||
|
* Extracts metadata from uploaded files:
|
||||||
|
* - Page count (PDFs)
|
||||||
|
* - Text preview (first ~2000 chars)
|
||||||
|
* - Language detection via franc
|
||||||
|
*
|
||||||
|
* Runs optionally on upload (controlled by SystemSettings) and
|
||||||
|
* retroactively via admin endpoint.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { getStorageProvider } from '@/lib/storage'
|
||||||
|
import { isParseableMimeType } from './file-content-extractor'
|
||||||
|
import { prisma } from '@/lib/prisma'
|
||||||
|
|
||||||
|
const TEXT_PREVIEW_LIMIT = 2000
|
||||||
|
const BATCH_SIZE = 10
|
||||||
|
|
||||||
|
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export type AnalysisResult = {
|
||||||
|
fileId: string
|
||||||
|
pageCount: number | null
|
||||||
|
textPreview: string | null
|
||||||
|
detectedLang: string | null
|
||||||
|
langConfidence: number | null
|
||||||
|
error?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Language Detection ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Detect the dominant language of `text` using franc's `francAll`.
 *
 * Only the first 5000 characters are sampled. The result is the ISO 639-3
 * code of the top-ranked candidate together with its score, clamped to the
 * 0–1 range and rounded to two decimals. Scores are treated as
 * "higher is better" (1 = best match).
 *
 * Returns `{ lang: 'und', confidence: 0 }` when the trimmed text is shorter
 * than 20 characters, when franc yields no candidates, or when its top
 * candidate is `'und'`.
 *
 * NOTE(review): francAll appears to normalise scores relative to the best
 * candidate, so the top score is typically 1 and this "confidence" may carry
 * little information — confirm, and consider using the margin over the
 * runner-up instead.
 */
async function detectLanguage(
  text: string
): Promise<{ lang: string; confidence: number }> {
  const tooShort = !text || text.trim().length < 20
  if (tooShort) {
    return { lang: 'und', confidence: 0 }
  }

  const { francAll } = await import('franc')

  // A bounded sample keeps detection cheap on very large documents.
  const candidates = francAll(text.slice(0, 5000), { minLength: 10 })

  const best = candidates && candidates.length > 0 ? candidates[0] : null
  if (best === null || best[0] === 'und') {
    return { lang: 'und', confidence: 0 }
  }

  const [lang, score] = best
  const clamped = Math.min(1, Math.max(0, score))

  return { lang, confidence: Math.round(clamped * 100) / 100 }
}
|
||||||
|
|
||||||
|
// ─── Core Analysis ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Cut `text` to at most `limit` UTF-16 code units without leaving a dangling
 * high surrogate at the end (which would otherwise become a broken character).
 */
function truncateAtCodePoint(text: string, limit: number): string {
  if (text.length <= limit) return text
  const lastUnit = text.charCodeAt(limit - 1)
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff
  return text.slice(0, endsOnHighSurrogate ? limit - 1 : limit)
}

/**
 * Analyze a single stored file: page count, text preview and detected language.
 *
 * The object is fetched through the storage provider. PDFs are parsed with
 * pdf-parse (text + `numpages`); every other parseable mime type is decoded
 * as UTF-8 text and has no page count.
 *
 * This function never throws: unsupported mime types and any failure while
 * downloading/parsing are reported through the `error` field of the result,
 * with the remaining fields left at whatever was determined before the failure.
 *
 * @param objectKey Storage key of the object to download.
 * @param bucket    Bucket the file record points at.
 *                  NOTE(review): currently not forwarded to `getObject` —
 *                  confirm the storage provider resolves the bucket itself.
 * @param mimeType  Mime type used to pick the parsing strategy.
 * @param fileName  Only used in the warning log on failure.
 * @param fileId    Echoed back in the result.
 * @returns The analysis result; `textPreview` holds at most
 *          TEXT_PREVIEW_LIMIT characters, language fields are only set when
 *          the trimmed text has at least 20 characters.
 */
export async function analyzeFileContent(
  objectKey: string,
  bucket: string,
  mimeType: string,
  fileName: string,
  fileId: string
): Promise<AnalysisResult> {
  const result: AnalysisResult = {
    fileId,
    pageCount: null,
    textPreview: null,
    detectedLang: null,
    langConfidence: null,
  }

  if (!isParseableMimeType(mimeType)) {
    return { ...result, error: 'Unsupported mime type for analysis' }
  }

  try {
    const storage = await getStorageProvider()
    const buffer = await storage.getObject(objectKey)

    let text = ''
    let pageCount: number | null = null

    if (mimeType === 'application/pdf') {
      // pdf-parse may be exposed as the module itself or as its default export,
      // depending on how the bundler resolves the CommonJS package.
      const pdfParseModule = await import('pdf-parse')
      const pdfParse =
        typeof pdfParseModule === 'function'
          ? pdfParseModule
          : (pdfParseModule as any).default ?? pdfParseModule
      const pdf = await pdfParse(buffer)
      text = pdf.text || ''
      pageCount = pdf.numpages ?? null
    } else {
      // Text-based files (plain text, CSV, markdown, HTML, RTF)
      text = buffer.toString('utf-8')
    }

    result.pageCount = pageCount

    // PostgreSQL text columns reject U+0000, and extracted PDF text can
    // contain it; strip it so persisting the preview cannot fail on it.
    text = text.replace(/\u0000/g, '')

    // Text preview
    if (text.trim()) {
      result.textPreview = truncateAtCodePoint(text, TEXT_PREVIEW_LIMIT)
    }

    // Language detection (needs a minimum amount of text to be meaningful)
    if (text.trim().length >= 20) {
      const langResult = await detectLanguage(text)
      result.detectedLang = langResult.lang
      result.langConfidence = langResult.confidence
    }

    return result
  } catch (error) {
    console.warn(
      `[DocAnalyzer] Failed to analyze ${fileName}:`,
      error instanceof Error ? error.message : error
    )
    return {
      ...result,
      error: error instanceof Error ? error.message : 'Analysis failed',
    }
  }
}
|
||||||
|
|
||||||
|
// ─── DB-Integrated Operations ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Analyze a single file by ID and persist the results to the database.
 *
 * Persistence rules:
 * - File record missing: nothing is written; the result carries `error: 'File not found'`.
 * - Mime type not parseable: only `analyzedAt` is stamped (same as the batch
 *   functions), so an existing `pageCount` is not wiped.
 * - Analysis failed (e.g. the object is not in storage yet after a presigned
 *   upload): the row is left untouched. Previously stored results survive and
 *   a never-analyzed file keeps `analyzedAt = null`, so the retroactive batch
 *   run can still pick it up.
 * - Analysis succeeded: all analysis fields and `analyzedAt` are written.
 *
 * @param fileId ID of the ProjectFile record.
 * @returns The analysis result (with `error` set when nothing was analyzed).
 */
export async function analyzeFile(fileId: string): Promise<AnalysisResult> {
  const file = await prisma.projectFile.findUnique({
    where: { id: fileId },
    select: {
      id: true,
      objectKey: true,
      bucket: true,
      mimeType: true,
      fileName: true,
    },
  })

  if (!file) {
    return {
      fileId,
      pageCount: null,
      textPreview: null,
      detectedLang: null,
      langConfidence: null,
      error: 'File not found',
    }
  }

  if (!isParseableMimeType(file.mimeType)) {
    // Mark as processed without touching any existing metadata.
    await prisma.projectFile.update({
      where: { id: fileId },
      data: { analyzedAt: new Date() },
    })
    return {
      fileId: file.id,
      pageCount: null,
      textPreview: null,
      detectedLang: null,
      langConfidence: null,
      error: 'Unsupported mime type for analysis',
    }
  }

  const result = await analyzeFileContent(
    file.objectKey,
    file.bucket,
    file.mimeType,
    file.fileName,
    file.id
  )

  if (result.error) {
    // Do not overwrite stored values or stamp analyzedAt on a failed run.
    return result
  }

  // Persist results
  await prisma.projectFile.update({
    where: { id: fileId },
    data: {
      pageCount: result.pageCount,
      textPreview: result.textPreview,
      detectedLang: result.detectedLang,
      langConfidence: result.langConfidence,
      analyzedAt: new Date(),
    },
  })

  return result
}
|
||||||
|
|
||||||
|
/**
 * Wait `delayMs` milliseconds (default 3000), then analyze the file via `analyzeFile`.
 *
 * Intended for post-upload use with presigned URLs, where the DB record is
 * created before the object has necessarily arrived in storage.
 *
 * @param fileId  ID of the ProjectFile record.
 * @param delayMs How long to wait before starting the analysis.
 */
export async function analyzeFileDelayed(
  fileId: string,
  delayMs = 3000
): Promise<AnalysisResult> {
  await new Promise((wake) => {
    setTimeout(wake, delayMs)
  })

  const outcome = await analyzeFile(fileId)
  return outcome
}
|
||||||
|
|
||||||
|
/**
 * Analyze every file that belongs to one project, regardless of whether it
 * was analyzed before.
 *
 * Files are processed BATCH_SIZE at a time; within a batch they run
 * concurrently. Files with a non-parseable mime type only get `analyzedAt`
 * stamped and count towards `total` but neither `analyzed` nor `failed`.
 * For all other files the analysis fields and `analyzedAt` are written, and
 * the file counts as `failed` when the analysis reported an error or the
 * per-file task rejected.
 *
 * @param projectId Project whose files are analyzed.
 * @returns Counts of analyzed and failed files plus the total number of files.
 */
export async function analyzeProjectFiles(
  projectId: string
): Promise<{ analyzed: number; failed: number; total: number }> {
  const projectFiles = await prisma.projectFile.findMany({
    where: { projectId },
    select: {
      id: true,
      objectKey: true,
      bucket: true,
      mimeType: true,
      fileName: true,
    },
  })

  const tally = { analyzed: 0, failed: 0 }

  for (let offset = 0; offset < projectFiles.length; offset += BATCH_SIZE) {
    const chunk = projectFiles.slice(offset, offset + BATCH_SIZE)

    const outcomes = await Promise.allSettled(
      chunk.map(async ({ id, objectKey, bucket, mimeType, fileName }) => {
        if (!isParseableMimeType(mimeType)) {
          // Nothing to extract: just record that the file was looked at.
          await prisma.projectFile.update({
            where: { id },
            data: { analyzedAt: new Date() },
          })
          return 'skipped'
        }

        const { pageCount, textPreview, detectedLang, langConfidence, error } =
          await analyzeFileContent(objectKey, bucket, mimeType, fileName, id)

        await prisma.projectFile.update({
          where: { id },
          data: {
            pageCount,
            textPreview,
            detectedLang,
            langConfidence,
            analyzedAt: new Date(),
          },
        })

        return error ? 'failed' : 'analyzed'
      })
    )

    for (const outcome of outcomes) {
      if (outcome.status !== 'fulfilled') {
        tally.failed += 1
        continue
      }
      if (outcome.value === 'analyzed') {
        tally.analyzed += 1
      } else if (outcome.value === 'failed') {
        tally.failed += 1
      }
    }
  }

  return { analyzed: tally.analyzed, failed: tally.failed, total: projectFiles.length }
}
|
||||||
|
|
||||||
|
/**
 * Retroactive batch analysis of every file whose `analyzedAt` is still null,
 * newest files first.
 *
 * Files are processed BATCH_SIZE at a time (concurrently within a batch) and
 * a progress line is logged after each batch. Non-parseable files only get
 * `analyzedAt` stamped and are counted as `skipped`; all others have their
 * analysis fields and `analyzedAt` written and are counted as `analyzed`, or
 * as `failed` when the analysis reported an error or the per-file task rejected.
 *
 * @returns Counts of analyzed, failed and skipped files plus the total
 *          number of files that were selected.
 */
export async function analyzeAllUnanalyzed(): Promise<{
  analyzed: number
  failed: number
  skipped: number
  total: number
}> {
  const pending = await prisma.projectFile.findMany({
    where: { analyzedAt: null },
    select: {
      id: true,
      objectKey: true,
      bucket: true,
      mimeType: true,
      fileName: true,
    },
    orderBy: { createdAt: 'desc' },
  })

  const counts = { analyzed: 0, failed: 0, skipped: 0 }

  for (let offset = 0; offset < pending.length; offset += BATCH_SIZE) {
    const chunk = pending.slice(offset, offset + BATCH_SIZE)

    const outcomes = await Promise.allSettled(
      chunk.map(async ({ id, objectKey, bucket, mimeType, fileName }) => {
        if (!isParseableMimeType(mimeType)) {
          // Stamp the file so it is not selected again on the next run.
          await prisma.projectFile.update({
            where: { id },
            data: { analyzedAt: new Date() },
          })
          return 'skipped'
        }

        const { pageCount, textPreview, detectedLang, langConfidence, error } =
          await analyzeFileContent(objectKey, bucket, mimeType, fileName, id)

        await prisma.projectFile.update({
          where: { id },
          data: {
            pageCount,
            textPreview,
            detectedLang,
            langConfidence,
            analyzedAt: new Date(),
          },
        })

        return error ? 'failed' : 'analyzed'
      })
    )

    for (const outcome of outcomes) {
      if (outcome.status !== 'fulfilled') {
        counts.failed += 1
        continue
      }
      if (outcome.value === 'analyzed') {
        counts.analyzed += 1
      } else if (outcome.value === 'failed') {
        counts.failed += 1
      } else if (outcome.value === 'skipped') {
        counts.skipped += 1
      }
    }

    console.log(
      `[DocAnalyzer] Batch progress: ${offset + chunk.length}/${pending.length} (${counts.analyzed} analyzed, ${counts.skipped} skipped, ${counts.failed} failed)`
    )
  }

  return {
    analyzed: counts.analyzed,
    failed: counts.failed,
    skipped: counts.skipped,
    total: pending.length,
  }
}
|
||||||
|
|
||||||
|
/**
 * Check whether automatic post-upload analysis is enabled.
 *
 * Reads the `file_analysis_auto_enabled` SystemSettings row. Analysis is
 * considered enabled unless that row exists with the value `'false'`; a
 * missing row therefore means "enabled". If the lookup itself fails the
 * error is logged and the function still falls back to enabled, so a
 * settings outage does not surface as an upload error.
 *
 * @returns `false` only when the setting is explicitly `'false'`.
 */
export async function isAutoAnalysisEnabled(): Promise<boolean> {
  try {
    const setting = await prisma.systemSettings.findUnique({
      where: { key: 'file_analysis_auto_enabled' },
    })
    // Default to true if setting doesn't exist
    return setting?.value !== 'false'
  } catch (error) {
    // Best-effort default, but leave a trace instead of failing silently.
    console.warn(
      '[DocAnalyzer] Could not read file_analysis_auto_enabled; defaulting to enabled:',
      error instanceof Error ? error.message : error
    )
    return true
  }
}
|
||||||
Loading…
Reference in New Issue