diff --git a/src/lib/services/berth-pdf-parser.ts b/src/lib/services/berth-pdf-parser.ts index b64a6567..c28a53b2 100644 --- a/src/lib/services/berth-pdf-parser.ts +++ b/src/lib/services/berth-pdf-parser.ts @@ -178,49 +178,52 @@ async function tryAcroForm(buffer: Buffer): Promise { }; } -// ─── tier 2: OCR via Tesseract ─────────────────────────────────────────────── +// ─── tier 2: text extraction via unpdf ──────────────────────────────────────── /** - * Runs Tesseract against a PDF rasterized to one image per page. Tesseract.js - * accepts image inputs; we use a lazy `pdfjs-dist`-style rasterization fallback - * via dynamic import. To keep the parser unit-testable without a WASM bundle, - * the actual recognize() call is encapsulated in the `runOcr` adapter that - * production wires to tesseract.js and tests can stub. + * Tier-2 extracts text directly from the PDF via `unpdf` (a serverless- + * friendly pdfjs wrapper). This works for text-PDFs — i.e. PDFs that + * contain real text streams, not scanned page images. Scanned/raster PDFs + * land here with empty extracted text and fall through to the AI tier. + * + * The earlier design called for tesseract.js rasterization, but + * `tesseract.recognize` doesn't accept a PDF buffer — it expects an image. + * That old code path silently failed at runtime; unpdf is the correct + * primitive for "pull text out of a PDF on the server." + * + * The `recognize` adapter shape is retained for backward compatibility with + * the test suite and the `parseAnyBerthPdf(buffer, { adapter })` override. */ export interface OcrAdapter { - /** Returns plain text + a 0..100 mean confidence score. */ + /** Returns plain text + a 0..1 mean confidence score (mapped to 0..100 below). */ recognize(buffer: Buffer): Promise<{ text: string; confidence: number }>; } -/** Hard cap on Tesseract OCR runtime. A crafted PDF rasterizing to - * high-resolution noise can pin the process indefinitely (CPU bomb). 
- * 30 seconds covers the legitimate single-page-spec case by a wide - * margin while bounding the worst-case worker hold-time. The AI - * fallback tier handles cases where OCR couldn't finish. */ +/** Hard cap on tier-2 runtime. A crafted PDF could pathologically slow + * pdfjs parsing; 30s covers any reasonable berth-spec by orders of + * magnitude while bounding the worst-case worker hold-time. */ const OCR_TIMEOUT_MS = 30_000; -/** Default adapter — dynamically imports tesseract.js so the WASM bundle isn't - * pulled into client builds. */ async function defaultOcrAdapter(): Promise { return { recognize: async (buffer: Buffer) => { - const tesseract = await import('tesseract.js'); - // Race the OCR against a timeout so a runaway recognition can't - // hold the worker forever. The race-loser pattern doesn't - // actually cancel Tesseract (no AbortController support), but it - // does free the awaiter so the caller can fall through to AI. + const { extractText } = await import('unpdf'); let timeoutHandle: NodeJS.Timeout | undefined; const timeout = new Promise<{ text: string; confidence: number }>((_, reject) => { timeoutHandle = setTimeout( - () => reject(new Error(`Tesseract OCR exceeded ${OCR_TIMEOUT_MS}ms timeout`)), + () => reject(new Error(`unpdf text extraction exceeded ${OCR_TIMEOUT_MS}ms timeout`)), OCR_TIMEOUT_MS, ); }); try { + // unpdf accepts a Uint8Array; Buffer is a Uint8Array subtype. const result = await Promise.race([ - tesseract.recognize(buffer, 'eng').then((r) => ({ - text: r.data.text ?? '', - confidence: typeof r.data.confidence === 'number' ? r.data.confidence : 0, + extractText(new Uint8Array(buffer), { mergePages: true }).then((r) => ({ + text: r.text ?? '', + // unpdf yields high-fidelity text; report a flat 90 when non-empty (0 when + // empty) so per-field regex confidence (which is already calibrated) wins + // out over the floor. NOTE(review): OcrAdapter documents 0..1 — confirm scale. + confidence: r.text && r.text.trim().length > 0 ? 90 : 0, })), timeout, ]);