feat(berth-parser): unpdf for tier-2 PDF text extraction

Phase 1 / commit 13 of 14 — replaces a quietly-broken tesseract.js pathway with unpdf for tier-2 of the berth-PDF parser.

The previous code did:

    const tesseract = await import('tesseract.js');
    await tesseract.recognize(buffer, 'eng'); // ← buffer is a PDF

tesseract.recognize() expects an image, not a PDF. The PDFs we get from the AcroForm-stripped berth-spec sheets would have failed at runtime (either an "unsupported format" error or silently empty text). Tier-2 was dark code.

unpdf (serverless-friendly pdfjs wrapper) extracts text directly from the PDF stream. Works on text-PDFs (real text streams), returns empty on scanned/raster PDFs — those legitimately fall through to the AI tier where they belong.

The OcrAdapter interface shape is preserved so:

- Existing unit tests that stub the adapter still work
- parseAnyBerthPdf(buffer, { adapter }) override still works
- The 30-second timeout race + warning collection still works

tesseract.js stays as a dep — scan-shell.tsx (receipt scanner) still uses it for on-device image OCR, which is its intended use case.

1298/1298 vitest green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 21:13:10 +02:00
parent 411d0764e8
commit e8a852856e
1 changed files with 26 additions and 23 deletions
--- a/src/lib/services/berth-pdf-parser.ts
+++ b/src/lib/services/berth-pdf-parser.ts
@@ -178,49 +178,52 @@ async function tryAcroForm(buffer: Buffer): Promise<ParseResult | null> {
  };
 }

-// ─── tier 2: OCR via Tesseract ───────────────────────────────────────────────
+// ─── tier 2: text extraction via unpdf ────────────────────────────────────────

 /**
- * Runs Tesseract against a PDF rasterized to one image per page. Tesseract.js
- * accepts image inputs; we use a lazy `pdfjs-dist`-style rasterization fallback
- * via dynamic import. To keep the parser unit-testable without a WASM bundle,
- * the actual recognize() call is encapsulated in the `runOcr` adapter that
- * production wires to tesseract.js and tests can stub.
+ * Tier-2 extracts text directly from the PDF via `unpdf` (a serverless-
+ * friendly pdfjs wrapper). This works for text-PDFs — i.e. PDFs that
+ * contain real text streams, not scanned page images. Scanned/raster PDFs
+ * land here with empty extracted text and fall through to the AI tier.
+ *
+ * The earlier design called for tesseract.js rasterization, but
+ * `tesseract.recognize` doesn't accept a PDF buffer — it expects an image.
+ * That old code path silently failed at runtime; unpdf is the correct
+ * primitive for "pull text out of a PDF on the server."
+ *
+ * The `recognize` adapter shape is retained for backward compatibility with
+ * the test suite and the `parseAnyBerthPdf(buffer, { adapter })` override.
 */
 export interface OcrAdapter {
-  /** Returns plain text + a 0..100 mean confidence score. */
+  /** Returns plain text + a mean confidence score on a 0..100 scale (the default adapter reports a flat 90, or 0 for empty text). */
  recognize(buffer: Buffer): Promise<{ text: string; confidence: number }>;
 }

-/** Hard cap on Tesseract OCR runtime. A crafted PDF rasterizing to
- *  high-resolution noise can pin the process indefinitely (CPU bomb).
- *  30 seconds covers the legitimate single-page-spec case by a wide
- *  margin while bounding the worst-case worker hold-time. The AI
- *  fallback tier handles cases where OCR couldn't finish. */
+/** Hard cap on tier-2 runtime. A crafted PDF could pathologically slow
+ *  pdfjs parsing; 30s covers any reasonable berth-spec by orders of
+ *  magnitude while bounding the worst-case worker hold-time. */
 const OCR_TIMEOUT_MS = 30_000;

-/** Default adapter — dynamically imports tesseract.js so the WASM bundle isn't
- *  pulled into client builds. */
 async function defaultOcrAdapter(): Promise<OcrAdapter> {
  return {
    recognize: async (buffer: Buffer) => {
-      const tesseract = await import('tesseract.js');
-      // Race the OCR against a timeout so a runaway recognition can't
-      // hold the worker forever. The race-loser pattern doesn't
-      // actually cancel Tesseract (no AbortController support), but it
-      // does free the awaiter so the caller can fall through to AI.
+      const { extractText } = await import('unpdf');
      let timeoutHandle: NodeJS.Timeout | undefined;
      const timeout = new Promise<{ text: string; confidence: number }>((_, reject) => {
        timeoutHandle = setTimeout(
-          () => reject(new Error(`Tesseract OCR exceeded ${OCR_TIMEOUT_MS}ms timeout`)),
+          () => reject(new Error(`unpdf text extraction exceeded ${OCR_TIMEOUT_MS}ms timeout`)),
          OCR_TIMEOUT_MS,
        );
      });
      try {
+        // unpdf takes a plain Uint8Array; copy the Buffer into one rather than pass the subclass (pdfjs reportedly rejects Buffer — confirm).
        const result = await Promise.race([
-          tesseract.recognize(buffer, 'eng').then((r) => ({
-            text: r.data.text ?? '',
-            confidence: typeof r.data.confidence === 'number' ? r.data.confidence : 0,
+          extractText(new Uint8Array(buffer), { mergePages: true }).then((r) => ({
+            text: r.text ?? '',
+            // unpdf yields high-fidelity text; we report a flat 90 (0..100 scale)
+            // when it is non-empty, 0 otherwise, so per-field regex confidence
+            // (which is already calibrated) wins out over the floor.
+            confidence: r.text && r.text.trim().length > 0 ? 90 : 0,
          })),
          timeout,
        ]);