feat(berth-parser): unpdf for tier-2 PDF text extraction
Phase 1 / commit 13 of 14 — replaces a quietly-broken tesseract.js
pathway with unpdf for tier-2 of the berth-PDF parser.
The previous code did:
const tesseract = await import('tesseract.js');
await tesseract.recognize(buffer, 'eng'); // ← buffer is a PDF
tesseract.recognize() expects an image, not a PDF. The PDFs we get from
the AcroForm-stripped berth-spec sheets would have failed at runtime
(either an "unsupported format" error or silently empty text). Tier-2
was dark code.
unpdf (serverless-friendly pdfjs wrapper) extracts text directly from
the PDF stream. Works on text-PDFs (real text streams), returns empty
on scanned/raster PDFs — those legitimately fall through to the AI
tier where they belong.
The OcrAdapter interface shape is preserved so:
- Existing unit tests that stub the adapter still work
- parseAnyBerthPdf(buffer, { adapter }) override still works
- The 30-second timeout race + warning collection still works
tesseract.js stays as a dep — scan-shell.tsx (receipt scanner) still
uses it for on-device image OCR, which is its intended use case.
1298/1298 vitest green.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -178,49 +178,52 @@ async function tryAcroForm(buffer: Buffer): Promise<ParseResult | null> {
|
||||
};
|
||||
}
|
||||
|
||||
// ─── tier 2: OCR via Tesseract ───────────────────────────────────────────────
|
||||
// ─── tier 2: text extraction via unpdf ────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Runs Tesseract against a PDF rasterized to one image per page. Tesseract.js
|
||||
* accepts image inputs; we use a lazy `pdfjs-dist`-style rasterization fallback
|
||||
* via dynamic import. To keep the parser unit-testable without a WASM bundle,
|
||||
* the actual recognize() call is encapsulated in the `runOcr` adapter that
|
||||
* production wires to tesseract.js and tests can stub.
|
||||
* Tier-2 extracts text directly from the PDF via `unpdf` (a serverless-
|
||||
* friendly pdfjs wrapper). This works for text-PDFs — i.e. PDFs that
|
||||
* contain real text streams, not scanned page images. Scanned/raster PDFs
|
||||
* land here with empty extracted text and fall through to the AI tier.
|
||||
*
|
||||
* The earlier design called for tesseract.js rasterization, but
|
||||
* `tesseract.recognize` doesn't accept a PDF buffer — it expects an image.
|
||||
* That old code path silently failed at runtime; unpdf is the correct
|
||||
* primitive for "pull text out of a PDF on the server."
|
||||
*
|
||||
* The `recognize` adapter shape is retained for backward compatibility with
|
||||
* the test suite and the `parseAnyBerthPdf(buffer, { adapter })` override.
|
||||
*/
|
||||
export interface OcrAdapter {
|
||||
/** Returns plain text + a 0..100 mean confidence score. */
|
||||
/** Returns plain text + a 0..100 mean confidence score (the default adapter reports a flat 90 for non-empty text, 0 otherwise). */
|
||||
recognize(buffer: Buffer): Promise<{ text: string; confidence: number }>;
|
||||
}
|
||||
|
||||
/** Hard cap on Tesseract OCR runtime. A crafted PDF rasterizing to
|
||||
* high-resolution noise can pin the process indefinitely (CPU bomb).
|
||||
* 30 seconds covers the legitimate single-page-spec case by a wide
|
||||
* margin while bounding the worst-case worker hold-time. The AI
|
||||
* fallback tier handles cases where OCR couldn't finish. */
|
||||
/** Hard cap on tier-2 runtime. A crafted PDF could pathologically slow
|
||||
* pdfjs parsing; 30s covers any reasonable berth-spec by orders of
|
||||
* magnitude while bounding the worst-case worker hold-time. */
|
||||
const OCR_TIMEOUT_MS = 30_000;
|
||||
|
||||
/** Default adapter — dynamically imports tesseract.js so the WASM bundle isn't
|
||||
* pulled into client builds. */
|
||||
async function defaultOcrAdapter(): Promise<OcrAdapter> {
|
||||
return {
|
||||
recognize: async (buffer: Buffer) => {
|
||||
const tesseract = await import('tesseract.js');
|
||||
// Race the OCR against a timeout so a runaway recognition can't
|
||||
// hold the worker forever. The race-loser pattern doesn't
|
||||
// actually cancel Tesseract (no AbortController support), but it
|
||||
// does free the awaiter so the caller can fall through to AI.
|
||||
const { extractText } = await import('unpdf');
|
||||
let timeoutHandle: NodeJS.Timeout | undefined;
|
||||
const timeout = new Promise<{ text: string; confidence: number }>((_, reject) => {
|
||||
timeoutHandle = setTimeout(
|
||||
() => reject(new Error(`Tesseract OCR exceeded ${OCR_TIMEOUT_MS}ms timeout`)),
|
||||
() => reject(new Error(`unpdf text extraction exceeded ${OCR_TIMEOUT_MS}ms timeout`)),
|
||||
OCR_TIMEOUT_MS,
|
||||
);
|
||||
});
|
||||
try {
|
||||
// unpdf accepts a Uint8Array; Buffer is a Uint8Array subtype.
|
||||
const result = await Promise.race([
|
||||
tesseract.recognize(buffer, 'eng').then((r) => ({
|
||||
text: r.data.text ?? '',
|
||||
confidence: typeof r.data.confidence === 'number' ? r.data.confidence : 0,
|
||||
extractText(new Uint8Array(buffer), { mergePages: true }).then((r) => ({
|
||||
text: r.text ?? '',
|
||||
// unpdf yields high-fidelity text; we report a flat 90 (on the 0..100 scale)
|
||||
// confidence when non-empty so per-field regex confidence
|
||||
// (which is already calibrated) wins out over the floor.
|
||||
confidence: r.text && r.text.trim().length > 0 ? 90 : 0,
|
||||
})),
|
||||
timeout,
|
||||
]);
|
||||
|
||||
Reference in New Issue
Block a user