Files
pn-new-crm/src/lib/services/berth-pdf-parser.ts
Matt Ciaccio fc7595faf8 fix(audit-tier-2): error-surface hygiene — toastError + CodedError sweep
Two mechanical sweeps closing the audit's HIGH §16 + MED §11 findings:

* 38 client components / 56 toast.error sites converted to
  toastError(err) so the new admin error inspector becomes usable from
  user-reported issues — every failed inline-edit, save, send, archive,
  upload, etc. now carries the request-id + error-code (Copy ID action).
* 26 service files / 62 bare-Error throws converted to CodedError or
  the existing AppError subclasses.  Adds new error codes:
  DOCUMENSO_UPSTREAM_ERROR (502), DOCUMENSO_AUTH_FAILURE (502),
  DOCUMENSO_TIMEOUT (504), OCR_UPSTREAM_ERROR (502),
  IMAP_UPSTREAM_ERROR (502), UMAMI_UPSTREAM_ERROR (502),
  UMAMI_NOT_CONFIGURED (409), and INSERT_RETURNING_EMPTY (500) for
  post-insert returning-empty guards.
* Five vitest assertions updated to match the new user-facing wording
  (client-merge "already been merged", expense/interest "couldn't find
  that …", documenso "signing service didn't respond").

Test status: 1168/1168 vitest, tsc clean.

Refs: docs/audit-comprehensive-2026-05-05.md HIGH §16 (auditor-H Issue 1)
+ MED §11 (auditor-G Issue 1).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 20:18:05 +02:00

529 lines
20 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Reverse parser for per-berth PDFs (Phase 6b — see plan §4.7b and §9.2).
*
* Three tiers, each falling back to the next:
*
* 1. AcroForm — read named text fields via pdf-lib. The sample
* `Berth_Spec_Sheet_A1.pdf` has 0 AcroForm fields (designers export the
* PDF flat), so this tier is built defensively for future templates that
* may include named form fields. When fields exist, this is the highest-
* confidence path because there's no OCR loss.
*
* 2. OCR — Tesseract.js extracts text from the page; positional/regex
* heuristics keyed off the labels documented in §9.2 pull out values.
* Returns per-field confidence scores.
*
* 3. AI fallback — gated on `getResolvedOcrConfig(...)` returning a usable
* OpenAI/Claude config. Only invoked when OCR confidence is below
* threshold for too many fields AND the rep opts in via the diff dialog.
* A null `apiKey` causes this tier to return a clear "not configured"
* error rather than silently falling back to OCR-only.
*/
import { PDFDocument } from 'pdf-lib';
import { ValidationError } from '@/lib/errors';
// ─── shared types ────────────────────────────────────────────────────────────
/**
* Identifies which parsing tier produced a value or a whole result:
* `'acroform'` (named PDF form fields), `'ocr'` (regex heuristics over OCR
* text) or `'ai'` (the opt-in AI fallback described in the file header).
*/
export type ParserEngine = 'acroform' | 'ocr' | 'ai';
/**
* Canonical extracted shape. Keys map 1:1 to nullable columns on the `berths`
* table; `mooringNumber` is special (used for the §14.6 mismatch warning).
*
* Every key is optional because parsers only populate what they managed to
* read. The `…Ft` / `…M` pairs are filled independently by the OCR tier from
* the imperial and metric halves of a `Label: <imperial> / <metric>` line.
*/
export interface ExtractedBerthFields {
/** Berth identifier; the OCR tier looks for one letter followed by 1–3 digits and upper-cases it. */
mooringNumber?: string | null;
/** Berth length in feet (feet-and-inches notation is converted to decimal feet by the OCR tier). */
lengthFt?: number | null;
/** Berth length in metres. */
lengthM?: number | null;
/** Berth width in feet. */
widthFt?: number | null;
/** Berth width in metres. */
widthM?: number | null;
/** Water depth at the berth (separate from a vessel's max draft). The OCR tier stores the imperial (feet) half here. */
waterDepth?: number | null;
/** Water depth at the berth in metres. */
waterDepthM?: number | null;
/** Max draught of vessel — falls back to the berth's draft column. */
draftFt?: number | null;
/** Max draught of vessel in metres. */
draftM?: number | null;
/** Free-text value read from the "Bow Facing" label. */
bowFacing?: string | null;
/** Free-text value read from the "Pontoon" label. */
sidePontoon?: string | null;
/** Numeric power capacity; the OCR tier reads it from a value suffixed with `kW`. */
powerCapacity?: number | null;
/** Numeric voltage; the OCR tier reads it from a value suffixed with `V`. */
voltage?: number | null;
/** Free-text value read from the "Mooring Type" label. */
mooringType?: string | null;
/** Free-text value read from the "Cleat Type" label. */
cleatType?: string | null;
/** Free-text value read from the "Cleat Capacity" label (kept as a string, not parsed). */
cleatCapacity?: string | null;
/** Free-text value read from the "Bollard Type" label. */
bollardType?: string | null;
/** Free-text value read from the "Bollard Capacity" label (kept as a string, not parsed). */
bollardCapacity?: string | null;
/** Free-text value read from the "Access" label. */
access?: string | null;
/** High figure of the "WEEK HIGH / LOW" pricing line, in USD. */
weeklyRateHighUsd?: number | null;
/** Low figure of the "WEEK HIGH / LOW" pricing line, in USD. */
weeklyRateLowUsd?: number | null;
/** High figure of the "DAY HIGH / LOW" pricing line, in USD. */
dailyRateHighUsd?: number | null;
/** Low figure of the "DAY HIGH / LOW" pricing line, in USD. */
dailyRateLowUsd?: number | null;
/** ISO date YYYY-MM-DD. */
pricingValidUntil?: string | null;
/** Purchase price; the OCR tier reads the first USD amount after "PURCHASE PRICE". */
price?: number | null;
}
/**
* One extracted value together with its provenance. The parsers in this file
* store strings or numbers in `value`; the generic parameter defaults to
* `unknown` so consumers must narrow before use.
*/
export interface ParsedField<T = unknown> {
/** The extracted (and, where applicable, coerced) value. */
value: T;
/** 0..1 confidence; 1 means "absolute match" (AcroForm or unambiguous regex). */
confidence: number;
/** Engine that produced this field; helps the diff dialog explain itself. */
engine: ParserEngine;
}
/**
* Outcome of running one parsing tier over a PDF. The AcroForm tier always
* reports `meanConfidence: 1`; the OCR tier reports the arithmetic mean of its
* per-field confidences, or 0 when nothing was extracted.
*/
export interface ParseResult {
/** Tier that produced this result. */
engine: ParserEngine;
/** Sparse — only fields the parser was able to extract. */
fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
/** Mean confidence across all extracted fields (0..1). */
meanConfidence: number;
/** Raw text the OCR or AI tier produced — useful for the diff dialog audit. */
rawText?: string;
/** Set when a tier degraded; the API surface uses this to decide whether to
* surface the "AI parse" button. */
warnings: string[];
}
// ─── magic-byte check (§14.6 critical) ───────────────────────────────────────
/**
 * Magic-byte sniff: reports whether the buffer opens with the five ASCII
 * bytes `%PDF-`. Buffers shorter than five bytes are never PDFs.
 *
 * @param buffer Raw upload bytes.
 * @returns `true` only when bytes 0..4 are exactly `%PDF-`.
 */
export function isPdfMagic(buffer: Buffer): boolean {
  // ASCII codes for '%', 'P', 'D', 'F', '-'.
  const signature = [0x25, 0x50, 0x44, 0x46, 0x2d];
  if (buffer.length < signature.length) return false;
  return signature.every((expected, offset) => buffer[offset] === expected);
}
// ─── tier 1: AcroForm ────────────────────────────────────────────────────────
/**
* AcroForm field name → ExtractedBerthFields key. Mirrors the names §4.7b
* mentions ("length_ft", "mooring_number"…) plus a couple of tolerant aliases.
*
* Keys must be lower-case: `tryAcroForm` lower-cases the PDF field name before
* looking it up here. Several keys may point at the same target
* (`berth_number` and `mooring_number`, `pontoon` and `side_pontoon`).
*/
const ACROFORM_FIELD_MAP: Record<string, keyof ExtractedBerthFields> = {
mooring_number: 'mooringNumber',
// Alias for mooring_number.
berth_number: 'mooringNumber',
length_ft: 'lengthFt',
length_m: 'lengthM',
width_ft: 'widthFt',
width_m: 'widthM',
draft_ft: 'draftFt',
draft_m: 'draftM',
water_depth: 'waterDepth',
water_depth_m: 'waterDepthM',
bow_facing: 'bowFacing',
side_pontoon: 'sidePontoon',
// Alias for side_pontoon.
pontoon: 'sidePontoon',
power_capacity: 'powerCapacity',
voltage: 'voltage',
mooring_type: 'mooringType',
cleat_type: 'cleatType',
cleat_capacity: 'cleatCapacity',
bollard_type: 'bollardType',
bollard_capacity: 'bollardCapacity',
access: 'access',
weekly_rate_high_usd: 'weeklyRateHighUsd',
weekly_rate_low_usd: 'weeklyRateLowUsd',
daily_rate_high_usd: 'dailyRateHighUsd',
daily_rate_low_usd: 'dailyRateLowUsd',
pricing_valid_until: 'pricingValidUntil',
price: 'price',
};
/**
 * Tier 1: read named AcroForm text fields with pdf-lib.
 *
 * Returns `null` (so the caller falls through to the next tier) when the PDF
 * cannot be loaded, has no form, has no fields, or none of its fields map to a
 * known column with a usable value. Non-text fields and fields whose value
 * cannot be coerced are skipped individually rather than failing the tier.
 *
 * @param buffer Raw PDF bytes.
 * @returns A result with `engine: 'acroform'` and confidence 1 per field, or `null`.
 */
async function tryAcroForm(buffer: Buffer): Promise<ParseResult | null> {
  let doc: PDFDocument;
  try {
    doc = await PDFDocument.load(buffer, { ignoreEncryption: true });
  } catch {
    return null;
  }
  let form: ReturnType<PDFDocument['getForm']>;
  try {
    form = doc.getForm();
  } catch {
    return null;
  }
  const fields = form.getFields();
  if (fields.length === 0) return null;

  const out: Partial<Record<keyof ExtractedBerthFields, ParsedField>> = {};
  for (const field of fields) {
    const name = field.getName().toLowerCase();
    // Field names come from the uploaded PDF. Without an own-property check a
    // field called "constructor" / "tostring"-style name could resolve to an
    // inherited Object.prototype member and be treated as a mapped column.
    if (!Object.prototype.hasOwnProperty.call(ACROFORM_FIELD_MAP, name)) continue;
    const target = ACROFORM_FIELD_MAP[name];
    if (!target) continue;

    // pdf-lib doesn't expose a generic "get value" — narrow to text fields.
    // getTextField throws for non-text fields; those are skipped.
    let raw: string | undefined;
    try {
      const tf = form.getTextField(field.getName());
      raw = tf.getText() ?? undefined;
    } catch {
      continue;
    }
    if (!raw || raw.trim().length === 0) continue;

    const parsed = coerceFieldValue(target, raw.trim());
    if (parsed === null) continue;
    out[target] = { value: parsed, confidence: 1, engine: 'acroform' };
  }

  if (Object.keys(out).length === 0) return null;
  return {
    engine: 'acroform',
    fields: out,
    meanConfidence: 1,
    warnings: [],
  };
}
// ─── tier 2: OCR via Tesseract ───────────────────────────────────────────────
/**
* OCR seam for tier 2. The actual recognize() call is hidden behind this
* interface so production can wire it to tesseract.js (see
* `defaultOcrAdapter`) while tests pass a stub and avoid loading a WASM bundle.
*
* NOTE(review): an earlier version of this comment described rasterizing the
* PDF to one image per page via a `pdfjs-dist`-style dynamic import and named
* a `runOcr` adapter. Neither is visible in this file — the default adapter
* hands the buffer it receives straight to `tesseract.recognize`. Confirm
* whether rasterization happens elsewhere or is still to be implemented.
*/
export interface OcrAdapter {
/** Returns plain text + a 0..100 mean confidence score. */
recognize(buffer: Buffer): Promise<{ text: string; confidence: number }>;
}
/**
 * Upper bound, in milliseconds, on how long a caller waits for Tesseract.
 * A crafted PDF rasterizing to high-resolution noise can pin the process
 * indefinitely (CPU bomb); 30 seconds is generous for a single-page spec
 * sheet while bounding the worst-case hold time. The AI fallback tier is
 * the intended route when OCR could not finish.
 */
const OCR_TIMEOUT_MS = 30_000;

/**
 * Builds the production OCR adapter. tesseract.js is imported lazily inside
 * `recognize` so the WASM bundle is not pulled into client builds.
 *
 * `recognize` races Tesseract against a timer. Losing the race does not
 * cancel Tesseract (there is no abort hook used here); it only releases the
 * awaiting caller by rejecting with a timeout error. The timer is always
 * cleared once the race settles.
 *
 * NOTE(review): the buffer is forwarded to `tesseract.recognize` unchanged —
 * no PDF-to-image step is visible here. Confirm the input is something
 * tesseract.js can decode.
 */
async function defaultOcrAdapter(): Promise<OcrAdapter> {
  type OcrOutput = { text: string; confidence: number };

  const recognize = async (buffer: Buffer): Promise<OcrOutput> => {
    const tesseract = await import('tesseract.js');

    let timer: NodeJS.Timeout | undefined;
    const deadline = new Promise<OcrOutput>((_resolve, reject) => {
      timer = setTimeout(() => {
        reject(new Error(`Tesseract OCR exceeded ${OCR_TIMEOUT_MS}ms timeout`));
      }, OCR_TIMEOUT_MS);
    });

    try {
      const recognition = tesseract.recognize(buffer, 'eng').then(({ data }) => ({
        text: data.text ?? '',
        // Anything other than a number is reported as zero confidence.
        confidence: typeof data.confidence === 'number' ? data.confidence : 0,
      }));
      // `return await` keeps the `finally` below from running until the race settles.
      return await Promise.race([recognition, deadline]);
    } finally {
      if (timer) clearTimeout(timer);
    }
  };

  return { recognize };
}
/**
 * Heuristic extraction from OCR text. The patterns mirror the layout
 * documented in plan §9.2:
 *
 * - "Length: 206' 8" / 63m"
 * - "Mooring: A12" or large "A1" near "BERTH NUMBER"
 * - "WEEK HIGH / LOW" and "DAY HIGH / LOW" pricing blocks
 * - "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL <date>"
 *
 * Each pattern is tried once against the whole text; the first match wins.
 * Fields that do not match are simply absent from the result.
 *
 * @param rawText Plain text as produced by the OCR engine.
 * @returns `fields` — sparse map of extracted values, each tagged
 *   `engine: 'ocr'` with a per-pattern confidence; `warnings` — human-readable
 *   notes about imperial/metric disagreements or an unparseable validity date.
 */
export function extractFromOcrText(rawText: string): {
  fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
  warnings: string[];
} {
  const warnings: string[] = [];
  const out: Partial<Record<keyof ExtractedBerthFields, ParsedField>> = {};

  // Turn non-breaking spaces (U+00A0) into plain spaces but keep line
  // structure. `\s` already matches NBSP, but the value character classes
  // below list a literal space only, so an NBSP inside a value would stop the
  // match. Spelled as an escape so the character cannot be silently lost or
  // confused with an ordinary space by editors and diff tools.
  const text = rawText.replace(/\u00a0/g, ' ');

  // Mooring number: BERTH NUMBER block. We try a couple of layouts.
  const mooringMatch =
    text.match(/BERTH\s+NUMBER[\s\S]{0,80}?\b([A-Z]\d{1,3})\b/i) ??
    text.match(/^\s*([A-Z]\d{1,3})\s*$/m) ??
    text.match(/Mooring(?:\s+Number)?\s*[:#]?\s*([A-Z]\d{1,3})/i);
  if (mooringMatch) {
    out.mooringNumber = { value: mooringMatch[1]!.toUpperCase(), confidence: 0.85, engine: 'ocr' };
  }

  // Length / Width / Water Depth — `Label: <imperial> / <metric>` form.
  // Imperial may be `206' 8"` style; we capture the numeric prefix in feet
  // and parse the metric independently because they're rarely lossless.
  //
  // `labelPattern` is regex source; `labelText` is the plain wording used in
  // warnings so users never see regex syntax.
  const dimensional = (
    labelPattern: string,
    labelText: string,
    ftKey: keyof ExtractedBerthFields,
    mKey: keyof ExtractedBerthFields,
  ) => {
    const re = new RegExp(
      `${labelPattern}\\s*[:.]?\\s*([0-9]+(?:'\\s*[0-9]+\")?(?:\\.[0-9]+)?)\\s*(?:ft)?\\s*\\/\\s*([0-9]+(?:\\.[0-9]+)?)\\s*m`,
      'i',
    );
    const m = text.match(re);
    if (!m) return;
    const ft = parseFeetInches(m[1]!);
    const meters = Number(m[2]);
    if (ft != null && Number.isFinite(ft)) {
      out[ftKey] = { value: ft, confidence: 0.8, engine: 'ocr' } as ParsedField;
    }
    if (Number.isFinite(meters)) {
      out[mKey] = { value: meters, confidence: 0.85, engine: 'ocr' } as ParsedField;
    }
    // Flag pairs whose feet→metres conversion is off by more than 1%.
    if (ft != null && Number.isFinite(meters) && Math.abs(ft * 0.3048 - meters) / meters > 0.01) {
      warnings.push(
        `${labelText}: imperial/metric mismatch — ${ft}ft vs ${meters}m differ >1% (using imperial as source of truth).`,
      );
    }
  };
  dimensional('Length', 'Length', 'lengthFt', 'lengthM');
  dimensional('Width', 'Width', 'widthFt', 'widthM');
  dimensional('Water\\s+Depth', 'Water Depth', 'waterDepth', 'waterDepthM');
  // Max draft of vessel maps to the berth's draft column.
  dimensional('Max\\.?\\s*draught(?:\\s+of\\s+vessel)?', 'Max draught', 'draftFt', 'draftM');

  // Singular labels (`Bow Facing: East`, `Pontoon: QUAY PT`). Each value runs
  // lazily up to the next newline or the end of the text.
  const labelToKey: Array<[RegExp, keyof ExtractedBerthFields]> = [
    [/Bow\s+Facing\s*[:.]?\s*([A-Za-z .]+?)(?:\n|$)/i, 'bowFacing'],
    [/Pontoon\s*[:.]?\s*([A-Za-z0-9 .\-]+?)(?:\n|$)/i, 'sidePontoon'],
    [/Mooring\s+Type\s*[:.]?\s*([A-Za-z0-9 \-\/]+?)(?:\n|$)/i, 'mooringType'],
    [/Cleat\s+Type\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'cleatType'],
    [/Cleat\s+Capacity\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'cleatCapacity'],
    [/Bollard\s+Type\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'bollardType'],
    [/Bollard\s+Capacity\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'bollardCapacity'],
    [/Access\s*[:.]?\s*([A-Za-z0-9 .,()\-]+?)(?:\n|$)/i, 'access'],
  ];
  for (const [re, key] of labelToKey) {
    const m = text.match(re);
    if (m && m[1]) {
      out[key] = { value: m[1].trim(), confidence: 0.75, engine: 'ocr' } as ParsedField;
    }
  }

  // Power Capacity (kW) and Voltage at 60Hz.
  const powerMatch = text.match(/Power\s+Capacity\s*[:.]?\s*([0-9]+(?:\.[0-9]+)?)\s*kW/i);
  if (powerMatch) {
    out.powerCapacity = { value: Number(powerMatch[1]), confidence: 0.85, engine: 'ocr' };
  }
  const voltageMatch = text.match(/Voltage(?:\s+at\s+60\s*Hz)?\s*[:.]?\s*([0-9]+)\s*V/i);
  if (voltageMatch) {
    out.voltage = { value: Number(voltageMatch[1]), confidence: 0.85, engine: 'ocr' };
  }

  // Pricing: "WEEK HIGH / LOW: 11,341 USD / 8,100 USD"
  const weekMatch = text.match(
    /WEEK\s+HIGH\s*\/\s*LOW[:.\s]*([0-9,]+)\s*USD\s*\/\s*([0-9,]+)\s*USD/i,
  );
  if (weekMatch) {
    out.weeklyRateHighUsd = {
      value: Number(weekMatch[1]!.replace(/,/g, '')),
      confidence: 0.8,
      engine: 'ocr',
    };
    out.weeklyRateLowUsd = {
      value: Number(weekMatch[2]!.replace(/,/g, '')),
      confidence: 0.8,
      engine: 'ocr',
    };
  }
  const dayMatch = text.match(
    /DAY\s+HIGH\s*\/\s*LOW[:.\s]*([0-9,]+)\s*USD\s*\/\s*([0-9,]+)\s*USD/i,
  );
  if (dayMatch) {
    out.dailyRateHighUsd = {
      value: Number(dayMatch[1]!.replace(/,/g, '')),
      confidence: 0.8,
      engine: 'ocr',
    };
    out.dailyRateLowUsd = {
      value: Number(dayMatch[2]!.replace(/,/g, '')),
      confidence: 0.8,
      engine: 'ocr',
    };
  }

  // Purchase price: "PURCHASE PRICE:\nFEE SIMPLE OR STRATA LOT\n3,880,800 USD"
  // — the first USD amount within 80 characters of the heading.
  const priceMatch = text.match(/PURCHASE\s+PRICE[\s\S]{0,80}?([0-9][0-9,]+)\s*USD/i);
  if (priceMatch) {
    out.price = { value: Number(priceMatch[1]!.replace(/,/g, '')), confidence: 0.7, engine: 'ocr' };
  }

  // Pricing validity: "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL SEPTEMBER 15TH, 2025"
  const validityMatch = text.match(
    /CONFIRMED\s+THROUGH\s+UNTIL\s+([A-Za-z]+\s+[0-9]{1,2})(?:[A-Z]{2})?,?\s+([0-9]{4})/i,
  );
  if (validityMatch) {
    const iso = parseHumanDate(`${validityMatch[1]} ${validityMatch[2]}`);
    if (iso) {
      out.pricingValidUntil = { value: iso, confidence: 0.75, engine: 'ocr' };
    } else {
      warnings.push(
        'Could not normalize "CONFIRMED THROUGH UNTIL" date; pricing_valid_until skipped.',
      );
    }
  }

  return { fields: out, warnings };
}
/**
 * Tier 2: run OCR over the buffer and extract fields from the resulting text.
 *
 * When the adapter yields no text, an empty result carrying the
 * "OCR produced no text." warning is returned. Otherwise every extracted
 * field's confidence is capped at the engine's own confidence (rescaled from
 * 0..100 to 0..1 and never treated as lower than 0.5), and the mean of the
 * capped values is reported.
 *
 * @param buffer  Bytes handed to the adapter unchanged.
 * @param adapter Optional OCR implementation; the default adapter is built
 *                only when this is omitted.
 * @returns A result with `engine: 'ocr'`. Errors thrown by the adapter propagate.
 */
async function tryOcr(buffer: Buffer, adapter?: OcrAdapter): Promise<ParseResult | null> {
  const engine = adapter ?? (await defaultOcrAdapter());
  const { text, confidence } = await engine.recognize(buffer);

  if (!text) {
    return {
      engine: 'ocr',
      fields: {},
      meanConfidence: 0,
      rawText: '',
      warnings: ['OCR produced no text.'],
    };
  }

  const { fields, warnings } = extractFromOcrText(text);

  // Tesseract reports 0..100; clamp, rescale to 0..1, and use the result (at
  // least 0.5) as an upper bound on each regex-assigned confidence.
  const engineConfidence = Math.max(0, Math.min(confidence, 100)) / 100;
  const cap = Math.max(engineConfidence, 0.5);

  const entries = Object.values(fields);
  let total = 0;
  for (const entry of entries) {
    if (!entry) continue;
    entry.confidence = Math.min(entry.confidence, cap);
    total += entry.confidence;
  }
  const meanConfidence = entries.length === 0 ? 0 : total / entries.length;

  return {
    engine: 'ocr',
    fields,
    meanConfidence,
    rawText: text,
    warnings,
  };
}
// ─── tier 3: AI fallback ─────────────────────────────────────────────────────
/** Mean-confidence level under which the diff dialog recommends the AI tier. */
export const OCR_LOW_CONFIDENCE_THRESHOLD = 0.55;

/**
 * Decides whether the rep should see an "AI parse" button.
 *
 * Only OCR results qualify. Among those, the button is offered when nothing
 * was extracted at all, or when the mean confidence is strictly below
 * `OCR_LOW_CONFIDENCE_THRESHOLD`.
 */
export function shouldOfferAiTier(parse: ParseResult): boolean {
  return (
    parse.engine === 'ocr' &&
    (Object.keys(parse.fields).length === 0 ||
      parse.meanConfidence < OCR_LOW_CONFIDENCE_THRESHOLD)
  );
}
// ─── public entry point ──────────────────────────────────────────────────────
/**
* Optional knobs for `parseBerthPdf`. Both are primarily test seams; callers
* that pass nothing get AcroForm first, then OCR through the default adapter.
*/
export interface ParseBerthPdfOptions {
/** Override Tesseract for testing. Production flows resolve the default. */
ocrAdapter?: OcrAdapter;
/** Skip the OCR tier when only AcroForm is wanted (e.g. unit tests). */
skipOcr?: boolean;
}
/**
 * Entry point: turn a per-berth PDF buffer into extracted fields.
 *
 * Order of attempts: the magic-byte check, then AcroForm, then OCR. The
 * `engine` on the returned result names the tier that produced the fields
 * (the reconcile-diff dialog uses it to colour confidence chips).
 *
 * The AI tier is deliberately never run from here — it is a separate action
 * triggered from the diff dialog so OPENAI_API_KEY isn't spent on every upload.
 *
 * @param buffer Raw upload bytes.
 * @param opts   See `ParseBerthPdfOptions`.
 * @returns The AcroForm result when it has at least one field; otherwise the
 *          OCR result, or an empty OCR-tagged result with an explanatory
 *          warning when OCR was skipped or yielded null.
 * @throws ValidationError when the buffer does not start with `%PDF-`.
 */
export async function parseBerthPdf(
  buffer: Buffer,
  opts: ParseBerthPdfOptions = {},
): Promise<ParseResult> {
  if (!isPdfMagic(buffer)) {
    throw new ValidationError('PDF magic-byte check failed: file does not begin with %PDF-');
  }

  // Shared shape for the two "nothing extracted" outcomes below.
  const emptyOcrResult = (warning: string): ParseResult => ({
    engine: 'ocr',
    fields: {},
    meanConfidence: 0,
    warnings: [warning],
  });

  const fromForm = await tryAcroForm(buffer);
  if (fromForm !== null && Object.keys(fromForm.fields).length > 0) {
    return fromForm;
  }

  if (opts.skipOcr) {
    return emptyOcrResult('skipOcr=true; no AcroForm fields found.');
  }

  const fromOcr = await tryOcr(buffer, opts.ocrAdapter);
  return fromOcr ?? emptyOcrResult('OCR adapter returned null.');
}
// ─── helpers ─────────────────────────────────────────────────────────────────
/**
 * Coerce an AcroForm raw value to the right scalar for the target column.
 *
 * - String columns are returned as-is.
 * - `pricingValidUntil` is returned unchanged when already `YYYY-MM-DD`,
 *   otherwise passed through `parseHumanDate` (which may yield `null`).
 * - Every other column is numeric: everything except digits, `.` and `-` is
 *   stripped before conversion.
 *
 * @param key Target column.
 * @param raw Trimmed field text.
 * @returns The coerced value, or `null` when the text holds no usable,
 *          non-negative number (or no parseable date).
 */
function coerceFieldValue(key: keyof ExtractedBerthFields, raw: string): string | number | null {
  // String columns
  const stringKeys: Array<keyof ExtractedBerthFields> = [
    'mooringNumber',
    'bowFacing',
    'sidePontoon',
    'mooringType',
    'cleatType',
    'cleatCapacity',
    'bollardType',
    'bollardCapacity',
    'access',
    'pricingValidUntil',
  ];
  if (stringKeys.includes(key)) {
    if (key === 'pricingValidUntil') {
      // Accept ISO YYYY-MM-DD as-is; otherwise try a humane parse.
      if (/^\d{4}-\d{2}-\d{2}$/.test(raw)) return raw;
      return parseHumanDate(raw);
    }
    return raw;
  }

  // Numeric columns: strip currency / unit suffixes and commas.
  const stripped = raw.replace(/[^0-9.\-]/g, '');
  // `Number('')` is 0, so placeholder text such as "N/A" or "TBD" would
  // otherwise be recorded as a confident zero. Require at least one digit.
  if (!/[0-9]/.test(stripped)) return null;
  const numeric = Number(stripped);
  if (!Number.isFinite(numeric)) return null;
  // Berth dimensions / capacities / prices are all non-negative — reject
  // negatives outright so an AcroForm with `length_ft="-50"` doesn't poison
  // the recommender feasibility filter when applied.
  return numeric < 0 ? null : numeric;
}
/**
 * Parse a human date like "September 15 2025" → "2025-09-15".
 *
 * The first ordinal suffix (st/nd/rd/th directly after digits) is dropped,
 * then the text is handed to the runtime's date parser.
 *
 * @param raw Free-form date text.
 * @returns `YYYY-MM-DD`, or `null` when the runtime cannot parse the text.
 */
export function parseHumanDate(raw: string): string | null {
  const withoutOrdinal = raw.replace(/(\d+)(st|nd|rd|th)/i, '$1').trim();
  // Pin the zone by appending " UTC": zone-less text is otherwise read in the
  // runner's local TZ and `toISOString()` can then land on the adjacent day.
  const epochMs = Date.parse(withoutOrdinal + ' UTC');
  return Number.isNaN(epochMs) ? null : new Date(epochMs).toISOString().slice(0, 10);
}
/**
 * Convert an imperial length to decimal feet: `206' 8"` → 206.667, `82` → 82.
 *
 * Accepted forms:
 * - whole feet, a foot mark, inches (whole or decimal) and an inch mark, e.g.
 *   `6' 6.5"`. The foot mark may be `'`, U+2019 or U+2032 and the inch mark
 *   `"`, U+201D or U+2033, since OCR and typeset PDFs often emit the
 *   typographic variants;
 * - otherwise a leading plain or decimal number, taken as feet.
 *
 * @param raw Text to parse; surrounding whitespace is ignored.
 * @returns Feet as a number, or `null` when the text does not start with a number.
 */
export function parseFeetInches(raw: string): number | null {
  const trimmed = raw.trim();
  // Escapes are used for the non-ASCII marks so they stay visible in source.
  const ftIn = trimmed.match(
    /^([0-9]+)\s*['\u2019\u2032]\s*([0-9]+(?:\.[0-9]+)?)\s*["\u201d\u2033]$/,
  );
  if (ftIn) {
    return Number(ftIn[1]) + Number(ftIn[2]) / 12;
  }
  // Fallback: leading number only; anything after it is ignored.
  const ftOnly = trimmed.match(/^([0-9]+(?:\.[0-9]+)?)/);
  if (ftOnly) return Number(ftOnly[1]);
  return null;
}