/**
 * Reverse parser for per-berth PDFs (Phase 6b — see plan §4.7b and §9.2).
 *
 * Three tiers, each falling back to the next:
 *
 *   1. AcroForm — read named text fields via pdf-lib. The sample
 *      `Berth_Spec_Sheet_A1.pdf` has 0 AcroForm fields (designers export the
 *      PDF flat), so this tier is built defensively for future templates that
 *      may include named form fields. When fields exist, this is the highest-
 *      confidence path because there's no OCR loss.
 *
 *   2. OCR — Tesseract.js extracts text from the page; positional/regex
 *      heuristics keyed off the labels documented in §9.2 pull out values.
 *      Returns per-field confidence scores.
 *
 *   3. AI fallback — gated on `getResolvedOcrConfig(...)` returning a usable
 *      OpenAI/Claude config. Only invoked when OCR confidence is below
 *      threshold for too many fields AND the rep opts in via the diff dialog.
 *      A null `apiKey` causes this tier to return a clear "not configured"
 *      error rather than silently falling back to OCR-only.
 */
import { PDFDocument } from 'pdf-lib';

// ─── shared types ────────────────────────────────────────────────────────────

/** Identifies the tier that produced a field or a whole parse result. */
export type ParserEngine = 'acroform' | 'ocr' | 'ai';

/**
 * Canonical extracted shape. Keys map 1:1 to nullable columns on the `berths`
 * table; `mooringNumber` is special (used for the §14.6 mismatch warning).
 */
export interface ExtractedBerthFields {
  mooringNumber?: string | null;
  lengthFt?: number | null;
  lengthM?: number | null;
  widthFt?: number | null;
  widthM?: number | null;
  /**
   * Water depth at the berth (separate from a vessel's max draft).
   * The OCR tier stores the imperial reading here and the metric one in
   * `waterDepthM`.
   */
  waterDepth?: number | null;
  waterDepthM?: number | null;
  /** Max draught of vessel — falls back to the berth's draft column. */
  draftFt?: number | null;
  draftM?: number | null;
  bowFacing?: string | null;
  sidePontoon?: string | null;
  powerCapacity?: number | null;
  voltage?: number | null;
  mooringType?: string | null;
  cleatType?: string | null;
  cleatCapacity?: string | null;
  bollardType?: string | null;
  bollardCapacity?: string | null;
  access?: string | null;
  weeklyRateHighUsd?: number | null;
  weeklyRateLowUsd?: number | null;
  dailyRateHighUsd?: number | null;
  dailyRateLowUsd?: number | null;
  /** ISO date YYYY-MM-DD. */
  pricingValidUntil?: string | null;
  price?: number | null;
}

/**
 * One extracted value plus its provenance.
 *
 * `T` defaults to the union of scalar kinds a berth column can hold, so the
 * bare `ParsedField` spelling used throughout the parser stays valid.
 */
export interface ParsedField<T = string | number | null> {
  value: T;
  /** 0..1 confidence; 1 means "absolute match" (AcroForm or unambiguous regex). */
  confidence: number;
  /** Engine that produced this field; helps the diff dialog explain itself. */
  engine: ParserEngine;
}

/** Outcome of running one parser tier over a PDF. */
export interface ParseResult {
  engine: ParserEngine;
  /** Sparse — only fields the parser was able to extract. */
  fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
  /** Mean confidence across all extracted fields (0..1). */
  meanConfidence: number;
  /** Raw text the OCR or AI tier produced — useful for the diff dialog audit. */
  rawText?: string;
  /**
   * Set when a tier degraded; the API surface uses this to decide whether to
   * surface the "AI parse" button.
   */
  warnings: string[];
}

// ─── magic-byte check (§14.6 critical) ───────────────────────────────────────

/** ASCII bytes of the `%PDF-` signature. */
const PDF_MAGIC_BYTES = [0x25, 0x50, 0x44, 0x46, 0x2d] as const;

/**
 * Reads the first 5 bytes; returns true iff they are `%PDF-`.
 *
 * @param buffer Uploaded file contents.
 * @returns `false` for buffers shorter than the signature or with any
 *   differing byte; leading whitespace or a BOM is NOT tolerated.
 */
export function isPdfMagic(buffer: Buffer): boolean {
  if (buffer.length < PDF_MAGIC_BYTES.length) return false;
  return PDF_MAGIC_BYTES.every((expected, index) => buffer[index] === expected);
}

// ─── tier 1: AcroForm ────────────────────────────────────────────────────────
/**
 * AcroForm field name → ExtractedBerthFields key. Mirrors the names §4.7b
 * mentions ("length_ft", "mooring_number"…) plus a couple of tolerant aliases.
 * Keys are lower-case; look names up through `resolveAcroFormTarget` so that
 * inherited object properties are never mistaken for mappings.
 */
const ACROFORM_FIELD_MAP: Record<string, keyof ExtractedBerthFields> = {
  mooring_number: 'mooringNumber',
  berth_number: 'mooringNumber',
  length_ft: 'lengthFt',
  length_m: 'lengthM',
  width_ft: 'widthFt',
  width_m: 'widthM',
  draft_ft: 'draftFt',
  draft_m: 'draftM',
  water_depth: 'waterDepth',
  water_depth_m: 'waterDepthM',
  bow_facing: 'bowFacing',
  side_pontoon: 'sidePontoon',
  pontoon: 'sidePontoon',
  power_capacity: 'powerCapacity',
  voltage: 'voltage',
  mooring_type: 'mooringType',
  cleat_type: 'cleatType',
  cleat_capacity: 'cleatCapacity',
  bollard_type: 'bollardType',
  bollard_capacity: 'bollardCapacity',
  access: 'access',
  weekly_rate_high_usd: 'weeklyRateHighUsd',
  weekly_rate_low_usd: 'weeklyRateLowUsd',
  daily_rate_high_usd: 'dailyRateHighUsd',
  daily_rate_low_usd: 'dailyRateLowUsd',
  pricing_valid_until: 'pricingValidUntil',
  price: 'price',
};

/**
 * Maps a PDF form-field name to its berth column, or `undefined` when the
 * name is not one we know.
 *
 * The name comes from an uploaded (untrusted) PDF, so only OWN properties of
 * the map count: a field called `constructor`, `toString` or `__proto__`
 * would otherwise resolve to something inherited from `Object.prototype` and
 * be treated as a valid target. Matching ignores case and surrounding
 * whitespace.
 */
function resolveAcroFormTarget(fieldName: string): keyof ExtractedBerthFields | undefined {
  const key = fieldName.trim().toLowerCase();
  return Object.prototype.hasOwnProperty.call(ACROFORM_FIELD_MAP, key)
    ? ACROFORM_FIELD_MAP[key]
    : undefined;
}

/**
 * Tier 1: read named AcroForm text fields with pdf-lib.
 *
 * @param buffer Raw PDF bytes.
 * @returns A result with `engine: 'acroform'` and confidence 1 for every
 *   mapped, non-empty, coercible text field; `null` when the document cannot
 *   be loaded, has no form or no fields, or none of its fields produced a
 *   value — the caller then falls through to the next tier.
 */
async function tryAcroForm(buffer: Buffer): Promise<ParseResult | null> {
  let doc: PDFDocument;
  try {
    doc = await PDFDocument.load(buffer, { ignoreEncryption: true });
  } catch {
    // Unreadable by pdf-lib — not an error for the pipeline, just no tier-1 data.
    return null;
  }

  let form: ReturnType<PDFDocument['getForm']>;
  try {
    form = doc.getForm();
  } catch {
    return null;
  }

  const fields = form.getFields();
  if (fields.length === 0) return null;

  const out: Partial<Record<keyof ExtractedBerthFields, ParsedField>> = {};
  for (const field of fields) {
    const fieldName = field.getName();
    const target = resolveAcroFormTarget(fieldName);
    if (!target) continue;

    // pdf-lib doesn't expose a generic "get value" — narrow to text fields.
    // getTextField throws for any other field kind; those are skipped on purpose.
    let raw: string | undefined;
    try {
      raw = form.getTextField(fieldName).getText() ?? undefined;
    } catch {
      continue;
    }

    const text = raw?.trim() ?? '';
    if (text.length === 0) continue;

    const parsed = coerceFieldValue(target, text);
    if (parsed === null) continue;
    out[target] = { value: parsed, confidence: 1, engine: 'acroform' };
  }

  if (Object.keys(out).length === 0) return null;
  return {
    engine: 'acroform',
    fields: out,
    meanConfidence: 1,
    warnings: [],
  };
}

// ─── tier 2: OCR via Tesseract ───────────────────────────────────────────────
/**
 * OCR seam for tier 2. The actual recognize() call is encapsulated behind
 * this adapter so production can wire it to tesseract.js while tests stub it
 * and stay free of the WASM bundle.
 *
 * NOTE(review): earlier notes describe rasterizing the PDF to one image per
 * page before recognition. Nothing in this module rasterizes — the raw PDF
 * buffer is handed to the adapter unchanged.
 */
export interface OcrAdapter {
  /** Returns plain text + a 0..100 mean confidence score. */
  recognize(buffer: Buffer): Promise<{ text: string; confidence: number }>;
}

/**
 * Default adapter — dynamically imports tesseract.js so the WASM bundle isn't
 * pulled into client builds.
 *
 * NOTE(review): the buffer passed to `tesseract.recognize` is the PDF itself.
 * The original comment assumed Tesseract rasterizes the first page of a PDF;
 * confirm tesseract.js accepts PDF input, otherwise rasterize upstream.
 */
async function defaultOcrAdapter(): Promise<OcrAdapter> {
  return {
    recognize: async (buffer: Buffer) => {
      const tesseract = await import('tesseract.js');
      const result = await tesseract.recognize(buffer, 'eng');
      return {
        text: result.data.text ?? '',
        confidence: typeof result.data.confidence === 'number' ? result.data.confidence : 0,
      };
    },
  };
}

/** Sparse field map produced by the OCR heuristics. */
type OcrFieldMap = Partial<Record<keyof ExtractedBerthFields, ParsedField>>;

/**
 * Singular `Label: value` lines (`Bow Facing: East`, `Pontoon: QUAY PT`).
 * Each value runs to the end of its line.
 */
const OCR_LABEL_PATTERNS: ReadonlyArray<readonly [RegExp, keyof ExtractedBerthFields]> = [
  [/Bow\s+Facing\s*[:.]?\s*([A-Za-z .]+?)(?:\n|$)/i, 'bowFacing'],
  [/Pontoon\s*[:.]?\s*([A-Za-z0-9 .\-]+?)(?:\n|$)/i, 'sidePontoon'],
  [/Mooring\s+Type\s*[:.]?\s*([A-Za-z0-9 \-\/]+?)(?:\n|$)/i, 'mooringType'],
  [/Cleat\s+Type\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'cleatType'],
  [/Cleat\s+Capacity\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'cleatCapacity'],
  [/Bollard\s+Type\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'bollardType'],
  [/Bollard\s+Capacity\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'bollardCapacity'],
  [/Access\s*[:.]?\s*([A-Za-z0-9 .,()\-]+?)(?:\n|$)/i, 'access'],
];

/**
 * Canonicalizes OCR output so the line-based patterns can match it, without
 * touching the line structure:
 *  - CRLF / lone CR → LF (the label patterns stop at `\n`; a stray `\r` is
 *    outside their character classes and made whole lines unmatchable);
 *  - non-breaking / figure / narrow no-break spaces → plain space;
 *  - typographic quotes and primes → ASCII `'` and `"` (feet / inch marks).
 */
function normalizeOcrText(rawText: string): string {
  return rawText
    .replace(/\r\n?/g, '\n')
    .replace(/[\u00a0\u2007\u202f]/g, ' ')
    .replace(/[\u2018\u2019\u2032]/g, "'")
    .replace(/[\u201c\u201d\u2033]/g, '"');
}

/** "11,341" → 11341. */
function parseGroupedNumber(digits: string): number {
  return Number(digits.replace(/,/g, ''));
}

/**
 * Heuristic extraction from OCR text. The patterns mirror the layout
 * documented in plan §9.2:
 *
 *   - "Length: 206' 8" / 63m"
 *   - "Mooring: A12" or large "A1" near "BERTH NUMBER"
 *   - "WEEK HIGH / LOW" and "DAY HIGH / LOW" pricing blocks
 *   - "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL <date>"
 *
 * @param rawText Text as returned by the OCR engine.
 * @returns The fields that matched (each tagged `engine: 'ocr'` with a fixed
 *   per-pattern confidence) and human-readable warnings for values that
 *   matched but look inconsistent or could not be normalized.
 */
export function extractFromOcrText(rawText: string): {
  fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
  warnings: string[];
} {
  const warnings: string[] = [];
  const out: OcrFieldMap = {};
  const text = normalizeOcrText(rawText);

  // Mooring number: BERTH NUMBER block first, then a line holding only the
  // code, then an explicit "Mooring:" label.
  const mooringMatch =
    text.match(/BERTH\s+NUMBER[\s\S]{0,80}?\b([A-Z]\d{1,3})\b/i) ??
    text.match(/^\s*([A-Z]\d{1,3})\s*$/m) ??
    text.match(/Mooring(?:\s+Number)?\s*[:#]?\s*([A-Z]\d{1,3})/i);
  const mooring = mooringMatch?.[1];
  if (mooring) {
    out.mooringNumber = { value: mooring.toUpperCase(), confidence: 0.85, engine: 'ocr' };
  }

  // Length / Width / Water Depth — `Label: <imperial> / <metric>` form.
  // Imperial may be `206' 8"` style; the metric figure is parsed independently
  // because the two are rarely lossless conversions of each other.
  // `displayName` is what warnings show; `labelPattern` is regex source.
  const dimensional = (
    displayName: string,
    labelPattern: string,
    ftKey: keyof ExtractedBerthFields,
    mKey: keyof ExtractedBerthFields,
  ): void => {
    const re = new RegExp(
      `${labelPattern}\\s*[:.]?\\s*([0-9]+(?:'\\s*[0-9]+")?(?:\\.[0-9]+)?)\\s*(?:ft)?\\s*\\/\\s*([0-9]+(?:\\.[0-9]+)?)\\s*m`,
      'i',
    );
    const m = text.match(re);
    if (!m) return;

    const ft = parseFeetInches(m[1] ?? '');
    const meters = Number(m[2]);
    const hasFeet = ft != null && Number.isFinite(ft);
    const hasMeters = Number.isFinite(meters);

    if (hasFeet) out[ftKey] = { value: ft, confidence: 0.8, engine: 'ocr' };
    if (hasMeters) out[mKey] = { value: meters, confidence: 0.85, engine: 'ocr' };
    if (ft != null && hasMeters && Math.abs(ft * 0.3048 - meters) / meters > 0.01) {
      warnings.push(
        `${displayName}: imperial/metric mismatch — ${ft}ft vs ${meters}m differ >1% (using imperial as source of truth).`,
      );
    }
  };
  dimensional('Length', 'Length', 'lengthFt', 'lengthM');
  dimensional('Width', 'Width', 'widthFt', 'widthM');
  dimensional('Water Depth', 'Water\\s+Depth', 'waterDepth', 'waterDepthM');
  // Max draft of vessel maps to the berth's draft column.
  dimensional('Max draught', 'Max\\.?\\s*draught(?:\\s+of\\s+vessel)?', 'draftFt', 'draftM');

  for (const [re, key] of OCR_LABEL_PATTERNS) {
    const value = text.match(re)?.[1];
    if (value) {
      out[key] = { value: value.trim(), confidence: 0.75, engine: 'ocr' };
    }
  }

  // Power Capacity (kW) and Voltage at 60Hz.
  const powerMatch = text.match(/Power\s+Capacity\s*[:.]?\s*([0-9]+(?:\.[0-9]+)?)\s*kW/i);
  if (powerMatch) {
    out.powerCapacity = { value: Number(powerMatch[1]), confidence: 0.85, engine: 'ocr' };
  }
  const voltageMatch = text.match(/Voltage(?:\s+at\s+60\s*Hz)?\s*[:.]?\s*([0-9]+)\s*V/i);
  if (voltageMatch) {
    out.voltage = { value: Number(voltageMatch[1]), confidence: 0.85, engine: 'ocr' };
  }

  // Pricing: "WEEK HIGH / LOW: 11,341 USD / 8,100 USD"
  const weekMatch = text.match(
    /WEEK\s+HIGH\s*\/\s*LOW[:.\s]*([0-9,]+)\s*USD\s*\/\s*([0-9,]+)\s*USD/i,
  );
  if (weekMatch) {
    const [, high = '', low = ''] = weekMatch;
    out.weeklyRateHighUsd = { value: parseGroupedNumber(high), confidence: 0.8, engine: 'ocr' };
    out.weeklyRateLowUsd = { value: parseGroupedNumber(low), confidence: 0.8, engine: 'ocr' };
  }
  const dayMatch = text.match(
    /DAY\s+HIGH\s*\/\s*LOW[:.\s]*([0-9,]+)\s*USD\s*\/\s*([0-9,]+)\s*USD/i,
  );
  if (dayMatch) {
    const [, high = '', low = ''] = dayMatch;
    out.dailyRateHighUsd = { value: parseGroupedNumber(high), confidence: 0.8, engine: 'ocr' };
    out.dailyRateLowUsd = { value: parseGroupedNumber(low), confidence: 0.8, engine: 'ocr' };
  }

  // Purchase price: "PURCHASE PRICE:\nFEE SIMPLE OR STRATA LOT\n3,880,800 USD"
  const priceMatch = text.match(/PURCHASE\s+PRICE[\s\S]{0,80}?([0-9][0-9,]+)\s*USD/i);
  if (priceMatch) {
    out.price = { value: parseGroupedNumber(priceMatch[1] ?? ''), confidence: 0.7, engine: 'ocr' };
  }

  // Pricing validity: "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL SEPTEMBER 15TH, 2025"
  const validityMatch = text.match(
    /CONFIRMED\s+THROUGH\s+UNTIL\s+([A-Za-z]+\s+[0-9]{1,2})(?:[A-Z]{2})?,?\s+([0-9]{4})/i,
  );
  if (validityMatch) {
    const iso = parseHumanDate(`${validityMatch[1]} ${validityMatch[2]}`);
    if (iso) {
      out.pricingValidUntil = { value: iso, confidence: 0.75, engine: 'ocr' };
    } else {
      warnings.push(
        'Could not normalize "CONFIRMED THROUGH UNTIL" date; pricing_valid_until skipped.',
      );
    }
  }

  return { fields: out, warnings };
}

/**
 * Tier 2: OCR the buffer and run the text heuristics over the result.
 *
 * @param buffer Raw PDF bytes handed to the adapter unchanged.
 * @param adapter OCR implementation; defaults to the tesseract.js adapter.
 * @returns An `engine: 'ocr'` result. When the adapter yields no text the
 *   result is empty with the warning "OCR produced no text.".
 *   Errors thrown by the adapter propagate to the caller.
 */
async function tryOcr(buffer: Buffer, adapter?: OcrAdapter): Promise<ParseResult> {
  const ocr = adapter ?? (await defaultOcrAdapter());
  const result = await ocr.recognize(buffer);
  if (!result.text) {
    return {
      engine: 'ocr',
      fields: {},
      meanConfidence: 0,
      rawText: '',
      warnings: ['OCR produced no text.'],
    };
  }

  const { fields, warnings } = extractFromOcrText(result.text);

  // Tesseract reports 0..100; normalize to 0..1. A missing / non-finite score
  // counts as 0 — letting NaN through would turn every field confidence and
  // the mean into NaN, which compares false against every threshold.
  const engineConfidence = Number.isFinite(result.confidence)
    ? Math.max(0, Math.min(result.confidence, 100)) / 100
    : 0;
  // The engine score acts as a CEILING on the per-pattern confidences set by
  // extractFromOcrText, and the ceiling itself never drops below 0.5.
  const ceiling = Math.max(engineConfidence, 0.5);

  const extracted: ParsedField[] = [];
  for (const key of Object.keys(fields) as Array<keyof ExtractedBerthFields>) {
    const field = fields[key];
    if (!field) continue;
    field.confidence = Math.min(field.confidence, ceiling);
    extracted.push(field);
  }

  const meanConfidence =
    extracted.length === 0
      ? 0
      : extracted.reduce((sum, field) => sum + field.confidence, 0) / extracted.length;

  return {
    engine: 'ocr',
    fields,
    meanConfidence,
    rawText: result.text,
    warnings,
  };
}

// ─── tier 3: AI fallback ─────────────────────────────────────────────────────
/** Confidence floor below which we recommend the AI tier in the diff dialog. */
export const OCR_LOW_CONFIDENCE_THRESHOLD = 0.55;

/**
 * True when the rep should be offered an "AI parse" button.
 *
 * Only OCR results qualify. An OCR result with no fields at all always
 * qualifies; otherwise its mean confidence must be strictly below
 * `OCR_LOW_CONFIDENCE_THRESHOLD`.
 */
export function shouldOfferAiTier(parse: ParseResult): boolean {
  const producedByOcr = parse.engine === 'ocr';
  if (!producedByOcr) return false;

  const extractedNothing = Object.keys(parse.fields).length === 0;
  return extractedNothing || parse.meanConfidence < OCR_LOW_CONFIDENCE_THRESHOLD;
}

// ─── public entry point ──────────────────────────────────────────────────────

/** Knobs for `parseBerthPdf`. */
export interface ParseBerthPdfOptions {
  /** Override Tesseract for testing. Production flows resolve the default. */
  ocrAdapter?: OcrAdapter;
  /** Skip the OCR tier when only AcroForm is wanted (e.g. unit tests). */
  skipOcr?: boolean;
}

/**
 * Parse a per-berth PDF buffer. Each tier falls back to the next; the
 * returned result's `engine` field tells callers which tier produced the
 * fields (used by the reconcile-diff dialog to colour confidence chips).
 *
 * The AI tier is never invoked from this entry point — that's a separate
 * deliberate action triggered from the diff dialog so OPENAI_API_KEY isn't
 * spent on every upload.
 *
 * @param buffer Uploaded file contents.
 * @param opts See `ParseBerthPdfOptions`.
 * @returns The AcroForm result when it contains at least one field, else the
 *   OCR result (or an empty OCR-tagged placeholder when OCR is skipped).
 * @throws Error when the buffer does not start with the `%PDF-` signature.
 */
export async function parseBerthPdf(
  buffer: Buffer,
  opts: ParseBerthPdfOptions = {},
): Promise<ParseResult> {
  // Placeholder returned whenever the OCR tier has nothing to contribute.
  const emptyOcrResult = (warning: string): ParseResult => ({
    engine: 'ocr',
    fields: {},
    meanConfidence: 0,
    warnings: [warning],
  });

  const looksLikePdf = isPdfMagic(buffer);
  if (!looksLikePdf) {
    throw new Error('PDF magic-byte check failed: file does not begin with %PDF-');
  }

  const fromForm = await tryAcroForm(buffer);
  const formHasFields = Boolean(fromForm) && Object.keys(fromForm!.fields).length > 0;
  if (fromForm && formHasFields) {
    return fromForm;
  }

  if (opts.skipOcr) {
    return emptyOcrResult('skipOcr=true; no AcroForm fields found.');
  }

  const fromOcr = await tryOcr(buffer, opts.ocrAdapter);
  if (fromOcr === null || fromOcr === undefined) {
    return emptyOcrResult('OCR adapter returned null.');
  }
  return fromOcr;
}

// ─── helpers ─────────────────────────────────────────────────────────────────
/** Columns stored as free text; every other key is coerced to a number. */
const STRING_FIELD_KEYS: ReadonlySet<keyof ExtractedBerthFields> = new Set<
  keyof ExtractedBerthFields
>([
  'mooringNumber',
  'bowFacing',
  'sidePontoon',
  'mooringType',
  'cleatType',
  'cleatCapacity',
  'bollardType',
  'bollardCapacity',
  'access',
  'pricingValidUntil',
]);

/** Numeric columns measured in feet, where `206' 8"` notation is plausible. */
const FEET_FIELD_KEYS: ReadonlySet<keyof ExtractedBerthFields> = new Set<
  keyof ExtractedBerthFields
>(['lengthFt', 'widthFt', 'draftFt', 'waterDepth']);

/** Lower-case English month names, indexed like `Date#getUTCMonth`. */
const MONTH_NAMES: readonly string[] = [
  'january',
  'february',
  'march',
  'april',
  'may',
  'june',
  'july',
  'august',
  'september',
  'october',
  'november',
  'december',
];

/** Maps typographic quotes / primes (and doubled apostrophes) to ASCII `'` and `"`. */
function normalizeQuoteMarks(text: string): string {
  return text
    .replace(/[\u2018\u2019\u2032]/g, "'")
    .replace(/[\u201c\u201d\u2033]/g, '"')
    .replace(/''/g, '"');
}

/**
 * Builds `YYYY-MM-DD` from calendar parts, or returns null when the parts do
 * not name a real day (e.g. February 30) — detected by letting `Date`
 * normalize the parts and checking that nothing rolled over.
 */
function toIsoDate(year: number, monthIndex: number, day: number): string | null {
  const date = new Date(0);
  date.setUTCFullYear(year, monthIndex, day);
  const unchanged =
    date.getUTCFullYear() === year &&
    date.getUTCMonth() === monthIndex &&
    date.getUTCDate() === day;
  return unchanged ? date.toISOString().slice(0, 10) : null;
}

/**
 * Coerce an AcroForm raw value to the right scalar for the target column.
 *
 * @param key Target column.
 * @param raw Trimmed, non-empty field text.
 * @returns The text itself for string columns, an ISO date for
 *   `pricingValidUntil`, a number for numeric columns, or `null` when the
 *   text cannot be interpreted — in particular text without any digit (such
 *   as "N/A") is null, never 0.
 */
function coerceFieldValue(key: keyof ExtractedBerthFields, raw: string): string | number | null {
  if (key === 'pricingValidUntil') {
    // Accept ISO YYYY-MM-DD when it names a real day; otherwise try a humane parse.
    const iso = raw.match(/^(\d{4})-(\d{2})-(\d{2})$/);
    if (iso) return toIsoDate(Number(iso[1]), Number(iso[2]) - 1, Number(iso[3]));
    return parseHumanDate(raw);
  }
  if (STRING_FIELD_KEYS.has(key)) return raw;

  // Feet columns may carry `206' 8"`; stripping non-digits would read that as 2068.
  if (FEET_FIELD_KEYS.has(key) && /^[0-9]+\s*'/.test(normalizeQuoteMarks(raw).trim())) {
    return parseFeetInches(raw);
  }

  // Numeric columns: strip currency / unit suffixes and commas, plus a
  // trailing period left over from an abbreviated unit ("82.5 ft.").
  const stripped = raw.replace(/[^0-9.\-]/g, '').replace(/\.+$/, '');
  if (!/[0-9]/.test(stripped)) return null;
  const numeric = Number(stripped);
  return Number.isFinite(numeric) ? numeric : null;
}

/**
 * Parse a human date like "September 15 2025" → "2025-09-15".
 *
 * "<Month> <day>[,] <year>" (full or abbreviated English month, optional
 * ordinal suffix) is parsed explicitly and rejected when the day does not
 * exist in that month. Any other shape is handed to the engine's `Date`
 * parser, whose handling of non-ISO strings is implementation-defined.
 *
 * @returns ISO `YYYY-MM-DD`, or null when the text is not a usable date.
 */
export function parseHumanDate(raw: string): string | null {
  // Drop ordinal suffixes: "15th" → "15".
  const cleaned = raw.replace(/\b(\d{1,2})(?:st|nd|rd|th)\b/gi, '$1').trim();
  if (cleaned.length === 0) return null;

  const named = cleaned.match(/^([A-Za-z]{3,9})\.?\s+(\d{1,2}),?\s+(\d{4})$/);
  if (named) {
    const token = (named[1] ?? '').toLowerCase();
    const monthIndex = MONTH_NAMES.findIndex((full) => full.startsWith(token));
    if (monthIndex !== -1) {
      return toIsoDate(Number(named[3]), monthIndex, Number(named[2]));
    }
  }

  // Force UTC interpretation by appending " UTC"; otherwise dates without an
  // explicit zone get parsed in the runner's local TZ and `toISOString()`
  // shifts the day by ±1 (caught a -0700 -> 09-14 regression locally).
  const parsed = new Date(`${cleaned} UTC`);
  if (Number.isNaN(parsed.getTime())) return null;
  return parsed.toISOString().slice(0, 10);
}

/**
 * Convert "206' 8\"" or "82" → 206.667 / 82. Returns null on parse failure.
 *
 * Typographic feet / inch marks are accepted, the closing inch mark is
 * optional, and inches may be fractional. Input that merely starts with a
 * number yields that number.
 */
export function parseFeetInches(raw: string): number | null {
  const text = normalizeQuoteMarks(raw).trim();

  const feetAndInches = text.match(/^([0-9]+)\s*'\s*([0-9]+(?:\.[0-9]+)?)\s*"?$/);
  if (feetAndInches) {
    return Number(feetAndInches[1]) + Number(feetAndInches[2]) / 12;
  }

  const feetOnly = text.match(/^([0-9]+(?:\.[0-9]+)?)/);
  return feetOnly ? Number(feetOnly[1]) : null;
}