Files
pn-new-crm/src/lib/services/berth-pdf-parser.ts
Matt 221ae5784e chore(autonomous-session): consolidate uncommitted work from prior session
Bundles the prior autonomous-session output that was sitting unstaged:

- Em-dash sweep across src/ + tests/ (en-dash/em-dash to hyphen, ~2280 instances)
- country-flag-icons rollout (CountryFlag component, replaces emoji glyphs that
  never rendered on Windows; lazy-loads the 3x2 SVG index as a single chunk
  after the per-subpath dynamic-import approach silently failed in webpack)
- Admin IA Phase 1+2: 7-domain regroup, 41 to 38 pages, /admin/berths index,
  redirects (ocr to ai, reports to dashboard, invitations to users),
  docs/admin-ia-proposal.md
- Per-template email tester (registry + endpoint + UI on Email admin page)
- Cancel-document mode picker (delete-from-Documenso vs keep-for-audit)
- Dashboard PDF report: 25 widgets, SVG charts, date-range picker, 11 resolvers
- Customize-widgets per-region sortables at xl+ (charts/rails/feed); single
  flat sortable below xl when the layout stacks; per-viewport saved orders
- Audit doc updates capturing each shipped item
- Lint fixes: react-compiler immutability in DonutChart (reduce instead of
  let-reassign), set-state-in-effect disables in CountryFlag and
  UploadForSigning preview-bytes effect, unused 'confirm' destructures in
  interest contract + reservation tabs, unescaped apostrophe in test-template
  card copy
2026-05-23 00:52:59 +02:00

532 lines
20 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Reverse parser for per-berth PDFs (Phase 6b - see plan §4.7b and §9.2).
*
* Three tiers, each falling back to the next:
*
* 1. AcroForm - read named text fields via pdf-lib. The sample
* `Berth_Spec_Sheet_A1.pdf` has 0 AcroForm fields (designers export the
* PDF flat), so this tier is built defensively for future templates that
* may include named form fields. When fields exist, this is the highest-
* confidence path because there's no OCR loss.
*
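* 2. OCR - text is extracted from the PDF's text streams via `unpdf`
* (results are still tagged with engine 'ocr'); regex heuristics keyed off
* the labels documented in §9.2 pull out values. Returns per-field
* confidence scores. Scanned/raster PDFs yield no text at this tier.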
*
* 3. AI fallback - gated on `getResolvedOcrConfig(...)` returning a usable
* OpenAI/Claude config. Only invoked when OCR confidence is below
* threshold for too many fields AND the rep opts in via the diff dialog.
* A null `apiKey` causes this tier to return a clear "not configured"
* error rather than silently falling back to OCR-only.
*/
import { PDFDocument } from 'pdf-lib';
import { ValidationError } from '@/lib/errors';
// ─── shared types ────────────────────────────────────────────────────────────
/** Identifies the extraction tier that produced a field or a whole parse result. */
export type ParserEngine = 'acroform' | 'ocr' | 'ai';
/**
* Canonical extracted shape. Keys map 1:1 to nullable columns on the `berths`
* table; `mooringNumber` is special (used for the §14.6 mismatch warning).
*
* Every key is optional and nullable. For each dimension pair the text tier
* fills the imperial and the metric member independently, from the two sides
* of a `Label: <imperial> / <metric>` line.
*/
export interface ExtractedBerthFields {
/** Berth identifier such as "A12"; the text tier upper-cases it. */
mooringNumber?: string | null;
/** Berth length, imperial reading (feet; inches folded in as a fraction). */
lengthFt?: number | null;
/** Berth length, metric reading. */
lengthM?: number | null;
/** Berth width, imperial reading. */
widthFt?: number | null;
/** Berth width, metric reading. */
widthM?: number | null;
/** Water depth at the berth (separate from a vessel's max draft). The text
* tier stores the imperial side of the `Water Depth` line here. */
waterDepth?: number | null;
/** Metric side of the `Water Depth` line. */
waterDepthM?: number | null;
/** Max draught of vessel - falls back to the berth's draft column. */
draftFt?: number | null;
draftM?: number | null;
bowFacing?: string | null;
sidePontoon?: string | null;
/** The text tier reads this from a `Power Capacity ... kW` value. */
powerCapacity?: number | null;
/** The text tier reads this from a `Voltage ... V` value. */
voltage?: number | null;
mooringType?: string | null;
cleatType?: string | null;
cleatCapacity?: string | null;
bollardType?: string | null;
bollardCapacity?: string | null;
access?: string | null;
/** High/low rates; the text tier reads them from `... USD / ... USD` pairs. */
weeklyRateHighUsd?: number | null;
weeklyRateLowUsd?: number | null;
dailyRateHighUsd?: number | null;
dailyRateLowUsd?: number | null;
/** ISO date YYYY-MM-DD. */
pricingValidUntil?: string | null;
/** The text tier reads this from the `PURCHASE PRICE ... USD` block. */
price?: number | null;
}
/**
* One extracted value together with its provenance.
* `T` is the value's type; it defaults to `unknown` for heterogeneous maps.
*/
export interface ParsedField<T = unknown> {
/** The extracted (already coerced) value. */
value: T;
/** 0..1 confidence; 1 means "absolute match" (AcroForm or unambiguous regex). */
confidence: number;
/** Engine that produced this field; helps the diff dialog explain itself. */
engine: ParserEngine;
}
/** Outcome of running one parser tier over a berth PDF. */
export interface ParseResult {
/** Tier that produced this result. */
engine: ParserEngine;
/** Sparse - only fields the parser was able to extract. */
fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
/** Mean confidence across all extracted fields (0..1); 0 when no field was
* extracted. */
meanConfidence: number;
/** Raw text the OCR or AI tier produced - useful for the diff dialog audit. */
rawText?: string;
/** Set when a tier degraded; the API surface uses this to decide whether to
* surface the "AI parse" button. */
warnings: string[];
}
// ─── magic-byte check (§14.6 critical) ───────────────────────────────────────
/**
 * Magic-byte sniff: reports whether `buffer` opens with the five-byte PDF
 * signature `%PDF-`. A buffer shorter than five bytes is never a PDF.
 */
export function isPdfMagic(buffer: Buffer): boolean {
  // Byte values of '%', 'P', 'D', 'F', '-'.
  const signature = [0x25, 0x50, 0x44, 0x46, 0x2d];
  if (buffer.length < signature.length) return false;
  return signature.every((expected, offset) => buffer[offset] === expected);
}
// ─── tier 1: AcroForm ────────────────────────────────────────────────────────
/**
* AcroForm field name → ExtractedBerthFields key. Mirrors the names §4.7b
* mentions ("length_ft", "mooring_number"…) plus a couple of tolerant aliases.
*
* Keys are lower-case because `tryAcroForm` lower-cases each PDF field name
* before looking it up. `berth_number` and `pontoon` are the aliases: they
* resolve to the same targets as `mooring_number` and `side_pontoon`.
*/
const ACROFORM_FIELD_MAP: Record<string, keyof ExtractedBerthFields> = {
mooring_number: 'mooringNumber',
berth_number: 'mooringNumber',
length_ft: 'lengthFt',
length_m: 'lengthM',
width_ft: 'widthFt',
width_m: 'widthM',
draft_ft: 'draftFt',
draft_m: 'draftM',
water_depth: 'waterDepth',
water_depth_m: 'waterDepthM',
bow_facing: 'bowFacing',
side_pontoon: 'sidePontoon',
pontoon: 'sidePontoon',
power_capacity: 'powerCapacity',
voltage: 'voltage',
mooring_type: 'mooringType',
cleat_type: 'cleatType',
cleat_capacity: 'cleatCapacity',
bollard_type: 'bollardType',
bollard_capacity: 'bollardCapacity',
access: 'access',
weekly_rate_high_usd: 'weeklyRateHighUsd',
weekly_rate_low_usd: 'weeklyRateLowUsd',
daily_rate_high_usd: 'dailyRateHighUsd',
daily_rate_low_usd: 'dailyRateLowUsd',
pricing_valid_until: 'pricingValidUntil',
price: 'price',
};
/**
 * Tier 1: read named AcroForm text fields via pdf-lib and map them onto
 * `ExtractedBerthFields` keys through `ACROFORM_FIELD_MAP`.
 *
 * @param buffer Raw PDF bytes.
 * @returns A result whose fields all carry confidence 1, or `null` when the
 *   document cannot be loaded, exposes no form, has no fields, or none of
 *   its fields yields a usable mapped value. `null` tells the caller to fall
 *   through to the next tier.
 */
async function tryAcroForm(buffer: Buffer): Promise<ParseResult | null> {
  let doc: PDFDocument;
  try {
    doc = await PDFDocument.load(buffer, { ignoreEncryption: true });
  } catch {
    return null;
  }
  let form: ReturnType<PDFDocument['getForm']>;
  try {
    form = doc.getForm();
  } catch {
    return null;
  }
  const fields = form.getFields();
  if (fields.length === 0) return null;
  const out: Partial<Record<keyof ExtractedBerthFields, ParsedField>> = {};
  for (const field of fields) {
    const fieldName = field.getName();
    const name = fieldName.toLowerCase();
    // Field names come from the uploaded PDF. A plain index lookup would also
    // resolve members inherited from Object.prototype ("constructor",
    // "__proto__"), yielding a bogus target and a junk key in the result, so
    // only the map's own keys are honoured.
    if (!Object.prototype.hasOwnProperty.call(ACROFORM_FIELD_MAP, name)) continue;
    const target = ACROFORM_FIELD_MAP[name];
    if (!target) continue;
    // pdf-lib doesn't expose a generic "get value" - narrow to text fields.
    // getTextField throws for any other field kind, which we simply skip.
    let raw: string | undefined;
    try {
      raw = form.getTextField(fieldName).getText() ?? undefined;
    } catch {
      continue;
    }
    if (!raw || raw.trim().length === 0) continue;
    const parsed = coerceFieldValue(target, raw.trim());
    if (parsed === null) continue;
    out[target] = { value: parsed, confidence: 1, engine: 'acroform' };
  }
  if (Object.keys(out).length === 0) return null;
  return {
    engine: 'acroform',
    fields: out,
    meanConfidence: 1,
    warnings: [],
  };
}
// ─── tier 2: text extraction via unpdf ────────────────────────────────────────
/**
* Tier-2 extracts text directly from the PDF via `unpdf` (a serverless-
* friendly pdfjs wrapper). This works for text-PDFs - i.e. PDFs that
* contain real text streams, not scanned page images. Scanned/raster PDFs
* land here with empty extracted text and fall through to the AI tier.
*
* The earlier design called for tesseract.js rasterization, but
* `tesseract.recognize` doesn't accept a PDF buffer - it expects an image.
* That old code path silently failed at runtime; unpdf is the correct
* primitive for "pull text out of a PDF on the server."
*
* The `recognize` adapter shape is retained for backward compatibility with
* the test suite and the `ocrAdapter` override accepted by `parseBerthPdf`.
*/
export interface OcrAdapter {
/**
* Returns plain text plus a mean confidence score on a 0..100 scale (the
* Tesseract convention). `tryOcr` clamps the score to 0..100 and divides it
* by 100 before using it to cap per-field confidences; the cap never drops
* below 0.5, so an adapter that reports a 0..1 value caps every field at 0.5.
*/
recognize(buffer: Buffer): Promise<{ text: string; confidence: number }>;
}
/** Hard cap on tier-2 runtime, in milliseconds. A crafted PDF could
* pathologically slow pdfjs parsing; 30s covers any reasonable berth-spec by
* orders of magnitude while bounding the worst-case worker hold-time. The
* default adapter rejects with a timeout error once this elapses. */
const OCR_TIMEOUT_MS = 30_000;
/**
 * Builds the production tier-2 adapter.
 *
 * Its `recognize` lazily imports `unpdf`, extracts merged-page text from a
 * copy of the buffer, and races that extraction against an `OCR_TIMEOUT_MS`
 * timer. It resolves with the text and a confidence of 90 (0..100 scale) when
 * the text is non-blank, or 0 otherwise, and rejects if the timer fires
 * first. The timer is always cleared once the race settles.
 */
async function defaultOcrAdapter(): Promise<OcrAdapter> {
  const recognize = async (buffer: Buffer): Promise<{ text: string; confidence: number }> => {
    const { extractText } = await import('unpdf');
    let timer: NodeJS.Timeout | undefined;
    const deadline = new Promise<never>((_resolve, reject) => {
      timer = setTimeout(() => {
        reject(new Error(`unpdf text extraction exceeded ${OCR_TIMEOUT_MS}ms timeout`));
      }, OCR_TIMEOUT_MS);
    });
    try {
      const extraction = (async () => {
        // unpdf accepts a Uint8Array; it is handed a copy of the buffer.
        const extracted = await extractText(new Uint8Array(buffer), { mergePages: true });
        const body = extracted.text;
        // Non-blank text is reported as 90 on the 0..100 scale so the
        // already-calibrated per-field regex confidences stay in charge.
        return { text: body ?? '', confidence: body && body.trim().length > 0 ? 90 : 0 };
      })();
      return await Promise.race([extraction, deadline]);
    } finally {
      if (timer) clearTimeout(timer);
    }
  };
  return { recognize };
}
/**
 * Heuristic extraction from the tier-2 text. The patterns mirror the layout
 * documented in plan §9.2:
 *
 * - "Length: 206' 8" / 63m"
 * - "Mooring: A12" or large "A1" near "BERTH NUMBER"
 * - "WEEK HIGH / LOW" and "DAY HIGH / LOW" pricing blocks
 * - "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL <date>"
 *
 * @param rawText Plain text extracted from the PDF.
 * @returns `fields` - a sparse map holding only the values a pattern matched,
 *   each tagged `engine: 'ocr'` with a per-pattern confidence; `warnings` -
 *   human-readable notes (imperial/metric disagreement above 1%, or a
 *   validity date that could not be normalized).
 */
export function extractFromOcrText(rawText: string): {
  fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
  warnings: string[];
} {
  const warnings: string[] = [];
  const out: Partial<Record<keyof ExtractedBerthFields, ParsedField>> = {};
  // Normalize for the line-based regexes but keep structure:
  //  - no-break spaces become plain spaces; the character is written as an
  //    explicit escape so it cannot be silently lost by an editor or a sweep;
  //  - CRLF / lone CR line endings become LF, because the single-line label
  //    patterns below terminate on `\n` and would otherwise never match.
  const text = rawText.replace(/\u00a0/g, ' ').replace(/\r\n?/g, '\n');

  // Mooring number: BERTH NUMBER block. We try a couple of layouts.
  const mooringMatch =
    text.match(/BERTH\s+NUMBER[\s\S]{0,80}?\b([A-Z]\d{1,3})\b/i) ??
    text.match(/^\s*([A-Z]\d{1,3})\s*$/m) ??
    text.match(/Mooring(?:\s+Number)?\s*[:#]?\s*([A-Z]\d{1,3})/i);
  if (mooringMatch) {
    out.mooringNumber = { value: mooringMatch[1]!.toUpperCase(), confidence: 0.85, engine: 'ocr' };
  }

  // Length / Width / Water Depth - `Label: <imperial> / <metric>` form.
  // Imperial may be `206' 8"` style; we capture the numeric prefix in feet
  // and parse the metric independently because they're rarely lossless.
  // `displayLabel` is what a human reads in warnings; `labelPattern` is the
  // regex source used to find the line.
  const dimensional = (
    displayLabel: string,
    labelPattern: string,
    ftKey: keyof ExtractedBerthFields,
    mKey: keyof ExtractedBerthFields,
  ): void => {
    const re = new RegExp(
      `${labelPattern}\\s*[:.]?\\s*([0-9]+(?:'\\s*[0-9]+\")?(?:\\.[0-9]+)?)\\s*(?:ft)?\\s*\\/\\s*([0-9]+(?:\\.[0-9]+)?)\\s*m`,
      'i',
    );
    const m = text.match(re);
    if (!m) return;
    const ft = parseFeetInches(m[1]!);
    const meters = Number(m[2]);
    if (ft != null && Number.isFinite(ft)) {
      out[ftKey] = { value: ft, confidence: 0.8, engine: 'ocr' };
    }
    if (Number.isFinite(meters)) {
      out[mKey] = { value: meters, confidence: 0.85, engine: 'ocr' };
    }
    if (ft != null && Number.isFinite(meters) && Math.abs(ft * 0.3048 - meters) / meters > 0.01) {
      warnings.push(
        `${displayLabel}: imperial/metric mismatch - ${ft}ft vs ${meters}m differ >1% (using imperial as source of truth).`,
      );
    }
  };
  dimensional('Length', 'Length', 'lengthFt', 'lengthM');
  dimensional('Width', 'Width', 'widthFt', 'widthM');
  dimensional('Water Depth', 'Water\\s+Depth', 'waterDepth', 'waterDepthM');
  // Max draft of vessel maps to the berth's draft column.
  dimensional('Max draught', 'Max\\.?\\s*draught(?:\\s+of\\s+vessel)?', 'draftFt', 'draftM');

  // Singular labels (`Bow Facing: East`, `Pontoon: QUAY PT`).
  const labelToKey: Array<[RegExp, keyof ExtractedBerthFields]> = [
    [/Bow\s+Facing\s*[:.]?\s*([A-Za-z .]+?)(?:\n|$)/i, 'bowFacing'],
    [/Pontoon\s*[:.]?\s*([A-Za-z0-9 .\-]+?)(?:\n|$)/i, 'sidePontoon'],
    [/Mooring\s+Type\s*[:.]?\s*([A-Za-z0-9 \-\/]+?)(?:\n|$)/i, 'mooringType'],
    [/Cleat\s+Type\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'cleatType'],
    [/Cleat\s+Capacity\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'cleatCapacity'],
    [/Bollard\s+Type\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'bollardType'],
    [/Bollard\s+Capacity\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'bollardCapacity'],
    [/Access\s*[:.]?\s*([A-Za-z0-9 .,()\-]+?)(?:\n|$)/i, 'access'],
  ];
  for (const [re, key] of labelToKey) {
    const m = text.match(re);
    if (m && m[1]) {
      out[key] = { value: m[1].trim(), confidence: 0.75, engine: 'ocr' };
    }
  }

  // Power Capacity (kW) and Voltage at 60Hz.
  const powerMatch = text.match(/Power\s+Capacity\s*[:.]?\s*([0-9]+(?:\.[0-9]+)?)\s*kW/i);
  if (powerMatch) {
    out.powerCapacity = { value: Number(powerMatch[1]), confidence: 0.85, engine: 'ocr' };
  }
  const voltageMatch = text.match(/Voltage(?:\s+at\s+60\s*Hz)?\s*[:.]?\s*([0-9]+)\s*V/i);
  if (voltageMatch) {
    out.voltage = { value: Number(voltageMatch[1]), confidence: 0.85, engine: 'ocr' };
  }

  // Pricing: "WEEK HIGH / LOW: 11,341 USD / 8,100 USD"
  const weekMatch = text.match(
    /WEEK\s+HIGH\s*\/\s*LOW[:.\s]*([0-9,]+)\s*USD\s*\/\s*([0-9,]+)\s*USD/i,
  );
  if (weekMatch) {
    out.weeklyRateHighUsd = {
      value: Number(weekMatch[1]!.replace(/,/g, '')),
      confidence: 0.8,
      engine: 'ocr',
    };
    out.weeklyRateLowUsd = {
      value: Number(weekMatch[2]!.replace(/,/g, '')),
      confidence: 0.8,
      engine: 'ocr',
    };
  }
  const dayMatch = text.match(
    /DAY\s+HIGH\s*\/\s*LOW[:.\s]*([0-9,]+)\s*USD\s*\/\s*([0-9,]+)\s*USD/i,
  );
  if (dayMatch) {
    out.dailyRateHighUsd = {
      value: Number(dayMatch[1]!.replace(/,/g, '')),
      confidence: 0.8,
      engine: 'ocr',
    };
    out.dailyRateLowUsd = {
      value: Number(dayMatch[2]!.replace(/,/g, '')),
      confidence: 0.8,
      engine: 'ocr',
    };
  }

  // Purchase price: "PURCHASE PRICE:\nFEE SIMPLE OR STRATA LOT\n3,880,800 USD"
  const priceMatch = text.match(/PURCHASE\s+PRICE[\s\S]{0,80}?([0-9][0-9,]+)\s*USD/i);
  if (priceMatch) {
    out.price = { value: Number(priceMatch[1]!.replace(/,/g, '')), confidence: 0.7, engine: 'ocr' };
  }

  // Pricing validity: "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL SEPTEMBER 15TH, 2025"
  const validityMatch = text.match(
    /CONFIRMED\s+THROUGH\s+UNTIL\s+([A-Za-z]+\s+[0-9]{1,2})(?:[A-Z]{2})?,?\s+([0-9]{4})/i,
  );
  if (validityMatch) {
    const iso = parseHumanDate(`${validityMatch[1]} ${validityMatch[2]}`);
    if (iso) {
      out.pricingValidUntil = { value: iso, confidence: 0.75, engine: 'ocr' };
    } else {
      warnings.push(
        'Could not normalize "CONFIRMED THROUGH UNTIL" date; pricing_valid_until skipped.',
      );
    }
  }
  return { fields: out, warnings };
}
/**
 * Tier 2 driver: obtains text through `adapter` (or the default adapter when
 * none is given), runs the regex heuristics over it and scores the outcome.
 *
 * When the adapter yields no text the result is empty, carries confidence 0
 * and the warning 'OCR produced no text.'. Otherwise each field's confidence
 * is capped at the adapter's score (clamped to 0..100, then scaled to 0..1),
 * with the cap itself never dropping below 0.5, and `meanConfidence` is the
 * average of the capped values (0 when nothing was extracted).
 */
async function tryOcr(buffer: Buffer, adapter?: OcrAdapter): Promise<ParseResult | null> {
  const engine = adapter ?? (await defaultOcrAdapter());
  const recognized = await engine.recognize(buffer);
  const extractedText = recognized.text;
  if (!extractedText || extractedText.length === 0) {
    return {
      engine: 'ocr',
      fields: {},
      meanConfidence: 0,
      rawText: '',
      warnings: ['OCR produced no text.'],
    };
  }
  const { fields, warnings } = extractFromOcrText(extractedText);
  // The adapter reports 0..100; bring it onto 0..1 and never cap below 0.5.
  const adapterScore = Math.max(0, Math.min(recognized.confidence, 100)) / 100;
  const cap = Math.max(adapterScore, 0.5);
  let total = 0;
  let count = 0;
  for (const parsed of Object.values(fields)) {
    if (parsed) parsed.confidence = Math.min(parsed.confidence, cap);
    total += parsed?.confidence ?? 0;
    count += 1;
  }
  return {
    engine: 'ocr',
    fields,
    meanConfidence: count === 0 ? 0 : total / count,
    rawText: extractedText,
    warnings,
  };
}
// ─── tier 3: AI fallback ─────────────────────────────────────────────────────
/** Mean-confidence level under which the diff dialog recommends the AI tier. */
export const OCR_LOW_CONFIDENCE_THRESHOLD = 0.55;
/**
 * Decides whether the rep should see an "AI parse" button.
 *
 * Only results from the 'ocr' engine qualify, and only when they extracted
 * nothing at all or their mean confidence is below
 * `OCR_LOW_CONFIDENCE_THRESHOLD`.
 */
export function shouldOfferAiTier(parse: ParseResult): boolean {
  return (
    parse.engine === 'ocr' &&
    (Object.keys(parse.fields).length === 0 ||
      parse.meanConfidence < OCR_LOW_CONFIDENCE_THRESHOLD)
  );
}
// ─── public entry point ──────────────────────────────────────────────────────
/** Options accepted by `parseBerthPdf`. */
export interface ParseBerthPdfOptions {
/** Replaces the default tier-2 text adapter (built on `unpdf`), e.g. for
* testing. Production flows resolve the default. */
ocrAdapter?: OcrAdapter;
/** Skip the OCR tier when only AcroForm is wanted (e.g. unit tests). */
skipOcr?: boolean;
}
/**
 * Public entry point: parse a per-berth PDF buffer.
 *
 * The AcroForm tier runs first; if it yields no fields the text ('ocr') tier
 * runs, unless `opts.skipOcr` is set, in which case an empty 'ocr' result
 * with an explanatory warning is returned. The result's `engine` tells
 * callers which tier produced the fields (used by the reconcile-diff dialog
 * to colour confidence chips).
 *
 * The AI tier is never invoked from here - it is a separate, deliberate
 * action triggered from the diff dialog so OPENAI_API_KEY isn't spent on
 * every upload.
 *
 * @throws ValidationError when the buffer does not begin with `%PDF-`.
 */
export async function parseBerthPdf(
  buffer: Buffer,
  opts: ParseBerthPdfOptions = {},
): Promise<ParseResult> {
  if (!isPdfMagic(buffer)) {
    throw new ValidationError('PDF magic-byte check failed: file does not begin with %PDF-');
  }
  // Shared shape for the "nothing extracted" outcomes below.
  const emptyResult = (warning: string): ParseResult => ({
    engine: 'ocr',
    fields: {},
    meanConfidence: 0,
    warnings: [warning],
  });
  const fromForm = await tryAcroForm(buffer);
  if (fromForm && Object.keys(fromForm.fields).length > 0) {
    return fromForm;
  }
  if (opts.skipOcr) {
    return emptyResult('skipOcr=true; no AcroForm fields found.');
  }
  const fromText = await tryOcr(buffer, opts.ocrAdapter);
  return fromText ?? emptyResult('OCR adapter returned null.');
}
// ─── helpers ─────────────────────────────────────────────────────────────────
/**
 * Coerce an AcroForm raw value to the right scalar for the target column.
 *
 * @param key Target `ExtractedBerthFields` key.
 * @param raw Trimmed, non-empty field text.
 * @returns The raw string for text columns; an ISO `YYYY-MM-DD` string (or
 *   `null` if unparseable) for `pricingValidUntil`; a non-negative number for
 *   numeric columns; `null` when a numeric column holds no usable number.
 */
function coerceFieldValue(key: keyof ExtractedBerthFields, raw: string): string | number | null {
  // String columns
  const stringKeys: Array<keyof ExtractedBerthFields> = [
    'mooringNumber',
    'bowFacing',
    'sidePontoon',
    'mooringType',
    'cleatType',
    'cleatCapacity',
    'bollardType',
    'bollardCapacity',
    'access',
    'pricingValidUntil',
  ];
  if (stringKeys.includes(key)) {
    if (key === 'pricingValidUntil') {
      // Accept ISO YYYY-MM-DD as-is; otherwise try a humane parse.
      if (/^\d{4}-\d{2}-\d{2}$/.test(raw)) return raw;
      return parseHumanDate(raw);
    }
    return raw;
  }
  // Numeric columns: strip currency / unit suffixes and commas.
  const stripped = raw.replace(/[^0-9.\-]/g, '');
  // `Number('')` is 0, so a placeholder with no digits at all ("N/A", "TBD",
  // "USD") must be rejected here or it would surface as a confident zero.
  if (!/[0-9]/.test(stripped)) return null;
  const numeric = Number(stripped);
  if (!Number.isFinite(numeric)) return null;
  // Berth dimensions / capacities / prices are all non-negative - reject
  // negatives outright so an AcroForm with `length_ft="-50"` doesn't poison
  // the recommender feasibility filter when applied.
  return numeric < 0 ? null : numeric;
}
/**
 * Parse a human date like "September 15 2025" → "2025-09-15".
 *
 * The first ordinal suffix ("15th" → "15") is dropped, then the text is
 * parsed with " UTC" appended. Without an explicit zone the date would be
 * read in the runner's local TZ and `toISOString()` could shift the day by
 * ±1 (a -0700 -> 09-14 regression was seen locally).
 *
 * @returns The `YYYY-MM-DD` date, or `null` when the text is not a date.
 */
export function parseHumanDate(raw: string): string | null {
  const withoutOrdinal = raw.replace(/(\d+)(st|nd|rd|th)/i, '$1').trim();
  const timestamp = Date.parse(withoutOrdinal + ' UTC');
  return Number.isNaN(timestamp) ? null : new Date(timestamp).toISOString().slice(0, 10);
}
/**
 * Convert an imperial length to decimal feet: "206' 8\"" → 206.667, "82" → 82.
 *
 * A whole `feet' inches"` string is converted with inches / 12. Anything else
 * falls back to the leading (optionally decimal) number, read as feet.
 *
 * @returns The length in feet, or `null` when the text does not start with a
 *   number.
 */
export function parseFeetInches(raw: string): number | null {
  const candidate = raw.trim();
  const compound = /^([0-9]+)\s*'\s*([0-9]+)\s*"$/.exec(candidate);
  if (compound !== null) {
    const [, feet, inches] = compound;
    return Number(feet) + Number(inches) / 12;
  }
  const leading = /^([0-9]+(?:\.[0-9]+)?)/.exec(candidate);
  return leading === null ? null : Number(leading[1]);
}