src/lib/services/berth-pdf-parser.ts

/**
 * Reverse parser for per-berth PDFs (Phase 6b — see plan §4.7b and §9.2).
 *
 * Three tiers, each falling back to the next:
 *
 *   1. AcroForm — read named text fields via pdf-lib. The sample
 *      `Berth_Spec_Sheet_A1.pdf` has 0 AcroForm fields (designers export the
 *      PDF flat), so this tier is built defensively for future templates that
 *      may include named form fields. When fields exist, this is the highest-
 *      confidence path because there's no OCR loss.
 *
 *   2. OCR — Tesseract.js extracts text from the page; positional/regex
 *      heuristics keyed off the labels documented in §9.2 pull out values.
 *      Returns per-field confidence scores.
 *
 *   3. AI fallback — gated on `getResolvedOcrConfig(...)` returning a usable
 *      OpenAI/Claude config. Only invoked when OCR confidence is below
 *      threshold for too many fields AND the rep opts in via the diff dialog.
 *      A null `apiKey` causes this tier to return a clear "not configured"
 *      error rather than silently falling back to OCR-only.
 */

import { PDFDocument } from 'pdf-lib';

// ─── shared types ────────────────────────────────────────────────────────────

/** Identifies the parsing tier that produced a value: AcroForm fields, OCR heuristics, or the AI fallback. */
export type ParserEngine = 'acroform' | 'ocr' | 'ai';

/**
 * Canonical extracted shape. Keys map 1:1 to nullable columns on the `berths`
 * table; `mooringNumber` is special (used for the §14.6 mismatch warning).
 *
 * Every property is optional and nullable: a parser only fills in the values
 * it actually found in the PDF.
 */
export interface ExtractedBerthFields {
  /** Berth identifier such as "A12"; the OCR tier upper-cases it. */
  mooringNumber?: string | null;
  /** Berth length: feet (`lengthFt`) and metres (`lengthM`). */
  lengthFt?: number | null;
  lengthM?: number | null;
  /** Berth width: feet (`widthFt`) and metres (`widthM`). */
  widthFt?: number | null;
  widthM?: number | null;
  /**
   * Water depth at the berth (separate from a vessel's max draft).
   * The OCR tier stores the imperial (feet) reading here and the metric
   * reading in `waterDepthM`.
   */
  waterDepth?: number | null;
  waterDepthM?: number | null;
  /** Max draught of vessel — falls back to the berth's draft column. */
  draftFt?: number | null;
  draftM?: number | null;
  /** Free-text labels copied from the sheet (e.g. "Bow Facing: East", "Pontoon: QUAY PT"). */
  bowFacing?: string | null;
  sidePontoon?: string | null;
  /** The OCR tier reads this from a value suffixed with `kW`. */
  powerCapacity?: number | null;
  /** The OCR tier reads this from a value suffixed with `V`. */
  voltage?: number | null;
  mooringType?: string | null;
  cleatType?: string | null;
  cleatCapacity?: string | null;
  bollardType?: string | null;
  bollardCapacity?: string | null;
  access?: string | null;
  /** High/low season rates in USD, per week and per day. */
  weeklyRateHighUsd?: number | null;
  weeklyRateLowUsd?: number | null;
  dailyRateHighUsd?: number | null;
  dailyRateLowUsd?: number | null;
  /** ISO date YYYY-MM-DD. */
  pricingValidUntil?: string | null;
  /** Purchase price; the OCR tier reads it from the "PURCHASE PRICE … USD" block. */
  price?: number | null;
}

/**
 * A single extracted value together with how much the parser trusts it and
 * which tier produced it.
 */
export interface ParsedField<T = unknown> {
  /** The extracted value, already coerced to the column's scalar type. */
  value: T;
  /** 0..1 confidence; 1 means "absolute match" (AcroForm or unambiguous regex). */
  confidence: number;
  /** Engine that produced this field; helps the diff dialog explain itself. */
  engine: ParserEngine;
}

/** Outcome of running one parser tier over a berth PDF. */
export interface ParseResult {
  /** Tier that produced `fields`. */
  engine: ParserEngine;
  /** Sparse — only fields the parser was able to extract. */
  fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
  /** Mean confidence across all extracted fields (0..1); 0 when nothing was extracted. */
  meanConfidence: number;
  /** Raw text the OCR or AI tier produced — useful for the diff dialog audit. */
  rawText?: string;
  /** Set when a tier degraded; the API surface uses this to decide whether to
   *  surface the "AI parse" button. Also carries parser notes such as
   *  imperial/metric mismatches and "OCR produced no text.". */
  warnings: string[];
}

// ─── magic-byte check (§14.6 critical) ───────────────────────────────────────

/**
 * Checks whether `buffer` opens with the five-byte PDF signature `%PDF-`.
 *
 * @param buffer Raw file contents.
 * @returns `true` only when the first five bytes are exactly `%PDF-`;
 *          buffers shorter than five bytes never match.
 */
export function isPdfMagic(buffer: Buffer): boolean {
  // ASCII codes for '%', 'P', 'D', 'F', '-'.
  const signature = [0x25, 0x50, 0x44, 0x46, 0x2d];
  if (buffer.length < signature.length) {
    return false;
  }
  return signature.every((expected, offset) => buffer[offset] === expected);
}

// ─── tier 1: AcroForm ────────────────────────────────────────────────────────

/**
 * AcroForm field name → ExtractedBerthFields key. Mirrors the names §4.7b
 * mentions ("length_ft", "mooring_number"…) plus a couple of tolerant aliases.
 *
 * Keys are written in lower case because `tryAcroForm` lower-cases the PDF
 * field name before looking it up. Aliases (`berth_number`, `pontoon`) point
 * at the same target as their canonical name, so when a PDF carries both, the
 * field visited last wins.
 */
const ACROFORM_FIELD_MAP: Record<string, keyof ExtractedBerthFields> = {
  mooring_number: 'mooringNumber',
  berth_number: 'mooringNumber',
  length_ft: 'lengthFt',
  length_m: 'lengthM',
  width_ft: 'widthFt',
  width_m: 'widthM',
  draft_ft: 'draftFt',
  draft_m: 'draftM',
  water_depth: 'waterDepth',
  water_depth_m: 'waterDepthM',
  bow_facing: 'bowFacing',
  side_pontoon: 'sidePontoon',
  pontoon: 'sidePontoon',
  power_capacity: 'powerCapacity',
  voltage: 'voltage',
  mooring_type: 'mooringType',
  cleat_type: 'cleatType',
  cleat_capacity: 'cleatCapacity',
  bollard_type: 'bollardType',
  bollard_capacity: 'bollardCapacity',
  access: 'access',
  weekly_rate_high_usd: 'weeklyRateHighUsd',
  weekly_rate_low_usd: 'weeklyRateLowUsd',
  daily_rate_high_usd: 'dailyRateHighUsd',
  daily_rate_low_usd: 'dailyRateLowUsd',
  pricing_valid_until: 'pricingValidUntil',
  price: 'price',
};

/**
 * Tier 1: read named AcroForm text fields from the PDF via pdf-lib.
 *
 * Each field name is lower-cased and looked up in `ACROFORM_FIELD_MAP`;
 * unmapped fields, non-text fields, blank values and values that fail
 * coercion are skipped. Every extracted field gets confidence 1.
 *
 * @param buffer Raw PDF bytes.
 * @returns A result with `engine: 'acroform'`, or `null` when the document
 *          cannot be loaded, has no form fields, or yields no usable values —
 *          the caller then falls through to the next tier.
 */
async function tryAcroForm(buffer: Buffer): Promise<ParseResult | null> {
  let doc: PDFDocument;
  try {
    doc = await PDFDocument.load(buffer, { ignoreEncryption: true });
  } catch {
    // Unreadable by pdf-lib: let the next tier have a go.
    return null;
  }
  let form: ReturnType<PDFDocument['getForm']>;
  try {
    form = doc.getForm();
  } catch {
    return null;
  }
  const fields = form.getFields();
  if (fields.length === 0) return null;

  const out: Partial<Record<keyof ExtractedBerthFields, ParsedField>> = {};
  for (const field of fields) {
    const pdfName = field.getName();
    const name = pdfName.toLowerCase();
    // Field names come from an uploaded document. Only accept keys the map
    // itself defines — a plain index lookup would also resolve inherited
    // properties such as `constructor` or `__proto__` to a truthy value and
    // write a garbage entry into the result.
    if (!Object.prototype.hasOwnProperty.call(ACROFORM_FIELD_MAP, name)) continue;
    const target = ACROFORM_FIELD_MAP[name];
    if (!target) continue;
    // pdf-lib doesn't expose a generic "get value" — narrow to text fields.
    let raw: string | undefined;
    try {
      raw = form.getTextField(pdfName).getText() ?? undefined;
    } catch {
      // Not a text field (checkbox, dropdown, …): nothing to read.
      continue;
    }
    const trimmed = raw?.trim();
    if (!trimmed) continue;
    const parsed = coerceFieldValue(target, trimmed);
    if (parsed === null) continue;
    out[target] = { value: parsed, confidence: 1, engine: 'acroform' };
  }

  if (Object.keys(out).length === 0) return null;
  return {
    engine: 'acroform',
    fields: out,
    meanConfidence: 1,
    warnings: [],
  };
}

// ─── tier 2: OCR via Tesseract ───────────────────────────────────────────────

/**
 * Seam between the parser and the OCR engine. `tryOcr` calls whatever adapter
 * it is handed (see `ParseBerthPdfOptions.ocrAdapter`) and otherwise falls
 * back to `defaultOcrAdapter()`, which loads tesseract.js through a dynamic
 * import. Keeping the `recognize()` call behind this interface lets unit
 * tests stub OCR without pulling in the WASM bundle.
 *
 * The adapter receives the uploaded buffer unchanged; any rasterization a
 * given engine needs is the adapter's responsibility.
 */
export interface OcrAdapter {
  /** Returns plain text + a 0..100 mean confidence score. */
  recognize(buffer: Buffer): Promise<{ text: string; confidence: number }>;
}

/**
 * Builds the production OCR adapter. tesseract.js is loaded through a dynamic
 * import inside `recognize`, so the WASM bundle is only pulled in when OCR
 * actually runs (and never into client builds).
 *
 * NOTE(review): the buffer handed to `recognize` is the raw uploaded PDF. The
 * original comment assumed tesseract.js rasterizes the first page of a PDF by
 * itself — confirm against the tesseract.js documentation on supported input
 * formats; if PDFs are not accepted, a rasterization step is needed here.
 *
 * @returns An adapter whose `recognize` yields the recognized text (empty
 *          string when absent) and Tesseract's confidence (0 when it is not
 *          a number).
 */
async function defaultOcrAdapter(): Promise<OcrAdapter> {
  const recognize: OcrAdapter['recognize'] = async (buffer: Buffer) => {
    const ocrModule = await import('tesseract.js');
    const { data } = await ocrModule.recognize(buffer, 'eng');
    const confidence = typeof data.confidence === 'number' ? data.confidence : 0;
    return { text: data.text ?? '', confidence };
  };
  return { recognize };
}

/**
 * Heuristic extraction from OCR text. The patterns mirror the layout
 * documented in plan §9.2:
 *
 *   - "Length: 206' 8" / 63m"
 *   - "Mooring: A12" or large "A1" near "BERTH NUMBER"
 *   - "WEEK HIGH / LOW" and "DAY HIGH / LOW" pricing blocks
 *   - "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL <date>"
 *
 * Non-breaking spaces are folded to plain spaces first, because several
 * label patterns use character classes that only list the ASCII space.
 *
 * @param rawText Text produced by the OCR engine.
 * @returns `fields` — every value that matched, each tagged `engine: 'ocr'`
 *          with a fixed per-pattern confidence; `warnings` — imperial/metric
 *          disagreements above 1% and dates that could not be normalized.
 */
export function extractFromOcrText(rawText: string): {
  fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
  warnings: string[];
} {
  const warnings: string[] = [];
  const out: Partial<Record<keyof ExtractedBerthFields, ParsedField>> = {};

  /** Records one OCR-derived value. */
  const put = (key: keyof ExtractedBerthFields, value: unknown, confidence: number): void => {
    out[key] = { value, confidence, engine: 'ocr' };
  };
  /** "11,341" → 11341. */
  const usd = (digits: string | undefined): number => Number((digits ?? '').replace(/,/g, ''));

  // Normalize whitespace for line-based regexes but keep structure. The NBSP
  // is spelled as an escape so the intent survives editors and copy/paste.
  const text = rawText.replace(/\u00a0/g, ' ');

  // Mooring number: BERTH NUMBER block. We try a couple of layouts.
  const mooringMatch =
    text.match(/BERTH\s+NUMBER[\s\S]{0,80}?\b([A-Z]\d{1,3})\b/i) ??
    text.match(/^\s*([A-Z]\d{1,3})\s*$/m) ??
    text.match(/Mooring(?:\s+Number)?\s*[:#]?\s*([A-Z]\d{1,3})/i);
  const mooring = mooringMatch?.[1];
  if (mooring !== undefined) {
    put('mooringNumber', mooring.toUpperCase(), 0.85);
  }

  // Length / Width / Water Depth — `Label: <imperial> / <metric>` form.
  // Imperial may be `206' 8"` style; we capture the numeric prefix in feet
  // and parse the metric independently because they're rarely lossless.
  const dimensional = (
    label: string,
    ftKey: keyof ExtractedBerthFields,
    mKey: keyof ExtractedBerthFields,
  ): void => {
    const re = new RegExp(
      `${label}\\s*[:.]?\\s*([0-9]+(?:'\\s*[0-9]+\")?(?:\\.[0-9]+)?)\\s*(?:ft)?\\s*\\/\\s*([0-9]+(?:\\.[0-9]+)?)\\s*m`,
      'i',
    );
    const m = text.match(re);
    if (!m) return;
    const ft = parseFeetInches(m[1] ?? '');
    const meters = Number(m[2]);
    const hasFeet = ft != null && Number.isFinite(ft);
    const hasMeters = Number.isFinite(meters);
    if (hasFeet) put(ftKey, ft, 0.8);
    if (hasMeters) put(mKey, meters, 0.85);
    // Flag sheets whose two readings disagree by more than 1%.
    if (ft != null && hasMeters && Math.abs(ft * 0.3048 - meters) / meters > 0.01) {
      warnings.push(
        `${label}: imperial/metric mismatch — ${ft}ft vs ${meters}m differ >1% (using imperial as source of truth).`,
      );
    }
  };
  dimensional('Length', 'lengthFt', 'lengthM');
  dimensional('Width', 'widthFt', 'widthM');
  dimensional('Water\\s+Depth', 'waterDepth', 'waterDepthM');
  // Max draft of vessel maps to the berth's draft column.
  dimensional('Max\\.?\\s*draught(?:\\s+of\\s+vessel)?', 'draftFt', 'draftM');

  // Singular labels (`Bow Facing: East`, `Pontoon: QUAY PT`).
  const labelToKey: Array<[RegExp, keyof ExtractedBerthFields]> = [
    [/Bow\s+Facing\s*[:.]?\s*([A-Za-z .]+?)(?:\n|$)/i, 'bowFacing'],
    [/Pontoon\s*[:.]?\s*([A-Za-z0-9 .\-]+?)(?:\n|$)/i, 'sidePontoon'],
    [/Mooring\s+Type\s*[:.]?\s*([A-Za-z0-9 \-\/]+?)(?:\n|$)/i, 'mooringType'],
    [/Cleat\s+Type\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'cleatType'],
    [/Cleat\s+Capacity\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'cleatCapacity'],
    [/Bollard\s+Type\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'bollardType'],
    [/Bollard\s+Capacity\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'bollardCapacity'],
    [/Access\s*[:.]?\s*([A-Za-z0-9 .,()\-]+?)(?:\n|$)/i, 'access'],
  ];
  for (const [re, key] of labelToKey) {
    const captured = text.match(re)?.[1];
    if (captured) put(key, captured.trim(), 0.75);
  }

  // Power Capacity (kW) and Voltage at 60Hz.
  const powerMatch = text.match(/Power\s+Capacity\s*[:.]?\s*([0-9]+(?:\.[0-9]+)?)\s*kW/i);
  if (powerMatch) {
    put('powerCapacity', Number(powerMatch[1]), 0.85);
  }
  const voltageMatch = text.match(/Voltage(?:\s+at\s+60\s*Hz)?\s*[:.]?\s*([0-9]+)\s*V/i);
  if (voltageMatch) {
    put('voltage', Number(voltageMatch[1]), 0.85);
  }

  // Pricing: "WEEK HIGH / LOW: 11,341 USD / 8,100 USD"
  const weekMatch = text.match(
    /WEEK\s+HIGH\s*\/\s*LOW[:.\s]*([0-9,]+)\s*USD\s*\/\s*([0-9,]+)\s*USD/i,
  );
  if (weekMatch) {
    put('weeklyRateHighUsd', usd(weekMatch[1]), 0.8);
    put('weeklyRateLowUsd', usd(weekMatch[2]), 0.8);
  }
  const dayMatch = text.match(
    /DAY\s+HIGH\s*\/\s*LOW[:.\s]*([0-9,]+)\s*USD\s*\/\s*([0-9,]+)\s*USD/i,
  );
  if (dayMatch) {
    put('dailyRateHighUsd', usd(dayMatch[1]), 0.8);
    put('dailyRateLowUsd', usd(dayMatch[2]), 0.8);
  }

  // Purchase price: "PURCHASE PRICE:\nFEE SIMPLE OR STRATA LOT\n3,880,800 USD"
  const priceMatch = text.match(/PURCHASE\s+PRICE[\s\S]{0,80}?([0-9][0-9,]+)\s*USD/i);
  if (priceMatch) {
    put('price', usd(priceMatch[1]), 0.7);
  }

  // Pricing validity: "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL SEPTEMBER 15TH, 2025"
  const validityMatch = text.match(
    /CONFIRMED\s+THROUGH\s+UNTIL\s+([A-Za-z]+\s+[0-9]{1,2})(?:[A-Z]{2})?,?\s+([0-9]{4})/i,
  );
  if (validityMatch) {
    const iso = parseHumanDate(`${validityMatch[1]} ${validityMatch[2]}`);
    if (iso) {
      put('pricingValidUntil', iso, 0.75);
    } else {
      warnings.push(
        'Could not normalize "CONFIRMED THROUGH UNTIL" date; pricing_valid_until skipped.',
      );
    }
  }

  return { fields: out, warnings };
}

/**
 * Tier 2: run OCR over the buffer and extract fields from the text.
 *
 * The engine's overall confidence (0..100) is normalized to 0..1 and used as
 * an upper bound on every field's regex-assigned confidence. The bound never
 * drops below 0.5, so a poor scan lowers confidence without zeroing it.
 *
 * @param buffer  Raw PDF bytes, passed to the adapter unchanged.
 * @param adapter OCR engine to use; defaults to `defaultOcrAdapter()`.
 * @returns An `engine: 'ocr'` result. When the engine returns no text the
 *          result has no fields, confidence 0 and an explanatory warning.
 *          (The `null` in the signature is kept for callers; this
 *          implementation never returns it.)
 */
async function tryOcr(buffer: Buffer, adapter?: OcrAdapter): Promise<ParseResult | null> {
  const ocr = adapter ?? (await defaultOcrAdapter());
  const result = await ocr.recognize(buffer);
  if (!result.text) {
    return {
      engine: 'ocr',
      fields: {},
      meanConfidence: 0,
      rawText: '',
      warnings: ['OCR produced no text.'],
    };
  }
  const { fields, warnings } = extractFromOcrText(result.text);

  // A custom adapter may report NaN (or nothing at all). Treat that as zero
  // confidence; otherwise NaN would poison every field and the mean, and a
  // NaN mean compares false against any threshold downstream.
  const engineConfidence =
    typeof result.confidence === 'number' && !Number.isNaN(result.confidence)
      ? result.confidence
      : 0;
  // Tesseract gives 0..100; normalize to 0..1 and cap each field with it
  // (but never below 0.5) — per-field confidence is set by the regex tier.
  const normalized = Math.max(0, Math.min(engineConfidence, 100)) / 100;
  const cap = Math.max(normalized, 0.5);

  let total = 0;
  let count = 0;
  for (const key of Object.keys(fields) as Array<keyof ExtractedBerthFields>) {
    const field = fields[key];
    count += 1;
    if (!field) continue;
    field.confidence = Math.min(field.confidence, cap);
    total += field.confidence;
  }
  const meanConfidence = count === 0 ? 0 : total / count;

  return {
    engine: 'ocr',
    fields,
    meanConfidence,
    rawText: result.text,
    warnings,
  };
}

// ─── tier 3: AI fallback ─────────────────────────────────────────────────────

/** Confidence floor below which we recommend the AI tier in the diff dialog. */
export const OCR_LOW_CONFIDENCE_THRESHOLD = 0.55;

/**
 * True when the rep should be offered an "AI parse" button.
 *
 * Only OCR results qualify. The button is offered when OCR extracted nothing,
 * when the mean confidence is below `OCR_LOW_CONFIDENCE_THRESHOLD`, or when
 * the mean is NaN — an unusable score is treated as low confidence rather
 * than silently comparing false against the threshold.
 *
 * @param parse Result of a previous parse.
 * @returns Whether the AI tier should be suggested.
 */
export function shouldOfferAiTier(parse: ParseResult): boolean {
  if (parse.engine !== 'ocr') return false;
  if (Object.keys(parse.fields).length === 0) return true;
  if (Number.isNaN(parse.meanConfidence)) return true;
  return parse.meanConfidence < OCR_LOW_CONFIDENCE_THRESHOLD;
}

// ─── public entry point ──────────────────────────────────────────────────────

/** Optional knobs for `parseBerthPdf`; every property may be omitted. */
export interface ParseBerthPdfOptions {
  /** Override Tesseract for testing. Production flows resolve the default. */
  ocrAdapter?: OcrAdapter;
  /**
   * Skip the OCR tier when only AcroForm is wanted (e.g. unit tests). If the
   * AcroForm tier finds nothing, `parseBerthPdf` then returns an empty
   * `engine: 'ocr'` result carrying a warning instead of running OCR.
   */
  skipOcr?: boolean;
}

/**
 * Entry point: parse a per-berth PDF buffer, trying AcroForm first and OCR
 * second. The `engine` on the returned result says which tier supplied the
 * fields (the reconcile-diff dialog uses it to colour confidence chips).
 *
 * The AI tier is deliberately not reachable from here — it is a separate,
 * rep-triggered action in the diff dialog so OPENAI_API_KEY isn't spent on
 * every upload.
 *
 * @param buffer Raw PDF bytes.
 * @param opts   See `ParseBerthPdfOptions`.
 * @returns The first tier result that produced fields, otherwise an empty
 *          `engine: 'ocr'` result whose `warnings` explain why.
 * @throws Error when the buffer does not start with `%PDF-`.
 */
export async function parseBerthPdf(
  buffer: Buffer,
  opts: ParseBerthPdfOptions = {},
): Promise<ParseResult> {
  if (!isPdfMagic(buffer)) {
    throw new Error('PDF magic-byte check failed: file does not begin with %PDF-');
  }

  // Shared shape for "nothing extracted" outcomes.
  const emptyOcrResult = (warning: string): ParseResult => ({
    engine: 'ocr',
    fields: {},
    meanConfidence: 0,
    warnings: [warning],
  });

  const acroResult = await tryAcroForm(buffer);
  if (acroResult != null && Object.keys(acroResult.fields).length !== 0) {
    return acroResult;
  }

  if (opts.skipOcr) {
    return emptyOcrResult('skipOcr=true; no AcroForm fields found.');
  }

  const ocrResult = await tryOcr(buffer, opts.ocrAdapter);
  if (ocrResult == null) {
    return emptyOcrResult('OCR adapter returned null.');
  }
  return ocrResult;
}

// ─── helpers ─────────────────────────────────────────────────────────────────

/**
 * Coerce an AcroForm raw value to the right scalar for the target column.
 *
 * - Text columns are returned unchanged.
 * - `pricingValidUntil` is kept when already `YYYY-MM-DD`, otherwise passed
 *   through `parseHumanDate` (which may yield `null`).
 * - Every other column is numeric: currency symbols, unit suffixes and
 *   thousands separators are stripped before conversion.
 *
 * @param key Target column.
 * @param raw Trimmed field text.
 * @returns The coerced value, or `null` when the text cannot be interpreted.
 */
function coerceFieldValue(key: keyof ExtractedBerthFields, raw: string): string | number | null {
  switch (key) {
    case 'pricingValidUntil':
      // Accept ISO YYYY-MM-DD as-is; otherwise try a humane parse.
      return /^\d{4}-\d{2}-\d{2}$/.test(raw) ? raw : parseHumanDate(raw);
    case 'mooringNumber':
    case 'bowFacing':
    case 'sidePontoon':
    case 'mooringType':
    case 'cleatType':
    case 'cleatCapacity':
    case 'bollardType':
    case 'bollardCapacity':
    case 'access':
      return raw;
    default: {
      // Numeric columns: strip currency / unit suffixes and commas.
      const stripped = raw.replace(/[^0-9.\-]/g, '');
      // Text without a single digit ("N/A", "TBD", "USD") strips down to ''
      // and `Number('')` is 0 — report "no value" instead of a bogus zero.
      if (!/[0-9]/.test(stripped)) return null;
      const numeric = Number(stripped);
      return Number.isFinite(numeric) ? numeric : null;
    }
  }
}

/**
 * Parse a human date like "September 15 2025" → "2025-09-15".
 *
 * Recognised directly (case-insensitive, ordinal suffix and comma optional):
 *   - "September 15th, 2025" / "Sept. 15 2025"  (month first)
 *   - "15 September 2025"                        (day first)
 *   - "2025-09-15"                               (already ISO)
 * Month names are matched on their first three letters. These forms are
 * validated, so an impossible day such as "February 30 2025" yields `null`
 * instead of rolling over into March.
 *
 * Anything else falls back to the runtime's `Date` parser, interpreted as
 * UTC; that path is implementation-defined and is kept only so inputs that
 * parsed before keep parsing.
 *
 * @param raw Date text, e.g. captured from an OCR line.
 * @returns `YYYY-MM-DD`, or `null` when the text is not a valid date.
 */
export function parseHumanDate(raw: string): string | null {
  const monthPrefixes = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
  ];

  /** Builds the ISO string, rejecting component combinations that do not exist. */
  const toIso = (year: number, monthIndex: number, day: number): string | null => {
    const date = new Date(Date.UTC(year, monthIndex, day));
    const roundTrips =
      date.getUTCFullYear() === year &&
      date.getUTCMonth() === monthIndex &&
      date.getUTCDate() === day;
    return roundTrips ? date.toISOString().slice(0, 10) : null;
  };

  const cleaned = raw.replace(/(\d+)(st|nd|rd|th)/i, '$1').trim();

  const iso = cleaned.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (iso) {
    return toIso(Number(iso[1]), Number(iso[2]) - 1, Number(iso[3]));
  }

  const monthFirst = cleaned.match(/^([A-Za-z]{3,})\.?\s+(\d{1,2}),?\s+(\d{4})$/);
  const dayFirst = cleaned.match(/^(\d{1,2})\s+([A-Za-z]{3,})\.?,?\s+(\d{4})$/);
  const named = monthFirst
    ? { month: monthFirst[1], day: monthFirst[2], year: monthFirst[3] }
    : dayFirst
      ? { month: dayFirst[2], day: dayFirst[1], year: dayFirst[3] }
      : null;
  if (named) {
    const monthIndex = monthPrefixes.indexOf((named.month ?? '').slice(0, 3).toLowerCase());
    if (monthIndex !== -1) {
      return toIso(Number(named.year), monthIndex, Number(named.day));
    }
  }

  // Legacy path. Appending " UTC" forces a UTC interpretation; without an
  // explicit zone the text is parsed in the runner's local TZ and
  // `toISOString()` can shift the day by ±1.
  const fallback = new Date(cleaned + ' UTC');
  if (Number.isNaN(fallback.getTime())) return null;
  return fallback.toISOString().slice(0, 10);
}

/**
 * Convert "206' 8\"" or "82" → 206.667 / 82. Returns null on parse failure.
 *
 * Accepted forms:
 *   - feet and inches: `206' 8"`, `206' 8.5"`, `206' 8` (closing mark optional)
 *   - feet only, optionally decimal: `82`, `82.5` (anything after the leading
 *     number is ignored)
 * Typographic quotes and prime marks (’ ′ ” ″) are treated like their ASCII
 * counterparts, since OCR output frequently contains them.
 *
 * @param raw Imperial length text.
 * @returns The length in decimal feet, or `null` when no leading number is found.
 */
export function parseFeetInches(raw: string): number | null {
  const normalized = raw
    .trim()
    .replace(/[\u2018\u2019\u2032]/g, "'")
    .replace(/[\u201c\u201d\u2033]/g, '"');

  // Feet + inches. Inches may be fractional and the closing quote may be
  // missing; previously both cases silently dropped the inches.
  const ftIn = normalized.match(/^([0-9]+)\s*'\s*([0-9]+(?:\.[0-9]+)?)\s*"?$/);
  if (ftIn) {
    return Number(ftIn[1]) + Number(ftIn[2]) / 12;
  }

  const ftOnly = normalized.match(/^([0-9]+(?:\.[0-9]+)?)/);
  return ftOnly ? Number(ftOnly[1]) : null;
}
-												feat(berths): per-berth PDF storage (versioned) + reverse parser

Phase 6b of the berth-recommender refactor (see
docs/berth-recommender-and-pdf-plan.md §3.2, §3.3, §4.7b, §11.1, §14.6).
Builds on the Phase 6a pluggable storage backend (commit 83693dd) — every
file write goes through `getStorageBackend()`; no direct minio imports.

Schema (migration 0030_berth_pdf_versions):
  - new table `berth_pdf_versions` with monotonic `version_number` per
    berth, `storage_key` (renamed convention from §4.7a), sha256, size,
    `download_url_expires_at` cache slot for §11.1 signed-URL throttling,
    and `parse_results` jsonb for the audit trail.
  - new column `berths.current_pdf_version_id` (deferred from Phase 0)
    with FK to `berth_pdf_versions(id)` ON DELETE SET NULL.
  - relations + types exported from `schema/berths.ts`.

3-tier reverse parser (`lib/services/berth-pdf-parser.ts`):
  1. AcroForm via pdf-lib — pulls named fields (`length_ft`,
     `mooring_number`, etc.) at confidence 1. Sample PDF has 0 such
     fields, so this is defensive coverage for future templates.
  2. OCR via Tesseract.js — positional/regex heuristics keyed off the
     §9.2 layout (Length/Width/Water Depth as `<imperial> / <metric>`,
     `WEEK HIGH / LOW`, `CONFIRMED THROUGH UNTIL <date>`, etc.). Returns
     per-field confidence + global mean; flags imperial-vs-metric drift
     >1% in `warnings`.
  3. AI fallback — gated via `getResolvedOcrConfig()` (existing
     openai/claude provider). Surfaced from the diff dialog only when
     `shouldOfferAiTier()` returns true (OCR extracted no fields, or mean
     OCR confidence is below the 0.55 threshold), so OPENAI_API_KEY isn't
     burned on every upload.

Service layer (`lib/services/berth-pdf.service.ts`):
  - `uploadBerthPdf()` — magic-byte check, size cap, version-number
    bump + current pointer in one transaction.
  - `reconcilePdfWithBerth()` — auto-applies fields where CRM is null;
    flags conflicts when CRM and PDF disagree; tolerates ±1% on numeric
    columns; warns on mooring-number-in-PDF mismatch (§14.6).
  - `applyParseResults()` — hard allowlist of writable columns;
    stamps `appliedFields` onto `parse_results` for audit.
  - `rollbackToVersion()` — pointer flip only, never re-parses (§14.6).
  - `listBerthPdfVersions()` — version list with 15-min signed URLs.
  - `getMaxUploadMb()` — port-override → global → default 15 lookup
    on `system_settings.berth_pdf_max_upload_mb`.

§14.6 critical mitigations:
  - Magic-byte check (`%PDF-`) on every upload; mismatch deletes the
    storage object and rejects the request.
  - Size cap from `system_settings.berth_pdf_max_upload_mb` (default
    15 MB); enforced in the upload-url presign AND server-side.
  - 0-byte uploads rejected.
  - Mooring-number mismatch surfaces as a `warnings[]` entry on the
    reconcile result so the rep sees it in the diff dialog.
  - Imperial vs metric ±1% tolerance in both the parser warnings and
    the reconcile equality check.
  - Path traversal already blocked at the storage layer (Phase 6a).

API + UI:
  - `POST /api/v1/berths/[id]/pdf-upload-url` — presigned URL (S3) or
    HMAC-signed proxy URL (filesystem) sized to the per-port cap.
  - `POST /api/v1/berths/[id]/pdf-versions` — verifies the upload via
    `backend.head()`, writes the row, bumps `current_pdf_version_id`.
  - `GET /api/v1/berths/[id]/pdf-versions` — version list + signed URLs.
  - `POST /api/v1/berths/[id]/pdf-versions/[versionId]/rollback`.
  - `POST /api/v1/berths/[id]/pdf-versions/parse-results/apply` —
    rep-confirmed diff payload.
  - New "Documents" tab on the berth detail page (`berth-tabs.tsx`)
    with current-PDF panel, version history, Replace PDF button, and
    `<PdfReconcileDialog>` for the auto-applied + conflicts UX.

System settings:
  - `berth_pdf_max_upload_mb` (default 15) — caps presigned-upload size
    + server-side validation. Resolved port-override → global → default.

Tests:
  - `tests/unit/services/berth-pdf-parser.test.ts` — magic bytes,
    feet-inches, human dates, full §9.2-shaped OCR text → 18 fields,
    drift warning, AI-tier gate.
  - `tests/unit/services/berth-pdf-acroform.test.ts` — synthetic
    pdf-lib AcroForm round-trip.
  - `tests/integration/berth-pdf-versions.test.ts` — upload, version-
    number bump, magic-byte rejection, reconcile auto-applied vs
    conflicts vs ±1% tolerance, mooring-number warning,
    applyParseResults allowlist enforcement, rollback semantics.

Acceptance: `pnpm exec tsc --noEmit` clean, `pnpm exec vitest run`
green at 1103/1103.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-05 03:34:24 +02:00
+								/**
 								 * Reverse parser for per-berth PDFs (Phase 6b — see plan §4.7b and §9.2).
 								 *
 								 * Three tiers, each falling back to the next:
 								 *
 								 *   1. AcroForm — read named text fields via pdf-lib. The sample
 								 *      `Berth_Spec_Sheet_A1.pdf` has 0 AcroForm fields (designers export the
 								 *      PDF flat), so this tier is built defensively for future templates that
 								 *      may include named form fields. When fields exist, this is the highest-
 								 *      confidence path because there's no OCR loss.
 								 *
 								 *   2. OCR — Tesseract.js extracts text from the page; positional/regex
 								 *      heuristics keyed off the labels documented in §9.2 pull out values.
 								 *      Returns per-field confidence scores.
 								 *
 								 *   3. AI fallback — gated on `getResolvedOcrConfig(...)` returning a usable
 								 *      OpenAI/Claude config. Only invoked when OCR confidence is below
 								 *      threshold for too many fields AND the rep opts in via the diff dialog.
 								 *      A null `apiKey` causes this tier to return a clear "not configured"
 								 *      error rather than silently falling back to OCR-only.
 								 */
 								import { PDFDocument } from 'pdf-lib';
 								// ─── shared types ────────────────────────────────────────────────────────────
 								export type ParserEngine = 'acroform' | 'ocr' | 'ai';
 								/**
 								 * Canonical extracted shape. Keys map 1:1 to nullable columns on the `berths`
 								 * table; `mooringNumber` is special (used for the §14.6 mismatch warning).
 								 */
 								export interface ExtractedBerthFields {
 								  mooringNumber?: string | null;
 								  lengthFt?: number | null;
 								  lengthM?: number | null;
 								  widthFt?: number | null;
 								  widthM?: number | null;
 								  /** Water depth at the berth (separate from a vessel's max draft). */
 								  waterDepth?: number | null;
 								  waterDepthM?: number | null;
 								  /** Max draught of vessel — falls back to the berth's draft column. */
 								  draftFt?: number | null;
 								  draftM?: number | null;
 								  bowFacing?: string | null;
 								  sidePontoon?: string | null;
 								  powerCapacity?: number | null;
 								  voltage?: number | null;
 								  mooringType?: string | null;
 								  cleatType?: string | null;
 								  cleatCapacity?: string | null;
 								  bollardType?: string | null;
 								  bollardCapacity?: string | null;
 								  access?: string | null;
 								  weeklyRateHighUsd?: number | null;
 								  weeklyRateLowUsd?: number | null;
 								  dailyRateHighUsd?: number | null;
 								  dailyRateLowUsd?: number | null;
 								  /** ISO date YYYY-MM-DD. */
 								  pricingValidUntil?: string | null;
 								  price?: number | null;
 								}
 								export interface ParsedField<T = unknown> {
 								  value: T;
 								  /** 0..1 confidence; 1 means "absolute match" (AcroForm or unambiguous regex). */
 								  confidence: number;
 								  /** Engine that produced this field; helps the diff dialog explain itself. */
 								  engine: ParserEngine;
 								}
 								export interface ParseResult {
 								  engine: ParserEngine;
 								  /** Sparse — only fields the parser was able to extract. */
 								  fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
 								  /** Mean confidence across all extracted fields (0..1). */
 								  meanConfidence: number;
 								  /** Raw text the OCR or AI tier produced — useful for the diff dialog audit. */
 								  rawText?: string;
 								  /** Set when a tier degraded; the API surface uses this to decide whether to
 								   *  surface the "AI parse" button. */
 								  warnings: string[];
 								}
 								// ─── magic-byte check (§14.6 critical) ───────────────────────────────────────
 								/** Reads first 5 bytes; returns true iff they are `%PDF-`. */
 								export function isPdfMagic(buffer: Buffer): boolean {
 								  if (buffer.length < 5) return false;
 								  return (
 								    buffer[0] === 0x25 && // %
 								    buffer[1] === 0x50 && // P
 								    buffer[2] === 0x44 && // D
 								    buffer[3] === 0x46 && // F
 								    buffer[4] === 0x2d //   -
 								  );
 								}
 								// ─── tier 1: AcroForm ────────────────────────────────────────────────────────
 								/**
 								 * AcroForm field name → ExtractedBerthFields key. Mirrors the names §4.7b
 								 * mentions ("length_ft", "mooring_number"…) plus a couple of tolerant aliases.
 								 */
 								const ACROFORM_FIELD_MAP: Record<string, keyof ExtractedBerthFields> = {
 								  mooring_number: 'mooringNumber',
 								  berth_number: 'mooringNumber',
 								  length_ft: 'lengthFt',
 								  length_m: 'lengthM',
 								  width_ft: 'widthFt',
 								  width_m: 'widthM',
 								  draft_ft: 'draftFt',
 								  draft_m: 'draftM',
 								  water_depth: 'waterDepth',
 								  water_depth_m: 'waterDepthM',
 								  bow_facing: 'bowFacing',
 								  side_pontoon: 'sidePontoon',
 								  pontoon: 'sidePontoon',
 								  power_capacity: 'powerCapacity',
 								  voltage: 'voltage',
 								  mooring_type: 'mooringType',
 								  cleat_type: 'cleatType',
 								  cleat_capacity: 'cleatCapacity',
 								  bollard_type: 'bollardType',
 								  bollard_capacity: 'bollardCapacity',
 								  access: 'access',
 								  weekly_rate_high_usd: 'weeklyRateHighUsd',
 								  weekly_rate_low_usd: 'weeklyRateLowUsd',
 								  daily_rate_high_usd: 'dailyRateHighUsd',
 								  daily_rate_low_usd: 'dailyRateLowUsd',
 								  pricing_valid_until: 'pricingValidUntil',
 								  price: 'price',
 								};
 								async function tryAcroForm(buffer: Buffer): Promise<ParseResult | null> {
 								  let doc: PDFDocument;
 								  try {
 								    doc = await PDFDocument.load(buffer, { ignoreEncryption: true });
 								  } catch {
 								    return null;
 								  }
 								  let form: ReturnType<PDFDocument['getForm']>;
 								  try {
 								    form = doc.getForm();
 								  } catch {
 								    return null;
 								  }
 								  const fields = form.getFields();
 								  if (fields.length === 0) return null;
 								  const out: Partial<Record<keyof ExtractedBerthFields, ParsedField>> = {};
 								  for (const field of fields) {
 								    const name = field.getName().toLowerCase();
 								    const target = ACROFORM_FIELD_MAP[name];
 								    if (!target) continue;
 								    // pdf-lib doesn't expose a generic "get value" — narrow to text fields.
 								    let raw: string | undefined;
 								    try {
 								      const tf = form.getTextField(field.getName());
 								      raw = tf.getText() ?? undefined;
 								    } catch {
 								      continue;
 								    }
 								    if (!raw || raw.trim().length === 0) continue;
 								    const parsed = coerceFieldValue(target, raw.trim());
 								    if (parsed === null) continue;
 								    out[target] = { value: parsed, confidence: 1, engine: 'acroform' };
 								  }
 								  if (Object.keys(out).length === 0) return null;
 								  return {
 								    engine: 'acroform',
 								    fields: out,
 								    meanConfidence: 1,
 								    warnings: [],
 								  };
 								}
 								// ─── tier 2: OCR via Tesseract ───────────────────────────────────────────────
 								/**
 								 * Runs Tesseract against a PDF rasterized to one image per page. Tesseract.js
 								 * accepts image inputs; we use a lazy `pdfjs-dist`-style rasterization fallback
 								 * via dynamic import. To keep the parser unit-testable without a WASM bundle,
 								 * the actual recognize() call is encapsulated in the `runOcr` adapter that
 								 * production wires to tesseract.js and tests can stub.
 								 */
 								export interface OcrAdapter {
 								  /** Returns plain text + a 0..100 mean confidence score. */
 								  recognize(buffer: Buffer): Promise<{ text: string; confidence: number }>;
 								}
 								/** Default adapter — dynamically imports tesseract.js so the WASM bundle isn't
 								 *  pulled into client builds. */
 								async function defaultOcrAdapter(): Promise<OcrAdapter> {
 								  return {
 								    recognize: async (buffer: Buffer) => {
 								      const tesseract = await import('tesseract.js');
 								      // Tesseract handles PDF inputs by rasterizing the first page; for our
 								      // single-page spec sheets that's sufficient.
 								      const result = await tesseract.recognize(buffer, 'eng');
 								      return {
 								        text: result.data.text ?? '',
 								        confidence: typeof result.data.confidence === 'number' ? result.data.confidence : 0,
 								      };
 								    },
 								  };
 								}
 								/**
 								 * Heuristic extraction from OCR text. The patterns mirror the layout
 								 * documented in plan §9.2:
 								 *
 								 *   - "Length: 206' 8" / 63m"
 								 *   - "Mooring: A12" or large "A1" near "BERTH NUMBER"
 								 *   - "WEEK HIGH / LOW" and "DAY HIGH / LOW" pricing blocks
 								 *   - "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL <date>"
 								 *
 								 * Every field that matches is written with `engine: 'ocr'` and a fixed
 								 * per-pattern confidence; fields that do not match are simply absent.
 								 *
 								 * @param rawText Text as returned by the OCR engine.
 								 * @returns `fields` keyed by `ExtractedBerthFields` column, and `warnings`
 								 *   for values that were found but look inconsistent (imperial/metric
 								 *   mismatch above 1%) or could not be normalized (validity date).
 								 */
 								export function extractFromOcrText(rawText: string): {
 								  fields: Partial<Record<keyof ExtractedBerthFields, ParsedField>>;
 								  warnings: string[];
 								} {
 								  const warnings: string[] = [];
 								  const out: Partial<Record<keyof ExtractedBerthFields, ParsedField>> = {};
 								  // Turn non-breaking spaces into plain spaces. `\s` already matches NBSP,
 								  // but the literal-space character classes below (e.g. `[A-Za-z .]`) do
 								  // not. The escape is spelled out so the intent survives editors and
 								  // formatters that normalise invisible characters.
 								  const text = rawText.replace(/\u00A0/g, ' ');
 								  // "11,341" → 11341. Callers only pass captures that start with a digit.
 								  const toAmount = (digits: string): number => Number(digits.replace(/,/g, ''));
 								  // Mooring number: BERTH NUMBER block. We try a couple of layouts, most
 								  // specific first; the bare "A1 on its own line" form is case-sensitive.
 								  const mooringMatch =
 								    text.match(/BERTH\s+NUMBER[\s\S]{0,80}?\b([A-Z]\d{1,3})\b/i) ??
 								    text.match(/^\s*([A-Z]\d{1,3})\s*$/m) ??
 								    text.match(/Mooring(?:\s+Number)?\s*[:#]?\s*([A-Z]\d{1,3})/i);
 								  if (mooringMatch) {
 								    out.mooringNumber = { value: mooringMatch[1]!.toUpperCase(), confidence: 0.85, engine: 'ocr' };
 								  }
 								  // Length / Width / Water Depth — `Label: <imperial> / <metric>` form.
 								  // Imperial may be `206' 8"`, `206'` or a plain/decimal number; the metric
 								  // side is parsed independently because the two are rarely lossless.
 								  // `labelPattern` is regex source; `displayLabel` is the human-readable
 								  // name used in warnings so regex escapes never leak into messages.
 								  const dimensional = (
 								    labelPattern: string,
 								    displayLabel: string,
 								    ftKey: keyof ExtractedBerthFields,
 								    mKey: keyof ExtractedBerthFields,
 								  ) => {
 								    const re = new RegExp(
 								      `${labelPattern}\\s*[:.]?\\s*([0-9]+(?:'(?:\\s*[0-9]+")?)?(?:\\.[0-9]+)?)\\s*(?:ft)?\\s*\\/\\s*([0-9]+(?:\\.[0-9]+)?)\\s*m`,
 								      'i',
 								    );
 								    const m = text.match(re);
 								    if (!m) return;
 								    const ft = parseFeetInches(m[1]!);
 								    const meters = Number(m[2]);
 								    if (ft != null && Number.isFinite(ft)) {
 								      out[ftKey] = { value: ft, confidence: 0.8, engine: 'ocr' } as ParsedField;
 								    }
 								    if (Number.isFinite(meters)) {
 								      out[mKey] = { value: meters, confidence: 0.85, engine: 'ocr' } as ParsedField;
 								    }
 								    // 1 ft = 0.3048 m; flag pairs that disagree by more than 1%.
 								    if (ft != null && Number.isFinite(meters) && Math.abs(ft * 0.3048 - meters) / meters > 0.01) {
 								      warnings.push(
 								        `${displayLabel}: imperial/metric mismatch — ${ft}ft vs ${meters}m differ >1% (using imperial as source of truth).`,
 								      );
 								    }
 								  };
 								  dimensional('Length', 'Length', 'lengthFt', 'lengthM');
 								  dimensional('Width', 'Width', 'widthFt', 'widthM');
 								  dimensional('Water\\s+Depth', 'Water Depth', 'waterDepth', 'waterDepthM');
 								  // Max draft of vessel maps to the berth's draft column.
 								  dimensional('Max\\.?\\s*draught(?:\\s+of\\s+vessel)?', 'Max draught', 'draftFt', 'draftM');
 								  // Singular labels (`Bow Facing: East`, `Pontoon: QUAY PT`). Each capture
 								  // is lazy and stops at the first newline or end of text.
 								  const labelToKey: Array<[RegExp, keyof ExtractedBerthFields]> = [
 								    [/Bow\s+Facing\s*[:.]?\s*([A-Za-z .]+?)(?:\n|$)/i, 'bowFacing'],
 								    [/Pontoon\s*[:.]?\s*([A-Za-z0-9 .\-]+?)(?:\n|$)/i, 'sidePontoon'],
 								    [/Mooring\s+Type\s*[:.]?\s*([A-Za-z0-9 \-\/]+?)(?:\n|$)/i, 'mooringType'],
 								    [/Cleat\s+Type\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'cleatType'],
 								    [/Cleat\s+Capacity\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'cleatCapacity'],
 								    [/Bollard\s+Type\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'bollardType'],
 								    [/Bollard\s+Capacity\s*[:.]?\s*([A-Za-z0-9 \-]+?)(?:\n|$)/i, 'bollardCapacity'],
 								    [/Access\s*[:.]?\s*([A-Za-z0-9 .,()\-]+?)(?:\n|$)/i, 'access'],
 								  ];
 								  for (const [re, key] of labelToKey) {
 								    const m = text.match(re);
 								    if (m && m[1]) {
 								      out[key] = { value: m[1].trim(), confidence: 0.75, engine: 'ocr' } as ParsedField;
 								    }
 								  }
 								  // Power Capacity (kW) and Voltage at 60Hz.
 								  const powerMatch = text.match(/Power\s+Capacity\s*[:.]?\s*([0-9]+(?:\.[0-9]+)?)\s*kW/i);
 								  if (powerMatch) {
 								    out.powerCapacity = { value: Number(powerMatch[1]), confidence: 0.85, engine: 'ocr' };
 								  }
 								  const voltageMatch = text.match(/Voltage(?:\s+at\s+60\s*Hz)?\s*[:.]?\s*([0-9]+)\s*V/i);
 								  if (voltageMatch) {
 								    out.voltage = { value: Number(voltageMatch[1]), confidence: 0.85, engine: 'ocr' };
 								  }
 								  // Pricing: "WEEK HIGH / LOW: 11,341 USD / 8,100 USD". Amounts must start
 								  // with a digit so a stray comma can never be read as a rate of 0.
 								  const weekMatch = text.match(
 								    /WEEK\s+HIGH\s*\/\s*LOW[:.\s]*([0-9][0-9,]*)\s*USD\s*\/\s*([0-9][0-9,]*)\s*USD/i,
 								  );
 								  if (weekMatch) {
 								    out.weeklyRateHighUsd = {
 								      value: toAmount(weekMatch[1]!),
 								      confidence: 0.8,
 								      engine: 'ocr',
 								    };
 								    out.weeklyRateLowUsd = {
 								      value: toAmount(weekMatch[2]!),
 								      confidence: 0.8,
 								      engine: 'ocr',
 								    };
 								  }
 								  const dayMatch = text.match(
 								    /DAY\s+HIGH\s*\/\s*LOW[:.\s]*([0-9][0-9,]*)\s*USD\s*\/\s*([0-9][0-9,]*)\s*USD/i,
 								  );
 								  if (dayMatch) {
 								    out.dailyRateHighUsd = {
 								      value: toAmount(dayMatch[1]!),
 								      confidence: 0.8,
 								      engine: 'ocr',
 								    };
 								    out.dailyRateLowUsd = {
 								      value: toAmount(dayMatch[2]!),
 								      confidence: 0.8,
 								      engine: 'ocr',
 								    };
 								  }
 								  // Purchase price: "PURCHASE PRICE:\nFEE SIMPLE OR STRATA LOT\n3,880,800 USD"
 								  // — the first "<amount> USD" within 80 characters of the heading.
 								  const priceMatch = text.match(/PURCHASE\s+PRICE[\s\S]{0,80}?([0-9][0-9,]+)\s*USD/i);
 								  if (priceMatch) {
 								    out.price = { value: toAmount(priceMatch[1]!), confidence: 0.7, engine: 'ocr' };
 								  }
 								  // Pricing validity: "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL SEPTEMBER 15TH, 2025"
 								  // Group 1 is "<month> <day>" without the ordinal suffix; group 2 the year.
 								  const validityMatch = text.match(
 								    /CONFIRMED\s+THROUGH\s+UNTIL\s+([A-Za-z]+\s+[0-9]{1,2})(?:[A-Z]{2})?,?\s+([0-9]{4})/i,
 								  );
 								  if (validityMatch) {
 								    const iso = parseHumanDate(`${validityMatch[1]} ${validityMatch[2]}`);
 								    if (iso) {
 								      out.pricingValidUntil = { value: iso, confidence: 0.75, engine: 'ocr' };
 								    } else {
 								      warnings.push(
 								        'Could not normalize "CONFIRMED THROUGH UNTIL" date; pricing_valid_until skipped.',
 								      );
 								    }
 								  }
 								  return { fields: out, warnings };
 								}
 								/**
 								 * Tier 2 driver: runs the OCR adapter, extracts fields from the recognized
 								 * text and folds the engine's confidence into the per-field scores.
 								 *
 								 * @param buffer Bytes forwarded unchanged to `adapter.recognize`.
 								 * @param adapter Optional OCR engine; when omitted `defaultOcrAdapter()` is
 								 *   used.
 								 * @returns An `engine: 'ocr'` result. Empty or whitespace-only OCR output
 								 *   yields no fields, `meanConfidence: 0` and an "OCR produced no text."
 								 *   warning. This implementation never returns `null`; the `| null` in the
 								 *   signature is kept for callers that already handle it.
 								 */
 								async function tryOcr(buffer: Buffer, adapter?: OcrAdapter): Promise<ParseResult | null> {
 								  const ocr = adapter ?? (await defaultOcrAdapter());
 								  const result = await ocr.recognize(buffer);
 								  const rawText = typeof result.text === 'string' ? result.text : '';
 								  // Whitespace-only output carries no extractable data; report it the same
 								  // way as an empty string instead of returning a silent empty result.
 								  if (rawText.trim().length === 0) {
 								    return {
 								      engine: 'ocr',
 								      fields: {},
 								      meanConfidence: 0,
 								      rawText: '',
 								      warnings: ['OCR produced no text.'],
 								    };
 								  }
 								  const { fields, warnings } = extractFromOcrText(rawText);
 								  // Tesseract reports 0..100; normalize to 0..1. A non-finite score (NaN or
 								  // a stub that omits it) counts as 0 so it cannot turn every confidence
 								  // below into NaN.
 								  const engineConfidence = Number.isFinite(result.confidence)
 								    ? Math.max(0, Math.min(result.confidence, 100)) / 100
 								    : 0;
 								  // The engine score acts as a CAP on the regex-tier confidences (never
 								  // lower than 0.5): a field cannot be more trusted than the page it came
 								  // from, and it is never raised above its own pattern confidence.
 								  const cap = Math.max(engineConfidence, 0.5);
 								  const extracted = Object.values(fields);
 								  let total = 0;
 								  for (const field of extracted) {
 								    if (!field) continue;
 								    field.confidence = Math.min(field.confidence, cap);
 								    total += field.confidence;
 								  }
 								  const meanConfidence = extracted.length === 0 ? 0 : total / extracted.length;
 								  return {
 								    engine: 'ocr',
 								    fields,
 								    meanConfidence,
 								    rawText,
 								    warnings,
 								  };
 								}
 								// ─── tier 3: AI fallback ─────────────────────────────────────────────────────
 								/**
 								 * Mean-confidence cut-off for OCR results: anything strictly below this
 								 * value makes the diff dialog recommend the AI tier.
 								 */
 								export const OCR_LOW_CONFIDENCE_THRESHOLD = 0.55;
 								/**
 								 * Decides whether the rep should see an "AI parse" button.
 								 *
 								 * Only OCR results qualify. Among those, the button is offered when nothing
 								 * was extracted at all, or when the mean confidence is below
 								 * `OCR_LOW_CONFIDENCE_THRESHOLD`.
 								 *
 								 * @param parse Result of a previous parse attempt.
 								 * @returns `true` when the AI tier should be offered.
 								 */
 								export function shouldOfferAiTier(parse: ParseResult): boolean {
 								  return (
 								    parse.engine === 'ocr' &&
 								    (Object.keys(parse.fields).length === 0 ||
 								      parse.meanConfidence < OCR_LOW_CONFIDENCE_THRESHOLD)
 								  );
 								}
 								// ─── public entry point ──────────────────────────────────────────────────────
 								/** Optional knobs for `parseBerthPdf`; every property may be omitted. */
 								export interface ParseBerthPdfOptions {
 								  /**
 								   * Override Tesseract for testing. When omitted, the OCR tier builds its
 								   * own adapter via `defaultOcrAdapter()`.
 								   */
 								  ocrAdapter?: OcrAdapter;
 								  /**
 								   * Skip the OCR tier when only AcroForm is wanted (e.g. unit tests). If
 								   * the AcroForm tier yields no fields, `parseBerthPdf` then returns an
 								   * empty `engine: 'ocr'` result carrying a `skipOcr=true` warning.
 								   */
 								  skipOcr?: boolean;
 								}
 								/**
 								 * Public entry point: parse one per-berth PDF.
 								 *
 								 * Tiers are tried in order — AcroForm first, then OCR — and the `engine`
 								 * field of the result tells callers which tier produced the fields (the
 								 * reconcile-diff dialog uses it to colour confidence chips).
 								 *
 								 * The AI tier is never invoked from here; it is a separate, deliberate
 								 * action triggered from the diff dialog so OPENAI_API_KEY isn't spent on
 								 * every upload.
 								 *
 								 * @param buffer Raw bytes of the uploaded file.
 								 * @param opts See `ParseBerthPdfOptions`.
 								 * @returns The AcroForm result when it contains at least one field,
 								 *   otherwise the OCR result, otherwise an empty `engine: 'ocr'` result
 								 *   with a warning explaining why nothing was extracted.
 								 * @throws Error when `isPdfMagic(buffer)` rejects the buffer.
 								 */
 								export async function parseBerthPdf(
 								  buffer: Buffer,
 								  opts: ParseBerthPdfOptions = {},
 								): Promise<ParseResult> {
 								  if (!isPdfMagic(buffer)) {
 								    throw new Error('PDF magic-byte check failed: file does not begin with %PDF-');
 								  }
 								  // Every "nothing extracted" outcome shares this shape; only the warning
 								  // text differs.
 								  const emptyOcrResult = (warning: string): ParseResult => ({
 								    engine: 'ocr',
 								    fields: {},
 								    meanConfidence: 0,
 								    warnings: [warning],
 								  });
 								  const formResult = await tryAcroForm(buffer);
 								  const formFieldCount = formResult ? Object.keys(formResult.fields).length : 0;
 								  if (formResult && formFieldCount > 0) {
 								    return formResult;
 								  }
 								  if (opts.skipOcr) {
 								    return emptyOcrResult('skipOcr=true; no AcroForm fields found.');
 								  }
 								  const ocrResult = await tryOcr(buffer, opts.ocrAdapter);
 								  return ocrResult ?? emptyOcrResult('OCR adapter returned null.');
 								}
 								// ─── helpers ─────────────────────────────────────────────────────────────────
 								/**
 								 * Coerce an AcroForm raw value to the right scalar for the target column.
 								 *
 								 * @param key Column the value is destined for.
 								 * @param raw Text read from the form field.
 								 * @returns
 								 *   - string columns: `raw` unchanged;
 								 *   - `pricingValidUntil`: the (trimmed) value when it is already
 								 *     `YYYY-MM-DD`, otherwise whatever `parseHumanDate` makes of it;
 								 *   - numeric columns: the number left after stripping currency/unit text
 								 *     and thousands separators, a feet-and-inches value converted by
 								 *     `parseFeetInches`, or `null` when nothing numeric remains.
 								 */
 								function coerceFieldValue(key: keyof ExtractedBerthFields, raw: string): string | number | null {
 								  // String columns
 								  const stringKeys: ReadonlyArray<keyof ExtractedBerthFields> = [
 								    'mooringNumber',
 								    'bowFacing',
 								    'sidePontoon',
 								    'mooringType',
 								    'cleatType',
 								    'cleatCapacity',
 								    'bollardType',
 								    'bollardCapacity',
 								    'access',
 								    'pricingValidUntil',
 								  ];
 								  if (stringKeys.includes(key)) {
 								    if (key !== 'pricingValidUntil') return raw;
 								    // Accept ISO YYYY-MM-DD as-is (ignoring stray padding); otherwise try a
 								    // humane parse.
 								    const candidate = raw.trim();
 								    return /^\d{4}-\d{2}-\d{2}$/.test(candidate) ? candidate : parseHumanDate(raw);
 								  }
 								  // `206' 8"` must not be flattened to 2068 by the digit filter below;
 								  // convert it to decimal feet the same way the OCR tier does.
 								  if (/^\s*[0-9]+\s*'\s*[0-9]+\s*"\s*$/.test(raw)) {
 								    return parseFeetInches(raw);
 								  }
 								  // Numeric columns: strip currency / unit suffixes and commas.
 								  const digits = raw.replace(/[^0-9.\-]/g, '');
 								  // Number('') is 0, so text with no digits at all ("N/A", "") has to be
 								  // rejected explicitly or it would be stored as a genuine zero.
 								  if (digits === '') return null;
 								  const numeric = Number(digits);
 								  return Number.isFinite(numeric) ? numeric : null;
 								}
 								/**
 								 * Parse a human date like "September 15 2025" → "2025-09-15".
 								 *
 								 * The first ordinal suffix ("15th" → "15") is dropped, then the text is
 								 * handed to the engine's `Date` parser with a trailing " UTC".
 								 *
 								 * @param raw Date text, e.g. "September 15th 2025".
 								 * @returns The date as `YYYY-MM-DD`, or `null` when the text cannot be
 								 *   parsed or names a day that does not exist (e.g. "February 30 2025").
 								 */
 								export function parseHumanDate(raw: string): string | null {
 								  const cleaned = raw.replace(/(\d+)(st|nd|rd|th)/i, '$1').trim();
 								  // Force UTC interpretation by appending " UTC"; otherwise dates without
 								  // an explicit zone get parsed in the runner's local TZ and
 								  // `toISOString()` shifts the day by ±1 (caught a -0700 -> 09-14
 								  // regression locally).
 								  const parsed = new Date(`${cleaned} UTC`);
 								  if (Number.isNaN(parsed.getTime())) return null;
 								  // Lenient parsers may roll an impossible day over into the next month
 								  // instead of failing. For the "<month> <day> <year>" shape, make sure
 								  // the day and year we read back are the ones that were written.
 								  const written = cleaned.match(/^[A-Za-z]+\.?\s+(\d{1,2}),?\s+(\d{4})$/);
 								  if (
 								    written &&
 								    (parsed.getUTCDate() !== Number(written[1]) ||
 								      parsed.getUTCFullYear() !== Number(written[2]))
 								  ) {
 								    return null;
 								  }
 								  return parsed.toISOString().slice(0, 10);
 								}
 								/**
 								 * Convert an imperial length to decimal feet: "206' 8\"" → 206.667,
 								 * "82" → 82.
 								 *
 								 * Besides straight quotes, the typographic marks OCR tends to emit
 								 * (’ ′ for feet, ” ″ for inches) are accepted, the closing inch mark is
 								 * optional, and inches may be decimal. Anything else falls back to the
 								 * leading number of the text, read as feet.
 								 *
 								 * @param raw Text such as `206' 8"`, `206'`, `82` or `82.5`.
 								 * @returns Decimal feet, or `null` when the text does not start with a
 								 *   number.
 								 */
 								export function parseFeetInches(raw: string): number | null {
 								  const trimmed = raw.trim();
 								  // <feet> <foot mark> <inches> [<inch mark>]
 								  const ftIn = trimmed.match(
 								    /^([0-9]+)\s*['\u2019\u2032]\s*([0-9]+(?:\.[0-9]+)?)\s*["\u201D\u2033]?$/,
 								  );
 								  if (ftIn) {
 								    return Number(ftIn[1]) + Number(ftIn[2]) / 12;
 								  }
 								  // Feet only; trailing unit text such as `'` or `ft` is ignored.
 								  const ftOnly = trimmed.match(/^([0-9]+(?:\.[0-9]+)?)/);
 								  if (ftOnly) return Number(ftOnly[1]);
 								  return null;
 								}