From f7425d1231871008331c9d479e056223c31333a4 Mon Sep 17 00:00:00 2001
From: Matt
Date: Fri, 19 Jun 2026 10:30:12 +0200
Subject: [PATCH] =?UTF-8?q?fix(berths):=20CM-2=20=E2=80=94=20robust=20purc?=
 =?UTF-8?q?hase-price=20extraction=20(clean-token=20+=20magnitude=20floor)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.8 (1M context)
---
Review notes (v2): the ambiguity check in extractPurchasePrice() now keys on
value + currency, so one figure quoted in two currencies is flagged for review
instead of silently taking the first currency. Added a unit test for that case
and updated the hunk counts / diffstat accordingly.

 src/lib/services/berth-pdf-parser.ts         | 71 +++++++++++++++++++--
 tests/unit/services/berth-pdf-parser.test.ts | 48 ++++++++++++++
 2 files changed, 115 insertions(+), 4 deletions(-)

diff --git a/src/lib/services/berth-pdf-parser.ts b/src/lib/services/berth-pdf-parser.ts
index 6f01a275..d21f9636 100644
--- a/src/lib/services/berth-pdf-parser.ts
+++ b/src/lib/services/berth-pdf-parser.ts
@@ -357,10 +357,14 @@ export function extractFromOcrText(rawText: string): {
     };
   }
 
-  // Purchase price: "PURCHASE PRICE:\nFEE SIMPLE OR STRATA LOT\n3,880,800 USD"
-  const priceMatch = text.match(/PURCHASE\s+PRICE[\s\S]{0,80}?([0-9][0-9,]+)\s*USD/i);
-  if (priceMatch) {
-    out.price = { value: Number(priceMatch[1]!.replace(/,/g, '')), confidence: 0.7, engine: 'ocr' };
+  // Purchase price: the single clean comma-grouped currency figure. The rates
+  // on the same sheet are letter-spaced (garble) and below the floor, so they
+  // never collide with the main price. See extractPurchasePrice().
+  const priceResult = extractPurchasePrice(text);
+  if (priceResult.value != null) {
+    out.price = { value: priceResult.value, confidence: priceResult.confidence, engine: 'ocr' };
+  } else if (priceResult.warning) {
+    warnings.push(priceResult.warning);
   }
 
   // Pricing validity: "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL SEPTEMBER 15TH, 2025"
@@ -507,6 +511,65 @@ function coerceFieldValue(key: keyof ExtractedBerthFields, raw: string): string
   return numeric < 0 ? null : numeric;
 }
 
+/**
+ * Floor that separates a 6–7-figure purchase price from the ≤~12k weekly/daily
+ * lease rates printed on the same sheet. Observed prices: 277,200 … 5,433,120;
+ * observed weekly highs ≤ 11,341. A wide-margin separator.
+ */
+export const PURCHASE_PRICE_FLOOR = 50_000;
+
+/**
+ * Strict clean comma-grouped currency token. On the real spec sheets the
+ * purchase price is the one figure rendered WITHOUT letter-spacing (the large
+ * bold number); the weekly/daily rates ARE letter-spaced and garble in text
+ * extraction, so they never match this pattern. The floor is a second guard
+ * for clean/synthetic PDFs where rates would also extract cleanly.
+ */
+const PRICE_TOKEN_RE = /\b(\d{1,3}(?:,\d{3})+)\s?(USD|EUR|GBP)\b/gi;
+
+/**
+ * Extract the single main purchase price from raw PDF text. Returns
+ * `value: null` (with a warning) when zero above-floor tokens are found, or
+ * when two or more DISTINCT above-floor value/currency pairs appear (genuinely
+ * ambiguous — flag for human review rather than guess). The same figure quoted
+ * in two currencies is ambiguous; an exact repeat of one figure is not.
+ */
+export function extractPurchasePrice(rawText: string): {
+  value: number | null;
+  currency: string | null;
+  confidence: number;
+  warning?: string;
+} {
+  const candidates: Array<{ value: number; currency: string }> = [];
+  for (const m of rawText.matchAll(PRICE_TOKEN_RE)) {
+    const value = Number(m[1]!.replace(/,/g, ''));
+    if (Number.isFinite(value) && value >= PURCHASE_PRICE_FLOOR) {
+      candidates.push({ value, currency: m[2]!.toUpperCase() });
+    }
+  }
+  if (candidates.length === 0) {
+    return {
+      value: null,
+      currency: null,
+      confidence: 0,
+      warning: 'No purchase-price token found (no clean figure ≥ floor).',
+    };
+  }
+  // Key on value AND currency: the same figure quoted in two currencies is as
+  // ambiguous as two different figures, while an exact repeat is not.
+  const distinct = [...new Set(candidates.map((c) => `${c.value} ${c.currency}`))];
+  if (distinct.length > 1) {
+    return {
+      value: null,
+      currency: null,
+      confidence: 0,
+      warning: `Multiple purchase-price candidates (${distinct.join(', ')}) — needs review.`,
+    };
+  }
+  const best = candidates[0]!;
+  return { value: best.value, currency: best.currency, confidence: 0.95 };
+}
+
 /** Parse a human date like "September 15 2025" → "2025-09-15". */
 export function parseHumanDate(raw: string): string | null {
   const cleaned = raw.replace(/(\d+)(st|nd|rd|th)/i, '$1').trim();
diff --git a/tests/unit/services/berth-pdf-parser.test.ts b/tests/unit/services/berth-pdf-parser.test.ts
index ebb73311..e5e1b6c1 100644
--- a/tests/unit/services/berth-pdf-parser.test.ts
+++ b/tests/unit/services/berth-pdf-parser.test.ts
@@ -13,9 +13,11 @@ import { describe, expect, it } from 'vitest';
 
 import {
   extractFromOcrText,
+  extractPurchasePrice,
   isPdfMagic,
   parseFeetInches,
   parseHumanDate,
+  PURCHASE_PRICE_FLOOR,
   shouldOfferAiTier,
 } from '@/lib/services/berth-pdf-parser';
 
@@ -191,3 +193,49 @@ describe('shouldOfferAiTier', () => {
     ).toBe(false);
   });
 });
+
+describe('extractPurchasePrice', () => {
+  it('isolates the single clean main price among letter-spaced rate garble', () => {
+    // Real-sheet shape: rates are letter-spaced (so they never match the strict
+    // token); the main price renders clean.
+    const text =
+      'W E E K H I G H / LO W : 1 1 , 3 4 1 U S D / 8 , 1 0 0 U S D 3,880,800 USD ' +
+      'DAY H I G H / LO W : 1 , 8 9 0 U S D / 1 , 3 5 0 U S D';
+    const r = extractPurchasePrice(text);
+    expect(r.value).toBe(3880800);
+    expect(r.currency).toBe('USD');
+    expect(r.confidence).toBeGreaterThanOrEqual(0.9);
+  });
+
+  it('excludes clean rate tokens below the floor (synthetic clean sheet)', () => {
+    const text = '3,880,800 USD WEEK HIGH / LOW: 11,341 USD / 8,100 USD';
+    expect(extractPurchasePrice(text).value).toBe(3880800);
+  });
+
+  it('returns null + warning when no price-magnitude token is present', () => {
+    const r = extractPurchasePrice('no prices here, just 12 USD of nothing');
+    expect(r.value).toBeNull();
+    expect(r.warning).toMatch(/no purchase-price/i);
+  });
+
+  it('flags ambiguity when two DISTINCT above-floor tokens appear', () => {
+    const r = extractPurchasePrice('3,880,800 USD and also 1,247,400 USD');
+    expect(r.value).toBeNull();
+    expect(r.warning).toMatch(/multiple/i);
+  });
+
+  it('flags ambiguity when one figure appears in two different currencies', () => {
+    const r = extractPurchasePrice('720,720 USD ... or 720,720 EUR');
+    expect(r.value).toBeNull();
+    expect(r.warning).toMatch(/multiple/i);
+  });
+
+  it('treats a repeated identical price as unambiguous', () => {
+    const r = extractPurchasePrice('720,720 USD ... header ... 720,720 USD');
+    expect(r.value).toBe(720720);
+  });
+
+  it('exposes the floor constant', () => {
+    expect(PURCHASE_PRICE_FLOOR).toBe(50_000);
+  });
+});