fix(berths): CM-2 — robust purchase-price extraction (clean-token + magnitude floor)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-19 10:30:12 +02:00
parent df8c26d1b3
commit f7425d1231
2 changed files with 106 additions and 4 deletions

View File

@@ -357,10 +357,14 @@ export function extractFromOcrText(rawText: string): {
};
}
// Purchase price: "PURCHASE PRICE:\nFEE SIMPLE OR STRATA LOT\n3,880,800 USD"
const priceMatch = text.match(/PURCHASE\s+PRICE[\s\S]{0,80}?([0-9][0-9,]+)\s*USD/i);
if (priceMatch) {
out.price = { value: Number(priceMatch[1]!.replace(/,/g, '')), confidence: 0.7, engine: 'ocr' };
// Purchase price: the single clean comma-grouped currency figure. The rates
// on the same sheet are letter-spaced (garble) and below the floor, so they
// never collide with the main price. See extractPurchasePrice().
const priceResult = extractPurchasePrice(text);
if (priceResult.value != null) {
out.price = { value: priceResult.value, confidence: priceResult.confidence, engine: 'ocr' };
} else if (priceResult.warning) {
warnings.push(priceResult.warning);
}
// Pricing validity: "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL SEPTEMBER 15TH, 2025"
@@ -507,6 +511,62 @@ function coerceFieldValue(key: keyof ExtractedBerthFields, raw: string): string
return numeric < 0 ? null : numeric;
}
/**
 * Floor that separates a 6–7-figure purchase price from the ≤~12k weekly/daily
 * lease rates printed on the same sheet. Observed prices: 277,200 … 5,433,120;
 * observed weekly highs ≤ 11,341. A wide-margin separator.
 */
export const PURCHASE_PRICE_FLOOR = 50_000;

/**
 * Strict clean comma-grouped currency token. On the real spec sheets the
 * purchase price is the one figure rendered WITHOUT letter-spacing (the large
 * bold number); the weekly/daily rates ARE letter-spaced and garble in text
 * extraction, so they never match this pattern. The floor is a second guard
 * for clean/synthetic PDFs where rates would also extract cleanly.
 *
 * Capture group 1 is the comma-grouped amount (at least one `,ddd` group is
 * required), group 2 is the currency code (matched case-insensitively). At
 * most one whitespace character may separate the amount from the code.
 */
const PRICE_TOKEN_RE = /\b(\d{1,3}(?:,\d{3})+)\s?(USD|EUR|GBP)\b/gi;

/**
 * Extract the single main purchase price from raw PDF text.
 *
 * Every clean currency token at or above {@link PURCHASE_PRICE_FLOOR} is a
 * candidate. The result is only trusted when all candidates agree on BOTH the
 * amount and the currency; anything else is flagged for human review rather
 * than guessed.
 *
 * @param rawText - Text extracted from the spec sheet.
 * @returns `value`/`currency` with confidence 0.95 when exactly one distinct
 *   price is present (repeats of the identical price are fine). Otherwise
 *   `value: null`, `currency: null`, confidence 0 and a `warning` when
 *   - no above-floor token is found,
 *   - two or more DISTINCT above-floor amounts appear, or
 *   - the same amount appears with conflicting currency codes.
 */
export function extractPurchasePrice(rawText: string): {
  value: number | null;
  currency: string | null;
  confidence: number;
  warning?: string;
} {
  const candidates: Array<{ value: number; currency: string }> = [];
  // matchAll works on an internal copy of the regex, so the shared global
  // pattern's lastIndex is never mutated between calls.
  for (const m of rawText.matchAll(PRICE_TOKEN_RE)) {
    const value = Number(m[1]!.replace(/,/g, ''));
    if (Number.isFinite(value) && value >= PURCHASE_PRICE_FLOOR) {
      // Normalise the code so "usd" and "USD" count as the same currency.
      candidates.push({ value, currency: m[2]!.toUpperCase() });
    }
  }

  if (candidates.length === 0) {
    return {
      value: null,
      currency: null,
      confidence: 0,
      warning: 'No purchase-price token found (no clean figure ≥ floor).',
    };
  }

  const distinctValues = [...new Set(candidates.map((c) => c.value))];
  if (distinctValues.length > 1) {
    return {
      value: null,
      currency: null,
      confidence: 0,
      warning: `Multiple purchase-price candidates (${distinctValues.join(', ')}) — needs review.`,
    };
  }

  const best = candidates[0]!;

  // Same figure under different currency codes is a different price, so it is
  // just as ambiguous as two different amounts.
  const distinctCurrencies = [...new Set(candidates.map((c) => c.currency))];
  if (distinctCurrencies.length > 1) {
    return {
      value: null,
      currency: null,
      confidence: 0,
      warning: `Multiple purchase-price currencies (${distinctCurrencies.join(', ')}) for ${best.value} — needs review.`,
    };
  }

  return { value: best.value, currency: best.currency, confidence: 0.95 };
}
/** Parse a human date like "September 15 2025" → "2025-09-15". */
export function parseHumanDate(raw: string): string | null {
const cleaned = raw.replace(/(\d+)(st|nd|rd|th)/i, '$1').trim();

View File

@@ -13,9 +13,11 @@ import { describe, expect, it } from 'vitest';
import {
extractFromOcrText,
extractPurchasePrice,
isPdfMagic,
parseFeetInches,
parseHumanDate,
PURCHASE_PRICE_FLOOR,
shouldOfferAiTier,
} from '@/lib/services/berth-pdf-parser';
@@ -191,3 +193,43 @@ describe('shouldOfferAiTier', () => {
).toBe(false);
});
});
// Contract tests for extractPurchasePrice: single clean price wins, sub-floor
// rates are ignored, and missing/ambiguous prices yield null plus a warning.
describe('extractPurchasePrice', () => {
  it('isolates the single clean main price among letter-spaced rate garble', () => {
    // Mirrors a real sheet: the weekly/daily rates come out letter-spaced and
    // therefore cannot match the strict token, while the main price is clean.
    const sheet = [
      'W E E K H I G H / LO W : 1 1 , 3 4 1 U S D / 8 , 1 0 0 U S D 3,880,800 USD ',
      'DAY H I G H / LO W : 1 , 8 9 0 U S D / 1 , 3 5 0 U S D',
    ].join('');

    const { value, currency, confidence } = extractPurchasePrice(sheet);

    expect(value).toBe(3880800);
    expect(currency).toBe('USD');
    expect(confidence).toBeGreaterThanOrEqual(0.9);
  });

  it('excludes clean rate tokens below the floor (synthetic clean sheet)', () => {
    const sheet = '3,880,800 USD WEEK HIGH / LOW: 11,341 USD / 8,100 USD';
    const { value } = extractPurchasePrice(sheet);
    expect(value).toBe(3880800);
  });

  it('returns null + warning when no price-magnitude token is present', () => {
    const outcome = extractPurchasePrice('no prices here, just 12 USD of nothing');
    expect(outcome.value).toBeNull();
    expect(outcome.warning).toMatch(/no purchase-price/i);
  });

  it('flags ambiguity when two DISTINCT above-floor tokens appear', () => {
    const outcome = extractPurchasePrice('3,880,800 USD and also 1,247,400 USD');
    expect(outcome.value).toBeNull();
    expect(outcome.warning).toMatch(/multiple/i);
  });

  it('treats a repeated identical price as unambiguous', () => {
    const { value } = extractPurchasePrice('720,720 USD ... header ... 720,720 USD');
    expect(value).toBe(720720);
  });

  it('exposes the floor constant', () => {
    expect(PURCHASE_PRICE_FLOOR).toBe(50_000);
  });
});