fix(berths): CM-2 — robust purchase-price extraction (clean-token + magnitude floor)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-19 10:30:12 +02:00
parent df8c26d1b3
commit f7425d1231
2 changed files with 106 additions and 4 deletions

View File

@@ -357,10 +357,14 @@ export function extractFromOcrText(rawText: string): {
};
}
// Purchase price: "PURCHASE PRICE:\nFEE SIMPLE OR STRATA LOT\n3,880,800 USD"
const priceMatch = text.match(/PURCHASE\s+PRICE[\s\S]{0,80}?([0-9][0-9,]+)\s*USD/i);
if (priceMatch) {
out.price = { value: Number(priceMatch[1]!.replace(/,/g, '')), confidence: 0.7, engine: 'ocr' };
// Purchase price: the single clean comma-grouped currency figure. The rates
// on the same sheet are letter-spaced (garble) and below the floor, so they
// never collide with the main price. See extractPurchasePrice().
const priceResult = extractPurchasePrice(text);
if (priceResult.value != null) {
out.price = { value: priceResult.value, confidence: priceResult.confidence, engine: 'ocr' };
} else if (priceResult.warning) {
warnings.push(priceResult.warning);
}
// Pricing validity: "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL SEPTEMBER 15TH, 2025"
@@ -507,6 +511,62 @@ function coerceFieldValue(key: keyof ExtractedBerthFields, raw: string): string
return numeric < 0 ? null : numeric;
}
/**
 * Floor that separates a 6–7-figure purchase price from the ≤~12k weekly/daily
 * lease rates printed on the same sheet. Observed prices: 277,200 … 5,433,120;
 * observed weekly highs ≤ 11,341. A wide-margin separator.
 *
 * The comparison in extractPurchasePrice() is inclusive (`value >= floor`), so
 * a figure of exactly 50,000 still counts as a purchase-price candidate.
 */
export const PURCHASE_PRICE_FLOOR = 50_000;
/**
 * Strict clean comma-grouped currency token. On the real spec sheets the
 * purchase price is the one figure rendered WITHOUT letter-spacing (the large
 * bold number); the weekly/daily rates ARE letter-spaced and garble in text
 * extraction, so they never match this pattern. The floor is a second guard
 * for clean/synthetic PDFs where rates would also extract cleanly.
 *
 * Shape of a match:
 *  - capture 1: 1–3 leading digits followed by ONE OR MORE `,ddd` groups, so a
 *    figure with no thousands separator (e.g. `950 USD`) is not a token;
 *  - at most one whitespace character (a space or a line break) between the
 *    figure and the currency code;
 *  - capture 2: the currency code `USD`, `EUR` or `GBP`, ending on a word
 *    boundary.
 * Flags: `g` so every token in the text can be enumerated, `i` so the currency
 * code matches regardless of case.
 */
const PRICE_TOKEN_RE = /\b(\d{1,3}(?:,\d{3})+)\s?(USD|EUR|GBP)\b/gi;
/**
 * Pull the one main purchase price out of raw PDF text.
 *
 * Every `PRICE_TOKEN_RE` hit is parsed to a number (thousands commas removed)
 * and kept only if it is at or above `PURCHASE_PRICE_FLOOR`. The outcome then
 * depends on how many different numeric values survive:
 *
 *  - none: `value: null`, `confidence: 0`, plus a "no token found" warning;
 *  - two or more DISTINCT values: `value: null`, `confidence: 0`, plus a
 *    warning listing them — genuinely ambiguous, so it is flagged for human
 *    review rather than guessed;
 *  - exactly one value (possibly repeated in the text): that value with
 *    `confidence: 0.95` and the upper-cased currency code of its first
 *    occurrence.
 *
 * @param rawText Text extracted from the PDF.
 * @returns The resolved price and currency, or a null result carrying a warning.
 */
export function extractPurchasePrice(rawText: string): {
  value: number | null;
  currency: string | null;
  confidence: number;
  warning?: string;
} {
  // Both failure outcomes share one shape and differ only in the message.
  const unresolved = (warning: string) => ({
    value: null,
    currency: null,
    confidence: 0,
    warning,
  });

  // Parse every clean currency token, then drop anything below the floor.
  const aboveFloor = Array.from(rawText.matchAll(PRICE_TOKEN_RE))
    .map((hit) => ({
      value: Number(hit[1]!.replace(/,/g, '')),
      currency: hit[2]!.toUpperCase(),
    }))
    .filter(({ value }) => Number.isFinite(value) && value >= PURCHASE_PRICE_FLOOR);

  if (aboveFloor.length === 0) {
    return unresolved('No purchase-price token found (no clean figure ≥ floor).');
  }

  // Repeats of the same figure are fine; different figures are ambiguous.
  const uniqueValues = new Set(aboveFloor.map(({ value }) => value));
  if (uniqueValues.size > 1) {
    return unresolved(
      `Multiple purchase-price candidates (${[...uniqueValues].join(', ')}) — needs review.`,
    );
  }

  // All survivors carry the same value; the first one supplies the currency.
  const { value, currency } = aboveFloor[0]!;
  return { value, currency, confidence: 0.95 };
}
/** Parse a human date like "September 15 2025" → "2025-09-15". */
export function parseHumanDate(raw: string): string | null {
const cleaned = raw.replace(/(\d+)(st|nd|rd|th)/i, '$1').trim();