fix(berths): CM-2 — robust purchase-price extraction (clean-token + magnitude floor)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -357,10 +357,14 @@ export function extractFromOcrText(rawText: string): {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Purchase price: "PURCHASE PRICE:\nFEE SIMPLE OR STRATA LOT\n3,880,800 USD"
|
// Purchase price: the single clean comma-grouped currency figure. The rates
|
||||||
const priceMatch = text.match(/PURCHASE\s+PRICE[\s\S]{0,80}?([0-9][0-9,]+)\s*USD/i);
|
// on the same sheet are letter-spaced (garble) and below the floor, so they
|
||||||
if (priceMatch) {
|
// never collide with the main price. See extractPurchasePrice().
|
||||||
out.price = { value: Number(priceMatch[1]!.replace(/,/g, '')), confidence: 0.7, engine: 'ocr' };
|
const priceResult = extractPurchasePrice(text);
|
||||||
|
if (priceResult.value != null) {
|
||||||
|
out.price = { value: priceResult.value, confidence: priceResult.confidence, engine: 'ocr' };
|
||||||
|
} else if (priceResult.warning) {
|
||||||
|
warnings.push(priceResult.warning);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pricing validity: "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL SEPTEMBER 15TH, 2025"
|
// Pricing validity: "ALL PRICES ABOVE ARE CONFIRMED THROUGH UNTIL SEPTEMBER 15TH, 2025"
|
||||||
@@ -507,6 +511,62 @@ function coerceFieldValue(key: keyof ExtractedBerthFields, raw: string): string
|
|||||||
return numeric < 0 ? null : numeric;
|
return numeric < 0 ? null : numeric;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Floor that separates a 6–7-figure purchase price from the ≤~12k weekly/daily
 * lease rates printed on the same sheet. Observed prices: 277,200 … 5,433,120;
 * observed weekly highs ≤ 11,341. A wide-margin separator.
 */
export const PURCHASE_PRICE_FLOOR = 50_000;

/**
 * Strict clean comma-grouped currency token. On the real spec sheets the
 * purchase price is the one figure rendered WITHOUT letter-spacing (the large
 * bold number); the weekly/daily rates ARE letter-spaced and garble in text
 * extraction, so they never match this pattern. The floor is a second guard
 * for clean/synthetic PDFs where rates would also extract cleanly.
 *
 * Capture 1 is the comma-grouped digit run (at least one `,ddd` group is
 * required); capture 2 is the currency code (USD, EUR or GBP, matched
 * case-insensitively). At most one whitespace character may sit between them.
 */
const PRICE_TOKEN_RE = /\b(\d{1,3}(?:,\d{3})+)\s?(USD|EUR|GBP)\b/gi;

/**
 * Extract the single main purchase price from raw PDF text.
 *
 * Every clean currency token whose numeric value is at or above
 * {@link PURCHASE_PRICE_FLOOR} is a candidate. The result is:
 *
 * - `value: null`, `confidence: 0` and a warning when no candidate exists;
 * - `value: null`, `confidence: 0` and a warning when two or more DISTINCT
 *   candidate values appear, or when the same value appears under more than
 *   one currency (genuinely ambiguous — flag for human review rather than
 *   guess);
 * - otherwise the candidate's value and upper-cased currency at confidence
 *   0.95. A repeated identical price in the same currency is unambiguous.
 *
 * @param rawText Text extracted from the PDF.
 * @returns The price, its currency code, a confidence score, and an optional
 *   human-readable warning explaining why no price was returned.
 */
export function extractPurchasePrice(rawText: string): {
  value: number | null;
  currency: string | null;
  confidence: number;
  warning?: string;
} {
  const candidates: Array<{ value: number; currency: string }> = [];
  // matchAll iterates over a clone of the regex, so the shared /g pattern's
  // lastIndex is never mutated between calls.
  for (const [, digits, code] of rawText.matchAll(PRICE_TOKEN_RE)) {
    if (digits === undefined || code === undefined) continue;
    const value = Number(digits.replace(/,/g, ''));
    if (Number.isFinite(value) && value >= PURCHASE_PRICE_FLOOR) {
      candidates.push({ value, currency: code.toUpperCase() });
    }
  }

  const first = candidates[0];
  if (first === undefined) {
    return {
      value: null,
      currency: null,
      confidence: 0,
      warning: 'No purchase-price token found (no clean figure ≥ floor).',
    };
  }

  // Shared shape for every "cannot decide" outcome.
  const ambiguous = (labels: Array<number | string>) => ({
    value: null,
    currency: null,
    confidence: 0,
    warning: `Multiple purchase-price candidates (${labels.join(', ')}) — needs review.`,
  });

  const distinctValues = [...new Set(candidates.map((c) => c.value))];
  if (distinctValues.length > 1) {
    return ambiguous(distinctValues);
  }

  // All candidates share one value; the same figure quoted in different
  // currencies is still ambiguous, so do not silently pick the first one.
  const distinctCurrencies = [...new Set(candidates.map((c) => c.currency))];
  if (distinctCurrencies.length > 1) {
    return ambiguous(distinctCurrencies.map((cur) => `${first.value} ${cur}`));
  }

  return { value: first.value, currency: first.currency, confidence: 0.95 };
}
|
||||||
|
|
||||||
/** Parse a human date like "September 15 2025" → "2025-09-15". */
|
/** Parse a human date like "September 15 2025" → "2025-09-15". */
|
||||||
export function parseHumanDate(raw: string): string | null {
|
export function parseHumanDate(raw: string): string | null {
|
||||||
const cleaned = raw.replace(/(\d+)(st|nd|rd|th)/i, '$1').trim();
|
const cleaned = raw.replace(/(\d+)(st|nd|rd|th)/i, '$1').trim();
|
||||||
|
|||||||
@@ -13,9 +13,11 @@ import { describe, expect, it } from 'vitest';
|
|||||||
|
|
||||||
import {
|
import {
|
||||||
extractFromOcrText,
|
extractFromOcrText,
|
||||||
|
extractPurchasePrice,
|
||||||
isPdfMagic,
|
isPdfMagic,
|
||||||
parseFeetInches,
|
parseFeetInches,
|
||||||
parseHumanDate,
|
parseHumanDate,
|
||||||
|
PURCHASE_PRICE_FLOOR,
|
||||||
shouldOfferAiTier,
|
shouldOfferAiTier,
|
||||||
} from '@/lib/services/berth-pdf-parser';
|
} from '@/lib/services/berth-pdf-parser';
|
||||||
|
|
||||||
@@ -191,3 +193,43 @@ describe('shouldOfferAiTier', () => {
|
|||||||
).toBe(false);
|
).toBe(false);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Unit tests for extractPurchasePrice: clean-token isolation, the magnitude
// floor, the no-candidate warning, and the ambiguity rules.
describe('extractPurchasePrice', () => {
  it('isolates the single clean main price among letter-spaced rate garble', () => {
    // Shape of a real sheet: the weekly/daily rates come out letter-spaced and
    // therefore cannot match the strict token; only the main price is clean.
    const sheetText = [
      'W E E K H I G H / LO W : 1 1 , 3 4 1 U S D / 8 , 1 0 0 U S D 3,880,800 USD ',
      'DAY H I G H / LO W : 1 , 8 9 0 U S D / 1 , 3 5 0 U S D',
    ].join('');
    const { value, currency, confidence } = extractPurchasePrice(sheetText);
    expect(value).toBe(3880800);
    expect(currency).toBe('USD');
    expect(confidence).toBeGreaterThanOrEqual(0.9);
  });

  it('excludes clean rate tokens below the floor (synthetic clean sheet)', () => {
    const sheetText = '3,880,800 USD WEEK HIGH / LOW: 11,341 USD / 8,100 USD';
    const outcome = extractPurchasePrice(sheetText);
    expect(outcome.value).toBe(3880800);
  });

  it('returns null + warning when no price-magnitude token is present', () => {
    const outcome = extractPurchasePrice('no prices here, just 12 USD of nothing');
    expect(outcome.value).toBeNull();
    expect(outcome.warning).toMatch(/no purchase-price/i);
  });

  it('flags ambiguity when two DISTINCT above-floor tokens appear', () => {
    const outcome = extractPurchasePrice('3,880,800 USD and also 1,247,400 USD');
    expect(outcome.value).toBeNull();
    expect(outcome.warning).toMatch(/multiple/i);
  });

  it('treats a repeated identical price as unambiguous', () => {
    const { value } = extractPurchasePrice('720,720 USD ... header ... 720,720 USD');
    expect(value).toBe(720720);
  });

  it('exposes the floor constant', () => {
    expect(PURCHASE_PRICE_FLOOR).toBe(50_000);
  });
});
|
||||||
|
|||||||
Reference in New Issue
Block a user