303 lines
9.6 KiB
TypeScript
303 lines
9.6 KiB
TypeScript
|
|
/**
 * Heuristic parser for raw OCR text from a receipt image.
 *
 * Tesseract returns plain text — we extract structured fields (vendor, date,
 * amount, currency, line items) using regex/positional rules. The output
 * matches `ParsedReceipt` from `ocr-providers.ts` so callers don't need to
 * branch on which engine produced it.
 *
 * Confidence is computed from how many fields we managed to recover, scaled
 * by Tesseract's own per-line confidence when provided.
 */
|
|||
|
|
|
|||
|
|
import type { ParsedReceipt, ParsedReceiptLineItem } from '@/lib/services/ocr-providers';
|
|||
|
|
|
|||
|
|
/**
 * Currency symbol → ISO 4217 code.
 *
 * Insertion order matters: `extractCurrency` walks the keys in order and
 * returns on the first symbol found in the text, so earlier entries win when
 * a receipt contains more than one symbol. New entries are therefore appended
 * at the end.
 */
const CURRENCY_SYMBOLS: Record<string, string> = {
  $: 'USD',
  '€': 'EUR',
  '£': 'GBP',
  '¥': 'JPY',
  '₣': 'CHF',
  '₹': 'INR',
  '₽': 'RUB',
  '₱': 'PHP',
  '₩': 'KRW',
  // Unambiguous symbols for currencies that are already recognised by ISO code.
  '₪': 'ILS',
  '₺': 'TRY',
};
|
|||
|
|
|
|||
|
|
/**
 * ISO 4217 codes recognised when they appear as a stand-alone uppercase token.
 *
 * Every code that a symbol in `CURRENCY_SYMBOLS` maps to is also listed here,
 * so a currency is detected whether the receipt prints the symbol or the code.
 */
const CURRENCY_CODES = new Set([
  'USD',
  'EUR',
  'GBP',
  'JPY',
  'CHF',
  'CAD',
  'AUD',
  'NZD',
  'SEK',
  'NOK',
  'DKK',
  'PLN',
  'CZK',
  'HUF',
  'INR',
  'CNY',
  'HKD',
  'SGD',
  'AED',
  'ILS',
  'TRY',
  'ZAR',
  'BRL',
  'MXN',
  'RUB',
  'KRW',
  // Was reachable only through the '₱' symbol; a printed "PHP" was ignored.
  'PHP',
]);
|
|||
|
|
|
|||
|
|
/** Three-letter English month abbreviation (lowercase) → two-digit month number. */
const MONTH_NUMBERS: Record<string, string> = {
  jan: '01',
  feb: '02',
  mar: '03',
  apr: '04',
  may: '05',
  jun: '06',
  jul: '07',
  aug: '08',
  sep: '09',
  oct: '10',
  nov: '11',
  dec: '12',
};

/** Expands a two-digit year into the 2000s; any other length passes through unchanged. */
const expandYear = (year: string): string => (year.length === 2 ? `20${year}` : year);

/**
 * Date recognisers, tried in order; the first pattern that yields a valid date wins.
 *
 * Each entry pairs a regex with a `build` callback that turns the match into a
 * `YYYY-MM-DD` string via `normalizeDate`, or `null` when the captured parts do
 * not form a valid date.
 *
 * The regexes are deliberately NOT global: other helpers in this file call
 * `regex.test(...)` on them repeatedly, which would be stateful with the `g` flag.
 */
const DATE_PATTERNS: Array<{ regex: RegExp; build: (m: RegExpMatchArray) => string | null }> = [
  // ISO 2024-04-28
  {
    regex: /\b(\d{4})-(\d{1,2})-(\d{1,2})\b/,
    build: (m) => normalizeDate(m[1]!, m[2]!, m[3]!),
  },
  // 28/04/2024, 28-04-2024 or 28.04.2024
  {
    regex: /\b(\d{1,2})[/.\-](\d{1,2})[/.\-](\d{2,4})\b/,
    build: (m) => {
      const first = m[1]!;
      const second = m[2]!;
      const year = expandYear(m[3]!);
      // We can't tell DMY from MDY when both parts are <= 12, so DMY (more
      // common globally) is tried first. When the DMY reading is impossible
      // (e.g. 04/28/2024 would need month 28) the MDY reading is unambiguous,
      // so fall back to it instead of discarding the date.
      return normalizeDate(year, second, first) ?? normalizeDate(year, first, second);
    },
  },
  // 28 Apr 2024 / 28-Apr-2024 / 28 April, 2024
  {
    regex:
      /\b(\d{1,2})[\s-]+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*[\s.,-]+(\d{2,4})\b/i,
    build: (m) => {
      const month = MONTH_NUMBERS[m[2]!.toLowerCase().slice(0, 3)];
      if (!month) return null;
      return normalizeDate(expandYear(m[3]!), month, m[1]!);
    },
  },
];
|
|||
|
|
|
|||
|
|
/**
 * Assembles a zero-padded `YYYY-MM-DD` string from loose date parts and
 * validates it.
 *
 * @param year  Year digits; left-padded with zeros to four characters.
 * @param month Month digits; left-padded with zeros to two characters.
 * @param day   Day-of-month digits; left-padded with zeros to two characters.
 * @returns The ISO date string, or `null` when the parts do not round-trip
 *   through `Date` unchanged or the year lies outside 2000–2100.
 */
function normalizeDate(year: string, month: string, day: string): string | null {
  const paddedYear = year.padStart(4, '0');
  const iso = [paddedYear, month.padStart(2, '0'), day.padStart(2, '0')].join('-');

  // Round-trip check: a string that fails to parse, or that parses to a
  // different calendar day than it spells out, is not a real date.
  const parsed = new Date(iso);
  if (Number.isNaN(parsed.getTime())) return null;
  if (parsed.toISOString().slice(0, 10) !== iso) return null;

  // Plausibility window for receipts.
  const yearNumber = Number(paddedYear);
  if (yearNumber < 2000 || yearNumber > 2100) return null;

  return iso;
}
|
|||
|
|
|
|||
|
|
/**
 * Pulls the first recognizable date out of `text`.
 *
 * Patterns are tried in `DATE_PATTERNS` order. Within a pattern, every
 * occurrence is examined from left to right until one builds a valid date —
 * so a look-alike that fails validation (a phone or reference number such as
 * `12-34-5678`) no longer hides a real date printed further down the receipt.
 *
 * @param text Full OCR text of the receipt.
 * @returns The date as `YYYY-MM-DD`, or `null` when no candidate validates.
 */
function extractDate(text: string): string | null {
  for (const { regex, build } of DATE_PATTERNS) {
    // Scan with a global copy: `matchAll` requires the `g` flag, and the
    // shared pattern object must stay non-global for its other users.
    const flags = regex.flags.includes('g') ? regex.flags : `${regex.flags}g`;
    for (const match of text.matchAll(new RegExp(regex.source, flags))) {
      const date = build(match);
      if (date) return date;
    }
  }
  return null;
}
|
|||
|
|
|
|||
|
|
/**
 * Symbols that several currencies share, with the ISO codes that can stand
 * behind them. Used to let an explicit code on the receipt override the
 * symbol's default mapping.
 */
const AMBIGUOUS_SYMBOL_CODES = new Map<string, ReadonlySet<string>>([
  ['$', new Set(['USD', 'CAD', 'AUD', 'NZD', 'HKD', 'SGD', 'MXN', 'BRL'])],
  ['¥', new Set(['JPY', 'CNY'])],
]);

/**
 * Detects a currency symbol or 3-letter ISO code anywhere in `text`.
 *
 * Symbols are checked first, in `CURRENCY_SYMBOLS` key order. When the symbol
 * found is shared by several currencies (`$`, `¥`) and the text also contains
 * the stand-alone ISO code of one of them (e.g. "CAD"), that explicit code
 * wins over the symbol's default. Without any symbol, the first stand-alone
 * uppercase token that is a known ISO code is returned.
 *
 * @param text Full OCR text of the receipt.
 * @returns An ISO 4217 code, or `null` when nothing recognisable is present.
 */
function extractCurrency(text: string): string | null {
  // Stand-alone uppercase 3-letter tokens, in order of appearance.
  const isoTokens = text.match(/\b[A-Z]{3}\b/g) ?? [];

  for (const [symbol, defaultCode] of Object.entries(CURRENCY_SYMBOLS)) {
    if (!text.includes(symbol)) continue;
    const family = AMBIGUOUS_SYMBOL_CODES.get(symbol);
    const explicit = family ? isoTokens.find((token) => family.has(token)) : undefined;
    return explicit ?? defaultCode;
  }

  return isoTokens.find((token) => CURRENCY_CODES.has(token)) ?? null;
}
|
|||
|
|
|
|||
|
|
/**
 * Extracts the receipt total. Strategy:
 *   1. Consider every line containing "total", "grand total", "total due",
 *      "balance due", "amount due" or "to pay", and take the largest number
 *      on that line. Lines labelled with one of the specific phrases outrank
 *      a plain "total"; among equally ranked lines the largest amount wins.
 *      "Sub total" / "Sub-total" labels do not count as a total marker, the
 *      same as "Subtotal" written as one word.
 *   2. Fall back to the largest number found anywhere on the receipt.
 *
 * @param lines Receipt text split into lines.
 * @returns The detected total, or `null` when the receipt contains no numbers.
 */
function extractAmount(lines: string[]): number | null {
  const totalMarker = /\b(grand\s*total|total\s*due|balance\s*due|amount\s*due|total|to\s*pay)\b/i;
  const finalTotalMarker = /grand\s*total|total\s*due|balance\s*due|amount\s*due|to\s*pay/i;
  // OCR frequently splits "Subtotal" into two words; `\btotal\b` would then
  // match it, and on a discounted receipt the (larger) subtotal would win.
  const splitSubtotal = /\bsub[\s-]+total\b/gi;

  let best: { amount: number; priority: number } | null = null;

  for (const line of lines) {
    // Blank out sub-total labels before looking for a total marker, so a line
    // carrying both a sub-total and a real total is still considered.
    const labelText = line.replace(splitSubtotal, ' ');
    if (!totalMarker.test(labelText)) continue;

    const numbers = extractNumbers(line);
    if (numbers.length === 0) continue;

    // Take the largest number on this line (subtotal+tax often appear before total).
    const amt = Math.max(...numbers);
    const priority = finalTotalMarker.test(labelText) ? 2 : 1;
    if (!best || priority > best.priority || (priority === best.priority && amt > best.amount)) {
      best = { amount: amt, priority };
    }
  }
  if (best) return best.amount;

  // Fallback: largest number on the whole receipt.
  const all = lines.flatMap((line) => extractNumbers(line));
  if (all.length === 0) return null;
  return Math.max(...all);
}
|
|||
|
|
|
|||
|
|
/**
 * One numeric token. Two shapes are accepted:
 *   - 1–3 digits, optional 3-digit groups, optional 1–2 digit fraction
 *     (`7`, `12.50`, `1,234.56`, `1.234,56`);
 *   - 4+ ungrouped digits followed by a 1–2 digit fraction (`1234.56`).
 *
 * Bare runs of 4+ digits (years, postal codes, phone numbers) are deliberately
 * not matched. The look-arounds keep the token from being a fragment of a
 * longer alphanumeric or dotted sequence — in particular the fraction of a
 * number we could not match as a whole is never returned on its own.
 */
const NUMBER_TOKEN =
  /(?<![A-Za-z0-9])(?<!\d[.,])-?(?:\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{1,2})?|\d{4,}[.,]\d{1,2})(?![A-Za-z0-9])(?![.,]\d)/g;

/**
 * Pulls numeric values out of a line, supporting `1,234.56`, `1.234,56` and
 * ungrouped amounts such as `1234.56`.
 *
 * @param line A single line of receipt text.
 * @returns Parsed values in order of appearance; values whose magnitude is
 *   below 0.01 and tokens that fail to parse are omitted.
 */
function extractNumbers(line: string): number[] {
  const out: number[] = [];
  for (const match of line.matchAll(NUMBER_TOKEN)) {
    const parsed = parseLocaleNumber(match[0]);
    if (parsed != null && Math.abs(parsed) >= 0.01) out.push(parsed);
  }
  return out;
}

/**
 * Parses a number written with `.` and/or `,` as separators.
 *
 * The last separator in the string is taken as the decimal point, unless that
 * same character occurs more than once — a repeated separator can only be a
 * grouping mark (`1,234,567`), in which case the value is an integer.
 *
 * A single separator followed by three digits (`1,234`) is ambiguous and is
 * still read as a decimal point.
 *
 * @param raw Numeric text, optionally signed.
 * @returns The parsed value, or `null` for empty or non-numeric input.
 */
function parseLocaleNumber(raw: string): number | null {
  const text = raw.trim();
  // `Number('')` is 0, which would pass for a successful parse.
  if (text === '') return null;

  const decimalAt = Math.max(text.lastIndexOf(','), text.lastIndexOf('.'));
  let cleaned: string;
  if (decimalAt === -1) {
    cleaned = text;
  } else if (text.indexOf(text.charAt(decimalAt)) !== decimalAt) {
    // Same separator seen earlier: all separators are grouping marks.
    cleaned = text.replace(/[.,]/g, '');
  } else {
    // 1.234,56 → 1234.56 and 1,234.56 → 1234.56
    const whole = text.slice(0, decimalAt).replace(/[.,]/g, '');
    cleaned = `${whole}.${text.slice(decimalAt + 1)}`;
  }

  const n = Number(cleaned);
  return Number.isFinite(n) ? n : null;
}
|
|||
|
|
|
|||
|
|
/**
 * Vendor heuristic: among the first six lines, the first one that is at least
 * 3 characters long, contains at least two letters, holds no date, and does
 * not start with a document label ("Receipt", "Invoice", "Order", …).
 * Receipts almost always print the merchant name at the top.
 *
 * @param lines Receipt text split into lines.
 * @returns The vendor line, trimmed and capped at 120 characters, or `null`.
 */
function extractVendor(lines: string[]): string | null {
  // The label must end at the keyword (optionally plural): "Order #12" and
  // "RECEIPT" are labels, "Ticketmaster" and "Orderly Foods" are merchants.
  const documentLabel = /^(receipt|invoice|tax invoice|order|ticket)s?(?!\p{L})/iu;

  for (const line of lines.slice(0, 6)) {
    const trimmed = line.trim();
    if (trimmed.length < 3) continue;
    // Vendor lines must include at least two letters — drops pure-punctuation
    // noise like "@@@" and divider rows like "===". Letters of any script
    // count, so accented or non-Latin merchant names are not discarded.
    if ((trimmed.match(/\p{L}/gu) ?? []).length < 2) continue;
    if (DATE_PATTERNS.some((p) => p.regex.test(trimmed))) continue;
    if (documentLabel.test(trimmed)) continue;
    return trimmed.slice(0, 120);
  }
  return null;
}
|
|||
|
|
|
|||
|
|
/**
 * Pulls line items: lines with both descriptive text and a trailing number.
 *
 * Lines are skipped when they mention totals/tax/payment words, contain a
 * date, start with a header label ("Date", "Tel", "Cashier", …), look like a
 * street address, or do not end in a stand-alone number.
 *
 * @param lines Receipt text split into lines.
 * @returns At most 20 items; each description is capped at 120 characters and
 *   the amount is the last number found on the line.
 */
function extractLineItems(lines: string[]): ParsedReceiptLineItem[] {
  const skipMarker = /\b(subtotal|tax|vat|gst|total|tip|service|change|cash|card|tend|due)\b/i;
  const headerLabel =
    /^\s*(date|time|tel|phone|store|store#|cashier|order|table|receipt|invoice)\b/i;
  const out: ParsedReceiptLineItem[] = [];

  for (const line of lines) {
    if (skipMarker.test(line)) continue;
    // Skip header-ish rows: dates, "Date:" / "Time:" labels.
    if (DATE_PATTERNS.some((p) => p.regex.test(line))) continue;
    if (headerLabel.test(line)) continue;
    // Skip lines that look like an address: leading street number, common suffixes.
    if (/^\s*\d+\s+\w/.test(line) && /\b(st|ave|blvd|rd|way|lane|ln|drive|dr)\b/i.test(line)) {
      continue;
    }

    const numbers = extractNumbers(line);
    if (numbers.length === 0) continue;

    // Line items always have the price at the END; if the only number is at
    // the start (e.g. street number), this isn't a line item.
    const priceStart = line.search(/[.,]?\d[\d.,]*\s*$/);
    if (priceStart === -1) continue;
    // Trailing digits glued to a letter ("A12", "x2") are a code, not a price.
    if (priceStart > 0 && /[A-Za-z0-9]/.test(line.charAt(priceStart - 1))) continue;

    const lastNum = numbers[numbers.length - 1]!;
    // Cut exactly where the trailing numeric token starts. Searching for the
    // digits of the parsed amount instead can land inside the price itself
    // ("Widget 5.35" → "Widget 5.3"). Leftover leaders, dashes and currency
    // symbols in front of the price are trimmed off the description.
    const description = line
      .slice(0, priceStart)
      .trim()
      .replace(/[.\-–—\s$€£¥₣₹₽₱₩]+$/, '');
    if (description.length < 2) continue;

    out.push({ description: description.slice(0, 120), amount: lastNum });
    if (out.length >= 20) break;
  }
  return out;
}
|
|||
|
|
|
|||
|
|
/**
 * Confidence = fraction of headline fields recovered, scaled by the OCR
 * engine's confidence.
 *
 * @param fields Headline fields; each truthy one counts as recovered.
 * @param ocrConfidence Engine confidence on a 0–100 scale. `null` and `NaN`
 *   both mean "not available" and leave the field score unscaled; other
 *   out-of-range values are clamped to 0–100.
 * @returns A score between 0 and 1, rounded to two decimals.
 */
function computeConfidence(
  fields: { vendor: unknown; date: unknown; amount: unknown },
  ocrConfidence: number | null,
): number {
  const recovered = [fields.vendor, fields.date, fields.amount].filter(Boolean).length;
  const fieldScore = recovered / 3;
  // NaN would survive the min/max clamp and turn the whole score into NaN.
  const ocrScore =
    ocrConfidence == null || Number.isNaN(ocrConfidence)
      ? 1
      : Math.max(0, Math.min(1, ocrConfidence / 100));
  return Number((fieldScore * ocrScore).toFixed(2));
}
|
|||
|
|
|
|||
|
|
/** Arguments accepted by `parseReceiptText`. */
export interface ParseReceiptInput {
  /** Raw OCR text of the whole receipt, lines separated by `\n` or `\r\n`. */
  text: string;
  /** Engine confidence, 0–100 from Tesseract; omit or pass `null` when unknown. */
  ocrConfidence?: number | null;
}
|
|||
|
|
|
|||
|
|
/**
 * Turns raw OCR text into a `ParsedReceipt`.
 *
 * The text is split into trimmed, non-empty lines for the line-based
 * extractors (vendor, amount, line items); date and currency detection scan
 * the untouched text.
 *
 * @param input Raw text plus the optional engine confidence (defaults to `null`).
 * @returns The extracted fields together with a confidence score.
 */
export function parseReceiptText({ text, ocrConfidence = null }: ParseReceiptInput): ParsedReceipt {
  const lines: string[] = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const trimmed = rawLine.trim();
    if (trimmed) lines.push(trimmed);
  }

  const establishment = extractVendor(lines);
  const date = extractDate(text);
  const amount = extractAmount(lines);

  return {
    establishment,
    date,
    amount,
    currency: extractCurrency(text),
    lineItems: extractLineItems(lines),
    confidence: computeConfidence({ vendor: establishment, date, amount }, ocrConfidence),
  };
}
|