/**
 * Heuristic parser for raw OCR text from a receipt image.
 *
 * Tesseract returns plain text - we extract structured fields (vendor, date,
 * amount, currency, line items) using regex/positional rules. The output
 * matches `ParsedReceipt` from `ocr-providers.ts` so callers don't need to
 * branch on which engine produced it.
 *
 * Confidence is computed from how many fields we managed to recover, scaled
 * by Tesseract's own overall confidence when provided.
 */
import type { ParsedReceipt, ParsedReceiptLineItem } from '@/lib/services/ocr-providers';

/** Common currency symbols mapped to their ISO 4217 code. */
const CURRENCY_SYMBOLS: Record<string, string> = {
  $: 'USD',
  '€': 'EUR',
  '£': 'GBP',
  '¥': 'JPY',
  '₣': 'CHF',
  '₹': 'INR',
  '₽': 'RUB',
  '₱': 'PHP',
  '₩': 'KRW',
};

/** ISO 4217 codes we recognize when they appear as a stand-alone token. */
const CURRENCY_CODES = new Set<string>([
  'USD', 'EUR', 'GBP', 'JPY', 'CHF', 'CAD', 'AUD', 'NZD', 'SEK', 'NOK',
  'DKK', 'PLN', 'CZK', 'HUF', 'INR', 'CNY', 'HKD', 'SGD', 'AED', 'ILS',
  'TRY', 'ZAR', 'BRL', 'MXN', 'RUB', 'KRW',
  // Kept in sync with CURRENCY_SYMBOLS: every symbol target is also a known code.
  'PHP',
]);

/** Three-letter English month prefix → zero-padded month number. */
const MONTH_NUMBERS: Record<string, string> = {
  jan: '01', feb: '02', mar: '03', apr: '04', may: '05', jun: '06',
  jul: '07', aug: '08', sep: '09', oct: '10', nov: '11', dec: '12',
};

/** Expands a two-digit year to 20xx; longer years are returned untouched. */
function expandYear(raw: string): string {
  return raw.length === 2 ? `20${raw}` : raw;
}

/**
 * Date patterns, tried in order; the first one that both matches and builds a
 * valid date wins. The regexes carry no `g` flag, so `.test()` on them is
 * stateless and they can be shared with the vendor/line-item heuristics.
 */
const DATE_PATTERNS: Array<{ regex: RegExp; build: (m: RegExpMatchArray) => string | null }> = [
  // ISO 2024-04-28
  {
    regex: /\b(\d{4})-(\d{1,2})-(\d{1,2})\b/,
    build: (m) => {
      const [, year = '', month = '', day = ''] = m;
      return normalizeDate(year, month, day);
    },
  },
  // 28/04/2024, 28.04.2024 or 28-04-2024
  {
    regex: /\b(\d{1,2})[/.\-](\d{1,2})[/.\-](\d{2,4})\b/,
    build: (m) => {
      const [, first = '', second = '', rawYear = ''] = m;
      const year = expandYear(rawYear);
      // DMY and MDY are indistinguishable when both parts are <= 12; DMY is
      // more common globally so it is tried first. When DMY is not a real
      // date (e.g. 04/28/2024) the MDY reading is unambiguous, so use it.
      return normalizeDate(year, second, first) ?? normalizeDate(year, first, second);
    },
  },
  // 28 Apr 2024 / 28-Apr-2024 / 28 April 2024
  {
    regex: /\b(\d{1,2})[\s-]+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*[\s-]+(\d{2,4})\b/i,
    build: (m) => {
      const [, day = '', monthName = '', rawYear = ''] = m;
      const month = MONTH_NUMBERS[monthName.toLowerCase().slice(0, 3)];
      if (!month) return null;
      return normalizeDate(expandYear(rawYear), month, day);
    },
  },
];

/**
 * Builds a `YYYY-MM-DD` string from loose parts.
 *
 * @returns the normalized date, or `null` when the parts do not form a real
 *   calendar date or the year falls outside 2000-2100.
 */
function normalizeDate(year: string, month: string, day: string): string | null {
  const y = year.padStart(4, '0');
  const m = month.padStart(2, '0');
  const d = day.padStart(2, '0');
  const candidate = `${y}-${m}-${d}`;
  // Round-trip through Date: a day such as Feb 31 either fails to parse or
  // rolls over into the next month, and both cases are rejected here.
  const parsed = new Date(candidate);
  if (Number.isNaN(parsed.getTime()) || parsed.toISOString().slice(0, 10) !== candidate) {
    return null;
  }
  // Don't accept implausibly old or future-dated receipts.
  const numericYear = Number(y);
  if (numericYear < 2000 || numericYear > 2100) return null;
  return candidate;
}

/** Pulls the first recognizable date out of `text`, or `null` if none is valid. */
function extractDate(text: string): string | null {
  for (const { regex, build } of DATE_PATTERNS) {
    const match = text.match(regex);
    if (!match) continue;
    const date = build(match);
    if (date) return date;
  }
  return null;
}

/**
 * Detects a currency anywhere in `text`. Symbols are checked first (in the
 * declaration order of `CURRENCY_SYMBOLS`), then stand-alone uppercase
 * three-letter tokens that are known ISO codes.
 */
function extractCurrency(text: string): string | null {
  for (const [symbol, code] of Object.entries(CURRENCY_SYMBOLS)) {
    if (text.includes(symbol)) return code;
  }
  for (const token of text.match(/\b[A-Z]{3}\b/g) ?? []) {
    if (CURRENCY_CODES.has(token)) return token;
  }
  return null;
}

/** A numeric token found in a line, with enough position info to slice around it. */
interface NumberToken {
  /** Parsed numeric value (always >= 0.01). */
  value: number;
  /** Offset of the token's first character in the line. */
  index: number;
  /** Offset just past the token's last character. */
  end: number;
  /** True when the token was written with a 1-2 digit decimal fraction. */
  hasFraction: boolean;
}

// NOTE(review): the number pattern in the previous revision of this file was
// truncated and could not be recovered; this is a reconstruction based on the
// documented contract ("supports `1,234.56` and `1.234,56`"). Confirm against
// version control.
//
// - first alternative: grouped thousands with optional fraction (1,234.56 / 1.234,56 / 1,234)
// - second alternative: plain digits with optional fraction (1234.56 / 12,5 / 42)
// - lookbehind: never start in the middle of a longer digit/separator run
// - lookahead: never stop in the middle of one, and skip unit-suffixed values
//   such as "0.5L" or "2x" as well as dotted dates such as "28.04.2024"
const NUMBER_TOKEN =
  /(?<![\d.,])(?:\d{1,3}(?:[.,]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{1,2})?)(?![.,]?[A-Za-z0-9])/g;

/** Integer part that is consistent with thousands grouping (no leading zero). */
const THOUSANDS_HEAD = /^[1-9]\d{0,2}(?:[.,]\d{3})*$/;

/** Finds every numeric token in `line`, left to right, dropping values below 0.01. */
function scanNumbers(line: string): NumberToken[] {
  const tokens: NumberToken[] = [];
  // matchAll works on a clone of the regex, so the shared constant keeps no state.
  for (const match of line.matchAll(NUMBER_TOKEN)) {
    const raw = match[0];
    const value = parseLocaleNumber(raw);
    if (value === null || value < 0.01) continue;
    const index = match.index ?? 0;
    tokens.push({
      value,
      index,
      end: index + raw.length,
      hasFraction: /[.,]\d{1,2}$/.test(raw),
    });
  }
  return tokens;
}

/**
 * Picks the most plausible money amount from a set of tokens: the largest
 * value written with a decimal fraction, or - when nothing has a fraction
 * (e.g. JPY receipts) - the largest value overall. Preferring fractions keeps
 * item counts, years and phone fragments from beating a real price.
 */
function largestAmount(tokens: NumberToken[]): number | null {
  if (tokens.length === 0) return null;
  const fractional = tokens.filter((t) => t.hasFraction);
  const pool = fractional.length > 0 ? fractional : tokens;
  return Math.max(...pool.map((t) => t.value));
}

const TOTAL_MARKER = /\b(grand\s*total|total\s*due|balance\s*due|amount\s*due|total|to\s*pay)\b/i;
const STRONG_TOTAL_MARKER = /grand\s*total|total\s*due|balance\s*due|amount\s*due|to\s*pay/i;
const SUBTOTAL_LABEL = /\bsub[\s-]*total\b/gi;

/**
 * Extracts the receipt total. Strategy:
 *   1. Look at lines containing "total", "amount due", "grand total",
 *      "balance due", "to pay". Explicit phrases ("grand total", "... due",
 *      "to pay") outrank a plain "total"; within the same rank the largest
 *      amount wins. "Sub total" / "sub-total" labels do not count as a total.
 *   2. Fall back to the largest decimal number on the receipt.
 *
 * @returns the amount, or `null` when the receipt contains no usable number.
 */
function extractAmount(lines: string[]): number | null {
  let best: { amount: number; priority: number } | null = null;
  for (const line of lines) {
    // Remove subtotal labels first so "Sub Total 20.00" is not mistaken for
    // the total, while "Subtotal 10.00 Total 11.00" on one line still counts.
    const labels = line.replace(SUBTOTAL_LABEL, ' ');
    if (!TOTAL_MARKER.test(labels)) continue;
    const amount = largestAmount(scanNumbers(line));
    if (amount === null) continue;
    const priority = STRONG_TOTAL_MARKER.test(labels) ? 2 : 1;
    if (
      !best ||
      priority > best.priority ||
      (priority === best.priority && amount > best.amount)
    ) {
      best = { amount, priority };
    }
  }
  if (best) return best.amount;
  // Fallback: largest decimal on the whole receipt.
  return largestAmount(lines.flatMap((line) => scanNumbers(line)));
}

/** Pulls numeric values out of a line, supporting `1,234.56` and `1.234,56`. */
function extractNumbers(line: string): number[] {
  return scanNumbers(line).map((t) => t.value);
}

/**
 * Parses a number written with either `,` or `.` as the decimal separator.
 *
 * The last separator decides: what follows it is the fraction, everything
 * before it is the integer part with grouping separators removed. The one
 * exception is a final group of exactly three digits after a well-formed
 * grouped integer (`1,234`, `1.234.567`), which is read as thousands - money
 * is practically never printed with three decimals, and zero-decimal
 * currencies print amounts exactly like this.
 *
 * @returns the parsed value, or `null` when `raw` is not numeric.
 */
function parseLocaleNumber(raw: string): number | null {
  const lastSeparator = Math.max(raw.lastIndexOf(','), raw.lastIndexOf('.'));
  let cleaned: string;
  if (lastSeparator === -1) {
    cleaned = raw;
  } else {
    const head = raw.slice(0, lastSeparator);
    const tail = raw.slice(lastSeparator + 1);
    const headDigits = head.replace(/[.,]/g, '');
    const isThousandsGroup = tail.length === 3 && THOUSANDS_HEAD.test(head);
    cleaned = isThousandsGroup ? `${headDigits}${tail}` : `${headDigits}.${tail}`;
  }
  const n = Number(cleaned);
  return Number.isFinite(n) ? n : null;
}

/**
 * Vendor heuristic: among the first six lines, the first one that is at least
 * 3 chars long, contains at least two ASCII letters, holds no date and is not
 * a boilerplate heading ("Receipt", "Invoice", ...). Receipts almost always
 * print the merchant name at the top.
 *
 * @returns the vendor name capped at 120 chars, or `null` if nothing qualifies.
 */
function extractVendor(lines: string[]): string | null {
  for (const line of lines.slice(0, 6)) {
    const trimmed = line.trim();
    if (trimmed.length < 3) continue;
    // Vendor lines must include at least two alphabetic characters - drops
    // pure-punctuation noise like "@@@" and divider rows like "===".
    if ((trimmed.match(/[A-Za-z]/g) ?? []).length < 2) continue;
    if (DATE_PATTERNS.some((p) => p.regex.test(trimmed))) continue;
    // The word boundary matters: "Ticketmaster" or "Orderly Cafe" are vendors.
    if (/^(receipt|invoice|tax invoice|order|ticket)\b/i.test(trimmed)) continue;
    return trimmed.slice(0, 120);
  }
  return null;
}

const LINE_ITEM_SKIP = /\b(subtotal|tax|vat|gst|total|tip|service|change|cash|card|tend|due)\b/i;
const HEADER_LABEL =
  /^\s*(date|time|tel|phone|store|store#|cashier|order|table|receipt|invoice)\b/i;
const STREET_NUMBER = /^\s*\d+\s+\w/;
const STREET_SUFFIX = /\b(st|ave|blvd|rd|way|lane|ln|drive|dr)\b/i;
/** Dot leaders, dashes, whitespace and currency symbols left dangling before the price. */
const DESCRIPTION_TRAILER = /[\s.\-–$€£¥₣₹₽₱₩]+$/;

/**
 * Pulls line items: lines with descriptive text followed by a trailing price.
 * Summary rows (totals, tax, payment), header rows, dates and address lines
 * are skipped. At most 20 items are returned.
 */
function extractLineItems(lines: string[]): ParsedReceiptLineItem[] {
  const items: ParsedReceiptLineItem[] = [];
  for (const line of lines) {
    if (LINE_ITEM_SKIP.test(line)) continue;
    // Skip header-ish rows: dates and "Date:" / "Time:" style labels.
    if (DATE_PATTERNS.some((p) => p.regex.test(line))) continue;
    if (HEADER_LABEL.test(line)) continue;
    // Skip lines that look like an address: leading street number, common suffixes.
    if (STREET_NUMBER.test(line) && STREET_SUFFIX.test(line)) continue;

    const tokens = scanNumbers(line);
    const price = tokens[tokens.length - 1];
    if (!price) continue;
    // Line items always have the price at the END; if anything other than
    // whitespace or stray punctuation follows the last number (e.g. the only
    // number is a leading street number), this isn't a line item.
    if (!/^[\s.,]*$/.test(line.slice(price.end))) continue;

    // Slice at the token's own offset. Searching the line for the amount's
    // digits instead can land inside the decimals ("Coffee 3.53" → "Coffee 3.5").
    const description = line.slice(0, price.index).trim().replace(DESCRIPTION_TRAILER, '');
    if (description.length < 2) continue;

    items.push({ description: description.slice(0, 120), amount: price.value });
    if (items.length >= 20) break;
  }
  return items;
}

/**
 * Confidence = fraction of headline fields recovered (vendor, date, amount),
 * scaled by the OCR confidence (treated as 1 when not provided), rounded to
 * two decimals.
 */
function computeConfidence(
  fields: { vendor: unknown; date: unknown; amount: unknown },
  ocrConfidence: number | null,
): number {
  const recovered = [fields.vendor, fields.date, fields.amount].filter(Boolean).length;
  const fieldScore = recovered / 3;
  // OCR confidence arrives on a 0-100 scale; clamp after scaling to 0-1.
  const ocrScore = ocrConfidence == null ? 1 : Math.max(0, Math.min(1, ocrConfidence / 100));
  return Number((fieldScore * ocrScore).toFixed(2));
}

export interface ParseReceiptInput {
  text: string;
  /** 0–100 from Tesseract, or null if we don't have it. */
  ocrConfidence?: number | null;
}

/**
 * Parses raw OCR text into a `ParsedReceipt`.
 *
 * Lines are trimmed and blank lines dropped before the per-line heuristics
 * run; date and currency detection work on the untouched text. Fields that
 * cannot be recovered are `null` (`lineItems` is an empty array).
 */
export function parseReceiptText({ text, ocrConfidence = null }: ParseReceiptInput): ParsedReceipt {
  const lines = text
    .split(/\r?\n/)
    .map((l) => l.trim())
    .filter(Boolean);

  const vendor = extractVendor(lines);
  const date = extractDate(text);
  const amount = extractAmount(lines);
  const currency = extractCurrency(text);
  const lineItems = extractLineItems(lines);
  const confidence = computeConfidence({ vendor, date, amount }, ocrConfidence);

  return {
    establishment: vendor,
    date,
    amount,
    currency,
    lineItems,
    confidence,
  };
}