Files
pn-new-crm/src/lib/ocr/parse-receipt-text.ts
Matt Ciaccio 8699f81879
Some checks failed
Build & Push Docker Images / lint (push) Failing after 1m18s
Build & Push Docker Images / build-and-push (push) Has been skipped
chore(style): codebase em-dash sweep + minor layout polish
Replaces every em-dash and en-dash with regular ASCII hyphens
across comments, JSX strings, and dev-facing logs. Mostly cosmetic
but stops the inconsistent mix that crept in over the last few
months (some files used em-dashes in comments, others didn't,
some used both).

Bundles two small dashboard-layout tweaks that touch a couple of
already-modified files:
- (dashboard)/layout.tsx main padding goes from p-6 to pt-3 px-6
  pb-6 so page content sits closer to the topbar.
- Sidebar now receives the ports list it needs for the footer
  port switcher.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 22:57:01 +02:00

303 lines
9.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Heuristic parser for raw OCR text from a receipt image.
*
* Tesseract returns plain text - we extract structured fields (vendor, date,
* amount, currency, line items) using regex/positional rules. The output
* matches `ParsedReceipt` from `ocr-providers.ts` so callers don't need to
* branch on which engine produced it.
*
* Confidence is computed from how many fields we managed to recover, scaled
* by Tesseract's own per-line confidence when provided.
*/
import type { ParsedReceipt, ParsedReceiptLineItem } from '@/lib/services/ocr-providers';
/**
 * Currency glyph → ISO 4217 code.
 *
 * Insertion order is significant: callers that walk the keys in sequence
 * return the code of the first glyph found in the text, so `$` is checked
 * before every other symbol.
 */
const CURRENCY_SYMBOLS: Record<string, string> = {
  '$': 'USD', // dollar sign - reported as US dollar
  '€': 'EUR', // euro
  '£': 'GBP', // pound sterling
  '¥': 'JPY', // yen sign - reported as Japanese yen
  '₣': 'CHF', // franc sign - reported as Swiss franc
  '₹': 'INR', // Indian rupee
  '₽': 'RUB', // Russian ruble
  '₱': 'PHP', // Philippine peso
  '₩': 'KRW', // South Korean won
};
/**
 * ISO 4217 alphabetic codes accepted when they show up as a stand-alone
 * upper-case token in the OCR text.
 *
 * Every code that `CURRENCY_SYMBOLS` can produce is also listed here, so a
 * receipt that prints the code instead of the glyph (e.g. "PHP" rather than
 * "₱") resolves to the same currency.
 */
const CURRENCY_CODES = new Set([
  'USD',
  'EUR',
  'GBP',
  'JPY',
  'CHF',
  'CAD',
  'AUD',
  'NZD',
  'SEK',
  'NOK',
  'DKK',
  'PLN',
  'CZK',
  'HUF',
  'INR',
  'CNY',
  'HKD',
  'SGD',
  'AED',
  'ILS',
  'TRY',
  'ZAR',
  'BRL',
  'MXN',
  'RUB',
  'KRW',
  // Previously missing even though the symbol map emits it for '₱'.
  'PHP',
]);
/** First three letters of a month name (lower-case) → two-digit month number. */
const MONTH_BY_PREFIX: Record<string, string> = {
  jan: '01',
  feb: '02',
  mar: '03',
  apr: '04',
  may: '05',
  jun: '06',
  jul: '07',
  aug: '08',
  sep: '09',
  oct: '10',
  nov: '11',
  dec: '12',
};

/** Expands a two-digit year to 20xx; any other length passes through unchanged. */
function expandYear(year: string): string {
  return year.length === 2 ? `20${year}` : year;
}

/**
 * Date patterns, tried in order; the first pattern that yields a valid date
 * wins. `build` turns a match into `YYYY-MM-DD`, or null when the matched
 * digits do not form a plausible date.
 *
 * The regexes are deliberately NOT global: other helpers call `.test()` on
 * them directly, and a global regex would carry `lastIndex` state between
 * calls.
 */
const DATE_PATTERNS: Array<{ regex: RegExp; build: (m: RegExpMatchArray) => string | null }> = [
  // ISO: 2024-04-28
  {
    regex: /\b(\d{4})-(\d{1,2})-(\d{1,2})\b/,
    build: (m) => normalizeDate(m[1]!, m[2]!, m[3]!),
  },
  // Numeric: 28/04/2024, 28-04-2024, 28.04.24
  {
    regex: /\b(\d{1,2})[/.\-](\d{1,2})[/.\-](\d{2,4})\b/,
    build: (m) => {
      const first = m[1]!;
      const second = m[2]!;
      const year = expandYear(m[3]!);
      // DMY and MDY are indistinguishable when both fields are <= 12, so DMY
      // (more common globally) is tried first. When DMY is impossible
      // (e.g. 04/28/2024) the only remaining reading is MDY, so fall back
      // to it instead of discarding the date.
      return normalizeDate(year, second, first) ?? normalizeDate(year, first, second);
    },
  },
  // Month name: 28 Apr 2024, 28-Apr-2024, 28.Apr.2024, 28 April 24
  {
    regex:
      /\b(\d{1,2})[\s.\-/]+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*[\s.\-/]+(\d{2,4})\b/i,
    build: (m) => {
      const month = MONTH_BY_PREFIX[m[2]!.toLowerCase().slice(0, 3)];
      if (!month) return null;
      return normalizeDate(expandYear(m[3]!), month, m[1]!);
    },
  },
];

/**
 * Validates year/month/day digit strings and formats them as `YYYY-MM-DD`.
 *
 * @returns the ISO date, or null when the parts do not form a real calendar
 *   day or the year falls outside 2000-2100.
 */
function normalizeDate(year: string, month: string, day: string): string | null {
  const candidate = `${year.padStart(4, '0')}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
  // Round-trip through Date: a date-only ISO string is parsed as UTC and
  // toISOString() prints UTC, so any mismatch means the day overflowed
  // (e.g. Feb 30) rather than a timezone shift.
  const parsed = new Date(candidate);
  if (Number.isNaN(parsed.getTime())) return null;
  if (parsed.toISOString().slice(0, 10) !== candidate) return null;
  // Don't accept implausibly old or future-dated receipts.
  const yr = Number(year);
  if (yr < 2000 || yr > 2100) return null;
  return candidate;
}

/**
 * Pulls the first recognizable date out of `text`.
 *
 * Patterns are tried in `DATE_PATTERNS` order. Within a pattern every match
 * is considered, not just the first, so a date-shaped fragment that fails
 * validation (a phone number such as `03.88.12`) does not mask a real date
 * later in the text.
 *
 * @returns the date as `YYYY-MM-DD`, or null when nothing valid is found.
 */
function extractDate(text: string): string | null {
  for (const { regex, build } of DATE_PATTERNS) {
    // matchAll requires a global regex; clone so the shared pattern stays
    // stateless for the `.test()` callers.
    const flags = regex.flags.includes('g') ? regex.flags : `${regex.flags}g`;
    for (const match of text.matchAll(new RegExp(regex.source, flags))) {
      const date = build(match);
      if (date) return date;
    }
  }
  return null;
}
/**
 * Finds the receipt currency from a symbol or a 3-letter ISO code in `text`.
 *
 * Symbols take precedence and are checked in `CURRENCY_SYMBOLS` key order;
 * only when none is present are stand-alone upper-case 3-letter tokens
 * compared against `CURRENCY_CODES`, left to right.
 *
 * @returns the ISO 4217 code, or null when neither form is found.
 */
function extractCurrency(text: string): string | null {
  const symbol = Object.keys(CURRENCY_SYMBOLS).find((candidate) => text.includes(candidate));
  if (symbol !== undefined) return CURRENCY_SYMBOLS[symbol]!;

  // Match a stand-alone uppercase 3-letter token.
  const tokens = text.match(/\b([A-Z]{3})\b/g) ?? [];
  return tokens.find((token) => CURRENCY_CODES.has(token)) ?? null;
}
/**
 * Extracts the receipt total.
 *
 * 1. Every line mentioning a total ("total", "grand total", "total due",
 *    "balance due", "amount due", "to pay") contributes its largest number
 *    as a candidate. Explicit final-total wording outranks a bare "total";
 *    among candidates of equal rank the larger amount wins.
 * 2. If no such line carries a number, the largest number found anywhere
 *    on the receipt is returned.
 *
 * @returns the chosen amount, or null when the receipt has no numbers.
 */
function extractAmount(lines: string[]): number | null {
  const totalMarker = /\b(grand\s*total|total\s*due|balance\s*due|amount\s*due|total|to\s*pay)\b/i;
  const strongMarker = /grand\s*total|total\s*due|balance\s*due|amount\s*due|to\s*pay/i;

  let bestAmount: number | null = null;
  let bestPriority = 0;

  for (const line of lines) {
    if (!totalMarker.test(line)) continue;

    const candidates = extractNumbers(line);
    if (candidates.length === 0) continue;

    // Subtotal and tax often share the line and print before the total.
    const amount = Math.max(...candidates);
    const priority = strongMarker.test(line) ? 2 : 1;

    const outranks =
      bestAmount === null ||
      priority > bestPriority ||
      (priority === bestPriority && amount > bestAmount);
    if (outranks) {
      bestAmount = amount;
      bestPriority = priority;
    }
  }

  if (bestAmount !== null) return bestAmount;

  // No usable total line: fall back to the largest number on the receipt.
  const everyNumber = lines.flatMap((line) => extractNumbers(line));
  return everyNumber.length === 0 ? null : Math.max(...everyNumber);
}
/**
 * Pulls numeric values out of a line.
 *
 * Recognised shapes:
 * - grouped thousands with optional decimals: `1,234.56`, `1.234,56`, `1,234,567`
 * - short plain numbers with optional decimals: `7`, `12.50`, `3,5`
 * - ungrouped amounts of four or more digits WITH decimals: `1234.56`
 *   (previously the integer part was skipped and only `56` was returned)
 *
 * Bare integers of four or more digits stay excluded so that postcodes,
 * phone numbers and barcodes are not reported as amounts. Tokens glued to a
 * letter or another digit are ignored, and values whose magnitude is below
 * 0.01 are dropped.
 */
function extractNumbers(line: string): number[] {
  const out: number[] = [];
  const re =
    /(?<![A-Za-z0-9])-?(?:\d{4,}[.,]\d{1,2}|\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{1,2})?)(?![A-Za-z0-9])/g;
  for (const match of line.matchAll(re)) {
    const parsed = parseLocaleNumber(match[0]);
    if (parsed != null && Math.abs(parsed) >= 0.01) out.push(parsed);
  }
  return out;
}

/**
 * Converts a numeric token that may use `,` or `.` as the decimal or the
 * thousands separator into a number.
 *
 * The separator that appears last is taken as the decimal mark, unless it
 * occurs more than once: a decimal mark cannot repeat, so in that case it is
 * a thousands separator (`1,234,567` → 1234567). Repeated-separator tokens
 * that also contain the other separator, or whose groups are not exactly
 * three digits, are rejected.
 *
 * @returns the parsed value, or null when the token is not a finite number.
 */
function parseLocaleNumber(raw: string): number | null {
  const lastComma = raw.lastIndexOf(',');
  const lastDot = raw.lastIndexOf('.');
  let cleaned: string;
  if (lastComma === -1 && lastDot === -1) {
    cleaned = raw;
  } else {
    const lastSep = lastComma > lastDot ? ',' : '.';
    const otherSep = lastSep === ',' ? '.' : ',';
    const pieces = raw.split(lastSep);
    if (pieces.length > 2) {
      // Repeated separator: only valid as pure thousands grouping.
      if (raw.includes(otherSep)) return null;
      if (!pieces.slice(1).every((group) => group.length === 3)) return null;
      cleaned = pieces.join('');
    } else {
      // Single occurrence: it is the decimal mark, the other one groups.
      // 1.234,56 → 1234.56 and 1,234.56 → 1234.56
      cleaned = raw.split(otherSep).join('').replace(lastSep, '.');
    }
  }
  const n = Number(cleaned);
  return Number.isFinite(n) ? n : null;
}
/**
 * Vendor heuristic: receipts almost always print the merchant name at the
 * top, so only the first six lines are inspected. The first one that
 * survives all of the filters below is returned, capped at 120 characters:
 *
 * - at least 3 characters after trimming
 * - at least two ASCII letters (drops noise like "@@@" and dividers like "===")
 * - does not contain a date
 * - does not start with a document label (receipt, invoice, order, ticket...)
 *
 * @returns the vendor line, or null when no line qualifies.
 */
function extractVendor(lines: string[]): string | null {
  const documentLabel = /^(receipt|invoice|tax invoice|order|ticket)/i;

  const vendorLine = lines
    .slice(0, 6)
    .map((line) => line.trim())
    .find((text) => {
      if (text.length < 3) return false;
      const letterCount = (text.match(/[A-Za-z]/g) ?? []).length;
      if (letterCount < 2) return false;
      if (DATE_PATTERNS.some(({ regex }) => regex.test(text))) return false;
      return !documentLabel.test(text);
    });

  return vendorLine === undefined ? null : vendorLine.slice(0, 120);
}
/**
 * Returns the index where the numeric token beginning with `digits` starts
 * in `line`, searching from the right. Occurrences that sit in the middle of
 * a number (preceded by a digit, `.` or `,`) are skipped, so the digits of a
 * price's own decimal part are never mistaken for its start.
 *
 * @returns the start index, or -1 when no such occurrence exists.
 */
function findNumberStart(line: string, digits: string): number {
  let at = line.lastIndexOf(digits);
  while (at >= 0) {
    if (at === 0 || !/[\d.,]/.test(line[at - 1]!)) return at;
    at = line.lastIndexOf(digits, at - 1);
  }
  return -1;
}

/**
 * Pulls line items: lines with both descriptive text and a trailing number.
 *
 * Lines are skipped when they mention totals/tax/payment keywords, contain a
 * date, start with a header label ("Date", "Tel", "Cashier"...), or look
 * like a street address. For the remaining lines the last number is the
 * amount and the text before it is the description (capped at 120
 * characters). At most 20 items are returned.
 */
function extractLineItems(lines: string[]): ParsedReceiptLineItem[] {
  const skipMarker = /\b(subtotal|tax|vat|gst|total|tip|service|change|cash|card|tend|due)\b/i;
  const out: ParsedReceiptLineItem[] = [];
  for (const line of lines) {
    if (skipMarker.test(line)) continue;
    // Skip header-ish rows: dates, postal codes, "Date:" / "Time:" labels.
    if (DATE_PATTERNS.some((p) => p.regex.test(line))) continue;
    if (
      /^\s*(date|time|tel|phone|store|store#|cashier|order|table|receipt|invoice)\b/i.test(line)
    ) {
      continue;
    }
    // Skip lines that look like an address: leading street number, common suffixes.
    if (/^\s*\d+\s+\w/.test(line) && /\b(st|ave|blvd|rd|way|lane|ln|drive|dr)\b/i.test(line)) {
      continue;
    }
    const numbers = extractNumbers(line);
    if (numbers.length === 0) continue;
    // Line items always have the price at the END; if the only number is at
    // the start (e.g. street number), this isn't a line item.
    const trailingNumber = /[.,]?\d[\d.,]*\s*$/.test(line);
    if (!trailingNumber) continue;
    const lastNum = numbers[numbers.length - 1]!;
    // Locate the price by its integer part. A plain lastIndexOf could land
    // inside the decimals ("Widget 5.45" → the final "5"), leaking part of
    // the price into the description, so only token starts are accepted.
    const integerPart = String(lastNum).replace(/\.\d+$/, '');
    const idx = findNumberStart(line, integerPart);
    // When the integer part is not found verbatim (e.g. "1,234.56" parses to
    // 1234.56) fall back to stripping the trailing run of digits/separators.
    const description = (idx > 0 ? line.slice(0, idx) : line.replace(/[\d.,]+$/, ''))
      .trim()
      // Drop trailing dot leaders, dashes and whitespace.
      .replace(/[.\-\s]+$/, '');
    if (description.length < 2) continue;
    out.push({ description: description.slice(0, 120), amount: lastNum });
    if (out.length >= 20) break;
  }
  return out;
}
/**
 * Scores a parse between 0 and 1, rounded to two decimals.
 *
 * The score is the share of the three headline fields (vendor, date,
 * amount) that hold a truthy value, multiplied by the OCR confidence
 * rescaled from 0-100 to 0-1 and clamped to that range. A null/undefined
 * OCR confidence counts as 1.
 */
function computeConfidence(
  fields: { vendor: unknown; date: unknown; amount: unknown },
  ocrConfidence: number | null,
): number {
  let recovered = 0;
  for (const value of [fields.vendor, fields.date, fields.amount]) {
    if (value) recovered += 1;
  }
  const fieldScore = recovered / 3;

  let ocrScore = 1;
  if (ocrConfidence != null) {
    ocrScore = Math.max(0, Math.min(1, ocrConfidence / 100));
  }

  return Number((fieldScore * ocrScore).toFixed(2));
}
/** Input accepted by `parseReceiptText`. */
export interface ParseReceiptInput {
/** Raw OCR text of the receipt; the parser splits it into lines on `\n` / `\r\n`. */
text: string;
/** 0-100 from Tesseract, or null if we don't have it. */
ocrConfidence?: number | null;
}
/**
 * Turns raw OCR text into a `ParsedReceipt`.
 *
 * The text is split into trimmed, non-empty lines for the line-oriented
 * extractors (vendor, amount, line items); date and currency detection run
 * on the unsplit text. Fields that cannot be recovered are null, and
 * `confidence` reflects how many headline fields were found, scaled by
 * `ocrConfidence` when one is supplied.
 */
export function parseReceiptText({ text, ocrConfidence = null }: ParseReceiptInput): ParsedReceipt {
  const lines: string[] = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line) lines.push(line);
  }

  const establishment = extractVendor(lines);
  const date = extractDate(text);
  const amount = extractAmount(lines);

  return {
    establishment,
    date,
    amount,
    currency: extractCurrency(text),
    lineItems: extractLineItems(lines),
    confidence: computeConfidence({ vendor: establishment, date, amount }, ocrConfidence),
  };
}