src/lib/ocr/parse-receipt-text.ts

/**
 * Heuristic parser for raw OCR text from a receipt image.
 *
 * Tesseract returns plain text — we extract structured fields (vendor, date,
 * amount, currency, line items) using regex/positional rules. The output
 * matches `ParsedReceipt` from `ocr-providers.ts` so callers don't need to
 * branch on which engine produced it.
 *
 * Confidence is computed from how many fields we managed to recover, scaled
 * by Tesseract's own per-line confidence when provided.
 */

import type { ParsedReceipt, ParsedReceiptLineItem } from '@/lib/services/ocr-providers';

/**
 * Currency symbol → ISO 4217 code.
 *
 * `extractCurrency` walks these keys in insertion order and returns the code
 * of the first symbol that occurs anywhere in the text, so when a receipt
 * contains several symbols the one listed earlier in this map wins.
 */
const CURRENCY_SYMBOLS: Record<string, string> = {
  $: 'USD', // a bare "$" is always reported as USD
  '€': 'EUR',
  '£': 'GBP',
  '¥': 'JPY',
  '₣': 'CHF',
  '₹': 'INR',
  '₽': 'RUB',
  '₱': 'PHP',
  '₩': 'KRW',
};

/**
 * Upper-case ISO 4217 codes accepted when they appear as a stand-alone
 * three-letter token in the OCR text.
 *
 * Every code that `CURRENCY_SYMBOLS` can produce is listed here as well, so a
 * receipt that prints the code ("PHP 250.00") is recognized just like one
 * that prints the symbol ("₱250.00").
 */
const CURRENCY_CODES = new Set([
  'USD',
  'EUR',
  'GBP',
  'JPY',
  'CHF',
  'CAD',
  'AUD',
  'NZD',
  'SEK',
  'NOK',
  'DKK',
  'PLN',
  'CZK',
  'HUF',
  'INR',
  'CNY',
  'HKD',
  'SGD',
  'AED',
  'ILS',
  'TRY',
  'ZAR',
  'BRL',
  'MXN',
  'RUB',
  'KRW',
  'PHP',
]);

/** Lower-case three-letter month abbreviation → two-digit month number. */
const MONTH_NUMBERS: Record<string, string> = {
  jan: '01',
  feb: '02',
  mar: '03',
  apr: '04',
  may: '05',
  jun: '06',
  jul: '07',
  aug: '08',
  sep: '09',
  oct: '10',
  nov: '11',
  dec: '12',
};

/** Expands a two-digit year to 20xx; any other length is returned unchanged. */
function expandTwoDigitYear(year: string): string {
  return year.length === 2 ? `20${year}` : year;
}

/** Maps a month name or abbreviation ("Apr", "april") to "04", or null if unknown. */
function monthNumberFromName(name: string): string | null {
  return MONTH_NUMBERS[name.toLowerCase().slice(0, 3)] ?? null;
}

/**
 * Date formats we recognize, in priority order. `extractDate` tries every
 * occurrence of a pattern before moving on to the next pattern.
 *
 * The regexes are intentionally NOT global: other helpers in this file call
 * `regex.test(line)` on them, and a global regex would carry `lastIndex`
 * state between those calls.
 */
const DATE_PATTERNS: Array<{ regex: RegExp; build: (m: RegExpMatchArray) => string | null }> = [
  // ISO: 2024-04-28
  {
    regex: /\b(\d{4})-(\d{1,2})-(\d{1,2})\b/,
    build: (m) => normalizeDate(m[1]!, m[2]!, m[3]!),
  },
  // Numeric: 28/04/2024, 28-04-2024, 28.04.24
  {
    regex: /\b(\d{1,2})[/.\-](\d{1,2})[/.\-](\d{2,4})\b/,
    build: (m) => {
      const first = m[1]!;
      const second = m[2]!;
      const y = expandTwoDigitYear(m[3]!);
      // DMY and MDY are indistinguishable when both parts are <= 12, so we
      // prefer DMY. When DMY is impossible (e.g. 04/28/2024) we fall back to
      // MDY instead of discarding an otherwise valid date.
      return normalizeDate(y, second, first) ?? normalizeDate(y, first, second);
    },
  },
  // Day-first month name: 28 Apr 2024, 28-Apr-2024, 28 April 24, 28 Apr. 2024
  {
    regex:
      /\b(\d{1,2})[\s-]+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?[\s-]+(\d{2,4})\b/i,
    build: (m) => {
      const mo = monthNumberFromName(m[2]!);
      return mo ? normalizeDate(expandTwoDigitYear(m[3]!), mo, m[1]!) : null;
    },
  },
  // Month-first name: Apr 28, 2024 / April 28th 2024 (four-digit year required
  // to keep this looser shape from matching ordinary text).
  {
    regex:
      /\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?\s+(\d{1,2})(?:st|nd|rd|th)?,?\s+(\d{4})\b/i,
    build: (m) => {
      const mo = monthNumberFromName(m[1]!);
      return mo ? normalizeDate(m[3]!, mo, m[2]!) : null;
    },
  },
];

/**
 * Builds a validated `YYYY-MM-DD` string from loose year/month/day parts.
 *
 * @param year  Year digits (left-padded to four characters).
 * @param month Month digits (left-padded to two characters).
 * @param day   Day digits (left-padded to two characters).
 * @returns The normalized date, or null when the parts do not form a real
 *          calendar date or the year falls outside 2000–2100.
 */
function normalizeDate(year: string, month: string, day: string): string | null {
  const y = year.padStart(4, '0');
  const m = month.padStart(2, '0');
  const d = day.padStart(2, '0');
  const candidate = `${y}-${m}-${d}`;
  // Round-trip through Date: impossible days either fail to parse or roll
  // over into the next month, and both cases fail the equality check.
  const t = new Date(candidate);
  if (Number.isNaN(t.getTime()) || t.toISOString().slice(0, 10) !== candidate) return null;
  // Don't accept implausibly old or far-future receipts.
  const yr = Number(y);
  if (yr < 2000 || yr > 2100) return null;
  return candidate;
}

/**
 * Pulls the first recognizable date out of `text`.
 *
 * Patterns are tried in `DATE_PATTERNS` order; within a pattern every
 * occurrence is examined, so a date-like token that fails validation (a phone
 * number such as "12-34-5678") no longer hides a real date later in the text.
 *
 * @returns The date as `YYYY-MM-DD`, or null when nothing valid is found.
 */
function extractDate(text: string): string | null {
  for (const { regex, build } of DATE_PATTERNS) {
    // Scan with a global copy so the shared pattern object stays stateless.
    const flags = regex.flags.includes('g') ? regex.flags : `${regex.flags}g`;
    for (const m of text.matchAll(new RegExp(regex.source, flags))) {
      const d = build(m);
      if (d) return d;
    }
  }
  return null;
}

/**
 * Determines the receipt currency from `text`.
 *
 * A known currency symbol anywhere in the text takes precedence (checked in
 * `CURRENCY_SYMBOLS` key order). Otherwise the first stand-alone upper-case
 * three-letter token that is a known ISO code is used.
 *
 * @returns The ISO 4217 code, or null when neither a symbol nor a code is found.
 */
function extractCurrency(text: string): string | null {
  const symbolHit = Object.entries(CURRENCY_SYMBOLS).find(([symbol]) => text.includes(symbol));
  if (symbolHit) return symbolHit[1];

  // Stand-alone upper-case 3-letter tokens, in order of appearance.
  const tokens = text.match(/\b([A-Z]{3})\b/g) ?? [];
  return tokens.find((token) => CURRENCY_CODES.has(token)) ?? null;
}

/**
 * Extracts the receipt total.
 *
 * 1. Every line carrying a total-like label ("total", "grand total",
 *    "total due", "balance due", "amount due", "to pay") and at least one
 *    number is a candidate; its amount is the largest number on that line.
 *    Lines with an explicit final-total label outrank a plain "total", and
 *    among equally ranked lines the larger amount wins.
 * 2. With no labelled candidate, the largest number anywhere on the receipt
 *    is returned.
 *
 * @returns The chosen amount, or null when the receipt contains no numbers.
 */
function extractAmount(lines: string[]): number | null {
  const anyTotalLabel = /\b(grand\s*total|total\s*due|balance\s*due|amount\s*due|total|to\s*pay)\b/i;
  const finalTotalLabel = /grand\s*total|total\s*due|balance\s*due|amount\s*due|to\s*pay/i;

  let winner: { amount: number; priority: number } | undefined;
  for (const line of lines) {
    if (!anyTotalLabel.test(line)) continue;

    const values = extractNumbers(line);
    if (values.length === 0) continue;

    const candidate = {
      amount: Math.max(...values),
      priority: finalTotalLabel.test(line) ? 2 : 1,
    };
    const beatsWinner =
      winner === undefined ||
      candidate.priority > winner.priority ||
      (candidate.priority === winner.priority && candidate.amount > winner.amount);
    if (beatsWinner) winner = candidate;
  }
  if (winner !== undefined) return winner.amount;

  // No labelled total: fall back to the biggest number seen on any line.
  const everyNumber = lines.flatMap((line) => extractNumbers(line));
  return everyNumber.length > 0 ? Math.max(...everyNumber) : null;
}

/**
 * Pulls numeric values out of a line.
 *
 * Recognized shapes:
 *   - grouped amounts: `1,234.56`, `1.234,56`, `1,234,567`
 *   - short plain numbers: `7`, `12.5`, `4,50`
 *   - ungrouped amounts of four or more digits, but only with a 1–2 digit
 *     fraction: `1250.00`. Bare long integers (order numbers, postcodes,
 *     years) are deliberately still ignored so they cannot be mistaken for
 *     an amount.
 *
 * A number must not be glued to a letter or digit on either side, and values
 * whose magnitude is below 0.01 are dropped.
 *
 * @returns The parsed numbers in order of appearance.
 */
function extractNumbers(line: string): number[] {
  const out: number[] = [];
  const re =
    /(?<![A-Za-z0-9])-?(?:\d{4,}[.,]\d{1,2}|\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{1,2})?)(?![A-Za-z0-9])/g;
  for (const match of line.matchAll(re)) {
    const parsed = parseLocaleNumber(match[0]);
    if (parsed != null && Math.abs(parsed) >= 0.01) out.push(parsed);
  }
  return out;
}

/**
 * Converts a numeric token that may use `,` or `.` for grouping or decimals
 * into a number.
 *
 * Rules, applied to the LAST separator in the token:
 *   - if that separator character occurs more than once it can only be a
 *     grouping separator (`1,234,567` → 1234567);
 *   - a lone comma followed by exactly three digits after a non-zero 1–3
 *     digit head is treated as grouping (`12,000` → 12000), which is how
 *     whole-unit amounts are usually printed; `0,125` stays a decimal;
 *   - otherwise it is the decimal separator and every earlier separator is
 *     grouping (`1.234,56` and `1,234.56` → 1234.56).
 *
 * A lone dot followed by three digits (`3.459`) is ambiguous and is kept as
 * a decimal.
 *
 * @returns The parsed value, or null when the token is not a finite number.
 */
function parseLocaleNumber(raw: string): number | null {
  const lastComma = raw.lastIndexOf(',');
  const lastDot = raw.lastIndexOf('.');
  const lastSep = Math.max(lastComma, lastDot);

  let cleaned: string;
  if (lastSep === -1) {
    cleaned = raw;
  } else {
    const sepChar = raw.charAt(lastSep);
    const repeated = raw.indexOf(sepChar) !== lastSep;
    const tailLength = raw.length - lastSep - 1;
    const head = raw.slice(0, lastSep).replace(/^-/, '');
    const loneCommaGroup =
      sepChar === ',' && lastDot === -1 && tailLength === 3 && /^[1-9]\d{0,2}$/.test(head);

    if (repeated || loneCommaGroup) {
      // Every separator is a grouping separator: drop them all.
      cleaned = raw.replace(/[.,]/g, '');
    } else {
      // Last separator is the decimal point; earlier ones are grouping.
      const integerPart = raw.slice(0, lastSep).replace(/[.,]/g, '');
      cleaned = `${integerPart}.${raw.slice(lastSep + 1)}`;
    }
  }
  const n = Number(cleaned);
  return Number.isFinite(n) ? n : null;
}

/**
 * Vendor heuristic: receipts almost always print the merchant name at the
 * top, so the first plausible line among the first six is returned.
 *
 * A line is rejected when it:
 *   - is shorter than 3 characters,
 *   - contains fewer than two letters (any script), which drops noise such
 *     as "@@@" and divider rows such as "===",
 *   - contains something that looks like a date, or
 *   - starts with a document-type word (receipt, invoice, tax invoice,
 *     order, ticket) that is not merely the prefix of a longer word, so
 *     "Order #12" is skipped but "Ticketmaster" is kept.
 *
 * @returns The vendor line (at most 120 characters), or null if none qualifies.
 */
function extractVendor(lines: string[]): string | null {
  for (const line of lines.slice(0, 6)) {
    const trimmed = line.trim();
    if (trimmed.length < 3) continue;
    // Count letters with \p{L} so names in non-Latin scripts are not discarded.
    if ((trimmed.match(/\p{L}/gu) ?? []).length < 2) continue;
    if (DATE_PATTERNS.some((p) => p.regex.test(trimmed))) continue;
    // The header word must not be followed by another letter; a following
    // digit or symbol ("INVOICE2024", "RECEIPT#12") still counts as a header.
    if (/^(receipt|invoice|tax invoice|order|ticket)(?!\p{L})/iu.test(trimmed)) continue;
    return trimmed.slice(0, 120);
  }
  return null;
}

/**
 * Pulls line items: lines that pair descriptive text with a trailing price.
 *
 * Lines are skipped when they mention totals/taxes/payment words, contain a
 * date, start with a header label (date, time, tel, ...), or look like a
 * street address. The amount is the last number found on the line; the
 * description is the text in front of the trailing numeric run, with dot
 * leaders, dashes, whitespace and a price-prefixing currency symbol removed.
 *
 * @returns At most 20 items, in receipt order.
 */
function extractLineItems(lines: string[]): ParsedReceiptLineItem[] {
  const skipMarker = /\b(subtotal|tax|vat|gst|total|tip|service|change|cash|card|tend|due)\b/i;
  const headerLabel =
    /^\s*(date|time|tel|phone|store|store#|cashier|order|table|receipt|invoice)\b/i;
  const leadingStreetNumber = /^\s*\d+\s+\w/;
  const streetSuffix = /\b(st|ave|blvd|rd|way|lane|ln|drive|dr)\b/i;

  const out: ParsedReceiptLineItem[] = [];
  for (const line of lines) {
    if (skipMarker.test(line)) continue;
    // Skip header-ish rows: dates and "Date:" / "Time:" style labels.
    if (DATE_PATTERNS.some((p) => p.regex.test(line))) continue;
    if (headerLabel.test(line)) continue;
    // Skip lines that look like an address: leading street number plus a common suffix.
    if (leadingStreetNumber.test(line) && streetSuffix.test(line)) continue;

    const numbers = extractNumbers(line);
    if (numbers.length === 0) continue;
    // Line items have the price at the END; if the only number is elsewhere
    // (e.g. a street number), this isn't a line item.
    const trailingNumber = /[.,]?\d[\d.,]*\s*$/.test(line);
    if (!trailingNumber) continue;

    const lastNum = numbers[numbers.length - 1]!;
    // Cut the trailing numeric run off by position rather than searching for
    // the price's digits: a digit search lands inside the price whenever the
    // fraction repeats the integer part ("Tea 2.20" used to yield "Tea 2").
    const description = line
      .replace(/[\d.,]+\s*$/, '')
      .trim()
      .replace(/[.\-–—\s$€£¥₣₹₽₱₩]+$/, '');
    if (description.length < 2) continue;

    out.push({ description: description.slice(0, 120), amount: lastNum });
    if (out.length >= 20) break;
  }
  return out;
}

/**
 * Confidence = fraction of headline fields recovered, scaled by the OCR
 * engine's own confidence.
 *
 * @param fields        The three headline fields; each truthy one counts as recovered.
 * @param ocrConfidence Engine confidence on a 0–100 scale (clamped to that
 *                      range). `null` or `NaN` means "not available" and
 *                      applies no scaling, so a missing engine score can
 *                      never turn the result into NaN.
 * @returns A value between 0 and 1, rounded to two decimals.
 */
function computeConfidence(
  fields: { vendor: unknown; date: unknown; amount: unknown },
  ocrConfidence: number | null,
): number {
  const recovered = [fields.vendor, fields.date, fields.amount].filter(Boolean).length;
  const fieldScore = recovered / 3;
  const ocrScore =
    ocrConfidence == null || Number.isNaN(ocrConfidence)
      ? 1
      : Math.max(0, Math.min(1, ocrConfidence / 100));
  return Number((fieldScore * ocrScore).toFixed(2));
}

/** Input accepted by `parseReceiptText`. */
export interface ParseReceiptInput {
  /** Raw OCR output; `parseReceiptText` splits it into lines on `\n` / `\r\n`. */
  text: string;
  /**
   * 0–100 from Tesseract, or null if we don't have it. Omitting the field is
   * the same as passing null.
   */
  ocrConfidence?: number | null;
}

/**
 * Turns raw OCR text into a structured `ParsedReceipt`.
 *
 * The text is split into trimmed, non-empty lines for the line-oriented
 * heuristics (vendor, amount, line items); date and currency detection scan
 * the full text. Fields that cannot be recovered come back as null (or an
 * empty array for line items).
 *
 * @param input.text          Raw OCR output.
 * @param input.ocrConfidence Optional 0–100 engine confidence used to scale the result's confidence.
 */
export function parseReceiptText({ text, ocrConfidence = null }: ParseReceiptInput): ParsedReceipt {
  const lines: string[] = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const stripped = rawLine.trim();
    if (stripped) lines.push(stripped);
  }

  const establishment = extractVendor(lines);
  const date = extractDate(text);
  const amount = extractAmount(lines);
  const currency = extractCurrency(text);
  const lineItems = extractLineItems(lines);

  return {
    establishment,
    date,
    amount,
    currency,
    lineItems,
    confidence: computeConfidence({ vendor: establishment, date, amount }, ocrConfidence),
  };
}
-												feat(ocr): Tesseract.js as default scanner, AI as opt-in per port

The mobile receipt scanner now runs Tesseract.js in-browser by default —
on-device, free, and image bytes never leave the device. AI providers
(OpenAI / Claude) become a per-port opt-in for higher accuracy on
hard-to-read receipts.

- Lazy-load Tesseract WASM in src/lib/ocr/tesseract-client.ts (the 5 MB
  bundle is dynamically imported on first scan, not shipped in the main chunk)
- Heuristic parser src/lib/ocr/parse-receipt-text.ts extracts vendor,
  date, amount, currency, and line items from raw OCR text
- New port-scoped aiEnabled flag on OcrConfig (defaults false). Resolved
  flag never inherits from the global row — each port admin opts in
  independently
- Scan endpoint short-circuits to manual-mode when aiEnabled=false so
  the AI provider is never invoked unless the admin has flipped the
  switch
- Scan UI runs Tesseract first, then asks the server whether AI is
  enabled — uses the AI result only when its confidence beats Tesseract;
  network failures degrade gracefully to the local parse
- Admin OCR-settings form gains the per-port aiEnabled checkbox

Tests: 756/756 vitest (was 747) — +7 parser unit tests, +2 aiEnabled
config tests.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-28 19:46:29 +02:00
+								/**
 								 * Heuristic parser for raw OCR text from a receipt image.
 								 *
 								 * Tesseract returns plain text — we extract structured fields (vendor, date,
 								 * amount, currency, line items) using regex/positional rules. The output
 								 * matches `ParsedReceipt` from `ocr-providers.ts` so callers don't need to
 								 * branch on which engine produced it.
 								 *
 								 * Confidence is computed from how many fields we managed to recover, scaled
 								 * by Tesseract's own per-line confidence when provided.
 								 */
 								import type { ParsedReceipt, ParsedReceiptLineItem } from '@/lib/services/ocr-providers';
 								/** ISO 4217 codes we recognize, plus common symbol → ISO map. */
 								const CURRENCY_SYMBOLS: Record<string, string> = {
 								  $: 'USD',
 								  '€': 'EUR',
 								  '£': 'GBP',
 								  '¥': 'JPY',
 								  '₣': 'CHF',
 								  '₹': 'INR',
 								  '₽': 'RUB',
 								  '₱': 'PHP',
 								  '₩': 'KRW',
 								};
 								const CURRENCY_CODES = new Set([
 								  'USD',
 								  'EUR',
 								  'GBP',
 								  'JPY',
 								  'CHF',
 								  'CAD',
 								  'AUD',
 								  'NZD',
 								  'SEK',
 								  'NOK',
 								  'DKK',
 								  'PLN',
 								  'CZK',
 								  'HUF',
 								  'INR',
 								  'CNY',
 								  'HKD',
 								  'SGD',
 								  'AED',
 								  'ILS',
 								  'TRY',
 								  'ZAR',
 								  'BRL',
 								  'MXN',
 								  'RUB',
 								  'KRW',
 								]);
 								/** Patterns we try in order; the first match wins. */
 								const DATE_PATTERNS: Array<{ regex: RegExp; build: (m: RegExpMatchArray) => string | null }> = [
 								  // ISO 2024-04-28
 								  {
 								    regex: /\b(\d{4})-(\d{1,2})-(\d{1,2})\b/,
 								    build: (m) => normalizeDate(m[1]!, m[2]!, m[3]!),
 								  },
 								  // 28/04/2024 or 28-04-2024 (DMY — common in EU)
 								  {
 								    regex: /\b(\d{1,2})[/.\-](\d{1,2})[/.\-](\d{2,4})\b/,
 								    build: (m) => {
 								      const d = m[1]!;
 								      const mo = m[2]!;
 								      const y = m[3]!.length === 2 ? `20${m[3]}` : m[3]!;
 								      // We can't tell DMY from MDY; trust DMY which is more common globally
 								      // and won't fail validation as long as month <= 12.
 								      return normalizeDate(y, mo, d);
 								    },
 								  },
 								  // 28 Apr 2024 / 28-Apr-2024
 								  {
 								    regex: /\b(\d{1,2})\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s+(\d{2,4})\b/i,
 								    build: (m) => {
 								      const months: Record<string, string> = {
 								        jan: '01',
 								        feb: '02',
 								        mar: '03',
 								        apr: '04',
 								        may: '05',
 								        jun: '06',
 								        jul: '07',
 								        aug: '08',
 								        sep: '09',
 								        oct: '10',
 								        nov: '11',
 								        dec: '12',
 								      };
 								      const mo = months[m[2]!.toLowerCase().slice(0, 3)];
 								      if (!mo) return null;
 								      const y = m[3]!.length === 2 ? `20${m[3]}` : m[3]!;
 								      return normalizeDate(y, mo, m[1]!);
 								    },
 								  },
 								];
 								function normalizeDate(year: string, month: string, day: string): string | null {
 								  const y = year.padStart(4, '0');
 								  const m = month.padStart(2, '0');
 								  const d = day.padStart(2, '0');
 								  const candidate = `${y}-${m}-${d}`;
 								  // Sanity-check by round-tripping through Date — drops invalid days.
 								  const t = new Date(candidate);
 								  if (Number.isNaN(t.getTime()) || t.toISOString().slice(0, 10) !== candidate) return null;
 								  // Don't accept implausibly old or future-dated receipts.
 								  const yr = Number(y);
 								  if (yr < 2000 || yr > 2100) return null;
 								  return candidate;
 								}
 								/** Pulls the first recognizable date out of `text`. */
 								function extractDate(text: string): string | null {
 								  for (const { regex, build } of DATE_PATTERNS) {
 								    const m = text.match(regex);
 								    if (m) {
 								      const d = build(m);
 								      if (d) return d;
 								    }
 								  }
 								  return null;
 								}
 								/** Detects a currency symbol or 3-letter ISO code anywhere in `text`. */
 								function extractCurrency(text: string): string | null {
 								  for (const sym of Object.keys(CURRENCY_SYMBOLS)) {
 								    if (text.includes(sym)) return CURRENCY_SYMBOLS[sym]!;
 								  }
 								  // Match a stand-alone uppercase 3-letter token.
 								  const m = text.match(/\b([A-Z]{3})\b/g);
 								  if (m) {
 								    for (const code of m) {
 								      if (CURRENCY_CODES.has(code)) return code;
 								    }
 								  }
 								  return null;
 								}
 								/**
 								 * Extracts the receipt total. Strategy:
 								 *   1. Look for a line containing "total", "amount due", "grand total",
 								 *      "balance due", "to pay" — preferring the last match (subtotals
 								 *      come earlier on the receipt).
 								 *   2. Fall back to the largest decimal number on the receipt.
 								 */
 								function extractAmount(lines: string[]): number | null {
 								  const totalMarker = /\b(grand\s*total|total\s*due|balance\s*due|amount\s*due|total|to\s*pay)\b/i;
 								  let best: { amount: number; priority: number } | null = null;
 								  for (const line of lines) {
 								    if (!totalMarker.test(line)) continue;
 								    const numbers = extractNumbers(line);
 								    if (numbers.length === 0) continue;
 								    // Take the largest number on this line (subtotal+tax often appear before total).
 								    const amt = Math.max(...numbers);
 								    // Prefer "grand total" / "total due" over plain "total" / "subtotal-adjacent".
 								    const priority = /grand\s*total|total\s*due|balance\s*due|amount\s*due|to\s*pay/i.test(line)
 								      ? 2
 								      : 1;
 								    if (!best || priority > best.priority || (priority === best.priority && amt > best.amount)) {
 								      best = { amount: amt, priority };
 								    }
 								  }
 								  if (best) return best.amount;
 								  // Fallback: largest decimal on the whole receipt.
 								  const all = lines.flatMap(extractNumbers);
 								  if (all.length === 0) return null;
 								  return Math.max(...all);
 								}
 								/** Pulls numeric values out of a line, supporting `1,234.56` and `1.234,56`. */
 								function extractNumbers(line: string): number[] {
 								  const out: number[] = [];
 								  const re = /(?<![A-Za-z0-9])-?\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{1,2})?(?![A-Za-z0-9])/g;
 								  for (const match of line.matchAll(re)) {
 								    const raw = match[0];
 								    const parsed = parseLocaleNumber(raw);
 								    if (parsed != null && Math.abs(parsed) >= 0.01) out.push(parsed);
 								  }
 								  return out;
 								}
 								function parseLocaleNumber(raw: string): number | null {
 								  // Decide whether `,` or `.` is the decimal separator by looking at the last one.
 								  const lastComma = raw.lastIndexOf(',');
 								  const lastDot = raw.lastIndexOf('.');
 								  let cleaned: string;
 								  if (lastComma === -1 && lastDot === -1) {
 								    cleaned = raw;
 								  } else if (lastComma > lastDot) {
 								    // Comma is decimal: 1.234,56 → 1234.56
 								    cleaned = raw.replace(/\./g, '').replace(',', '.');
 								  } else {
 								    // Dot is decimal: 1,234.56 → 1234.56
 								    cleaned = raw.replace(/,/g, '');
 								  }
 								  const n = Number(cleaned);
 								  return Number.isFinite(n) ? n : null;
 								}
 								/**
 								 * Vendor heuristic: first non-blank line that isn't a date/number-only line
 								 * and isn't shorter than 3 chars. Receipts almost always print the merchant
 								 * name at the top.
 								 */
 								function extractVendor(lines: string[]): string | null {
 								  for (const line of lines.slice(0, 6)) {
 								    const trimmed = line.trim();
 								    if (trimmed.length < 3) continue;
 								    // Vendor lines must include at least two alphabetic characters — drops
 								    // pure-punctuation noise like "@@@" and divider rows like "===".
 								    if ((trimmed.match(/[A-Za-z]/g) ?? []).length < 2) continue;
 								    if (DATE_PATTERNS.some((p) => p.regex.test(trimmed))) continue;
 								    if (/^(receipt|invoice|tax invoice|order|ticket)/i.test(trimmed)) continue;
 								    return trimmed.slice(0, 120);
 								  }
 								  return null;
 								}
 								/** Pulls line items: lines with both descriptive text and a trailing number. */
 								function extractLineItems(lines: string[]): ParsedReceiptLineItem[] {
 								  const skipMarker = /\b(subtotal|tax|vat|gst|total|tip|service|change|cash|card|tend|due)\b/i;
 								  const out: ParsedReceiptLineItem[] = [];
 								  for (const line of lines) {
 								    if (skipMarker.test(line)) continue;
 								    // Skip header-ish rows: dates, postal codes, "Date:" / "Time:" labels.
 								    if (DATE_PATTERNS.some((p) => p.regex.test(line))) continue;
 								    if (
 								      /^\s*(date|time|tel|phone|store|store#|cashier|order|table|receipt|invoice)\b/i.test(line)
 								    ) {
 								      continue;
 								    }
 								    // Skip lines that look like an address: leading street number, common suffixes.
 								    if (/^\s*\d+\s+\w/.test(line) && /\b(st|ave|blvd|rd|way|lane|ln|drive|dr)\b/i.test(line)) {
 								      continue;
 								    }
 								    const numbers = extractNumbers(line);
 								    if (numbers.length === 0) continue;
 								    // Line items always have the price at the END; if the only number is at
 								    // the start (e.g. street number), this isn't a line item.
 								    const trailingNumber = /[.,]?\d[\d.,]*\s*$/.test(line);
 								    if (!trailingNumber) continue;
 								    const lastNum = numbers[numbers.length - 1]!;
 								    const numStr = String(lastNum);
 								    const idx = line.lastIndexOf(numStr.replace(/\.\d+$/, '')); // approximate match
 								    const description = (idx > 0 ? line.slice(0, idx) : line.replace(/[\d.,]+$/, ''))
 								      .trim()
 								      .replace(/[.\-–—\s]+$/, '');
 								    if (description.length < 2) continue;
 								    out.push({ description: description.slice(0, 120), amount: lastNum });
 								    if (out.length >= 20) break;
 								  }
 								  return out;
 								}
 								/**
 								 * Confidence = fraction of headline fields recovered, scaled by avg
 								 * Tesseract per-line confidence (1 if not provided).
 								 */
 								function computeConfidence(
 								  fields: { vendor: unknown; date: unknown; amount: unknown },
 								  ocrConfidence: number | null,
 								): number {
 								  const recovered = [fields.vendor, fields.date, fields.amount].filter(Boolean).length;
 								  const fieldScore = recovered / 3;
 								  const ocrScore = ocrConfidence == null ? 1 : Math.max(0, Math.min(1, ocrConfidence / 100));
 								  return Number((fieldScore * ocrScore).toFixed(2));
 								}
 								export interface ParseReceiptInput {
 								  text: string;
 								  /** 0–100 from Tesseract, or null if we don't have it. */
 								  ocrConfidence?: number | null;
 								}
 								export function parseReceiptText({ text, ocrConfidence = null }: ParseReceiptInput): ParsedReceipt {
 								  const lines = text
 								    .split(/\r?\n/)
 								    .map((l) => l.trim())
 								    .filter(Boolean);
 								  const vendor = extractVendor(lines);
 								  const date = extractDate(text);
 								  const amount = extractAmount(lines);
 								  const currency = extractCurrency(text);
 								  const lineItems = extractLineItems(lines);
 								  const confidence = computeConfidence({ vendor, date, amount }, ocrConfidence);
 								  return {
 								    establishment: vendor,
 								    date,
 								    amount,
 								    currency,
 								    lineItems,
 								    confidence,
 								  };
 								}