// pn-new-crm/src/lib/ocr/parse-receipt-text.ts

/**
 * Heuristic parser for raw OCR text from a receipt image.
 *
 * Tesseract returns plain text - we extract structured fields (vendor, date,
 * amount, currency, line items) using regex/positional rules. The output
 * matches `ParsedReceipt` from `ocr-providers.ts` so callers don't need to
 * branch on which engine produced it.
 *
 * Confidence is computed from how many fields we managed to recover, scaled
 * by Tesseract's own per-line confidence when provided.
 */

import type { ParsedReceipt, ParsedReceiptLineItem } from '@/lib/services/ocr-providers';

/**
 * Currency symbol → ISO 4217 code.
 *
 * Insertion order is significant: `extractCurrency` walks the keys in this
 * order and reports the first symbol it finds in the text.
 */
const CURRENCY_SYMBOLS: Record<string, string> = Object.fromEntries<string>([
  ['$', 'USD'],
  ['€', 'EUR'],
  ['£', 'GBP'],
  ['¥', 'JPY'],
  ['₣', 'CHF'],
  ['₹', 'INR'],
  ['₽', 'RUB'],
  ['₱', 'PHP'],
  ['₩', 'KRW'],
]);

/**
 * ISO 4217 codes accepted when they appear as a stand-alone upper-case
 * three-letter token in the OCR text.
 *
 * Every code that the symbol map in this file can produce is listed here too
 * (including PHP for `₱`), so a receipt that prints the code instead of the
 * symbol resolves to the same currency.
 */
const CURRENCY_CODES = new Set([
  'USD',
  'EUR',
  'GBP',
  'JPY',
  'CHF',
  'CAD',
  'AUD',
  'NZD',
  'SEK',
  'NOK',
  'DKK',
  'PLN',
  'CZK',
  'HUF',
  'INR',
  'CNY',
  'HKD',
  'SGD',
  'AED',
  'ILS',
  'TRY',
  'ZAR',
  'BRL',
  'MXN',
  'RUB',
  'KRW',
  'PHP',
]);

/** Lower-cased three-letter month prefix → zero-padded month number. */
const MONTH_NUMBERS: Record<string, string> = {
  jan: '01',
  feb: '02',
  mar: '03',
  apr: '04',
  may: '05',
  jun: '06',
  jul: '07',
  aug: '08',
  sep: '09',
  oct: '10',
  nov: '11',
  dec: '12',
};

/** Expands a two-digit year to 20YY; any other length is passed through. */
function expandYear(raw: string): string {
  return raw.length === 2 ? `20${raw}` : raw;
}

/**
 * Date formats we recognize, tried in this order. Each entry pairs a regex
 * with a `build` callback that turns a match into `YYYY-MM-DD`, or `null`
 * when the matched digits do not form a plausible date.
 *
 * The regexes are intentionally NOT global: other helpers in this file call
 * `regex.test(line)` on them, which would become stateful (`lastIndex`) if
 * the `g` flag were set.
 */
const DATE_PATTERNS: Array<{ regex: RegExp; build: (m: RegExpMatchArray) => string | null }> = [
  // ISO 2024-04-28
  {
    regex: /\b(\d{4})-(\d{1,2})-(\d{1,2})\b/,
    build: (m) => normalizeDate(m[1]!, m[2]!, m[3]!),
  },
  // 28/04/2024, 28-04-2024 or 28.04.24
  {
    regex: /\b(\d{1,2})[/.\-](\d{1,2})[/.\-](\d{2,4})\b/,
    build: (m) => {
      const first = m[1]!;
      const second = m[2]!;
      const y = expandYear(m[3]!);
      // DMY and MDY are indistinguishable when both parts are <= 12, so DMY
      // (more common globally) is tried first. When DMY is not a valid date
      // (e.g. 04/28/2024) the month-first reading is unambiguous, so use it.
      return normalizeDate(y, second, first) ?? normalizeDate(y, first, second);
    },
  },
  // 28 Apr 2024 / 28-Apr-2024 / 28 Apr. 2024 / 28 April, 2024
  {
    regex:
      /\b(\d{1,2})(?:\s+|[-./])(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?,?(?:\s+|[-./])(\d{2,4})\b/i,
    build: (m) => {
      const mo = MONTH_NUMBERS[m[2]!.toLowerCase().slice(0, 3)];
      if (!mo) return null;
      return normalizeDate(expandYear(m[3]!), mo, m[1]!);
    },
  },
];

/**
 * Validates year/month/day digit strings and formats them as `YYYY-MM-DD`.
 *
 * @param year  Year digits (left-padded to four digits).
 * @param month Month digits (left-padded to two digits).
 * @param day   Day digits (left-padded to two digits).
 * @returns The ISO date string, or `null` when the combination is not a real
 *          calendar date or the year falls outside 2000–2100.
 */
function normalizeDate(year: string, month: string, day: string): string | null {
  const y = year.padStart(4, '0');
  const m = month.padStart(2, '0');
  const d = day.padStart(2, '0');
  const candidate = `${y}-${m}-${d}`;
  // Round-trip through Date: an impossible day such as Feb 30 either fails to
  // parse or rolls over to another day, and in both cases the comparison fails.
  // The NaN check must come first because toISOString() throws on an invalid Date.
  const t = new Date(candidate);
  if (Number.isNaN(t.getTime()) || t.toISOString().slice(0, 10) !== candidate) return null;
  // Don't accept implausibly old or future-dated receipts.
  const yr = Number(y);
  if (yr < 2000 || yr > 2100) return null;
  return candidate;
}

/**
 * Pulls the first recognizable date out of `text`.
 *
 * Patterns are tried in `DATE_PATTERNS` order; within one pattern every match
 * is tried left to right, so an invalid date-shaped token (e.g. a reference
 * number like `99-99-9999`) does not hide a real date later in the text.
 *
 * @returns The date as `YYYY-MM-DD`, or `null` when nothing valid was found.
 */
function extractDate(text: string): string | null {
  for (const { regex, build } of DATE_PATTERNS) {
    // matchAll requires a global regex; scan with a global copy so the shared
    // pattern objects stay non-global for `.test()` callers.
    const flags = regex.flags.includes('g') ? regex.flags : `${regex.flags}g`;
    const scanner = new RegExp(regex.source, flags);
    for (const m of text.matchAll(scanner)) {
      const d = build(m);
      if (d) return d;
    }
  }
  return null;
}

/**
 * Detects the receipt currency from a currency symbol or a stand-alone
 * upper-case 3-letter ISO code anywhere in `text`.
 *
 * Resolution order:
 *   1. The first symbol from `CURRENCY_SYMBOLS` (in key order) found in the text.
 *      `$` and `¥` are shared by several currencies, so when the receipt also
 *      prints an explicit code from the same family (e.g. `CAD $12.00`) that
 *      code wins over the symbol's default.
 *   2. Otherwise the first recognized ISO code in the text.
 *
 * @returns An ISO 4217 code, or `null` when no currency hint was found.
 */
function extractCurrency(text: string): string | null {
  // Symbols shared by more than one currency → codes that may override the default.
  const sharedSymbolCodes: Record<string, readonly string[]> = {
    $: ['CAD', 'AUD', 'NZD', 'HKD', 'SGD', 'MXN', 'BRL'],
    '¥': ['CNY'],
  };

  // Stand-alone uppercase 3-letter tokens that are known currency codes.
  const printedCodes = (text.match(/\b([A-Z]{3})\b/g) ?? []).filter((token) =>
    CURRENCY_CODES.has(token),
  );

  for (const sym of Object.keys(CURRENCY_SYMBOLS)) {
    if (!text.includes(sym)) continue;
    const alternatives = sharedSymbolCodes[sym] ?? [];
    const explicit = printedCodes.find((code) => alternatives.includes(code));
    return explicit ?? CURRENCY_SYMBOLS[sym]!;
  }
  return printedCodes[0] ?? null;
}

/**
 * Extracts the receipt total.
 *
 *   1. Consider every line carrying a total-style label ("total",
 *      "grand total", "total due", "balance due", "amount due", "to pay")
 *      and take the largest number on that line.
 *   2. Specific labels (everything except a bare "total") outrank a bare
 *      "total"; among lines of equal rank the larger amount wins.
 *   3. If no labelled line yields a number, fall back to the largest number
 *      found anywhere on the receipt.
 *
 * @returns The chosen amount, or `null` when the receipt contains no numbers.
 */
function extractAmount(lines: string[]): number | null {
  const anyTotalLabel = /\b(grand\s*total|total\s*due|balance\s*due|amount\s*due|total|to\s*pay)\b/i;
  const specificTotalLabel = /grand\s*total|total\s*due|balance\s*due|amount\s*due|to\s*pay/i;

  let chosenAmount: number | null = null;
  let chosenRank = 0;

  for (const line of lines) {
    if (!anyTotalLabel.test(line)) continue;

    const values = extractNumbers(line);
    if (values.length === 0) continue;

    // Subtotal and tax are often printed on the same line before the total.
    const lineMax = Math.max(...values);
    const rank = specificTotalLabel.test(line) ? 2 : 1;

    const isBetter =
      chosenAmount === null ||
      rank > chosenRank ||
      (rank === chosenRank && lineMax > chosenAmount);
    if (isBetter) {
      chosenAmount = lineMax;
      chosenRank = rank;
    }
  }

  if (chosenAmount !== null) return chosenAmount;

  // No labelled total: use the largest number on the whole receipt.
  const everyNumber = lines.flatMap((line) => extractNumbers(line));
  return everyNumber.length > 0 ? Math.max(...everyNumber) : null;
}

/**
 * Pulls numeric values out of a line, supporting `1,234.56`, `1.234,56` and
 * ungrouped amounts such as `1234.56`.
 *
 * A token is accepted when it is not glued to a letter or digit on either
 * side and has one of these shapes:
 *   - grouped thousands with an optional 1–2 digit fraction (`1,234` / `1.234,56`)
 *   - any number of digits followed by a 1–2 digit fraction (`1234.56`)
 *   - a bare integer of at most three digits (`2`, `125`)
 * Longer bare integers (phone numbers, postal codes, card numbers) are
 * deliberately ignored. Values whose magnitude is below 0.01 are dropped.
 */
function extractNumbers(line: string): number[] {
  const out: number[] = [];
  const re =
    /(?<![A-Za-z0-9])-?(?:\d{1,3}(?:[.,]\d{3})+(?:[.,]\d{1,2})?|\d+[.,]\d{1,2}|\d{1,3})(?![A-Za-z0-9])/g;
  for (const match of line.matchAll(re)) {
    const parsed = parseLocaleNumber(match[0]);
    if (parsed != null && Math.abs(parsed) >= 0.01) out.push(parsed);
  }
  return out;
}

/**
 * Parses a number written with either `.` or `,` as the decimal mark.
 *
 * - No separator: parsed as-is.
 * - Only three-digit groups joined by one repeated separator, with a leading
 *   group that does not start with 0 (`1,500`, `1.234.567`): the separators
 *   are thousands marks. Money is printed with at most two decimals, so a
 *   trailing three-digit group is read as thousands, not as a fraction.
 * - Otherwise the LAST separator is the decimal mark and every earlier
 *   separator is a thousands mark (`1.234,56` and `1,234.56` → 1234.56).
 *
 * @returns The parsed value, or `null` when the result is not a finite number.
 */
function parseLocaleNumber(raw: string): number | null {
  const lastSep = Math.max(raw.lastIndexOf(','), raw.lastIndexOf('.'));
  let cleaned: string;
  if (lastSep === -1) {
    cleaned = raw;
  } else if (/^-?[1-9]\d{0,2}([.,])\d{3}(?:\1\d{3})*$/.test(raw)) {
    // Pure thousands grouping: 1,500 → 1500, 1.234.567 → 1234567
    cleaned = raw.replace(/[.,]/g, '');
  } else {
    // Last separator is the decimal mark: 1.234,56 → 1234.56
    const integerPart = raw.slice(0, lastSep).replace(/[.,]/g, '');
    cleaned = `${integerPart}.${raw.slice(lastSep + 1)}`;
  }
  const n = Number(cleaned);
  return Number.isFinite(n) ? n : null;
}

/**
 * Vendor heuristic: among the first six lines, return the first one that
 * looks like a merchant name. Receipts almost always print the merchant
 * name at the top.
 *
 * A line is rejected when it
 *   - is shorter than 3 characters after trimming,
 *   - contains fewer than two letters (any script),
 *   - contains something matching one of `DATE_PATTERNS`, or
 *   - starts with a document header word ("receipt", "invoice",
 *     "tax invoice", "order", "ticket") that is not followed by a letter.
 *
 * @returns The trimmed line, capped at 120 characters, or `null`.
 */
function extractVendor(lines: string[]): string | null {
  for (const line of lines.slice(0, 6)) {
    const trimmed = line.trim();
    if (trimmed.length < 3) continue;
    // Require at least two letters - drops pure-punctuation noise like "@@@"
    // and divider rows like "===". \p{L} keeps accented and non-Latin names
    // that an ASCII-only class would reject.
    if ((trimmed.match(/\p{L}/gu) ?? []).length < 2) continue;
    if (DATE_PATTERNS.some((p) => p.regex.test(trimmed))) continue;
    // The header word must end there: "Order #12" is a header, while
    // "Orderly Cafe" or "Ticketmaster" are merchant names.
    if (/^(receipt|invoice|tax invoice|order|ticket)(?![a-z])/i.test(trimmed)) continue;
    return trimmed.slice(0, 120);
  }
  return null;
}

/**
 * Pulls line items: lines with both descriptive text and a trailing number.
 *
 * Lines are skipped when they mention a summary/payment keyword (subtotal,
 * tax, total, cash, ...), contain a date, start with a header label
 * ("Date", "Tel", "Cashier", ...), look like a street address, contain no
 * number, or do not end in a number. The amount is the last number on the
 * line; the description is everything before the trailing numeric run.
 *
 * @returns At most 20 items, each description capped at 120 characters.
 */
function extractLineItems(lines: string[]): ParsedReceiptLineItem[] {
  const skipMarker = /\b(subtotal|tax|vat|gst|total|tip|service|change|cash|card|tend|due)\b/i;
  const out: ParsedReceiptLineItem[] = [];
  for (const line of lines) {
    if (skipMarker.test(line)) continue;
    // Skip header-ish rows: dates, postal codes, "Date:" / "Time:" labels.
    if (DATE_PATTERNS.some((p) => p.regex.test(line))) continue;
    if (
      /^\s*(date|time|tel|phone|store|store#|cashier|order|table|receipt|invoice)\b/i.test(line)
    ) {
      continue;
    }
    // Skip lines that look like an address: leading street number, common suffixes.
    if (/^\s*\d+\s+\w/.test(line) && /\b(st|ave|blvd|rd|way|lane|ln|drive|dr)\b/i.test(line)) {
      continue;
    }
    const numbers = extractNumbers(line);
    if (numbers.length === 0) continue;
    // Line items always have the price at the END; if the only number is at
    // the start (e.g. street number), this isn't a line item.
    const trailingNumber = /[.,]?\d[\d.,]*\s*$/.test(line);
    if (!trailingNumber) continue;
    const lastNum = numbers[numbers.length - 1]!;
    // Cut the description at the trailing numeric run itself. Searching for
    // the amount's integer part with lastIndexOf can land inside the fraction
    // ("Latte 4.40" → "Latte 4"), leaking price digits into the description.
    const description = line
      .replace(/[\d.,]+\s*$/, '')
      .trim()
      // Drop dot leaders, dashes and a leftover minus sign before the price.
      .replace(/[.\-–—\s]+$/, '');
    if (description.length < 2) continue;
    out.push({ description: description.slice(0, 120), amount: lastNum });
    if (out.length >= 20) break;
  }
  return out;
}

/**
 * Scores a parse between 0 and 1, rounded to two decimals.
 *
 * The score is the share of the three headline fields (vendor, date, amount)
 * that hold a truthy value, multiplied by the OCR confidence rescaled from
 * 0–100 to 0–1 and clamped to that range. A missing OCR confidence counts
 * as 1.
 *
 * @param fields        The headline fields recovered from the receipt.
 * @param ocrConfidence Engine confidence on a 0–100 scale, or `null`.
 */
function computeConfidence(
  fields: { vendor: unknown; date: unknown; amount: unknown },
  ocrConfidence: number | null,
): number {
  let recovered = 0;
  for (const value of [fields.vendor, fields.date, fields.amount]) {
    if (value) recovered += 1;
  }
  const fieldScore = recovered / 3;

  let ocrScore = 1;
  if (ocrConfidence != null) {
    ocrScore = Math.max(0, Math.min(1, ocrConfidence / 100));
  }

  const rounded = (fieldScore * ocrScore).toFixed(2);
  return Number(rounded);
}

/** Input accepted by `parseReceiptText`. */
export interface ParseReceiptInput {
  /** Raw OCR text of the receipt; the parser splits it into lines on LF / CRLF. */
  text: string;
  /** 0–100 from Tesseract, or null if we don't have it. Omitting it is treated like null. */
  ocrConfidence?: number | null;
}

/**
 * Turns raw OCR text into a structured `ParsedReceipt`.
 *
 * The text is split into trimmed, non-empty lines. Vendor, amount and line
 * items are read from those lines, while date and currency are searched in
 * the full text. The confidence score combines how many of vendor, date and
 * amount were found with the optional OCR confidence.
 *
 * @param input.text          Raw OCR output.
 * @param input.ocrConfidence OCR engine confidence (0–100); defaults to `null`.
 */
export function parseReceiptText({ text, ocrConfidence = null }: ParseReceiptInput): ParsedReceipt {
  const lines: string[] = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line) lines.push(line);
  }

  const establishment = extractVendor(lines);
  const date = extractDate(text);
  const amount = extractAmount(lines);

  return {
    establishment,
    date,
    amount,
    currency: extractCurrency(text),
    lineItems: extractLineItems(lines),
    confidence: computeConfidence({ vendor: establishment, date, amount }, ocrConfidence),
  };
}