pn-new-crm/src/lib/dedup/normalize.ts

/**
 * Normalization helpers for the dedup pipeline.
 *
 * Pure functions (no DB, no React). Used by both the runtime at-create
 * surfaces and the one-shot NocoDB migration script. Every transform
 * here has a fixture in `tests/unit/dedup/normalize.test.ts` drawn from
 * real dirty values observed in the legacy NocoDB Interests table.
 *
 * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §3.
 */

import { z } from 'zod';

import { ALL_COUNTRY_CODES, getCountryName, type CountryCode } from '@/lib/i18n/countries';
import { parsePhoneScriptSafe as parsePhone } from './phone-parse';

// ─── Names ──────────────────────────────────────────────────────────────────

/**
 * Tokens that should stay lowercase mid-name. Covers the common Romance,
 * Germanic, and Iberian particles seen in client records. The first token
 * of a name is always title-cased even if it appears in this set.
 */
const PARTICLES: ReadonlySet<string> = new Set([
  'van',
  'von',
  'de',
  'del',
  'da',
  'das',
  'do',
  'dos',
  'di',
  'le',
  'la',
  'el',
  'al',
  'der',
  'den',
  'des',
  'du',
  'dalla',
  'della',
  'st',
  'st.',
  'y',
]);

export interface NormalizedName {
  /** Human-readable form preserved for UI display. Trimmed, whitespace
   *  collapsed, case fixed. Slash-with-company structure
   *  ("Daniel Wainstein / 7 Knots, LLC") is left intact. */
  display: string;
  /** `display` lowercased; the form used for matching. */
  normalized: string;
  /** Last non-particle token of the person part, lowercased. Used as a
   *  blocking key by the dedup algorithm so we only compare candidates
   *  with similar surnames. Undefined when there is no person part. */
  surnameToken?: string;
}

/**
 * Normalize a free-text full name.
 *
 * - Every run of whitespace (including \r, \n, \t) becomes one space and
 *   the ends are trimmed.
 * - Each token is lowercased and then re-capitalized; particles
 *   (van / de / dalla / etc.) stay lowercase unless they lead the name.
 * - The letter following an apostrophe (straight or typographic) or a
 *   hyphen is capitalized, so "o'brien" → "O'Brien" and
 *   "jean-pierre" → "Jean-Pierre".
 * - If the input contains a `/` (slash-with-company structure like
 *   "Daniel Wainstein / 7 Knots, LLC"), only the text before the first
 *   slash is re-cased; the company text is kept verbatim.
 *
 * @param raw - User-entered name; null / undefined are treated as empty.
 * @returns The display form, its lowercase twin, and the surname blocking
 *   token. Empty input yields empty strings and no surname token.
 */
export function normalizeName(raw: string | null | undefined): NormalizedName {
  const cleaned = collapseWhitespace((raw ?? '').toString());
  if (!cleaned) {
    return { display: '', normalized: '', surnameToken: undefined };
  }

  const slashIdx = cleaned.indexOf('/');
  const personPart = slashIdx === -1 ? cleaned : cleaned.slice(0, slashIdx).trim();

  let display = titleCaseTokens(personPart);
  if (slashIdx !== -1) {
    // The company segment is typically already a brand we shouldn't mangle
    // ("SAS TIKI", "7 Knots, LLC"). The outer trim keeps `display` free of
    // a dangling space when either side of the slash is empty.
    const companyPart = cleaned.slice(slashIdx + 1).trim();
    display = `${display} / ${companyPart}`.trim();
  }

  return {
    display,
    normalized: display.toLowerCase(),
    surnameToken: computeSurnameToken(personPart),
  };
}

/** Collapse every whitespace run to a single space and trim both ends. */
function collapseWhitespace(s: string): string {
  return s.replace(/\s+/g, ' ').trim();
}

/** Title-case each space-separated token; only the first is exempt from the particle rule. */
function titleCaseTokens(s: string): string {
  return s
    .split(' ')
    .filter(Boolean)
    .map((tok, idx) => titleCaseOneToken(tok, idx === 0))
    .join(' ');
}

/**
 * Title-case one token. Mid-name particles are returned lowercase.
 * Otherwise the token is lowercased and the first character of every
 * segment delimited by `'`, `’` (U+2019) or `-` is upper-cased.
 */
function titleCaseOneToken(token: string, isFirst: boolean): string {
  if (!token) return '';
  const lower = token.toLowerCase();
  if (!isFirst && PARTICLES.has(lower)) return lower;
  // The capturing group keeps the delimiters in the split output, so
  // joining with '' restores the original punctuation unchanged.
  return lower
    .split(/(['\u2019-])/)
    .map((segment) => (segment ? segment.charAt(0).toUpperCase() + segment.slice(1) : segment))
    .join('');
}

/**
 * Pick the surname blocking key: the right-most token that is not a
 * particle, lowercased. If every token is a particle the last token is
 * used. Returns undefined for blank input.
 */
function computeSurnameToken(personPart: string): string | undefined {
  const cleaned = collapseWhitespace(personPart);
  if (!cleaned) return undefined;
  const tokens = cleaned.toLowerCase().split(' ');
  for (let i = tokens.length - 1; i >= 0; i -= 1) {
    const tok = tokens[i]!;
    if (!PARTICLES.has(tok)) return tok;
  }
  return tokens[tokens.length - 1];
}

// ─── Emails ─────────────────────────────────────────────────────────────────

const emailSchema = z.string().email();

/**
 * Canonicalize a free-text email for comparison and storage: surrounding
 * whitespace is removed and the whole address is lowercased.
 *
 * Plus-aliases (`user+tag@domain.com`) are kept as-is: they are real,
 * distinct addresses, and stripping the tag would auto-merge accounts
 * that are legitimately separate.
 *
 * @param raw - User-entered email; may be null / undefined.
 * @returns The trimmed, lowercased address, or null when the input is
 *   missing, blank, or rejected by the schema. The caller decides whether
 *   to flag, store, or drop a null.
 */
export function normalizeEmail(raw: string | null | undefined): string | null {
  const candidate = raw?.toString().trim().toLowerCase();
  if (!candidate) return null;
  return emailSchema.safeParse(candidate).success ? candidate : null;
}

// ─── Phones ─────────────────────────────────────────────────────────────────

/**
 * Why `normalizePhone` flagged an input:
 *  - 'multi_number': the raw value contained a `/`, `;` or `,` separator
 *    and only the text before the first separator was parsed.
 *  - 'placeholder': the cleaned value contained a run of eight or more
 *    zeros; no number is returned.
 *  - 'unparseable': nothing was left after cleanup, or the parser could
 *    not produce an E.164 form; no number is returned.
 */
export type PhoneFlag = 'multi_number' | 'placeholder' | 'unparseable';

/** Result of `normalizePhone` for a non-empty input. */
export interface NormalizedPhone {
  /** Canonical E.164 form, e.g. '+15742740548'. Null when unparseable
   *  or flagged as placeholder. */
  e164: string | null;
  /** ISO-3166-1 alpha-2 of the country the number was parsed against.
   *  Null whenever `e164` is null. */
  country: CountryCode | null;
  /** Display-friendly international format. Useful for migration reports.
   *  Null whenever `e164` is null. */
  display: string | null;
  /** Set when the input had a quirk worth surfacing in the migration
   *  report or runtime audit log. Not set on clean parses. */
  flagged?: PhoneFlag;
}

/**
 * Turn a raw user-entered phone string into a canonical form for
 * comparison and storage.
 *
 * Steps, in order:
 *   1. drop one leading apostrophe (spreadsheet copy-paste artifact)
 *   2. remove \r / \n / \t (legacy NocoDB values contained carriage returns)
 *   3. multi-number fields ("+33611111111;+33622222222",
 *      "0677580750/0690511494"): keep only the text before the first
 *      `/`, `;` or `,` and remember the 'multi_number' flag
 *   4. remove whitespace, dots, dashes and parentheses (the `+` is kept)
 *   5. rewrite a leading "00" as "+" (international dialling prefix)
 *   6. a run of 8+ zeros (e.g. +447000000000) is a placeholder → flagged,
 *      no number returned
 *   7. hand the remainder to the phone parser
 *   8. no E.164 form from the parser → flagged 'unparseable'
 *
 * Validity of the parsed number is deliberately not checked here: for
 * dedup a canonical E.164 string to compare is enough, and strict
 * validation is left to the form layer.
 *
 * @param raw - Raw phone text; may be null / undefined.
 * @param defaultCountry - Passed through to the parser for numbers
 *   written without an international prefix.
 * @returns null for null / undefined / blank input; otherwise a
 *   `NormalizedPhone`, whose `flagged` field reports any quirk found.
 */
export function normalizePhone(
  raw: string | null | undefined,
  defaultCountry?: CountryCode,
): NormalizedPhone | null {
  const input = raw?.toString().trim();
  if (!input) return null;

  // Shape shared by every "no usable number" outcome.
  const rejected = (flag: PhoneFlag): NormalizedPhone => ({
    e164: null,
    country: null,
    display: null,
    flagged: flag,
  });

  // Steps 1 + 2: spreadsheet apostrophe, then control whitespace.
  const withoutApostrophe = input.startsWith("'") ? input.slice(1) : input;
  const singleLine = withoutApostrophe.replace(/[\r\n\t]/g, '');

  // Step 3: more than one segment means a separator was present.
  const segments = singleLine.split(/[/;,]/);
  const isMultiNumber = segments.length > 1;

  // Step 4: formatting noise. Stripping \s also covers trimming the segment.
  const compact = segments[0]!.replace(/[\s.\-()]/g, '');
  if (compact === '') return rejected('unparseable');

  // Step 5: 00 international prefix → +.
  const dialable = compact.startsWith('00') ? '+' + compact.slice(2) : compact;

  // Step 6: placeholder fakes.
  if (/0{8,}/.test(dialable)) return rejected('placeholder');

  // Steps 7 + 8: parse; without a canonical form the input is garbage.
  const parsed = parsePhone(dialable, defaultCountry);
  if (!parsed.e164) return rejected('unparseable');

  return {
    e164: parsed.e164,
    country: parsed.country,
    display: parsed.international,
    flagged: isMultiNumber ? 'multi_number' : undefined,
  };
}

// ─── Countries ──────────────────────────────────────────────────────────────

/**
 * Aliases for canonical country names that don't match
 * `Intl.DisplayNames(en)` output verbatim. Keys are pre-normalized
 * (lowercase, diacritic-free, hyphens/dots → spaces, collapsed whitespace).
 *
 * Kept opinionated and small - only entries we've actually seen in legacy
 * data. Adding a new alias is cheap; trying to be exhaustive isn't.
 */
const COUNTRY_ALIASES: Record<string, CountryCode> = {
  // Generic abbreviations. Keys are compared against the whole normalized
  // input, so these only hit when the abbreviation is the entire value.
  usa: 'US',
  us: 'US',
  uk: 'GB',
  // Saint-Barthélemy variants seen in production. Dots and hyphens in the
  // input become spaces before lookup, so "St. Barth" and "St-Barth" both
  // arrive here as 'st barth'.
  'saint barthelemy': 'BL',
  'saint barth': 'BL',
  'st barth': 'BL',
  'st barths': 'BL',
  'st barthelemy': 'BL',
  // Caribbean short-forms whose canonical Intl names are awkward
  // ("Antigua and Barbuda", "Saint Vincent and the Grenadines", etc.).
  // Either island name on its own maps to the shared country code.
  antigua: 'AG',
  barbuda: 'AG',
  'st kitts': 'KN',
  'saint kitts': 'KN',
  nevis: 'KN',
};

/**
 * High-frequency cities → country, used as a last-resort fallback when
 * exact / alias / fuzzy country matching all miss. Keys are normalized.
 *
 * Order matters: an entry's key is also matched as a substring of the
 * input ("Sag Harbor Y" contains "sag harbor"), so the most specific
 * city appears first to avoid a wrong partial hit.
 */
const CITY_TO_COUNTRY: Record<string, CountryCode> = {
  // Multi-word cities first: the lookup scans entries in insertion order
  // and returns on the first key found inside the input.
  'kansas city': 'US',
  'sag harbor': 'US',
  'new york': 'US',
  // Cities that came out unresolved from the 2026-05-03 NocoDB dry-run.
  // Using lowercase (post-normalize keys).
  boston: 'US',
  tampa: 'US',
  'fort lauderdale': 'US',
  'port jefferson': 'US',
  nantucket: 'US',
  // US state abbreviations appearing after other text (e.g. a city name).
  // The leading space is part of the key: the input is trimmed before
  // lookup, so these keys can only be found by the substring scan and
  // only when the abbreviation follows a space. A bare two-letter input
  // therefore matches none of them.
  ' fl': 'US',
  ' ma': 'US',
  ' ny': 'US',
  ' tx': 'US',
  ' ca': 'US',
  // International
  london: 'GB',
  paris: 'FR',
};

/**
 * How `resolveCountry` arrived at its answer:
 *  - 'exact': alias table hit, or the input equals a canonical English
 *    country name after normalization.
 *  - 'city': the input is, or contains, a known city / state key.
 *  - 'fuzzy': the closest canonical country name is within Levenshtein
 *    distance 2.
 */
export type CountryConfidence = 'exact' | 'fuzzy' | 'city';

/** Outcome of `resolveCountry`. Both fields are null when nothing matched. */
export interface ResolvedCountry {
  /** ISO-3166-1 alpha-2 code, or null when the input could not be resolved. */
  iso: CountryCode | null;
  /** Which lookup stage produced `iso`; null when `iso` is null. */
  confidence: CountryConfidence | null;
}

/**
 * Map free-text country / region input to an ISO-3166-1 alpha-2 code.
 *
 * Lookup order: alias → exact (vs. the English canonical country names) →
 * city → fuzzy (Levenshtein ≤ 2). Anything beyond fuzzy returns a null
 * result and the migration script flags the row for human review.
 *
 * City keys that start with a space (the US state abbreviations) are
 * matched as whole trailing tokens only: " ma" matches "boston ma" and
 * "boston ma 02110" but not "saint martin". All other city keys match as
 * plain substrings ("Sag Harbor Y" contains "sag harbor").
 *
 * @param text - Raw country / region text; may be null / undefined.
 * @returns The resolved code with the stage that produced it, or
 *   `{ iso: null, confidence: null }` when nothing matched.
 */
export function resolveCountry(text: string | null | undefined): ResolvedCountry {
  if (text == null) return { iso: null, confidence: null };
  const normalized = normalizeForLookup(text.toString());
  if (!normalized) return { iso: null, confidence: null };

  // 1. Aliases - covers USA / UK / St Barth and friends.
  const alias = lookupOwnCountry(COUNTRY_ALIASES, normalized);
  if (alias) return { iso: alias, confidence: 'exact' };

  // 2. Exact match against canonical country names, compared in the same
  //    normalized shape as the input so 'United States' and
  //    'united states' both resolve. The normalized names are collected
  //    along the way so the fuzzy pass does not have to rebuild them.
  const canonicalNames: Array<readonly [CountryCode, string]> = [];
  for (const code of ALL_COUNTRY_CODES) {
    const cleanName = normalizeForLookup(getCountryName(code, 'en'));
    if (cleanName === normalized) return { iso: code, confidence: 'exact' };
    canonicalNames.push([code, cleanName]);
  }

  // 3. City → country fallback: whole-input key first, then the first
  //    table entry (in insertion order) found inside the input.
  const cityExact = lookupOwnCountry(CITY_TO_COUNTRY, normalized);
  if (cityExact) return { iso: cityExact, confidence: 'city' };
  for (const [city, iso] of Object.entries(CITY_TO_COUNTRY)) {
    if (containsCityKey(normalized, city)) return { iso, confidence: 'city' };
  }

  // 4. Fuzzy fallback (Levenshtein ≤ 2). Skipped for short inputs because
  //    a 4-char string like "Mars" sits within distance 2 of multiple
  //    short country names (Mali, Laos, Iran, …). Ties keep the first
  //    country encountered. Distance 0 cannot occur here: step 2 would
  //    already have returned.
  if (normalized.length >= 6) {
    let bestCode: CountryCode | null = null;
    let bestDistance = Number.POSITIVE_INFINITY;
    for (const [code, cleanName] of canonicalNames) {
      const d = levenshtein(cleanName, normalized);
      if (d < bestDistance) {
        bestDistance = d;
        bestCode = code;
      }
    }
    if (bestDistance <= 2 && bestCode) {
      return { iso: bestCode, confidence: 'fuzzy' };
    }
  }

  return { iso: null, confidence: null };
}

/**
 * Read `table[key]` only when `key` is an own property, so free-text
 * input such as "constructor" or "__proto__" cannot resolve to a member
 * inherited from Object.prototype.
 */
function lookupOwnCountry(
  table: Record<string, CountryCode>,
  key: string,
): CountryCode | undefined {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
}

/** A letter or digit: a character that continues a word. */
const CITY_KEY_WORD_CHAR = /[\p{L}\p{N}]/u;

/**
 * Whether a CITY_TO_COUNTRY key occurs in the normalized input. Keys with
 * a leading space are abbreviations and must end at a word boundary (end
 * of input, or a following character that is not a letter or digit).
 * Every other key is a plain substring match.
 */
function containsCityKey(haystack: string, key: string): boolean {
  if (!key.startsWith(' ')) return haystack.includes(key);
  for (let at = haystack.indexOf(key); at !== -1; at = haystack.indexOf(key, at + 1)) {
    const following = haystack.charAt(at + key.length);
    if (following === '' || !CITY_KEY_WORD_CHAR.test(following)) return true;
  }
  return false;
}

/**
 * Bring a country / city string into the shape used for every lookup:
 * decompose accented characters (NFD), drop the combining marks
 * U+0300–U+036F, lowercase, turn hyphens and dots into spaces, collapse
 * whitespace runs, and trim.
 *
 * Applied to both the user input and the canonical country names so the
 * two sides of the comparison meet on the same shape.
 */
function normalizeForLookup(s: string): string {
  return (
    s
      .normalize('NFD')
      // Combining Diacritical Marks block, written as escapes so the range
      // stays visible and survives editors that re-normalize file content.
      .replace(/[\u0300-\u036f]/g, '')
      .toLowerCase()
      .replace(/[-.]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim()
  );
}

// ─── Levenshtein ────────────────────────────────────────────────────────────

/**
 * Levenshtein edit distance between two strings, computed iteratively
 * over UTF-16 code units. Used by the country fuzzy match and by the
 * dedup algorithm's name-similarity rule.
 *
 * Only two rows of the distance table are kept, so memory is
 * O(b.length); time is still O(a.length * b.length), so callers
 * shouldn't run it against pathological inputs - the dedup blocking
 * strategy keeps comparison sets small.
 *
 * Exported so the find-matches module can reuse the same implementation
 * without relying on an external dep.
 *
 * @returns The minimum number of single-unit insertions, deletions and
 *   substitutions needed to turn `a` into `b`.
 */
export function levenshtein(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const cols = b.length;
  // `above` is the finished row for the previous prefix of `a`; `row` is
  // the one being filled. Row 0 is the distance from the empty prefix.
  let above: number[] = Array.from({ length: cols + 1 }, (_, j) => j);
  let row: number[] = new Array<number>(cols + 1).fill(0);

  for (let i = 0; i < a.length; i += 1) {
    row[0] = i + 1;
    for (let j = 0; j < cols; j += 1) {
      const substitution = above[j]! + (a[i] === b[j] ? 0 : 1);
      const insertion = row[j]! + 1;
      const deletion = above[j + 1]! + 1;
      row[j + 1] = Math.min(insertion, deletion, substitution);
    }
    // Recycle the buffers instead of allocating a new row per iteration.
    [above, row] = [row, above];
  }

  return above[cols]!;
}