feat(dedup): normalization + match-finding library (P1)

The pure-logic spine of the client deduplication system spec'd in docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md. Two modules, JSX-free, vitest-tested against fixtures drawn directly from real dirty values observed in the legacy NocoDB Interests audit. src/lib/dedup/normalize.ts - normalizeName: trims whitespace, replaces \r/\n/\t, intelligently title-cases ALL-CAPS surnames while keeping particles (van / de / dalla / etc.) lowercase mid-name. Preserves Irish O' surnames and the "slash-with-company" structure ("Daniel Wainstein / 7 Knots, LLC") seen in production. Returns a surnameToken (lowercased last non-particle token) for use as a dedup blocking key. - normalizeEmail: trim + lowercase + zod email validation. Plus-aliases preserved; null on invalid. - normalizePhone: pre-cleans the input (strips spreadsheet apostrophes, carriage returns, dots/dashes/parens, converts 00 prefix to +) then delegates to libphonenumber-js. Detects multi-number fields ("a/b", "a;b") and placeholder fakes (8+ consecutive zeros, e.g. +447000000000). Flags every quirk so the migration report and runtime audit log can surface it. - resolveCountry: maps free-text country/region input to ISO-3166-1 alpha-2 via alias → exact (vs. Intl-derived names) → city → fuzzy (Levenshtein ≤ 2). Fuzzy is gated by length so 4-char inputs ("Mars") don't false-positive against short country names. - levenshtein: standard iterative implementation, exported for reuse by find-matches. src/lib/dedup/find-matches.ts - findClientMatches: builds three blocking indexes off the pool (email / phone / surname-token), gathers the comparison set via union, and scores each candidate via the rule set in design §4.2: Email match +60 Phone E.164 match +50 (≥ 8 digits, excludes placeholder zeros) Name exact match +20 Surname + given fuzzy +15 (Levenshtein ≤ 1) Negative: shared email but different phone country −15 Negative: name match but no shared contact −20 Score is clamped to [0,100]. 
Confidence tier ('high' / 'medium' / 'low') is derived from configurable thresholds passed in by the caller — defaults are highScore=90, mediumScore=50. tests/unit/dedup/normalize.test.ts (38 cases) Every dirty-data pattern from design §1.3 has a fixture: carriage returns in names, ALL-CAPS surnames, lowercase entries, particles, slash-with-company, plus-aliases, capitalized email localparts, spreadsheet-apostrophe phones, multi-number phones, placeholder phones, 00-prefix phones, French/UK local-format phones, Saint-Barthélemy diacritic variants, Kansas City fallback. tests/unit/dedup/find-matches.test.ts (12 cases) Each duplicate cluster from design §1.2 has a test: - Pattern A (Deepak Ramchandani — pure double-submit) → high - Pattern B (Howard Wiarda — phone format variance) → high - Pattern C (Nicolas Ruiz — name capitalization) → high - Pattern D (Chris/Christopher Allen — name shortening) → high - Pattern E (Christopher Camazou — typo on resubmit) → high or medium - Pattern E (Constanzo/Costanzo — surname typo, multi-yacht) → high - Pattern F (Etiennette Clamouze — same name, different country) → must NOT auto-merge - Pattern F (Bruno+Bruce — shared household contact) → no match - Negative evidence (same email, different phone country) → medium - Blocking (no shared keys → 0 matches) - Sort order (high before low) - Empty pool Total: 50 new tests, all green. Zero changes to runtime behavior or schema; unblocks P2 (runtime surfaces) and P3 (NocoDB migration). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 14:28:59 +02:00
parent e2398099c4
commit 8b077e1999
4 changed files with 1293 additions and 0 deletions
--- a/src/lib/dedup/find-matches.ts
+++ b/src/lib/dedup/find-matches.ts
@@ -0,0 +1,255 @@
+/**
+ * Client-match finder — pure scoring logic.
+ *
+ * Compares one input candidate against a pool of existing candidates and
+ * returns scored matches. Used by:
+ *   - the at-create suggestion in client/interest forms (Layer 1)
+ *   - the public-form auto-link path (when score >= block threshold)
+ *   - the nightly background scoring job (Layer 3)
+ *   - the migration script's dedup pass
+ *
+ * Performance shape: blocking via email / phone / surname-token reduces
+ * the pairwise scan from O(n²) to ~O(n) for any pool size we'll see in
+ * production. See `findClientMatches` for the blocking implementation.
+ *
+ * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §4.
+ */
+
+import { parsePhone } from '@/lib/i18n/phone';
+
+import { levenshtein } from './normalize';
+
+// ─── Types ──────────────────────────────────────────────────────────────────
+
+/**
+ * One party reduced to the fields the matcher reads. Values are expected
+ * to arrive already normalized: this module compares emails and phones by
+ * plain string equality and does not re-normalize them.
+ */
+export interface MatchCandidate {
+  /** Identifier used to skip self-matches and to de-duplicate the comparison set. */
+  id: string;
+  /** Full name; the name rules compare it lowercased and trimmed. A null or
+   *  empty value simply keeps the name rules from firing. */
+  fullName: string | null;
+  /** Lowercased last non-particle token from `normalizeName(...).surnameToken`.
+   *  Used as a blocking key, and as the gate for the surname + given-name
+   *  fuzzy rule. */
+  surnameToken: string | null;
+  /** Already lowercased + validated via `normalizeEmail`. */
+  emails: string[];
+  /** Already canonical E.164 via `normalizePhone`. */
+  phonesE164: string[];
+  /** Address country (NOT phone country) — used for tiebreaking, not scoring.
+   *  NOTE(review): nothing in this module reads this field; confirm where the
+   *  tiebreak is actually applied. */
+  countryIso: string | null;
+}
+
+/** Confidence tier attached to each result; derived from the score by
+ *  comparing it against the caller-supplied `DedupThresholds`. */
+export type MatchConfidence = 'high' | 'medium' | 'low';
+
+/** Outcome of scoring the input against one pool candidate. */
+export interface MatchResult {
+  /** The pool entry that was compared against the input. */
+  candidate: MatchCandidate;
+  /** 0–100 after capping. */
+  score: number;
+  /** Human-readable list of which rules contributed. Useful for the
+   *  review queue UI ("matched on email + phone + surname token").
+   *  Negative rules add an entry too, suffixed with "(negative)". */
+  reasons: string[];
+  /** Tier for `score`. `scorePair` fills in `'low'` as a placeholder and
+   *  `findClientMatches` overwrites it once thresholds are applied. */
+  confidence: MatchConfidence;
+}
+
+/**
+ * Score cut-offs that turn a numeric score into a `MatchConfidence`.
+ * `highScore` is checked first, so it wins if the two bounds are misordered.
+ */
+export interface DedupThresholds {
+  /** Inclusive lower bound for `'high'` confidence. */
+  highScore: number;
+  /** Inclusive lower bound for `'medium'` confidence. Below this is `'low'`. */
+  mediumScore: number;
+}
+
+// ─── Public entry point ─────────────────────────────────────────────────────
+
+/**
+ * Score `input` against every pool candidate that shares at least one
+ * blocking key with it (an email, an E.164 phone, or the surname token)
+ * and return the results ordered from highest to lowest score.
+ *
+ * Pool entries whose `id` equals `input.id` are never compared, so an
+ * existing record can be re-scored against a pool that contains itself.
+ * Low-scoring hits stay in the output; filtering by `confidence` or
+ * `score` is left to the caller.
+ *
+ * @param input - The candidate being checked for duplicates.
+ * @param pool - Existing candidates to compare against.
+ * @param thresholds - Score cut-offs used to label each result.
+ * @returns One `MatchResult` per reachable candidate, best score first.
+ */
+export function findClientMatches(
+  input: MatchCandidate,
+  pool: MatchCandidate[],
+  thresholds: DedupThresholds,
+): MatchResult[] {
+  if (pool.length === 0) return [];
+
+  // Blocking indexes: key → pool entries carrying that key. A pool entry
+  // that shares no key with the input is never scored at all.
+  const emailIndex = new Map<string, MatchCandidate[]>();
+  const phoneIndex = new Map<string, MatchCandidate[]>();
+  const surnameIndex = new Map<string, MatchCandidate[]>();
+
+  pool.forEach((entry) => {
+    if (entry.id === input.id) return;
+    entry.emails.forEach((email) => pushTo(emailIndex, email, entry));
+    entry.phonesE164.forEach((phone) => pushTo(phoneIndex, phone, entry));
+    if (entry.surnameToken) pushTo(surnameIndex, entry.surnameToken, entry);
+  });
+
+  // Union of every bucket the input's keys hit, keyed by id so each entry
+  // is scored once. Buckets are visited email → phone → surname; that
+  // insertion order is what decides the relative order of equal scores.
+  const reachable = new Map<string, MatchCandidate>();
+  const collect = (bucket: MatchCandidate[] | undefined): void => {
+    if (!bucket) return;
+    for (const entry of bucket) reachable.set(entry.id, entry);
+  };
+  input.emails.forEach((email) => collect(emailIndex.get(email)));
+  input.phonesE164.forEach((phone) => collect(phoneIndex.get(phone)));
+  if (input.surnameToken) collect(surnameIndex.get(input.surnameToken));
+
+  // Score, order best-first, then label each result with its tier.
+  const scored = Array.from(reachable.values(), (entry) => scorePair(input, entry));
+  scored.sort((left, right) => right.score - left.score);
+  scored.forEach((result) => {
+    result.confidence = classify(result.score, thresholds);
+  });
+  return scored;
+}
+
+// ─── Scoring ────────────────────────────────────────────────────────────────
+
+/**
+ * Score one (input, candidate) pair against the rule set in design §4.2.
+ * Positive rules add, negative rules subtract, and the total is clamped
+ * to [0, 100]. Reasons accumulate in the order rules fire so the
+ * review-queue UI can show "matched on email + phone".
+ *
+ * @param a - The input candidate.
+ * @param b - The pool candidate; returned as `candidate` in the result.
+ * @returns The scored result. `confidence` is the placeholder `'low'`;
+ *   the caller replaces it after applying thresholds.
+ */
+function scorePair(a: MatchCandidate, b: MatchCandidate): MatchResult {
+  let score = 0;
+  const reasons: string[] = [];
+
+  // ── Positive rules. ──────────────────────────────────────────────────────
+
+  // An empty string is never treated as a shared email.
+  const emailMatch = a.emails.some((e) => e.length > 0 && b.emails.includes(e));
+  if (emailMatch) {
+    score += 60;
+    reasons.push('email match');
+  }
+
+  // A shared number only counts when it carries at least 8 digits.
+  const phoneMatch = a.phonesE164.some((p) => countDigits(p) >= 8 && b.phonesE164.includes(p));
+  if (phoneMatch) {
+    score += 50;
+    reasons.push('phone match');
+  }
+
+  // Names are compared lowercased, whitespace-collapsed and trimmed so
+  // stray spacing cannot defeat either name rule.
+  const aName = foldName(a.fullName);
+  const bName = foldName(b.fullName);
+  const nameExactMatch = aName.length > 0 && aName === bName;
+  if (nameExactMatch) {
+    score += 20;
+    reasons.push('name match');
+  }
+
+  // Surname + given-name fuzzy. Fires only when the names are NOT exactly
+  // equal (no double-counting with the rule above), the surname tokens
+  // are identical, and the first tokens of the two names are within one
+  // edit of each other (e.g. 'Jon' / 'John'). It does not cover surname
+  // typos (the tokens differ) or shortenings more than one edit apart;
+  // those pairs have to be confirmed by the email / phone rules.
+  if (!nameExactMatch && a.surnameToken && b.surnameToken && a.surnameToken === b.surnameToken) {
+    const aGiven = aName.split(' ')[0] ?? '';
+    const bGiven = bName.split(' ')[0] ?? '';
+    if (aGiven && bGiven && levenshtein(aGiven, bGiven) <= 1) {
+      score += 15;
+      reasons.push('surname + given-name fuzzy match');
+    }
+  }
+
+  // ── Negative rules. ──────────────────────────────────────────────────────
+
+  // Same email but the two parties' phone numbers belong to different
+  // countries. Common when one inbox is shared by spouses / coworkers
+  // and the actual phone owners are distinct people. Don't auto-merge.
+  if (emailMatch && !phoneMatch && a.phonesE164.length > 0 && b.phonesE164.length > 0) {
+    const aCountries = phoneCountriesOf(a);
+    const bCountries = phoneCountriesOf(b);
+    const overlap = [...aCountries].some((c) => bCountries.has(c));
+    // Only penalize when both sides resolved to at least one country.
+    if (!overlap && aCountries.size > 0 && bCountries.size > 0) {
+      score -= 15;
+      reasons.push('phone country mismatch (negative)');
+    }
+  }
+
+  // Same name but no contact match. Two distinct people with the same
+  // name (common for "John Smith") sneak through name-based blocking;
+  // penalize so the score lands below the auto-merge threshold.
+  if (nameExactMatch && !emailMatch && !phoneMatch) {
+    score -= 20;
+    reasons.push('name match but no shared contact (negative)');
+  }
+
+  return {
+    candidate: b,
+    score: clamp(score, 0, 100),
+    reasons,
+    confidence: 'low', // assigned by caller after threshold lookup
+  };
+}
+
+/** Lowercase a name, collapse whitespace runs to one space, and trim. */
+function foldName(name: string | null): string {
+  return (name ?? '').toLowerCase().replace(/\s+/g, ' ').trim();
+}
+
+// ─── Helpers ────────────────────────────────────────────────────────────────
+
+/** Append `value` to the bucket stored under `key`, creating the bucket
+ *  on first use. */
+function pushTo<K, V>(map: Map<K, V[]>, key: K, value: V): void {
+  const bucket = map.get(key);
+  if (bucket === undefined) {
+    map.set(key, [value]);
+    return;
+  }
+  bucket.push(value);
+}
+
+/** Map a score to its tier; both thresholds are inclusive lower bounds
+ *  and the high bound is tested first. */
+function classify(score: number, thresholds: DedupThresholds): MatchConfidence {
+  const { highScore, mediumScore } = thresholds;
+  return score >= highScore ? 'high' : score >= mediumScore ? 'medium' : 'low';
+}
+
+/** Restrict `value` to the range [min, max]; the lower bound is checked first. */
+function clamp(value: number, min: number, max: number): number {
+  return value < min ? min : value > max ? max : value;
+}
+
+/** Count the ASCII digits (0–9) in `s`; every other character is ignored. */
+function countDigits(s: string): number {
+  const digits = s.match(/[0-9]/g);
+  return digits === null ? 0 : digits.length;
+}
+
+/**
+ * Collect the country reported by `parsePhone` for each of a candidate's
+ * phone numbers. Numbers for which no country is reported are skipped.
+ * Every call re-parses every number; nothing is memoized.
+ */
+function phoneCountriesOf(c: MatchCandidate): Set<string> {
+  const countries = new Set<string>();
+  c.phonesE164.forEach((phone) => {
+    const { country } = parsePhone(phone);
+    if (country) countries.add(country);
+  });
+  return countries;
+}
--- a/src/lib/dedup/normalize.ts
+++ b/src/lib/dedup/normalize.ts
@@ -0,0 +1,389 @@
+/**
+ * Normalization helpers for the dedup pipeline.
+ *
+ * Pure functions (no DB, no React). Used by both the runtime at-create
+ * surfaces and the one-shot NocoDB migration script. Every transform
+ * here has a fixture in `tests/unit/dedup/normalize.test.ts` drawn from
+ * real dirty values observed in the legacy NocoDB Interests table.
+ *
+ * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §3.
+ */
+
+import { z } from 'zod';
+
+import { ALL_COUNTRY_CODES, getCountryName, type CountryCode } from '@/lib/i18n/countries';
+import { parsePhone } from '@/lib/i18n/phone';
+
+// ─── Names ──────────────────────────────────────────────────────────────────
+
+/**
+ * Name particles that are kept lowercase when they appear after the first
+ * token of a name, and that are skipped when looking for the surname
+ * token. Entries are stored lowercased; the first token of a name is
+ * title-cased even when it is listed here.
+ */
+const PARTICLES: ReadonlySet<string> = new Set<string>([
+  // Dutch / German
+  'van', 'von', 'der', 'den',
+  // French
+  'de', 'des', 'du', 'le', 'la',
+  // Spanish / Portuguese
+  'del', 'da', 'das', 'do', 'dos', 'el', 'y',
+  // Italian
+  'di', 'dalla', 'della',
+  // Arabic article and the "Saint" abbreviations
+  'al', 'st', 'st.',
+]);
+
+/** Result of `normalizeName`. All three fields are empty / undefined when
+ *  the input is null, undefined, or only whitespace. */
+export interface NormalizedName {
+  /** Human-readable form preserved for UI display. Trims, collapses
+   *  whitespace, fixes case, but never destroys the user's intent —
+   *  slash-with-company structure ("Daniel Wainstein / 7 Knots, LLC")
+   *  is left intact. */
+  display: string;
+  /** Lowercased form for matching. This is `display` lowercased, so it
+   *  still contains the company segment when there is one. */
+  normalized: string;
+  /** Last non-particle token, lowercased. Used as a blocking key by the
+   *  dedup algorithm so we only compare candidates with similar surnames.
+   *  Taken from the text before the first `/` when a slash is present. */
+  surnameToken?: string;
+}
+
+/**
+ * Normalize a free-text full name.
+ *
+ * Whitespace (including \r / \n / \t) is collapsed to single spaces and
+ * trimmed, tokens are title-cased, particles (van / de / dalla / etc.)
+ * stay lowercase after the first token, and apostrophe surnames come out
+ * as O'Brien.
+ *
+ * If the input contains a `/` (slash-with-company structure like
+ * "Daniel Wainstein / 7 Knots, LLC"), only the text before the first
+ * slash is title-cased; the company text after it keeps its casing —
+ * it's signal, not noise. When one side of the slash is empty the result
+ * carries no dangling leading or trailing space.
+ *
+ * @param raw - The name as entered; null / undefined are treated as empty.
+ * @returns Display form, lowercased matching form, and the surname token.
+ *   All are empty / undefined when the input has no visible characters.
+ */
+export function normalizeName(raw: string | null | undefined): NormalizedName {
+  // `\s` already matches \r, \n and \t, so one pass replaces them and
+  // collapses runs of whitespace.
+  const cleaned = (raw ?? '').toString().replace(/\s+/g, ' ').trim();
+  if (!cleaned) {
+    return { display: '', normalized: '', surnameToken: undefined };
+  }
+
+  const slashIdx = cleaned.indexOf('/');
+  const hasSlash = slashIdx !== -1;
+  const personPart = hasSlash ? cleaned.slice(0, slashIdx).trim() : cleaned;
+
+  let display = titleCaseTokens(personPart);
+  if (hasSlash) {
+    // The company segment is typically a brand we shouldn't mangle
+    // ("SAS TIKI", "7 Knots, LLC"), so it is appended untouched.
+    const companyPart = cleaned.slice(slashIdx + 1).trim();
+    // The outer trim drops the padding space next to an empty side
+    // ("John /" → "John /", "/ Acme" → "/ Acme").
+    display = `${display} / ${companyPart}`.trim();
+  }
+
+  return {
+    display,
+    normalized: display.toLowerCase(),
+    surnameToken: computeSurnameToken(personPart),
+  };
+}
+
+/** Title-case each space-separated token of `s` and re-join with single
+ *  spaces. Empty tokens are dropped; only the first kept token is treated
+ *  as the start of the name. */
+function titleCaseTokens(s: string): string {
+  const words: string[] = [];
+  for (const raw of s.split(' ')) {
+    if (raw === '') continue;
+    words.push(titleCaseOneToken(raw, words.length === 0));
+  }
+  return words.join(' ');
+}
+
+/**
+ * Title-case a single name token.
+ *
+ * The token is lowercased first. A particle that is not the first token
+ * of the name is returned lowercase. Otherwise the first letter is
+ * capitalized, and so is the letter after every apostrophe (straight or
+ * typographic) and every hyphen, so "o'brien" becomes "O'Brien" and
+ * "SMITH-JONES" becomes "Smith-Jones".
+ *
+ * @param token - One whitespace-free token of the name.
+ * @param isFirst - True when this is the first token of the name.
+ * @returns The re-cased token, or '' for an empty token.
+ */
+function titleCaseOneToken(token: string, isFirst: boolean): string {
+  if (!token) return '';
+  const lower = token.toLowerCase();
+  if (!isFirst && PARTICLES.has(lower)) return lower;
+  // Capitalize a letter at the start of the token or directly after
+  // ', \u2019 (typographic apostrophe) or -.
+  return lower.replace(
+    /(^|['\u2019-])(\p{L})/gu,
+    (_match, sep: string, letter: string) => sep + letter.toUpperCase(),
+  );
+}
+
+/**
+ * Pick the blocking-key surname from the person part of a name: the
+ * right-most token that is not a particle, lowercased. If every token is
+ * a particle the last token is returned; empty input yields undefined.
+ */
+function computeSurnameToken(personPart: string): string | undefined {
+  const words = personPart
+    .replace(/[\r\n\t]/g, ' ')
+    .replace(/\s+/g, ' ')
+    .trim();
+  if (words === '') return undefined;
+  const lowered = words.split(' ').map((word) => word.toLowerCase());
+  // Search from the right so trailing particles are skipped.
+  const surname = [...lowered].reverse().find((word) => !PARTICLES.has(word));
+  return surname ?? lowered[lowered.length - 1];
+}
+
+// ─── Emails ─────────────────────────────────────────────────────────────────
+
+/** Validator used to decide whether a cleaned-up string is an email. */
+const emailSchema = z.string().email();
+
+/**
+ * Normalize a free-text email by trimming and lowercasing it.
+ *
+ * Plus-aliases (`user+tag@domain.com`) are kept as-is: they're real
+ * distinct addresses, and stripping them would auto-merge legitimately
+ * separate accounts.
+ *
+ * @param raw - The email as entered.
+ * @returns The trimmed, lowercased address, or null when the input is
+ *   null / undefined / empty or fails validation — the caller decides
+ *   whether to flag, store, or drop it.
+ */
+export function normalizeEmail(raw: string | null | undefined): string | null {
+  const candidate = raw == null ? '' : raw.toString().trim().toLowerCase();
+  if (candidate === '') return null;
+  return emailSchema.safeParse(candidate).success ? candidate : null;
+}
+
+// ─── Phones ─────────────────────────────────────────────────────────────────
+
+/** Quirk detected while normalizing a phone: `'multi_number'` when the
+ *  field held several numbers separated by `/`, `;` or `,`;
+ *  `'placeholder'` for a run of 8+ zeros; `'unparseable'` when nothing
+ *  valid could be parsed. */
+export type PhoneFlag = 'multi_number' | 'placeholder' | 'unparseable';
+
+/** Result of `normalizePhone`. On failure `e164`, `country` and `display`
+ *  are all null and `flagged` says why. */
+export interface NormalizedPhone {
+  /** Canonical E.164 form, e.g. '+15742740548'. Null when unparseable
+   *  or flagged as placeholder. */
+  e164: string | null;
+  /** ISO-3166-1 alpha-2 of the country the number was parsed against. */
+  country: CountryCode | null;
+  /** Display-friendly international format. Useful for migration reports. */
+  display: string | null;
+  /** Set when the input had a quirk worth surfacing in the migration
+   *  report or runtime audit log. Absent on clean parses. A successfully
+   *  parsed number can only carry `'multi_number'`; the other two flags
+   *  always come with null fields. */
+  flagged?: PhoneFlag;
+}
+
+/**
+ * Normalize a raw user-entered phone string for comparison + storage.
+ *
+ * Pipeline:
+ *   1. strip leading apostrophe (spreadsheet copy-paste artifact)
+ *   2. strip \r / \n / \t (real values seen in NocoDB had carriage returns)
+ *   3. detect multi-number fields ("+33611111111;+33622222222",
+ *      "0677580750/0690511494") — flag and keep the first non-empty segment
+ *   4. strip whitespace, dots, dashes, parens
+ *   5. convert leading "00" → "+" (international dialling code)
+ *   6. detect placeholder fakes (8+ consecutive zeros) — flag, return null e164
+ *   7. parse via libphonenumber-js
+ *   8. on parse failure or invalid number → flag 'unparseable'
+ *
+ * A 'placeholder' or 'unparseable' outcome replaces an earlier
+ * 'multi_number' flag, since the result carries a single flag.
+ *
+ * @param raw - The phone as entered.
+ * @param defaultCountry - Passed through to the parser for numbers
+ *   written without an international prefix.
+ * @returns null for null / undefined / blank input (cheaper to
+ *   short-circuit than to wrap); otherwise the normalized phone, with
+ *   null fields plus a flag when no valid number could be extracted.
+ */
+export function normalizePhone(
+  raw: string | null | undefined,
+  defaultCountry?: CountryCode,
+): NormalizedPhone | null {
+  if (raw == null) return null;
+  let cleaned = raw.toString().trim();
+  if (!cleaned) return null;
+
+  const failed = (flag: PhoneFlag): NormalizedPhone => ({
+    e164: null,
+    country: null,
+    display: null,
+    flagged: flag,
+  });
+
+  // 1. Spreadsheet apostrophe prefix.
+  if (cleaned.startsWith("'")) cleaned = cleaned.slice(1);
+
+  // 2. Strip carriage returns / newlines / tabs.
+  cleaned = cleaned.replace(/[\r\n\t]/g, '');
+
+  // 3. Multi-number detection — any of / ; , marks the field as holding
+  //    several numbers. Keep the first segment that has content, so a
+  //    stray leading separator doesn't discard the real number.
+  let flagged: PhoneFlag | undefined;
+  if (/[/;,]/.test(cleaned)) {
+    flagged = 'multi_number';
+    const segments = cleaned.split(/[/;,]/).map((segment) => segment.trim());
+    cleaned = segments.find((segment) => segment.length > 0) ?? '';
+  }
+
+  // 4. Strip whitespace, dots, dashes, parens. Keep + for E.164 prefix.
+  cleaned = cleaned.replace(/[\s.\-()]/g, '');
+  if (!cleaned) return failed('unparseable');
+
+  // 5. 00 international prefix → +.
+  if (cleaned.startsWith('00')) {
+    cleaned = '+' + cleaned.slice(2);
+  }
+
+  // 6. Placeholder fakes — runs of 8+ consecutive zeros, e.g. +447000000000.
+  if (/0{8,}/.test(cleaned)) return failed('placeholder');
+
+  // 7. Parse via the existing i18n helper (libphonenumber-js under the hood).
+  const parsed = parsePhone(cleaned, defaultCountry);
+  // 8. Anything without a valid E.164 form is reported as unparseable.
+  if (!parsed.e164 || !parsed.isValid) return failed('unparseable');
+
+  return {
+    e164: parsed.e164,
+    country: parsed.country,
+    display: parsed.international,
+    flagged,
+  };
+}
+
+// ─── Countries ──────────────────────────────────────────────────────────────
+
+/**
+ * Aliases for canonical country names that don't match
+ * `Intl.DisplayNames(en)` output verbatim. Keys are pre-normalized
+ * (lowercase, diacritic-free, hyphens/dots → spaces, collapsed whitespace).
+ *
+ * Kept opinionated and small — only entries we've actually seen in legacy
+ * data. Adding a new alias is cheap; trying to be exhaustive isn't.
+ *
+ * The table is a prototype-less object so that a bracket lookup with
+ * arbitrary user text (e.g. "constructor") can only ever hit a real
+ * alias, never a member inherited from `Object.prototype`.
+ */
+const COUNTRY_ALIASES: Record<string, CountryCode> = ((): Record<string, CountryCode> => {
+  const aliases: Record<string, CountryCode> = {
+    // Generic abbreviations
+    usa: 'US',
+    us: 'US',
+    uk: 'GB',
+    // Saint-Barthélemy variants seen in production
+    'saint barthelemy': 'BL',
+    'saint barth': 'BL',
+    'st barth': 'BL',
+    'st barths': 'BL',
+    'st barthelemy': 'BL',
+  };
+  return Object.assign(Object.create(null), aliases);
+})();
+
+/**
+ * High-frequency cities → country. `resolveCountry` consults this table
+ * after the alias and exact-name lookups have missed and before the
+ * fuzzy name match. Keys are normalized.
+ *
+ * Order matters: a key is also looked for inside longer inputs
+ * ("Sag Harbor Y" contains "sag harbor") and the first key found wins,
+ * so the most specific city appears first to avoid a wrong partial hit.
+ *
+ * The table is a prototype-less object so that a bracket lookup with
+ * arbitrary user text can never hit a member inherited from
+ * `Object.prototype`. Key order is unaffected.
+ */
+const CITY_TO_COUNTRY: Record<string, CountryCode> = ((): Record<string, CountryCode> => {
+  const cities: Record<string, CountryCode> = {
+    'kansas city': 'US',
+    'sag harbor': 'US',
+    'new york': 'US',
+    london: 'GB',
+    paris: 'FR',
+  };
+  return Object.assign(Object.create(null), cities);
+})();
+
+/** How a country was resolved: `'exact'` for an alias or exact country-name
+ *  hit, `'city'` for a hit in the city table, `'fuzzy'` for a near-miss
+ *  spelling of a country name. */
+export type CountryConfidence = 'exact' | 'fuzzy' | 'city';
+
+/** Result of `resolveCountry`. Both fields are null together when the
+ *  input could not be mapped to a country. */
+export interface ResolvedCountry {
+  /** Resolved country code, or null when nothing matched. */
+  iso: CountryCode | null;
+  /** Which lookup stage produced `iso`, or null when nothing matched. */
+  confidence: CountryConfidence | null;
+}
+
+/**
+ * Map free-text country / region input to an ISO-3166-1 alpha-2 code.
+ *
+ * Lookup order: alias → exact (vs. the English country names) → city →
+ * fuzzy (Levenshtein ≤ 2, inputs of 6+ characters only). Anything that
+ * misses all four returns nulls so the caller can flag the row for
+ * human review.
+ *
+ * @param text - Raw country / region text; null and undefined are allowed.
+ * @returns The resolved code with the stage that produced it, or
+ *   `{ iso: null, confidence: null }` when nothing matched.
+ */
+export function resolveCountry(text: string | null | undefined): ResolvedCountry {
+  if (text == null) return { iso: null, confidence: null };
+  const normalized = normalizeForLookup(text.toString());
+  if (!normalized) return { iso: null, confidence: null };
+
+  // User text is used as a lookup key, so only own entries may count —
+  // otherwise "constructor" would resolve to an inherited Object member.
+  const ownValue = (table: Record<string, CountryCode>, key: string): CountryCode | undefined =>
+    Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
+
+  // 1. Aliases — covers USA / UK / St Barth and friends.
+  const alias = ownValue(COUNTRY_ALIASES, normalized);
+  if (alias) return { iso: alias, confidence: 'exact' };
+
+  // 2. Exact match against the English country names, compared in the
+  //    same normalized shape as the input so 'United States' and
+  //    'united states' both resolve. The normalized names are kept so
+  //    the fuzzy stage doesn't have to rebuild them.
+  const countryNames: Array<[CountryCode, string]> = [];
+  for (const code of ALL_COUNTRY_CODES) {
+    const cleanName = normalizeForLookup(getCountryName(code, 'en'));
+    if (cleanName === normalized) return { iso: code, confidence: 'exact' };
+    countryNames.push([code, cleanName]);
+  }
+
+  // 3. City → country fallback: the whole input, or a city name appearing
+  //    as whole words inside it ("sag harbor y", "paris, france"). A city
+  //    embedded in a longer word ("parish") does not count.
+  const cityExact = ownValue(CITY_TO_COUNTRY, normalized);
+  if (cityExact) return { iso: cityExact, confidence: 'city' };
+  for (const [city, iso] of Object.entries(CITY_TO_COUNTRY)) {
+    if (containsWholeWords(normalized, city)) return { iso, confidence: 'city' };
+  }
+
+  // 4. Fuzzy fallback (Levenshtein ≤ 2). Skipped for inputs shorter than
+  //    6 characters because a short string like "Mars" sits within
+  //    distance 2 of multiple short country names (Mali, Laos, Iran, …).
+  //    On equal distance the first country in iteration order wins.
+  if (normalized.length >= 6) {
+    let bestCode: CountryCode | null = null;
+    let bestDistance = Number.POSITIVE_INFINITY;
+    for (const [code, cleanName] of countryNames) {
+      const d = levenshtein(cleanName, normalized);
+      if (d < bestDistance) {
+        bestDistance = d;
+        bestCode = code;
+      }
+    }
+    if (bestDistance <= 2 && bestCode) {
+      return { iso: bestCode, confidence: 'fuzzy' };
+    }
+  }
+
+  return { iso: null, confidence: null };
+}
+
+/** True when `phrase` occurs in `haystack` with no letter or digit
+ *  directly before or after it. */
+function containsWholeWords(haystack: string, phrase: string): boolean {
+  const isWordChar = (ch: string | undefined): boolean =>
+    ch !== undefined && /[\p{L}\p{N}]/u.test(ch);
+  let at = haystack.indexOf(phrase);
+  while (at !== -1) {
+    if (!isWordChar(haystack[at - 1]) && !isWordChar(haystack[at + phrase.length])) {
+      return true;
+    }
+    at = haystack.indexOf(phrase, at + 1);
+  }
+  return false;
+}
+
+/**
+ * Bring a string into the shape used for country lookups: decompose
+ * accented characters, drop the combining marks, lowercase, turn hyphens
+ * and dots into spaces, collapse whitespace, and trim. Applied to both
+ * the input and the canonical-name side of the comparison so they meet
+ * on the same shape.
+ */
+function normalizeForLookup(s: string): string {
+  return (
+    s
+      .normalize('NFD')
+      // Combining Diacritical Marks block, written as escapes so the
+      // pattern survives editors and encodings that mangle raw marks.
+      .replace(/[\u0300-\u036f]/g, '')
+      .toLowerCase()
+      .replace(/[-.]/g, ' ')
+      .replace(/\s+/g, ' ')
+      .trim()
+  );
+}
+
+// ─── Levenshtein ────────────────────────────────────────────────────────────
+
+/**
+ * Iterative Levenshtein edit distance (insert / delete / substitute, each
+ * costing 1), compared per UTF-16 code unit. Used by the country fuzzy
+ * match and by the dedup algorithm's name-similarity rule.
+ *
+ * Only two rows of the table are kept, sized by the shorter input, so
+ * memory is O(min(|a|, |b|)); time is O(|a| * |b|). Callers shouldn't
+ * run it against pathological inputs — the dedup blocking strategy
+ * keeps comparison sets small.
+ *
+ * Exported so the find-matches module can reuse the same implementation
+ * without relying on an external dep.
+ *
+ * @param a - First string.
+ * @param b - Second string.
+ * @returns The number of single-unit edits needed to turn one into the other.
+ */
+export function levenshtein(a: string, b: string): number {
+  if (a === b) return 0;
+  if (!a) return b.length;
+  if (!b) return a.length;
+
+  // The distance is symmetric, so lay the shorter string along the row
+  // to keep the two rolling rows as small as possible.
+  const [longer, shorter] = a.length >= b.length ? [a, b] : [b, a];
+  const rows = longer.length;
+  const cols = shorter.length;
+
+  let prev = Array.from({ length: cols + 1 }, (_unused, j) => j);
+  let curr = new Array<number>(cols + 1).fill(0);
+
+  for (let i = 1; i <= rows; i += 1) {
+    curr[0] = i;
+    for (let j = 1; j <= cols; j += 1) {
+      const cost = longer[i - 1] === shorter[j - 1] ? 0 : 1;
+      // insertion, deletion, substitution
+      curr[j] = Math.min(curr[j - 1]! + 1, prev[j]! + 1, prev[j - 1]! + cost);
+    }
+    // The finished row becomes `prev`; the old one is reused as scratch.
+    [prev, curr] = [curr, prev];
+  }
+
+  return prev[cols]!;
+}