feat(dedup): normalization + match-finding library (P1)

The pure-logic spine of the client deduplication system spec'd in docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md. Two modules, JSX-free, vitest-tested against fixtures drawn directly from real dirty values observed in the legacy NocoDB Interests audit. src/lib/dedup/normalize.ts - normalizeName: trims whitespace, replaces \r/\n/\t, intelligently title-cases ALL-CAPS surnames while keeping particles (van / de / dalla / etc.) lowercase mid-name. Preserves Irish O' surnames and the "slash-with-company" structure ("Daniel Wainstein / 7 Knots, LLC") seen in production. Returns a surnameToken (lowercased last non-particle token) for use as a dedup blocking key. - normalizeEmail: trim + lowercase + zod email validation. Plus-aliases preserved; null on invalid. - normalizePhone: pre-cleans the input (strips spreadsheet apostrophes, carriage returns, dots/dashes/parens, converts 00 prefix to +) then delegates to libphonenumber-js. Detects multi-number fields ("a/b", "a;b") and placeholder fakes (8+ consecutive zeros, e.g. +447000000000). Flags every quirk so the migration report and runtime audit log can surface it. - resolveCountry: maps free-text country/region input to ISO-3166-1 alpha-2 via alias → exact (vs. Intl-derived names) → city → fuzzy (Levenshtein ≤ 2). Fuzzy is gated by length so 4-char inputs ("Mars") don't false-positive against short country names. - levenshtein: standard iterative implementation, exported for reuse by find-matches. src/lib/dedup/find-matches.ts - findClientMatches: builds three blocking indexes off the pool (email / phone / surname-token), gathers the comparison set via union, and scores each candidate via the rule set in design §4.2: Email match +60 Phone E.164 match +50 (≥ 8 digits, excludes placeholder zeros) Name exact match +20 Surname + given fuzzy +15 (Levenshtein ≤ 1) Negative: shared email but different phone country −15 Negative: name match but no shared contact −20 Score is clamped to [0,100]. 
Confidence tier ('high' / 'medium' / 'low') is derived from configurable thresholds passed in by the caller — defaults are highScore=90, mediumScore=50. tests/unit/dedup/normalize.test.ts (38 cases) Every dirty-data pattern from design §1.3 has a fixture: carriage returns in names, ALL-CAPS surnames, lowercase entries, particles, slash-with-company, plus-aliases, capitalized email localparts, spreadsheet-apostrophe phones, multi-number phones, placeholder phones, 00-prefix phones, French/UK local-format phones, Saint-Barthélemy diacritic variants, Kansas City fallback. tests/unit/dedup/find-matches.test.ts (12 cases) Each duplicate cluster from design §1.2 has a test: - Pattern A (Deepak Ramchandani — pure double-submit) → high - Pattern B (Howard Wiarda — phone format variance) → high - Pattern C (Nicolas Ruiz — name capitalization) → high - Pattern D (Chris/Christopher Allen — name shortening) → high - Pattern E (Christopher Camazou — typo on resubmit) → high or medium - Pattern E (Constanzo/Costanzo — surname typo, multi-yacht) → high - Pattern F (Etiennette Clamouze — same name, different country) → must NOT auto-merge - Pattern F (Bruno+Bruce — shared household contact) → no match - Negative evidence (same email, different phone country) → medium - Blocking (no shared keys → 0 matches) - Sort order (high before low) - Empty pool Total: 50 new tests, all green. Zero changes to runtime behavior or schema; unblocks P2 (runtime surfaces) and P3 (NocoDB migration). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 14:28:59 +02:00
parent e2398099c4
commit 8b077e1999
4 changed files with 1293 additions and 0 deletions
--- a/tests/unit/dedup/find-matches.test.ts
+++ b/tests/unit/dedup/find-matches.test.ts
@@ -0,0 +1,379 @@
+/**
+ * Match-finding library — unit tests.
+ *
+ * Each duplicate cluster from the legacy NocoDB Interests audit (see
+ * docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.2)
+ * is encoded as a fixture here. The expected scoring tier (high / medium
+ * / low) is the design contract — if the algorithm starts returning
+ * "high" for a Pattern F case (Etiennette / Bruno+Bruce) it has lost
+ * the false-positive guard and we'll know immediately.
+ */
+import { describe, expect, it } from 'vitest';
+
+import { findClientMatches, type MatchCandidate } from '@/lib/dedup/find-matches';
+
+// Tier cut-offs passed to every findClientMatches call below — the design's recommended thresholds.
+const THRESHOLDS = {
+  highScore: 90,
+  mediumScore: 50,
+};
+
+function candidate(partial: Partial<MatchCandidate> & { id: string }): MatchCandidate {
+  // Fixture builder: a test spells out only the fields it exercises. Anything
+  // omitted falls back to null (scalar fields) or an empty list (emails, phones).
+  const { id, fullName, surnameToken, emails, phonesE164, countryIso } = partial;
+  const identity = { id, fullName: fullName ?? null, surnameToken: surnameToken ?? null };
+  const contact = { emails: emails ?? [], phonesE164: phonesE164 ?? [] };
+
+  // Resulting key order: id, fullName, surnameToken, emails, phonesE164, countryIso.
+  return { ...identity, ...contact, countryIso: countryIso ?? null };
+}
+
+describe('findClientMatches', () => {
+  describe('Pattern A — pure double-submit (high confidence)', () => {
+    it('flags identical email + phone as high', () => {
+      // Real-data pair: Deepak Ramchandani #624/#625. Every field is identical,
+      // so both rows come from one factory and differ only by id.
+      const row = (id: string) =>
+        candidate({
+          id,
+          fullName: 'Deepak Ramchandani',
+          surnameToken: 'ramchandani',
+          emails: ['dannyrams8888@gmail.com'],
+          phonesE164: ['+17215868888'],
+        });
+      const incoming = row('b');
+      const pool = [row('a')];
+
+      const matches = findClientMatches(incoming, pool, THRESHOLDS);
+
+      // Exactly one hit, and it is the pooled row.
+      expect(matches).toHaveLength(1);
+      const best = matches[0]!;
+      expect(best.candidate.id).toBe('a');
+
+      // Matching email and phone must reach the top tier, and both must be
+      // listed among the reasons.
+      expect(best.score).toBeGreaterThanOrEqual(90);
+      expect(best.confidence).toBe('high');
+      expect(best.reasons).toEqual(expect.arrayContaining(['email match', 'phone match']));
+    });
+  });
+
+  describe('Pattern B — same email, different phone format (high)', () => {
+    it('high confidence when phones already normalize-equal', () => {
+      // Real-data pair: Howard Wiarda #236/#536, entered as "574-274-0548" and
+      // "+15742740548". Both normalize to one E.164 value, so the phone rule fires.
+      const email = 'hwiarda@hotmail.com';
+      const phone = '+15742740548';
+      const incoming = candidate({
+        id: 'b',
+        fullName: 'Howard Wiarda',
+        surnameToken: 'wiarda',
+        emails: [email],
+        phonesE164: [phone],
+      });
+      const existing = candidate({
+        id: 'a',
+        fullName: 'Howard Wiarda',
+        surnameToken: 'wiarda',
+        emails: [email],
+        phonesE164: [phone],
+      });
+
+      const [top] = findClientMatches(incoming, [existing], THRESHOLDS);
+
+      expect(top!.confidence).toBe('high');
+      expect(top!.score).toBeGreaterThanOrEqual(90);
+    });
+  });
+
+  describe('Pattern C — name capitalization variant (high)', () => {
+    it('scores high when case-variant rows normalize to identical name + email + phone', () => {
+      // From real data: Nicolas Ruiz #681/#682/#683, email differs only by case.
+      // The fixtures are spelled identically here; case folding is assumed to happen before matching.
+      const incoming = candidate({
+        id: 'b',
+        fullName: 'Nicolas Ruiz',
+        surnameToken: 'ruiz',
+        emails: ['ruiz.nicolas@ufl.edu'],
+        phonesE164: ['+17862006617'],
+      });
+      const pool = [
+        candidate({
+          id: 'a',
+          fullName: 'Nicolas Ruiz',
+          surnameToken: 'ruiz',
+          emails: ['ruiz.nicolas@ufl.edu'],
+          phonesE164: ['+17862006617'],
+        }),
+      ];
+
+      const matches = findClientMatches(incoming, pool, THRESHOLDS);
+      expect(matches[0]!.confidence).toBe('high');
+    });
+  });
+
+  describe('Pattern D — name shortening (high)', () => {
+    it('Chris vs Christopher with same email + phone scores high', () => {
+      // Real-data pair: Chris Allen #700 vs Christopher Allen #534. Only the
+      // given name differs, so the shared surname token and contact details
+      // live in one builder.
+      const allen = (id: string, fullName: string) =>
+        candidate({
+          id,
+          fullName,
+          surnameToken: 'allen',
+          emails: ['chris@thundercatsports.com'],
+          phonesE164: ['+17814548950'],
+        });
+
+      // The short form is the incoming row; the long form is in the pool.
+      const incoming = allen('b', 'Chris Allen');
+      const pool = [allen('a', 'Christopher Allen')];
+
+      const matches = findClientMatches(incoming, pool, THRESHOLDS);
+      const best = matches[0]!;
+
+      // The differing given name must not keep the pair out of the top tier.
+      expect(best.confidence).toBe('high');
+    });
+  });
+
+  describe('Pattern E — typo on resubmit', () => {
+    it('same email + nearly-identical phone (typo in last digits) scores at least medium', () => {
+      // Christopher Camazou #649/#650 — phone differs in last 4 digits but
+      // everything else matches. Exact phone equality fails; email exact
+      // match alone (60) + name-token match (20) puts us in medium tier.
+      // The user can confirm the merge.
+      const incoming = candidate({
+        id: 'b',
+        fullName: 'Christopher Camazou',
+        surnameToken: 'camazou',
+        emails: ['camazou11@gmail.com'],
+        phonesE164: ['+33608334455'],
+      });
+      const pool = [
+        candidate({
+          id: 'a',
+          fullName: 'Christopher Camazou',
+          surnameToken: 'camazou',
+          emails: ['camazou11@gmail.com'],
+          phonesE164: ['+33608336549'],
+        }),
+      ];
+
+      const matches = findClientMatches(incoming, pool, THRESHOLDS);
+
+      expect(matches).toHaveLength(1);
+      // Email + name match without phone match — strong but not certain.
+      expect(matches[0]!.confidence).toMatch(/^(high|medium)$/);
+      expect(matches[0]!.score).toBeGreaterThanOrEqual(70);
+    });
+
+    it('Constanzo / Costanzo surname typo with same email + phone scores high', () => {
+      // Gianfranco Di Constanzo #585 vs Di Costanzo #336 — same email + phone
+      // and only a 1-letter surname typo. This is a strong "same client,
+      // multiple yachts" signal — the design's signature win.
+      const incoming = candidate({
+        id: 'b',
+        fullName: 'Gianfranco Di Constanzo',
+        surnameToken: 'constanzo',
+        emails: ['gdc@nauticall.com'],
+        phonesE164: ['+17542628669'],
+      });
+      const pool = [
+        candidate({
+          id: 'a',
+          fullName: 'Gianfranco Di Costanzo',
+          surnameToken: 'costanzo',
+          emails: ['gdc@nauticall.com'],
+          phonesE164: ['+17542628669'],
+        }),
+      ];
+
+      const matches = findClientMatches(incoming, pool, THRESHOLDS);
+
+      expect(matches[0]!.confidence).toBe('high');
+      expect(matches[0]!.score).toBeGreaterThanOrEqual(90);
+    });
+  });
+
+  describe('Pattern F — hard cases (must NOT auto-merge)', () => {
+    it('same name with different country phone + different email scores at most medium', () => {
+      // Etiennette Clamouze #188/#717 — same name but completely different
+      // email + phone (and the phones are in different country codes,
+      // suggesting either a relative, a coworker, or a name-collision).
+      // We must NOT classify this as "high" or it would force-merge two
+      // distinct people.
+      const incoming = candidate({
+        id: 'b',
+        fullName: 'Etiennette Clamouze',
+        surnameToken: 'clamouze',
+        emails: ['etiennette@the-manoah.com'],
+        phonesE164: ['+12645815607'],
+        countryIso: 'AI',
+      });
+      const pool = [
+        candidate({
+          id: 'a',
+          fullName: 'Etiennette Clamouze',
+          surnameToken: 'clamouze',
+          emails: ['clamouze.etiennette@gmail.com'],
+          phonesE164: ['+33767780640'],
+          countryIso: 'FR',
+        }),
+      ];
+
+      const matches = findClientMatches(incoming, pool, THRESHOLDS);
+
+      // The pair may surface for review or not at all; the only hard rule is
+      // that nothing returned reaches the high tier (checked for every match).
+      for (const match of matches) {
+        expect(match.confidence).not.toBe('high');
+        expect(match.score).toBeLessThan(90);
+      }
+    });
+
+    it('different names, emails and phones (same household) never score above low', () => {
+      // Bruno Joyerot #18 vs Bruce Hearn #19 — Bruno's row shows email
+      // belonging to "catherine elaine hearn" (Bruce's spouse). Same
+      // household phone area code. Name overlap is partial. Don't merge.
+      const incoming = candidate({
+        id: 'b',
+        fullName: 'Bruce Hearn',
+        surnameToken: 'hearn',
+        emails: ['bhearn1063@gmail.com'],
+        phonesE164: ['+12642358840'],
+      });
+      const pool = [
+        candidate({
+          id: 'a',
+          fullName: 'Bruno Joyerot',
+          surnameToken: 'joyerot',
+          emails: ['catherineelainehearn@gmail.com'],
+          phonesE164: ['+12642352816'],
+        }),
+      ];
+
+      const matches = findClientMatches(incoming, pool, THRESHOLDS);
+
+      // Names don't match, emails don't match, phones differ — there's
+      // no reason for this to surface at all. Either no match or low.
+      for (const match of matches) {
+        expect(match.confidence).toBe('low');
+      }
+    });
+  });
+
+  describe('Negative evidence — same email but different country phone', () => {
+    it('reduces score when email matches but phone country differs', () => {
+      // Constructed pair: one shared inbox, but a +1 (US) phone on one side and
+      // a +33 (FR) phone on the other — likely a shared-inbox spouse situation.
+      // The pair should land in review (medium tier), not in the high tier.
+      const sharedIdentity = () => ({
+        fullName: 'Test User',
+        surnameToken: 'user',
+        emails: ['shared@example.com'],
+      });
+
+      const incoming = candidate({
+        id: 'b',
+        ...sharedIdentity(),
+        phonesE164: ['+15551234567'],
+        countryIso: 'US',
+      });
+      const existing = candidate({
+        id: 'a',
+        ...sharedIdentity(),
+        phonesE164: ['+33611111111'],
+        countryIso: 'FR',
+      });
+
+      const [match] = findClientMatches(incoming, [existing], THRESHOLDS);
+
+      // Email match (60) + name token match (20) = 80 is already medium;
+      // the differing phone country lowers the score further.
+      expect(match!.confidence).toBe('medium');
+    });
+  });
+
+  describe('Blocking — only relevant candidates are scored', () => {
+    it('does not score candidates with no shared emails / phones / surname token', () => {
+      // Neither pooled row shares an email, a phone or a surname token with
+      // the incoming row declared after them.
+      const pool = [
+        candidate({
+          id: 'unrelated1',
+          fullName: 'Bob Jones',
+          surnameToken: 'jones',
+          emails: ['bob@example.org'],
+          phonesE164: ['+33611111111'],
+        }),
+        candidate({
+          id: 'unrelated2',
+          fullName: 'Carol White',
+          surnameToken: 'white',
+          emails: ['carol@example.net'],
+          phonesE164: ['+447700900111'],
+        }),
+      ];
+      const incoming = candidate({
+        id: 'newbie',
+        fullName: 'Alice Smith',
+        surnameToken: 'smith',
+        emails: ['alice@example.com'],
+        phonesE164: ['+15551234567'],
+      });
+
+      expect(findClientMatches(incoming, pool, THRESHOLDS)).toHaveLength(0);
+    });
+  });
+
+  describe('Empty pool', () => {
+    it('returns no matches when the pool is empty', () => {
+      // Nothing to compare against, so the result must be an empty list.
+      const pool: MatchCandidate[] = [];
+      const incoming = candidate({ id: 'a', fullName: 'Alice', emails: ['alice@example.com'] });
+
+      const matches = findClientMatches(incoming, pool, THRESHOLDS);
+      expect(matches).toEqual([]);
+    });
+  });
+
+  describe('Sort order', () => {
+    it('returns matches sorted by score descending', () => {
+      const incoming = candidate({
+        id: 'incoming',
+        fullName: 'John Smith',
+        surnameToken: 'smith',
+        emails: ['john@example.com'],
+        phonesE164: ['+15551234567'],
+      });
+      const pool = [
+        candidate({
+          // Weaker match (same email only) is listed FIRST so pool order != score order.
+          id: 'medium-match',
+          fullName: 'Different Person',
+          surnameToken: 'person',
+          emails: ['john@example.com'],
+          phonesE164: ['+33611111111'],
+        }),
+        candidate({
+          // Stronger match (same email + phone) is listed last and must still come back first.
+          id: 'high-match',
+          fullName: 'John Smith',
+          surnameToken: 'smith',
+          emails: ['john@example.com'],
+          phonesE164: ['+15551234567'],
+        }),
+      ];
+
+      const matches = findClientMatches(incoming, pool, THRESHOLDS);
+
+      expect(matches.length).toBeGreaterThanOrEqual(2);
+      expect(matches[0]!.candidate.id).toBe('high-match');
+      expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
+    });
+  });
+});
--- a/tests/unit/dedup/normalize.test.ts
+++ b/tests/unit/dedup/normalize.test.ts
@@ -0,0 +1,270 @@
+/**
+ * Normalization library — unit tests.
+ *
+ * Every fixture here comes from real dirty values observed in the legacy
+ * NocoDB Interests table during the 2026-05-03 audit (see
+ * docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.3).
+ * The point is regression-prevention: if any of these patterns ever
+ * stops normalizing the way it should, dedup quality silently drops.
+ */
+import { describe, expect, it } from 'vitest';
+
+import {
+  normalizeName,
+  normalizeEmail,
+  normalizePhone,
+  resolveCountry,
+} from '@/lib/dedup/normalize';
+
+describe('normalizeName', () => {
+  it('returns empty strings and an undefined surnameToken for empty / blank input', () => {
+    expect(normalizeName('')).toEqual({ display: '', normalized: '', surnameToken: undefined });
+    expect(normalizeName('   ')).toEqual({
+      display: '',
+      normalized: '',
+      surnameToken: undefined,
+    });
+  });
+
+  it('trims leading/trailing whitespace', () => {
+    expect(normalizeName('  Marcus Laurent  ')).toMatchObject({
+      display: 'Marcus Laurent',
+      normalized: 'marcus laurent',
+    });
+  });
+
+  it('collapses repeated internal whitespace to a single space', () => {
+    // From real data: "Arthur  Matthews" (#183), "Corinne  Roche" (#208).
+    expect(normalizeName('Arthur  Matthews').display).toBe('Arthur Matthews');
+    expect(normalizeName('Corinne   Roche').display).toBe('Corinne Roche');
+  });
+
+  it('replaces embedded carriage returns and newlines with single spaces', () => {
+    // From real data: "Andrei \nVAGNANOV" (#178), "Daniel\r PRZEDBORSKI" (#175).
+    expect(normalizeName('Andrei \nVAGNANOV').display).toBe('Andrei Vagnanov');
+    expect(normalizeName('Daniel\r PRZEDBORSKI').display).toBe('Daniel Przedborski');
+  });
+
+  it('title-cases ALL-CAPS surnames while keeping given name title-cased', () => {
+    // From real data: "Jona ANDERSEN" (#232), "Duane SALTSGAVER" (#227),
+    // "Marcos DALLA PRIA" (#165).
+    expect(normalizeName('Jona ANDERSEN').display).toBe('Jona Andersen');
+    expect(normalizeName('Duane SALTSGAVER').display).toBe('Duane Saltsgaver');
+    // Particle 'dalla' stays lowercase mid-name.
+    expect(normalizeName('Marcos DALLA PRIA').display).toBe('Marcos dalla Pria');
+  });
+
+  it('title-cases lowercased entries', () => {
+    // From real data: "antony amaral" (#665), "david rosenbloom" (#239),
+    // "john Tickner" (#247).
+    expect(normalizeName('antony amaral').display).toBe('Antony Amaral');
+    expect(normalizeName('david rosenbloom').display).toBe('David Rosenbloom');
+    expect(normalizeName('john Tickner').display).toBe('John Tickner');
+  });
+
+  it('keeps Romance and Germanic particles lowercase mid-name', () => {
+    // From real data: "Olav van Velsen" (#526), "Bruno Joyerot" (#18),
+    // "OLIVIER DAIN" (#677). Also synthetic "Carla de la Cruz".
+    expect(normalizeName('Olav van Velsen').display).toBe('Olav van Velsen');
+    expect(normalizeName('Carla de la Cruz').display).toBe('Carla de la Cruz');
+    expect(normalizeName('OLIVIER DAIN').display).toBe('Olivier Dain');
+  });
+
+  it("preserves O'-prefixed Irish surnames as title-case", () => {
+    expect(normalizeName("liam o'brien").display).toBe("Liam O'Brien");
+  });
+
+  it('keeps the slash-with-company structure intact', () => {
+    // From real data: "Daniel Wainstein / 7 Knots, LLC" (#637),
+    // "Bruno Joyerot / SAS TIKI" (#18).
+    expect(normalizeName('Daniel Wainstein / 7 Knots, LLC').display).toBe(
+      'Daniel Wainstein / 7 Knots, LLC',
+    );
+    expect(normalizeName('Bruno Joyerot / SAS TIKI').display).toBe('Bruno Joyerot / SAS TIKI');
+  });
+
+  it('exposes the last non-particle token as surnameToken (lowercase) for blocking', () => {
+    expect(normalizeName('Marcus Laurent').surnameToken).toBe('laurent');
+    expect(normalizeName('Olav van Velsen').surnameToken).toBe('velsen');
+    expect(normalizeName('Carla de la Cruz').surnameToken).toBe('cruz');
+    expect(normalizeName("Liam O'Brien").surnameToken).toBe("o'brien");
+  });
+
+  it('handles single-token names — surnameToken is the only token', () => {
+    expect(normalizeName('Madonna').surnameToken).toBe('madonna');
+  });
+
+  it('produces a normalized form that is always lowercase', () => {
+    expect(normalizeName('Andrei VAGNANOV').normalized).toBe('andrei vagnanov');
+    expect(normalizeName('Daniel Wainstein / 7 Knots, LLC').normalized).toBe(
+      'daniel wainstein / 7 knots, llc',
+    );
+  });
+});
+
+describe('normalizeEmail', () => {
+  it('returns null for empty / null inputs', () => {
+    const blanks = ['', '   '];
+    blanks.forEach((raw) => expect(normalizeEmail(raw)).toBeNull());
+  });
+
+  it('lowercases and trims', () => {
+    // Real-data pair (#183/#686): "Arthur@laser-align.com" vs "arthur@laser-align.com".
+    const mixedCase = normalizeEmail('Arthur@laser-align.com');
+    const padded = normalizeEmail('  marcus@example.com  ');
+    expect([mixedCase, padded]).toEqual(['arthur@laser-align.com', 'marcus@example.com']);
+  });
+
+  it('lowercases capitalized localparts', () => {
+    // Real data: "Bmalone850@gmail.com" (#489), "Hef355@yahoo.com" (#533),
+    // "Donclaytonmusic@gmail.com" (#679).
+    const bmalone = normalizeEmail('Bmalone850@gmail.com');
+    const hef = normalizeEmail('Hef355@yahoo.com');
+    expect([bmalone, hef]).toEqual(['bmalone850@gmail.com', 'hef355@yahoo.com']);
+  });
+
+  it('preserves plus-aliases — both legitimate and tricks', () => {
+    // Design §3.2: "+aliases" are never stripped; comparison uses the full localpart.
+    expect(normalizeEmail('marcus+sales@example.com')).toBe('marcus+sales@example.com');
+  });
+
+  it('returns null for invalid email shapes', () => {
+    const malformed = ['not-an-email', '@example.com', 'user@', 'user@.com'];
+    malformed.forEach((raw) => expect(normalizeEmail(raw)).toBeNull());
+  });
+});
+
+describe('normalizePhone', () => {
+  it('returns null for empty / whitespace / null', () => {
+    expect(normalizePhone('', 'AI')).toBeNull();
+    expect(normalizePhone('   ', 'AI')).toBeNull();
+  });
+
+  it('parses a plain E.164 number', () => {
+    expect(normalizePhone('+15742740548', 'US')).toMatchObject({
+      e164: '+15742740548',
+      country: 'US',
+    });
+  });
+
+  it('strips embedded carriage returns and trailing whitespace', () => {
+    // From real data: "+1-264-235-8840\r" (#19), "+1-264-772-3272\r" (#20).
+    const out = normalizePhone('+1-264-235-8840\r', 'AI');
+    expect(out?.e164).toBe('+12642358840');
+  });
+
+  it('strips dashes, dots, parens, single quotes, spaces in a single pass', () => {
+    // From real data: "'+1.214.603.4235" (#205), "574-274-0548" (#236),
+    // "+1-264-235-8840" (#19), "+1 (212) 555-0123" (synthetic).
+    expect(normalizePhone("'+1.214.603.4235", 'US')?.e164).toBe('+12146034235');
+    expect(normalizePhone('574-274-0548', 'US')?.e164).toBe('+15742740548');
+    expect(normalizePhone('+1 (212) 555-0123', 'US')?.e164).toBe('+12125550123');
+  });
+
+  it('converts a leading 00 prefix to + (international dialling)', () => {
+    // From real data: "00447956657022" (#216), "0033651381036" (#702).
+    expect(normalizePhone('00447956657022', 'GB')?.e164).toBe('+447956657022');
+    expect(normalizePhone('0033651381036', 'FR')?.e164).toBe('+33651381036');
+  });
+
+  it('uses defaultCountry when input has no international prefix', () => {
+    // From real data: "0690699699" (#203, French local), "0651381036" (#701).
+    expect(normalizePhone('0690699699', 'FR')?.e164).toBe('+33690699699');
+    expect(normalizePhone('0651381036', 'FR')?.e164).toBe('+33651381036');
+  });
+
+  it('yields no E.164 number when there is no prefix AND no defaultCountry', () => {
+    // The migration script flags these for human review.
+    const out = normalizePhone('5742740548');
+    expect(out?.e164 ?? null).toBeNull();
+  });
+
+  it('flags placeholder all-zeros numbers and yields a null e164', () => {
+    // From real data: "+447000000000" (#641, "Milos Vitkovic" — clearly fake).
+    const out = normalizePhone('+447000000000', 'GB');
+    expect(out?.flagged).toBe('placeholder');
+    expect(out?.e164).toBeNull();
+  });
+
+  it('flags multi-number fields and uses the first segment', () => {
+    // From real data: "0677580750/0690511494" (#209). The ";" separator is covered too.
+    const slash = normalizePhone('0677580750/0690511494', 'FR');
+    expect(slash?.flagged).toBe('multi_number');
+    expect(slash?.e164).toBe('+33677580750');
+
+    const semi = normalizePhone('+33611111111;+33622222222', 'FR');
+    expect(semi?.flagged).toBe('multi_number');
+    expect(semi?.e164).toBe('+33611111111');
+  });
+
+  it('flags genuinely unparseable input as `unparseable`', () => {
+    const out = normalizePhone('xyz-not-a-phone', 'US');
+    expect(out?.flagged).toBe('unparseable');
+    expect(out?.e164).toBeNull();
+  });
+
+  it('strips an apostrophe-prefix without breaking the parse', () => {
+    // From real data: leading "'" copy-pasted from spreadsheets escapes
+    // numeric-cell coercion. Should be invisible to dedup.
+    expect(normalizePhone("'0690699699", 'FR')?.e164).toBe('+33690699699');
+  });
+
+  it('returns the country alongside the E.164 form', () => {
+    expect(normalizePhone('+33690699699', 'FR')).toMatchObject({
+      e164: '+33690699699',
+      country: 'FR',
+    });
+  });
+});
+
+describe('resolveCountry', () => {
+  it('returns null for empty / nullish input', () => {
+    const unresolved = { iso: null, confidence: null };
+    ['', '   '].forEach((raw) => expect(resolveCountry(raw)).toEqual(unresolved));
+  });
+
+  it('exact-matches a canonical English country name', () => {
+    const canonical = { Anguilla: 'AI', 'United Kingdom': 'GB', 'United States': 'US' };
+    for (const [name, iso] of Object.entries(canonical)) {
+      expect(resolveCountry(name)).toEqual({ iso, confidence: 'exact' });
+    }
+  });
+
+  it('matches case-insensitively', () => {
+    expect(resolveCountry('anguilla')).toMatchObject({ iso: 'AI' });
+    expect(resolveCountry('UNITED KINGDOM')).toMatchObject({ iso: 'GB' });
+  });
+
+  it('matches values with surrounding whitespace', () => {
+    expect(resolveCountry('  United States  ')).toMatchObject({ iso: 'US' });
+  });
+
+  it('handles diacritic variants of Saint-Barthélemy', () => {
+    // Real data: "Saint barthelemy" (#203), "St Barth" (#208), "Saint-Barthélemy".
+    const spellings = ['Saint-Barthélemy', 'Saint Barthelemy', 'saint barthelemy', 'St Barth'];
+    spellings.forEach((raw) => expect(resolveCountry(raw).iso).toBe('BL'));
+  });
+
+  it('resolves common abbreviations', () => {
+    const { iso: usa } = resolveCountry('USA');
+    const { iso: uk } = resolveCountry('UK');
+    expect({ usa, uk }).toEqual({ usa: 'US', uk: 'GB' });
+  });
+
+  it('falls back to a city → country mapping for high-frequency cities', () => {
+    // Real data: "Kansas City" (#198), "Sag Harbor Y" (#239).
+    const cities = ['Kansas City', 'Sag Harbor Y'];
+    cities.forEach((city) => expect(resolveCountry(city).iso).toBe('US'));
+  });
+
+  it('marks the confidence tier appropriately', () => {
+    const tiers = ['Anguilla', 'Kansas City'].map((raw) => resolveCountry(raw).confidence);
+    expect(tiers).toEqual(['exact', 'city']);
+  });
+
+  it('returns null + null for unresolvable values', () => {
+    // The migration script flags these for human review rather than guessing.
+    const unresolved = { iso: null, confidence: null };
+    ['asdfghjkl xyz', 'Mars'].forEach((raw) => expect(resolveCountry(raw)).toEqual(unresolved));
+  });
+});