src/lib/dedup/find-matches.ts

/**
 * Client-match finder — pure scoring logic.
 *
 * Compares one input candidate against a pool of existing candidates and
 * returns scored matches. Used by:
 *   - the at-create suggestion in client/interest forms (Layer 1)
 *   - the public-form auto-link path (when score >= block threshold)
 *   - the nightly background scoring job (Layer 3)
 *   - the migration script's dedup pass
 *
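 * Performance shape: blocking via email / phone / surname-token limits
 * scoring to pool entries that share at least one key with the input,
 * which cuts an all-pairs pass from O(n²) scored comparisons to ~O(n) for
 * any pool size we'll see in production. Note that each call still walks
 * the whole pool once to build its indexes. See `findClientMatches` for
 * the blocking implementation.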
 *
 * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §4.
 */

import { parsePhone } from '@/lib/i18n/phone';

import { levenshtein } from './normalize';

// ─── Types ──────────────────────────────────────────────────────────────────

export interface MatchCandidate {
  id: string;
  fullName: string | null;
  /** Lowercased last non-particle token from `normalizeName(...).surnameToken`.
   *  Used as a blocking key. */
  surnameToken: string | null;
  /** Already lowercased + validated via `normalizeEmail`. */
  emails: string[];
  /** Already canonical E.164 via `normalizePhone`. */
  phonesE164: string[];
  /** Address country (NOT phone country) — used for tiebreaking, not scoring. */
  countryIso: string | null;
}

/** Confidence tier of a match, derived from its score and the caller's
 *  `DedupThresholds`: `'high'` at or above `highScore`, `'medium'` at or
 *  above `mediumScore`, otherwise `'low'`. */
export type MatchConfidence = 'high' | 'medium' | 'low';

/**
 * Outcome of scoring the input against one pool candidate.
 */
export interface MatchResult {
  /** The pool entry that was compared against the input. */
  candidate: MatchCandidate;
  /** 0–100 after capping. The raw sum of rule weights is clamped into this
   *  range, so a result can legitimately carry a score of 0. */
  score: number;
  /** Human-readable list of which rules contributed. Useful for the
   *  review queue UI ("matched on email + phone + surname token").
   *  Entries appear in the order the rules fired; penalty rules are
   *  included too and end with "(negative)". */
  reasons: string[];
  /** Tier assigned by `findClientMatches` from `score` and the thresholds
   *  it was given. */
  confidence: MatchConfidence;
}

/**
 * Caller-supplied score cut-offs used to turn a numeric score into a
 * `MatchConfidence` tier. `highScore` is tested first, then `mediumScore`.
 */
export interface DedupThresholds {
  /** Inclusive lower bound for `'high'` confidence. */
  highScore: number;
  /** Inclusive lower bound for `'medium'` confidence. Below this is `'low'`. */
  mediumScore: number;
}

// ─── Public entry point ─────────────────────────────────────────────────────

/**
 * Find pool entries that could be the same party as `input` and score them.
 *
 * Only pool entries sharing at least one blocking key with the input (an
 * email, an E.164 phone, or the surname token) are scored; everything else
 * is skipped. Low-confidence and zero-score hits are kept in the output —
 * callers filter on `confidence` or `score` as their use case requires.
 *
 * A pool entry whose `id` equals `input.id` is never compared (this happens
 * when an existing client is re-scored against a pool that contains it).
 *
 * @param input - The party being checked for duplicates.
 * @param pool - Existing parties to compare against. Not mutated.
 * @param thresholds - Score cut-offs used to assign each result's tier.
 * @returns Scored matches ordered by descending score.
 */
export function findClientMatches(
  input: MatchCandidate,
  pool: MatchCandidate[],
  thresholds: DedupThresholds,
): MatchResult[] {
  if (pool.length === 0) return [];

  // Step 1 — index the pool by each blocking key. The input's own record is
  // left out of every index so it can never be matched against itself.
  const emailIndex = new Map<string, MatchCandidate[]>();
  const phoneIndex = new Map<string, MatchCandidate[]>();
  const surnameIndex = new Map<string, MatchCandidate[]>();

  pool.forEach((entry) => {
    if (entry.id === input.id) return;
    entry.emails.forEach((email) => pushTo(emailIndex, email, entry));
    entry.phonesE164.forEach((phone) => pushTo(phoneIndex, phone, entry));
    if (entry.surnameToken) pushTo(surnameIndex, entry.surnameToken, entry);
  });

  // Step 2 — union the buckets the input's keys point at. Keying by id means
  // an entry reached through several keys is still scored only once.
  const reachable = new Map<string, MatchCandidate>();
  const admit = (bucket: MatchCandidate[] | undefined): void => {
    bucket?.forEach((entry) => reachable.set(entry.id, entry));
  };
  input.emails.forEach((email) => admit(emailIndex.get(email)));
  input.phonesE164.forEach((phone) => admit(phoneIndex.get(phone)));
  if (input.surnameToken) admit(surnameIndex.get(input.surnameToken));

  // Step 3 — score every entry that made it through blocking.
  const ranked = Array.from(reachable.values(), (entry) => scorePair(input, entry));

  // Step 4 — best score first, then stamp the confidence tier on each result.
  ranked.sort((left, right) => right.score - left.score);
  ranked.forEach((match) => {
    match.confidence = classify(match.score, thresholds);
  });
  return ranked;
}

// ─── Scoring ────────────────────────────────────────────────────────────────

/** Eight or more consecutive zeros — the signature of a placeholder number
 *  such as +447000000000. Such numbers are shared by unrelated records and
 *  must not count as a phone match. */
const PLACEHOLDER_ZERO_RUN = /0{8,}/;

/**
 * Score one (input, candidate) pair against the rule set in design §4.2.
 *
 * Positive rules add, negative rules subtract, and the total is clamped to
 * [0, 100]:
 *   - shared email                                   +60
 *   - shared phone (>= 8 digits, not a placeholder)   +50
 *   - exact full-name match (case-insensitive)        +20
 *   - same surname token + near-identical given name  +15
 *   - shared email but disjoint phone countries       -15
 *   - exact name match with no shared contact         -20
 *
 * Reasons accumulate in the order rules fire so the review-queue UI can
 * show "matched on email + phone".
 *
 * @param a - The input party.
 * @param b - The pool candidate; returned as `candidate` in the result.
 * @returns The clamped score and reasons. `confidence` is a `'low'`
 *   placeholder that the caller overwrites once thresholds are applied.
 */
function scorePair(a: MatchCandidate, b: MatchCandidate): MatchResult {
  let score = 0;
  const reasons: string[] = [];

  // ── Positive rules. ──────────────────────────────────────────────────────

  const sharedEmail = a.emails.find((e) => b.emails.includes(e));
  const emailMatch = !!sharedEmail;
  if (emailMatch) {
    score += 60;
    reasons.push('email match');
  }

  // A shared number only counts when it is long enough to be a real
  // subscriber number and is not a zero-padded placeholder.
  const sharedPhone = a.phonesE164.find(
    (p) => b.phonesE164.includes(p) && countDigits(p) >= 8 && !PLACEHOLDER_ZERO_RUN.test(p),
  );
  const phoneMatch = !!sharedPhone;
  if (phoneMatch) {
    score += 50;
    reasons.push('phone match');
  }

  const aNameNorm = (a.fullName ?? '').toLowerCase().trim();
  const bNameNorm = (b.fullName ?? '').toLowerCase().trim();
  const nameExactMatch = aNameNorm.length > 0 && aNameNorm === bNameNorm;
  if (nameExactMatch) {
    score += 20;
    reasons.push('name match');
  }

  // Surname + given-name fuzzy. Only fires when names are NOT exactly
  // equal — avoids double-counting with the rule above. Requires identical
  // surname tokens and a given name within one edit, so it catches
  // single-character typos ('Jon' / 'John', 'Christoper' / 'Christopher');
  // it does not catch surname typos or longer shortenings.
  if (!nameExactMatch && a.surnameToken && b.surnameToken && a.surnameToken === b.surnameToken) {
    // Take the first token of the TRIMMED name: splitting an untrimmed name
    // with leading whitespace yields an empty first token and would
    // silently disable this rule.
    const aGiven = aNameNorm.split(/\s+/)[0] ?? '';
    const bGiven = bNameNorm.split(/\s+/)[0] ?? '';
    if (aGiven && bGiven && levenshtein(aGiven, bGiven) <= 1) {
      score += 15;
      reasons.push('surname + given-name fuzzy match');
    }
  }

  // ── Negative rules. ──────────────────────────────────────────────────────

  // Same email but the two parties' phone numbers belong to different
  // countries. Common when one inbox is shared by spouses / coworkers
  // and the actual phone owners are distinct people. Don't auto-merge.
  if (emailMatch && !phoneMatch && a.phonesE164.length > 0 && b.phonesE164.length > 0) {
    const aCountries = phoneCountriesOf(a);
    const bCountries = phoneCountriesOf(b);
    const overlap = [...aCountries].some((c) => bCountries.has(c));
    // Only penalize when both sides resolved to at least one country;
    // an unparseable number is not evidence of a mismatch.
    if (!overlap && aCountries.size > 0 && bCountries.size > 0) {
      score -= 15;
      reasons.push('phone country mismatch (negative)');
    }
  }

  // Same name but no contact match. Two distinct people with the same
  // name (common for "John Smith") sneak through name-based blocking;
  // penalize so the score lands below the auto-merge threshold.
  if (nameExactMatch && !emailMatch && !phoneMatch) {
    score -= 20;
    reasons.push('name match but no shared contact (negative)');
  }

  return {
    candidate: b,
    score: clamp(score, 0, 100),
    reasons,
    confidence: 'low', // assigned by caller after threshold lookup
  };
}

// ─── Helpers ────────────────────────────────────────────────────────────────

/**
 * Append `value` to the list stored under `key`, creating the list on first
 * use. Mutates `map` in place.
 */
function pushTo<K, V>(map: Map<K, V[]>, key: K, value: V): void {
  const bucket = map.get(key);
  if (bucket === undefined) {
    map.set(key, [value]);
    return;
  }
  bucket.push(value);
}

/**
 * Map a score onto a confidence tier. Both bounds are inclusive and the
 * high bound is tested first.
 */
function classify(score: number, thresholds: DedupThresholds): MatchConfidence {
  const { highScore, mediumScore } = thresholds;
  return score >= highScore ? 'high' : score >= mediumScore ? 'medium' : 'low';
}

/**
 * Constrain `value` to the inclusive range [`min`, `max`]. The lower bound
 * is checked first.
 */
function clamp(value: number, min: number, max: number): number {
  return value < min ? min : value > max ? max : value;
}

/**
 * Count the ASCII digits (`0`–`9`) in `s`. Every other character, including
 * `+`, separators and non-ASCII digits, is ignored.
 */
function countDigits(s: string): number {
  const asciiDigits = s.match(/[0-9]/g);
  return asciiDigits === null ? 0 : asciiDigits.length;
}

/**
 * Collect the distinct countries that a candidate's phone numbers resolve
 * to. Each number is run through `parsePhone`; numbers for which it reports
 * no country contribute nothing. Every call re-parses all of the
 * candidate's numbers — nothing is memoized between calls.
 */
function phoneCountriesOf(c: MatchCandidate): Set<string> {
  const countries = new Set<string>();
  c.phonesE164.forEach((phone) => {
    const { country } = parsePhone(phone);
    if (country) countries.add(country);
  });
  return countries;
}
-												feat(dedup): normalization + match-finding library (P1)

The pure-logic spine of the client deduplication system spec'd in
docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md.
Two modules, JSX-free, vitest-tested against fixtures drawn directly
from real dirty values observed in the legacy NocoDB Interests audit.

src/lib/dedup/normalize.ts
- normalizeName: trims whitespace, replaces \r/\n/\t, intelligently
  title-cases ALL-CAPS surnames while keeping particles (van / de /
  dalla / etc.) lowercase mid-name. Preserves Irish O' surnames and
  the "slash-with-company" structure ("Daniel Wainstein / 7 Knots,
  LLC") seen in production. Returns a surnameToken (lowercased last
  non-particle token) for use as a dedup blocking key.
- normalizeEmail: trim + lowercase + zod email validation. Plus-aliases
  preserved; null on invalid.
- normalizePhone: pre-cleans the input (strips spreadsheet apostrophes,
  carriage returns, dots/dashes/parens, converts 00 prefix to +) then
  delegates to libphonenumber-js. Detects multi-number fields ("a/b",
  "a;b") and placeholder fakes (8+ consecutive zeros, e.g.
  +447000000000). Flags every quirk so the migration report and runtime
  audit log can surface it.
- resolveCountry: maps free-text country/region input to ISO-3166-1
  alpha-2 via alias → exact (vs. Intl-derived names) → city → fuzzy
  (Levenshtein ≤ 2). Fuzzy is gated by length so 4-char inputs ("Mars")
  don't false-positive against short country names.
- levenshtein: standard iterative implementation, exported for reuse
  by find-matches.

src/lib/dedup/find-matches.ts
- findClientMatches: builds three blocking indexes off the pool (email
  / phone / surname-token), gathers the comparison set via union, and
  scores each candidate via the rule set in design §4.2:
    Email match            +60
    Phone E.164 match      +50  (≥ 8 digits, excludes placeholder zeros)
    Name exact match       +20
    Surname + given fuzzy  +15  (Levenshtein ≤ 1)
    Negative: shared email but different phone country  −15
    Negative: name match but no shared contact          −20
  Score is clamped to [0,100]. Confidence tier ('high' / 'medium' /
  'low') is derived from configurable thresholds passed in by the
  caller — defaults are highScore=90, mediumScore=50.

tests/unit/dedup/normalize.test.ts (38 cases)
Every dirty-data pattern from design §1.3 has a fixture: carriage
returns in names, ALL-CAPS surnames, lowercase entries, particles,
slash-with-company, plus-aliases, capitalized email localparts,
spreadsheet-apostrophe phones, multi-number phones, placeholder
phones, 00-prefix phones, French/UK local-format phones,
Saint-Barthélemy diacritic variants, Kansas City fallback.

tests/unit/dedup/find-matches.test.ts (12 cases)
Each duplicate cluster from design §1.2 has a test:
- Pattern A (Deepak Ramchandani — pure double-submit) → high
- Pattern B (Howard Wiarda — phone format variance) → high
- Pattern C (Nicolas Ruiz — name capitalization) → high
- Pattern D (Chris/Christopher Allen — name shortening) → high
- Pattern E (Christopher Camazou — typo on resubmit) → high or medium
- Pattern E (Constanzo/Costanzo — surname typo, multi-yacht) → high
- Pattern F (Etiennette Clamouze — same name, different country) →
  must NOT auto-merge
- Pattern F (Bruno+Bruce — shared household contact) → no match
- Negative evidence (same email, different phone country) → medium
- Blocking (no shared keys → 0 matches)
- Sort order (high before low)
- Empty pool

Total: 50 new tests, all green. Zero changes to runtime behavior or
schema; unblocks P2 (runtime surfaces) and P3 (NocoDB migration).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-03 14:28:59 +02:00
+								/**
 								 * Client-match finder — pure scoring logic.
 								 *
 								 * Compares one input candidate against a pool of existing candidates and
 								 * returns scored matches. Used by:
 								 *   - the at-create suggestion in client/interest forms (Layer 1)
 								 *   - the public-form auto-link path (when score >= block threshold)
 								 *   - the nightly background scoring job (Layer 3)
 								 *   - the migration script's dedup pass
 								 *
 								 * Performance shape: blocking via email / phone / surname-token reduces
 								 * the pairwise scan from O(n²) to ~O(n) for any pool size we'll see in
 								 * production. See `findClientMatches` for the blocking implementation.
 								 *
 								 * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §4.
 								 */
 								import { parsePhone } from '@/lib/i18n/phone';
 								import { levenshtein } from './normalize';
 								// ─── Types ──────────────────────────────────────────────────────────────────
 								export interface MatchCandidate {
 								  id: string;
 								  fullName: string | null;
 								  /** Lowercased last non-particle token from `normalizeName(...).surnameToken`.
 								   *  Used as a blocking key. */
 								  surnameToken: string | null;
 								  /** Already lowercased + validated via `normalizeEmail`. */
 								  emails: string[];
 								  /** Already canonical E.164 via `normalizePhone`. */
 								  phonesE164: string[];
 								  /** Address country (NOT phone country) — used for tiebreaking, not scoring. */
 								  countryIso: string | null;
 								}
 								export type MatchConfidence = 'high' | 'medium' | 'low';
 								export interface MatchResult {
 								  candidate: MatchCandidate;
 								  /** 0–100 after capping. */
 								  score: number;
 								  /** Human-readable list of which rules contributed. Useful for the
 								   *  review queue UI ("matched on email + phone + surname token"). */
 								  reasons: string[];
 								  confidence: MatchConfidence;
 								}
 								export interface DedupThresholds {
 								  /** Inclusive lower bound for `'high'` confidence. */
 								  highScore: number;
 								  /** Inclusive lower bound for `'medium'` confidence. Below this is `'low'`. */
 								  mediumScore: number;
 								}
 								// ─── Public entry point ─────────────────────────────────────────────────────
 								/**
 								 * Compare `input` against every reachable candidate in `pool` and return
 								 * scored matches, sorted by score descending. The result list includes
 								 * low-confidence hits — caller filters by `confidence` or `score`
 								 * depending on use case.
 								 *
 								 * Self-matches (an entry with `id === input.id`, e.g. when re-scoring an
 								 * existing client during a background job) are excluded.
 								 */
 								export function findClientMatches(
 								  input: MatchCandidate,
 								  pool: MatchCandidate[],
 								  thresholds: DedupThresholds,
 								): MatchResult[] {
 								  if (pool.length === 0) return [];
 								  // ── Phase 1: build blocking indexes off the pool. ─────────────────────────
 								  //
 								  // Three indexes mean any candidate that shares ANY of (email / phone /
 								  // surname-token) with the input shows up in the comparison set. Anything
 								  // that shares NONE is structurally too different to be a duplicate and
 								  // is skipped — this is what keeps the algorithm O(n) at scale.
 								  const byEmail = new Map<string, MatchCandidate[]>();
 								  const byPhone = new Map<string, MatchCandidate[]>();
 								  const bySurnameToken = new Map<string, MatchCandidate[]>();
 								  for (const c of pool) {
 								    if (c.id === input.id) continue;
 								    for (const email of c.emails) {
 								      pushTo(byEmail, email, c);
 								    }
 								    for (const phone of c.phonesE164) {
 								      pushTo(byPhone, phone, c);
 								    }
 								    if (c.surnameToken) {
 								      pushTo(bySurnameToken, c.surnameToken, c);
 								    }
 								  }
 								  // ── Phase 2: gather the comparison set via the blocking indexes. ─────────
 								  const comparisonSet = new Map<string, MatchCandidate>();
 								  for (const email of input.emails) {
 								    for (const c of byEmail.get(email) ?? []) {
 								      comparisonSet.set(c.id, c);
 								    }
 								  }
 								  for (const phone of input.phonesE164) {
 								    for (const c of byPhone.get(phone) ?? []) {
 								      comparisonSet.set(c.id, c);
 								    }
 								  }
 								  if (input.surnameToken) {
 								    for (const c of bySurnameToken.get(input.surnameToken) ?? []) {
 								      comparisonSet.set(c.id, c);
 								    }
 								  }
 								  // ── Phase 3: score every candidate that survived blocking. ───────────────
 								  const results: MatchResult[] = [];
 								  for (const candidate of comparisonSet.values()) {
 								    const r = scorePair(input, candidate);
 								    results.push(r);
 								  }
 								  // ── Phase 4: sort by score desc + assign confidence tier. ────────────────
 								  results.sort((a, b) => b.score - a.score);
 								  for (const r of results) {
 								    r.confidence = classify(r.score, thresholds);
 								  }
 								  return results;
 								}
 								// ─── Scoring ────────────────────────────────────────────────────────────────
 								/**
 								 * Score one (input, candidate) pair against the rule set in design §4.2.
 								 * Compounding: positive rules sum, negative rules subtract; the result is
 								 * clamped to [0, 100]. Reasons accumulate in the order rules fire so the
 								 * review-queue UI can show "matched on email + phone".
 								 */
 								function scorePair(a: MatchCandidate, b: MatchCandidate): MatchResult {
 								  let score = 0;
 								  const reasons: string[] = [];
 								  // ── Positive rules. ──────────────────────────────────────────────────────
 								  const sharedEmail = a.emails.find((e) => b.emails.includes(e));
 								  const emailMatch = !!sharedEmail;
 								  if (emailMatch) {
 								    score += 60;
 								    reasons.push('email match');
 								  }
 								  const sharedPhone = a.phonesE164.find((p) => b.phonesE164.includes(p) && countDigits(p) >= 8);
 								  const phoneMatch = !!sharedPhone;
 								  if (phoneMatch) {
 								    score += 50;
 								    reasons.push('phone match');
 								  }
 								  const aNameNorm = (a.fullName ?? '').toLowerCase().trim();
 								  const bNameNorm = (b.fullName ?? '').toLowerCase().trim();
 								  const nameExactMatch = aNameNorm.length > 0 && aNameNorm === bNameNorm;
 								  if (nameExactMatch) {
 								    score += 20;
 								    reasons.push('name match');
 								  }
 								  // Surname + given-name fuzzy. Only fires when names are NOT exactly
 								  // equal — avoids double-counting with the rule above. Catches
 								  // 'Constanzo' / 'Costanzo', 'Marc' / 'Marcus' etc. when other contact
 								  // signals confirm them.
 								  if (!nameExactMatch && a.surnameToken && b.surnameToken && a.surnameToken === b.surnameToken) {
 								    const aGiven = (a.fullName ?? '').toLowerCase().split(/\s+/)[0] ?? '';
 								    const bGiven = (b.fullName ?? '').toLowerCase().split(/\s+/)[0] ?? '';
 								    if (aGiven && bGiven && levenshtein(aGiven, bGiven) <= 1) {
 								      score += 15;
 								      reasons.push('surname + given-name fuzzy match');
 								    }
 								  }
 								  // ── Negative rules. ──────────────────────────────────────────────────────
 								  // Same email but the two parties' phone numbers belong to different
 								  // countries. Common when one inbox is shared by spouses / coworkers
 								  // and the actual phone owners are distinct people. Don't auto-merge.
 								  if (emailMatch && !phoneMatch && a.phonesE164.length > 0 && b.phonesE164.length > 0) {
 								    const aCountries = phoneCountriesOf(a);
 								    const bCountries = phoneCountriesOf(b);
 								    const overlap = [...aCountries].some((c) => bCountries.has(c));
 								    if (!overlap && aCountries.size > 0 && bCountries.size > 0) {
 								      score -= 15;
 								      reasons.push('phone country mismatch (negative)');
 								    }
 								  }
 								  // Same name but no contact match. Two distinct people with the same
 								  // name (common for "John Smith") sneak through name-based blocking;
 								  // penalize so the score lands below the auto-merge threshold.
 								  if (nameExactMatch && !emailMatch && !phoneMatch) {
 								    score -= 20;
 								    reasons.push('name match but no shared contact (negative)');
 								  }
 								  return {
 								    candidate: b,
 								    score: clamp(score, 0, 100),
 								    reasons,
 								    confidence: 'low', // assigned by caller after threshold lookup
 								  };
 								}
 								// ─── Helpers ────────────────────────────────────────────────────────────────
 								function pushTo<K, V>(map: Map<K, V[]>, key: K, value: V): void {
 								  const existing = map.get(key);
 								  if (existing) {
 								    existing.push(value);
 								  } else {
 								    map.set(key, [value]);
 								  }
 								}
 								function classify(score: number, thresholds: DedupThresholds): MatchConfidence {
 								  if (score >= thresholds.highScore) return 'high';
 								  if (score >= thresholds.mediumScore) return 'medium';
 								  return 'low';
 								}
 								function clamp(value: number, min: number, max: number): number {
 								  if (value < min) return min;
 								  if (value > max) return max;
 								  return value;
 								}
 								function countDigits(s: string): number {
 								  let count = 0;
 								  for (let i = 0; i < s.length; i += 1) {
 								    const code = s.charCodeAt(i);
 								    if (code >= 48 && code <= 57) count += 1;
 								  }
 								  return count;
 								}
 								/**
 								 * Resolve each phone in a candidate to its ISO country code (via
 								 * libphonenumber-js). Cached per call; the surrounding caller doesn't
 								 * batch so we accept the parse cost.
 								 */
 								function phoneCountriesOf(c: MatchCandidate): Set<string> {
 								  const out = new Set<string>();
 								  for (const p of c.phonesE164) {
 								    const parsed = parsePhone(p);
 								    if (parsed.country) out.add(parsed.country);
 								  }
 								  return out;
 								}