/**
 * Client-match finder — pure scoring logic.
 *
 * Compares one input candidate against a pool of existing candidates and
 * returns scored matches. Used by:
 *   - the at-create suggestion in client/interest forms (Layer 1)
 *   - the public-form auto-link path (when score >= block threshold)
 *   - the nightly background scoring job (Layer 3)
 *   - the migration script's dedup pass
 *
 * Performance shape: blocking via email / phone / surname-token reduces
 * the pairwise scan from O(n²) to ~O(n) for any pool size we'll see in
 * production. See `findClientMatches` for the blocking implementation.
 *
 * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §4.
 */

import { parsePhoneScriptSafe as parsePhone } from './phone-parse';
import { levenshtein } from './normalize';

// ─── Types ──────────────────────────────────────────────────────────────────

export interface MatchCandidate {
  id: string;
  fullName: string | null;
  /** Lowercased last non-particle token from `normalizeName(...).surnameToken`.
   *  Used as a blocking key. */
  surnameToken: string | null;
  /** Already lowercased + validated via `normalizeEmail`. */
  emails: string[];
  /** Already canonical E.164 via `normalizePhone`. */
  phonesE164: string[];
  /** Address country (NOT phone country) — used for tiebreaking, not scoring.
   *  NOTE(review): nothing in this module reads this field; any tiebreaking
   *  on it has to happen in the caller. */
  countryIso: string | null;
}

export type MatchConfidence = 'high' | 'medium' | 'low';

export interface MatchResult {
  candidate: MatchCandidate;
  /** 0–100 after capping. */
  score: number;
  /** Human-readable list of which rules contributed. Useful for the
   *  review queue UI ("matched on email + phone + surname token"). */
  reasons: string[];
  confidence: MatchConfidence;
}

export interface DedupThresholds {
  /** Inclusive lower bound for `'high'` confidence. */
  highScore: number;
  /** Inclusive lower bound for `'medium'` confidence. Below this is `'low'`. */
  mediumScore: number;
}

// ─── Scoring constants ──────────────────────────────────────────────────────

/** Points added (positive) or removed (negative) by each scoring rule. */
const RULE_POINTS = {
  emailMatch: 60,
  phoneMatch: 50,
  nameExactMatch: 20,
  nameFuzzyMatch: 15,
  phoneCountryMismatch: -15,
  nameWithoutContact: -20,
} as const;

/** A shared phone only counts as a match when it carries at least this many digits. */
const MIN_PHONE_DIGITS = 8;

/** Largest `levenshtein` result between given names that still counts as fuzzy-equal. */
const MAX_GIVEN_NAME_DISTANCE = 1;

const MIN_SCORE = 0;
const MAX_SCORE = 100;

// ─── Public entry point ─────────────────────────────────────────────────────

/**
 * Compare `input` against every reachable candidate in `pool` and return
 * scored matches, sorted by score descending. The result list includes
 * low-confidence hits — caller filters by `confidence` or `score`
 * depending on use case.
 *
 * Self-matches (an entry with `id === input.id`, e.g. when re-scoring an
 * existing client during a background job) are excluded.
 *
 * Candidates with equal scores keep the order in which blocking discovered
 * them (email hits first, then phone hits, then surname-token hits); this
 * relies on `Array.prototype.sort` being stable.
 *
 * @param input      The candidate being checked for duplicates.
 * @param pool       Existing candidates to compare against. Not mutated.
 * @param thresholds Score cut-offs used to derive each result's `confidence`.
 * @returns One `MatchResult` per pool entry that shares at least one email,
 *          phone, or surname token with `input`; empty when nothing is shared.
 */
export function findClientMatches(
  input: MatchCandidate,
  pool: MatchCandidate[],
  thresholds: DedupThresholds,
): MatchResult[] {
  if (pool.length === 0) return [];

  // ── Phase 1: build blocking indexes off the pool. ─────────────────────────
  //
  // Three indexes mean any candidate that shares ANY of (email / phone /
  // surname-token) with the input shows up in the comparison set. Anything
  // that shares NONE is structurally too different to be a duplicate and
  // is skipped — this is what keeps the algorithm O(n) at scale.
  const byEmail = new Map<string, MatchCandidate[]>();
  const byPhone = new Map<string, MatchCandidate[]>();
  const bySurnameToken = new Map<string, MatchCandidate[]>();

  for (const member of pool) {
    if (member.id === input.id) continue;
    for (const email of member.emails) {
      pushTo(byEmail, email, member);
    }
    for (const phone of member.phonesE164) {
      pushTo(byPhone, phone, member);
    }
    if (member.surnameToken) {
      pushTo(bySurnameToken, member.surnameToken, member);
    }
  }

  // ── Phase 2: gather the comparison set via the blocking indexes. ─────────
  //
  // Keyed by id so a candidate reachable through several keys is scored once.
  const comparisonSet = new Map<string, MatchCandidate>();
  const collect = (hits: MatchCandidate[] | undefined): void => {
    for (const hit of hits ?? []) {
      comparisonSet.set(hit.id, hit);
    }
  };

  for (const email of input.emails) {
    collect(byEmail.get(email));
  }
  for (const phone of input.phonesE164) {
    collect(byPhone.get(phone));
  }
  if (input.surnameToken) {
    collect(bySurnameToken.get(input.surnameToken));
  }

  // ── Phase 3: score every candidate that survived blocking. ───────────────
  const results: MatchResult[] = [];
  for (const candidate of comparisonSet.values()) {
    const scored = scorePair(input, candidate);
    results.push({ ...scored, confidence: classify(scored.score, thresholds) });
  }

  // ── Phase 4: sort by score desc. ─────────────────────────────────────────
  results.sort((left, right) => right.score - left.score);
  return results;
}

// ─── Scoring ────────────────────────────────────────────────────────────────

/**
 * Score one (input, candidate) pair against the rule set in design §4.2.
 * Compounding: positive rules sum, negative rules subtract; the result is
 * clamped to [0, 100]. Reasons accumulate in the order rules fire so the
 * review-queue UI can show "matched on email + phone".
 *
 * The confidence tier is intentionally not decided here; the caller derives
 * it from the returned score and its thresholds.
 *
 * @param a The input candidate.
 * @param b The pool candidate; returned as `candidate` in the result.
 */
function scorePair(a: MatchCandidate, b: MatchCandidate): Omit<MatchResult, 'confidence'> {
  let score = 0;
  const reasons: string[] = [];

  // ── Positive rules. ──────────────────────────────────────────────────────

  // Empty strings never count as a shared email. Testing each entry
  // explicitly (rather than the truthiness of the first shared entry) means
  // a stray '' cannot mask a genuine shared address that follows it.
  const emailMatch = a.emails.some((email) => email !== '' && b.emails.includes(email));
  if (emailMatch) {
    score += RULE_POINTS.emailMatch;
    reasons.push('email match');
  }

  // Very short numbers are ignored as a matching signal.
  const phoneMatch = a.phonesE164.some(
    (phone) => b.phonesE164.includes(phone) && countDigits(phone) >= MIN_PHONE_DIGITS,
  );
  if (phoneMatch) {
    score += RULE_POINTS.phoneMatch;
    reasons.push('phone match');
  }

  const aNameNorm = (a.fullName ?? '').toLowerCase().trim();
  const bNameNorm = (b.fullName ?? '').toLowerCase().trim();
  const nameExactMatch = aNameNorm.length > 0 && aNameNorm === bNameNorm;
  if (nameExactMatch) {
    score += RULE_POINTS.nameExactMatch;
    reasons.push('name match');
  }

  // Identical surname token + fuzzy given name. Only fires when names are
  // NOT exactly equal — avoids double-counting with the rule above. The
  // surname tokens must be identical; only the first name token is compared
  // fuzzily, so this catches e.g. 'Jon Smith' / 'John Smith' but not
  // differently spelled surnames.
  if (!nameExactMatch && a.surnameToken && b.surnameToken && a.surnameToken === b.surnameToken) {
    // Split the trimmed names so leading whitespace in `fullName` cannot
    // produce an empty first token and silently disable the rule.
    const aGiven = aNameNorm.split(/\s+/)[0] ?? '';
    const bGiven = bNameNorm.split(/\s+/)[0] ?? '';
    if (aGiven && bGiven && levenshtein(aGiven, bGiven) <= MAX_GIVEN_NAME_DISTANCE) {
      score += RULE_POINTS.nameFuzzyMatch;
      reasons.push('surname + given-name fuzzy match');
    }
  }

  // ── Negative rules. ──────────────────────────────────────────────────────

  // Same email but the two parties' phone numbers belong to different
  // countries. Common when one inbox is shared by spouses / coworkers
  // and the actual phone owners are distinct people. Don't auto-merge.
  if (emailMatch && !phoneMatch && a.phonesE164.length > 0 && b.phonesE164.length > 0) {
    const aCountries = phoneCountriesOf(a);
    const bCountries = phoneCountriesOf(b);
    // Only penalize when both sides resolved to at least one country;
    // unparseable numbers are not evidence of a mismatch.
    if (aCountries.size > 0 && bCountries.size > 0) {
      const overlap = [...aCountries].some((country) => bCountries.has(country));
      if (!overlap) {
        score += RULE_POINTS.phoneCountryMismatch;
        reasons.push('phone country mismatch (negative)');
      }
    }
  }

  // Same name but no contact match. Two distinct people with the same
  // name (common for "John Smith") sneak through name-based blocking;
  // penalize so the score lands below the auto-merge threshold.
  if (nameExactMatch && !emailMatch && !phoneMatch) {
    score += RULE_POINTS.nameWithoutContact;
    reasons.push('name match but no shared contact (negative)');
  }

  return {
    candidate: b,
    score: clamp(score, MIN_SCORE, MAX_SCORE),
    reasons,
  };
}

// ─── Helpers ────────────────────────────────────────────────────────────────

/** Append `value` to the list stored under `key`, creating the list on first use. */
function pushTo<K, V>(map: Map<K, V[]>, key: K, value: V): void {
  const bucket = map.get(key);
  if (bucket) {
    bucket.push(value);
  } else {
    map.set(key, [value]);
  }
}

/** Map a score to its confidence tier; both thresholds are inclusive lower bounds. */
function classify(score: number, thresholds: DedupThresholds): MatchConfidence {
  if (score >= thresholds.highScore) return 'high';
  if (score >= thresholds.mediumScore) return 'medium';
  return 'low';
}

/** Restrict `value` to the inclusive range [`min`, `max`]. */
function clamp(value: number, min: number, max: number): number {
  if (value < min) return min;
  if (value > max) return max;
  return value;
}

/** Count the ASCII digits ('0'–'9') in `s`; '+' and any separators are ignored. */
function countDigits(s: string): number {
  let count = 0;
  for (let i = 0; i < s.length; i += 1) {
    const code = s.charCodeAt(i);
    if (code >= 48 && code <= 57) count += 1;
  }
  return count;
}

/**
 * Collect the distinct countries that `parsePhone` reports for a candidate's
 * phone numbers. Numbers for which no country is reported are skipped.
 *
 * Nothing is cached: every call re-parses each number. The surrounding
 * caller doesn't batch, so we accept the parse cost.
 */
function phoneCountriesOf(c: MatchCandidate): Set<string> {
  const countries = new Set<string>();
  for (const phone of c.phonesE164) {
    const parsed = parsePhone(phone);
    if (parsed.country) countries.add(parsed.country);
  }
  return countries;
}