// File: src/lib/dedup/find-matches.ts
/**
 * Client-match finder — pure scoring logic.
 *
 * Compares one input candidate against a pool of existing candidates and
 * returns scored matches. Used by:
 *   - the at-create suggestion in client/interest forms (Layer 1)
 *   - the public-form auto-link path (when score >= block threshold)
 *   - the nightly background scoring job (Layer 3)
 *   - the migration script's dedup pass
 *
 * Performance shape: blocking via email / phone / surname-token reduces
 * the pairwise scan from O(n²) to ~O(n) for any pool size we'll see in
 * production. See `findClientMatches` for the blocking implementation.
 *
 * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §4.
 */

import { parsePhone } from '@/lib/i18n/phone';

import { levenshtein } from './normalize';

// ─── Types ──────────────────────────────────────────────────────────────────

export interface MatchCandidate {
  id: string;
  fullName: string | null;
  /** Lowercased last non-particle token from `normalizeName(...).surnameToken`.
   *  Used as a blocking key. */
  surnameToken: string | null;
  /** Already lowercased + validated via `normalizeEmail`. */
  emails: string[];
  /** Already canonical E.164 via `normalizePhone`. */
  phonesE164: string[];
  /** Address country (NOT phone country). Not read by any scoring rule in
   *  this module. NOTE(review): originally documented as a tiebreaker, but
   *  no tiebreak is implemented here — confirm which consumer uses it. */
  countryIso: string | null;
}

export type MatchConfidence = 'high' | 'medium' | 'low';

export interface MatchResult {
  candidate: MatchCandidate;
  /** 0–100 after capping. */
  score: number;
  /** Human-readable list of which rules contributed. Useful for the
   *  review queue UI ("matched on email + phone + surname token"). */
  reasons: string[];
  confidence: MatchConfidence;
}

export interface DedupThresholds {
  /** Inclusive lower bound for `'high'` confidence. */
  highScore: number;
  /** Inclusive lower bound for `'medium'` confidence. Below this is `'low'`. */
  mediumScore: number;
}

// ─── Public entry point ─────────────────────────────────────────────────────

/**
 * Compare `input` against every reachable candidate in `pool` and return
 * scored matches, sorted by score descending. The result list includes
 * low-confidence hits — caller filters by `confidence` or `score`
 * depending on use case.
 *
 * A pool entry is "reachable" only when it shares at least one email, one
 * phone, or the surname token with `input`; everything else is never
 * scored. An input with no emails, no phones and no surname token
 * therefore yields an empty result.
 *
 * Self-matches (an entry with `id === input.id`, e.g. when re-scoring an
 * existing client during a background job) are excluded.
 *
 * @param input      The candidate being checked for duplicates.
 * @param pool       Existing candidates to compare against. Not mutated.
 * @param thresholds Score cut-offs used to assign each result's `confidence`.
 * @returns One `MatchResult` per reachable pool entry, highest score first.
 */
export function findClientMatches(
  input: MatchCandidate,
  pool: MatchCandidate[],
  thresholds: DedupThresholds,
): MatchResult[] {
  if (pool.length === 0) return [];

  // ── Phase 1: build blocking indexes off the pool. ─────────────────────────
  //
  // Three indexes mean any candidate that shares ANY of (email / phone /
  // surname-token) with the input shows up in the comparison set. Anything
  // that shares NONE is structurally too different to be a duplicate and
  // is skipped. The maps are explicitly typed so lookups yield
  // `MatchCandidate[]` instead of leaking `any`.
  const byEmail = new Map<string, MatchCandidate[]>();
  const byPhone = new Map<string, MatchCandidate[]>();
  const bySurnameToken = new Map<string, MatchCandidate[]>();

  for (const c of pool) {
    if (c.id === input.id) continue;
    for (const email of c.emails) pushTo(byEmail, email, c);
    for (const phone of c.phonesE164) pushTo(byPhone, phone, c);
    if (c.surnameToken) pushTo(bySurnameToken, c.surnameToken, c);
  }

  // ── Phase 2: gather the comparison set via the blocking indexes. ─────────
  // Keyed by id so a candidate reached through several keys is scored once.
  const comparisonSet = new Map<string, MatchCandidate>();
  const admit = (bucket: MatchCandidate[] | undefined): void => {
    for (const c of bucket ?? []) comparisonSet.set(c.id, c);
  };
  for (const email of input.emails) admit(byEmail.get(email));
  for (const phone of input.phonesE164) admit(byPhone.get(phone));
  if (input.surnameToken) admit(bySurnameToken.get(input.surnameToken));

  // ── Phase 3: score every survivor and assign its confidence tier. ────────
  const results: MatchResult[] = [];
  for (const candidate of comparisonSet.values()) {
    const scored = scorePair(input, candidate);
    results.push({ ...scored, confidence: classify(scored.score, thresholds) });
  }

  // ── Phase 4: highest score first. ────────────────────────────────────────
  results.sort((a, b) => b.score - a.score);
  return results;
}
// ─── Scoring ────────────────────────────────────────────────────────────────

/**
 * Score one (input, candidate) pair against the rule set in design §4.2.
 * Compounding: positive rules sum, negative rules subtract; the result is
 * clamped to [0, 100]. Reasons accumulate in the order rules fire so the
 * review-queue UI can show "matched on email + phone".
 *
 * Rule weights as implemented below:
 *   +60  any shared email
 *   +50  any shared phone with at least 8 digits
 *   +20  full names equal after lowercasing + trimming
 *   +15  identical surname token and given names within edit distance 1
 *        (only when the full names are not already equal)
 *   −15  shared email, no shared phone, and the two phone sets resolve to
 *        disjoint countries
 *   −20  equal names but neither email nor phone is shared
 *
 * @param a The input candidate.
 * @param b The pool candidate; returned as `candidate` in the result.
 * @returns The scored pair. `confidence` is a placeholder (`'low'`) that the
 *          caller overwrites once thresholds are applied.
 */
function scorePair(a: MatchCandidate, b: MatchCandidate): MatchResult {
  let score = 0;
  const reasons: string[] = [];

  // ── Positive rules. ──────────────────────────────────────────────────────

  // An empty string is never treated as a shared email.
  const emailMatch = a.emails.some((e) => e.length > 0 && b.emails.includes(e));
  if (emailMatch) {
    score += 60;
    reasons.push('email match');
  }

  // Very short numbers (< 8 digits) are too weak a signal to count.
  const phoneMatch = a.phonesE164.some((p) => countDigits(p) >= 8 && b.phonesE164.includes(p));
  if (phoneMatch) {
    score += 50;
    reasons.push('phone match');
  }

  const aNameNorm = (a.fullName ?? '').toLowerCase().trim();
  const bNameNorm = (b.fullName ?? '').toLowerCase().trim();
  const nameExactMatch = aNameNorm.length > 0 && aNameNorm === bNameNorm;
  if (nameExactMatch) {
    score += 20;
    reasons.push('name match');
  }

  // Given-name fuzzy match under an identical surname token. Only fires
  // when names are NOT exactly equal — avoids double-counting with the
  // rule above. Catches one-character given-name slips ('Jon' / 'John').
  // It does NOT catch surname typos ('Constanzo' / 'Costanzo' have different
  // surname tokens) or distance-2 variants ('Marc' / 'Marcus'); those pairs
  // rely on the email / phone rules instead.
  if (!nameExactMatch && a.surnameToken && b.surnameToken && a.surnameToken === b.surnameToken) {
    // Split the already-trimmed names so leading whitespace in `fullName`
    // cannot produce an empty first token and silently disable the rule.
    const aGiven = aNameNorm.split(/\s+/)[0] ?? '';
    const bGiven = bNameNorm.split(/\s+/)[0] ?? '';
    if (aGiven && bGiven && levenshtein(aGiven, bGiven) <= 1) {
      score += 15;
      reasons.push('surname + given-name fuzzy match');
    }
  }

  // ── Negative rules. ──────────────────────────────────────────────────────

  // Same email but the two parties' phone numbers belong to different
  // countries. Common when one inbox is shared by spouses / coworkers
  // and the actual phone owners are distinct people. Don't auto-merge.
  // Skipped when either side's phones resolve to no country at all.
  if (emailMatch && !phoneMatch && a.phonesE164.length > 0 && b.phonesE164.length > 0) {
    const aCountries = phoneCountriesOf(a);
    const bCountries = phoneCountriesOf(b);
    const overlap = [...aCountries].some((c) => bCountries.has(c));
    if (!overlap && aCountries.size > 0 && bCountries.size > 0) {
      score -= 15;
      reasons.push('phone country mismatch (negative)');
    }
  }

  // Same name but no contact match. Two distinct people with the same
  // name (common for "John Smith") sneak through name-based blocking;
  // penalize so the score lands below the auto-merge threshold. This
  // cancels the +20 from the exact-name rule.
  if (nameExactMatch && !emailMatch && !phoneMatch) {
    score -= 20;
    reasons.push('name match but no shared contact (negative)');
  }

  return {
    candidate: b,
    score: clamp(score, 0, 100),
    reasons,
    confidence: 'low', // assigned by caller after threshold lookup
  };
}
// ─── Helpers ────────────────────────────────────────────────────────────────

/**
 * Append `value` to the list stored under `key`, creating the list on
 * first use. Mutates `map` in place.
 */
function pushTo<K, V>(map: Map<K, V[]>, key: K, value: V): void {
  const bucket = map.get(key);
  if (bucket) {
    bucket.push(value);
  } else {
    map.set(key, [value]);
  }
}

/**
 * Map a clamped score onto a confidence tier. Both thresholds are
 * inclusive lower bounds; `highScore` is checked first.
 */
function classify(score: number, thresholds: DedupThresholds): MatchConfidence {
  if (score >= thresholds.highScore) return 'high';
  if (score >= thresholds.mediumScore) return 'medium';
  return 'low';
}

/** Restrict `value` to the closed interval [`min`, `max`]. */
function clamp(value: number, min: number, max: number): number {
  if (value < min) return min;
  if (value > max) return max;
  return value;
}

/** Count the ASCII digits ('0'–'9') in `s`; every other character is ignored. */
function countDigits(s: string): number {
  let count = 0;
  for (let i = 0; i < s.length; i += 1) {
    const code = s.charCodeAt(i);
    // 48–57 are the char codes of '0'–'9'.
    if (code >= 48 && code <= 57) count += 1;
  }
  return count;
}

/**
 * Resolve each phone in a candidate to its country code via `parsePhone`
 * (per the original note, libphonenumber-js under the hood). Phones whose
 * country cannot be resolved are skipped. Nothing is cached: every call
 * re-parses every phone, which is acceptable because the caller only
 * reaches this on the email-match / no-phone-match path.
 */
function phoneCountriesOf(c: MatchCandidate): Set<string> {
  const out = new Set<string>();
  for (const p of c.phonesE164) {
    const parsed = parsePhone(p);
    if (parsed.country) out.add(parsed.country);
  }
  return out;
}
// File: src/lib/dedup/normalize.ts
/**
 * Normalization helpers for the dedup pipeline.
 *
 * Pure functions (no DB, no React). Used by both the runtime at-create
 * surfaces and the one-shot NocoDB migration script. Every transform
 * here has a fixture in `tests/unit/dedup/normalize.test.ts` drawn from
 * real dirty values observed in the legacy NocoDB Interests table.
 *
 * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §3.
 */

import { z } from 'zod';

import { ALL_COUNTRY_CODES, getCountryName, type CountryCode } from '@/lib/i18n/countries';
import { parsePhone } from '@/lib/i18n/phone';

// ─── Names ──────────────────────────────────────────────────────────────────

/**
 * Tokens that should stay lowercase mid-name. Covers the common Romance,
 * Germanic, and Iberian particles seen in client records. The first token
 * of a name is always title-cased even if it appears in this set.
 */
const PARTICLES: ReadonlySet<string> = new Set([
  'van',
  'von',
  'de',
  'del',
  'da',
  'das',
  'do',
  'dos',
  'di',
  'le',
  'la',
  'el',
  'al',
  'der',
  'den',
  'des',
  'du',
  'dalla',
  'della',
  'st',
  'st.',
  'y',
]);

export interface NormalizedName {
  /** Human-readable form preserved for UI display. Trims, collapses
   *  whitespace, fixes case, but never destroys the user's intent —
   *  slash-with-company structure ("Daniel Wainstein / 7 Knots, LLC")
   *  is left intact. */
  display: string;
  /** Lowercased form for matching. */
  normalized: string;
  /** Last non-particle token, lowercased. Used as a blocking key by the
   *  dedup algorithm so we only compare candidates with similar surnames. */
  surnameToken?: string;
}

/**
 * Normalize a free-text full name. Trims and collapses whitespace
 * (including \r/\n/\t) to single spaces, title-cases every token while
 * keeping particles (van / de / dalla / etc.) lowercase mid-name, and
 * capitalizes the segment after each apostrophe or hyphen so O'Brien and
 * Jean-Pierre come out readable.
 *
 * If the input contains a `/` (slash-with-company structure like
 * "Daniel Wainstein / 7 Knots, LLC"), only the text before the first slash
 * is title-cased; the trailing company text is preserved verbatim — it's
 * signal, not noise. The surname token is computed from the person part
 * only.
 *
 * @param raw Free-text name; `null` / `undefined` are treated as empty.
 * @returns `display`, its lowercased `normalized` form, and the surname
 *          blocking token. Empty input yields empty strings and an
 *          `undefined` token.
 */
export function normalizeName(raw: string | null | undefined): NormalizedName {
  const safe = (raw ?? '').toString();
  // `\s+` already covers \r, \n and \t, so one pass collapses everything.
  const cleaned = safe.replace(/\s+/g, ' ').trim();

  if (!cleaned) {
    return { display: '', normalized: '', surnameToken: undefined };
  }

  // Slash-with-company: title-case the part before the slash, leave the
  // company segment untouched (it's typically already a brand we shouldn't
  // mangle: "SAS TIKI", "7 Knots, LLC").
  const slashIdx = cleaned.indexOf('/');
  const hasCompany = slashIdx !== -1;
  const personPart = hasCompany ? cleaned.slice(0, slashIdx).trim() : cleaned;

  let display: string;
  if (hasCompany) {
    const companyPart = cleaned.slice(slashIdx + 1).trim();
    // trim() drops the dangling space left when either side of the slash
    // is empty ("/ ACME" or "John /").
    display = `${titleCaseTokens(personPart)} / ${companyPart}`.trim();
  } else {
    display = titleCaseTokens(personPart);
  }

  return {
    display,
    normalized: display.toLowerCase(),
    surnameToken: computeSurnameToken(personPart),
  };
}

/** Title-case every space-separated token of `s`; empty tokens are dropped. */
function titleCaseTokens(s: string): string {
  return s
    .split(' ')
    .filter(Boolean)
    .map((tok, idx) => titleCaseOneToken(tok, idx === 0))
    .join(' ');
}

/**
 * Title-case a single token. Particles stay lowercase unless the token is
 * the first in the name. Otherwise the token is lowercased, then its first
 * character and the character after every apostrophe (straight or curly)
 * or hyphen is uppercased: o'brien → O'Brien, jean-pierre → Jean-Pierre.
 */
function titleCaseOneToken(token: string, isFirst: boolean): string {
  if (!token) return '';
  const lower = token.toLowerCase();
  if (!isFirst && PARTICLES.has(lower)) return lower;
  return lower.replace(
    /(^|['’-])([^'’-])/g,
    (_match: string, sep: string, ch: string) => sep + ch.toUpperCase(),
  );
}
/**
 * Derive the surname blocking key from the person part of a name: the
 * right-most token that is not a particle, lowercased. When every token is
 * a particle the last token is returned as-is (lowercased). Blank input
 * yields `undefined`.
 */
function computeSurnameToken(personPart: string): string | undefined {
  const collapsed = personPart
    .replace(/[\r\n\t]/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  if (collapsed.length === 0) return undefined;

  const lowered = collapsed.toLowerCase().split(' ');
  // Scan right-to-left for the first token that is a "real" surname.
  const surname = [...lowered].reverse().find((tok) => !PARTICLES.has(tok));
  return surname ?? lowered[lowered.length - 1];
}

// ─── Emails ─────────────────────────────────────────────────────────────────

const emailSchema = z.string().email();

/**
 * Normalize a free-text email: trim, lowercase, then validate against
 * `emailSchema`. Returns null for null / empty / malformed input — caller
 * decides whether to flag, store, or drop.
 *
 * Plus-aliases (`user+tag@domain.com`) are NOT stripped: they're real
 * distinct addresses, and stripping them would auto-merge legitimately
 * separate accounts.
 */
export function normalizeEmail(raw: string | null | undefined): string | null {
  const candidate = raw == null ? '' : raw.toString().trim().toLowerCase();
  if (candidate === '') return null;
  return emailSchema.safeParse(candidate).success ? candidate : null;
}

// ─── Phones ─────────────────────────────────────────────────────────────────

export type PhoneFlag = 'multi_number' | 'placeholder' | 'unparseable';

export interface NormalizedPhone {
  /** Canonical E.164 form, e.g. '+15742740548'. Null when unparseable
   *  or flagged as placeholder. */
  e164: string | null;
  /** ISO-3166-1 alpha-2 of the country the number was parsed against. */
  country: CountryCode | null;
  /** Display-friendly international format. Useful for migration reports. */
  display: string | null;
  /** Set when the input had a quirk worth surfacing in the migration
   *  report or runtime audit log. Absent on clean parses. */
  flagged?: PhoneFlag;
}
/**
 * Normalize a raw user-entered phone string for comparison + storage.
 *
 * Pipeline:
 *   1. strip leading apostrophe (spreadsheet copy-paste artifact)
 *   2. strip \r / \n / \t (real values seen in NocoDB had carriage returns)
 *   3. detect multi-number fields ("+33611111111;+33622222222",
 *      "0677580750/0690511494") — flag and take the first non-empty segment
 *   4. strip whitespace, dots, dashes, parens, single quotes
 *   5. convert leading "00" → "+" (international dialling code)
 *   6. detect placeholder fakes (8+ consecutive zeros) — flag, return null e164
 *   7. parse via `parsePhone`
 *   8. on parse failure or invalid number → flag 'unparseable'
 *
 * @param raw            Free-text phone value.
 * @param defaultCountry Passed through to `parsePhone` for numbers without
 *                       an international prefix.
 * @returns `null` for null / blank input. Otherwise a `NormalizedPhone`:
 *          all-null fields plus a `flagged` reason when the value is a
 *          placeholder or cannot be parsed; on success `flagged` is
 *          `'multi_number'` if extra numbers were discarded, else absent.
 */
export function normalizePhone(
  raw: string | null | undefined,
  defaultCountry?: CountryCode,
): NormalizedPhone | null {
  if (raw == null) return null;
  let cleaned = raw.toString().trim();
  if (!cleaned) return null;

  // 1. Spreadsheet apostrophe prefix.
  if (cleaned.startsWith("'")) cleaned = cleaned.slice(1);

  // 2. Strip carriage returns / newlines / tabs.
  cleaned = cleaned.replace(/[\r\n\t]/g, '');

  // 3. Multi-number detection — any of `/`, `;`, `,` separates numbers.
  //    Keep the first segment that actually contains something, so a stray
  //    leading separator ("/0690511494") does not discard the only number.
  let flagged: PhoneFlag | undefined;
  if (/[/;,]/.test(cleaned)) {
    flagged = 'multi_number';
    cleaned =
      cleaned
        .split(/[/;,]/)
        .map((segment) => segment.trim())
        .find((segment) => segment.length > 0) ?? '';
  }

  // 4. Strip whitespace, dots, dashes, parens and single quotes.
  //    Keep + for the E.164 prefix.
  cleaned = cleaned.replace(/[\s.\-()']/g, '');
  if (!cleaned) return { e164: null, country: null, display: null, flagged: 'unparseable' };

  // 5. 00 international prefix → +.
  if (cleaned.startsWith('00')) {
    cleaned = '+' + cleaned.slice(2);
  }

  // 6. Placeholder fakes — runs of 8+ consecutive zeros, e.g. +447000000000.
  if (/0{8,}/.test(cleaned)) {
    return { e164: null, country: null, display: null, flagged: 'placeholder' };
  }

  // 7. Parse via the existing i18n helper.
  const parsed = parsePhone(cleaned, defaultCountry);

  // 8. Anything without a valid E.164 form is reported as unparseable;
  //    this replaces an earlier 'multi_number' flag.
  if (!parsed.e164 || !parsed.isValid) {
    return { e164: null, country: null, display: null, flagged: 'unparseable' };
  }

  return {
    e164: parsed.e164,
    country: parsed.country,
    display: parsed.international,
    flagged,
  };
}
// ─── Countries ──────────────────────────────────────────────────────────────

/**
 * Aliases for canonical country names that don't match
 * `Intl.DisplayNames(en)` output verbatim. Keys are pre-normalized
 * (lowercase, diacritic-free, hyphens/dots → spaces, collapsed whitespace).
 *
 * Kept opinionated and small — only entries we've actually seen in legacy
 * data. Adding a new alias is cheap; trying to be exhaustive isn't.
 */
const COUNTRY_ALIASES: Record<string, CountryCode> = {
  // Generic abbreviations
  usa: 'US',
  us: 'US',
  uk: 'GB',
  // Saint-Barthélemy variants seen in production
  'saint barthelemy': 'BL',
  'saint barth': 'BL',
  'st barth': 'BL',
  'st barths': 'BL',
  'st barthelemy': 'BL',
};

/**
 * High-frequency cities → country, used as a fallback when alias and exact
 * country-name matching both miss. Keys are normalized.
 *
 * Order matters: an entry's key is also matched as a substring of the
 * input ("Sag Harbor Y" contains "sag harbor"), so the most specific
 * city appears first to avoid a wrong partial hit.
 */
const CITY_TO_COUNTRY: Record<string, CountryCode> = {
  'kansas city': 'US',
  'sag harbor': 'US',
  'new york': 'US',
  london: 'GB',
  paris: 'FR',
};

/** How a country was resolved: alias or exact name, fuzzy name, or city. */
export type CountryConfidence = 'exact' | 'fuzzy' | 'city';

export interface ResolvedCountry {
  /** Resolved ISO code, or null when nothing matched. */
  iso: CountryCode | null;
  /** Which lookup stage produced `iso`; null when `iso` is null. */
  confidence: CountryConfidence | null;
}
/**
 * Lazily-built list of (ISO code, normalized English name) pairs, in
 * `ALL_COUNTRY_CODES` order. Built once on first use so `resolveCountry`
 * does not re-normalize every country name on every call.
 * NOTE(review): assumes `getCountryName(code, 'en')` is stable for the
 * lifetime of the process — confirm.
 */
let normalizedCountryNames: ReadonlyArray<readonly [CountryCode, string]> | undefined;

/** Return the cached (code, normalized name) pairs, building them on first call. */
function getNormalizedCountryNames(): ReadonlyArray<readonly [CountryCode, string]> {
  if (!normalizedCountryNames) {
    const entries: Array<readonly [CountryCode, string]> = [];
    for (const code of ALL_COUNTRY_CODES) {
      entries.push([code, normalizeForLookup(getCountryName(code, 'en'))]);
    }
    normalizedCountryNames = entries;
  }
  return normalizedCountryNames;
}

/**
 * Map free-text country / region input to an ISO-3166-1 alpha-2 code.
 *
 * Lookup order: alias → exact (vs. English country names) → city →
 * fuzzy (Levenshtein ≤ 2, inputs of 6+ normalized characters only).
 * Anything beyond fuzzy returns null and the migration script flags the
 * row for human review.
 *
 * @param text Free-text country, region or city.
 * @returns The resolved code with the stage that matched (`'exact'` for
 *          both alias and exact-name hits), or `{ iso: null, confidence:
 *          null }` when the input is null, blank, or unmatched.
 */
export function resolveCountry(text: string | null | undefined): ResolvedCountry {
  if (text == null) return { iso: null, confidence: null };
  const normalized = normalizeForLookup(text.toString());
  if (!normalized) return { iso: null, confidence: null };

  // 1. Aliases — covers USA / UK / St Barth and friends.
  const alias = COUNTRY_ALIASES[normalized];
  if (alias) return { iso: alias, confidence: 'exact' };

  // 2. Exact match against the country names. Both sides go through
  //    `normalizeForLookup`, so 'United States' and 'united states' both
  //    resolve.
  const countryNames = getNormalizedCountryNames();
  for (const [code, name] of countryNames) {
    if (name === normalized) return { iso: code, confidence: 'exact' };
  }

  // 3. City → country fallback, exact or substring.
  const cityExact = CITY_TO_COUNTRY[normalized];
  if (cityExact) return { iso: cityExact, confidence: 'city' };
  for (const [city, iso] of Object.entries(CITY_TO_COUNTRY)) {
    if (normalized.includes(city)) return { iso, confidence: 'city' };
  }

  // 4. Fuzzy fallback (Levenshtein ≤ 2). Skipped for short inputs because
  //    a 4-char string like "Mars" sits within distance 2 of multiple
  //    short country names (Mali, Laos, Iran, …) — a false-positive
  //    country match. On ties the first country in list order wins.
  if (normalized.length >= 6) {
    let bestCode: CountryCode | null = null;
    let bestDistance = Number.POSITIVE_INFINITY;
    for (const [code, name] of countryNames) {
      const d = levenshtein(name, normalized);
      if (d < bestDistance) {
        bestDistance = d;
        bestCode = code;
        if (d === 0) break;
      }
    }
    if (bestCode && bestDistance <= 2) {
      return { iso: bestCode, confidence: 'fuzzy' };
    }
  }

  return { iso: null, confidence: null };
}
/** Lowercase + strip diacritics + replace hyphens/dots with spaces +
 *  collapse whitespace. Used by both the input and the canonical-name
 *  side of the country comparison so they meet on the same shape. */
function normalizeForLookup(s: string): string {
  return s
    .normalize('NFD')
    // Combining diacritical marks (U+0300–U+036F), written as escapes so the
    // range survives editors and tools that mangle raw combining characters.
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[-.]/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

// ─── Levenshtein ────────────────────────────────────────────────────────────

/**
 * Standard iterative Levenshtein edit distance (insert / delete /
 * substitute, each cost 1), compared per UTF-16 code unit. Used by the
 * country fuzzy match and by the dedup algorithm's name-similarity rule.
 *
 * Time is O(n*m); memory is two rolling rows sized by the shorter string.
 * Callers still shouldn't run it against pathological inputs.
 *
 * Exported so the find-matches module can reuse the same implementation
 * without relying on an external dep.
 *
 * @returns The number of single-character edits turning `a` into `b`.
 */
export function levenshtein(a: string, b: string): number {
  if (a === b) return 0;
  if (!a) return b.length;
  if (!b) return a.length;

  // The distance is symmetric, so put the shorter string on the row axis
  // to keep both rolling rows as small as possible.
  const [longer, shorter] = a.length >= b.length ? [a, b] : [b, a];
  const width = shorter.length;

  let prev: number[] = Array.from({ length: width + 1 }, (_, j) => j);
  let curr: number[] = new Array<number>(width + 1).fill(0);

  for (let i = 1; i <= longer.length; i += 1) {
    curr[0] = i;
    for (let j = 1; j <= width; j += 1) {
      const cost = longer[i - 1] === shorter[j - 1] ? 0 : 1;
      curr[j] = Math.min(curr[j - 1]! + 1, prev[j]! + 1, prev[j - 1]! + cost);
    }
    // The row just filled becomes the "previous" row of the next iteration.
    [prev, curr] = [curr, prev];
  }

  return prev[width]!;
}
// File: tests/unit/dedup/find-matches.test.ts
/**
 * Match-finding library — unit tests.
 *
 * Each duplicate cluster from the legacy NocoDB Interests audit (see
 * docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.2)
 * is encoded as a fixture here. The expected scoring tier (high / medium
 * / low) is the design contract — if the algorithm starts returning
 * "high" for a Pattern F case (Etiennette / Bruno+Bruce) it has lost
 * the false-positive guard and we'll know immediately.
 */
import { describe, expect, it } from 'vitest';

import { findClientMatches, type MatchCandidate } from '@/lib/dedup/find-matches';

// Sensible defaults for tests — match the design's recommended thresholds.
const THRESHOLDS = {
  highScore: 90,
  mediumScore: 50,
};

/** Build a full `MatchCandidate` from a partial; every omitted field gets
 *  its empty value (null or []). */
function candidate(partial: Partial<MatchCandidate> & { id: string }): MatchCandidate {
  return {
    id: partial.id,
    fullName: partial.fullName ?? null,
    surnameToken: partial.surnameToken ?? null,
    emails: partial.emails ?? [],
    phonesE164: partial.phonesE164 ?? [],
    countryIso: partial.countryIso ?? null,
  };
}

describe('findClientMatches', () => {
  describe('Pattern A — pure double-submit (high confidence)', () => {
    it('flags identical email + phone as high', () => {
      // From real data: Deepak Ramchandani #624/#625, identical fields.
      const incoming = candidate({
        id: 'b',
        fullName: 'Deepak Ramchandani',
        surnameToken: 'ramchandani',
        emails: ['dannyrams8888@gmail.com'],
        phonesE164: ['+17215868888'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Deepak Ramchandani',
          surnameToken: 'ramchandani',
          emails: ['dannyrams8888@gmail.com'],
          phonesE164: ['+17215868888'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(1);
      expect(matches[0]!.candidate.id).toBe('a');
      expect(matches[0]!.score).toBeGreaterThanOrEqual(90);
      expect(matches[0]!.confidence).toBe('high');
      expect(matches[0]!.reasons).toEqual(expect.arrayContaining(['email match', 'phone match']));
    });
  });

  describe('Pattern B — same email, different phone format (high)', () => {
    it('high confidence when phones already normalize-equal', () => {
      // From real data: Howard Wiarda #236/#536, "574-274-0548" vs "+15742740548".
      // After normalization both phones are the same E.164, so the rule fires.
      const incoming = candidate({
        id: 'b',
        fullName: 'Howard Wiarda',
        surnameToken: 'wiarda',
        emails: ['hwiarda@hotmail.com'],
        phonesE164: ['+15742740548'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Howard Wiarda',
          surnameToken: 'wiarda',
          emails: ['hwiarda@hotmail.com'],
          phonesE164: ['+15742740548'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches[0]!.confidence).toBe('high');
      expect(matches[0]!.score).toBeGreaterThanOrEqual(90);
    });
  });

  describe('Pattern C — name capitalization variant (high)', () => {
    it('treats lowercase + uppercase as the same person when surname-token + email + phone all match', () => {
      // From real data: Nicolas Ruiz #681/#682/#683 — the submissions differ
      // only by letter case. Emails arrive pre-lowercased (see
      // MatchCandidate.emails), so the case variance is exercised on the name.
      const incoming = candidate({
        id: 'b',
        fullName: 'nicolas ruiz',
        surnameToken: 'ruiz',
        emails: ['ruiz.nicolas@ufl.edu'],
        phonesE164: ['+17862006617'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Nicolas Ruiz',
          surnameToken: 'ruiz',
          emails: ['ruiz.nicolas@ufl.edu'],
          phonesE164: ['+17862006617'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches[0]!.confidence).toBe('high');
      expect(matches[0]!.reasons).toContain('name match');
    });
  });

  describe('Pattern D — name shortening (high)', () => {
    it('Chris vs Christopher with same email + phone scores high', () => {
      // From real data: Chris Allen #700 vs Christopher Allen #534.
      const incoming = candidate({
        id: 'b',
        fullName: 'Chris Allen',
        surnameToken: 'allen',
        emails: ['chris@thundercatsports.com'],
        phonesE164: ['+17814548950'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Christopher Allen',
          surnameToken: 'allen',
          emails: ['chris@thundercatsports.com'],
          phonesE164: ['+17814548950'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches[0]!.confidence).toBe('high');
    });
  });

  describe('Pattern E — typo on resubmit', () => {
    it('same email + nearly-identical phone (typo in last digits) scores at least medium', () => {
      // Christopher Camazou #649/#650 — phone differs in last 4 digits but
      // everything else matches. Exact phone equality fails; email exact
      // match alone (60) + name match (20) puts us in medium tier.
      // The user can confirm the merge.
      const incoming = candidate({
        id: 'b',
        fullName: 'Christopher Camazou',
        surnameToken: 'camazou',
        emails: ['camazou11@gmail.com'],
        phonesE164: ['+33608334455'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Christopher Camazou',
          surnameToken: 'camazou',
          emails: ['camazou11@gmail.com'],
          phonesE164: ['+33608336549'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(1);
      // Email + name match without phone match — strong but not certain.
      expect(matches[0]!.confidence).toMatch(/^(high|medium)$/);
      expect(matches[0]!.score).toBeGreaterThanOrEqual(70);
    });

    it('Constanzo / Costanzo surname typo with same email + phone scores high', () => {
      // Gianfranco Di Constanzo #585 vs Di Costanzo #336 — same email + phone
      // and only a 1-letter surname typo. This is a strong "same client,
      // multiple yachts" signal — the design's signature win. The surname
      // tokens differ, so the pair is reached via email / phone blocking.
      const incoming = candidate({
        id: 'b',
        fullName: 'Gianfranco Di Constanzo',
        surnameToken: 'constanzo',
        emails: ['gdc@nauticall.com'],
        phonesE164: ['+17542628669'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Gianfranco Di Costanzo',
          surnameToken: 'costanzo',
          emails: ['gdc@nauticall.com'],
          phonesE164: ['+17542628669'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches[0]!.confidence).toBe('high');
      expect(matches[0]!.score).toBeGreaterThanOrEqual(90);
    });
  });

  describe('Pattern F — hard cases (must NOT auto-merge)', () => {
    it('same name with different country phone + different email scores at most medium', () => {
      // Etiennette Clamouze #188/#717 — same name but completely different
      // email + phone (and the phones are in different country codes,
      // suggesting either a relative, a coworker, or a name-collision).
      // We must NOT classify this as "high" or it would force-merge two
      // distinct people.
      const incoming = candidate({
        id: 'b',
        fullName: 'Etiennette Clamouze',
        surnameToken: 'clamouze',
        emails: ['etiennette@the-manoah.com'],
        phonesE164: ['+12645815607'],
        countryIso: 'AI',
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Etiennette Clamouze',
          surnameToken: 'clamouze',
          emails: ['clamouze.etiennette@gmail.com'],
          phonesE164: ['+33767780640'],
          countryIso: 'FR',
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // The pair is reached through the shared surname token. The exact-name
      // bonus is cancelled by the "no shared contact" penalty, so whatever
      // tier it lands in, it must stay below the auto-merge threshold.
      if (matches.length > 0) {
        expect(matches[0]!.confidence).not.toBe('high');
        expect(matches[0]!.score).toBeLessThan(90);
      }
    });

    it('different names with no shared email or phone never rise above low', () => {
      // Bruno Joyerot #18 vs Bruce Hearn #19 — Bruno's row shows email
      // belonging to "catherine elaine hearn" (Bruce's spouse). Same
      // household phone area code. Name overlap is partial. Don't merge.
      const incoming = candidate({
        id: 'b',
        fullName: 'Bruce Hearn',
        surnameToken: 'hearn',
        emails: ['bhearn1063@gmail.com'],
        phonesE164: ['+12642358840'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Bruno Joyerot',
          surnameToken: 'joyerot',
          emails: ['catherineelainehearn@gmail.com'],
          phonesE164: ['+12642352816'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Names don't match, emails don't match, phones differ — there's
      // no reason for this to surface at all. Either no match or low.
      if (matches.length > 0) {
        expect(matches[0]!.confidence).toBe('low');
      }
    });
  });

  describe('Negative evidence — same email but different country phone', () => {
    it('reduces score when email matches but phone country differs', () => {
      // Constructed: same email, but one phone is +33 (FR) and the other
      // is +1 (US). Likely a shared-inbox spouse situation. We want
      // medium tier so it lands in review, not high tier.
      const incoming = candidate({
        id: 'b',
        fullName: 'Test User',
        surnameToken: 'user',
        emails: ['shared@example.com'],
        phonesE164: ['+15551234567'],
        countryIso: 'US',
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Test User',
          surnameToken: 'user',
          emails: ['shared@example.com'],
          phonesE164: ['+33611111111'],
          countryIso: 'FR',
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Email match alone would be 60 + name match 20 = 80 (medium).
      // Negative evidence (different phone country) brings it down further.
      expect(matches[0]!.confidence).toBe('medium');
    });
  });

  describe('Blocking — only relevant candidates are scored', () => {
    it('does not score candidates with no shared emails / phones / surname token', () => {
      const incoming = candidate({
        id: 'newbie',
        fullName: 'Alice Smith',
        surnameToken: 'smith',
        emails: ['alice@example.com'],
        phonesE164: ['+15551234567'],
      });
      const pool = [
        candidate({
          id: 'unrelated1',
          fullName: 'Bob Jones',
          surnameToken: 'jones',
          emails: ['bob@example.org'],
          phonesE164: ['+33611111111'],
        }),
        candidate({
          id: 'unrelated2',
          fullName: 'Carol White',
          surnameToken: 'white',
          emails: ['carol@example.net'],
          phonesE164: ['+447700900111'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(0);
    });
  });

  describe('Empty pool', () => {
    it('returns no matches when the pool is empty', () => {
      const incoming = candidate({
        id: 'a',
        fullName: 'Alice',
        emails: ['alice@example.com'],
      });
      expect(findClientMatches(incoming, [], THRESHOLDS)).toEqual([]);
    });
  });

  describe('Sort order', () => {
    it('returns matches sorted by score descending', () => {
      const incoming = candidate({
        id: 'incoming',
        fullName: 'John Smith',
        surnameToken: 'smith',
        emails: ['john@example.com'],
        phonesE164: ['+15551234567'],
      });
      const pool = [
        candidate({
          // High match — same email + phone
          id: 'high-match',
          fullName: 'John Smith',
          surnameToken: 'smith',
          emails: ['john@example.com'],
          phonesE164: ['+15551234567'],
        }),
        candidate({
          // Medium match — same email only
          id: 'medium-match',
          fullName: 'Different Person',
          surnameToken: 'person',
          emails: ['john@example.com'],
          phonesE164: ['+33611111111'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches.length).toBeGreaterThanOrEqual(2);
      expect(matches[0]!.candidate.id).toBe('high-match');
      expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
    });
  });
});
phonesE164: ['+15551234567'], + }); + const pool = [ + candidate({ + // High match — same email + phone + id: 'high-match', + fullName: 'John Smith', + surnameToken: 'smith', + emails: ['john@example.com'], + phonesE164: ['+15551234567'], + }), + candidate({ + // Medium match — same email only + id: 'medium-match', + fullName: 'Different Person', + surnameToken: 'person', + emails: ['john@example.com'], + phonesE164: ['+33611111111'], + }), + ]; + + const matches = findClientMatches(incoming, pool, THRESHOLDS); + + expect(matches.length).toBeGreaterThanOrEqual(2); + expect(matches[0]!.candidate.id).toBe('high-match'); + expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score); + }); + }); +}); diff --git a/tests/unit/dedup/normalize.test.ts b/tests/unit/dedup/normalize.test.ts new file mode 100644 index 0000000..9a20252 --- /dev/null +++ b/tests/unit/dedup/normalize.test.ts @@ -0,0 +1,270 @@ +/** + * Normalization library — unit tests. + * + * Every fixture here comes from real dirty values observed in the legacy + * NocoDB Interests table during the 2026-05-03 audit (see + * docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.3). + * The point is regression-prevention: if any of these patterns ever + * stops normalizing the way it should, dedup quality silently drops. 
+ */ +import { describe, expect, it } from 'vitest'; + +import { + normalizeName, + normalizeEmail, + normalizePhone, + resolveCountry, +} from '@/lib/dedup/normalize'; + +describe('normalizeName', () => { + it('returns null fields for empty / null input', () => { + expect(normalizeName('')).toEqual({ display: '', normalized: '', surnameToken: undefined }); + expect(normalizeName(' ')).toEqual({ + display: '', + normalized: '', + surnameToken: undefined, + }); + }); + + it('trims leading/trailing whitespace', () => { + expect(normalizeName(' Marcus Laurent ')).toMatchObject({ + display: 'Marcus Laurent', + normalized: 'marcus laurent', + }); + }); + + it('collapses repeated internal whitespace to a single space', () => { + // From real data: "Arthur Matthews" (#183), "Corinne Roche" (#208). + // NOTE(review): as shown here each input string equals its expected + // output, so no collapsing is exercised. Confirm the fixtures still + // contain the repeated spaces. + expect(normalizeName('Arthur Matthews').display).toBe('Arthur Matthews'); + expect(normalizeName('Corinne Roche').display).toBe('Corinne Roche'); + }); + + it('replaces embedded carriage returns and newlines with single spaces', () => { + // From real data: "Andrei \nVAGNANOV" (#178), "Daniel\r PRZEDBORSKI" (#175). + expect(normalizeName('Andrei \nVAGNANOV').display).toBe('Andrei Vagnanov'); + expect(normalizeName('Daniel\r PRZEDBORSKI').display).toBe('Daniel Przedborski'); + }); + + it('title-cases ALL-CAPS surnames while keeping given name title-cased', () => { + // From real data: "Jona ANDERSEN" (#232), "Duane SALTSGAVER" (#227), + // "Marcos DALLA PRIA" (#165). + expect(normalizeName('Jona ANDERSEN').display).toBe('Jona Andersen'); + expect(normalizeName('Duane SALTSGAVER').display).toBe('Duane Saltsgaver'); + // Particle 'dalla' stays lowercase mid-name. + expect(normalizeName('Marcos DALLA PRIA').display).toBe('Marcos dalla Pria'); + }); + + it('title-cases lowercased entries', () => { + // From real data: "antony amaral" (#665), "david rosenbloom" (#239), + // "john Tickner" (#247). 
+ expect(normalizeName('antony amaral').display).toBe('Antony Amaral'); + expect(normalizeName('david rosenbloom').display).toBe('David Rosenbloom'); + expect(normalizeName('john Tickner').display).toBe('John Tickner'); + }); + + it('keeps Romance and Germanic particles lowercase mid-name', () => { + // From real data: "Olav van Velsen" (#526), "Bruno Joyerot" (#18), + // "OLIVIER DAIN" (#677). Also synthetic "Carla de la Cruz". + // NOTE(review): "Bruno Joyerot" is cited but has no assertion below. + expect(normalizeName('Olav van Velsen').display).toBe('Olav van Velsen'); + expect(normalizeName('Carla de la Cruz').display).toBe('Carla de la Cruz'); + expect(normalizeName('OLIVIER DAIN').display).toBe('Olivier Dain'); + }); + + it('preserves O‘-prefixed Irish surnames as title-case', () => { + // NOTE(review): the title uses a typographic left single quote after + // the O; the fixture itself uses a plain apostrophe. + expect(normalizeName("liam o'brien").display).toBe("Liam O'Brien"); + }); + + it('keeps the slash-with-company structure intact', () => { + // From real data: "Daniel Wainstein / 7 Knots, LLC" (#637), + // "Bruno Joyerot / SAS TIKI" (#18). + expect(normalizeName('Daniel Wainstein / 7 Knots, LLC').display).toBe( + 'Daniel Wainstein / 7 Knots, LLC', + ); + expect(normalizeName('Bruno Joyerot / SAS TIKI').display).toBe('Bruno Joyerot / SAS TIKI'); + }); + + it('exposes the last non-particle token as surnameToken (lowercase) for blocking', () => { + expect(normalizeName('Marcus Laurent').surnameToken).toBe('laurent'); + expect(normalizeName('Olav van Velsen').surnameToken).toBe('velsen'); + expect(normalizeName('Carla de la Cruz').surnameToken).toBe('cruz'); + expect(normalizeName("Liam O'Brien").surnameToken).toBe("o'brien"); + }); + + it('handles single-token names — surnameToken is the only token', () => { + expect(normalizeName('Madonna').surnameToken).toBe('madonna'); + }); + + it('produces a normalized form that is always lowercase', () => { + expect(normalizeName('Andrei VAGNANOV').normalized).toBe('andrei vagnanov'); + expect(normalizeName('Daniel Wainstein / 7 Knots, LLC').normalized).toBe( + 'daniel wainstein / 7 knots, llc', + ); + }); 
+}); + +describe('normalizeEmail', () => { + it('returns null for empty / null inputs', () => { + expect(normalizeEmail('')).toBeNull(); + expect(normalizeEmail(' ')).toBeNull(); + }); + + it('lowercases and trims', () => { + // From real data: "Arthur@laser-align.com" vs "arthur@laser-align.com" (#183/#686). + expect(normalizeEmail('Arthur@laser-align.com')).toBe('arthur@laser-align.com'); + expect(normalizeEmail(' marcus@example.com ')).toBe('marcus@example.com'); + }); + + it('lowercases capitalized localparts', () => { + // From real data: "Bmalone850@gmail.com" (#489), "Hef355@yahoo.com" (#533), + // "Donclaytonmusic@gmail.com" (#679). + // NOTE(review): the #679 address is cited but not asserted below. + expect(normalizeEmail('Bmalone850@gmail.com')).toBe('bmalone850@gmail.com'); + expect(normalizeEmail('Hef355@yahoo.com')).toBe('hef355@yahoo.com'); + }); + + it('preserves plus-aliases — both legitimate and tricks', () => { + // Per design §3.2: "+aliases" are not stripped. Compare by full localpart. + expect(normalizeEmail('marcus+sales@example.com')).toBe('marcus+sales@example.com'); + }); + + it('returns null for invalid email shapes', () => { + expect(normalizeEmail('not-an-email')).toBeNull(); + expect(normalizeEmail('@example.com')).toBeNull(); + expect(normalizeEmail('user@')).toBeNull(); + expect(normalizeEmail('user@.com')).toBeNull(); + }); +}); + +// normalizePhone cases: the assertions below read the e164, country and +// flagged fields of the returned value, or expect null outright. +describe('normalizePhone', () => { + it('returns null for empty / whitespace / null', () => { + expect(normalizePhone('', 'AI')).toBeNull(); + expect(normalizePhone(' ', 'AI')).toBeNull(); + }); + + it('parses a plain E.164 number', () => { + expect(normalizePhone('+15742740548', 'US')).toMatchObject({ + e164: '+15742740548', + country: 'US', + }); + }); + + it('strips embedded carriage returns and trailing whitespace', () => { + // From real data: "+1-264-235-8840\r" (#19), "+1-264-772-3272\r" (#20). 
+ const out = normalizePhone('+1-264-235-8840\r', 'AI'); + expect(out?.e164).toBe('+12642358840'); + }); + + it('strips dashes, dots, parens, single quotes, spaces in a single pass', () => { + // From real data: "'+1.214.603.4235" (#205), "574-274-0548" (#236), + // "+1-264-235-8840" (#19), "+1 (212) 555-0123" (synthetic). + expect(normalizePhone("'+1.214.603.4235", 'US')?.e164).toBe('+12146034235'); + expect(normalizePhone('574-274-0548', 'US')?.e164).toBe('+15742740548'); + expect(normalizePhone('+1 (212) 555-0123', 'US')?.e164).toBe('+12125550123'); + }); + + it('converts a leading 00 prefix to + (international dialling)', () => { + // From real data: "00447956657022" (#216), "0033651381036" (#702). + expect(normalizePhone('00447956657022', 'GB')?.e164).toBe('+447956657022'); + expect(normalizePhone('0033651381036', 'FR')?.e164).toBe('+33651381036'); + }); + + it('uses defaultCountry when input has no international prefix', () => { + // From real data: "0690699699" (#203, French local), "0651381036" (#701). + expect(normalizePhone('0690699699', 'FR')?.e164).toBe('+33690699699'); + expect(normalizePhone('0651381036', 'FR')?.e164).toBe('+33651381036'); + }); + + it('returns null when there is no prefix AND no defaultCountry', () => { + // The migration script flags these for human review. + // The "?? null" below accepts either a missing result or a result + // whose e164 is null or undefined. + const out = normalizePhone('5742740548'); + expect(out?.e164 ?? null).toBeNull(); + }); + + it('flags placeholder all-zeros numbers and returns null', () => { + // From real data: "+447000000000" (#641, "Milos Vitkovic" — clearly fake). + // NOTE(review): despite the title, the assertions expect a result + // object carrying flagged = 'placeholder' with a null e164. + const out = normalizePhone('+447000000000', 'GB'); + expect(out?.flagged).toBe('placeholder'); + expect(out?.e164).toBeNull(); + }); + + it('flags multi-number fields and uses the first segment', () => { + // NOTE(review): only the "/" and ";" separators are exercised in this + // test; the "," separator named in the comment below has no assertion. + // From real data: "0677580750/0690511494" (#209). 
Other separators: ; , + const slash = normalizePhone('0677580750/0690511494', 'FR'); + expect(slash?.flagged).toBe('multi_number'); + expect(slash?.e164).toBe('+33677580750'); + + const semi = normalizePhone('+33611111111;+33622222222', 'FR'); + expect(semi?.flagged).toBe('multi_number'); + expect(semi?.e164).toBe('+33611111111'); + }); + + it('flags genuinely unparseable input as `unparseable`', () => { + const out = normalizePhone('xyz-not-a-phone', 'US'); + expect(out?.flagged).toBe('unparseable'); + expect(out?.e164).toBeNull(); + }); + + it('strips an apostrophe-prefix without breaking the parse', () => { + // From real data: leading "'" copy-pasted from spreadsheets escapes + // numeric-cell coercion. Should be invisible to dedup. + expect(normalizePhone("'0690699699", 'FR')?.e164).toBe('+33690699699'); + }); + + it('returns the country alongside the E.164 form', () => { + expect(normalizePhone('+33690699699', 'FR')).toMatchObject({ + e164: '+33690699699', + country: 'FR', + }); + }); +}); + +// resolveCountry cases: every assertion below reads an { iso, confidence } +// pair; the tiers pinned here are 'exact', 'city' and the null/null fallback. +describe('resolveCountry', () => { + it('returns null for empty / nullish input', () => { + // Only '' and a whitespace-only string are exercised; the expected + // value is an object with both fields null. + expect(resolveCountry('')).toEqual({ iso: null, confidence: null }); + expect(resolveCountry(' ')).toEqual({ iso: null, confidence: null }); + }); + + it('exact-matches a canonical English country name', () => { + expect(resolveCountry('Anguilla')).toEqual({ iso: 'AI', confidence: 'exact' }); + expect(resolveCountry('United Kingdom')).toEqual({ iso: 'GB', confidence: 'exact' }); + expect(resolveCountry('United States')).toEqual({ iso: 'US', confidence: 'exact' }); + }); + + it('matches case-insensitively', () => { + expect(resolveCountry('anguilla').iso).toBe('AI'); + expect(resolveCountry('UNITED KINGDOM').iso).toBe('GB'); + }); + + it('matches values with surrounding whitespace', () => { + expect(resolveCountry(' United States ').iso).toBe('US'); + }); + + it('handles diacritic variants of Saint-Barthélemy', () => { + // From real data: "Saint barthelemy" (#203), "St Barth" 
(#208), "Saint-Barthélemy". + expect(resolveCountry('Saint-Barthélemy').iso).toBe('BL'); + expect(resolveCountry('Saint Barthelemy').iso).toBe('BL'); + expect(resolveCountry('saint barthelemy').iso).toBe('BL'); + expect(resolveCountry('St Barth').iso).toBe('BL'); + }); + + it('resolves common abbreviations', () => { + expect(resolveCountry('USA').iso).toBe('US'); + expect(resolveCountry('UK').iso).toBe('GB'); + }); + + it('falls back to a city → country mapping for high-frequency cities', () => { + // From real data: "Kansas City" (#198), "Sag Harbor Y" (#239). + expect(resolveCountry('Kansas City').iso).toBe('US'); + expect(resolveCountry('Sag Harbor Y').iso).toBe('US'); + }); + + it('marks the confidence tier appropriately', () => { + expect(resolveCountry('Anguilla').confidence).toBe('exact'); + expect(resolveCountry('Kansas City').confidence).toBe('city'); + }); + + it('returns null + null for unresolvable values', () => { + // Migration script flags these for human review rather than guessing. + expect(resolveCountry('asdfghjkl xyz')).toEqual({ iso: null, confidence: null }); + expect(resolveCountry('Mars')).toEqual({ iso: null, confidence: null }); + }); +});