Files
pn-new-crm/src/lib/dedup/find-matches.ts

256 lines
9.4 KiB
TypeScript
Raw Normal View History

feat(dedup): normalization + match-finding library (P1) The pure-logic spine of the client deduplication system spec'd in docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md. Two modules, JSX-free, vitest-tested against fixtures drawn directly from real dirty values observed in the legacy NocoDB Interests audit. src/lib/dedup/normalize.ts - normalizeName: trims whitespace, replaces \r/\n/\t, intelligently title-cases ALL-CAPS surnames while keeping particles (van / de / dalla / etc.) lowercase mid-name. Preserves Irish O' surnames and the "slash-with-company" structure ("Daniel Wainstein / 7 Knots, LLC") seen in production. Returns a surnameToken (lowercased last non-particle token) for use as a dedup blocking key. - normalizeEmail: trim + lowercase + zod email validation. Plus-aliases preserved; null on invalid. - normalizePhone: pre-cleans the input (strips spreadsheet apostrophes, carriage returns, dots/dashes/parens, converts 00 prefix to +) then delegates to libphonenumber-js. Detects multi-number fields ("a/b", "a;b") and placeholder fakes (8+ consecutive zeros, e.g. +447000000000). Flags every quirk so the migration report and runtime audit log can surface it. - resolveCountry: maps free-text country/region input to ISO-3166-1 alpha-2 via alias → exact (vs. Intl-derived names) → city → fuzzy (Levenshtein ≤ 2). Fuzzy is gated by length so 4-char inputs ("Mars") don't false-positive against short country names. - levenshtein: standard iterative implementation, exported for reuse by find-matches. 
src/lib/dedup/find-matches.ts - findClientMatches: builds three blocking indexes off the pool (email / phone / surname-token), gathers the comparison set via union, and scores each candidate via the rule set in design §4.2: Email match +60 Phone E.164 match +50 (≥ 8 digits, excludes placeholder zeros) Name exact match +20 Surname + given fuzzy +15 (Levenshtein ≤ 1) Negative: shared email but different phone country −15 Negative: name match but no shared contact −20 Score is clamped to [0,100]. Confidence tier ('high' / 'medium' / 'low') is derived from configurable thresholds passed in by the caller — defaults are highScore=90, mediumScore=50. tests/unit/dedup/normalize.test.ts (38 cases) Every dirty-data pattern from design §1.3 has a fixture: carriage returns in names, ALL-CAPS surnames, lowercase entries, particles, slash-with-company, plus-aliases, capitalized email localparts, spreadsheet-apostrophe phones, multi-number phones, placeholder phones, 00-prefix phones, French/UK local-format phones, Saint-Barthélemy diacritic variants, Kansas City fallback. tests/unit/dedup/find-matches.test.ts (12 cases) Each duplicate cluster from design §1.2 has a test: - Pattern A (Deepak Ramchandani — pure double-submit) → high - Pattern B (Howard Wiarda — phone format variance) → high - Pattern C (Nicolas Ruiz — name capitalization) → high - Pattern D (Chris/Christopher Allen — name shortening) → high - Pattern E (Christopher Camazou — typo on resubmit) → high or medium - Pattern E (Constanzo/Costanzo — surname typo, multi-yacht) → high - Pattern F (Etiennette Clamouze — same name, different country) → must NOT auto-merge - Pattern F (Bruno+Bruce — shared household contact) → no match - Negative evidence (same email, different phone country) → medium - Blocking (no shared keys → 0 matches) - Sort order (high before low) - Empty pool Total: 50 new tests, all green. Zero changes to runtime behavior or schema; unblocks P2 (runtime surfaces) and P3 (NocoDB migration). 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 14:28:59 +02:00
/**
* Client-match finder — pure scoring logic.
*
* Compares one input candidate against a pool of existing candidates and
* returns scored matches. Used by:
* - the at-create suggestion in client/interest forms (Layer 1)
* - the public-form auto-link path (when score >= block threshold)
* - the nightly background scoring job (Layer 3)
* - the migration script's dedup pass
*
* Performance shape: blocking via email / phone / surname-token reduces
* the pairwise scan from O(n²) to ~O(n) for any pool size we'll see in
* production. See `findClientMatches` for the blocking implementation.
*
* Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §4.
*/
import { parsePhone } from '@/lib/i18n/phone';
import { levenshtein } from './normalize';
// ─── Types ──────────────────────────────────────────────────────────────────
/**
* One party (the input being checked, or an entry of the pool it is checked
* against) in the pre-normalized shape the matcher consumes. The matcher
* does no normalization of emails or phones itself; it compares the values
* it is given by plain string equality.
*/
export interface MatchCandidate {
/** Identity key: pool entries whose `id` equals the input's are skipped,
* and results are de-duplicated by this value. */
id: string;
/** Display name. Compared case-insensitively after trimming; the first
* whitespace-separated token is treated as the given name. */
fullName: string | null;
/** Lowercased last non-particle token from `normalizeName(...).surnameToken`.
* Used as a blocking key, and as the gate for the given-name fuzzy rule. */
surnameToken: string | null;
/** Already lowercased + validated via `normalizeEmail`. Also a blocking key. */
emails: string[];
/** Already canonical E.164 via `normalizePhone`. Also a blocking key. */
phonesE164: string[];
/** Address country (NOT phone country) — used for tiebreaking, not scoring.
* NOTE(review): nothing in the visible part of this module reads this
* field, so the tiebreak presumably happens in a caller — confirm. */
countryIso: string | null;
}
/** Confidence tier assigned to a match by comparing its score against the
* caller-supplied `DedupThresholds`. */
export type MatchConfidence = 'high' | 'medium' | 'low';
/** Outcome of scoring the input against a single pool entry. */
export interface MatchResult {
/** The pool entry that was compared against the input. */
candidate: MatchCandidate;
/** 0–100 after capping. */
score: number;
/** Human-readable list of which rules contributed. Useful for the
* review queue UI ("matched on email + phone + surname token").
* Penalties are listed too, suffixed with "(negative)". */
reasons: string[];
/** Tier derived from `score` and the thresholds passed to
* `findClientMatches`. */
confidence: MatchConfidence;
}
/**
* Score cut-offs that map a numeric score to a `MatchConfidence` tier.
* `highScore` is checked first, so a score that satisfies both bounds is
* reported as `'high'`.
*/
export interface DedupThresholds {
/** Inclusive lower bound for `'high'` confidence. */
highScore: number;
/** Inclusive lower bound for `'medium'` confidence. Below this is `'low'`. */
mediumScore: number;
}
// ─── Public entry point ─────────────────────────────────────────────────────
/**
 * Score `input` against the entries of `pool` that share at least one
 * blocking key with it (an email, a phone number, or the surname token) and
 * return the scored matches ordered by score, highest first.
 *
 * Low-confidence hits are included in the result — the caller filters by
 * `confidence` or `score` depending on use case.
 *
 * Pool entries whose `id` equals `input.id` are ignored, so re-scoring an
 * existing client (e.g. from a background job) never reports a self-match.
 *
 * @param input - The candidate being checked for duplicates.
 * @param pool - Existing candidates to compare against.
 * @param thresholds - Score cut-offs used to assign each result's tier.
 * @returns One result per reachable pool entry; empty when nothing shares a
 *   blocking key with `input` or when `pool` is empty.
 */
export function findClientMatches(
  input: MatchCandidate,
  pool: MatchCandidate[],
  thresholds: DedupThresholds,
): MatchResult[] {
  if (pool.length === 0) return [];

  // Step 1 — index the pool by each blocking key. An entry that shares no
  // key with the input is never looked up, and therefore never scored.
  const emailIndex = new Map<string, MatchCandidate[]>();
  const phoneIndex = new Map<string, MatchCandidate[]>();
  const surnameIndex = new Map<string, MatchCandidate[]>();
  pool.forEach((entry) => {
    if (entry.id === input.id) return; // never compare a record with itself
    entry.emails.forEach((email) => pushTo(emailIndex, email, entry));
    entry.phonesE164.forEach((phone) => pushTo(phoneIndex, phone, entry));
    if (entry.surnameToken) pushTo(surnameIndex, entry.surnameToken, entry);
  });

  // Step 2 — union the buckets hit by the input's own keys. Keying by id
  // means an entry reachable through several keys is scored only once.
  const reachable = new Map<string, MatchCandidate>();
  const collect = (bucket: MatchCandidate[] | undefined): void => {
    if (!bucket) return;
    for (const entry of bucket) reachable.set(entry.id, entry);
  };
  for (const email of input.emails) collect(emailIndex.get(email));
  for (const phone of input.phonesE164) collect(phoneIndex.get(phone));
  if (input.surnameToken) collect(surnameIndex.get(input.surnameToken));

  // Step 3 — score everything that survived blocking.
  const scored = Array.from(reachable.values(), (entry) => scorePair(input, entry));

  // Step 4 — best match first, then stamp the confidence tier on each result.
  scored.sort((left, right) => right.score - left.score);
  for (const match of scored) {
    match.confidence = classify(match.score, thresholds);
  }
  return scored;
}
// ─── Scoring ────────────────────────────────────────────────────────────────
/**
 * A run of eight or more zeros marks a placeholder number such as
 * +447000000000. The phone rule is specified to exclude those, so a shared
 * placeholder must not count as contact evidence.
 */
const PLACEHOLDER_ZERO_RUN = /0{8,}/;

/**
 * Score one (input, candidate) pair against the rule set in design §4.2.
 *
 * Positive rules add to the score and negative rules subtract from it; the
 * total is clamped to [0, 100]. Reasons accumulate in the order rules fire
 * so the review-queue UI can show "matched on email + phone".
 *
 *   email match                                   +60
 *   phone match (>= 8 digits, not a placeholder)  +50
 *   exact name match (case-insensitive, trimmed)  +20
 *   same surname token + given name within 1 edit +15
 *   shared email, disjoint phone countries        -15
 *   exact name match, no shared contact           -20
 *
 * @param a - The input candidate.
 * @param b - The pool entry being compared; returned as `candidate`.
 * @returns The scored result. `confidence` is a `'low'` placeholder that the
 *   caller overwrites after the threshold lookup.
 */
function scorePair(a: MatchCandidate, b: MatchCandidate): MatchResult {
  let score = 0;
  const reasons: string[] = [];

  // ── Positive rules. ──────────────────────────────────────────────────────
  // A blank string is never treated as a shared email.
  const emailMatch = a.emails.some((e) => e.length > 0 && b.emails.includes(e));
  if (emailMatch) {
    score += 60;
    reasons.push('email match');
  }

  // Short numbers and placeholder numbers are too weak to count as a match.
  const phoneMatch = a.phonesE164.some(
    (p) => b.phonesE164.includes(p) && countDigits(p) >= 8 && !PLACEHOLDER_ZERO_RUN.test(p),
  );
  if (phoneMatch) {
    score += 50;
    reasons.push('phone match');
  }

  const aNameNorm = (a.fullName ?? '').toLowerCase().trim();
  const bNameNorm = (b.fullName ?? '').toLowerCase().trim();
  const nameExactMatch = aNameNorm.length > 0 && aNameNorm === bNameNorm;
  if (nameExactMatch) {
    score += 20;
    reasons.push('name match');
  }

  // Surname + given-name fuzzy. Only fires when names are NOT exactly
  // equal — avoids double-counting with the rule above. Requires identical
  // surname tokens and given names within one edit of each other, e.g.
  // 'Jon Smith' / 'John Smith'. Surname typos and longer given-name variants
  // ('Chris' / 'Christopher') do not fire this rule; those pairs rely on the
  // email / phone rules instead.
  if (!nameExactMatch && a.surnameToken && b.surnameToken && a.surnameToken === b.surnameToken) {
    // Split the trimmed names: splitting an untrimmed name that starts with
    // whitespace yields an empty first token and would silently skip the rule.
    const aGiven = aNameNorm.split(/\s+/)[0] ?? '';
    const bGiven = bNameNorm.split(/\s+/)[0] ?? '';
    if (aGiven && bGiven && levenshtein(aGiven, bGiven) <= 1) {
      score += 15;
      reasons.push('surname + given-name fuzzy match');
    }
  }

  // ── Negative rules. ──────────────────────────────────────────────────────
  // Same email but the two parties' phone numbers belong to different
  // countries. Common when one inbox is shared by spouses / coworkers
  // and the actual phone owners are distinct people. Don't auto-merge.
  if (emailMatch && !phoneMatch && a.phonesE164.length > 0 && b.phonesE164.length > 0) {
    const aCountries = phoneCountriesOf(a);
    const bCountries = phoneCountriesOf(b);
    const overlap = [...aCountries].some((c) => bCountries.has(c));
    // Only penalize when both sides resolved to at least one country.
    if (!overlap && aCountries.size > 0 && bCountries.size > 0) {
      score -= 15;
      reasons.push('phone country mismatch (negative)');
    }
  }

  // Same name but no contact match. Two distinct people with the same
  // name (common for "John Smith") sneak through name-based blocking;
  // penalize so the score lands below the auto-merge threshold.
  if (nameExactMatch && !emailMatch && !phoneMatch) {
    score -= 20;
    reasons.push('name match but no shared contact (negative)');
  }

  return {
    candidate: b,
    score: clamp(score, 0, 100),
    reasons,
    confidence: 'low', // assigned by caller after threshold lookup
  };
}
// ─── Helpers ────────────────────────────────────────────────────────────────
/**
 * Append `value` to the list stored under `key`, creating the list the
 * first time the key is seen. Mutates `map` in place.
 */
function pushTo<K, V>(map: Map<K, V[]>, key: K, value: V): void {
  const bucket = map.get(key);
  if (bucket === undefined) {
    map.set(key, [value]);
    return;
  }
  bucket.push(value);
}
/**
 * Map a score to its confidence tier. Both thresholds are inclusive lower
 * bounds, and the `high` bound is tested before the `medium` one.
 */
function classify(score: number, thresholds: DedupThresholds): MatchConfidence {
  const { highScore, mediumScore } = thresholds;
  return score >= highScore ? 'high' : score >= mediumScore ? 'medium' : 'low';
}
/**
 * Restrict `value` to the range [`min`, `max`]. The lower bound is checked
 * first; a value that fails neither comparison is returned unchanged.
 */
function clamp(value: number, min: number, max: number): number {
  return value < min ? min : value > max ? max : value;
}
/**
 * Count the ASCII digits (`0`-`9`) in `s`. Every other character —
 * including `+`, spaces, punctuation and non-ASCII digits — is ignored.
 */
function countDigits(s: string): number {
  const digits = s.match(/[0-9]/g);
  return digits === null ? 0 : digits.length;
}
/**
 * Collect the distinct countries reported by `parsePhone` for a candidate's
 * phone numbers. Numbers for which `parsePhone` reports no country are
 * skipped, so the result can be empty even when the candidate has phones.
 *
 * Nothing is cached: every call re-parses every number. The caller only
 * invokes this on the shared-email / no-shared-phone path, so the parse
 * cost is accepted.
 */
function phoneCountriesOf(c: MatchCandidate): Set<string> {
  const countries = new Set<string>();
  c.phonesE164.forEach((phone) => {
    const { country } = parsePhone(phone);
    if (country) countries.add(country);
  });
  return countries;
}