feat(dedup): normalization + match-finding library (P1)
The pure-logic spine of the client deduplication system spec'd in
docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md.
Two modules, JSX-free, vitest-tested against fixtures drawn directly
from real dirty values observed in the legacy NocoDB Interests audit.
src/lib/dedup/normalize.ts
- normalizeName: trims whitespace, replaces \r/\n/\t, intelligently
title-cases ALL-CAPS surnames while keeping particles (van / de /
dalla / etc.) lowercase mid-name. Preserves Irish O' surnames and
the "slash-with-company" structure ("Daniel Wainstein / 7 Knots,
LLC") seen in production. Returns a surnameToken (lowercased last
non-particle token) for use as a dedup blocking key.
- normalizeEmail: trim + lowercase + zod email validation. Plus-aliases
preserved; null on invalid.
- normalizePhone: pre-cleans the input (strips spreadsheet apostrophes,
carriage returns, dots/dashes/parens, converts 00 prefix to +) then
delegates to libphonenumber-js. Detects multi-number fields ("a/b",
"a;b") and placeholder fakes (8+ consecutive zeros, e.g.
+447000000000). Flags every quirk so the migration report and runtime
audit log can surface it.
- resolveCountry: maps free-text country/region input to ISO-3166-1
alpha-2 via alias → exact (vs. Intl-derived names) → city → fuzzy
(Levenshtein ≤ 2). Fuzzy is gated by length so 4-char inputs ("Mars")
don't false-positive against short country names.
- levenshtein: standard iterative implementation, exported for reuse
by find-matches.
src/lib/dedup/find-matches.ts
- findClientMatches: builds three blocking indexes off the pool (email
/ phone / surname-token), gathers the comparison set via union, and
scores each candidate via the rule set in design §4.2:
Email match +60
Phone E.164 match +50 (≥ 8 digits, excludes placeholder zeros)
Name exact match +20
Surname + given fuzzy +15 (Levenshtein ≤ 1)
Negative: shared email but different phone country −15
Negative: name match but no shared contact −20
Score is clamped to [0,100]. Confidence tier ('high' / 'medium' /
'low') is derived from configurable thresholds passed in by the
caller — defaults are highScore=90, mediumScore=50.
tests/unit/dedup/normalize.test.ts (38 cases)
Every dirty-data pattern from design §1.3 has a fixture: carriage
returns in names, ALL-CAPS surnames, lowercase entries, particles,
slash-with-company, plus-aliases, capitalized email localparts,
spreadsheet-apostrophe phones, multi-number phones, placeholder
phones, 00-prefix phones, French/UK local-format phones,
Saint-Barthélemy diacritic variants, Kansas City fallback.
tests/unit/dedup/find-matches.test.ts (12 cases)
Each duplicate cluster from design §1.2 has a test:
- Pattern A (Deepak Ramchandani — pure double-submit) → high
- Pattern B (Howard Wiarda — phone format variance) → high
- Pattern C (Nicolas Ruiz — name capitalization) → high
- Pattern D (Chris/Christopher Allen — name shortening) → high
- Pattern E (Christopher Camazou — typo on resubmit) → high or medium
- Pattern E (Constanzo/Costanzo — surname typo, multi-yacht) → high
- Pattern F (Etiennette Clamouze — same name, different country) →
must NOT auto-merge
- Pattern F (Bruno+Bruce — shared household contact) → no match
- Negative evidence (same email, different phone country) → medium
- Blocking (no shared keys → 0 matches)
- Sort order (high before low)
- Empty pool
Total: 50 new tests, all green. Zero changes to runtime behavior or
schema; unblocks P2 (runtime surfaces) and P3 (NocoDB migration).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
255
src/lib/dedup/find-matches.ts
Normal file
255
src/lib/dedup/find-matches.ts
Normal file
@@ -0,0 +1,255 @@
|
||||
/**
|
||||
* Client-match finder — pure scoring logic.
|
||||
*
|
||||
* Compares one input candidate against a pool of existing candidates and
|
||||
* returns scored matches. Used by:
|
||||
* - the at-create suggestion in client/interest forms (Layer 1)
|
||||
* - the public-form auto-link path (when score >= block threshold)
|
||||
* - the nightly background scoring job (Layer 3)
|
||||
* - the migration script's dedup pass
|
||||
*
|
||||
* Performance shape: blocking via email / phone / surname-token reduces
|
||||
* the pairwise scan from O(n²) to ~O(n) for any pool size we'll see in
|
||||
* production. See `findClientMatches` for the blocking implementation.
|
||||
*
|
||||
* Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §4.
|
||||
*/
|
||||
|
||||
import { parsePhone } from '@/lib/i18n/phone';
|
||||
|
||||
import { levenshtein } from './normalize';
|
||||
|
||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * One party (an existing client or an incoming submission) in the shape
 * the match finder compares. Contact fields are expected to be
 * normalized by the caller before they reach this module.
 */
export interface MatchCandidate {
  /** Identifier; `findClientMatches` skips pool entries whose id equals the input's. */
  id: string;
  /** Full display name, or `null` when unknown. Compared lowercased and trimmed. */
  fullName: string | null;
  /**
   * Lowercased last non-particle token, as produced by
   * `normalizeName(...).surnameToken`. Serves as a blocking key.
   */
  surnameToken: string | null;
  /** Email addresses, already lowercased and validated by `normalizeEmail`. */
  emails: string[];
  /** Phone numbers, already in canonical E.164 form from `normalizePhone`. */
  phonesE164: string[];
  /**
   * Address country (NOT the phone country). Intended for tiebreaking
   * rather than scoring; the scoring code in this file does not read it.
   */
  countryIso: string | null;
}
|
||||
|
||||
/** Confidence tier derived from a match score and the caller's thresholds. */
export type MatchConfidence = 'high' | 'medium' | 'low';
|
||||
|
||||
/** Outcome of scoring one pool candidate against the input. */
export interface MatchResult {
  /** The pool entry that was compared against the input. */
  candidate: MatchCandidate;
  /** Final score, clamped to the 0–100 range. */
  score: number;
  /**
   * Human-readable labels of the rules that fired, in firing order.
   * Meant for the review-queue UI (e.g. "matched on email + phone").
   */
  reasons: string[];
  /** Tier assigned from `score` using the caller-supplied thresholds. */
  confidence: MatchConfidence;
}
|
||||
|
||||
/** Score cut-offs that map a numeric match score onto a confidence tier. */
export interface DedupThresholds {
  /** Scores at or above this value are classified `'high'`. */
  highScore: number;
  /**
   * Scores at or above this value (and below `highScore`) are classified
   * `'medium'`; anything lower is `'low'`.
   */
  mediumScore: number;
}
|
||||
|
||||
// ─── Public entry point ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Score `input` against every entry of `pool` that shares at least one
 * blocking key with it (an email, an E.164 phone, or the surname token)
 * and return the scored matches, highest score first.
 *
 * Low-confidence hits stay in the result; callers filter on `confidence`
 * or `score` as their use case requires.
 *
 * Pool entries with `id === input.id` are skipped, so an existing client
 * can be re-scored against a pool that still contains it.
 *
 * @param input - The candidate being checked for duplicates.
 * @param pool - Existing candidates to compare against.
 * @param thresholds - Cut-offs used to derive each result's `confidence`.
 * @returns Matches sorted by score descending; empty when no pool entry
 *   shares a blocking key with `input`.
 */
export function findClientMatches(
  input: MatchCandidate,
  pool: MatchCandidate[],
  thresholds: DedupThresholds,
): MatchResult[] {
  if (pool.length === 0) return [];

  // Phase 1 — index the pool by each blocking key. Entries that share
  // none of the three keys with the input never reach the scorer.
  const emailIndex = new Map<string, MatchCandidate[]>();
  const phoneIndex = new Map<string, MatchCandidate[]>();
  const surnameIndex = new Map<string, MatchCandidate[]>();

  for (const entry of pool) {
    if (entry.id === input.id) continue; // never match a record against itself
    entry.emails.forEach((email) => pushTo(emailIndex, email, entry));
    entry.phonesE164.forEach((phone) => pushTo(phoneIndex, phone, entry));
    if (entry.surnameToken) pushTo(surnameIndex, entry.surnameToken, entry);
  }

  // Phase 2 — union the index hits for the input's keys, deduplicated by
  // id. Lookup order (emails, then phones, then surname) fixes the
  // relative order of equal-score results after the stable sort below.
  const reachable = new Map<string, MatchCandidate>();
  const collect = (index: Map<string, MatchCandidate[]>, keys: readonly string[]): void => {
    for (const key of keys) {
      for (const hit of index.get(key) ?? []) {
        reachable.set(hit.id, hit);
      }
    }
  };
  collect(emailIndex, input.emails);
  collect(phoneIndex, input.phonesE164);
  if (input.surnameToken) collect(surnameIndex, [input.surnameToken]);

  // Phase 3 — score every candidate that survived blocking.
  const scored = Array.from(reachable.values(), (candidate) => scorePair(input, candidate));

  // Phase 4 — order by score (descending) and stamp the confidence tier.
  scored.sort((left, right) => right.score - left.score);
  scored.forEach((result) => {
    result.confidence = classify(result.score, thresholds);
  });
  return scored;
}
|
||||
|
||||
// ─── Scoring ────────────────────────────────────────────────────────────────
|
||||
|
||||
/** A run of eight or more zeros marks a placeholder phone (e.g. +447000000000). */
const PLACEHOLDER_ZERO_RUN = /0{8,}/;

/**
 * Score one (input, candidate) pair against the rule set in design §4.2.
 *
 * Positive rules add to the score, negative rules subtract from it, and
 * the total is clamped to [0, 100]. `reasons` lists the rules in the
 * order they fired so the review-queue UI can show "matched on email +
 * phone".
 *
 * @param a - The input candidate.
 * @param b - The pool candidate; returned as `candidate` in the result.
 * @returns The scored pair. `confidence` is a `'low'` placeholder that
 *   the caller overwrites once thresholds are applied.
 */
function scorePair(a: MatchCandidate, b: MatchCandidate): MatchResult {
  let score = 0;
  const reasons: string[] = [];

  // ── Positive rules. ──────────────────────────────────────────────────────

  const sharedEmail = a.emails.find((e) => b.emails.includes(e));
  const emailMatch = !!sharedEmail;
  if (emailMatch) {
    score += 60;
    reasons.push('email match');
  }

  // A shared phone only counts when it has at least 8 digits and is not a
  // placeholder: two unrelated records that were both given a filler
  // number must not look like the same person.
  const phoneMatch = a.phonesE164.some(
    (p) => b.phonesE164.includes(p) && countDigits(p) >= 8 && !PLACEHOLDER_ZERO_RUN.test(p),
  );
  if (phoneMatch) {
    score += 50;
    reasons.push('phone match');
  }

  const aNameNorm = (a.fullName ?? '').toLowerCase().trim();
  const bNameNorm = (b.fullName ?? '').toLowerCase().trim();
  const nameExactMatch = aNameNorm.length > 0 && aNameNorm === bNameNorm;
  if (nameExactMatch) {
    score += 20;
    reasons.push('name match');
  }

  // Same surname token + near-identical given name. Only fires when the
  // names are NOT exactly equal, so it never double-counts with the rule
  // above. Catches single-edit given-name variants such as 'Jon' / 'John'
  // when the surname tokens are identical.
  if (!nameExactMatch && a.surnameToken && b.surnameToken && a.surnameToken === b.surnameToken) {
    // Split the trimmed name: splitting an untrimmed one yields an empty
    // first token when the name starts with whitespace, which would
    // silently disable this rule.
    const aGiven = aNameNorm.split(/\s+/)[0] ?? '';
    const bGiven = bNameNorm.split(/\s+/)[0] ?? '';
    if (aGiven && bGiven && levenshtein(aGiven, bGiven) <= 1) {
      score += 15;
      reasons.push('surname + given-name fuzzy match');
    }
  }

  // ── Negative rules. ──────────────────────────────────────────────────────

  // Same email but the two parties' phone numbers belong to different
  // countries. Common when one inbox is shared by spouses / coworkers
  // and the actual phone owners are distinct people. Don't auto-merge.
  if (emailMatch && !phoneMatch && a.phonesE164.length > 0 && b.phonesE164.length > 0) {
    const aCountries = phoneCountriesOf(a);
    const bCountries = phoneCountriesOf(b);
    const overlap = [...aCountries].some((c) => bCountries.has(c));
    // Require a resolved country on both sides; an unresolvable phone is
    // not evidence of a mismatch.
    if (!overlap && aCountries.size > 0 && bCountries.size > 0) {
      score -= 15;
      reasons.push('phone country mismatch (negative)');
    }
  }

  // Same name but no contact match. Two distinct people with the same
  // name (common for "John Smith") get through surname-based blocking;
  // penalize so the score lands below the auto-merge threshold.
  if (nameExactMatch && !emailMatch && !phoneMatch) {
    score -= 20;
    reasons.push('name match but no shared contact (negative)');
  }

  return {
    candidate: b,
    score: clamp(score, 0, 100),
    reasons,
    confidence: 'low', // assigned by caller after threshold lookup
  };
}
|
||||
|
||||
// ─── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Append `value` to the list stored under `key`, creating the list when
 * the key is not yet present. Mutates `map` in place.
 */
function pushTo<K, V>(map: Map<K, V[]>, key: K, value: V): void {
  const bucket = map.get(key);
  if (!bucket) {
    map.set(key, [value]);
    return;
  }
  bucket.push(value);
}
|
||||
|
||||
/**
 * Map a score onto a confidence tier. Both thresholds are inclusive
 * lower bounds; a score below `mediumScore` is `'low'`.
 */
function classify(score: number, thresholds: DedupThresholds): MatchConfidence {
  if (score >= thresholds.highScore) return 'high';
  return score >= thresholds.mediumScore ? 'medium' : 'low';
}
|
||||
|
||||
/**
 * Restrict `value` to the closed interval [`min`, `max`]. The lower
 * bound is checked first; values already inside the interval are
 * returned unchanged.
 */
function clamp(value: number, min: number, max: number): number {
  return value < min ? min : value > max ? max : value;
}
|
||||
|
||||
/** Count the ASCII digits (`0`–`9`) in `s`; every other character is ignored. */
function countDigits(s: string): number {
  const digits = s.match(/[0-9]/g);
  return digits === null ? 0 : digits.length;
}
|
||||
|
||||
/**
 * Collect the distinct countries that `parsePhone` reports for a
 * candidate's phone numbers. Numbers for which no country is reported
 * are skipped. Nothing is cached: every call re-parses each number.
 *
 * @param c - Candidate whose `phonesE164` entries are inspected.
 * @returns The set of country values found; empty when none resolve.
 */
function phoneCountriesOf(c: MatchCandidate): Set<string> {
  const countries = new Set<string>();
  c.phonesE164.forEach((phone) => {
    const { country } = parsePhone(phone);
    if (country) countries.add(country);
  });
  return countries;
}
|
||||
Reference in New Issue
Block a user