The pure-logic spine of the client deduplication system spec'd in
docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md.
Two modules, JSX-free, vitest-tested against fixtures drawn directly
from real dirty values observed in the legacy NocoDB Interests audit.
src/lib/dedup/normalize.ts
- normalizeName: trims whitespace, replaces \r/\n/\t, intelligently
title-cases ALL-CAPS surnames while keeping particles (van / de /
dalla / etc.) lowercase mid-name. Preserves Irish O' surnames and
the "slash-with-company" structure ("Daniel Wainstein / 7 Knots,
LLC") seen in production. Returns a surnameToken (lowercased last
non-particle token) for use as a dedup blocking key.
- normalizeEmail: trim + lowercase + zod email validation. Plus-aliases
preserved; null on invalid.
- normalizePhone: pre-cleans the input (strips spreadsheet apostrophes,
carriage returns, dots/dashes/parens, converts 00 prefix to +) then
delegates to libphonenumber-js. Detects multi-number fields ("a/b",
"a;b") and placeholder fakes (8+ consecutive zeros, e.g.
+447000000000). Flags every quirk so the migration report and runtime
audit log can surface it.
- resolveCountry: maps free-text country/region input to ISO-3166-1
alpha-2 via alias → exact (vs. Intl-derived names) → city → fuzzy
(Levenshtein ≤ 2). Fuzzy is gated by length so 4-char inputs ("Mars")
don't false-positive against short country names.
- levenshtein: standard iterative implementation, exported for reuse
by find-matches.
src/lib/dedup/find-matches.ts
- findClientMatches: builds three blocking indexes off the pool (email
/ phone / surname-token), gathers the comparison set via union, and
scores each candidate via the rule set in design §4.2:
Email match +60
Phone E.164 match +50 (≥ 8 digits, excludes placeholder zeros)
Name exact match +20
Surname + given fuzzy +15 (Levenshtein ≤ 1)
Negative: shared email but different phone country −15
Negative: name match but no shared contact −20
Score is clamped to [0,100]. Confidence tier ('high' / 'medium' /
'low') is derived from configurable thresholds passed in by the
caller — defaults are highScore=90, mediumScore=50.
tests/unit/dedup/normalize.test.ts (38 cases)
Every dirty-data pattern from design §1.3 has a fixture: carriage
returns in names, ALL-CAPS surnames, lowercase entries, particles,
slash-with-company, plus-aliases, capitalized email localparts,
spreadsheet-apostrophe phones, multi-number phones, placeholder
phones, 00-prefix phones, French/UK local-format phones,
Saint-Barthélemy diacritic variants, Kansas City fallback.
tests/unit/dedup/find-matches.test.ts (12 cases)
Each duplicate cluster from design §1.2 has a test:
- Pattern A (Deepak Ramchandani — pure double-submit) → high
- Pattern B (Howard Wiarda — phone format variance) → high
- Pattern C (Nicolas Ruiz — name capitalization) → high
- Pattern D (Chris/Christopher Allen — name shortening) → high
- Pattern E (Christopher Camazou — typo on resubmit) → high or medium
- Pattern E (Constanzo/Costanzo — surname typo, multi-yacht) → high
- Pattern F (Etiennette Clamouze — same name, different country) →
must NOT auto-merge
- Pattern F (Bruno+Bruce — shared household contact) → no match (or low at most)
- Negative evidence (same email, different phone country) → medium
- Blocking (no shared keys → 0 matches)
- Sort order (high before low)
- Empty pool
Total: 50 new tests, all green. Zero changes to runtime behavior or
schema; unblocks P2 (runtime surfaces) and P3 (NocoDB migration).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
380 lines
13 KiB
TypeScript
380 lines
13 KiB
TypeScript
/**
 * Match-finding library — unit tests.
 *
 * Each duplicate cluster from the legacy NocoDB Interests audit (see
 * docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.2)
 * is encoded as a fixture here. The expected scoring tier (high / medium
 * / low) is the design contract: if the algorithm starts returning
 * "high" for a Pattern F case (Etiennette / Bruno+Bruce), it has lost
 * the false-positive guard and these tests will fail immediately.
 */
|
|
import { describe, expect, it } from 'vitest';
|
|
|
|
import { findClientMatches, type MatchCandidate } from '@/lib/dedup/find-matches';
|
|
|
|
/**
 * Confidence-tier cut-offs passed to `findClientMatches` by every test in
 * this file. They mirror the design's recommended defaults: a score of at
 * least `highScore` is expected to be reported as 'high', and a score of at
 * least `mediumScore` as 'medium'.
 */
const THRESHOLDS = {
  highScore: 90,
  mediumScore: 50,
};
|
|
|
|
/**
 * Test fixture builder: expands a partial candidate into a complete
 * `MatchCandidate` so each test only spells out the fields it cares about.
 *
 * @param partial - Any subset of candidate fields; `id` is mandatory.
 * @returns A candidate where every omitted (or nullish) scalar field is
 *   `null` and every omitted (or nullish) list field is an empty array.
 */
function candidate(partial: Partial<MatchCandidate> & { id: string }): MatchCandidate {
  return {
    id: partial.id,
    fullName: partial.fullName ?? null,
    surnameToken: partial.surnameToken ?? null,
    emails: partial.emails ?? [],
    phonesE164: partial.phonesE164 ?? [],
    countryIso: partial.countryIso ?? null,
  };
}
|
|
|
|
describe('findClientMatches', () => {
|
|
// Pattern A: the same person submitted the form twice with identical data.
describe('Pattern A — pure double-submit (high confidence)', () => {
  it('flags identical email + phone as high', () => {
    // From real data: Deepak Ramchandani #624/#625, identical fields.
    const incoming = candidate({
      id: 'b',
      fullName: 'Deepak Ramchandani',
      surnameToken: 'ramchandani',
      emails: ['dannyrams8888@gmail.com'],
      phonesE164: ['+17215868888'],
    });
    const pool = [
      candidate({
        id: 'a',
        fullName: 'Deepak Ramchandani',
        surnameToken: 'ramchandani',
        emails: ['dannyrams8888@gmail.com'],
        phonesE164: ['+17215868888'],
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    // Length is asserted first so the indexed reads below cannot hit undefined.
    expect(matches).toHaveLength(1);
    expect(matches[0]!.candidate.id).toBe('a');
    expect(matches[0]!.score).toBeGreaterThanOrEqual(90);
    expect(matches[0]!.confidence).toBe('high');
    expect(matches[0]!.reasons).toEqual(expect.arrayContaining(['email match', 'phone match']));
  });
});
|
|
|
|
// Pattern B: same person, phone originally typed in two different formats.
describe('Pattern B — same email, different phone format (high)', () => {
  it('high confidence when phones already normalize-equal', () => {
    // From real data: Howard Wiarda #236/#536, "574-274-0548" vs "+15742740548".
    // After normalization both phones are the same E.164, so the rule fires.
    // The fixtures therefore carry the already-normalized value on both sides.
    const incoming = candidate({
      id: 'b',
      fullName: 'Howard Wiarda',
      surnameToken: 'wiarda',
      emails: ['hwiarda@hotmail.com'],
      phonesE164: ['+15742740548'],
    });
    const pool = [
      candidate({
        id: 'a',
        fullName: 'Howard Wiarda',
        surnameToken: 'wiarda',
        emails: ['hwiarda@hotmail.com'],
        phonesE164: ['+15742740548'],
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    // Guard the indexed reads: without this, a missing match would surface
    // as a TypeError instead of a readable assertion failure.
    expect(matches).toHaveLength(1);
    expect(matches[0]!.confidence).toBe('high');
    expect(matches[0]!.score).toBeGreaterThanOrEqual(90);
  });
});
|
|
|
|
// Pattern C: resubmission whose raw values differed only in letter case.
describe('Pattern C — name capitalization variant (high)', () => {
  it('treats lowercase + uppercase as the same person when surname-token + email + phone all match', () => {
    // From real data: Nicolas Ruiz #681/#682/#683, email differs only by case.
    // NOTE(review): both fixtures below are identical, which presumably
    // reflects the values after upstream normalization — the casing
    // difference itself is not exercised by this test.
    const incoming = candidate({
      id: 'b',
      fullName: 'Nicolas Ruiz',
      surnameToken: 'ruiz',
      emails: ['ruiz.nicolas@ufl.edu'],
      phonesE164: ['+17862006617'],
    });
    const pool = [
      candidate({
        id: 'a',
        fullName: 'Nicolas Ruiz',
        surnameToken: 'ruiz',
        emails: ['ruiz.nicolas@ufl.edu'],
        phonesE164: ['+17862006617'],
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    // Guard the indexed read so a missing match fails with a clear message.
    expect(matches).toHaveLength(1);
    expect(matches[0]!.confidence).toBe('high');
  });
});
|
|
|
|
// Pattern D: given name shortened on one submission, contact details equal.
describe('Pattern D — name shortening (high)', () => {
  it('Chris vs Christopher with same email + phone scores high', () => {
    // From real data: Chris Allen #700 vs Christopher Allen #534.
    const incoming = candidate({
      id: 'b',
      fullName: 'Chris Allen',
      surnameToken: 'allen',
      emails: ['chris@thundercatsports.com'],
      phonesE164: ['+17814548950'],
    });
    const pool = [
      candidate({
        id: 'a',
        fullName: 'Christopher Allen',
        surnameToken: 'allen',
        emails: ['chris@thundercatsports.com'],
        phonesE164: ['+17814548950'],
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    // Guard the indexed read so a missing match fails with a clear message.
    expect(matches).toHaveLength(1);
    expect(matches[0]!.confidence).toBe('high');
  });
});
|
|
|
|
// Pattern E: a resubmission that contains a typo in one of the fields.
describe('Pattern E — typo on resubmit', () => {
  // Title states "at least medium": the assertions below accept either
  // tier, and the scoring described in the comment lands in medium.
  it('same email + nearly-identical phone (typo in last digits) scores at least medium', () => {
    // Christopher Camazou #649/#650 — phone differs in last 4 digits but
    // everything else matches. Exact phone equality fails; email exact
    // match alone (60) + name-token match (20) puts us in medium tier.
    // The user can confirm the merge.
    const incoming = candidate({
      id: 'b',
      fullName: 'Christopher Camazou',
      surnameToken: 'camazou',
      emails: ['camazou11@gmail.com'],
      phonesE164: ['+33608334455'],
    });
    const pool = [
      candidate({
        id: 'a',
        fullName: 'Christopher Camazou',
        surnameToken: 'camazou',
        emails: ['camazou11@gmail.com'],
        phonesE164: ['+33608336549'],
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    expect(matches).toHaveLength(1);
    // Email + name match without phone match — strong but not certain.
    expect(matches[0]!.confidence).toMatch(/^(high|medium)$/);
    expect(matches[0]!.score).toBeGreaterThanOrEqual(70);
  });

  it('Constanzo / Costanzo surname typo with same email + phone scores high', () => {
    // Gianfranco Di Constanzo #585 vs Di Costanzo #336 — same email + phone
    // and only a 1-letter surname typo. This is a strong "same client,
    // multiple yachts" signal — the design's signature win.
    const incoming = candidate({
      id: 'b',
      fullName: 'Gianfranco Di Constanzo',
      surnameToken: 'constanzo',
      emails: ['gdc@nauticall.com'],
      phonesE164: ['+17542628669'],
    });
    const pool = [
      candidate({
        id: 'a',
        fullName: 'Gianfranco Di Costanzo',
        surnameToken: 'costanzo',
        emails: ['gdc@nauticall.com'],
        phonesE164: ['+17542628669'],
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    // Guard the indexed reads so a missing match fails with a clear message.
    expect(matches).toHaveLength(1);
    expect(matches[0]!.confidence).toBe('high');
    expect(matches[0]!.score).toBeGreaterThanOrEqual(90);
  });
});
|
|
|
|
// Pattern F: look-alike records that belong to different people. These are
// the false-positive guards — neither may ever be reported as 'high'.
describe('Pattern F — hard cases (must NOT auto-merge)', () => {
  it('same name with different country phone + different email scores at most medium', () => {
    // Etiennette Clamouze #188/#717 — same name but completely different
    // email + phone (and the phones are in different country codes,
    // suggesting either a relative, a coworker, or a name-collision).
    // We must NOT classify this as "high" or it would force-merge two
    // distinct people.
    const incoming = candidate({
      id: 'b',
      fullName: 'Etiennette Clamouze',
      surnameToken: 'clamouze',
      emails: ['etiennette@the-manoah.com'],
      phonesE164: ['+12645815607'],
      countryIso: 'AI',
    });
    const pool = [
      candidate({
        id: 'a',
        fullName: 'Etiennette Clamouze',
        surnameToken: 'clamouze',
        emails: ['clamouze.etiennette@gmail.com'],
        phonesE164: ['+33767780640'],
        countryIso: 'FR',
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    // The pair shares only a name, so it may surface for review or not at
    // all; whichever happens, nothing returned may reach the high tier.
    // Looping (rather than a conditional expect on the first element)
    // checks every returned match and passes trivially on an empty result.
    for (const match of matches) {
      expect(match.confidence).not.toBe('high');
      expect(match.score).toBeLessThan(90);
    }
  });

  // Title reflects the fixture: the two records share neither name, email
  // nor phone, and the only acceptable outcomes are "no match" or 'low'.
  it('different names, emails and phones never surface above low', () => {
    // Bruno Joyerot #18 vs Bruce Hearn #19 — Bruno's row shows email
    // belonging to "catherine elaine hearn" (Bruce's spouse). Same
    // household phone area code. Name overlap is partial. Don't merge.
    const incoming = candidate({
      id: 'b',
      fullName: 'Bruce Hearn',
      surnameToken: 'hearn',
      emails: ['bhearn1063@gmail.com'],
      phonesE164: ['+12642358840'],
    });
    const pool = [
      candidate({
        id: 'a',
        fullName: 'Bruno Joyerot',
        surnameToken: 'joyerot',
        emails: ['catherineelainehearn@gmail.com'],
        phonesE164: ['+12642352816'],
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    // Names don't match, emails don't match, phones differ — there's
    // no reason for this to surface at all. Either no match or low.
    for (const match of matches) {
      expect(match.confidence).toBe('low');
    }
  });
});
|
|
|
|
// Negative evidence: a shared email must be discounted when the two phone
// numbers belong to different countries.
describe('Negative evidence — same email but different country phone', () => {
  it('reduces score when email matches but phone country differs', () => {
    // Constructed: same email, but one phone is +33 (FR) and the other
    // is +1 (US). Likely a shared-inbox spouse situation. We want
    // medium tier so it lands in review, not high tier.
    const incoming = candidate({
      id: 'b',
      fullName: 'Test User',
      surnameToken: 'user',
      emails: ['shared@example.com'],
      phonesE164: ['+15551234567'],
      countryIso: 'US',
    });
    const pool = [
      candidate({
        id: 'a',
        fullName: 'Test User',
        surnameToken: 'user',
        emails: ['shared@example.com'],
        phonesE164: ['+33611111111'],
        countryIso: 'FR',
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    // Guard the indexed reads so a missing match fails with a clear message.
    expect(matches).toHaveLength(1);
    // Email match alone would be 60 + name token match 20 = 80 (medium).
    // Negative evidence (different phone country) brings it down further.
    // The tier check alone cannot prove the penalty fired, because 80 is
    // also medium — so the score itself must sit below the un-penalised 80.
    expect(matches[0]!.score).toBeLessThan(80);
    expect(matches[0]!.confidence).toBe('medium');
  });
});
|
|
|
|
// Blocking: pool entries that share no email, phone or surname token with
// the incoming record must not appear in the result at all.
describe('Blocking — only relevant candidates are scored', () => {
  it('does not score candidates with no shared emails / phones / surname token', () => {
    const incoming = candidate({
      id: 'newbie',
      fullName: 'Alice Smith',
      surnameToken: 'smith',
      emails: ['alice@example.com'],
      phonesE164: ['+15551234567'],
    });
    const pool = [
      candidate({
        id: 'unrelated1',
        fullName: 'Bob Jones',
        surnameToken: 'jones',
        emails: ['bob@example.org'],
        phonesE164: ['+33611111111'],
      }),
      candidate({
        id: 'unrelated2',
        fullName: 'Carol White',
        surnameToken: 'white',
        emails: ['carol@example.net'],
        phonesE164: ['+447700900111'],
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    expect(matches).toHaveLength(0);
  });
});
|
|
|
|
// Edge case: with nothing to compare against, the result is an empty list.
describe('Empty pool', () => {
  it('returns no matches when the pool is empty', () => {
    // surnameToken and phonesE164 are deliberately omitted; the fixture
    // builder fills them with null / [].
    const incoming = candidate({
      id: 'a',
      fullName: 'Alice',
      emails: ['alice@example.com'],
    });

    expect(findClientMatches(incoming, [], THRESHOLDS)).toEqual([]);
  });
});
|
|
|
|
// Ordering contract: the strongest match must come first in the result.
describe('Sort order', () => {
  it('returns matches sorted by score descending', () => {
    const incoming = candidate({
      id: 'incoming',
      fullName: 'John Smith',
      surnameToken: 'smith',
      emails: ['john@example.com'],
      phonesE164: ['+15551234567'],
    });
    const pool = [
      candidate({
        // High match — same email + phone
        id: 'high-match',
        fullName: 'John Smith',
        surnameToken: 'smith',
        emails: ['john@example.com'],
        phonesE164: ['+15551234567'],
      }),
      candidate({
        // Weaker match — same email only; name and phone country differ.
        // NOTE(review): despite the id, the exact tier (medium vs. low) is
        // not asserted here — only that it ranks below 'high-match'.
        id: 'medium-match',
        fullName: 'Different Person',
        surnameToken: 'person',
        emails: ['john@example.com'],
        phonesE164: ['+33611111111'],
      }),
    ];

    const matches = findClientMatches(incoming, pool, THRESHOLDS);

    // The length check comes first so both indexed reads below are safe.
    expect(matches.length).toBeGreaterThanOrEqual(2);
    expect(matches[0]!.candidate.id).toBe('high-match');
    expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
  });
});
|
|
});
|