/**
 * Match-finding library — unit tests.
 *
 * Each duplicate cluster from the legacy NocoDB Interests audit (see
 * docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.2)
 * is encoded as a fixture here. The expected scoring tier (high / medium
 * / low) is the design contract — if the algorithm starts returning
 * "high" for a Pattern F case (Etiennette / Bruno+Bruce) it has lost
 * the false-positive guard and we'll know immediately.
 */
import { describe, expect, it } from 'vitest';

import { findClientMatches, type MatchCandidate } from '@/lib/dedup/find-matches';

// Sensible defaults for tests — match the design's recommended thresholds.
const THRESHOLDS = {
  highScore: 90,
  mediumScore: 50,
};

/**
 * Builds a `MatchCandidate` fixture from a partial description.
 *
 * Only `id` is required. Omitted scalar fields (`fullName`,
 * `surnameToken`, `countryIso`) default to `null`; omitted list fields
 * (`emails`, `phonesE164`) default to an empty array.
 *
 * @param partial - The fields this test case cares about, plus the id.
 * @returns A fully-populated candidate safe to pass to `findClientMatches`.
 */
function candidate(partial: Partial<MatchCandidate> & { id: string }): MatchCandidate {
  return {
    id: partial.id,
    fullName: partial.fullName ?? null,
    surnameToken: partial.surnameToken ?? null,
    emails: partial.emails ?? [],
    phonesE164: partial.phonesE164 ?? [],
    countryIso: partial.countryIso ?? null,
  };
}

describe('findClientMatches', () => {
  describe('Pattern A — pure double-submit (high confidence)', () => {
    it('flags identical email + phone as high', () => {
      // From real data: Deepak Ramchandani #624/#625, identical fields.
      const incoming = candidate({
        id: 'b',
        fullName: 'Deepak Ramchandani',
        surnameToken: 'ramchandani',
        emails: ['dannyrams8888@gmail.com'],
        phonesE164: ['+17215868888'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Deepak Ramchandani',
          surnameToken: 'ramchandani',
          emails: ['dannyrams8888@gmail.com'],
          phonesE164: ['+17215868888'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(1);
      expect(matches[0]!.candidate.id).toBe('a');
      expect(matches[0]!.score).toBeGreaterThanOrEqual(THRESHOLDS.highScore);
      expect(matches[0]!.confidence).toBe('high');
      expect(matches[0]!.reasons).toEqual(expect.arrayContaining(['email match', 'phone match']));
    });
  });

  describe('Pattern B — same email, different phone format (high)', () => {
    it('high confidence when phones already normalize-equal', () => {
      // From real data: Howard Wiarda #236/#536, "574-274-0548" vs "+15742740548".
      // After normalization both phones are the same E.164, so the rule fires.
      const incoming = candidate({
        id: 'b',
        fullName: 'Howard Wiarda',
        surnameToken: 'wiarda',
        emails: ['hwiarda@hotmail.com'],
        phonesE164: ['+15742740548'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Howard Wiarda',
          surnameToken: 'wiarda',
          emails: ['hwiarda@hotmail.com'],
          phonesE164: ['+15742740548'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Guard first so an empty result fails with a readable message
      // instead of a TypeError on matches[0].
      expect(matches).toHaveLength(1);
      expect(matches[0]!.confidence).toBe('high');
      expect(matches[0]!.score).toBeGreaterThanOrEqual(THRESHOLDS.highScore);
    });
  });

  describe('Pattern C — name capitalization variant (high)', () => {
    it('treats lowercase + uppercase as the same person when surname-token + email + phone all match', () => {
      // From real data: Nicolas Ruiz #681/#682/#683, email differs only by case.
      // NOTE(review): both fixtures below carry identical, already-lowercased
      // emails — presumably case-folding happens before candidates reach
      // findClientMatches. Confirm, or add a mixed-case fixture here.
      const incoming = candidate({
        id: 'b',
        fullName: 'Nicolas Ruiz',
        surnameToken: 'ruiz',
        emails: ['ruiz.nicolas@ufl.edu'],
        phonesE164: ['+17862006617'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Nicolas Ruiz',
          surnameToken: 'ruiz',
          emails: ['ruiz.nicolas@ufl.edu'],
          phonesE164: ['+17862006617'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(1);
      expect(matches[0]!.confidence).toBe('high');
    });
  });

  describe('Pattern D — name shortening (high)', () => {
    it('Chris vs Christopher with same email + phone scores high', () => {
      // From real data: Chris Allen #700 vs Christopher Allen #534.
      const incoming = candidate({
        id: 'b',
        fullName: 'Chris Allen',
        surnameToken: 'allen',
        emails: ['chris@thundercatsports.com'],
        phonesE164: ['+17814548950'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Christopher Allen',
          surnameToken: 'allen',
          emails: ['chris@thundercatsports.com'],
          phonesE164: ['+17814548950'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(1);
      expect(matches[0]!.confidence).toBe('high');
    });
  });

  describe('Pattern E — typo on resubmit', () => {
    it('same email + nearly-identical phone (typo in last digits) scores at least medium', () => {
      // Christopher Camazou #649/#650 — phone differs in last 4 digits but
      // everything else matches. Exact phone equality fails; email exact
      // match alone (60) + name-token match (20) puts us in medium tier.
      // The user can confirm the merge.
      const incoming = candidate({
        id: 'b',
        fullName: 'Christopher Camazou',
        surnameToken: 'camazou',
        emails: ['camazou11@gmail.com'],
        phonesE164: ['+33608334455'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Christopher Camazou',
          surnameToken: 'camazou',
          emails: ['camazou11@gmail.com'],
          phonesE164: ['+33608336549'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(1);
      // Email + name match without phone match — strong but not certain,
      // so either tier is acceptable as long as the pair surfaces.
      expect(matches[0]!.confidence).toMatch(/^(high|medium)$/);
      expect(matches[0]!.score).toBeGreaterThanOrEqual(70);
    });

    it('Constanzo / Costanzo surname typo with same email + phone scores high', () => {
      // Gianfranco Di Constanzo #585 vs Di Costanzo #336 — same email + phone
      // and only a 1-letter surname typo. This is a strong "same client,
      // multiple yachts" signal — the design's signature win.
      const incoming = candidate({
        id: 'b',
        fullName: 'Gianfranco Di Constanzo',
        surnameToken: 'constanzo',
        emails: ['gdc@nauticall.com'],
        phonesE164: ['+17542628669'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Gianfranco Di Costanzo',
          surnameToken: 'costanzo',
          emails: ['gdc@nauticall.com'],
          phonesE164: ['+17542628669'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(1);
      expect(matches[0]!.confidence).toBe('high');
      expect(matches[0]!.score).toBeGreaterThanOrEqual(THRESHOLDS.highScore);
    });
  });

  describe('Pattern F — hard cases (must NOT auto-merge)', () => {
    it('same name with different country phone + different email scores at most medium', () => {
      // Etiennette Clamouze #188/#717 — same name but completely different
      // email + phone (and the phones are in different country codes,
      // suggesting either a relative, a coworker, or a name-collision).
      // We must NOT classify this as "high" or it would force-merge two
      // distinct people.
      const incoming = candidate({
        id: 'b',
        fullName: 'Etiennette Clamouze',
        surnameToken: 'clamouze',
        emails: ['etiennette@the-manoah.com'],
        phonesE164: ['+12645815607'],
        countryIso: 'AI',
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Etiennette Clamouze',
          surnameToken: 'clamouze',
          emails: ['clamouze.etiennette@gmail.com'],
          phonesE164: ['+33767780640'],
          countryIso: 'FR',
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Surname-token + name-exact match should score in medium tier so
      // the pair lands in the review queue but doesn't auto-merge. An empty
      // result is also tolerated: the contract here is only "never high".
      if (matches.length > 0) {
        expect(matches[0]!.confidence).not.toBe('high');
        expect(matches[0]!.score).toBeLessThan(THRESHOLDS.highScore);
      }
    });

    it('different names, emails, and phones never surface above low', () => {
      // Bruno Joyerot #18 vs Bruce Hearn #19 — Bruno's row shows email
      // belonging to "catherine elaine hearn" (Bruce's spouse). Same
      // household phone area code. Name overlap is partial. Don't merge.
      const incoming = candidate({
        id: 'b',
        fullName: 'Bruce Hearn',
        surnameToken: 'hearn',
        emails: ['bhearn1063@gmail.com'],
        phonesE164: ['+12642358840'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Bruno Joyerot',
          surnameToken: 'joyerot',
          emails: ['catherineelainehearn@gmail.com'],
          phonesE164: ['+12642352816'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Names don't match, emails don't match, phones differ — there's
      // no reason for this to surface at all. Either no match or low.
      if (matches.length > 0) {
        expect(matches[0]!.confidence).toBe('low');
      }
    });
  });

  describe('Negative evidence — same email but different country phone', () => {
    it('reduces score when email matches but phone country differs', () => {
      // Constructed: same email, but one phone is +33 (FR) and the other
      // is +1 (US). Likely a shared-inbox spouse situation. We want
      // medium tier so it lands in review, not high tier.
      const incoming = candidate({
        id: 'b',
        fullName: 'Test User',
        surnameToken: 'user',
        emails: ['shared@example.com'],
        phonesE164: ['+15551234567'],
        countryIso: 'US',
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Test User',
          surnameToken: 'user',
          emails: ['shared@example.com'],
          phonesE164: ['+33611111111'],
          countryIso: 'FR',
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Email match alone would be 60 + name token match 20 = 80 (medium).
      // Negative evidence (different phone country) brings it down further.
      expect(matches).toHaveLength(1);
      expect(matches[0]!.confidence).toBe('medium');
    });
  });

  describe('Blocking — only relevant candidates are scored', () => {
    it('does not score candidates with no shared emails / phones / surname token', () => {
      const incoming = candidate({
        id: 'newbie',
        fullName: 'Alice Smith',
        surnameToken: 'smith',
        emails: ['alice@example.com'],
        phonesE164: ['+15551234567'],
      });
      const pool = [
        candidate({
          id: 'unrelated1',
          fullName: 'Bob Jones',
          surnameToken: 'jones',
          emails: ['bob@example.org'],
          phonesE164: ['+33611111111'],
        }),
        candidate({
          id: 'unrelated2',
          fullName: 'Carol White',
          surnameToken: 'white',
          emails: ['carol@example.net'],
          phonesE164: ['+447700900111'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(0);
    });
  });

  describe('Empty pool', () => {
    it('returns no matches when the pool is empty', () => {
      const incoming = candidate({
        id: 'a',
        fullName: 'Alice',
        emails: ['alice@example.com'],
      });

      expect(findClientMatches(incoming, [], THRESHOLDS)).toEqual([]);
    });
  });

  describe('Sort order', () => {
    it('returns matches sorted by score descending', () => {
      const incoming = candidate({
        id: 'incoming',
        fullName: 'John Smith',
        surnameToken: 'smith',
        emails: ['john@example.com'],
        phonesE164: ['+15551234567'],
      });
      const pool = [
        candidate({
          // High match — same email + phone
          id: 'high-match',
          fullName: 'John Smith',
          surnameToken: 'smith',
          emails: ['john@example.com'],
          phonesE164: ['+15551234567'],
        }),
        candidate({
          // Medium match — same email only
          id: 'medium-match',
          fullName: 'Different Person',
          surnameToken: 'person',
          emails: ['john@example.com'],
          phonesE164: ['+33611111111'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches.length).toBeGreaterThanOrEqual(2);
      expect(matches[0]!.candidate.id).toBe('high-match');
      expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
    });
  });
});