Merge feat/dedup-migration: client dedup library + NocoDB migration script + admin queue

# Conflicts:
#	.gitignore
#	src/lib/db/migrations/meta/_journal.json
This commit is contained in:
Matt Ciaccio
2026-05-03 16:24:13 +02:00
32 changed files with 25389 additions and 1 deletions

View File

@@ -0,0 +1,183 @@
/**
* Client merge service — end-to-end integration test.
*
* Spins up two real clients in a real port via the factory helpers,
* attaches a few satellites to the loser (contact, note, interest),
* merges them, and asserts everything survived in the right place
* with the merge log written.
*/
import { describe, expect, it } from 'vitest';
import { eq } from 'drizzle-orm';
import { db } from '@/lib/db';
import { clients, clientContacts, clientNotes, clientMergeLog } from '@/lib/db/schema/clients';
import { interests } from '@/lib/db/schema/interests';
import { mergeClients } from '@/lib/services/client-merge.service';
import { makeClient, makePort, makeBerth } from '../../helpers/factories';
describe('mergeClients', () => {
  // Happy path: every satellite row hanging off the duplicate must end up on
  // the survivor, the duplicate must be archived, and a log row must exist.
  it('moves interests and contacts from loser to winner; archives loser; writes merge log', async () => {
    const { id: portId } = await makePort();
    const survivor = await makeClient({
      portId,
      overrides: { fullName: 'Marcus Laurent' },
    });
    const duplicate = await makeClient({
      portId,
      overrides: { fullName: 'Marcus Laurent (dup)' },
    });

    // Seed the duplicate with one contact, one note and one interest.
    await db.insert(clientContacts).values({
      clientId: duplicate.id,
      channel: 'email',
      value: 'marcus@example.com',
      isPrimary: true,
    });
    await db.insert(clientNotes).values({
      clientId: duplicate.id,
      authorId: 'test-user',
      content: 'Loser-side note',
    });
    const berth = await makeBerth({ portId });
    await db.insert(interests).values({
      portId,
      clientId: duplicate.id,
      berthId: berth.id,
      pipelineStage: 'open',
      leadCategory: 'general_interest',
    });

    // ── Merge ───────────────────────────────────────────────────────────
    const outcome = await mergeClients({
      winnerId: survivor.id,
      loserId: duplicate.id,
      mergedBy: 'test-user',
    });
    expect(outcome.movedRows).toMatchObject({ interests: 1, contacts: 1, notes: 1 });

    // ── The duplicate is archived and points at the survivor ────────────
    const archivedRows = await db.select().from(clients).where(eq(clients.id, duplicate.id));
    const archived = archivedRows[0];
    expect(archived?.archivedAt).not.toBeNull();
    expect(archived?.mergedIntoClientId).toBe(survivor.id);

    // ── Satellite rows are now owned by the survivor ────────────────────
    const survivorInterests = await db
      .select()
      .from(interests)
      .where(eq(interests.clientId, survivor.id));
    expect(survivorInterests).toHaveLength(1);

    const survivorContacts = await db
      .select()
      .from(clientContacts)
      .where(eq(clientContacts.clientId, survivor.id));
    expect(survivorContacts.some((row) => row.value === 'marcus@example.com')).toBe(true);

    const survivorNotes = await db
      .select()
      .from(clientNotes)
      .where(eq(clientNotes.clientId, survivor.id));
    expect(survivorNotes.some((row) => row.content === 'Loser-side note')).toBe(true);

    // ── The merge log row is present and carries details ────────────────
    const logRows = await db
      .select()
      .from(clientMergeLog)
      .where(eq(clientMergeLog.id, outcome.mergeLogId));
    const entry = logRows[0];
    expect(entry?.survivingClientId).toBe(survivor.id);
    expect(entry?.mergedClientId).toBe(duplicate.id);
    expect(entry?.mergedBy).toBe('test-user');
    expect(entry?.mergeDetails).toBeDefined();
  });

  // Guard: winner and loser must be two different clients.
  it('refuses to merge a client into itself', async () => {
    const { id: portId } = await makePort();
    const only = await makeClient({ portId });

    const attempt = mergeClients({ winnerId: only.id, loserId: only.id, mergedBy: 'u' });
    await expect(attempt).rejects.toThrow(/itself/i);
  });

  // Guard: both clients must live in the same port.
  it('refuses to merge across different ports', async () => {
    const firstPort = await makePort();
    const secondPort = await makePort();
    const inFirst = await makeClient({ portId: firstPort.id });
    const inSecond = await makeClient({ portId: secondPort.id });

    const attempt = mergeClients({ winnerId: inFirst.id, loserId: inSecond.id, mergedBy: 'u' });
    await expect(attempt).rejects.toThrow(/different ports/i);
  });

  // Guard: a client that was already merged away cannot be merged again.
  it('refuses to merge a client that has already been merged', async () => {
    const { id: portId } = await makePort();
    const firstSurvivor = await makeClient({ portId });
    const duplicate = await makeClient({ portId });

    // The initial merge is expected to go through.
    await mergeClients({ winnerId: firstSurvivor.id, loserId: duplicate.id, mergedBy: 'u' });

    // Re-using the same loser against a fresh winner must be rejected.
    const secondSurvivor = await makeClient({ portId });
    const attempt = mergeClients({
      winnerId: secondSurvivor.id,
      loserId: duplicate.id,
      mergedBy: 'u',
    });
    await expect(attempt).rejects.toThrow(/already merged/i);
  });

  // When both sides hold the same contact value, only one copy may remain.
  it('drops duplicate contact rows during reattach', async () => {
    const { id: portId } = await makePort();
    const survivor = await makeClient({ portId });
    const duplicate = await makeClient({ portId });

    // Identical e-mail contact on each side, survivor first.
    await db.insert(clientContacts).values({
      clientId: survivor.id,
      channel: 'email',
      value: 'same@example.com',
      isPrimary: true,
    });
    await db.insert(clientContacts).values({
      clientId: duplicate.id,
      channel: 'email',
      value: 'same@example.com',
      isPrimary: true,
    });

    const outcome = await mergeClients({
      winnerId: survivor.id,
      loserId: duplicate.id,
      mergedBy: 'u',
    });
    // The duplicate's copy is dropped, so nothing counts as moved.
    expect(outcome.movedRows.contacts).toBe(0);

    const survivorContacts = await db
      .select()
      .from(clientContacts)
      .where(eq(clientContacts.clientId, survivor.id));
    const sharedCopies = survivorContacts.filter((row) => row.value === 'same@example.com');
    expect(sharedCopies).toHaveLength(1);
  });

  // fieldChoices lets the caller pick the loser's value for a given field.
  it('applies fieldChoices to copy loser values onto the winner', async () => {
    const { id: portId } = await makePort();
    const survivor = await makeClient({
      portId,
      overrides: { fullName: 'Marcus L.' },
    });
    const duplicate = await makeClient({
      portId,
      overrides: { fullName: 'Marcus Laurent' },
    });

    await mergeClients({
      winnerId: survivor.id,
      loserId: duplicate.id,
      mergedBy: 'u',
      fieldChoices: { fullName: 'loser' },
    });

    const refreshedRows = await db.select().from(clients).where(eq(clients.id, survivor.id));
    expect(refreshedRows[0]?.fullName).toBe('Marcus Laurent');
  });
});

View File

@@ -0,0 +1,157 @@
/**
* Match-candidates API — integration test.
*
* Exercises the GET /api/v1/clients/match-candidates handler against a
* real port + clients pool. Verifies the dedup library's at-create
* suggestion path returns the right candidates and confidence tiers
* for the "use existing client?" form interruption.
*/
import { describe, expect, it } from 'vitest';
import { db } from '@/lib/db';
import { clientContacts } from '@/lib/db/schema/clients';
import { getMatchCandidatesHandler } from '@/app/api/v1/clients/match-candidates/handlers';
import { makeMockCtx, makeMockRequest } from '../../helpers/route-tester';
import { makeClient, makePort } from '../../helpers/factories';
/**
 * Shape of one entry in the handler's `data` array, as read by the tests
 * in this file.
 */
interface MatchData {
// Id of the existing client proposed as a match.
clientId: string;
// NOTE(review): presumably the existing client's display name — not asserted in this file.
fullName: string;
// NOTE(review): presumably the numeric match score — not asserted in this file.
score: number;
// Confidence tier; the tests below assert 'high' or "not 'high'".
confidence: 'high' | 'medium' | 'low';
// Reason labels; the tests below expect 'email match' and 'phone match'.
reasons: string[];
// NOTE(review): presumably the number of interests on the client — not asserted in this file.
interestCount: number;
}
/**
 * Invokes the match-candidates GET handler with the given query parameters
 * and returns the `data` payload of its JSON body.
 *
 * Asserts a 200 status as a side effect, so callers only deal with the
 * success shape.
 */
async function callHandler(
  ctx: ReturnType<typeof makeMockCtx>,
  query: Record<string, string>,
): Promise<MatchData[]> {
  const endpoint = new URL('http://localhost/api/v1/clients/match-candidates');
  Object.entries(query).forEach(([key, value]) => {
    endpoint.searchParams.set(key, value);
  });

  const request = makeMockRequest('GET', endpoint.toString());
  const response = await getMatchCandidatesHandler(request, ctx);
  expect(response.status).toBe(200);

  const payload = await response.json();
  return payload.data as MatchData[];
}
describe('GET /api/v1/clients/match-candidates', () => {
  // With no query parameters there is nothing to match on.
  it('returns empty when nothing actionable was provided', async () => {
    const port = await makePort();
    const ctx = makeMockCtx({ portId: port.id });
    const data = await callHandler(ctx, {});
    expect(data).toEqual([]);
  });

  // The query e-mail differs from the stored one only by letter case.
  it('finds an existing client by exact email match (high confidence)', async () => {
    const port = await makePort();
    const ctx = makeMockCtx({ portId: port.id });
    const existing = await makeClient({
      portId: port.id,
      overrides: { fullName: 'Marcus Laurent' },
    });
    await db.insert(clientContacts).values({
      clientId: existing.id,
      channel: 'email',
      value: 'marcus@example.com',
      isPrimary: true,
    });
    await db.insert(clientContacts).values({
      clientId: existing.id,
      channel: 'phone',
      value: '+15551234567',
      valueE164: '+15551234567',
      isPrimary: true,
    });

    const data = await callHandler(ctx, {
      email: 'Marcus@example.com',
      phone: '+15551234567',
      name: 'Marcus Laurent',
    });

    expect(data).toHaveLength(1);
    expect(data[0]!.clientId).toBe(existing.id);
    expect(data[0]!.confidence).toBe('high');
    expect(data[0]!.reasons).toEqual(expect.arrayContaining(['email match', 'phone match']));
  });

  it('does not surface unrelated clients in the same port', async () => {
    const port = await makePort();
    const ctx = makeMockCtx({ portId: port.id });
    const target = await makeClient({
      portId: port.id,
      overrides: { fullName: 'Marcus Laurent' },
    });
    await db.insert(clientContacts).values({
      clientId: target.id,
      channel: 'email',
      value: 'marcus@example.com',
      isPrimary: true,
    });
    // An unrelated client with a different name and e-mail.
    const unrelated = await makeClient({
      portId: port.id,
      overrides: { fullName: 'Bob Smith' },
    });
    await db.insert(clientContacts).values({
      clientId: unrelated.id,
      channel: 'email',
      value: 'bob@example.org',
      isPrimary: true,
    });

    const data = await callHandler(ctx, { email: 'marcus@example.com' });

    expect(data.map((d) => d.clientId)).toEqual([target.id]);
  });

  // The title states what is actually asserted: the result may be empty or
  // non-high, but a name-only overlap must never be reported as 'high'.
  it('never reports high confidence for a name-only partial match', async () => {
    // Same name, different contact info — Pattern F territory.
    const port = await makePort();
    const ctx = makeMockCtx({ portId: port.id });
    const existing = await makeClient({
      portId: port.id,
      overrides: { fullName: 'Etiennette Clamouze' },
    });
    await db.insert(clientContacts).values({
      clientId: existing.id,
      channel: 'email',
      value: 'clamouze.etiennette@gmail.com',
      isPrimary: true,
    });

    const data = await callHandler(ctx, {
      // Different email, same name; no phone is supplied in this query.
      email: 'etiennette@the-manoah.com',
      name: 'Etiennette Clamouze',
    });

    // Either no match (low confidence filtered out) or a medium one —
    // either is fine. Critically, NOT high.
    if (data.length > 0) {
      expect(data[0]!.confidence).not.toBe('high');
    }
  });

  it('does not leak across ports', async () => {
    const portA = await makePort();
    const portB = await makePort();
    const ctxA = makeMockCtx({ portId: portA.id });
    const inB = await makeClient({
      portId: portB.id,
      overrides: { fullName: 'In Port B' },
    });
    await db.insert(clientContacts).values({
      clientId: inB.id,
      channel: 'email',
      value: 'b@example.com',
      isPrimary: true,
    });

    // Caller is in port A, asking for an email that lives in port B.
    const data = await callHandler(ctxA, { email: 'b@example.com' });

    expect(data).toEqual([]);
  });
});

View File

@@ -0,0 +1,379 @@
/**
* Match-finding library — unit tests.
*
* Each duplicate cluster from the legacy NocoDB Interests audit (see
* docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.2)
* is encoded as a fixture here. The expected scoring tier (high / medium
* / low) is the design contract — if the algorithm starts returning
* "high" for a Pattern F case (Etiennette / Bruno+Bruce) it has lost
* the false-positive guard and we'll know immediately.
*/
import { describe, expect, it } from 'vitest';
import { findClientMatches, type MatchCandidate } from '@/lib/dedup/find-matches';
// Score cut-offs passed to every findClientMatches call in this file; the
// values follow the thresholds recommended by the design.
const THRESHOLDS = { highScore: 90, mediumScore: 50 };
/**
 * Builds a complete MatchCandidate from a partial one.
 *
 * Only `id` is required. Missing (null or undefined) scalar fields become
 * null and missing list fields become empty arrays.
 */
function candidate(partial: Partial<MatchCandidate> & { id: string }): MatchCandidate {
  const { id, fullName, surnameToken, emails, phonesE164, countryIso } = partial;
  return {
    id,
    fullName: fullName ?? null,
    surnameToken: surnameToken ?? null,
    emails: emails ?? [],
    phonesE164: phonesE164 ?? [],
    countryIso: countryIso ?? null,
  };
}
describe('findClientMatches', () => {
  describe('Pattern A — pure double-submit (high confidence)', () => {
    it('flags identical email + phone as high', () => {
      // From real data: Deepak Ramchandani #624/#625, identical fields.
      const incoming = candidate({
        id: 'b',
        fullName: 'Deepak Ramchandani',
        surnameToken: 'ramchandani',
        emails: ['dannyrams8888@gmail.com'],
        phonesE164: ['+17215868888'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Deepak Ramchandani',
          surnameToken: 'ramchandani',
          emails: ['dannyrams8888@gmail.com'],
          phonesE164: ['+17215868888'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(1);
      expect(matches[0]!.candidate.id).toBe('a');
      expect(matches[0]!.score).toBeGreaterThanOrEqual(90);
      expect(matches[0]!.confidence).toBe('high');
      expect(matches[0]!.reasons).toEqual(expect.arrayContaining(['email match', 'phone match']));
    });
  });

  describe('Pattern B — same email, different phone format (high)', () => {
    it('high confidence when phones already normalize-equal', () => {
      // From real data: Howard Wiarda #236/#536, "574-274-0548" vs "+15742740548".
      // Both candidates here already carry the same E.164 value, so the
      // phone rule fires on plain equality.
      const incoming = candidate({
        id: 'b',
        fullName: 'Howard Wiarda',
        surnameToken: 'wiarda',
        emails: ['hwiarda@hotmail.com'],
        phonesE164: ['+15742740548'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Howard Wiarda',
          surnameToken: 'wiarda',
          emails: ['hwiarda@hotmail.com'],
          phonesE164: ['+15742740548'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches[0]!.confidence).toBe('high');
      expect(matches[0]!.score).toBeGreaterThanOrEqual(90);
    });
  });

  describe('Pattern C — name capitalization variant (high)', () => {
    it('treats lowercase + uppercase as the same person when surname-token + email + phone all match', () => {
      // From real data: Nicolas Ruiz #681/#682/#683, where one source row's
      // email differs only by case. The two candidates below are identical
      // (emails already lowercased), so this checks the post-normalization
      // comparison only.
      const incoming = candidate({
        id: 'b',
        fullName: 'Nicolas Ruiz',
        surnameToken: 'ruiz',
        emails: ['ruiz.nicolas@ufl.edu'],
        phonesE164: ['+17862006617'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Nicolas Ruiz',
          surnameToken: 'ruiz',
          emails: ['ruiz.nicolas@ufl.edu'],
          phonesE164: ['+17862006617'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches[0]!.confidence).toBe('high');
    });
  });

  describe('Pattern D — name shortening (high)', () => {
    it('Chris vs Christopher with same email + phone scores high', () => {
      // From real data: Chris Allen #700 vs Christopher Allen #534.
      const incoming = candidate({
        id: 'b',
        fullName: 'Chris Allen',
        surnameToken: 'allen',
        emails: ['chris@thundercatsports.com'],
        phonesE164: ['+17814548950'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Christopher Allen',
          surnameToken: 'allen',
          emails: ['chris@thundercatsports.com'],
          phonesE164: ['+17814548950'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches[0]!.confidence).toBe('high');
    });
  });

  describe('Pattern E — typo on resubmit', () => {
    // The title states what is actually asserted: high OR medium is accepted
    // and the score floor is 70, so "at least medium" is the contract.
    it('same email + nearly-identical phone (typo in last digits) scores at least medium', () => {
      // Christopher Camazou #649/#650 — phone differs in last 4 digits but
      // everything else matches. Exact phone equality fails; email exact
      // match alone (60) + name-token match (20) puts us in medium tier.
      // The user can confirm the merge.
      const incoming = candidate({
        id: 'b',
        fullName: 'Christopher Camazou',
        surnameToken: 'camazou',
        emails: ['camazou11@gmail.com'],
        phonesE164: ['+33608334455'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Christopher Camazou',
          surnameToken: 'camazou',
          emails: ['camazou11@gmail.com'],
          phonesE164: ['+33608336549'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(1);
      // Email + name match without phone match — strong but not certain.
      expect(matches[0]!.confidence).toMatch(/^(high|medium)$/);
      expect(matches[0]!.score).toBeGreaterThanOrEqual(70);
    });

    it('Constanzo / Costanzo surname typo with same email + phone scores high', () => {
      // Gianfranco Di Constanzo #585 vs Di Costanzo #336 — same email + phone
      // and only a 1-letter surname typo. This is a strong "same client,
      // multiple yachts" signal — the design's signature win.
      const incoming = candidate({
        id: 'b',
        fullName: 'Gianfranco Di Constanzo',
        surnameToken: 'constanzo',
        emails: ['gdc@nauticall.com'],
        phonesE164: ['+17542628669'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Gianfranco Di Costanzo',
          surnameToken: 'costanzo',
          emails: ['gdc@nauticall.com'],
          phonesE164: ['+17542628669'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches[0]!.confidence).toBe('high');
      expect(matches[0]!.score).toBeGreaterThanOrEqual(90);
    });
  });

  describe('Pattern F — hard cases (must NOT auto-merge)', () => {
    it('same name with different country phone + different email scores at most medium', () => {
      // Etiennette Clamouze #188/#717 — same name but completely different
      // email + phone (and the phones are in different country codes,
      // suggesting either a relative, a coworker, or a name-collision).
      // We must NOT classify this as "high" or it would force-merge two
      // distinct people.
      const incoming = candidate({
        id: 'b',
        fullName: 'Etiennette Clamouze',
        surnameToken: 'clamouze',
        emails: ['etiennette@the-manoah.com'],
        phonesE164: ['+12645815607'],
        countryIso: 'AI',
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Etiennette Clamouze',
          surnameToken: 'clamouze',
          emails: ['clamouze.etiennette@gmail.com'],
          phonesE164: ['+33767780640'],
          countryIso: 'FR',
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Surname-token + name-exact match should score in medium tier so
      // the pair lands in the review queue but doesn't auto-merge. An empty
      // result is also accepted.
      if (matches.length > 0) {
        expect(matches[0]!.confidence).not.toBe('high');
        expect(matches[0]!.score).toBeLessThan(90);
      }
    });

    // The fixture below shares no email, phone or name between the two
    // candidates and the assertion only allows 'low' (or no match at all),
    // so the title describes that rather than a shared-email/medium case.
    it('unrelated names with different emails and phones surface as low at most', () => {
      // Bruno Joyerot #18 vs Bruce Hearn #19 — Bruno's row shows email
      // belonging to "catherine elaine hearn" (Bruce's spouse). Same
      // household phone area code. Name overlap is partial. Don't merge.
      const incoming = candidate({
        id: 'b',
        fullName: 'Bruce Hearn',
        surnameToken: 'hearn',
        emails: ['bhearn1063@gmail.com'],
        phonesE164: ['+12642358840'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Bruno Joyerot',
          surnameToken: 'joyerot',
          emails: ['catherineelainehearn@gmail.com'],
          phonesE164: ['+12642352816'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Names don't match, emails don't match, phones differ — there's
      // no reason for this to surface at all. Either no match or low.
      if (matches.length > 0) {
        expect(matches[0]!.confidence).toBe('low');
      }
    });
  });

  describe('Negative evidence — same email but different country phone', () => {
    it('reduces score when email matches but phone country differs', () => {
      // Constructed: same email, but one phone is +33 (FR) and the other
      // is +1 (US). Likely a shared-inbox spouse situation. We want
      // medium tier so it lands in review, not high tier.
      const incoming = candidate({
        id: 'b',
        fullName: 'Test User',
        surnameToken: 'user',
        emails: ['shared@example.com'],
        phonesE164: ['+15551234567'],
        countryIso: 'US',
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Test User',
          surnameToken: 'user',
          emails: ['shared@example.com'],
          phonesE164: ['+33611111111'],
          countryIso: 'FR',
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Email match alone would be 60 + name token match 20 = 80 (medium).
      // Negative evidence (different phone country) brings it down further.
      expect(matches[0]!.confidence).toBe('medium');
    });
  });

  describe('Blocking — only relevant candidates are scored', () => {
    it('does not score candidates with no shared emails / phones / surname token', () => {
      const incoming = candidate({
        id: 'newbie',
        fullName: 'Alice Smith',
        surnameToken: 'smith',
        emails: ['alice@example.com'],
        phonesE164: ['+15551234567'],
      });
      const pool = [
        candidate({
          id: 'unrelated1',
          fullName: 'Bob Jones',
          surnameToken: 'jones',
          emails: ['bob@example.org'],
          phonesE164: ['+33611111111'],
        }),
        candidate({
          id: 'unrelated2',
          fullName: 'Carol White',
          surnameToken: 'white',
          emails: ['carol@example.net'],
          phonesE164: ['+447700900111'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(0);
    });
  });

  describe('Empty pool', () => {
    it('returns no matches when the pool is empty', () => {
      const incoming = candidate({
        id: 'a',
        fullName: 'Alice',
        emails: ['alice@example.com'],
      });

      expect(findClientMatches(incoming, [], THRESHOLDS)).toEqual([]);
    });
  });

  describe('Sort order', () => {
    it('returns matches sorted by score descending', () => {
      const incoming = candidate({
        id: 'incoming',
        fullName: 'John Smith',
        surnameToken: 'smith',
        emails: ['john@example.com'],
        phonesE164: ['+15551234567'],
      });
      const pool = [
        candidate({
          // High match — same email + phone
          id: 'high-match',
          fullName: 'John Smith',
          surnameToken: 'smith',
          emails: ['john@example.com'],
          phonesE164: ['+15551234567'],
        }),
        candidate({
          // Medium match — same email only
          id: 'medium-match',
          fullName: 'Different Person',
          surnameToken: 'person',
          emails: ['john@example.com'],
          phonesE164: ['+33611111111'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches.length).toBeGreaterThanOrEqual(2);
      expect(matches[0]!.candidate.id).toBe('high-match');
      expect(matches[0]!.score).toBeGreaterThan(matches[1]!.score);
    });
  });
});

View File

@@ -0,0 +1,213 @@
/**
* Migration transform — fixture-based regression test.
*
* Feeds the transform a small frozen NocoDB snapshot containing one
* representative row from each duplicate pattern documented in
* docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.2,
* and asserts the resulting plan matches the algorithm's expected
* behavior. If any future change starts merging Pattern F (Etiennette
* Clamouze) or stops merging Pattern A (Deepak Ramchandani), this
* test fails immediately.
*/
import { describe, expect, it } from 'vitest';
import { transformSnapshot } from '@/lib/dedup/migration-transform';
import type { NocoDbRow, NocoDbSnapshot } from '@/lib/dedup/nocodb-source';
/**
 * Fixture helper: lets a test spell out only the columns it cares about
 * (plus the mandatory `Id`) and hands the object back typed as a NocoDbRow.
 * This is a compile-time cast only; no fields are added at runtime.
 */
function row(fields: Partial<NocoDbRow> & { Id: number }): NocoDbRow {
  const typedRow = fields as NocoDbRow;
  return typedRow;
}
// Frozen snapshot fed to transformSnapshot by every test below: 12 interest
// rows covering patterns A, B, C, E and F plus one isolated row. Pattern D
// has no rows in this fixture, and every other table is left empty.
const FIXTURE: NocoDbSnapshot = {
fetchedAt: '2026-05-03T12:00:00.000Z',
berths: [],
residentialInterests: [],
websiteInterestSubmissions: [],
websiteContactFormSubmissions: [],
websiteBerthEoiSupplements: [],
interests: [
// Pattern A: pure double-submit (Deepak Ramchandani #624/#625)
row({
Id: 624,
'Full Name': 'Deepak Ramchandani',
'Email Address': 'dannyrams8888@gmail.com',
'Phone Number': '+17215868888',
'Sales Process Level': 'General Qualified Interest',
}),
row({
Id: 625,
'Full Name': 'Deepak Ramchandani',
'Email Address': 'dannyrams8888@gmail.com',
'Phone Number': '+17215868888',
'Sales Process Level': 'General Qualified Interest',
}),
// Pattern B: phone format variance (Howard Wiarda #236/#536)
row({
Id: 236,
'Full Name': 'Howard Wiarda',
'Email Address': 'hwiarda@hotmail.com',
'Phone Number': '574-274-0548',
'Place of Residence': 'USA',
'Sales Process Level': 'General Qualified Interest',
}),
row({
Id: 536,
'Full Name': 'Howard Wiarda',
'Email Address': 'hwiarda@hotmail.com',
'Phone Number': '+15742740548',
'Sales Process Level': 'General Qualified Interest',
}),
// Pattern C: capitalization variance (Nicolas Ruiz #681/#682/#683 — three rows).
// In this fixture the names are identical; only the email of #683 differs by case.
row({
Id: 681,
'Full Name': 'Nicolas Ruiz',
'Email Address': 'ruiz.nicolas@ufl.edu',
'Phone Number': '+17862006617',
'Sales Process Level': 'General Qualified Interest',
}),
row({
Id: 682,
'Full Name': 'Nicolas Ruiz',
'Email Address': 'ruiz.nicolas@ufl.edu',
'Phone Number': '+17862006617',
'Sales Process Level': 'Specific Qualified Interest',
}),
row({
Id: 683,
'Full Name': 'Nicolas Ruiz',
'Email Address': 'Ruiz.Nicolas@ufl.edu',
'Phone Number': '+17862006617',
'Sales Process Level': 'General Qualified Interest',
}),
// Pattern E: surname typo with same email + phone (Constanzo/Costanzo).
// The two rows carry different yacht names and sales stages.
row({
Id: 336,
'Full Name': 'Gianfranco Di Costanzo',
'Email Address': 'gdc@nauticall.com',
'Phone Number': '+17542628669',
'Yacht Name': 'GEMINI',
'Sales Process Level': 'Contract Signed',
}),
row({
Id: 585,
'Full Name': 'Gianfranco Di Constanzo',
'Email Address': 'gdc@nauticall.com',
'Phone Number': '+17542628669',
'Yacht Name': 'CALYPSO',
'Sales Process Level': 'Signed EOI and NDA',
}),
// Pattern F: same name, different country phones (Etiennette Clamouze)
row({
Id: 188,
'Full Name': 'Etiennette Clamouze',
'Email Address': 'clamouze.etiennette@gmail.com',
'Phone Number': '+33767780640',
'Sales Process Level': 'General Qualified Interest',
}),
row({
Id: 717,
'Full Name': 'Etiennette Clamouze',
'Email Address': 'Etiennette@the-manoah.com',
'Phone Number': '+12645815607',
'Sales Process Level': 'General Qualified Interest',
}),
// Single isolated row to verify non-duplicates pass through
row({
Id: 999,
'Full Name': 'Lone Wolf',
'Email Address': 'lone@example.com',
'Phone Number': '+15551234567',
'Sales Process Level': 'General Qualified Interest',
}),
],
};
describe('transformSnapshot — fixture regression', () => {
  /**
   * Gathers every source row id referenced by the plan's auto-links: each
   * link's `leadSourceId` plus all of its `mergedSourceIds`.
   */
  function collectLinkedSourceIds(plan: ReturnType<typeof transformSnapshot>): Set<number> {
    const linked = new Set<number>();
    for (const link of plan.autoLinks) {
      linked.add(link.leadSourceId);
      for (const merged of link.mergedSourceIds) {
        linked.add(merged);
      }
    }
    return linked;
  }

  it('produces the expected number of clients + interests', () => {
    const plan = transformSnapshot(FIXTURE);

    // 12 input rows → 7 unique clients (Deepak: 1, Wiarda: 1, Ruiz: 1,
    // Constanzo: 1, Etiennette x2: 2, Lone: 1). Etiennette stays as 2
    // because Pattern F is correctly NOT auto-merged.
    expect(plan.stats.outputClients).toBe(7);
    expect(plan.stats.outputInterests).toBe(12); // one per source row
  });

  // The fixture holds clusters for patterns A, B, C and E (no Pattern D rows).
  it('auto-links every Pattern A–E cluster present in the fixture', () => {
    const linkedSourceIds = collectLinkedSourceIds(transformSnapshot(FIXTURE));
    const allLinked = (...ids: number[]) => ids.every((id) => linkedSourceIds.has(id));

    // Pattern A: 624 + 625
    expect(allLinked(624, 625)).toBe(true);
    // Pattern B: 236 + 536
    expect(allLinked(236, 536)).toBe(true);
    // Pattern C: 681 + 682 + 683 (three-way)
    expect(allLinked(681, 682, 683)).toBe(true);
    // Pattern E: 336 + 585
    expect(allLinked(336, 585)).toBe(true);
  });

  it('does NOT auto-link Pattern F (Etiennette Clamouze, different country)', () => {
    const linkedSourceIds = collectLinkedSourceIds(transformSnapshot(FIXTURE));

    // Both Etiennette rows must remain as separate clients.
    expect(linkedSourceIds.has(188)).toBe(false);
    expect(linkedSourceIds.has(717)).toBe(false);
  });

  it('preserves every interest as its own row even when clients merge', () => {
    const plan = transformSnapshot(FIXTURE);

    // Numeric comparator: the default sort would order ids lexicographically.
    const sourceIds = plan.interests.map((i) => i.sourceId).sort((a, b) => a - b);

    expect(sourceIds).toEqual([188, 236, 336, 536, 585, 624, 625, 681, 682, 683, 717, 999]);
  });

  it('maps the legacy 8-stage enum to new pipeline stages', () => {
    const plan = transformSnapshot(FIXTURE);
    const stagesById = new Map(plan.interests.map((i) => [i.sourceId, i.pipelineStage]));

    expect(stagesById.get(681)).toBe('open'); // General Qualified Interest
    expect(stagesById.get(682)).toBe('details_sent'); // Specific Qualified Interest
    expect(stagesById.get(336)).toBe('contract_signed'); // Contract Signed
    expect(stagesById.get(585)).toBe('eoi_signed'); // Signed EOI and NDA
  });

  it('attaches different yachts to one merged Constanzo client', () => {
    const plan = transformSnapshot(FIXTURE);

    const constanzoClient = plan.clients.find(
      (c) => c.sourceIds.includes(336) && c.sourceIds.includes(585),
    );
    expect(constanzoClient).toBeDefined();

    const yachtsForConstanzo = plan.interests
      .filter((i) => i.clientTempId === constanzoClient!.tempId)
      .map((i) => i.yachtName)
      .sort();
    expect(yachtsForConstanzo).toEqual(['CALYPSO', 'GEMINI']);
  });

  it('produces deterministic output (same input → same plan)', () => {
    // Running the transform twice on the same snapshot should give the same
    // plan. Catches order-dependent bugs in the dedup clustering.
    // NOTE(review): only the stats object and the number of auto-links are
    // compared here, not the full plan. Widen this if the plan is confirmed
    // to hold no run-specific values such as generated ids.
    const a = transformSnapshot(FIXTURE);
    const b = transformSnapshot(FIXTURE);

    expect(JSON.stringify(a.stats)).toBe(JSON.stringify(b.stats));
    expect(a.autoLinks.length).toBe(b.autoLinks.length);
  });
});

View File

@@ -0,0 +1,270 @@
/**
* Normalization library — unit tests.
*
* Every fixture here comes from real dirty values observed in the legacy
* NocoDB Interests table during the 2026-05-03 audit (see
* docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.3).
* The point is regression-prevention: if any of these patterns ever
* stops normalizing the way it should, dedup quality silently drops.
*/
import { describe, expect, it } from 'vitest';
import {
normalizeName,
normalizeEmail,
normalizePhone,
resolveCountry,
} from '@/lib/dedup/normalize';
describe('normalizeName', () => {
it('returns null fields for empty / null input', () => {
expect(normalizeName('')).toEqual({ display: '', normalized: '', surnameToken: undefined });
expect(normalizeName(' ')).toEqual({
display: '',
normalized: '',
surnameToken: undefined,
});
});
it('trims leading/trailing whitespace', () => {
expect(normalizeName(' Marcus Laurent ')).toMatchObject({
display: 'Marcus Laurent',
normalized: 'marcus laurent',
});
});
it('collapses repeated internal whitespace to a single space', () => {
// From real data: "Arthur Matthews" (#183), "Corinne Roche" (#208).
expect(normalizeName('Arthur Matthews').display).toBe('Arthur Matthews');
expect(normalizeName('Corinne Roche').display).toBe('Corinne Roche');
});
it('replaces embedded carriage returns and newlines with single spaces', () => {
// From real data: "Andrei \nVAGNANOV" (#178), "Daniel\r PRZEDBORSKI" (#175).
expect(normalizeName('Andrei \nVAGNANOV').display).toBe('Andrei Vagnanov');
expect(normalizeName('Daniel\r PRZEDBORSKI').display).toBe('Daniel Przedborski');
});
it('title-cases ALL-CAPS surnames while keeping given name title-cased', () => {
// From real data: "Jona ANDERSEN" (#232), "Duane SALTSGAVER" (#227),
// "Marcos DALLA PRIA" (#165).
expect(normalizeName('Jona ANDERSEN').display).toBe('Jona Andersen');
expect(normalizeName('Duane SALTSGAVER').display).toBe('Duane Saltsgaver');
// Particle 'dalla' stays lowercase mid-name.
expect(normalizeName('Marcos DALLA PRIA').display).toBe('Marcos dalla Pria');
});
it('title-cases lowercased entries', () => {
// From real data: "antony amaral" (#665), "david rosenbloom" (#239),
// "john Tickner" (#247).
expect(normalizeName('antony amaral').display).toBe('Antony Amaral');
expect(normalizeName('david rosenbloom').display).toBe('David Rosenbloom');
expect(normalizeName('john Tickner').display).toBe('John Tickner');
});
it('keeps Romance and Germanic particles lowercase mid-name', () => {
// From real data: "Olav van Velsen" (#526), "Bruno Joyerot" (#18),
// "OLIVIER DAIN" (#677). Also synthetic "Carla de la Cruz".
expect(normalizeName('Olav van Velsen').display).toBe('Olav van Velsen');
expect(normalizeName('Carla de la Cruz').display).toBe('Carla de la Cruz');
expect(normalizeName('OLIVIER DAIN').display).toBe('Olivier Dain');
});
it('preserves O-prefixed Irish surnames as title-case', () => {
expect(normalizeName("liam o'brien").display).toBe("Liam O'Brien");
});
it('keeps the slash-with-company structure intact', () => {
  // Real "person / company" entries: "Daniel Wainstein / 7 Knots, LLC" (#637)
  // and "Bruno Joyerot / SAS TIKI" (#18). The display form must equal the input.
  const unchanged = ['Daniel Wainstein / 7 Knots, LLC', 'Bruno Joyerot / SAS TIKI'];
  for (const name of unchanged) {
    expect(normalizeName(name).display).toBe(name);
  }
});
it('exposes the last non-particle token as surnameToken (lowercase) for blocking', () => {
  // Each full name is paired with the lowercase token expected in surnameToken.
  const cases = [
    { fullName: 'Marcus Laurent', token: 'laurent' },
    { fullName: 'Olav van Velsen', token: 'velsen' },
    { fullName: 'Carla de la Cruz', token: 'cruz' },
    { fullName: "Liam O'Brien", token: "o'brien" },
  ];
  for (const { fullName, token } of cases) {
    expect(normalizeName(fullName).surnameToken).toBe(token);
  }
});
it('handles single-token names — surnameToken is the only token', () => {
  // With a single word there is nothing else to pick: it is the surname token.
  const { surnameToken } = normalizeName('Madonna');
  expect(surnameToken).toBe('madonna');
});
it('produces a normalized form that is always lowercase', () => {
  // Covers a plain person name and a "person / company" entry.
  const cases = [
    { raw: 'Andrei VAGNANOV', normalized: 'andrei vagnanov' },
    { raw: 'Daniel Wainstein / 7 Knots, LLC', normalized: 'daniel wainstein / 7 knots, llc' },
  ];
  for (const { raw, normalized } of cases) {
    expect(normalizeName(raw).normalized).toBe(normalized);
  }
});
});
// Specification for normalizeEmail: blank and malformed inputs yield null,
// valid addresses come back trimmed and lowercased with plus-aliases kept.
describe('normalizeEmail', () => {
  it('returns null for empty / null inputs', () => {
    for (const blank of ['', ' ']) {
      expect(normalizeEmail(blank)).toBeNull();
    }
  });

  it('lowercases and trims', () => {
    // The first case is from real data: "Arthur@laser-align.com" vs
    // "arthur@laser-align.com" (#183/#686).
    const cases = [
      { raw: 'Arthur@laser-align.com', email: 'arthur@laser-align.com' },
      { raw: ' marcus@example.com ', email: 'marcus@example.com' },
    ];
    for (const { raw, email } of cases) {
      expect(normalizeEmail(raw)).toBe(email);
    }
  });

  it('lowercases capitalized localparts', () => {
    // From real data: "Bmalone850@gmail.com" (#489), "Hef355@yahoo.com" (#533).
    const cases = [
      { raw: 'Bmalone850@gmail.com', email: 'bmalone850@gmail.com' },
      { raw: 'Hef355@yahoo.com', email: 'hef355@yahoo.com' },
    ];
    for (const { raw, email } of cases) {
      expect(normalizeEmail(raw)).toBe(email);
    }
  });

  it('preserves plus-aliases — both legitimate and tricks', () => {
    // Per design §3.2: "+aliases" are not stripped, so the full localpart
    // (alias included) must come back unchanged.
    const aliased = 'marcus+sales@example.com';
    expect(normalizeEmail(aliased)).toBe(aliased);
  });

  it('returns null for invalid email shapes', () => {
    // No "@", missing localpart, missing domain, domain starting with a dot.
    const malformed = ['not-an-email', '@example.com', 'user@', 'user@.com'];
    for (const raw of malformed) {
      expect(normalizeEmail(raw)).toBeNull();
    }
  });
});
// Specification for normalizePhone(raw, defaultCountry?): blank input yields
// null; otherwise the result exposes `e164`, `country` and, for suspicious
// input, a `flagged` reason.
describe('normalizePhone', () => {
  it('returns null for empty / whitespace / null', () => {
    for (const blank of ['', ' ']) {
      expect(normalizePhone(blank, 'AI')).toBeNull();
    }
  });

  it('parses a plain E.164 number', () => {
    const parsed = normalizePhone('+15742740548', 'US');
    expect(parsed).toMatchObject({ e164: '+15742740548', country: 'US' });
  });

  it('strips embedded carriage returns and trailing whitespace', () => {
    // From real data: "+1-264-235-8840\r" (#19), "+1-264-772-3272\r" (#20).
    const parsed = normalizePhone('+1-264-235-8840\r', 'AI');
    expect(parsed?.e164).toBe('+12642358840');
  });

  it('strips dashes, dots, parens, single quotes, spaces in a single pass', () => {
    // From real data: "'+1.214.603.4235" (#205) and "574-274-0548" (#236);
    // "+1 (212) 555-0123" is synthetic. All are parsed with a US default.
    const cases = [
      { raw: "'+1.214.603.4235", e164: '+12146034235' },
      { raw: '574-274-0548', e164: '+15742740548' },
      { raw: '+1 (212) 555-0123', e164: '+12125550123' },
    ];
    for (const { raw, e164 } of cases) {
      expect(normalizePhone(raw, 'US')?.e164).toBe(e164);
    }
  });

  it('converts a leading 00 prefix to + (international dialling)', () => {
    // From real data: "00447956657022" (#216), "0033651381036" (#702).
    const british = normalizePhone('00447956657022', 'GB');
    expect(british?.e164).toBe('+447956657022');
    const french = normalizePhone('0033651381036', 'FR');
    expect(french?.e164).toBe('+33651381036');
  });

  it('uses defaultCountry when input has no international prefix', () => {
    // From real data: "0690699699" (#203, French local), "0651381036" (#701).
    const cases = [
      { raw: '0690699699', e164: '+33690699699' },
      { raw: '0651381036', e164: '+33651381036' },
    ];
    for (const { raw, e164 } of cases) {
      expect(normalizePhone(raw, 'FR')?.e164).toBe(e164);
    }
  });

  it('returns null when there is no prefix AND no defaultCountry', () => {
    // The migration script flags these for human review. Either a null result
    // or a result whose e164 is null/undefined satisfies this check.
    const parsed = normalizePhone('5742740548');
    const e164 = parsed?.e164 ?? null;
    expect(e164).toBeNull();
  });

  it('flags placeholder all-zeros numbers and returns null', () => {
    // From real data: "+447000000000" (#641, "Milos Vitkovic" — clearly fake).
    const parsed = normalizePhone('+447000000000', 'GB');
    expect(parsed?.flagged).toBe('placeholder');
    expect(parsed?.e164).toBeNull();
  });

  it('flags multi-number fields and uses the first segment', () => {
    // "0677580750/0690511494" is from real data (#209); the second case
    // exercises ";" as the separator. Only the first number should be kept.
    const cases = [
      { raw: '0677580750/0690511494', first: '+33677580750' },
      { raw: '+33611111111;+33622222222', first: '+33611111111' },
    ];
    for (const { raw, first } of cases) {
      const parsed = normalizePhone(raw, 'FR');
      expect(parsed?.flagged).toBe('multi_number');
      expect(parsed?.e164).toBe(first);
    }
  });

  it('flags genuinely unparseable input as `unparseable`', () => {
    const parsed = normalizePhone('xyz-not-a-phone', 'US');
    expect(parsed?.flagged).toBe('unparseable');
    expect(parsed?.e164).toBeNull();
  });

  it('strips an apostrophe-prefix without breaking the parse', () => {
    // From real data: a leading "'" copy-pasted from spreadsheets, where it
    // escapes numeric-cell coercion. It should be invisible to dedup.
    const parsed = normalizePhone("'0690699699", 'FR');
    expect(parsed?.e164).toBe('+33690699699');
  });

  it('returns the country alongside the E.164 form', () => {
    const parsed = normalizePhone('+33690699699', 'FR');
    expect(parsed).toMatchObject({ e164: '+33690699699', country: 'FR' });
  });
});
// Specification for resolveCountry: free-text country values map to
// `{ iso, confidence }`, with both fields null when nothing can be resolved.
describe('resolveCountry', () => {
  it('returns null for empty / nullish input', () => {
    for (const blank of ['', ' ']) {
      expect(resolveCountry(blank)).toEqual({ iso: null, confidence: null });
    }
  });

  it('exact-matches a canonical English country name', () => {
    const cases = [
      { name: 'Anguilla', iso: 'AI' },
      { name: 'United Kingdom', iso: 'GB' },
      { name: 'United States', iso: 'US' },
    ];
    for (const { name, iso } of cases) {
      expect(resolveCountry(name)).toEqual({ iso, confidence: 'exact' });
    }
  });

  it('matches case-insensitively', () => {
    const cases = [
      { name: 'anguilla', iso: 'AI' },
      { name: 'UNITED KINGDOM', iso: 'GB' },
    ];
    for (const { name, iso } of cases) {
      expect(resolveCountry(name).iso).toBe(iso);
    }
  });

  it('matches values with surrounding whitespace', () => {
    const { iso } = resolveCountry(' United States ');
    expect(iso).toBe('US');
  });

  it('handles diacritic variants of Saint-Barthélemy', () => {
    // From real data: "Saint barthelemy" (#203), "St Barth" (#208),
    // "Saint-Barthélemy". Every spelling must resolve to BL.
    const spellings = ['Saint-Barthélemy', 'Saint Barthelemy', 'saint barthelemy', 'St Barth'];
    for (const spelling of spellings) {
      expect(resolveCountry(spelling).iso).toBe('BL');
    }
  });

  it('resolves common abbreviations', () => {
    const cases = [
      { name: 'USA', iso: 'US' },
      { name: 'UK', iso: 'GB' },
    ];
    for (const { name, iso } of cases) {
      expect(resolveCountry(name).iso).toBe(iso);
    }
  });

  it('falls back to a city → country mapping for high-frequency cities', () => {
    // From real data: "Kansas City" (#198), "Sag Harbor Y" (#239).
    for (const city of ['Kansas City', 'Sag Harbor Y']) {
      expect(resolveCountry(city).iso).toBe('US');
    }
  });

  it('marks the confidence tier appropriately', () => {
    const cases = [
      { name: 'Anguilla', tier: 'exact' },
      { name: 'Kansas City', tier: 'city' },
    ];
    for (const { name, tier } of cases) {
      expect(resolveCountry(name).confidence).toBe(tier);
    }
  });

  it('returns null + null for unresolvable values', () => {
    // The migration script flags these for human review rather than guessing.
    for (const junk of ['asdfghjkl xyz', 'Mars']) {
      expect(resolveCountry(junk)).toEqual({ iso: null, confidence: null });
    }
  });
});