/**
 * Normalization library — unit tests.
 *
 * Every fixture here comes from real dirty values observed in the legacy
 * NocoDB Interests table during the 2026-05-03 audit (see
 * docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.3).
 * The point is regression-prevention: if any of these patterns ever
 * stops normalizing the way it should, dedup quality silently drops.
 */
import { describe, expect, it } from 'vitest';

import {
  normalizeName,
  normalizeEmail,
  normalizePhone,
  resolveCountry,
} from '@/lib/dedup/normalize';

// ─── normalizeName ──────────────────────────────────────────────────────────
// Asserts on the three fields read from the result in this suite:
// `display`, `normalized`, and `surnameToken`.
describe('normalizeName', () => {
  // Only '' and whitespace-only strings are exercised; `null` is not passed.
  it('returns empty fields for empty / whitespace-only input', () => {
    expect(normalizeName('')).toEqual({ display: '', normalized: '', surnameToken: undefined });
    expect(normalizeName(' ')).toEqual({
      display: '',
      normalized: '',
      surnameToken: undefined,
    });
  });

  it('trims leading/trailing whitespace', () => {
    expect(normalizeName(' Marcus Laurent ')).toMatchObject({
      display: 'Marcus Laurent',
      normalized: 'marcus laurent',
    });
  });

  it('collapses repeated internal whitespace to a single space', () => {
    // Names from real data: "Arthur Matthews" (#183), "Corinne Roche" (#208).
    // The inputs deliberately carry runs of 2 and 3 spaces; with a single
    // space this test would pass even if collapsing were removed entirely.
    expect(normalizeName('Arthur  Matthews').display).toBe('Arthur Matthews');
    expect(normalizeName('Corinne   Roche').display).toBe('Corinne Roche');
  });

  it('replaces embedded carriage returns and newlines with single spaces', () => {
    // From real data: "Andrei \nVAGNANOV" (#178), "Daniel\r PRZEDBORSKI" (#175).
    expect(normalizeName('Andrei \nVAGNANOV').display).toBe('Andrei Vagnanov');
    expect(normalizeName('Daniel\r PRZEDBORSKI').display).toBe('Daniel Przedborski');
  });

  it('title-cases ALL-CAPS surnames while keeping given name title-cased', () => {
    // From real data: "Jona ANDERSEN" (#232), "Duane SALTSGAVER" (#227),
    // "Marcos DALLA PRIA" (#165).
    expect(normalizeName('Jona ANDERSEN').display).toBe('Jona Andersen');
    expect(normalizeName('Duane SALTSGAVER').display).toBe('Duane Saltsgaver');
    // Particle 'dalla' stays lowercase mid-name.
    expect(normalizeName('Marcos DALLA PRIA').display).toBe('Marcos dalla Pria');
  });

  it('title-cases lowercased entries', () => {
    // From real data: "antony amaral" (#665), "david rosenbloom" (#239),
    // "john Tickner" (#247).
    expect(normalizeName('antony amaral').display).toBe('Antony Amaral');
    expect(normalizeName('david rosenbloom').display).toBe('David Rosenbloom');
    expect(normalizeName('john Tickner').display).toBe('John Tickner');
  });

  it('keeps Romance and Germanic particles lowercase mid-name', () => {
    // From real data: "Olav van Velsen" (#526), "Bruno Joyerot" (#18),
    // "OLIVIER DAIN" (#677). Also synthetic "Carla de la Cruz".
    expect(normalizeName('Olav van Velsen').display).toBe('Olav van Velsen');
    expect(normalizeName('Carla de la Cruz').display).toBe('Carla de la Cruz');
    expect(normalizeName('OLIVIER DAIN').display).toBe('Olivier Dain');
  });

  it("preserves O'-prefixed Irish surnames as title-case", () => {
    expect(normalizeName("liam o'brien").display).toBe("Liam O'Brien");
  });

  it('keeps the slash-with-company structure intact', () => {
    // From real data: "Daniel Wainstein / 7 Knots, LLC" (#637),
    // "Bruno Joyerot / SAS TIKI" (#18).
    expect(normalizeName('Daniel Wainstein / 7 Knots, LLC').display).toBe(
      'Daniel Wainstein / 7 Knots, LLC',
    );
    expect(normalizeName('Bruno Joyerot / SAS TIKI').display).toBe('Bruno Joyerot / SAS TIKI');
  });

  it('exposes the last non-particle token as surnameToken (lowercase) for blocking', () => {
    expect(normalizeName('Marcus Laurent').surnameToken).toBe('laurent');
    expect(normalizeName('Olav van Velsen').surnameToken).toBe('velsen');
    expect(normalizeName('Carla de la Cruz').surnameToken).toBe('cruz');
    expect(normalizeName("Liam O'Brien").surnameToken).toBe("o'brien");
  });

  it('handles single-token names — surnameToken is the only token', () => {
    expect(normalizeName('Madonna').surnameToken).toBe('madonna');
  });

  it('produces a normalized form that is always lowercase', () => {
    expect(normalizeName('Andrei VAGNANOV').normalized).toBe('andrei vagnanov');
    expect(normalizeName('Daniel Wainstein / 7 Knots, LLC').normalized).toBe(
      'daniel wainstein / 7 knots, llc',
    );
  });
});

// ─── normalizeEmail ─────────────────────────────────────────────────────────
// Every assertion here expects either a lowercase string or `null`.
describe('normalizeEmail', () => {
  // Only '' and whitespace-only strings are exercised; `null` is not passed.
  it('returns null for empty / null inputs', () => {
    expect(normalizeEmail('')).toBeNull();
    expect(normalizeEmail(' ')).toBeNull();
  });

  it('lowercases and trims', () => {
    // From real data: "Arthur@laser-align.com" vs "arthur@laser-align.com" (#183/#686).
    expect(normalizeEmail('Arthur@laser-align.com')).toBe('arthur@laser-align.com');
    expect(normalizeEmail(' marcus@example.com ')).toBe('marcus@example.com');
  });

  it('lowercases capitalized localparts', () => {
    // From real data: "Bmalone850@gmail.com" (#489), "Hef355@yahoo.com" (#533),
    // "Donclaytonmusic@gmail.com" (#679).
    expect(normalizeEmail('Bmalone850@gmail.com')).toBe('bmalone850@gmail.com');
    expect(normalizeEmail('Hef355@yahoo.com')).toBe('hef355@yahoo.com');
    // The third cited fixture is asserted too, so all three are regression-guarded.
    expect(normalizeEmail('Donclaytonmusic@gmail.com')).toBe('donclaytonmusic@gmail.com');
  });

  it('preserves plus-aliases — both legitimate and tricks', () => {
    // Per design §3.2: "+aliases" are not stripped. Compare by full localpart.
    expect(normalizeEmail('marcus+sales@example.com')).toBe('marcus+sales@example.com');
  });

  it('returns null for invalid email shapes', () => {
    expect(normalizeEmail('not-an-email')).toBeNull();
    expect(normalizeEmail('@example.com')).toBeNull();
    expect(normalizeEmail('user@')).toBeNull();
    expect(normalizeEmail('user@.com')).toBeNull();
  });
});

// ─── normalizePhone ─────────────────────────────────────────────────────────
// Called as normalizePhone(raw, defaultCountry?). The suite reads `e164`,
// `country`, and `flagged` from the result and uses `?.` wherever the result
// itself may be null.
describe('normalizePhone', () => {
  // Only '' and whitespace-only strings are exercised; `null` is not passed.
  it('returns null for empty / whitespace / null', () => {
    expect(normalizePhone('', 'AI')).toBeNull();
    expect(normalizePhone(' ', 'AI')).toBeNull();
  });

  it('parses a plain E.164 number', () => {
    expect(normalizePhone('+15742740548', 'US')).toMatchObject({
      e164: '+15742740548',
      country: 'US',
    });
  });

  it('strips embedded carriage returns and trailing whitespace', () => {
    // From real data: "+1-264-235-8840\r" (#19), "+1-264-772-3272\r" (#20).
    const out = normalizePhone('+1-264-235-8840\r', 'AI');
    expect(out?.e164).toBe('+12642358840');
  });

  it('strips dashes, dots, parens, single quotes, spaces in a single pass', () => {
    // From real data: "'+1.214.603.4235" (#205), "574-274-0548" (#236),
    // "+1-264-235-8840" (#19), "+1 (212) 555-0123" (synthetic).
    expect(normalizePhone("'+1.214.603.4235", 'US')?.e164).toBe('+12146034235');
    expect(normalizePhone('574-274-0548', 'US')?.e164).toBe('+15742740548');
    expect(normalizePhone('+1 (212) 555-0123', 'US')?.e164).toBe('+12125550123');
  });

  it('converts a leading 00 prefix to + (international dialling)', () => {
    // From real data: "00447956657022" (#216), "0033651381036" (#702).
    expect(normalizePhone('00447956657022', 'GB')?.e164).toBe('+447956657022');
    expect(normalizePhone('0033651381036', 'FR')?.e164).toBe('+33651381036');
  });

  it('uses defaultCountry when input has no international prefix', () => {
    // From real data: "0690699699" (#203, French local), "0651381036" (#701).
    expect(normalizePhone('0690699699', 'FR')?.e164).toBe('+33690699699');
    expect(normalizePhone('0651381036', 'FR')?.e164).toBe('+33651381036');
  });

  it('returns null when there is no prefix AND no defaultCountry', () => {
    // The migration script flags these for human review.
    const out = normalizePhone('5742740548');
    // Accepts either a null result or a result object whose e164 is nullish.
    expect(out?.e164 ?? null).toBeNull();
  });

  it('flags placeholder all-zeros numbers and returns null', () => {
    // From real data: "+447000000000" (#641, "Milos Vitkovic" — clearly fake).
    const out = normalizePhone('+447000000000', 'GB');
    expect(out?.flagged).toBe('placeholder');
    expect(out?.e164).toBeNull();
  });

  it('flags multi-number fields and uses the first segment', () => {
    // From real data: "0677580750/0690511494" (#209). Other separators: ; ,
    // Only the "/" and ";" separators are asserted below.
    const slash = normalizePhone('0677580750/0690511494', 'FR');
    expect(slash?.flagged).toBe('multi_number');
    expect(slash?.e164).toBe('+33677580750');

    const semi = normalizePhone('+33611111111;+33622222222', 'FR');
    expect(semi?.flagged).toBe('multi_number');
    expect(semi?.e164).toBe('+33611111111');
  });

  it('flags genuinely unparseable input as `unparseable`', () => {
    const out = normalizePhone('xyz-not-a-phone', 'US');
    expect(out?.flagged).toBe('unparseable');
    expect(out?.e164).toBeNull();
  });

  it('strips an apostrophe-prefix without breaking the parse', () => {
    // From real data: leading "'" copy-pasted from spreadsheets escapes
    // numeric-cell coercion. Should be invisible to dedup.
    expect(normalizePhone("'0690699699", 'FR')?.e164).toBe('+33690699699');
  });

  it('returns the country alongside the E.164 form', () => {
    expect(normalizePhone('+33690699699', 'FR')).toMatchObject({
      e164: '+33690699699',
      country: 'FR',
    });
  });
});

// ─── resolveCountry ─────────────────────────────────────────────────────────
// Results are compared as `{ iso, confidence }`; the confidence tiers asserted
// in this suite are 'exact', 'city', and null.
describe('resolveCountry', () => {
  // Only '' and whitespace-only strings are exercised; null/undefined are not passed.
  it('returns null for empty / nullish input', () => {
    expect(resolveCountry('')).toEqual({ iso: null, confidence: null });
    expect(resolveCountry(' ')).toEqual({ iso: null, confidence: null });
  });

  it('exact-matches a canonical English country name', () => {
    expect(resolveCountry('Anguilla')).toEqual({ iso: 'AI', confidence: 'exact' });
    expect(resolveCountry('United Kingdom')).toEqual({ iso: 'GB', confidence: 'exact' });
    expect(resolveCountry('United States')).toEqual({ iso: 'US', confidence: 'exact' });
  });

  it('matches case-insensitively', () => {
    expect(resolveCountry('anguilla').iso).toBe('AI');
    expect(resolveCountry('UNITED KINGDOM').iso).toBe('GB');
  });

  it('matches values with surrounding whitespace', () => {
    expect(resolveCountry(' United States ').iso).toBe('US');
  });

  it('handles diacritic variants of Saint-Barthélemy', () => {
    // From real data: "Saint barthelemy" (#203), "St Barth" (#208), "Saint-Barthélemy".
    expect(resolveCountry('Saint-Barthélemy').iso).toBe('BL');
    expect(resolveCountry('Saint Barthelemy').iso).toBe('BL');
    expect(resolveCountry('saint barthelemy').iso).toBe('BL');
    expect(resolveCountry('St Barth').iso).toBe('BL');
  });

  it('resolves common abbreviations', () => {
    expect(resolveCountry('USA').iso).toBe('US');
    expect(resolveCountry('UK').iso).toBe('GB');
  });

  it('falls back to a city → country mapping for high-frequency cities', () => {
    // From real data: "Kansas City" (#198), "Sag Harbor Y" (#239).
    expect(resolveCountry('Kansas City').iso).toBe('US');
    expect(resolveCountry('Sag Harbor Y').iso).toBe('US');
  });

  it('marks the confidence tier appropriately', () => {
    expect(resolveCountry('Anguilla').confidence).toBe('exact');
    expect(resolveCountry('Kansas City').confidence).toBe('city');
  });

  it('returns null + null for unresolvable values', () => {
    // Migration script flags these for human review rather than guessing.
    expect(resolveCountry('asdfghjkl xyz')).toEqual({ iso: null, confidence: null });
    expect(resolveCountry('Mars')).toEqual({ iso: null, confidence: null });
  });
});