Files
pn-new-crm/tests/unit/dedup/normalize.test.ts
Matt 221ae5784e chore(autonomous-session): consolidate uncommitted work from prior session
Bundles the prior autonomous-session output that was sitting unstaged:

- Em-dash sweep across src/ + tests/ (en-dash/em-dash to hyphen, ~2280 instances)
- country-flag-icons rollout (CountryFlag component, replaces emoji glyphs that
  never rendered on Windows; lazy-loads the 3x2 SVG index as a single chunk
  after the per-subpath dynamic-import approach silently failed in webpack)
- Admin IA Phase 1+2: 7-domain regroup, 41 to 38 pages, /admin/berths index,
  redirects (ocr to ai, reports to dashboard, invitations to users),
  docs/admin-ia-proposal.md
- Per-template email tester (registry + endpoint + UI on Email admin page)
- Cancel-document mode picker (delete-from-Documenso vs keep-for-audit)
- Dashboard PDF report: 25 widgets, SVG charts, date-range picker, 11 resolvers
- Customize-widgets per-region sortables at xl+ (charts/rails/feed); single
  flat sortable below xl when the layout stacks; per-viewport saved orders
- Audit doc updates capturing each shipped item
- Lint fixes: react-compiler immutability in DonutChart (reduce instead of
  let-reassign), set-state-in-effect disables in CountryFlag and
  UploadForSigning preview-bytes effect, unused 'confirm' destructures in
  interest contract + reservation tabs, unescaped apostrophe in test-template
  card copy
2026-05-23 00:52:59 +02:00

271 lines
11 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Normalization library - unit tests.
*
* Every fixture here comes from real dirty values observed in the legacy
* NocoDB Interests table during the 2026-05-03 audit (see
* docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.3).
* The point is regression-prevention: if any of these patterns ever
* stops normalizing the way it should, dedup quality silently drops.
*/
import { describe, expect, it } from 'vitest';
import {
normalizeName,
normalizeEmail,
normalizePhone,
resolveCountry,
} from '@/lib/dedup/normalize';
/**
 * normalizeName - regression fixtures for display casing, whitespace cleanup,
 * particle handling and the lowercase `normalized` / `surnameToken` keys.
 */
describe('normalizeName', () => {
  it('returns empty fields for empty / whitespace-only input', () => {
    // Blank input yields empty strings (not nulls); surnameToken stays undefined.
    const blank = { display: '', normalized: '', surnameToken: undefined };
    expect(normalizeName('')).toEqual(blank);
    expect(normalizeName(' ')).toEqual(blank);
  });

  it('trims leading/trailing whitespace', () => {
    expect(normalizeName(' Marcus Laurent ')).toMatchObject({
      display: 'Marcus Laurent',
      normalized: 'marcus laurent',
    });
  });

  it('collapses repeated internal whitespace to a single space', () => {
    // Names taken from real data: "Arthur Matthews" (#183), "Corinne Roche" (#208).
    // The gap is built with repeat() so the input is guaranteed to contain a
    // run of spaces and therefore differs from the expected output; with a
    // single-space input this assertion would pass even without collapsing.
    const gap = ' '.repeat(2);
    expect(normalizeName(`Arthur${gap}Matthews`).display).toBe('Arthur Matthews');
    expect(normalizeName(`Corinne${gap}Roche`).display).toBe('Corinne Roche');
  });

  it('replaces embedded carriage returns and newlines with single spaces', () => {
    // From real data: "Andrei \nVAGNANOV" (#178), "Daniel\r PRZEDBORSKI" (#175).
    expect(normalizeName('Andrei \nVAGNANOV').display).toBe('Andrei Vagnanov');
    expect(normalizeName('Daniel\r PRZEDBORSKI').display).toBe('Daniel Przedborski');
  });

  it('title-cases ALL-CAPS surnames while keeping given name title-cased', () => {
    // From real data: "Jona ANDERSEN" (#232), "Duane SALTSGAVER" (#227),
    // "Marcos DALLA PRIA" (#165).
    expect(normalizeName('Jona ANDERSEN').display).toBe('Jona Andersen');
    expect(normalizeName('Duane SALTSGAVER').display).toBe('Duane Saltsgaver');
    // Particle 'dalla' stays lowercase mid-name.
    expect(normalizeName('Marcos DALLA PRIA').display).toBe('Marcos dalla Pria');
  });

  it('title-cases lowercased entries', () => {
    // From real data: "antony amaral" (#665), "david rosenbloom" (#239),
    // "john Tickner" (#247).
    expect(normalizeName('antony amaral').display).toBe('Antony Amaral');
    expect(normalizeName('david rosenbloom').display).toBe('David Rosenbloom');
    expect(normalizeName('john Tickner').display).toBe('John Tickner');
  });

  it('keeps Romance and Germanic particles lowercase mid-name', () => {
    // From real data: "Olav van Velsen" (#526), "OLIVIER DAIN" (#677).
    // "Carla de la Cruz" is synthetic.
    expect(normalizeName('Olav van Velsen').display).toBe('Olav van Velsen');
    expect(normalizeName('Carla de la Cruz').display).toBe('Carla de la Cruz');
    expect(normalizeName('OLIVIER DAIN').display).toBe('Olivier Dain');
  });

  it('preserves O-prefixed Irish surnames as title-case', () => {
    expect(normalizeName("liam o'brien").display).toBe("Liam O'Brien");
  });

  it('keeps the slash-with-company structure intact', () => {
    // From real data: "Daniel Wainstein / 7 Knots, LLC" (#637),
    // "Bruno Joyerot / SAS TIKI" (#18).
    expect(normalizeName('Daniel Wainstein / 7 Knots, LLC').display).toBe(
      'Daniel Wainstein / 7 Knots, LLC',
    );
    expect(normalizeName('Bruno Joyerot / SAS TIKI').display).toBe('Bruno Joyerot / SAS TIKI');
  });

  it('exposes the last non-particle token as surnameToken (lowercase) for blocking', () => {
    expect(normalizeName('Marcus Laurent').surnameToken).toBe('laurent');
    expect(normalizeName('Olav van Velsen').surnameToken).toBe('velsen');
    expect(normalizeName('Carla de la Cruz').surnameToken).toBe('cruz');
    expect(normalizeName("Liam O'Brien").surnameToken).toBe("o'brien");
  });

  it('handles single-token names - surnameToken is the only token', () => {
    expect(normalizeName('Madonna').surnameToken).toBe('madonna');
  });

  it('produces a normalized form that is always lowercase', () => {
    expect(normalizeName('Andrei VAGNANOV').normalized).toBe('andrei vagnanov');
    expect(normalizeName('Daniel Wainstein / 7 Knots, LLC').normalized).toBe(
      'daniel wainstein / 7 knots, llc',
    );
  });
});
// normalizeEmail - each case pairs a raw legacy value with the canonical form
// the suite expects back (or null when the value must be rejected).
describe('normalizeEmail', () => {
  it('returns null for empty / null inputs', () => {
    for (const blank of ['', ' ']) {
      expect(normalizeEmail(blank)).toBeNull();
    }
  });

  it('lowercases and trims', () => {
    // Real-data pair: "Arthur@laser-align.com" vs "arthur@laser-align.com" (#183/#686).
    const fixtures = [
      ['Arthur@laser-align.com', 'arthur@laser-align.com'],
      [' marcus@example.com ', 'marcus@example.com'],
    ] as const;
    for (const [raw, canonical] of fixtures) {
      expect(normalizeEmail(raw)).toBe(canonical);
    }
  });

  it('lowercases capitalized localparts', () => {
    // Seen in real data: "Bmalone850@gmail.com" (#489), "Hef355@yahoo.com" (#533),
    // "Donclaytonmusic@gmail.com" (#679).
    const fixtures = [
      ['Bmalone850@gmail.com', 'bmalone850@gmail.com'],
      ['Hef355@yahoo.com', 'hef355@yahoo.com'],
    ] as const;
    for (const [raw, canonical] of fixtures) {
      expect(normalizeEmail(raw)).toBe(canonical);
    }
  });

  it('preserves plus-aliases - both legitimate and tricks', () => {
    // Design §3.2: "+aliases" are kept, so comparison uses the full localpart.
    const aliased = 'marcus+sales@example.com';
    expect(normalizeEmail(aliased)).toBe('marcus+sales@example.com');
  });

  it('returns null for invalid email shapes', () => {
    const malformed = ['not-an-email', '@example.com', 'user@', 'user@.com'];
    for (const candidate of malformed) {
      expect(normalizeEmail(candidate)).toBeNull();
    }
  });
});
// normalizePhone - fixtures are (raw value, default country) pairs; assertions
// read the `e164`, `country` and `flagged` fields of the result.
describe('normalizePhone', () => {
  it('returns null for empty / whitespace / null', () => {
    for (const blank of ['', ' ']) {
      expect(normalizePhone(blank, 'AI')).toBeNull();
    }
  });

  it('parses a plain E.164 number', () => {
    const parsed = normalizePhone('+15742740548', 'US');
    expect(parsed).toMatchObject({ e164: '+15742740548', country: 'US' });
  });

  it('strips embedded carriage returns and trailing whitespace', () => {
    // Real data carries a trailing "\r": "+1-264-235-8840\r" (#19), "+1-264-772-3272\r" (#20).
    expect(normalizePhone('+1-264-235-8840\r', 'AI')?.e164).toBe('+12642358840');
  });

  it('strips dashes, dots, parens, single quotes, spaces in a single pass', () => {
    // Real data: "'+1.214.603.4235" (#205), "574-274-0548" (#236),
    // "+1-264-235-8840" (#19); "+1 (212) 555-0123" is synthetic.
    const fixtures = [
      ["'+1.214.603.4235", '+12146034235'],
      ['574-274-0548', '+15742740548'],
      ['+1 (212) 555-0123', '+12125550123'],
    ] as const;
    for (const [raw, expected] of fixtures) {
      expect(normalizePhone(raw, 'US')?.e164).toBe(expected);
    }
  });

  it('converts a leading 00 prefix to + (international dialling)', () => {
    // Real data: "00447956657022" (#216), "0033651381036" (#702).
    const fixtures = [
      ['00447956657022', 'GB', '+447956657022'],
      ['0033651381036', 'FR', '+33651381036'],
    ] as const;
    for (const [raw, region, expected] of fixtures) {
      expect(normalizePhone(raw, region)?.e164).toBe(expected);
    }
  });

  it('uses defaultCountry when input has no international prefix', () => {
    // Real data: "0690699699" (#203, French local), "0651381036" (#701).
    const fixtures = [
      ['0690699699', '+33690699699'],
      ['0651381036', '+33651381036'],
    ] as const;
    for (const [raw, expected] of fixtures) {
      expect(normalizePhone(raw, 'FR')?.e164).toBe(expected);
    }
  });

  it('returns null when there is no prefix AND no defaultCountry', () => {
    // These are the values the migration script flags for human review.
    // A missing result and a result with a null/absent e164 both satisfy this check.
    expect(normalizePhone('5742740548')?.e164 ?? null).toBeNull();
  });

  it('flags placeholder all-zeros numbers and returns null', () => {
    // Real data: "+447000000000" (#641, "Milos Vitkovic" - clearly fake).
    const parsed = normalizePhone('+447000000000', 'GB');
    expect(parsed?.flagged).toBe('placeholder');
    expect(parsed?.e164).toBeNull();
  });

  it('flags multi-number fields and uses the first segment', () => {
    // Real data: "0677580750/0690511494" (#209). Other separators: ; ,
    const fixtures = [
      ['0677580750/0690511494', '+33677580750'],
      ['+33611111111;+33622222222', '+33611111111'],
    ] as const;
    for (const [raw, firstNumber] of fixtures) {
      const parsed = normalizePhone(raw, 'FR');
      expect(parsed?.flagged).toBe('multi_number');
      expect(parsed?.e164).toBe(firstNumber);
    }
  });

  it('flags genuinely unparseable input as `unparseable`', () => {
    const parsed = normalizePhone('xyz-not-a-phone', 'US');
    expect(parsed?.flagged).toBe('unparseable');
    expect(parsed?.e164).toBeNull();
  });

  it('strips an apostrophe-prefix without breaking the parse', () => {
    // A leading "'" is a spreadsheet copy-paste artefact (it escapes
    // numeric-cell coercion) and should be invisible to dedup.
    const parsed = normalizePhone("'0690699699", 'FR');
    expect(parsed?.e164).toBe('+33690699699');
  });

  it('returns the country alongside the E.164 form', () => {
    const parsed = normalizePhone('+33690699699', 'FR');
    expect(parsed).toMatchObject({ e164: '+33690699699', country: 'FR' });
  });
});
// resolveCountry - free-text country values mapped to an ISO code plus a
// confidence tier; unresolvable values come back as { iso: null, confidence: null }.
describe('resolveCountry', () => {
  // Shape asserted whenever nothing could be resolved.
  const unresolved = { iso: null, confidence: null };

  it('returns null for empty / nullish input', () => {
    for (const blank of ['', ' ']) {
      expect(resolveCountry(blank)).toEqual(unresolved);
    }
  });

  it('exact-matches a canonical English country name', () => {
    const fixtures = [
      ['Anguilla', 'AI'],
      ['United Kingdom', 'GB'],
      ['United States', 'US'],
    ] as const;
    for (const [label, iso] of fixtures) {
      expect(resolveCountry(label)).toEqual({ iso, confidence: 'exact' });
    }
  });

  it('matches case-insensitively', () => {
    const fixtures = [
      ['anguilla', 'AI'],
      ['UNITED KINGDOM', 'GB'],
    ] as const;
    for (const [label, iso] of fixtures) {
      expect(resolveCountry(label).iso).toBe(iso);
    }
  });

  it('matches values with surrounding whitespace', () => {
    const padded = ' United States ';
    expect(resolveCountry(padded).iso).toBe('US');
  });

  it('handles diacritic variants of Saint-Barthélemy', () => {
    // Real data: "Saint barthelemy" (#203), "St Barth" (#208), "Saint-Barthélemy".
    const spellings = ['Saint-Barthélemy', 'Saint Barthelemy', 'saint barthelemy', 'St Barth'];
    for (const spelling of spellings) {
      expect(resolveCountry(spelling).iso).toBe('BL');
    }
  });

  it('resolves common abbreviations', () => {
    const fixtures = [
      ['USA', 'US'],
      ['UK', 'GB'],
    ] as const;
    for (const [abbreviation, iso] of fixtures) {
      expect(resolveCountry(abbreviation).iso).toBe(iso);
    }
  });

  it('falls back to a city → country mapping for high-frequency cities', () => {
    // Real data: "Kansas City" (#198), "Sag Harbor Y" (#239).
    for (const city of ['Kansas City', 'Sag Harbor Y']) {
      expect(resolveCountry(city).iso).toBe('US');
    }
  });

  it('marks the confidence tier appropriately', () => {
    const fixtures = [
      ['Anguilla', 'exact'],
      ['Kansas City', 'city'],
    ] as const;
    for (const [label, tier] of fixtures) {
      expect(resolveCountry(label).confidence).toBe(tier);
    }
  });

  it('returns null + null for unresolvable values', () => {
    // The migration script sends these to human review instead of guessing.
    for (const junk of ['asdfghjkl xyz', 'Mars']) {
      expect(resolveCountry(junk)).toEqual(unresolved);
    }
  });
});