Files
pn-new-crm/src/lib/dedup/normalize.ts
Matt Ciaccio d62822c284 fix(migration): NocoDB import safety + dedup helpers + lead-source backfill
migration-apply: residential client + interest inserts now wrap in
db.transaction so a partial failure can't leave an orphan client
row without its interest (or vice versa).

migration-transform: buildPlannedDocument returns null when there
are no signers so the apply pass doesn't try to send a Documenso
envelope without recipients. mapDocumentStatus gets an explicit
"Awaiting Further Details" branch that no longer auto-promotes via
stale sign-time fields. parseFlexibleDate handles ISO and DD-MM-YYYY
inputs uniformly.

backfill-legacy-lead-source: chunk UPDATE WHERE clause now
isNull(source) on top of the inArray match, so a re-run can't
overwrite a more accurate source written between batches.

Adds 235 lines of vitest coverage on migration-transform.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 22:56:18 +02:00

419 lines
15 KiB
TypeScript

/**
* Normalization helpers for the dedup pipeline.
*
* Pure functions (no DB, no React). Used by both the runtime at-create
* surfaces and the one-shot NocoDB migration script. Every transform
* here has a fixture in `tests/unit/dedup/normalize.test.ts` drawn from
* real dirty values observed in the legacy NocoDB Interests table.
*
* Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §3.
*/
import { z } from 'zod';
import { ALL_COUNTRY_CODES, getCountryName, type CountryCode } from '@/lib/i18n/countries';
import { parsePhoneScriptSafe as parsePhone } from './phone-parse';
// ─── Names ──────────────────────────────────────────────────────────────────
/**
* Tokens that should stay lowercase mid-name. Covers the common Romance,
* Germanic, and Iberian particles seen in client records. The first token
* of a name is always title-cased even if it appears in this set.
*/
const PARTICLES: ReadonlySet<string> = new Set([
'van',
'von',
'de',
'del',
'da',
'das',
'do',
'dos',
'di',
'le',
'la',
'el',
'al',
'der',
'den',
'des',
'du',
'dalla',
'della',
'st',
'st.',
'y',
]);
export interface NormalizedName {
/** Human-readable form preserved for UI display. Trims, collapses
* whitespace, fixes case, but never destroys the user's intent -
* slash-with-company structure ("Daniel Wainstein / 7 Knots, LLC")
* is left intact. */
display: string;
/** Lowercased form for matching. */
normalized: string;
/** Last non-particle token, lowercased. Used as a blocking key by the
* dedup algorithm so we only compare candidates with similar surnames. */
surnameToken?: string;
}
/**
* Normalize a free-text full name. Trims and collapses whitespace,
* replaces \r/\n/\t with single spaces, intelligently title-cases
* ALL-CAPS surnames while keeping particles (van / de / dalla / etc.)
* lowercase mid-name, and preserves Irish O' surnames as O'Brien.
*
* If the input contains a `/` (slash-with-company structure like
* "Daniel Wainstein / 7 Knots, LLC"), the trailing company text is
* preserved verbatim - it's signal, not noise.
*/
export function normalizeName(raw: string | null | undefined): NormalizedName {
const safe = (raw ?? '').toString();
// Replace \r, \n, \t with single spaces, then collapse runs of whitespace.
const cleaned = safe
.replace(/[\r\n\t]/g, ' ')
.replace(/\s+/g, ' ')
.trim();
if (!cleaned) {
return { display: '', normalized: '', surnameToken: undefined };
}
// Slash-with-company: title-case the part before the slash, leave the
// company segment untouched (it's typically already a brand we shouldn't
// mangle: "SAS TIKI", "7 Knots, LLC").
const slashIdx = cleaned.indexOf('/');
let displayCore: string;
if (slashIdx !== -1) {
const personPart = cleaned.slice(0, slashIdx).trim();
const companyPart = cleaned.slice(slashIdx + 1).trim();
displayCore = `${titleCaseTokens(personPart)} / ${companyPart}`;
} else {
displayCore = titleCaseTokens(cleaned);
}
const display = displayCore;
const normalized = display.toLowerCase();
const surnameToken = computeSurnameToken(slashIdx !== -1 ? cleaned.slice(0, slashIdx) : cleaned);
return { display, normalized, surnameToken };
}
function titleCaseTokens(s: string): string {
const tokens = s.split(' ').filter(Boolean);
if (tokens.length === 0) return '';
return tokens.map((tok, idx) => titleCaseOneToken(tok, idx === 0)).join(' ');
}
function titleCaseOneToken(token: string, isFirst: boolean): string {
if (!token) return '';
const lower = token.toLowerCase();
if (!isFirst && PARTICLES.has(lower)) return lower;
// O'Brien / D'Angelo / l'Estrange - capitalize the segment after each
// apostrophe so a lowercased input round-trips to readable Irish caps.
if (lower.includes("'")) {
return lower
.split("'")
.map((part) => (part.length > 0 ? part[0]!.toUpperCase() + part.slice(1) : part))
.join("'");
}
return lower[0]!.toUpperCase() + lower.slice(1);
}
function computeSurnameToken(personPart: string): string | undefined {
const cleaned = personPart
.replace(/[\r\n\t]/g, ' ')
.replace(/\s+/g, ' ')
.trim();
if (!cleaned) return undefined;
const tokens = cleaned.split(' ').map((t) => t.toLowerCase());
// Walk from the right past particles to find the last "real" surname token.
for (let i = tokens.length - 1; i >= 0; i -= 1) {
const tok = tokens[i]!;
if (!PARTICLES.has(tok)) return tok;
}
// All tokens are particles? Fall back to the last token verbatim.
return tokens[tokens.length - 1];
}
// ─── Emails ─────────────────────────────────────────────────────────────────
const emailSchema = z.string().email();
/**
* Normalize a free-text email. Trims + lowercases. Returns null for empty
* or malformed input - caller decides whether to flag, store, or drop.
*
* Plus-aliases (`user+tag@domain.com`) are NOT stripped: they're real
* distinct addresses, and stripping them would auto-merge legitimately
* separate accounts.
*/
export function normalizeEmail(raw: string | null | undefined): string | null {
if (raw == null) return null;
const trimmed = raw.toString().trim().toLowerCase();
if (!trimmed) return null;
const result = emailSchema.safeParse(trimmed);
return result.success ? trimmed : null;
}
// ─── Phones ─────────────────────────────────────────────────────────────────
export type PhoneFlag = 'multi_number' | 'placeholder' | 'unparseable';
export interface NormalizedPhone {
/** Canonical E.164 form, e.g. '+15742740548'. Null when unparseable
* or flagged as placeholder. */
e164: string | null;
/** ISO-3166-1 alpha-2 of the country the number was parsed against. */
country: CountryCode | null;
/** Display-friendly international format. Useful for migration reports. */
display: string | null;
/** Set when the input had a quirk worth surfacing in the migration
* report or runtime audit log. Absent on clean parses. */
flagged?: PhoneFlag;
}
/**
* Normalize a raw user-entered phone string for comparison + storage.
*
* Pipeline:
* 1. strip leading apostrophe (spreadsheet copy-paste artifact)
* 2. strip \r / \n / \t (real values seen in NocoDB had carriage returns)
* 3. detect multi-number fields ("+33611111111;+33622222222",
* "0677580750/0690511494") - flag and take first segment
* 4. strip whitespace, dots, dashes, parens, single quotes
* 5. convert leading "00" → "+" (international dialling code)
* 6. detect placeholder fakes (8+ consecutive zeros) - flag, return null e164
* 7. parse via libphonenumber-js
* 8. on parse failure or invalid number → flag 'unparseable'
*
* Returns null for empty inputs (cheaper to short-circuit than to wrap).
*/
export function normalizePhone(
raw: string | null | undefined,
defaultCountry?: CountryCode,
): NormalizedPhone | null {
if (raw == null) return null;
let cleaned = raw.toString().trim();
if (!cleaned) return null;
// 1. Spreadsheet apostrophe prefix.
if (cleaned.startsWith("'")) cleaned = cleaned.slice(1);
// 2. Strip carriage returns / newlines / tabs.
cleaned = cleaned.replace(/[\r\n\t]/g, '');
// 3. Multi-number detection - split on /, ;, , (in that order of priority).
let flagged: PhoneFlag | undefined;
if (/[/;,]/.test(cleaned)) {
flagged = 'multi_number';
cleaned = cleaned.split(/[/;,]/)[0]!.trim();
}
// 4. Strip whitespace, dots, dashes, parens. Keep + for E.164 prefix.
cleaned = cleaned.replace(/[\s.\-()]/g, '');
if (!cleaned) return { e164: null, country: null, display: null, flagged: 'unparseable' };
// 5. 00 international prefix → +.
if (cleaned.startsWith('00')) {
cleaned = '+' + cleaned.slice(2);
}
// 6. Placeholder fakes - runs of 8+ consecutive zeros, e.g. +447000000000.
if (/0{8,}/.test(cleaned)) {
return { e164: null, country: null, display: null, flagged: 'placeholder' };
}
// 7. Parse via the existing i18n helper (libphonenumber-js under the hood).
const parsed = parsePhone(cleaned, defaultCountry);
if (!parsed.e164) {
// Couldn't even produce a canonical form - genuinely garbage.
return { e164: null, country: null, display: null, flagged: 'unparseable' };
}
// Note: we deliberately don't gate on `parsed.isValid`. The
// libphonenumber-js `min` build returns isValid=false for many real
// numbers (NANP territories share +1; some country metadata is
// truncated). For dedup we only need a canonical E.164 string to
// compare; strict validity is the form layer's problem, not ours.
// If a string-only test (e.g. \"abc-not-a-phone\") gets here, parse
// returns null e164 anyway and the branch above handles it.
return {
e164: parsed.e164,
country: parsed.country,
display: parsed.international,
flagged,
};
}
// ─── Countries ──────────────────────────────────────────────────────────────
/**
* Aliases for canonical country names that don't match
* `Intl.DisplayNames(en)` output verbatim. Keys are pre-normalized
* (lowercase, diacritic-free, hyphens/dots → spaces, collapsed whitespace).
*
* Kept opinionated and small - only entries we've actually seen in legacy
* data. Adding a new alias is cheap; trying to be exhaustive isn't.
*/
const COUNTRY_ALIASES: Record<string, CountryCode> = {
// Generic abbreviations
usa: 'US',
us: 'US',
uk: 'GB',
// Saint-Barthélemy variants seen in production
'saint barthelemy': 'BL',
'saint barth': 'BL',
'st barth': 'BL',
'st barths': 'BL',
'st barthelemy': 'BL',
// Caribbean short-forms whose canonical Intl names are awkward
// ("Antigua and Barbuda", "Saint Vincent and the Grenadines", etc.).
antigua: 'AG',
barbuda: 'AG',
'st kitts': 'KN',
'saint kitts': 'KN',
nevis: 'KN',
};
/**
* High-frequency cities → country, used as a last-resort fallback when
* exact / alias / fuzzy country matching all miss. Keys are normalized.
*
* Order matters: an entry's key is also matched as a substring of the
* input ("Sag Harbor Y" contains "sag harbor"), so the most specific
* city appears first to avoid a wrong partial hit.
*/
const CITY_TO_COUNTRY: Record<string, CountryCode> = {
'kansas city': 'US',
'sag harbor': 'US',
'new york': 'US',
// Cities that came out unresolved from the 2026-05-03 NocoDB dry-run.
// Using lowercase (post-normalize keys).
boston: 'US',
tampa: 'US',
'fort lauderdale': 'US',
'port jefferson': 'US',
nantucket: 'US',
// US state abbreviations that often appear standalone or as suffix:
' fl': 'US',
' ma': 'US',
' ny': 'US',
' tx': 'US',
' ca': 'US',
// International
london: 'GB',
paris: 'FR',
};
export type CountryConfidence = 'exact' | 'fuzzy' | 'city';
export interface ResolvedCountry {
iso: CountryCode | null;
confidence: CountryConfidence | null;
}
/**
* Map free-text country / region input to an ISO-3166-1 alpha-2 code.
*
* Lookup order: alias → exact (vs. all locale country names) → city →
* fuzzy (Levenshtein ≤ 2). Anything beyond fuzzy returns null and the
* migration script flags the row for human review.
*/
export function resolveCountry(text: string | null | undefined): ResolvedCountry {
if (text == null) return { iso: null, confidence: null };
const normalized = normalizeForLookup(text.toString());
if (!normalized) return { iso: null, confidence: null };
// 1. Aliases - covers USA / UK / St Barth and friends.
const alias = COUNTRY_ALIASES[normalized];
if (alias) return { iso: alias, confidence: 'exact' };
// 2. Exact match against Intl-derived country names. We compare against
// diacritic-stripped + lowercased canonical names so 'United States'
// and 'united states' both resolve.
for (const code of ALL_COUNTRY_CODES) {
const cleanName = normalizeForLookup(getCountryName(code, 'en'));
if (cleanName === normalized) return { iso: code, confidence: 'exact' };
}
// 3. City → country fallback, exact or substring.
const cityExact = CITY_TO_COUNTRY[normalized];
if (cityExact) return { iso: cityExact, confidence: 'city' };
for (const [city, iso] of Object.entries(CITY_TO_COUNTRY)) {
if (normalized.includes(city)) return { iso, confidence: 'city' };
}
// 4. Fuzzy fallback (Levenshtein ≤ 2). Skipped for short inputs because
// a 4-char string like "Mars" sits within distance 2 of multiple
// short country names (Mali, Laos, Iran, …) - false-positive city.
if (normalized.length >= 6) {
let bestCode: CountryCode | null = null;
let bestDistance = Number.POSITIVE_INFINITY;
for (const code of ALL_COUNTRY_CODES) {
const cleanName = normalizeForLookup(getCountryName(code, 'en'));
const d = levenshtein(cleanName, normalized);
if (d < bestDistance) {
bestDistance = d;
bestCode = code;
if (d === 0) break;
}
}
if (bestDistance <= 2 && bestCode) {
return { iso: bestCode, confidence: 'fuzzy' };
}
}
return { iso: null, confidence: null };
}
/** Lowercase + strip diacritics + replace hyphens/dots with spaces +
* collapse whitespace. Used by both the input and the canonical-name
* side of the country comparison so they meet on the same shape. */
function normalizeForLookup(s: string): string {
return s
.normalize('NFD')
.replace(/[̀-ͯ]/g, '')
.toLowerCase()
.replace(/[-.]/g, ' ')
.replace(/\s+/g, ' ')
.trim();
}
// ─── Levenshtein ────────────────────────────────────────────────────────────
/**
* Standard iterative Levenshtein. Used by the country fuzzy match and by
* the dedup algorithm's name-similarity rule. Allocates O(n*m) so callers
* shouldn't run it against pathological inputs - the dedup blocking
* strategy keeps comparison sets small.
*
* Exported so the find-matches module can reuse the same implementation
* without relying on an external dep.
*/
export function levenshtein(a: string, b: string): number {
if (a === b) return 0;
if (!a) return b.length;
if (!b) return a.length;
const m = a.length;
const n = b.length;
// Two rolling rows is enough - keeps memory at O(n) instead of O(n*m).
let prev = new Array<number>(n + 1);
let curr = new Array<number>(n + 1);
for (let j = 0; j <= n; j += 1) prev[j] = j;
for (let i = 1; i <= m; i += 1) {
curr[0] = i;
for (let j = 1; j <= n; j += 1) {
const cost = a[i - 1] === b[j - 1] ? 0 : 1;
curr[j] = Math.min(curr[j - 1]! + 1, prev[j]! + 1, prev[j - 1]! + cost);
}
[prev, curr] = [curr, prev];
}
return prev[n]!;
}