/**
 * Normalization helpers for the dedup pipeline.
 *
 * Pure functions (no DB, no React). Used by both the runtime at-create
 * surfaces and the one-shot NocoDB migration script. Every transform
 * here has a fixture in `tests/unit/dedup/normalize.test.ts` drawn from
 * real dirty values observed in the legacy NocoDB Interests table.
 *
 * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §3.
 */

import { z } from 'zod';

import { ALL_COUNTRY_CODES, getCountryName, type CountryCode } from '@/lib/i18n/countries';
import { parsePhoneScriptSafe as parsePhone } from './phone-parse';

// ─── Names ──────────────────────────────────────────────────────────────────

/**
 * Tokens that should stay lowercase mid-name. Covers the common Romance,
 * Germanic, and Iberian particles seen in client records. The first token
 * of a name is always title-cased even if it appears in this set.
 */
const PARTICLES: ReadonlySet<string> = new Set([
  'van',
  'von',
  'de',
  'del',
  'da',
  'das',
  'do',
  'dos',
  'di',
  'le',
  'la',
  'el',
  'al',
  'der',
  'den',
  'des',
  'du',
  'dalla',
  'della',
  'st',
  'st.',
  'y',
]);

export interface NormalizedName {
  /** Human-readable form preserved for UI display. Trims, collapses
   *  whitespace, fixes case, but never destroys the user's intent -
   *  slash-with-company structure ("Daniel Wainstein / 7 Knots, LLC")
   *  is left intact. */
  display: string;
  /** Lowercased form for matching. */
  normalized: string;
  /** Last non-particle token, lowercased. Used as a blocking key by the
   *  dedup algorithm so we only compare candidates with similar surnames. */
  surnameToken?: string;
}

/**
 * Normalize a free-text full name. Trims and collapses whitespace
 * (including \r/\n/\t), title-cases every token while keeping particles
 * (van / de / dalla / etc.) lowercase mid-name, and capitalizes the
 * segment after each apostrophe or hyphen (O'Brien, Jean-Pierre).
 *
 * If the input contains a `/` (slash-with-company structure like
 * "Daniel Wainstein / 7 Knots, LLC"), the text after the first slash is
 * preserved verbatim - it's signal, not noise.
 *
 * @param raw Free-text name; null / undefined / blank yield empty strings.
 * @returns Display form, lowercased matching form, and surname blocking key.
 */
export function normalizeName(raw: string | null | undefined): NormalizedName {
  // `\s+` already covers \r, \n and \t, so one pass collapses everything.
  const cleaned = (raw ?? '').toString().replace(/\s+/g, ' ').trim();
  if (!cleaned) {
    return { display: '', normalized: '', surnameToken: undefined };
  }

  const slashIdx = cleaned.indexOf('/');
  if (slashIdx === -1) {
    const display = titleCaseTokens(cleaned);
    return {
      display,
      normalized: display.toLowerCase(),
      surnameToken: computeSurnameToken(cleaned),
    };
  }

  // Slash-with-company: title-case the part before the slash, leave the
  // company segment untouched (it's typically already a brand we shouldn't
  // mangle: "SAS TIKI", "7 Knots, LLC").
  const personPart = cleaned.slice(0, slashIdx).trim();
  const companyPart = cleaned.slice(slashIdx + 1).trim();
  // trim() so an empty side ("John /", "/ Acme") doesn't leave stray
  // leading/trailing whitespace in the display form.
  const display = `${titleCaseTokens(personPart)} / ${companyPart}`.trim();
  return {
    display,
    normalized: display.toLowerCase(),
    surnameToken: computeSurnameToken(personPart),
  };
}

/** Title-case each space-separated token; only the first token is exempt
 *  from the particle rule. */
function titleCaseTokens(s: string): string {
  return s
    .split(' ')
    .filter(Boolean)
    .map((tok, idx) => titleCaseOneToken(tok, idx === 0))
    .join(' ');
}

/**
 * Title-case a single token. Particles stay lowercase unless they are the
 * first token. The first character of the token and the character after
 * every apostrophe (straight or typographic) or hyphen is uppercased, so
 * "o'brien" → "O'Brien", "JEAN-PIERRE" → "Jean-Pierre".
 */
function titleCaseOneToken(token: string, isFirst: boolean): string {
  if (!token) return '';
  const lower = token.toLowerCase();
  if (!isFirst && PARTICLES.has(lower)) return lower;
  return lower.replace(
    /(^|['\u2019-])([^'\u2019-])/g,
    (_match: string, separator: string, ch: string) => separator + ch.toUpperCase(),
  );
}

/**
 * Pick the surname blocking key: the right-most token that is not a
 * particle, lowercased. Returns undefined for blank input.
 */
function computeSurnameToken(personPart: string): string | undefined {
  const cleaned = personPart.replace(/\s+/g, ' ').trim();
  if (!cleaned) return undefined;

  const tokens = cleaned.split(' ').map((t) => t.toLowerCase());
  // Walk from the right past particles to find the last "real" surname token.
  for (let i = tokens.length - 1; i >= 0; i -= 1) {
    const tok = tokens[i];
    if (tok !== undefined && !PARTICLES.has(tok)) return tok;
  }
  // All tokens are particles? Fall back to the last token verbatim.
  return tokens[tokens.length - 1];
}

// ─── Emails ─────────────────────────────────────────────────────────────────

const emailSchema = z.string().email();

/**
 * Normalize a free-text email. Trims + lowercases. Returns null for empty
 * or malformed input - caller decides whether to flag, store, or drop.
 *
 * Plus-aliases (`user+tag@domain.com`) are NOT stripped: they're real
 * distinct addresses, and stripping them would auto-merge legitimately
 * separate accounts.
 */
export function normalizeEmail(raw: string | null | undefined): string | null {
  if (raw == null) return null;
  const trimmed = raw.toString().trim().toLowerCase();
  if (!trimmed) return null;
  return emailSchema.safeParse(trimmed).success ? trimmed : null;
}

// ─── Phones ─────────────────────────────────────────────────────────────────

export type PhoneFlag = 'multi_number' | 'placeholder' | 'unparseable';

export interface NormalizedPhone {
  /** Canonical E.164 form, e.g. '+15742740548'. Null when unparseable
   *  or flagged as placeholder. */
  e164: string | null;
  /** ISO-3166-1 alpha-2 of the country the number was parsed against. */
  country: CountryCode | null;
  /** Display-friendly international format. Useful for migration reports. */
  display: string | null;
  /** Set when the input had a quirk worth surfacing in the migration
   *  report or runtime audit log. Absent on clean parses. */
  flagged?: PhoneFlag;
}

/** Separators that indicate several numbers were crammed into one field. */
const MULTI_NUMBER_SEPARATOR = /[/;,]/;

/** Result for inputs that yield no usable number. */
function rejectedPhone(flag: PhoneFlag): NormalizedPhone {
  return { e164: null, country: null, display: null, flagged: flag };
}

/**
 * Normalize a raw user-entered phone string for comparison + storage.
 *
 * Pipeline:
 *   1. strip leading apostrophe (spreadsheet copy-paste artifact)
 *   2. strip \r / \n / \t (real values seen in NocoDB had carriage returns)
 *   3. detect multi-number fields ("+33611111111;+33622222222",
 *      "0677580750/0690511494") - flag and take first segment
 *   4. strip whitespace, dots, dashes, parens, single quotes
 *   5. convert leading "00" → "+" (international dialling code)
 *   6. detect placeholder fakes (8+ consecutive zeros) - flag, return null e164
 *   7. parse via the `./phone-parse` helper
 *   8. no canonical E.164 from the parser → flag 'unparseable'
 *
 * @param raw Free-text phone value.
 * @param defaultCountry Country passed through to the parser.
 * @returns null for null / blank input (cheaper to short-circuit than to
 *   wrap); otherwise a result whose `flagged` field records any quirk.
 */
export function normalizePhone(
  raw: string | null | undefined,
  defaultCountry?: CountryCode,
): NormalizedPhone | null {
  if (raw == null) return null;
  let cleaned = raw.toString().trim();
  if (!cleaned) return null;

  // 1. Spreadsheet apostrophe prefix.
  if (cleaned.startsWith("'")) cleaned = cleaned.slice(1);

  // 2. Strip carriage returns / newlines / tabs.
  cleaned = cleaned.replace(/[\r\n\t]/g, '');

  // 3. Multi-number detection - keep whatever precedes the first /, ; or ,.
  let flagged: PhoneFlag | undefined;
  if (MULTI_NUMBER_SEPARATOR.test(cleaned)) {
    flagged = 'multi_number';
    cleaned = (cleaned.split(MULTI_NUMBER_SEPARATOR)[0] ?? '').trim();
  }

  // 4. Strip whitespace, dots, dashes, parens and stray single quotes.
  //    Keep + for the E.164 prefix.
  cleaned = cleaned.replace(/[\s.\-()']/g, '');
  if (!cleaned) return rejectedPhone('unparseable');

  // 5. 00 international prefix → +.
  if (cleaned.startsWith('00')) {
    cleaned = '+' + cleaned.slice(2);
  }

  // 6. Placeholder fakes - runs of 8+ consecutive zeros, e.g. +447000000000.
  if (/0{8,}/.test(cleaned)) {
    return rejectedPhone('placeholder');
  }

  // 7. Parse via the existing helper (libphonenumber-js under the hood).
  const parsed = parsePhone(cleaned, defaultCountry);
  if (!parsed.e164) {
    // 8. Couldn't even produce a canonical form - genuinely garbage.
    return rejectedPhone('unparseable');
  }

  // Note: we deliberately don't gate on `parsed.isValid`. The
  // libphonenumber-js `min` build returns isValid=false for many real
  // numbers (NANP territories share +1; some country metadata is
  // truncated). For dedup we only need a canonical E.164 string to
  // compare; strict validity is the form layer's problem, not ours.
  // If a string-only input (e.g. "abc-not-a-phone") gets here, parse
  // returns null e164 anyway and the branch above handles it.
  return {
    e164: parsed.e164,
    country: parsed.country,
    display: parsed.international,
    flagged,
  };
}

// ─── Countries ──────────────────────────────────────────────────────────────

/**
 * Aliases for canonical country names that don't match
 * `Intl.DisplayNames(en)` output verbatim. Keys are pre-normalized
 * (lowercase, diacritic-free, hyphens/dots → spaces, collapsed whitespace).
 *
 * Kept opinionated and small - only entries we've actually seen in legacy
 * data. Adding a new alias is cheap; trying to be exhaustive isn't.
 *
 * A Map rather than an object literal: the lookup key is free text, and
 * an object would answer "constructor" / "__proto__" with inherited
 * `Object.prototype` members.
 */
const COUNTRY_ALIASES: ReadonlyMap<string, CountryCode> = new Map<string, CountryCode>([
  // Generic abbreviations
  ['usa', 'US'],
  ['us', 'US'],
  ['uk', 'GB'],
  // Saint-Barthélemy variants seen in production
  ['saint barthelemy', 'BL'],
  ['saint barth', 'BL'],
  ['st barth', 'BL'],
  ['st barths', 'BL'],
  ['st barthelemy', 'BL'],
  // Caribbean short-forms whose canonical Intl names are awkward
  // ("Antigua and Barbuda", "Saint Vincent and the Grenadines", etc.).
  ['antigua', 'AG'],
  ['barbuda', 'AG'],
  ['st kitts', 'KN'],
  ['saint kitts', 'KN'],
  ['nevis', 'KN'],
]);

/**
 * High-frequency cities → country, used as a fallback when alias and
 * exact country matching both miss. Keys are normalized.
 *
 * Order matters: an entry's key is also matched as a substring of the
 * input ("Sag Harbor Y" contains "sag harbor"), so the most specific
 * city appears first to avoid a wrong partial hit. Iteration follows
 * insertion order.
 *
 * Keys with a leading space are US state abbreviations. They only match
 * as a whole word preceded by a space ("miami, fl"), never inside a
 * longer word ("palma de mallorca" must not hit " ma").
 */
const CITY_TO_COUNTRY: ReadonlyMap<string, CountryCode> = new Map<string, CountryCode>([
  ['kansas city', 'US'],
  ['sag harbor', 'US'],
  ['new york', 'US'],
  // Cities that came out unresolved from the 2026-05-03 NocoDB dry-run.
  // Using lowercase (post-normalize keys).
  ['boston', 'US'],
  ['tampa', 'US'],
  ['fort lauderdale', 'US'],
  ['port jefferson', 'US'],
  ['nantucket', 'US'],
  // US state abbreviations that often appear as a suffix:
  [' fl', 'US'],
  [' ma', 'US'],
  [' ny', 'US'],
  [' tx', 'US'],
  [' ca', 'US'],
  // International
  ['london', 'GB'],
  ['paris', 'FR'],
]);

/** True when the text starts with a letter or digit in any script. */
const STARTS_WITH_WORD_CHAR = /^[\p{L}\p{N}]/u;

/**
 * Decide whether a normalized input mentions a CITY_TO_COUNTRY key.
 * Ordinary city keys match as plain substrings. Leading-space keys (state
 * abbreviations) additionally require that the match is not followed by
 * another letter or digit.
 */
function mentionsCityKey(input: string, key: string): boolean {
  if (!key.startsWith(' ')) return input.includes(key);
  for (let at = input.indexOf(key); at !== -1; at = input.indexOf(key, at + 1)) {
    if (!STARTS_WITH_WORD_CHAR.test(input.slice(at + key.length))) return true;
  }
  return false;
}

export type CountryConfidence = 'exact' | 'fuzzy' | 'city';

export interface ResolvedCountry {
  iso: CountryCode | null;
  confidence: CountryConfidence | null;
}

interface CountryNameEntry {
  readonly name: string;
  readonly code: CountryCode;
}

/** Lazily-built list of normalized English country names, in
 *  ALL_COUNTRY_CODES order. Built once; the inputs never change. */
let countryNameIndex: readonly CountryNameEntry[] | null = null;

function getCountryNameIndex(): readonly CountryNameEntry[] {
  if (countryNameIndex === null) {
    const built: CountryNameEntry[] = [];
    for (const code of ALL_COUNTRY_CODES) {
      built.push({ name: normalizeForLookup(getCountryName(code, 'en')), code });
    }
    countryNameIndex = built;
  }
  return countryNameIndex;
}

/**
 * Map free-text country / region input to an ISO-3166-1 alpha-2 code.
 *
 * Lookup order: alias → exact (vs. the English country names) → city →
 * fuzzy (Levenshtein ≤ 2). Anything beyond fuzzy returns null and the
 * migration script flags the row for human review.
 *
 * @param text Free-text country, region or city.
 * @returns The resolved code plus how it was matched; both null on a miss.
 */
export function resolveCountry(text: string | null | undefined): ResolvedCountry {
  if (text == null) return { iso: null, confidence: null };
  const normalized = normalizeForLookup(text.toString());
  if (!normalized) return { iso: null, confidence: null };

  // 1. Aliases - covers USA / UK / St Barth and friends.
  const alias = COUNTRY_ALIASES.get(normalized);
  if (alias) return { iso: alias, confidence: 'exact' };

  // 2. Exact match against Intl-derived country names. Both sides are
  //    diacritic-stripped + lowercased so 'United States' and
  //    'united states' both resolve.
  const countryNames = getCountryNameIndex();
  const exact = countryNames.find((entry) => entry.name === normalized);
  if (exact) return { iso: exact.code, confidence: 'exact' };

  // 3. City → country fallback, exact or substring.
  const cityExact = CITY_TO_COUNTRY.get(normalized);
  if (cityExact) return { iso: cityExact, confidence: 'city' };
  for (const [city, iso] of CITY_TO_COUNTRY) {
    if (mentionsCityKey(normalized, city)) return { iso, confidence: 'city' };
  }

  // 4. Fuzzy fallback (Levenshtein ≤ 2). Skipped for inputs shorter than
  //    6 characters because a short string like "Mars" sits within
  //    distance 2 of multiple short country names (Mali, Laos, Iran, …),
  //    which would produce false positives. Ties keep the first country
  //    in ALL_COUNTRY_CODES order.
  if (normalized.length >= 6) {
    let bestCode: CountryCode | null = null;
    let bestDistance = Number.POSITIVE_INFINITY;
    for (const { name, code } of countryNames) {
      const distance = levenshtein(name, normalized);
      if (distance < bestDistance) {
        bestDistance = distance;
        bestCode = code;
      }
    }
    if (bestDistance <= 2 && bestCode) {
      return { iso: bestCode, confidence: 'fuzzy' };
    }
  }

  return { iso: null, confidence: null };
}

/** Lowercase + strip diacritics + replace hyphens/dots with spaces +
 *  collapse whitespace. Used by both the input and the canonical-name
 *  side of the country comparison so they meet on the same shape. */
function normalizeForLookup(s: string): string {
  return s
    .normalize('NFD')
    // Combining diacritical marks block (U+0300–U+036F), written as
    // escapes so the range survives any re-encoding of this file.
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[-.]/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

// ─── Levenshtein ────────────────────────────────────────────────────────────

/**
 * Standard iterative Levenshtein distance over UTF-16 code units. Used by
 * the country fuzzy match and by the dedup algorithm's name-similarity
 * rule. Runs in O(a.length * b.length) time with two rolling rows of
 * b.length + 1 numbers, so callers shouldn't run it against pathological
 * inputs - the dedup blocking strategy keeps comparison sets small.
 *
 * Exported so the find-matches module can reuse the same implementation
 * without relying on an external dep.
 */
export function levenshtein(a: string, b: string): number {
  if (a === b) return 0;
  if (!a) return b.length;
  if (!b) return a.length;

  const m = a.length;
  const n = b.length;
  // Two rolling rows are enough - keeps memory at O(n) instead of O(n*m).
  let prev: number[] = Array.from({ length: n + 1 }, (_unused, j) => j);
  let curr: number[] = new Array<number>(n + 1).fill(0);

  for (let i = 1; i <= m; i += 1) {
    curr[0] = i;
    for (let j = 1; j <= n; j += 1) {
      const cost = a[i - 1] === b[j - 1] ? 0 : 1;
      curr[j] = Math.min(curr[j - 1]! + 1, prev[j]! + 1, prev[j - 1]! + cost);
    }
    // Swap rows: the row just filled becomes "previous" for the next i.
    [prev, curr] = [curr, prev];
  }
  return prev[n]!;
}