Files
pn-new-crm/src/lib/dedup/find-matches.ts
Matt Ciaccio d62822c284 fix(migration): NocoDB import safety + dedup helpers + lead-source backfill
migration-apply: residential client + interest inserts now wrap in
db.transaction so a partial failure can't leave an orphan client
row without its interest (or vice versa).

migration-transform: buildPlannedDocument returns null when there
are no signers so the apply pass doesn't try to send a Documenso
envelope without recipients. mapDocumentStatus gets an explicit
"Awaiting Further Details" branch that no longer auto-promotes via
stale sign-time fields. parseFlexibleDate handles ISO and DD-MM-YYYY
inputs uniformly.

backfill-legacy-lead-source: chunk UPDATE WHERE clause now
isNull(source) on top of the inArray match, so a re-run can't
overwrite a more accurate source written between batches.

Adds 235 lines of vitest coverage on migration-transform.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 22:56:18 +02:00

256 lines
9.4 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Client-match finder - pure scoring logic.
*
* Compares one input candidate against a pool of existing candidates and
* returns scored matches. Used by:
* - the at-create suggestion in client/interest forms (Layer 1)
* - the public-form auto-link path (when score >= block threshold)
* - the nightly background scoring job (Layer 3)
* - the migration script's dedup pass
*
* Performance shape: blocking via email / phone / surname-token reduces
* the pairwise scan from O(n²) to ~O(n) for any pool size we'll see in
* production. See `findClientMatches` for the blocking implementation.
*
* Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §4.
*/
import { parsePhoneScriptSafe as parsePhone } from './phone-parse';
import { levenshtein } from './normalize';
// ─── Types ──────────────────────────────────────────────────────────────────
/**
* One client record reduced to the fields the matcher compares.
* Used both for the input being looked up and for every pool entry.
*/
export interface MatchCandidate {
/** Record identifier. `findClientMatches` skips pool entries whose `id`
* equals the input's `id`, and de-duplicates the comparison set by it. */
id: string;
/** Full name; `scorePair` lowercases and trims it before the exact-name
* comparison. */
fullName: string | null;
/** Lowercased last non-particle token from `normalizeName(...).surnameToken`.
* Used as a blocking key. */
surnameToken: string | null;
/** Already lowercased + validated via `normalizeEmail`. */
emails: string[];
/** Already canonical E.164 via `normalizePhone`. */
phonesE164: string[];
/** Address country (NOT phone country) - used for tiebreaking, not scoring. */
countryIso: string | null;
}
/** Confidence tier assigned to a match by comparing its score against a
* `DedupThresholds` pair. */
export type MatchConfidence = 'high' | 'medium' | 'low';
/** Outcome of scoring one pool candidate against the input. */
export interface MatchResult {
/** The pool entry that was compared against the input. */
candidate: MatchCandidate;
/** 0-100 after capping. */
score: number;
/** Human-readable list of which rules contributed. Useful for the
* review queue UI ("matched on email + phone + surname token"). */
reasons: string[];
/** Tier derived from `score` and the thresholds passed to
* `findClientMatches`. */
confidence: MatchConfidence;
}
/**
* Score cut-offs that map a numeric score to a `MatchConfidence` tier.
* `highScore` is checked before `mediumScore`.
*/
export interface DedupThresholds {
/** Inclusive lower bound for `'high'` confidence. */
highScore: number;
/** Inclusive lower bound for `'medium'` confidence. Below this is `'low'`. */
mediumScore: number;
}
// ─── Public entry point ─────────────────────────────────────────────────────
/**
 * Find likely duplicates of `input` among `pool`.
 *
 * Only pool entries that share at least one blocking key with `input` (an
 * email, a phone number, or the surname token) are scored; entries sharing
 * none of them are never compared. Entries whose `id` equals `input.id` are
 * left out, so an existing client can be re-scored against a pool that
 * contains itself.
 *
 * @param input - Candidate to look up.
 * @param pool - Existing candidates to compare against.
 * @param thresholds - Cut-offs used to assign each result's `confidence`.
 * @returns Scored matches ordered by descending score. Low-confidence hits
 *   are included; callers filter by `confidence` or `score` as needed.
 */
export function findClientMatches(
  input: MatchCandidate,
  pool: MatchCandidate[],
  thresholds: DedupThresholds,
): MatchResult[] {
  if (pool.length === 0) return [];

  // Step 1 - blocking indexes. One lookup table per key type, filled from
  // every pool entry except the input itself.
  const emailIndex = new Map<string, MatchCandidate[]>();
  const phoneIndex = new Map<string, MatchCandidate[]>();
  const surnameIndex = new Map<string, MatchCandidate[]>();
  pool
    .filter((entry) => entry.id !== input.id)
    .forEach((entry) => {
      entry.emails.forEach((email) => pushTo(emailIndex, email, entry));
      entry.phonesE164.forEach((phone) => pushTo(phoneIndex, phone, entry));
      if (entry.surnameToken) pushTo(surnameIndex, entry.surnameToken, entry);
    });

  // Step 2 - collect every entry reachable through one of the input's keys.
  // Keyed by id so an entry hit via several keys is scored only once.
  const reachable = new Map<string, MatchCandidate>();
  const collect = (bucket: MatchCandidate[] | undefined): void => {
    bucket?.forEach((entry) => reachable.set(entry.id, entry));
  };
  input.emails.forEach((email) => collect(emailIndex.get(email)));
  input.phonesE164.forEach((phone) => collect(phoneIndex.get(phone)));
  if (input.surnameToken) collect(surnameIndex.get(input.surnameToken));

  // Step 3 - score the survivors.
  const scored = Array.from(reachable.values(), (entry) => scorePair(input, entry));

  // Step 4 - best score first, then stamp the confidence tier on each result.
  scored.sort((left, right) => right.score - left.score);
  scored.forEach((match) => {
    match.confidence = classify(match.score, thresholds);
  });
  return scored;
}
// ─── Scoring ────────────────────────────────────────────────────────────────
/**
 * Score one (input, candidate) pair against the rule set in design §4.2.
 *
 * Positive rules add to the score, negative rules subtract, and the total is
 * clamped to [0, 100]. `reasons` lists the rules in the order they fired so
 * the review-queue UI can show "matched on email + phone".
 *
 * Rules:
 * - +60 the two sides share an email.
 * - +50 the two sides share a phone number containing at least 8 digits.
 * - +20 full names are identical after lowercasing and trimming.
 * - +15 same surname token and given names within edit distance 1 (only
 *   when the full names are not identical).
 * - -15 shared email, no shared phone, both sides have phones, and the
 *   phones' countries do not overlap.
 * - -20 identical name but neither an email nor a phone in common.
 *
 * @param a - The input candidate.
 * @param b - The pool candidate; returned as `candidate` in the result.
 * @returns The scored result. `confidence` is preset to `'low'`; the caller
 *   overwrites it after the threshold lookup.
 */
function scorePair(a: MatchCandidate, b: MatchCandidate): MatchResult {
  let score = 0;
  const reasons: string[] = [];

  // ── Positive rules. ──────────────────────────────────────────────────────
  const sharedEmail = a.emails.find((e) => b.emails.includes(e));
  const emailMatch = !!sharedEmail;
  if (emailMatch) {
    score += 60;
    reasons.push('email match');
  }

  // Numbers with fewer than 8 digits are ignored as a match signal.
  const sharedPhone = a.phonesE164.find((p) => b.phonesE164.includes(p) && countDigits(p) >= 8);
  const phoneMatch = !!sharedPhone;
  if (phoneMatch) {
    score += 50;
    reasons.push('phone match');
  }

  const aNameNorm = (a.fullName ?? '').toLowerCase().trim();
  const bNameNorm = (b.fullName ?? '').toLowerCase().trim();
  const nameExactMatch = aNameNorm.length > 0 && aNameNorm === bNameNorm;
  if (nameExactMatch) {
    score += 20;
    reasons.push('name match');
  }

  // Surname + given-name fuzzy. Only fires when names are NOT exactly
  // equal - avoids double-counting with the rule above. Catches near-miss
  // spellings of the given name when the surname token agrees.
  if (!nameExactMatch && a.surnameToken && b.surnameToken && a.surnameToken === b.surnameToken) {
    // Take the first token of the TRIMMED name. Splitting the raw name would
    // yield '' for a value with leading whitespace and silently disable
    // this rule.
    const aGiven = aNameNorm.split(/\s+/)[0] ?? '';
    const bGiven = bNameNorm.split(/\s+/)[0] ?? '';
    if (aGiven && bGiven && levenshtein(aGiven, bGiven) <= 1) {
      score += 15;
      reasons.push('surname + given-name fuzzy match');
    }
  }

  // ── Negative rules. ──────────────────────────────────────────────────────
  // Same email but the two parties' phone numbers belong to different
  // countries. Common when one inbox is shared by spouses / coworkers
  // and the actual phone owners are distinct people. Don't auto-merge.
  if (emailMatch && !phoneMatch && a.phonesE164.length > 0 && b.phonesE164.length > 0) {
    const aCountries = phoneCountriesOf(a);
    const bCountries = phoneCountriesOf(b);
    const overlap = [...aCountries].some((c) => bCountries.has(c));
    // Require a resolved country on both sides: unparseable numbers are not
    // evidence of a mismatch.
    if (!overlap && aCountries.size > 0 && bCountries.size > 0) {
      score -= 15;
      reasons.push('phone country mismatch (negative)');
    }
  }

  // Same name but no contact match. Two distinct people with the same
  // name (common for "John Smith") sneak through name-based blocking;
  // penalize so the score lands below the auto-merge threshold.
  if (nameExactMatch && !emailMatch && !phoneMatch) {
    score -= 20;
    reasons.push('name match but no shared contact (negative)');
  }

  return {
    candidate: b,
    score: clamp(score, 0, 100),
    reasons,
    confidence: 'low', // assigned by caller after threshold lookup
  };
}
// ─── Helpers ────────────────────────────────────────────────────────────────
/**
 * Append `value` to the list stored under `key`, creating the list on first
 * use. An existing list is mutated in place.
 */
function pushTo<K, V>(map: Map<K, V[]>, key: K, value: V): void {
  const bucket = map.get(key);
  if (bucket === undefined) {
    map.set(key, [value]);
    return;
  }
  bucket.push(value);
}
/**
 * Map a score to its confidence tier. Both thresholds are inclusive lower
 * bounds, and the high threshold is tested first.
 */
function classify(score: number, thresholds: DedupThresholds): MatchConfidence {
  return score >= thresholds.highScore
    ? 'high'
    : score >= thresholds.mediumScore
      ? 'medium'
      : 'low';
}
/**
 * Restrict `value` to the range [`min`, `max`]. The lower bound is checked
 * first; a value inside the range is returned unchanged.
 */
function clamp(value: number, min: number, max: number): number {
  return value < min ? min : value > max ? max : value;
}
/** Count the ASCII digits ('0'-'9') in `s`; every other character is ignored. */
function countDigits(s: string): number {
  let total = 0;
  for (const ch of s) {
    if (ch >= '0' && ch <= '9') total += 1;
  }
  return total;
}
/**
 * Collect the distinct countries of a candidate's phone numbers.
 *
 * Each entry of `phonesE164` is parsed on every call; numbers for which the
 * parser reports no country are left out of the result.
 *
 * @param c - Candidate whose phone numbers are inspected.
 * @returns Set of the country values reported by the parser.
 */
function phoneCountriesOf(c: MatchCandidate): Set<string> {
  const countries = new Set<string>();
  c.phonesE164.forEach((phone) => {
    const parsed = parsePhone(phone);
    if (parsed.country) countries.add(parsed.country);
  });
  return countries;
}