/**
 * Client-match finder — pure scoring logic.
 *
 * Compares one input candidate against a pool of existing candidates and
 * returns scored matches. Used by:
 *   - the at-create suggestion in client/interest forms (Layer 1)
 *   - the public-form auto-link path (when score >= block threshold)
 *   - the nightly background scoring job (Layer 3)
 *   - the migration script's dedup pass
 *
 * Performance shape: blocking via email / phone / surname-token reduces
 * the pairwise scan from O(n²) to ~O(n) for any pool size we'll see in
 * production. See `findClientMatches` for the blocking implementation.
 *
 * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §4.
 */

import { parsePhone } from '@/lib/i18n/phone';
import { levenshtein } from './normalize';

// ─── Types ──────────────────────────────────────────────────────────────────

/**
 * One party as seen by the matcher. The contact fields are expected to
 * arrive already normalized (see the per-field notes); the matcher compares
 * them as-is.
 */
export interface MatchCandidate {
  /** Identifier. `findClientMatches` uses it to skip self-matches and to
   *  de-duplicate the comparison set. */
  id: string;

  /** Full display name, or `null` when unknown. Compared case-insensitively
   *  after trimming. */
  fullName: string | null;

  /** Lowercased last non-particle token from `normalizeName(...).surnameToken`.
   *  Serves as a blocking key. */
  surnameToken: string | null;

  /** Email addresses, already lowercased and validated via `normalizeEmail`. */
  emails: string[];

  /** Phone numbers, already in canonical E.164 form via `normalizePhone`. */
  phonesE164: string[];

  /** Address country (NOT phone country) — intended for tiebreaking, not
   *  scoring. */
  countryIso: string | null;
}
/** Confidence tier of a match, derived from its score and the caller-supplied
 *  `DedupThresholds`. */
export type MatchConfidence = 'high' | 'medium' | 'low';
/** Outcome of scoring the input against one pool candidate. */
export interface MatchResult {
  /** The pool entry that was compared against the input. */
  candidate: MatchCandidate;

  /** Final score, 0–100 after capping. */
  score: number;

  /** Human-readable labels of the rules that contributed, in the order they
   *  fired. Useful for the review queue UI
   *  ("matched on email + phone + surname token"). */
  reasons: string[];

  /** Tier assigned from `score` and the thresholds in effect. */
  confidence: MatchConfidence;
}
/** Score cut-offs that map a numeric score onto a confidence tier. */
export interface DedupThresholds {
  /** Scores at or above this value are `'high'` confidence. */
  highScore: number;

  /** Scores at or above this value (and below `highScore`) are `'medium'`;
   *  anything lower is `'low'`. */
  mediumScore: number;
}

// ─── Public entry point ─────────────────────────────────────────────────────

/**
 * Score `input` against every pool entry it can plausibly duplicate and
 * return the matches ordered from highest to lowest score.
 *
 * A pool entry is only scored when it shares at least one email, one phone
 * number, or the surname token with `input`; everything else is skipped.
 * Low-confidence hits are kept in the result — callers filter on
 * `confidence` or `score` as their use case requires.
 *
 * Pool entries whose `id` equals `input.id` (e.g. when re-scoring an
 * existing client during a background job) are never returned.
 *
 * @param input - The candidate being checked for duplicates.
 * @param pool - Existing candidates to compare against.
 * @param thresholds - Cut-offs used to assign each result's confidence tier.
 * @returns Scored matches, highest score first; empty when `pool` is empty
 *   or nothing shares a blocking key with `input`.
 */
export function findClientMatches(
  input: MatchCandidate,
  pool: MatchCandidate[],
  thresholds: DedupThresholds,
): MatchResult[] {
  if (pool.length === 0) return [];

  // ── Phase 1: index the pool by each blocking key. ────────────────────────
  //
  // Sharing ANY of (email / phone / surname-token) with the input is enough
  // to be compared; sharing none means the entry is skipped. The input's own
  // record is left out of the indexes entirely.
  const emailIndex = new Map<string, MatchCandidate[]>();
  const phoneIndex = new Map<string, MatchCandidate[]>();
  const surnameIndex = new Map<string, MatchCandidate[]>();

  for (const member of pool) {
    if (member.id === input.id) continue;
    member.emails.forEach((email) => pushTo(emailIndex, email, member));
    member.phonesE164.forEach((phone) => pushTo(phoneIndex, phone, member));
    if (member.surnameToken) pushTo(surnameIndex, member.surnameToken, member);
  }

  // ── Phase 2: collect everything reachable from the input's keys. ─────────
  //
  // Keyed by id so an entry reachable through several keys is scored once.
  // Lookup order (emails, then phones, then surname) fixes the insertion
  // order, which in turn decides how equal scores are ordered after sorting.
  const reachable = new Map<string, MatchCandidate>();
  const collect = (bucket: MatchCandidate[] | undefined): void => {
    if (!bucket) return;
    for (const hit of bucket) reachable.set(hit.id, hit);
  };

  for (const email of input.emails) collect(emailIndex.get(email));
  for (const phone of input.phonesE164) collect(phoneIndex.get(phone));
  if (input.surnameToken) collect(surnameIndex.get(input.surnameToken));

  // ── Phase 3: score every entry that survived blocking. ───────────────────
  const scored = Array.from(reachable.values(), (candidate) => scorePair(input, candidate));

  // ── Phase 4: order by score (descending), then assign confidence tiers. ──
  scored.sort((x, y) => y.score - x.score);
  for (const match of scored) {
    match.confidence = classify(match.score, thresholds);
  }
  return scored;
}

// ─── Scoring ────────────────────────────────────────────────────────────────

/**
 * Score one (input, candidate) pair against the rule set in design §4.2.
 *
 * Positive rules add to the score, negative rules subtract from it, and the
 * total is clamped to [0, 100]:
 *
 *   +60  a shared email address
 *   +50  a shared phone number containing at least 8 digits
 *   +20  identical full names (case-insensitive, trimmed, non-empty)
 *   +15  identical surname tokens and given names within edit distance 1
 *        (only when the full names are not identical)
 *   −15  shared email, no shared phone, and the two phone lists resolve to
 *        non-empty country sets with nothing in common
 *   −20  identical full names with no shared email or phone
 *
 * Reasons accumulate in the order rules fire so the review-queue UI can show
 * "matched on email + phone".
 *
 * @param a - The input candidate.
 * @param b - The pool candidate; returned as `candidate` in the result.
 * @returns The clamped score and the reasons. `confidence` is always `'low'`
 *   here; the caller overwrites it after the threshold lookup.
 */
function scorePair(a: MatchCandidate, b: MatchCandidate): MatchResult {
  let score = 0;
  const reasons: string[] = [];

  // ── Positive rules. ──────────────────────────────────────────────────────

  const sharedEmail = a.emails.find((e) => b.emails.includes(e));
  const emailMatch = !!sharedEmail;
  if (emailMatch) {
    score += 60;
    reasons.push('email match');
  }

  // Very short numbers (fewer than 8 digits) are not counted as a phone match.
  const sharedPhone = a.phonesE164.find((p) => b.phonesE164.includes(p) && countDigits(p) >= 8);
  const phoneMatch = !!sharedPhone;
  if (phoneMatch) {
    score += 50;
    reasons.push('phone match');
  }

  const aNameNorm = (a.fullName ?? '').toLowerCase().trim();
  const bNameNorm = (b.fullName ?? '').toLowerCase().trim();
  const nameExactMatch = aNameNorm.length > 0 && aNameNorm === bNameNorm;
  if (nameExactMatch) {
    score += 20;
    reasons.push('name match');
  }

  // Surname + given-name fuzzy. Only fires when names are NOT exactly
  // equal — avoids double-counting with the rule above. Requires identical
  // surname tokens and given names at most one edit apart (assuming
  // `levenshtein` returns the standard edit distance), e.g. 'Jon' / 'John'
  // or 'Marc' / 'Mark'. Surname typos ('Constanzo' / 'Costanzo') and longer
  // given-name variants ('Marc' / 'Marcus', two edits) do NOT trigger it.
  if (!nameExactMatch && a.surnameToken && b.surnameToken && a.surnameToken === b.surnameToken) {
    // Tokenize the TRIMMED names: splitting a string that starts with
    // whitespace on /\s+/ yields '' as its first element, which would make
    // the given name empty and silently disable this rule.
    const aGiven = aNameNorm.split(/\s+/)[0] ?? '';
    const bGiven = bNameNorm.split(/\s+/)[0] ?? '';
    if (aGiven && bGiven && levenshtein(aGiven, bGiven) <= 1) {
      score += 15;
      reasons.push('surname + given-name fuzzy match');
    }
  }

  // ── Negative rules. ──────────────────────────────────────────────────────

  // Same email but the two parties' phone numbers belong to different
  // countries. Common when one inbox is shared by spouses / coworkers
  // and the actual phone owners are distinct people. Don't auto-merge.
  if (emailMatch && !phoneMatch && a.phonesE164.length > 0 && b.phonesE164.length > 0) {
    const aCountries = phoneCountriesOf(a);
    const bCountries = phoneCountriesOf(b);
    const overlap = [...aCountries].some((c) => bCountries.has(c));
    // Only penalize when both sides resolved to at least one country;
    // unresolvable numbers are not evidence of a mismatch.
    if (!overlap && aCountries.size > 0 && bCountries.size > 0) {
      score -= 15;
      reasons.push('phone country mismatch (negative)');
    }
  }

  // Same name but no contact match. Two distinct people with the same
  // name (common for "John Smith") sneak through name-based blocking;
  // penalize so the score lands below the auto-merge threshold.
  if (nameExactMatch && !emailMatch && !phoneMatch) {
    score -= 20;
    reasons.push('name match but no shared contact (negative)');
  }

  return {
    candidate: b,
    score: clamp(score, 0, 100),
    reasons,
    confidence: 'low', // assigned by caller after threshold lookup
  };
}

// ─── Helpers ────────────────────────────────────────────────────────────────

/**
 * Append `value` to the list stored under `key`, creating the list on first
 * use. Mutates `map` (and the stored list) in place.
 */
function pushTo<K, V>(map: Map<K, V[]>, key: K, value: V): void {
  const bucket = map.get(key);
  if (!bucket) {
    map.set(key, [value]);
    return;
  }
  bucket.push(value);
}
/**
 * Map a score onto a confidence tier. Both thresholds are inclusive lower
 * bounds; `highScore` is checked first.
 */
function classify(score: number, thresholds: DedupThresholds): MatchConfidence {
  const { highScore, mediumScore } = thresholds;
  return score >= highScore ? 'high' : score >= mediumScore ? 'medium' : 'low';
}
/**
 * Restrict `value` to the range [`min`, `max`]. The lower bound is tested
 * first; a value that is neither below `min` nor above `max` (including
 * `NaN`) is returned unchanged.
 */
function clamp(value: number, min: number, max: number): number {
  return value < min ? min : value > max ? max : value;
}
/**
 * Count the ASCII digits (`0`–`9`) in `s`. Every other character — `+`,
 * spaces, punctuation, non-ASCII digits — is ignored.
 */
function countDigits(s: string): number {
  // `match` with the global flag returns null when nothing matches.
  return s.match(/[0-9]/g)?.length ?? 0;
}
/**
 * Collect the distinct countries that a candidate's phone numbers resolve
 * to, using `parsePhone`. Numbers for which `parsePhone` reports no country
 * are left out, so the result may be empty even when the candidate has
 * phones.
 *
 * Nothing is cached: every call parses each number again. The surrounding
 * caller doesn't batch, so the parse cost is accepted.
 */
function phoneCountriesOf(c: MatchCandidate): Set<string> {
  const countries = new Set<string>();
  c.phonesE164.forEach((phone) => {
    const { country } = parsePhone(phone);
    if (country) countries.add(country);
  });
  return countries;
}