migration-apply: residential client + interest inserts now wrap in db.transaction so a partial failure can't leave an orphan client row without its interest (or vice versa). migration-transform: buildPlannedDocument returns null when there are no signers so the apply pass doesn't try to send a Documenso envelope without recipients. mapDocumentStatus gets an explicit "Awaiting Further Details" branch that no longer auto-promotes via stale sign-time fields. parseFlexibleDate handles ISO and DD-MM-YYYY inputs uniformly. backfill-legacy-lead-source: chunk UPDATE WHERE clause now isNull(source) on top of the inArray match, so a re-run can't overwrite a more accurate source written between batches. Adds 235 lines of vitest coverage on migration-transform. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
256 lines
9.4 KiB
TypeScript
256 lines
9.4 KiB
TypeScript
/**
 * Client-match finder - pure scoring logic.
 *
 * Compares one input candidate against a pool of existing candidates and
 * returns scored matches. Used by:
 *   - the at-create suggestion in client/interest forms (Layer 1)
 *   - the public-form auto-link path (when score >= block threshold)
 *   - the nightly background scoring job (Layer 3)
 *   - the migration script's dedup pass
 *
 * Performance shape: blocking via email / phone / surname-token reduces
 * the pairwise scan from O(n²) to ~O(n) for any pool size we'll see in
 * production. See `findClientMatches` for the blocking implementation.
 *
 * Design reference: docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §4.
 */
|
||
|
||
import { parsePhoneScriptSafe as parsePhone } from './phone-parse';
|
||
|
||
import { levenshtein } from './normalize';
|
||
|
||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * One party (client / lead) reduced to the fields the matcher compares.
 * The same shape is used for the input being checked and for every pool
 * entry it is checked against.
 */
export interface MatchCandidate {
  /** Identifier. A pool entry whose `id` equals the input's `id` is treated
   *  as a self-match and skipped by `findClientMatches`. */
  id: string;
  /** Full name as stored. The scorer lowercases and trims it before
   *  comparing; `null` is treated as an empty name. */
  fullName: string | null;
  /** Lowercased last non-particle token from `normalizeName(...).surnameToken`.
   *  Used as a blocking key. */
  surnameToken: string | null;
  /** Already lowercased + validated via `normalizeEmail`. */
  emails: string[];
  /** Already canonical E.164 via `normalizePhone`. */
  phonesE164: string[];
  /** Address country (NOT phone country) - used for tiebreaking, not scoring. */
  countryIso: string | null;
}
|
||
|
||
/** Confidence tier of a scored match; derived from the numeric score by
 *  comparing it against the caller-supplied `DedupThresholds`. */
export type MatchConfidence = 'high' | 'medium' | 'low';
|
||
|
||
/** Outcome of scoring the input against one pool candidate. */
export interface MatchResult {
  /** The pool entry that was compared against the input. */
  candidate: MatchCandidate;
  /** 0–100 after capping. */
  score: number;
  /** Human-readable list of which rules contributed. Useful for the
   *  review queue UI ("matched on email + phone + surname token"). */
  reasons: string[];
  /** Tier derived from `score` and the thresholds passed to
   *  `findClientMatches`. */
  confidence: MatchConfidence;
}
|
||
|
||
/** Score cut-offs that turn a numeric match score into a confidence tier. */
export interface DedupThresholds {
  /** Inclusive lower bound for `'high'` confidence. */
  highScore: number;
  /** Inclusive lower bound for `'medium'` confidence. Below this is `'low'`. */
  mediumScore: number;
}
|
||
|
||
// ─── Public entry point ─────────────────────────────────────────────────────
|
||
|
||
/**
 * Compare `input` against every reachable candidate in `pool` and return
 * scored matches, sorted by score descending. The result list includes
 * low-confidence hits - caller filters by `confidence` or `score`
 * depending on use case.
 *
 * A pool entry is "reachable" when it shares at least one email, one
 * E.164 phone, or the surname token with `input`; entries sharing none of
 * these are never scored.
 *
 * Self-matches (an entry with `id === input.id`, e.g. when re-scoring an
 * existing client during a background job) are excluded.
 *
 * @param input      The candidate being checked for duplicates.
 * @param pool       Existing candidates to compare against. Not mutated.
 * @param thresholds Score cut-offs used to assign each result's `confidence`.
 * @returns One result per reachable candidate (deduplicated by `id`),
 *          highest score first. Equal scores keep the order in which the
 *          candidates were gathered: email hits, then phone hits, then
 *          surname-token hits.
 */
export function findClientMatches(
  input: MatchCandidate,
  pool: readonly MatchCandidate[],
  thresholds: DedupThresholds,
): MatchResult[] {
  if (pool.length === 0) return [];

  // ── Phase 1: build blocking indexes off the pool. ─────────────────────────
  //
  // Three indexes mean any candidate that shares ANY of (email / phone /
  // surname-token) with the input shows up in the comparison set. Anything
  // that shares NONE is structurally too different to be a duplicate and
  // is skipped - this is what keeps the algorithm O(n) at scale.
  const byEmail = new Map<string, MatchCandidate[]>();
  const byPhone = new Map<string, MatchCandidate[]>();
  const bySurnameToken = new Map<string, MatchCandidate[]>();

  for (const c of pool) {
    if (c.id === input.id) continue; // never compare a record with itself
    for (const email of c.emails) pushTo(byEmail, email, c);
    for (const phone of c.phonesE164) pushTo(byPhone, phone, c);
    if (c.surnameToken) pushTo(bySurnameToken, c.surnameToken, c);
  }

  // ── Phase 2: gather the comparison set via the blocking indexes. ─────────
  //
  // Keyed by id so a candidate reached through several keys is scored once.
  const comparisonSet = new Map<string, MatchCandidate>();
  const gather = (hits: readonly MatchCandidate[] | undefined): void => {
    for (const hit of hits ?? []) comparisonSet.set(hit.id, hit);
  };

  for (const email of input.emails) gather(byEmail.get(email));
  for (const phone of input.phonesE164) gather(byPhone.get(phone));
  if (input.surnameToken) gather(bySurnameToken.get(input.surnameToken));

  // ── Phase 3: score every survivor and assign its confidence tier. ────────
  const results: MatchResult[] = [];
  for (const candidate of comparisonSet.values()) {
    const scored = scorePair(input, candidate);
    scored.confidence = classify(scored.score, thresholds);
    results.push(scored);
  }

  // ── Phase 4: highest score first (sort is stable, so ties keep gather order).
  results.sort((x, y) => y.score - x.score);
  return results;
}
|
||
|
||
// ─── Scoring ────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Score one (input, candidate) pair against the rule set in design §4.2.
 * Compounding: positive rules sum, negative rules subtract; the result is
 * clamped to [0, 100]. Reasons accumulate in the order rules fire so the
 * review-queue UI can show "matched on email + phone".
 *
 * Rule weights:
 *   +60  a shared (non-empty) email
 *   +50  a shared phone with at least 8 digits
 *   +20  identical full name (case-insensitive, trimmed, non-empty)
 *   +15  same surname token and first name tokens within edit distance 1
 *        (only when the full names are not identical)
 *   -15  shared email, no shared phone, and the two phone sets resolve to
 *        disjoint, non-empty country sets
 *   -20  identical name but neither email nor phone shared
 *
 * @param a The input candidate.
 * @param b The pool candidate; returned as `candidate` in the result.
 * @returns The clamped score and the fired rules. `confidence` is a `'low'`
 *          placeholder - the caller assigns the real tier.
 */
function scorePair(a: MatchCandidate, b: MatchCandidate): MatchResult {
  let score = 0;
  const reasons: string[] = [];

  // ── Positive rules. ──────────────────────────────────────────────────────

  // An empty-string email never counts as a shared address.
  const emailMatch = a.emails.some((e) => e.length > 0 && b.emails.includes(e));
  if (emailMatch) {
    score += 60;
    reasons.push('email match');
  }

  // Very short numbers (fewer than 8 digits) are not treated as a match.
  const phoneMatch = a.phonesE164.some((p) => countDigits(p) >= 8 && b.phonesE164.includes(p));
  if (phoneMatch) {
    score += 50;
    reasons.push('phone match');
  }

  const aNameNorm = (a.fullName ?? '').toLowerCase().trim();
  const bNameNorm = (b.fullName ?? '').toLowerCase().trim();
  const nameExactMatch = aNameNorm.length > 0 && aNameNorm === bNameNorm;
  if (nameExactMatch) {
    score += 20;
    reasons.push('name match');
  }

  // Surname + given-name fuzzy. Only fires when names are NOT exactly
  // equal - avoids double-counting with the rule above. The surname tokens
  // must be identical; the fuzziness applies to the first name token only
  // and tolerates a distance of at most 1 as reported by `levenshtein`
  // (presumably plain edit distance, e.g. 'Jon' / 'John', 'Marc' / 'Mark').
  if (!nameExactMatch && a.surnameToken && b.surnameToken && a.surnameToken === b.surnameToken) {
    // Split the TRIMMED names: splitting an untrimmed name that starts with
    // whitespace yields an empty first token and would disable this rule.
    const aGiven = aNameNorm.split(/\s+/)[0] ?? '';
    const bGiven = bNameNorm.split(/\s+/)[0] ?? '';
    if (aGiven && bGiven && levenshtein(aGiven, bGiven) <= 1) {
      score += 15;
      reasons.push('surname + given-name fuzzy match');
    }
  }

  // ── Negative rules. ──────────────────────────────────────────────────────

  // Same email but the two parties' phone numbers belong to different
  // countries. Common when one inbox is shared by spouses / coworkers
  // and the actual phone owners are distinct people. Don't auto-merge.
  if (emailMatch && !phoneMatch && a.phonesE164.length > 0 && b.phonesE164.length > 0) {
    const aCountries = phoneCountriesOf(a);
    const bCountries = phoneCountriesOf(b);
    const overlap = [...aCountries].some((c) => bCountries.has(c));
    // Only penalize when both sides resolved to at least one country.
    if (!overlap && aCountries.size > 0 && bCountries.size > 0) {
      score -= 15;
      reasons.push('phone country mismatch (negative)');
    }
  }

  // Same name but no contact match. Two distinct people with the same
  // name (common for "John Smith") sneak through name-based blocking;
  // penalize so the score lands below the auto-merge threshold.
  if (nameExactMatch && !emailMatch && !phoneMatch) {
    score -= 20;
    reasons.push('name match but no shared contact (negative)');
  }

  return {
    candidate: b,
    score: clamp(score, 0, 100),
    reasons,
    confidence: 'low', // assigned by caller after threshold lookup
  };
}
|
||
|
||
// ─── Helpers ────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Append `value` to the array stored under `key`, creating the array on
 * first use. Mutates `map` in place.
 */
function pushTo<K, V>(map: Map<K, V[]>, key: K, value: V): void {
  const bucket = map.get(key);
  if (bucket === undefined) {
    map.set(key, [value]);
    return;
  }
  bucket.push(value);
}
|
||
|
||
/**
 * Map a numeric score to its confidence tier. Both thresholds are
 * inclusive lower bounds; `highScore` is checked first.
 */
function classify(score: number, thresholds: DedupThresholds): MatchConfidence {
  if (score >= thresholds.highScore) return 'high';
  if (score >= thresholds.mediumScore) return 'medium';
  return 'low';
}
|
||
|
||
/**
 * Restrict `value` to the inclusive range [`min`, `max`]. The lower bound
 * is checked first.
 */
function clamp(value: number, min: number, max: number): number {
  if (value < min) return min;
  if (value > max) return max;
  return value;
}
|
||
|
||
/**
 * Count the ASCII digits ('0'-'9') in `s`. Every other character,
 * including a leading '+' and non-ASCII digits, is ignored.
 */
function countDigits(s: string): number {
  let digits = 0;
  for (let i = 0; i < s.length; i += 1) {
    const code = s.charCodeAt(i);
    // 48..57 are the char codes of '0'..'9'.
    if (code >= 48 && code <= 57) digits += 1;
  }
  return digits;
}
|
||
|
||
/**
 * Collect the distinct country codes of a candidate's phones.
 *
 * Each entry of `phonesE164` is passed to `parsePhone`; every truthy
 * `country` it reports is added to the result. Phones that resolve to no
 * country contribute nothing, so the set may be empty. Nothing is cached:
 * every call re-parses every phone.
 *
 * NOTE(review): `parsePhone` is presumably backed by libphonenumber-js and
 * returns ISO country codes - confirm against './phone-parse'.
 */
function phoneCountriesOf(c: MatchCandidate): Set<string> {
  const countries = new Set<string>();
  for (const phone of c.phonesE164) {
    const { country } = parsePhone(phone);
    if (country) countries.add(country);
  }
  return countries;
}
|