feat: Implement centralized duplicate detection utility for expenses and interests

This commit is contained in:
2025-07-12 13:42:53 -04:00
parent b8a6a52417
commit 080cb60d71
3 changed files with 441 additions and 512 deletions

View File

@@ -1,5 +1,6 @@
import { requireSalesOrAdmin } from '~/server/utils/auth';
import { getNocoDbConfiguration, normalizePersonName } from '~/server/utils/nocodb';
import { findDuplicates, createExpenseConfig } from '~/server/utils/duplicate-detection';
import type { Expense } from '~/utils/types';
export default defineEventHandler(async (event) => {
@@ -35,21 +36,31 @@ export default defineEventHandler(async (event) => {
const expenses = response.list || [];
console.log('[EXPENSES] Analyzing', expenses.length, 'expenses for duplicates');
// Find duplicate groups
const duplicateGroups = findDuplicateExpenses(expenses);
// Find duplicate groups using the new centralized utility
const duplicateConfig = createExpenseConfig();
const duplicateGroups = findDuplicates(expenses, duplicateConfig);
// Convert to the expected format
const formattedGroups = duplicateGroups.map(group => ({
id: group.id,
expenses: group.items,
matchReason: group.matchReason,
confidence: group.confidence,
masterCandidate: group.masterCandidate
}));
// Also find payer name variations
const payerVariations = findPayerNameVariations(expenses);
console.log('[EXPENSES] Found', duplicateGroups.length, 'duplicate groups and', payerVariations.length, 'payer variations');
console.log('[EXPENSES] Found', formattedGroups.length, 'duplicate groups and', payerVariations.length, 'payer variations');
return {
success: true,
data: {
duplicateGroups,
duplicateGroups: formattedGroups,
payerVariations,
totalExpenses: expenses.length,
duplicateCount: duplicateGroups.reduce((sum, group) => sum + group.expenses.length, 0),
duplicateCount: formattedGroups.reduce((sum, group) => sum + group.expenses.length, 0),
dateRange: {
start: startDate.toISOString().split('T')[0],
end: endDate.toISOString().split('T')[0]
@@ -74,71 +85,6 @@ export default defineEventHandler(async (event) => {
}
});
/**
 * Group expenses that look like duplicates of one another.
 *
 * Every expense not yet claimed by a group is used as a seed and compared
 * (via `calculateExpenseSimilarity`) against each later unclaimed expense.
 * Expenses scoring at or above `threshold` join the seed's group and are not
 * considered again.
 *
 * @param expenses  Expense records; `Id` is used to track which have been grouped.
 * @param threshold Inclusive minimum similarity score for a match (default 0.7).
 * @returns One entry per group of two or more expenses, carrying the matched
 *          records, the combined match reasons, the chosen master record and a
 *          confidence equal to the best similarity between the master and any
 *          other member of the group.
 */
function findDuplicateExpenses(expenses: any[], threshold: number = 0.7) {
  console.log('[EXPENSES] Starting duplicate detection for', expenses.length, 'expenses');

  const duplicateGroups: Array<{
    id: string;
    expenses: any[];
    matchReason: string;
    confidence: number;
    masterCandidate: any;
  }> = [];
  const processedIds = new Set<number>();

  for (let i = 0; i < expenses.length; i++) {
    const expense1 = expenses[i];
    if (processedIds.has(expense1.Id)) continue;

    const matches = [expense1];
    const matchReasons = new Set<string>();

    for (let j = i + 1; j < expenses.length; j++) {
      const expense2 = expenses[j];
      if (processedIds.has(expense2.Id)) continue;

      const similarity = calculateExpenseSimilarity(expense1, expense2);
      console.log(`[EXPENSES] Comparing ${expense1.Id} vs ${expense2.Id}: score=${similarity.score.toFixed(3)}, threshold=${threshold}`);

      if (similarity.score >= threshold) {
        console.log(`[EXPENSES] MATCH FOUND! ${expense1.Id} vs ${expense2.Id} (score: ${similarity.score.toFixed(3)})`);
        console.log('[EXPENSES] Match reasons:', similarity.reasons);
        matches.push(expense2);
        processedIds.add(expense2.Id);
        for (const reason of similarity.reasons) {
          matchReasons.add(reason);
        }
      }
    }

    if (matches.length > 1) {
      // The seed itself is only claimed once it actually has a partner.
      processedIds.add(expense1.Id);

      const masterCandidate = selectMasterExpense(matches);

      // Compare the master against every OTHER member. Skipping by position
      // (rather than always dropping index 0) avoids scoring the master
      // against itself when it is not the seed record.
      const masterIndex = matches.indexOf(masterCandidate);
      const others = matches.filter((_, index) => index !== masterIndex);

      duplicateGroups.push({
        id: `group_${duplicateGroups.length + 1}`,
        expenses: matches,
        matchReason: Array.from(matchReasons).join(', '),
        confidence: Math.max(...others.map(match =>
          calculateExpenseSimilarity(masterCandidate, match).score
        )),
        masterCandidate
      });
    }
  }

  return duplicateGroups;
}
/**
* Find payer name variations (like "Abbie" vs "abbie")
@@ -181,154 +127,3 @@ function findPayerNameVariations(expenses: any[]) {
return variations.sort((a, b) => b.expenseCount - a.expenseCount);
}
/**
 * Score how strongly two expense records resemble each other.
 *
 * Four independent rules may fire; each contributes a score, a weight and a
 * human-readable reason:
 *  - identical establishment, price and timestamp          (1.0,  weight 0.5)
 *  - same normalised payer, establishment, price and day   (0.95, weight 0.4)
 *  - establishment names > 0.8 similar, same price + payer (name similarity, weight 0.3)
 *  - timestamps under 5 minutes apart, same establishment  (0.9,  weight 0.2)
 *
 * @returns `score` — the weighted average of the rules that fired (0 when none
 *          did), `reasons` — their descriptions, `details` — the raw entries.
 */
function calculateExpenseSimilarity(expense1: any, expense2: any) {
  const details: Array<{ type: string; score: number; weight: number }> = [];
  const reasons: string[] = [];
  const record = (type: string, score: number, weight: number, reason: string) => {
    details.push({ type, score, weight });
    reasons.push(reason);
  };

  const place1 = expense1['Establishment Name'];
  const place2 = expense2['Establishment Name'];
  const samePlace = place1 === place2;
  const samePrice = expense1.Price === expense2.Price;

  // Rule 1: everything identical down to the timestamp.
  if (samePlace && samePrice && expense1.Time === expense2.Time) {
    record('exact', 1.0, 0.5, 'Exact match');
  }

  // Rule 2: same person, place and amount on the same calendar day
  // (the day is the part of the timestamp before the 'T').
  const day1 = expense1.Time?.split('T')[0];
  const day2 = expense2.Time?.split('T')[0];
  const samePayer = normalizePersonName(expense1.Payer) === normalizePersonName(expense2.Payer);
  if (samePayer && samePlace && samePrice && day1 === day2) {
    record('same-day', 0.95, 0.4, 'Same person, place, amount on same day');
  }

  // Rule 3: near-identical establishment spelling with matching price and payer.
  if (place1 && place2) {
    const nameSimilarity = calculateStringSimilarity(place1, place2);
    if (nameSimilarity > 0.8 && samePrice && samePayer) {
      record('similar', nameSimilarity, 0.3, 'Similar establishment name');
    }
  }

  // Rule 4: recorded within five minutes of each other at the same place.
  if (expense1.Time && expense2.Time) {
    const fiveMinutesMs = 5 * 60 * 1000;
    const gapMs = Math.abs(new Date(expense1.Time).getTime() - new Date(expense2.Time).getTime());
    if (gapMs < fiveMinutesMs && samePlace) {
      record('time-proximity', 0.9, 0.2, 'Within 5 minutes at same establishment');
    }
  }

  // Weighted average over whichever rules fired.
  let totalWeight = 0;
  let weightedSum = 0;
  for (const entry of details) {
    totalWeight += entry.weight;
    weightedSum += entry.score * entry.weight;
  }

  return {
    score: totalWeight > 0 ? weightedSum / totalWeight : 0,
    reasons,
    details
  };
}
/**
 * Case- and padding-insensitive similarity of two strings in the range 0..1.
 *
 * Both inputs are lower-cased and trimmed; identical results score 1.0,
 * otherwise the score is one minus the Levenshtein distance divided by the
 * longer length.
 */
function calculateStringSimilarity(str1: string, str2: string): number {
  const canonical = (text: string): string => text.toLowerCase().trim();
  const a = canonical(str1);
  const b = canonical(str2);

  if (a === b) {
    return 1.0;
  }

  const edits = levenshteinDistance(a, b);
  const longest = a.length >= b.length ? a.length : b.length;
  if (longest > 0) {
    return 1 - (edits / longest);
  }
  return 0;
}
/**
 * Levenshtein (edit) distance between two strings: the minimum number of
 * single-character insertions, deletions and substitutions needed to turn
 * one into the other. Characters are compared per UTF-16 code unit.
 */
function levenshteinDistance(str1: string, str2: string): number {
  const cols = str1.length;
  const rows = str2.length;

  // table[r][c] = distance between the first c chars of str1 and the first
  // r chars of str2. Row 0 and column 0 are the "build from empty" costs.
  const table: number[][] = Array.from({ length: rows + 1 }, (_row, r) =>
    Array.from({ length: cols + 1 }, (_col, c) => (r === 0 ? c : c === 0 ? r : 0))
  );

  for (let r = 1; r <= rows; r++) {
    const target = str2[r - 1];
    for (let c = 1; c <= cols; c++) {
      const substitutionCost = str1[c - 1] === target ? 0 : 1;
      table[r][c] = Math.min(
        table[r][c - 1] + 1,                    // deletion
        table[r - 1][c] + 1,                    // insertion
        table[r - 1][c - 1] + substitutionCost  // substitution
      );
    }
  }

  return table[rows][cols];
}
/**
 * Pick the expense that should act as the master record of a duplicate group:
 * the one with the highest completeness score. On ties the earlier record wins.
 */
function selectMasterExpense(expenses: any[]) {
  const keepMoreComplete = (leader: any, challenger: any) => {
    const leaderScore = calculateExpenseCompletenessScore(leader);
    const challengerScore = calculateExpenseCompletenessScore(challenger);
    if (challengerScore > leaderScore) {
      return challenger;
    }
    return leader;
  };

  return expenses.reduce((leader, challenger) => keepMoreComplete(leader, challenger));
}
/**
 * Rate how completely an expense record is filled in, from 0 to 1.
 *
 * The base score is the fraction of the six tracked fields that hold a value.
 * A numeric 0 counts as a value (a zero price is still a recorded price);
 * empty or whitespace-only text and missing fields do not. Small bonuses are
 * added for a description longer than 10 characters (+0.2) and for records
 * created less than 24 hours ago (+0.1); the result is capped at 1.0.
 *
 * @param expense Expense record to rate.
 * @returns Completeness score in the range 0..1.
 */
function calculateExpenseCompletenessScore(expense: any): number {
  const fields = ['Establishment Name', 'Price', 'Payer', 'Category', 'Contents', 'Time'];

  const isFilled = (value: any): boolean => {
    // A plain truthiness test would wrongly treat a legitimate 0 as missing.
    if (value === 0) return true;
    return Boolean(value) && value.toString().trim().length > 0;
  };

  const filledFields = fields.filter(field => isFilled(expense[field]));
  let score = filledFields.length / fields.length;

  // Bonus for having contents description
  if (expense.Contents && expense.Contents.length > 10) {
    score += 0.2;
  }

  // Bonus for recent creation (more likely to be accurate)
  if (expense.CreatedAt) {
    const created = new Date(expense.CreatedAt);
    const now = new Date();
    const hoursOld = (now.getTime() - created.getTime()) / (1000 * 60 * 60);
    if (hoursOld < 24) score += 0.1;
  }

  return Math.min(score, 1.0);
}

View File

@@ -1,6 +1,7 @@
import { requireSalesOrAdmin } from '~/server/utils/auth';
import { getNocoDbConfiguration } from '~/server/utils/nocodb';
import { logAuditEvent } from '~/server/utils/audit-logger';
import { findDuplicates, createInterestConfig } from '~/server/utils/duplicate-detection';
export default defineEventHandler(async (event) => {
console.log('[INTERESTS] Find duplicates request');
@@ -40,16 +41,26 @@ export default defineEventHandler(async (event) => {
const interests = response.list || [];
console.log('[INTERESTS] Analyzing', interests.length, 'interests for duplicates');
// Find potential duplicates
const duplicateGroups = findDuplicateInterests(interests, threshold);
// Find duplicate groups using the new centralized utility
const duplicateConfig = createInterestConfig();
const duplicateGroups = findDuplicates(interests, duplicateConfig);
// Convert to the expected format
const formattedGroups = duplicateGroups.map(group => ({
id: group.id,
interests: group.items,
matchReason: group.matchReason,
confidence: group.confidence,
masterCandidate: group.masterCandidate
}));
console.log('[INTERESTS] Found', duplicateGroups.length, 'duplicate groups');
console.log('[INTERESTS] Found', formattedGroups.length, 'duplicate groups');
// Log the audit event
await logAuditEvent(event, 'FIND_INTEREST_DUPLICATES', 'interest', {
changes: {
totalInterests: interests.length,
duplicateGroups: duplicateGroups.length,
duplicateGroups: formattedGroups.length,
threshold,
dateRange
}
@@ -58,9 +69,9 @@ export default defineEventHandler(async (event) => {
return {
success: true,
data: {
duplicateGroups,
duplicateGroups: formattedGroups,
totalInterests: interests.length,
duplicateCount: duplicateGroups.reduce((sum, group) => sum + group.interests.length, 0),
duplicateCount: formattedGroups.reduce((sum, group) => sum + group.interests.length, 0),
threshold,
dateRange
}
@@ -82,288 +93,3 @@ export default defineEventHandler(async (event) => {
};
}
});
/**
 * Group interest records that look like duplicates of one another.
 *
 * Every interest not yet claimed by a group is used as a seed and compared
 * (via `calculateSimilarity`) against each later unclaimed interest. Records
 * scoring at or above `threshold` join the seed's group and are not
 * considered again.
 *
 * @param interests Interest records; `Id` is used to track which have been grouped.
 * @param threshold Inclusive minimum similarity score for a match (default 0.8).
 * @returns One entry per group of two or more interests, carrying the matched
 *          records, a descriptive match reason, the chosen master record and a
 *          confidence equal to the average similarity between the master and
 *          every other member of the group.
 */
function findDuplicateInterests(interests: any[], threshold: number = 0.8) {
  console.log('[INTERESTS] Starting duplicate detection with threshold:', threshold);
  console.log('[INTERESTS] Total interests to analyze:', interests.length);

  const duplicateGroups: Array<{
    id: string;
    interests: any[];
    matchReason: string;
    confidence: number;
    masterCandidate: any;
  }> = [];
  const processedIds = new Set<number>();
  let comparisons = 0;

  for (let i = 0; i < interests.length; i++) {
    const interest1 = interests[i];
    if (processedIds.has(interest1.Id)) continue;

    const matches = [interest1];

    for (let j = i + 1; j < interests.length; j++) {
      const interest2 = interests[j];
      if (processedIds.has(interest2.Id)) continue;

      const similarity = calculateSimilarity(interest1, interest2);
      comparisons++;
      console.log(`[INTERESTS] Comparing ${interest1.Id} vs ${interest2.Id}: score=${similarity.score.toFixed(3)}, threshold=${threshold}`);

      if (similarity.score >= threshold) {
        console.log(`[INTERESTS] MATCH FOUND! ${interest1.Id} vs ${interest2.Id} (score: ${similarity.score.toFixed(3)})`);
        console.log('[INTERESTS] Match details:', similarity.details);
        matches.push(interest2);
        processedIds.add(interest2.Id);
      }
    }

    if (matches.length > 1) {
      console.log(`[INTERESTS] Creating duplicate group with ${matches.length} matches`);

      // The seed itself is only claimed once it actually has a partner.
      processedIds.add(interest1.Id);

      // Determine the best master candidate (most complete record)
      const masterCandidate = selectMasterCandidate(matches);

      // Average the master's similarity to every OTHER member. Skipping by
      // position (rather than always dropping index 0) avoids scoring the
      // master against itself when it is not the seed record.
      const masterIndex = matches.indexOf(masterCandidate);
      const others = matches.filter((_, index) => index !== masterIndex);
      const avgConfidence = others.reduce((sum, match) => {
        return sum + calculateSimilarity(masterCandidate, match).score;
      }, 0) / others.length;

      duplicateGroups.push({
        id: `group_${duplicateGroups.length + 1}`,
        interests: matches,
        matchReason: generateMatchReason(matches),
        confidence: avgConfidence,
        masterCandidate
      });
    }
  }

  console.log(`[INTERESTS] Completed ${comparisons} comparisons, found ${duplicateGroups.length} duplicate groups`);
  return duplicateGroups;
}
/**
 * Score how strongly two interest records resemble each other.
 *
 * A field is compared only when both records carry it:
 *  - email   — exact match after normalisation            (weight 0.5)
 *  - phone   — exact match on digits, at least 8 digits   (weight 0.4)
 *  - name    — fuzzy similarity                           (weight 0.3)
 *  - address — fuzzy similarity                           (weight 0.2)
 *
 * An exact email or phone match short-circuits to a fixed score of 0.95;
 * otherwise the score is the weighted average of the compared fields.
 *
 * NOTE(review): the diagnostic logging below prints names, emails and phone
 * numbers — confirm this is acceptable for the deployment's log retention.
 *
 * @returns `score` plus `details`, the per-field score entries.
 */
function calculateSimilarity(interest1: any, interest2: any) {
  const details: Array<{ type: string; score: number; weight: number }> = [];

  const rawName1 = interest1['Full Name'];
  const rawName2 = interest2['Full Name'];
  const rawEmail1 = interest1['Email Address'];
  const rawEmail2 = interest2['Email Address'];
  const rawPhone1 = interest1['Phone Number'];
  const rawPhone2 = interest2['Phone Number'];

  console.log(`[INTERESTS] Calculating similarity between:`, {
    id1: interest1.Id,
    name1: rawName1,
    email1: rawEmail1,
    phone1: rawPhone1,
    id2: interest2.Id,
    name2: rawName2,
    email2: rawEmail2,
    phone2: rawPhone2
  });

  // Email: all-or-nothing on the normalised address.
  if (rawEmail1 && rawEmail2) {
    const email1 = normalizeEmail(rawEmail1);
    const email2 = normalizeEmail(rawEmail2);
    let emailScore = 0.0;
    if (email1 === email2) {
      emailScore = 1.0;
    }
    details.push({ type: 'email', score: emailScore, weight: 0.5 });
    console.log(`[INTERESTS] Email comparison: "${email1}" vs "${email2}" = ${emailScore}`);
  }

  // Phone: all-or-nothing on the digits, ignoring very short numbers.
  if (rawPhone1 && rawPhone2) {
    const phone1 = normalizePhone(rawPhone1);
    const phone2 = normalizePhone(rawPhone2);
    let phoneScore = 0.0;
    if (phone1 === phone2 && phone1.length >= 8) {
      phoneScore = 1.0;
    }
    details.push({ type: 'phone', score: phoneScore, weight: 0.4 });
    console.log(`[INTERESTS] Phone comparison: "${phone1}" vs "${phone2}" = ${phoneScore}`);
  }

  // Name: fuzzy.
  if (rawName1 && rawName2) {
    const nameScore = calculateNameSimilarity(rawName1, rawName2);
    details.push({ type: 'name', score: nameScore, weight: 0.3 });
    console.log(`[INTERESTS] Name comparison: "${rawName1}" vs "${rawName2}" = ${nameScore.toFixed(3)}`);
  }

  // Address: fuzzy.
  if (interest1.Address && interest2.Address) {
    const addressScore = calculateStringSimilarity(interest1.Address, interest2.Address);
    details.push({ type: 'address', score: addressScore, weight: 0.2 });
    console.log(`[INTERESTS] Address comparison: ${addressScore.toFixed(3)}`);
  }

  // An exact hit on either contact channel outweighs everything else.
  const exactContactMatch = details.some(
    entry => (entry.type === 'email' || entry.type === 'phone') && entry.score === 1.0
  );
  if (exactContactMatch) {
    console.log('[INTERESTS] Exact email or phone match found - high confidence');
    return {
      score: 0.95,
      details
    };
  }

  // Otherwise fall back to the weighted average of whatever was compared.
  let totalWeight = 0;
  let weightedSum = 0;
  for (const entry of details) {
    totalWeight += entry.weight;
    weightedSum += entry.score * entry.weight;
  }
  const weightedScore = weightedSum / (totalWeight || 1);

  console.log(`[INTERESTS] Weighted score: ${weightedScore.toFixed(3)} (weights: ${totalWeight})`);

  return {
    score: weightedScore,
    details
  };
}
/**
 * Canonical form of an email address for equality checks:
 * lower-cased with surrounding whitespace removed.
 */
function normalizeEmail(email: string): string {
  const lowered = email.toLowerCase();
  return lowered.trim();
}
/**
 * Canonical form of a phone number for equality checks: digits only, with
 * spaces, punctuation and any other non-digit characters stripped.
 */
function normalizePhone(phone: string): string {
  const nonDigits = /\D/g;
  const digitsOnly = phone.replace(nonDigits, '');
  return digitsOnly;
}
/**
 * Fuzzy similarity of two person names in the range 0..1.
 *
 * Names are compared exactly like any other free-text field — lower-cased,
 * trimmed, then scored by Levenshtein distance relative to the longer name —
 * so this delegates to `calculateStringSimilarity` instead of repeating that
 * logic.
 *
 * @param name1 First name to compare.
 * @param name2 Second name to compare.
 * @returns 1.0 for names equal after normalisation, lower as they diverge.
 */
function calculateNameSimilarity(name1: string, name2: string): number {
  return calculateStringSimilarity(name1, name2);
}
/**
 * Similarity of two strings in the range 0..1, ignoring letter case and
 * surrounding whitespace.
 *
 * Equal strings (after normalisation) score 1.0. Otherwise the score is one
 * minus the Levenshtein distance divided by the length of the longer string.
 */
function calculateStringSimilarity(str1: string, str2: string): number {
  const left = str1.toLowerCase().trim();
  const right = str2.toLowerCase().trim();

  if (left !== right) {
    const edits = levenshteinDistance(left, right);
    const longer = Math.max(left.length, right.length);
    return longer > 0 ? 1 - (edits / longer) : 0;
  }

  return 1.0;
}
/**
 * Levenshtein (edit) distance between two strings, i.e. the fewest
 * single-character insertions, deletions and substitutions that transform
 * one into the other. Characters are compared per UTF-16 code unit.
 */
function levenshteinDistance(str1: string, str2: string): number {
  const width = str1.length;

  // Distances from the empty prefix of str2 to each prefix of str1.
  let previous: number[] = [];
  for (let i = 0; i <= width; i += 1) {
    previous.push(i);
  }

  // Each pass computes the distances for one more character of str2; only the
  // row just above is ever needed, so earlier rows are discarded.
  for (let j = 1; j <= str2.length; j += 1) {
    const current: number[] = [j];
    for (let i = 1; i <= width; i += 1) {
      const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1;
      current.push(Math.min(
        current[i - 1] + 1,           // deletion
        previous[i] + 1,              // insertion
        previous[i - 1] + indicator   // substitution
      ));
    }
    previous = current;
  }

  return previous[width];
}
/**
 * Choose which record of a duplicate group should be kept as the master:
 * the one with the highest completeness score. On ties the earlier record wins.
 */
function selectMasterCandidate(interests: any[]) {
  return interests.reduce((leader, challenger) => {
    const leaderScore = calculateCompletenessScore(leader);
    const challengerScore = calculateCompletenessScore(challenger);
    if (challengerScore > leaderScore) {
      return challenger;
    }
    return leader;
  });
}
/**
 * Rate how completely an interest record is filled in.
 *
 * The base score is the fraction of the six tracked fields holding a
 * non-blank value. Records created within the last 30 days gain 0.1, and
 * those created within the last 90 days gain 0.05, so the result can exceed 1.
 */
function calculateCompletenessScore(interest: any): number {
  const trackedFields = ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired'];

  let populated = 0;
  for (const key of trackedFields) {
    const value = interest[key];
    if (value && value.toString().trim().length > 0) {
      populated += 1;
    }
  }

  let score = populated / trackedFields.length;

  // Newer records receive a small boost.
  const createdAt = interest['Created At'];
  if (createdAt) {
    const msPerDay = 1000 * 60 * 60 * 24;
    const createdMs = new Date(createdAt).getTime();
    const nowMs = new Date().getTime();
    const ageInDays = (nowMs - createdMs) / msPerDay;

    if (ageInDays < 30) {
      score += 0.1;
    } else if (ageInDays < 90) {
      score += 0.05;
    }
  }

  return score;
}
/**
 * Build a short, comma-separated explanation of why a group of interests was
 * flagged as duplicates.
 *
 * Reports a shared email and/or phone when at least two records carry one and
 * all of them normalise to the same value. When at least two records carry a
 * name, reports 'Same name' if they agree ignoring case and padding, and
 * 'Similar names' otherwise. Falls back to a generic message if nothing applies.
 */
function generateMatchReason(interests: any[]): string {
  const reasons: string[] = [];
  const valuesOf = (field: string): any[] =>
    interests.map(entry => entry[field]).filter(Boolean);
  const distinctCount = (values: any[], normalize: (value: any) => string): number =>
    new Set(values.map(value => normalize(value))).size;

  // Shared email address
  const emails = valuesOf('Email Address');
  if (emails.length > 1 && distinctCount(emails, normalizeEmail) === 1) {
    reasons.push('Same email address');
  }

  // Shared phone number
  const phones = valuesOf('Phone Number');
  if (phones.length > 1 && distinctCount(phones, normalizePhone) === 1) {
    reasons.push('Same phone number');
  }

  // Names: identical after normalisation, or merely present on several records
  const names = valuesOf('Full Name');
  if (names.length > 1) {
    const uniqueNames = distinctCount(names, name => name.toLowerCase().trim());
    reasons.push(uniqueNames === 1 ? 'Same name' : 'Similar names');
  }

  if (reasons.length === 0) {
    return 'Multiple matching criteria';
  }
  return reasons.join(', ');
}