Fix duplicate detection system - improve blocking strategy for interests

- Updated interest blocking strategy to group by name prefix instead of email domain
- This fixes the issue where interests with different email domains were never compared
- Updated admin duplicate finder to use the new centralized utility
- Both Matthew Ciaccio entries will now be detected as duplicates
This commit is contained in:
Matt 2025-07-12 14:04:01 -04:00
parent 080cb60d71
commit 0762306bf3
2 changed files with 27 additions and 207 deletions

View File

@ -1,8 +1,9 @@
import { requireAuth, requireSalesOrAdmin } from '~/server/utils/auth'; import { requireAuth, requireSalesOrAdmin } from '~/server/utils/auth';
import { getNocoDbConfiguration } from '~/server/utils/nocodb'; import { getNocoDbConfiguration } from '~/server/utils/nocodb';
import { findDuplicates, createInterestConfig } from '~/server/utils/duplicate-detection';
export default defineEventHandler(async (event) => { export default defineEventHandler(async (event) => {
console.log('[DUPLICATES] Find duplicates request'); console.log('[ADMIN] Find duplicates request');
try { try {
// Require sales or admin access for duplicate detection // Require sales or admin access for duplicate detection
@ -26,17 +27,27 @@ export default defineEventHandler(async (event) => {
const interests = response.list || []; const interests = response.list || [];
console.log('[ADMIN] Analyzing', interests.length, 'interests for duplicates'); console.log('[ADMIN] Analyzing', interests.length, 'interests for duplicates');
// Find potential duplicates // Find duplicate groups using the new centralized utility
const duplicateGroups = findDuplicateInterests(interests, threshold); const duplicateConfig = createInterestConfig();
const duplicateGroups = findDuplicates(interests, duplicateConfig);
console.log('[ADMIN] Found', duplicateGroups.length, 'duplicate groups'); // Convert to the expected format
const formattedGroups = duplicateGroups.map(group => ({
id: group.id,
interests: group.items,
matchReason: group.matchReason,
confidence: group.confidence,
masterCandidate: group.masterCandidate
}));
console.log('[ADMIN] Found', formattedGroups.length, 'duplicate groups');
return { return {
success: true, success: true,
data: { data: {
duplicateGroups, duplicateGroups: formattedGroups,
totalInterests: interests.length, totalInterests: interests.length,
duplicateCount: duplicateGroups.reduce((sum, group) => sum + group.interests.length, 0), duplicateCount: formattedGroups.reduce((sum, group) => sum + group.interests.length, 0),
threshold threshold
} }
}; };
@ -57,203 +68,3 @@ export default defineEventHandler(async (event) => {
}; };
} }
}); });
/**
 * Group interest records that appear to describe the same person.
 *
 * Each not-yet-grouped record acts as a seed and is compared against every
 * later not-yet-grouped record with `calculateSimilarity`; records whose score
 * reaches `threshold` join the seed's group. Comparison is always against the
 * seed, never against other members of the group.
 *
 * @param interests - Records to analyse.
 * @param threshold - Minimum similarity score (inclusive) for two records to
 *   be grouped together. Defaults to 0.8.
 * @returns One entry per group containing at least two records. `confidence`
 *   is the highest similarity between the master candidate and any *other*
 *   member of the group.
 */
function findDuplicateInterests(interests: any[], threshold: number = 0.8) {
  const duplicateGroups: Array<{
    id: string;
    interests: any[];
    matchReason: string;
    confidence: number;
    masterCandidate: any;
  }> = [];

  // Track grouped records by array position rather than by `Id`: a record
  // with a missing Id would otherwise mark `undefined` as processed and hide
  // every other Id-less record from the scan.
  const groupedIndexes = new Set<number>();

  for (let i = 0; i < interests.length; i++) {
    if (groupedIndexes.has(i)) continue;

    const seed = interests[i];
    const matches = [seed];

    for (let j = i + 1; j < interests.length; j++) {
      if (groupedIndexes.has(j)) continue;

      const candidate = interests[j];
      if (calculateSimilarity(seed, candidate).score >= threshold) {
        matches.push(candidate);
        groupedIndexes.add(j);
      }
    }

    if (matches.length < 2) continue;
    groupedIndexes.add(i);

    // Determine the best master candidate (most complete record)
    const masterCandidate = selectMasterCandidate(matches);

    // Compare the master with every other member. Excluding by position
    // (not by `slice(1)`) keeps the master from being scored against itself
    // when it is not the first match, which would always yield 1.0.
    const masterIndex = matches.indexOf(masterCandidate);
    const peerScores = matches
      .filter((_, index) => index !== masterIndex)
      .map(peer => calculateSimilarity(masterCandidate, peer).score);

    duplicateGroups.push({
      id: `group_${duplicateGroups.length + 1}`,
      interests: matches,
      matchReason: 'Multiple matching criteria',
      confidence: Math.max(...peerScores),
      masterCandidate
    });
  }

  return duplicateGroups;
}
/**
 * Score how closely two interest records match.
 *
 * A field contributes only when both records carry a usable value for it.
 * Weights: email 0.4, phone 0.3, name 0.2, address 0.1. The result is the
 * weighted average over the contributing fields, so records that share only
 * one field are scored on that field alone.
 *
 * @param interest1 - First record.
 * @param interest2 - Second record.
 * @returns `score` in [0, 1] (0 when no field could be compared) and the
 *   per-field `details` that produced it.
 */
function calculateSimilarity(interest1: any, interest2: any) {
  const scores: Array<{ type: string; score: number; weight: number }> = [];

  // Falsy and whitespace-only values are treated as missing; anything else is
  // stringified so non-string values cannot crash the string helpers.
  const readText = (value: unknown): string => (value ? String(value).trim() : '');

  // Email similarity (highest weight): exact match, ignoring case and padding
  const email1 = readText(interest1['Email Address']).toLowerCase();
  const email2 = readText(interest2['Email Address']).toLowerCase();
  if (email1 && email2) {
    scores.push({ type: 'email', score: email1 === email2 ? 1.0 : 0.0, weight: 0.4 });
  }

  // Phone similarity: exact match on digits only
  const rawPhone1 = readText(interest1['Phone Number']);
  const rawPhone2 = readText(interest2['Phone Number']);
  if (rawPhone1 && rawPhone2) {
    const phone1 = normalizePhone(rawPhone1);
    const phone2 = normalizePhone(rawPhone2);
    // Values without any digits (e.g. "N/A") normalise to '' and would
    // otherwise compare as a perfect match; skip them instead.
    if (phone1 && phone2) {
      scores.push({ type: 'phone', score: phone1 === phone2 ? 1.0 : 0.0, weight: 0.3 });
    }
  }

  // Name similarity
  const name1 = readText(interest1['Full Name']);
  const name2 = readText(interest2['Full Name']);
  if (name1 && name2) {
    scores.push({ type: 'name', score: calculateNameSimilarity(name1, name2), weight: 0.2 });
  }

  // Address similarity
  const address1 = readText(interest1.Address);
  const address2 = readText(interest2.Address);
  if (address1 && address2) {
    scores.push({ type: 'address', score: calculateStringSimilarity(address1, address2), weight: 0.1 });
  }

  // Weighted average; the `|| 1` keeps the no-comparable-fields case at 0
  // instead of dividing by zero.
  const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
  const weightedScore = scores.reduce((sum, s) => sum + (s.score * s.weight), 0) / (totalWeight || 1);

  return {
    score: weightedScore,
    details: scores
  };
}
/**
 * Reduce a phone number to its digits so that formatting differences
 * (spaces, dashes, parentheses, a leading "+") do not affect comparison.
 *
 * @param phone - Phone number as entered.
 * @returns The digits of `phone` in their original order; '' if it has none.
 */
function normalizePhone(phone: string): string {
  const digits = phone.match(/\d/g);
  return digits === null ? '' : digits.join('');
}
/**
 * Calculate name similarity using Levenshtein distance.
 *
 * Names are lower-cased, trimmed, and have runs of internal whitespace
 * collapsed to a single space before comparison, so "Matthew   Ciaccio" and
 * "matthew ciaccio" count as identical rather than several edits apart.
 *
 * @param name1 - First name to compare.
 * @param name2 - Second name to compare.
 * @returns 1.0 for names that are identical after normalisation, otherwise
 *   `1 - distance / length of the longer name`.
 */
function calculateNameSimilarity(name1: string, name2: string): number {
  const canonicalize = (name: string): string =>
    name.toLowerCase().trim().replace(/\s+/g, ' ');

  const str1 = canonicalize(name1);
  const str2 = canonicalize(name2);

  if (str1 === str2) return 1.0;

  const maxLength = Math.max(str1.length, str2.length);
  // Unequal strings always have a non-empty longer side; the guard only
  // protects the division.
  if (maxLength === 0) return 0;

  return 1 - (levenshteinDistance(str1, str2) / maxLength);
}
/**
 * Compare two free-text values, ignoring case and surrounding whitespace.
 *
 * @param str1 - First value.
 * @param str2 - Second value.
 * @returns 1.0 when the normalised values are equal; otherwise
 *   `1 - levenshteinDistance / length of the longer value`.
 */
function calculateStringSimilarity(str1: string, str2: string): number {
  const left = str1.toLowerCase().trim();
  const right = str2.toLowerCase().trim();

  if (left !== right) {
    const edits = levenshteinDistance(left, right);
    const longest = Math.max(left.length, right.length);
    if (longest > 0) {
      return 1 - (edits / longest);
    }
    return 0;
  }

  return 1.0;
}
/**
 * Levenshtein (edit) distance between two strings: the minimum number of
 * single-character insertions, deletions and substitutions that turn `str1`
 * into `str2`. Characters are compared by string index, i.e. per UTF-16
 * code unit.
 *
 * Only the previous and the current row of the dynamic-programming table are
 * kept in memory.
 *
 * @param str1 - Source string.
 * @param str2 - Target string.
 * @returns The edit distance (0 for equal strings).
 */
function levenshteinDistance(str1: string, str2: string): number {
  // Row 0: turning a prefix of str1 into the empty string costs its length.
  let previousRow: number[] = Array.from({ length: str1.length + 1 }, (_, col) => col);

  for (let row = 1; row <= str2.length; row++) {
    // Column 0: building a prefix of str2 from the empty string costs its length.
    const currentRow: number[] = [row];

    for (let col = 1; col <= str1.length; col++) {
      const substitutionCost = str1[col - 1] === str2[row - 1] ? 0 : 1;
      currentRow.push(
        Math.min(
          currentRow[col - 1] + 1, // deletion
          previousRow[col] + 1, // insertion
          previousRow[col - 1] + substitutionCost // substitution
        )
      );
    }

    previousRow = currentRow;
  }

  return previousRow[str1.length];
}
/**
 * Select the best master candidate from a group of duplicates: the record
 * with the highest `calculateCompletenessScore`. On a tie the earliest record
 * in the array wins.
 *
 * @param interests - Records belonging to one duplicate group.
 * @returns The chosen record, or `undefined` when `interests` is empty.
 */
function selectMasterCandidate(interests: any[]) {
  // An empty group has no candidate; return undefined rather than throwing.
  if (interests.length === 0) return undefined;

  let best = interests[0];
  if (interests.length === 1) return best;

  // Score every record exactly once instead of re-scoring the running best
  // on each comparison.
  let bestScore = calculateCompletenessScore(best);
  for (let index = 1; index < interests.length; index++) {
    const current = interests[index];
    const currentScore = calculateCompletenessScore(current);
    if (currentScore > bestScore) {
      best = current;
      bestScore = currentScore;
    }
  }

  return best;
}
/**
 * Rate how complete an interest record is.
 *
 * The base score is the fraction of six tracked fields holding a non-blank
 * value. Records with a 'Created At' less than 30 days before now gain 0.1;
 * those less than 90 days before now gain 0.05.
 *
 * @param interest - Record to rate.
 * @returns Fraction of filled fields plus any recency bonus.
 */
function calculateCompletenessScore(interest: any): number {
  const MS_PER_DAY = 1000 * 60 * 60 * 24;
  const trackedFields = ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired'];

  let populated = 0;
  for (const fieldName of trackedFields) {
    const value = interest[fieldName];
    // Falsy values and whitespace-only text count as unfilled.
    if (value && value.toString().trim().length > 0) {
      populated += 1;
    }
  }

  let completeness = populated / trackedFields.length;

  // Bonus for recent creation
  const createdAt = interest['Created At'];
  if (createdAt) {
    const ageInDays = (Date.now() - new Date(createdAt).getTime()) / MS_PER_DAY;

    // More recent records get a small bonus
    if (ageInDays < 30) {
      completeness += 0.1;
    } else if (ageInDays < 90) {
      completeness += 0.05;
    }
  }

  return completeness;
}

View File

@ -342,14 +342,23 @@ export function createInterestConfig(): DuplicateDetectionConfig<any> {
return { return {
type: 'interest', type: 'interest',
// Group by normalized email domain or phone prefix for blocking // Group by normalized name prefix for blocking to catch name-based duplicates
getKey: (interest) => { getKey: (interest) => {
// Priority 1: Use normalized name prefix (first 3 chars) to catch name duplicates
if (interest['Full Name']) {
const name = interest['Full Name'].toLowerCase().trim();
const prefix = name.substring(0, 3);
return `name_${prefix}`;
}
// Priority 2: Use email domain for email-based grouping
if (interest['Email Address']) { if (interest['Email Address']) {
const email = normalizeEmail(interest['Email Address']); const email = normalizeEmail(interest['Email Address']);
const domain = email.split('@')[1] || 'unknown'; const domain = email.split('@')[1] || 'unknown';
return `email_${domain}`; return `email_${domain}`;
} }
// Priority 3: Use phone prefix
if (interest['Phone Number']) { if (interest['Phone Number']) {
const phone = normalizePhone(interest['Phone Number']); const phone = normalizePhone(interest['Phone Number']);
const prefix = phone.length >= 4 ? phone.substring(0, 4) : phone; const prefix = phone.length >= 4 ? phone.substring(0, 4) : phone;