From 080cb60d71f89e88e932181fb5572877dc8a5761 Mon Sep 17 00:00:00 2001 From: Matt Date: Sat, 12 Jul 2025 13:42:53 -0400 Subject: [PATCH] feat: Implement centralized duplicate detection utility for expenses and interests --- server/api/expenses/duplicates/find.ts | 237 +------------- server/api/interests/duplicates/find.ts | 308 +----------------- server/utils/duplicate-detection.ts | 408 ++++++++++++++++++++++++ 3 files changed, 441 insertions(+), 512 deletions(-) create mode 100644 server/utils/duplicate-detection.ts diff --git a/server/api/expenses/duplicates/find.ts b/server/api/expenses/duplicates/find.ts index 0cab3ed..f5c4fb8 100644 --- a/server/api/expenses/duplicates/find.ts +++ b/server/api/expenses/duplicates/find.ts @@ -1,5 +1,6 @@ import { requireSalesOrAdmin } from '~/server/utils/auth'; import { getNocoDbConfiguration, normalizePersonName } from '~/server/utils/nocodb'; +import { findDuplicates, createExpenseConfig } from '~/server/utils/duplicate-detection'; import type { Expense } from '~/utils/types'; export default defineEventHandler(async (event) => { @@ -35,21 +36,31 @@ export default defineEventHandler(async (event) => { const expenses = response.list || []; console.log('[EXPENSES] Analyzing', expenses.length, 'expenses for duplicates'); - // Find duplicate groups - const duplicateGroups = findDuplicateExpenses(expenses); + // Find duplicate groups using the new centralized utility + const duplicateConfig = createExpenseConfig(); + const duplicateGroups = findDuplicates(expenses, duplicateConfig); + + // Convert to the expected format + const formattedGroups = duplicateGroups.map(group => ({ + id: group.id, + expenses: group.items, + matchReason: group.matchReason, + confidence: group.confidence, + masterCandidate: group.masterCandidate + })); // Also find payer name variations const payerVariations = findPayerNameVariations(expenses); - console.log('[EXPENSES] Found', duplicateGroups.length, 'duplicate groups and', payerVariations.length, 'payer variations'); + console.log('[EXPENSES] Found', formattedGroups.length, 'duplicate groups and', payerVariations.length, 'payer variations'); return { success: true, data: { - duplicateGroups, + duplicateGroups: formattedGroups, payerVariations, totalExpenses: expenses.length, - duplicateCount: duplicateGroups.reduce((sum, group) => sum + group.expenses.length, 0), + duplicateCount: formattedGroups.reduce((sum, group) => sum + group.expenses.length, 0), dateRange: { start: startDate.toISOString().split('T')[0], end: endDate.toISOString().split('T')[0] @@ -74,71 +85,6 @@ export default defineEventHandler(async (event) => { } }); -/** - * Find duplicate expenses based on multiple criteria - */ -function findDuplicateExpenses(expenses: any[]) { - console.log('[EXPENSES] Starting duplicate detection for', expenses.length, 'expenses'); - - const duplicateGroups: Array<{ - id: string; - expenses: any[]; - matchReason: string; - confidence: number; - masterCandidate: any; - }> = []; - - const processedIds = new Set(); - let comparisons = 0; - - for (let i = 0; i < expenses.length; i++) { - const expense1 = expenses[i]; - - if (processedIds.has(expense1.Id)) continue; - - const matches = [expense1]; - let matchReasons = new Set(); - - for (let j = i + 1; j < expenses.length; j++) { - const expense2 = expenses[j]; - - if (processedIds.has(expense2.Id)) continue; - - const similarity = calculateExpenseSimilarity(expense1, expense2); - comparisons++; - - console.log(`[EXPENSES] Comparing ${expense1.Id} vs ${expense2.Id}: score=${similarity.score.toFixed(3)}, threshold=0.7`); - - if (similarity.score >= 0.7) { // Lower threshold for expenses - console.log(`[EXPENSES] MATCH FOUND! ${expense1.Id} vs ${expense2.Id} (score: ${similarity.score.toFixed(3)})`); - console.log('[EXPENSES] Match reasons:', similarity.reasons); - matches.push(expense2); - processedIds.add(expense2.Id); - similarity.reasons.forEach(r => matchReasons.add(r)); - } - } - - if (matches.length > 1) { - // Mark all as processed - matches.forEach(match => processedIds.add(match.Id)); - - // Determine the best master candidate - const masterCandidate = selectMasterExpense(matches); - - duplicateGroups.push({ - id: `group_${duplicateGroups.length + 1}`, - expenses: matches, - matchReason: Array.from(matchReasons).join(', '), - confidence: Math.max(...matches.slice(1).map(match => - calculateExpenseSimilarity(masterCandidate, match).score - )), - masterCandidate - }); - } - } - - return duplicateGroups; -} /** * Find payer name variations (like "Abbie" vs "abbie") @@ -181,154 +127,3 @@ function findPayerNameVariations(expenses: any[]) { return variations.sort((a, b) => b.expenseCount - a.expenseCount); } - -/** - * Calculate similarity between two expenses - */ -function calculateExpenseSimilarity(expense1: any, expense2: any) { - const scores: Array<{ type: string; score: number; weight: number }> = []; - const reasons: string[] = []; - - // Exact match on establishment, price, and date (highest weight for true duplicates) - if (expense1['Establishment Name'] === expense2['Establishment Name'] && - expense1.Price === expense2.Price && - expense1.Time === expense2.Time) { - scores.push({ type: 'exact', score: 1.0, weight: 0.5 }); - reasons.push('Exact match'); - } - - // Same payer, establishment, and price on same day (likely duplicate) - const date1 = expense1.Time?.split('T')[0]; - const date2 = expense2.Time?.split('T')[0]; - - if (normalizePersonName(expense1.Payer) === normalizePersonName(expense2.Payer) && - expense1['Establishment Name'] === expense2['Establishment Name'] && - expense1.Price === expense2.Price && - date1 === date2) { - scores.push({ type: 'same-day', score: 0.95, weight: 0.4 }); - reasons.push('Same person, place, amount on same day'); - } - - // Similar establishment names with same price and payer - if (expense1['Establishment Name'] && expense2['Establishment Name']) { - const nameSimilarity = calculateStringSimilarity( - expense1['Establishment Name'], - expense2['Establishment Name'] - ); - - if (nameSimilarity > 0.8 && - expense1.Price === expense2.Price && - normalizePersonName(expense1.Payer) === normalizePersonName(expense2.Payer)) { - scores.push({ type: 'similar', score: nameSimilarity, weight: 0.3 }); - reasons.push('Similar establishment name'); - } - } - - // Time proximity check (within 5 minutes) - if (expense1.Time && expense2.Time) { - const time1 = new Date(expense1.Time).getTime(); - const time2 = new Date(expense2.Time).getTime(); - const timeDiff = Math.abs(time1 - time2); - - if (timeDiff < 5 * 60 * 1000 && // 5 minutes - expense1['Establishment Name'] === expense2['Establishment Name']) { - scores.push({ type: 'time-proximity', score: 0.9, weight: 0.2 }); - reasons.push('Within 5 minutes at same establishment'); - } - } - - // Calculate weighted average - const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0); - const weightedScore = totalWeight > 0 - ? scores.reduce((sum, s) => sum + (s.score * s.weight), 0) / totalWeight - : 0; - - return { - score: weightedScore, - reasons, - details: scores - }; -} - -/** - * Calculate string similarity using Levenshtein distance - */ -function calculateStringSimilarity(str1: string, str2: string): number { - const s1 = str1.toLowerCase().trim(); - const s2 = str2.toLowerCase().trim(); - - if (s1 === s2) return 1.0; - - const distance = levenshteinDistance(s1, s2); - const maxLength = Math.max(s1.length, s2.length); - - return maxLength > 0 ? 1 - (distance / maxLength) : 0; -} - -/** - * Calculate Levenshtein distance between two strings - */ -function levenshteinDistance(str1: string, str2: string): number { - const matrix = Array(str2.length + 1).fill(null).map(() => Array(str1.length + 1).fill(null)); - - for (let i = 0; i <= str1.length; i += 1) { - matrix[0][i] = i; - } - - for (let j = 0; j <= str2.length; j += 1) { - matrix[j][0] = j; - } - - for (let j = 1; j <= str2.length; j += 1) { - for (let i = 1; i <= str1.length; i += 1) { - const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1; - matrix[j][i] = Math.min( - matrix[j][i - 1] + 1, // deletion - matrix[j - 1][i] + 1, // insertion - matrix[j - 1][i - 1] + indicator // substitution - ); - } - } - - return matrix[str2.length][str1.length]; -} - -/** - * Select the best master expense from a group - */ -function selectMasterExpense(expenses: any[]) { - return expenses.reduce((best, current) => { - const bestScore = calculateExpenseCompletenessScore(best); - const currentScore = calculateExpenseCompletenessScore(current); - - return currentScore > bestScore ? current : best; - }); -} - -/** - * Calculate completeness score for an expense - */ -function calculateExpenseCompletenessScore(expense: any): number { - const fields = ['Establishment Name', 'Price', 'Payer', 'Category', 'Contents', 'Time']; - const filledFields = fields.filter(field => - expense[field] && expense[field].toString().trim().length > 0 - ); - - let score = filledFields.length / fields.length; - - // Bonus for having contents description - if (expense.Contents && expense.Contents.length > 10) { - score += 0.2; - } - - // Bonus for recent creation (more likely to be accurate) - if (expense.CreatedAt) { - const created = new Date(expense.CreatedAt); - const now = new Date(); - const hoursOld = (now.getTime() - created.getTime()) / (1000 * 60 * 60); - - if (hoursOld < 24) score += 0.1; - } - - return Math.min(score, 1.0); -} diff --git a/server/api/interests/duplicates/find.ts b/server/api/interests/duplicates/find.ts index 835f440..a19a01e 100644 --- a/server/api/interests/duplicates/find.ts +++ b/server/api/interests/duplicates/find.ts @@ -1,6 +1,7 @@ import { requireSalesOrAdmin } from '~/server/utils/auth'; import { getNocoDbConfiguration } from '~/server/utils/nocodb'; import { logAuditEvent } from '~/server/utils/audit-logger'; +import { findDuplicates, createInterestConfig } from '~/server/utils/duplicate-detection'; export default defineEventHandler(async (event) => { console.log('[INTERESTS] Find duplicates request'); @@ -40,16 +41,26 @@ export default defineEventHandler(async (event) => { const interests = response.list || []; console.log('[INTERESTS] Analyzing', interests.length, 'interests for duplicates'); - // Find potential duplicates - const duplicateGroups = findDuplicateInterests(interests, threshold); + // Find duplicate groups using the new centralized utility + const duplicateConfig = createInterestConfig(); + const duplicateGroups = findDuplicates(interests, duplicateConfig); + + // Convert to the expected format + const formattedGroups = duplicateGroups.map(group => ({ + id: group.id, + interests: group.items, + matchReason: group.matchReason, + confidence: group.confidence, + masterCandidate: group.masterCandidate + })); - console.log('[INTERESTS] Found', duplicateGroups.length, 'duplicate groups'); + console.log('[INTERESTS] Found', formattedGroups.length, 'duplicate groups'); // Log the audit event await logAuditEvent(event, 'FIND_INTEREST_DUPLICATES', 'interest', { changes: { totalInterests: interests.length, - duplicateGroups: duplicateGroups.length, + duplicateGroups: formattedGroups.length, threshold, dateRange } @@ -58,9 +69,9 @@ export default defineEventHandler(async (event) => { return { success: true, data: { - duplicateGroups, + duplicateGroups: formattedGroups, totalInterests: interests.length, - duplicateCount: duplicateGroups.reduce((sum, group) => sum + group.interests.length, 0), + duplicateCount: formattedGroups.reduce((sum, group) => sum + group.interests.length, 0), threshold, dateRange } @@ -82,288 +93,3 @@ export default defineEventHandler(async (event) => { }; } }); - -/** - * Find duplicate interests based on multiple criteria - */ -function findDuplicateInterests(interests: any[], threshold: number = 0.8) { - console.log('[INTERESTS] Starting duplicate detection with threshold:', threshold); - console.log('[INTERESTS] Total interests to analyze:', interests.length); - - const duplicateGroups: Array<{ - id: string; - interests: any[]; - matchReason: string; - confidence: number; - masterCandidate: any; - }> = []; - - const processedIds = new Set(); - let comparisons = 0; - - for (let i = 0; i < interests.length; i++) { - const interest1 = interests[i]; - - if (processedIds.has(interest1.Id)) continue; - - const matches = [interest1]; - - for (let j = i + 1; j < interests.length; j++) { - const interest2 = interests[j]; - - if (processedIds.has(interest2.Id)) continue; - - const similarity = calculateSimilarity(interest1, interest2); - comparisons++; - - console.log(`[INTERESTS] Comparing ${interest1.Id} vs ${interest2.Id}: score=${similarity.score.toFixed(3)}, threshold=${threshold}`); - - if (similarity.score >= threshold) { - console.log(`[INTERESTS] MATCH FOUND! ${interest1.Id} vs ${interest2.Id} (score: ${similarity.score.toFixed(3)})`); - console.log('[INTERESTS] Match details:', similarity.details); - matches.push(interest2); - processedIds.add(interest2.Id); - } - } - - if (matches.length > 1) { - console.log(`[INTERESTS] Creating duplicate group with ${matches.length} matches`); - - // Mark all as processed - matches.forEach(match => processedIds.add(match.Id)); - - // Determine the best master candidate (most complete record) - const masterCandidate = selectMasterCandidate(matches); - - // Calculate average confidence - const avgConfidence = matches.slice(1).reduce((sum, match) => { - return sum + calculateSimilarity(masterCandidate, match).score; - }, 0) / (matches.length - 1); - - duplicateGroups.push({ - id: `group_${duplicateGroups.length + 1}`, - interests: matches, - matchReason: generateMatchReason(matches), - confidence: avgConfidence, - masterCandidate - }); - } - } - - console.log(`[INTERESTS] Completed ${comparisons} comparisons, found ${duplicateGroups.length} duplicate groups`); - return duplicateGroups; -} - -/** - * Calculate similarity between two interests - */ -function calculateSimilarity(interest1: any, interest2: any) { - const scores: Array<{ type: string; score: number; weight: number }> = []; - - console.log(`[INTERESTS] Calculating similarity between:`, { - id1: interest1.Id, - name1: interest1['Full Name'], - email1: interest1['Email Address'], - phone1: interest1['Phone Number'], - id2: interest2.Id, - name2: interest2['Full Name'], - email2: interest2['Email Address'], - phone2: interest2['Phone Number'] - }); - - // Email similarity (highest weight) - exact match required - if (interest1['Email Address'] && interest2['Email Address']) { - const email1 = normalizeEmail(interest1['Email Address']); - const email2 = normalizeEmail(interest2['Email Address']); - const emailScore = email1 === email2 ? 1.0 : 0.0; - scores.push({ type: 'email', score: emailScore, weight: 0.5 }); - console.log(`[INTERESTS] Email comparison: "${email1}" vs "${email2}" = ${emailScore}`); - } - - // Phone similarity - exact match on normalized numbers - if (interest1['Phone Number'] && interest2['Phone Number']) { - const phone1 = normalizePhone(interest1['Phone Number']); - const phone2 = normalizePhone(interest2['Phone Number']); - const phoneScore = phone1 === phone2 && phone1.length >= 8 ? 1.0 : 0.0; // Require at least 8 digits - scores.push({ type: 'phone', score: phoneScore, weight: 0.4 }); - console.log(`[INTERESTS] Phone comparison: "${phone1}" vs "${phone2}" = ${phoneScore}`); - } - - // Name similarity - fuzzy matching - if (interest1['Full Name'] && interest2['Full Name']) { - const nameScore = calculateNameSimilarity(interest1['Full Name'], interest2['Full Name']); - scores.push({ type: 'name', score: nameScore, weight: 0.3 }); - console.log(`[INTERESTS] Name comparison: "${interest1['Full Name']}" vs "${interest2['Full Name']}" = ${nameScore.toFixed(3)}`); - } - - // Address similarity - if (interest1.Address && interest2.Address) { - const addressScore = calculateStringSimilarity(interest1.Address, interest2.Address); - scores.push({ type: 'address', score: addressScore, weight: 0.2 }); - console.log(`[INTERESTS] Address comparison: ${addressScore.toFixed(3)}`); - } - - // Special case: if we have exact email OR phone match, give high score regardless of other fields - const hasExactEmailMatch = scores.find(s => s.type === 'email' && s.score === 1.0); - const hasExactPhoneMatch = scores.find(s => s.type === 'phone' && s.score === 1.0); - - if (hasExactEmailMatch || hasExactPhoneMatch) { - console.log('[INTERESTS] Exact email or phone match found - high confidence'); - return { - score: 0.95, // High confidence for exact email/phone match - details: scores - }; - } - - // Calculate weighted average for other cases - const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0); - const weightedScore = scores.reduce((sum, s) => sum + (s.score * s.weight), 0) / (totalWeight || 1); - - console.log(`[INTERESTS] Weighted score: ${weightedScore.toFixed(3)} (weights: ${totalWeight})`); - - return { - score: weightedScore, - details: scores - }; -} - -/** - * Normalize email for comparison - */ -function normalizeEmail(email: string): string { - return email.toLowerCase().trim(); -} - -/** - * Normalize phone number for comparison - */ -function normalizePhone(phone: string): string { - return phone.replace(/\D/g, ''); // Remove all non-digits -} - -/** - * Calculate name similarity using Levenshtein distance - */ -function calculateNameSimilarity(name1: string, name2: string): number { - const str1 = name1.toLowerCase().trim(); - const str2 = name2.toLowerCase().trim(); - - if (str1 === str2) return 1.0; - - const distance = levenshteinDistance(str1, str2); - const maxLength = Math.max(str1.length, str2.length); - - return maxLength > 0 ? 1 - (distance / maxLength) : 0; -} - -/** - * Calculate string similarity using Levenshtein distance - */ -function calculateStringSimilarity(str1: string, str2: string): number { - const s1 = str1.toLowerCase().trim(); - const s2 = str2.toLowerCase().trim(); - - if (s1 === s2) return 1.0; - - const distance = levenshteinDistance(s1, s2); - const maxLength = Math.max(s1.length, s2.length); - - return maxLength > 0 ? 1 - (distance / maxLength) : 0; -} - -/** - * Calculate Levenshtein distance between two strings - */ -function levenshteinDistance(str1: string, str2: string): number { - const matrix = Array(str2.length + 1).fill(null).map(() => Array(str1.length + 1).fill(null)); - - for (let i = 0; i <= str1.length; i += 1) { - matrix[0][i] = i; - } - - for (let j = 0; j <= str2.length; j += 1) { - matrix[j][0] = j; - } - - for (let j = 1; j <= str2.length; j += 1) { - for (let i = 1; i <= str1.length; i += 1) { - const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1; - matrix[j][i] = Math.min( - matrix[j][i - 1] + 1, // deletion - matrix[j - 1][i] + 1, // insertion - matrix[j - 1][i - 1] + indicator // substitution - ); - } - } - - return matrix[str2.length][str1.length]; -} - -/** - * Select the best master candidate from a group of duplicates - */ -function selectMasterCandidate(interests: any[]) { - return interests.reduce((best, current) => { - const bestScore = calculateCompletenessScore(best); - const currentScore = calculateCompletenessScore(current); - - return currentScore > bestScore ? current : best; - }); -} - -/** - * Calculate completeness score for an interest record - */ -function calculateCompletenessScore(interest: any): number { - const fields = ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired']; - const filledFields = fields.filter(field => - interest[field] && interest[field].toString().trim().length > 0 - ); - - let score = filledFields.length / fields.length; - - // Bonus for recent creation - if (interest['Created At']) { - const created = new Date(interest['Created At']); - const now = new Date(); - const daysOld = (now.getTime() - created.getTime()) / (1000 * 60 * 60 * 24); - - // More recent records get a small bonus - if (daysOld < 30) score += 0.1; - else if (daysOld < 90) score += 0.05; - } - - return score; -} - -/** - * Generate a descriptive match reason - */ -function generateMatchReason(interests: any[]): string { - const reasons = []; - - // Check for exact email matches - const emails = interests.map(i => i['Email Address']).filter(Boolean); - if (emails.length > 1 && new Set(emails.map(e => normalizeEmail(e))).size === 1) { - reasons.push('Same email address'); - } - - // Check for exact phone matches - const phones = interests.map(i => i['Phone Number']).filter(Boolean); - if (phones.length > 1 && new Set(phones.map(p => normalizePhone(p))).size === 1) { - reasons.push('Same phone number'); - } - - // Check for similar names - const names = interests.map(i => i['Full Name']).filter(Boolean); - if (names.length > 1) { - const normalizedNames = names.map(n => n.toLowerCase().trim()); - if (new Set(normalizedNames).size === 1) { - reasons.push('Same name'); - } else { - reasons.push('Similar names'); - } - } - - return reasons.length > 0 ? reasons.join(', ') : 'Multiple matching criteria'; -} diff --git a/server/utils/duplicate-detection.ts b/server/utils/duplicate-detection.ts new file mode 100644 index 0000000..bf9ff41 --- /dev/null +++ b/server/utils/duplicate-detection.ts @@ -0,0 +1,408 @@ +import { normalizePersonName } from './nocodb'; + +/** + * Configuration for duplicate detection + */ +export interface DuplicateDetectionConfig { + type: 'expense' | 'interest'; + + // Field extractors + getKey: (item: T) => string; // Primary grouping key for blocking + getId: (item: T) => number; // Unique identifier + + // Duplicate detection rules + rules: DuplicateRule[]; + + // Performance settings + maxGroupSize?: number; // Skip groups larger than this + maxComparisons?: number; // Limit total comparisons +} + +/** + * A rule for detecting duplicates + */ +export interface DuplicateRule { + name: string; + weight: number; + check: (item1: T, item2: T) => boolean; +} + +/** + * Result of duplicate detection + */ +export interface DuplicateGroup { + id: string; + items: T[]; + matchReason: string; + confidence: number; + masterCandidate: T; +} + +/** + * Main function to find duplicates using an efficient blocking strategy + */ +export function findDuplicates( + items: T[], + config: DuplicateDetectionConfig +): DuplicateGroup[] { + console.log(`[DUPLICATES] Starting detection for ${items.length} ${config.type}s`); + + if (items.length === 0) return []; + + // Phase 1: Group items by blocking key for efficient comparison + const blocks = new Map(); + + items.forEach(item => { + const key = config.getKey(item); + if (!blocks.has(key)) { + blocks.set(key, []); + } + blocks.get(key)!.push(item); + }); + + console.log(`[DUPLICATES] Created ${blocks.size} blocks from ${items.length} items`); + + // Phase 2: Find duplicates within each block + const duplicateGroups: DuplicateGroup[] = []; + const processedIds = new Set(); + let totalComparisons = 0; + + for (const [blockKey, blockItems] of blocks) { + // Skip large blocks that would be too expensive to process + if (config.maxGroupSize && blockItems.length > config.maxGroupSize) { + console.log(`[DUPLICATES] Skipping large block "${blockKey}" with ${blockItems.length} items`); + continue; + } + + // Skip blocks with only one item + if (blockItems.length < 2) continue; + + console.log(`[DUPLICATES] Processing block "${blockKey}" with ${blockItems.length} items`); + + // Find duplicates within this block + for (let i = 0; i < blockItems.length; i++) { + const item1 = blockItems[i]; + if (processedIds.has(config.getId(item1))) continue; + + const group = [item1]; + const matchedRules = new Set(); + + for (let j = i + 1; j < blockItems.length; j++) { + const item2 = blockItems[j]; + if (processedIds.has(config.getId(item2))) continue; + + totalComparisons++; + + // Check if items match according to any rule + const matchingRule = config.rules.find(rule => rule.check(item1, item2)); + + if (matchingRule) { + console.log(`[DUPLICATES] Match found: ${config.getId(item1)} vs ${config.getId(item2)} (rule: ${matchingRule.name})`); + group.push(item2); + matchedRules.add(matchingRule.name); + processedIds.add(config.getId(item2)); + } + + // Stop if we've hit the comparison limit + if (config.maxComparisons && totalComparisons >= config.maxComparisons) { + console.log(`[DUPLICATES] Hit comparison limit of ${config.maxComparisons}`); + break; + } + } + + // If we found duplicates, create a group + if (group.length > 1) { + processedIds.add(config.getId(item1)); + + const masterCandidate = selectMasterCandidate(group, config.type); + const confidence = calculateGroupConfidence(group, config.rules); + + duplicateGroups.push({ + id: `group_${duplicateGroups.length + 1}`, + items: group, + matchReason: Array.from(matchedRules).join(', '), + confidence, + masterCandidate + }); + } + + if (config.maxComparisons && totalComparisons >= config.maxComparisons) { + break; + } + } + + if (config.maxComparisons && totalComparisons >= config.maxComparisons) { + break; + } + } + + console.log(`[DUPLICATES] Completed ${totalComparisons} comparisons, found ${duplicateGroups.length} duplicate groups`); + return duplicateGroups; +} + +/** + * Select the best master candidate from a group + */ +function selectMasterCandidate(items: T[], type: 'expense' | 'interest'): T { + return items.reduce((best, current) => { + const bestScore = calculateCompletenessScore(best, type); + const currentScore = calculateCompletenessScore(current, type); + return currentScore > bestScore ? current : best; + }); +} + +/** + * Calculate completeness score for prioritizing records + */ +function calculateCompletenessScore(item: any, type: 'expense' | 'interest'): number { + let score = 0; + let totalFields = 0; + + if (type === 'expense') { + const fields = ['Establishment Name', 'Price', 'Payer', 'Category', 'Contents', 'Time']; + fields.forEach(field => { + totalFields++; + if (item[field] && item[field].toString().trim().length > 0) { + score++; + } + }); + + // Bonus for detailed contents + if (item.Contents && item.Contents.length > 10) { + score += 0.5; + } + } else if (type === 'interest') { + const fields = ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired']; + fields.forEach(field => { + totalFields++; + if (item[field] && item[field].toString().trim().length > 0) { + score++; + } + }); + } + + // Bonus for recent creation + if (item['Created At'] || item.CreatedAt) { + const createdField = item['Created At'] || item.CreatedAt; + const created = new Date(createdField); + const now = new Date(); + const daysOld = (now.getTime() - created.getTime()) / (1000 * 60 * 60 * 24); + + if (daysOld < 30) score += 0.3; + else if (daysOld < 90) score += 0.15; + } + + return totalFields > 0 ? score / totalFields : 0; +} + +/** + * Calculate confidence score for a duplicate group + */ +function calculateGroupConfidence(items: T[], rules: DuplicateRule[]): number { + if (items.length < 2) return 0; + + let totalConfidence = 0; + let comparisons = 0; + + for (let i = 0; i < items.length; i++) { + for (let j = i + 1; j < items.length; j++) { + const matchingRule = rules.find(rule => rule.check(items[i], items[j])); + if (matchingRule) { + totalConfidence += matchingRule.weight; + comparisons++; + } + } + } + + return comparisons > 0 ? totalConfidence / comparisons : 0; +} + +/** + * Normalize email for comparison + */ +export function normalizeEmail(email: string): string { + return email.toLowerCase().trim(); +} + +/** + * Normalize phone number for comparison + */ +export function normalizePhone(phone: string): string { + return phone.replace(/\D/g, ''); // Remove all non-digits +} + +/** + * Calculate string similarity using Levenshtein distance + */ +export function calculateStringSimilarity(str1: string, str2: string): number { + const s1 = str1.toLowerCase().trim(); + const s2 = str2.toLowerCase().trim(); + + if (s1 === s2) return 1.0; + + const distance = levenshteinDistance(s1, s2); + const maxLength = Math.max(s1.length, s2.length); + + return maxLength > 0 ? 1 - (distance / maxLength) : 0; +} + +/** + * Calculate Levenshtein distance between two strings + */ +function levenshteinDistance(str1: string, str2: string): number { + const matrix = Array(str2.length + 1).fill(null).map(() => Array(str1.length + 1).fill(null)); + + for (let i = 0; i <= str1.length; i += 1) { + matrix[0][i] = i; + } + + for (let j = 0; j <= str2.length; j += 1) { + matrix[j][0] = j; + } + + for (let j = 1; j <= str2.length; j += 1) { + for (let i = 1; i <= str1.length; i += 1) { + const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1; + matrix[j][i] = Math.min( + matrix[j][i - 1] + 1, // deletion + matrix[j - 1][i] + 1, // insertion + matrix[j - 1][i - 1] + indicator // substitution + ); + } + } + + return matrix[str2.length][str1.length]; +} + +/** + * Create configuration for expense duplicate detection + */ +export function createExpenseConfig(): DuplicateDetectionConfig { + return { + type: 'expense', + + // Group by normalized payer name for blocking + getKey: (expense) => { + const payer = expense.Payer ? normalizePersonName(expense.Payer) : 'unknown'; + const date = expense.Time ? expense.Time.split('T')[0] : 'nodate'; + return `${payer}_${date}`; + }, + + getId: (expense) => expense.Id, + + rules: [ + { + name: 'Exact match', + weight: 1.0, + check: (exp1, exp2) => { + return exp1['Establishment Name'] === exp2['Establishment Name'] && + exp1.Price === exp2.Price && + exp1.Time === exp2.Time; + } + }, + { + name: 'Same day, same details', + weight: 0.95, + check: (exp1, exp2) => { + const date1 = exp1.Time?.split('T')[0]; + const date2 = exp2.Time?.split('T')[0]; + + return normalizePersonName(exp1.Payer || '') === normalizePersonName(exp2.Payer || '') && + exp1['Establishment Name'] === exp2['Establishment Name'] && + exp1.Price === exp2.Price && + date1 === date2; + } + }, + { + name: 'Close time proximity', + weight: 0.9, + check: (exp1, exp2) => { + if (!exp1.Time || !exp2.Time) return false; + + const time1 = new Date(exp1.Time).getTime(); + const time2 = new Date(exp2.Time).getTime(); + const timeDiff = Math.abs(time1 - time2); + + return timeDiff < 5 * 60 * 1000 && // 5 minutes + exp1['Establishment Name'] === exp2['Establishment Name'] && + exp1.Price === exp2.Price; + } + } + ], + + maxGroupSize: 50, + maxComparisons: 10000 + }; +} + +/** + * Create configuration for interest duplicate detection + */ +export function createInterestConfig(): DuplicateDetectionConfig { + return { + type: 'interest', + + // Group by normalized email domain or phone prefix for blocking + getKey: (interest) => { + if (interest['Email Address']) { + const email = normalizeEmail(interest['Email Address']); + const domain = email.split('@')[1] || 'unknown'; + return `email_${domain}`; + } + + if (interest['Phone Number']) { + const phone = normalizePhone(interest['Phone Number']); + const prefix = phone.length >= 4 ? phone.substring(0, 4) : phone; + return `phone_${prefix}`; + } + + return 'unknown'; + }, + + getId: (interest) => interest.Id, + + rules: [ + { + name: 'Same email', + weight: 1.0, + check: (int1, int2) => { + return int1['Email Address'] && int2['Email Address'] && + normalizeEmail(int1['Email Address']) === normalizeEmail(int2['Email Address']); + } + }, + { + name: 'Same phone', + weight: 1.0, + check: (int1, int2) => { + const phone1 = normalizePhone(int1['Phone Number'] || ''); + const phone2 = normalizePhone(int2['Phone Number'] || ''); + + return phone1 && phone2 && phone1.length >= 8 && phone1 === phone2; + } + }, + { + name: 'Similar name and address', + weight: 0.8, + check: (int1, int2) => { + if (!int1['Full Name'] || !int2['Full Name']) return false; + + const nameSimilarity = calculateStringSimilarity(int1['Full Name'], int2['Full Name']); + + if (nameSimilarity > 0.9) { + // If names are very similar, check address too + if (int1.Address && int2.Address) { + const addressSimilarity = calculateStringSimilarity(int1.Address, int2.Address); + return addressSimilarity > 0.8; + } + return true; // Similar names, no address to compare + } + + return false; + } + } + ], + + maxGroupSize: 50, + maxComparisons: 10000 + }; +}