import { normalizePersonName } from './nocodb';

/**
 * Configuration for duplicate detection.
 *
 * `T` is the record type being de-duplicated. It defaults to `any` so that
 * existing un-parameterised references (`DuplicateDetectionConfig`) keep
 * compiling.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export interface DuplicateDetectionConfig<T = any> {
  type: 'expense' | 'interest';

  // Field extractors
  getKey: (item: T) => string; // Primary grouping key for blocking
  getId: (item: T) => number; // Unique identifier

  // Duplicate detection rules
  rules: DuplicateRule<T>[];

  // Performance settings
  maxGroupSize?: number; // Skip groups larger than this
  maxComparisons?: number; // Limit total comparisons
}

/**
 * A rule for detecting duplicates.
 *
 * Rules are evaluated in array order; the first rule whose `check` returns
 * true for a pair is the one recorded for that pair.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export interface DuplicateRule<T = any> {
  name: string;
  weight: number;
  check: (item1: T, item2: T) => boolean;
}

/**
 * Result of duplicate detection: one set of records considered duplicates of
 * each other, plus the record suggested as the one to keep.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export interface DuplicateGroup<T = any> {
  id: string;
  items: T[];
  matchReason: string;
  confidence: number;
  masterCandidate: T;
}

/**
 * Main function to find duplicates using a blocking strategy.
 *
 * Items are first bucketed by `config.getKey`; only items that share a bucket
 * are ever compared. Within a bucket each not-yet-grouped item is compared
 * against every later not-yet-grouped item, and every item that matches it is
 * added to its group. Matching is therefore anchored on the first item of the
 * group and is not transitive.
 *
 * @param items - Records to scan.
 * @param config - Key/id extractors, rules and optional limits. A falsy
 *   `maxGroupSize` or `maxComparisons` (including 0) disables that limit.
 * @returns Groups of two or more records, with ids `group_1`, `group_2`, ...
 *   in discovery order. Empty when `items` is empty.
 */
export function findDuplicates<T>(
  items: T[],
  config: DuplicateDetectionConfig<T>
): DuplicateGroup<T>[] {
  console.log(`[DUPLICATES] Starting detection for ${items.length} ${config.type}s`);

  if (items.length === 0) return [];

  // Phase 1: Group items by blocking key for efficient comparison
  const blocks = new Map<string, T[]>();
  for (const item of items) {
    const key = config.getKey(item);
    const block = blocks.get(key);
    if (block) {
      block.push(item);
    } else {
      blocks.set(key, [item]);
    }
  }

  console.log(`[DUPLICATES] Created ${blocks.size} blocks from ${items.length} items`);

  // Phase 2: Find duplicates within each block
  const duplicateGroups: DuplicateGroup<T>[] = [];
  const processedIds = new Set<number>();
  let totalComparisons = 0;

  const maxComparisons = config.maxComparisons;
  // True once the global comparison budget has been used up.
  const limitReached = (): boolean =>
    maxComparisons ? totalComparisons >= maxComparisons : false;

  for (const [blockKey, blockItems] of blocks) {
    // Skip large blocks that would be too expensive to process
    if (config.maxGroupSize && blockItems.length > config.maxGroupSize) {
      console.log(`[DUPLICATES] Skipping large block "${blockKey}" with ${blockItems.length} items`);
      continue;
    }

    // Skip blocks with only one item
    if (blockItems.length < 2) continue;

    console.log(`[DUPLICATES] Processing block "${blockKey}" with ${blockItems.length} items`);

    // Find duplicates within this block
    for (let i = 0; i < blockItems.length; i++) {
      const item1 = blockItems[i];
      if (processedIds.has(config.getId(item1))) continue;

      const group: T[] = [item1];
      const matchedRules = new Set<string>();

      for (let j = i + 1; j < blockItems.length; j++) {
        const item2 = blockItems[j];
        if (processedIds.has(config.getId(item2))) continue;

        totalComparisons++;

        // Check if items match according to any rule (first match wins)
        const matchingRule = config.rules.find(rule => rule.check(item1, item2));

        if (matchingRule) {
          console.log(`[DUPLICATES] Match found: ${config.getId(item1)} vs ${config.getId(item2)} (rule: ${matchingRule.name})`);
          group.push(item2);
          matchedRules.add(matchingRule.name);
          processedIds.add(config.getId(item2));
        }

        // Stop if we've hit the comparison limit
        if (limitReached()) {
          console.log(`[DUPLICATES] Hit comparison limit of ${config.maxComparisons}`);
          break;
        }
      }

      // If we found duplicates, create a group. This still runs when the
      // limit was hit mid-scan so that matches found so far are reported.
      if (group.length > 1) {
        processedIds.add(config.getId(item1));

        const masterCandidate = selectMasterCandidate(group, config.type);
        const confidence = calculateGroupConfidence(group, config.rules);

        duplicateGroups.push({
          id: `group_${duplicateGroups.length + 1}`,
          items: group,
          matchReason: Array.from(matchedRules).join(', '),
          confidence,
          masterCandidate
        });
      }

      if (limitReached()) break;
    }

    if (limitReached()) break;
  }

  console.log(`[DUPLICATES] Completed ${totalComparisons} comparisons, found ${duplicateGroups.length} duplicate groups`);

  return duplicateGroups;
}

/**
 * Select the best master candidate from a group: the item with the highest
 * completeness score. On a tie the earliest item in `items` is kept.
 *
 * `items` must be non-empty (callers pass groups of at least two).
 */
function selectMasterCandidate<T>(items: T[], type: 'expense' | 'interest'): T {
  let best = items[0];
  let bestScore = calculateCompletenessScore(best, type);

  // Score each item once instead of re-scoring the running best every step.
  for (let i = 1; i < items.length; i++) {
    const currentScore = calculateCompletenessScore(items[i], type);
    if (currentScore > bestScore) {
      best = items[i];
      bestScore = currentScore;
    }
  }

  return best;
}

/** Fields that count towards a record's completeness, per record type. */
const COMPLETENESS_FIELDS: Record<'expense' | 'interest', readonly string[]> = {
  expense: ['Establishment Name', 'Price', 'Payer', 'Category', 'Contents', 'Time'],
  interest: ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired']
};

/**
 * Calculate completeness score for prioritizing records.
 *
 * One point per listed field that is truthy and non-blank when stringified,
 * plus small bonuses (expense `Contents` longer than 10 characters: +0.5;
 * created less than 30 days ago: +0.3; less than 90 days ago: +0.15), divided
 * by the number of listed fields. Bonuses mean the result can exceed 1.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function calculateCompletenessScore(item: any, type: 'expense' | 'interest'): number {
  const fields = COMPLETENESS_FIELDS[type] ?? [];
  const totalFields = fields.length;
  let score = 0;

  for (const field of fields) {
    if (item[field] && item[field].toString().trim().length > 0) {
      score++;
    }
  }

  // Bonus for detailed contents
  if (type === 'expense' && item.Contents && item.Contents.length > 10) {
    score += 0.5;
  }

  // Bonus for recent creation
  const createdField = item['Created At'] || item.CreatedAt;
  if (createdField) {
    const created = new Date(createdField);
    const now = new Date();
    const daysOld = (now.getTime() - created.getTime()) / (1000 * 60 * 60 * 24);

    // An unparseable date yields NaN, which fails both comparisons (no bonus).
    if (daysOld < 30) score += 0.3;
    else if (daysOld < 90) score += 0.15;
  }

  return totalFields > 0 ? score / totalFields : 0;
}

/**
 * Calculate confidence score for a duplicate group: the mean weight of the
 * first matching rule over every pair in the group that matches some rule.
 * Pairs that match no rule are ignored. Returns 0 for fewer than two items or
 * when no pair matches.
 */
function calculateGroupConfidence<T>(items: T[], rules: DuplicateRule<T>[]): number {
  if (items.length < 2) return 0;

  let totalConfidence = 0;
  let comparisons = 0;

  for (let i = 0; i < items.length; i++) {
    for (let j = i + 1; j < items.length; j++) {
      const matchingRule = rules.find(rule => rule.check(items[i], items[j]));
      if (matchingRule) {
        totalConfidence += matchingRule.weight;
        comparisons++;
      }
    }
  }

  return comparisons > 0 ? totalConfidence / comparisons : 0;
}

/**
 * Normalize email for comparison (lower-cased, surrounding whitespace removed).
 */
export function normalizeEmail(email: string): string {
  return email.toLowerCase().trim();
}

/**
 * Normalize phone number for comparison by removing every non-digit character.
 */
export function normalizePhone(phone: string): string {
  return phone.replace(/\D/g, ''); // Remove all non-digits
}

/**
 * Calculate string similarity using Levenshtein distance.
 *
 * Both inputs are lower-cased and trimmed first.
 *
 * @returns 1 for equal strings (including two empty strings), otherwise
 *   `1 - distance / max(length)`, a value in [0, 1).
 */
export function calculateStringSimilarity(str1: string, str2: string): number {
  const s1 = str1.toLowerCase().trim();
  const s2 = str2.toLowerCase().trim();

  if (s1 === s2) return 1.0;

  const distance = levenshteinDistance(s1, s2);
  const maxLength = Math.max(s1.length, s2.length);

  return maxLength > 0 ? 1 - (distance / maxLength) : 0;
}

/**
 * Calculate Levenshtein distance between two strings (unit cost for insert,
 * delete and substitute), comparing UTF-16 code units.
 */
function levenshteinDistance(str1: string, str2: string): number {
  // matrix[j][i] = distance between the first i units of str1 and the first
  // j units of str2.
  const matrix: number[][] = Array.from(
    { length: str2.length + 1 },
    () => new Array<number>(str1.length + 1).fill(0)
  );

  for (let i = 0; i <= str1.length; i += 1) {
    matrix[0][i] = i;
  }

  for (let j = 0; j <= str2.length; j += 1) {
    matrix[j][0] = j;
  }

  for (let j = 1; j <= str2.length; j += 1) {
    for (let i = 1; i <= str1.length; i += 1) {
      const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1;
      matrix[j][i] = Math.min(
        matrix[j][i - 1] + 1, // deletion
        matrix[j - 1][i] + 1, // insertion
        matrix[j - 1][i - 1] + indicator // substitution
      );
    }
  }

  return matrix[str2.length][str1.length];
}

/**
 * Create configuration for expense duplicate detection.
 *
 * Expenses are blocked by normalized payer plus the date part of `Time`, so
 * only expenses by the same payer on the same date are compared.
 */
export function createExpenseConfig(): DuplicateDetectionConfig {
  return {
    type: 'expense',

    // Group by normalized payer name and date for blocking
    getKey: (expense) => {
      const payer = expense.Payer ? normalizePersonName(expense.Payer) : 'unknown';
      const date = expense.Time ? expense.Time.split('T')[0] : 'nodate';
      return `${payer}_${date}`;
    },

    getId: (expense) => expense.Id,

    rules: [
      {
        name: 'Exact match',
        weight: 1.0,
        check: (exp1, exp2) => {
          return exp1['Establishment Name'] === exp2['Establishment Name'] &&
            exp1.Price === exp2.Price &&
            exp1.Time === exp2.Time;
        }
      },
      {
        name: 'Same day, same details',
        weight: 0.95,
        check: (exp1, exp2) => {
          const date1 = exp1.Time?.split('T')[0];
          const date2 = exp2.Time?.split('T')[0];

          return normalizePersonName(exp1.Payer || '') === normalizePersonName(exp2.Payer || '') &&
            exp1['Establishment Name'] === exp2['Establishment Name'] &&
            exp1.Price === exp2.Price &&
            date1 === date2;
        }
      },
      {
        name: 'Close time proximity',
        weight: 0.9,
        check: (exp1, exp2) => {
          if (!exp1.Time || !exp2.Time) return false;

          const time1 = new Date(exp1.Time).getTime();
          const time2 = new Date(exp2.Time).getTime();
          const timeDiff = Math.abs(time1 - time2);

          return timeDiff < 5 * 60 * 1000 && // 5 minutes
            exp1['Establishment Name'] === exp2['Establishment Name'] &&
            exp1.Price === exp2.Price;
        }
      }
    ],

    maxGroupSize: 50,
    maxComparisons: 10000
  };
}

/**
 * Create configuration for interest duplicate detection.
 *
 * Blocking uses the first available of: name prefix, email domain, phone
 * prefix. Because only records in the same block are compared, two records
 * with the same email or phone but different name prefixes are never compared.
 */
export function createInterestConfig(): DuplicateDetectionConfig {
  return {
    type: 'interest',

    // Group by normalized name prefix for blocking to catch name-based duplicates
    getKey: (interest) => {
      // Priority 1: Use normalized name prefix (first 3 chars) to catch name duplicates
      if (interest['Full Name']) {
        const name = interest['Full Name'].toLowerCase().trim();
        const prefix = name.substring(0, 3);
        return `name_${prefix}`;
      }

      // Priority 2: Use email domain for email-based grouping
      if (interest['Email Address']) {
        const email = normalizeEmail(interest['Email Address']);
        const domain = email.split('@')[1] || 'unknown';
        return `email_${domain}`;
      }

      // Priority 3: Use phone prefix
      if (interest['Phone Number']) {
        const phone = normalizePhone(interest['Phone Number']);
        const prefix = phone.length >= 4 ? phone.substring(0, 4) : phone;
        return `phone_${prefix}`;
      }

      return 'unknown';
    },

    getId: (interest) => interest.Id,

    rules: [
      {
        name: 'Same email',
        weight: 1.0,
        check: (int1, int2) => {
          const email1 = int1['Email Address'];
          const email2 = int2['Email Address'];
          // Explicit guard so the rule returns a real boolean, not a string.
          if (!email1 || !email2) return false;
          return normalizeEmail(email1) === normalizeEmail(email2);
        }
      },
      {
        name: 'Same phone',
        weight: 1.0,
        check: (int1, int2) => {
          const phone1 = normalizePhone(int1['Phone Number'] || '');
          const phone2 = normalizePhone(int2['Phone Number'] || '');
          // At least 8 digits and equal implies both are non-empty.
          return phone1.length >= 8 && phone1 === phone2;
        }
      },
      {
        name: 'Similar name and address',
        weight: 0.8,
        check: (int1, int2) => {
          if (!int1['Full Name'] || !int2['Full Name']) return false;

          const nameSimilarity = calculateStringSimilarity(int1['Full Name'], int2['Full Name']);
          if (nameSimilarity <= 0.9) return false;

          // If names are very similar, check address too
          if (int1.Address && int2.Address) {
            const addressSimilarity = calculateStringSimilarity(int1.Address, int2.Address);
            return addressSimilarity > 0.8;
          }

          return true; // Similar names, no address to compare
        }
      }
    ],

    maxGroupSize: 50,
    maxComparisons: 10000
  };
}