feat: Implement centralized duplicate detection utility for expenses and interests

2025-07-12 13:42:53 -04:00 · 2025-07-12 13:42:53 -04:00 · 080cb60d71
parent b8a6a52417
commit 080cb60d71
3 changed files with 441 additions and 512 deletions
--- a/server/api/expenses/duplicates/find.ts
+++ b/server/api/expenses/duplicates/find.ts
@@ -1,5 +1,6 @@
 import { requireSalesOrAdmin } from '~/server/utils/auth';
 import { getNocoDbConfiguration, normalizePersonName } from '~/server/utils/nocodb';
+import { findDuplicates, createExpenseConfig } from '~/server/utils/duplicate-detection';
 import type { Expense } from '~/utils/types';

 export default defineEventHandler(async (event) => {
@@ -35,21 +36,31 @@ export default defineEventHandler(async (event) => {
    const expenses = response.list || [];
    console.log('[EXPENSES] Analyzing', expenses.length, 'expenses for duplicates');

-    // Find duplicate groups
-    const duplicateGroups = findDuplicateExpenses(expenses);
+    // Find duplicate groups using the new centralized utility
+    const duplicateConfig = createExpenseConfig();
+    const duplicateGroups = findDuplicates(expenses, duplicateConfig);
+    
+    // Convert to the expected format
+    const formattedGroups = duplicateGroups.map(group => ({
+      id: group.id,
+      expenses: group.items,
+      matchReason: group.matchReason,
+      confidence: group.confidence,
+      masterCandidate: group.masterCandidate
+    }));
    
    // Also find payer name variations
    const payerVariations = findPayerNameVariations(expenses);

-    console.log('[EXPENSES] Found', duplicateGroups.length, 'duplicate groups and', payerVariations.length, 'payer variations');
+    console.log('[EXPENSES] Found', formattedGroups.length, 'duplicate groups and', payerVariations.length, 'payer variations');

    return {
      success: true,
      data: {
-        duplicateGroups,
+        duplicateGroups: formattedGroups,
        payerVariations,
        totalExpenses: expenses.length,
-        duplicateCount: duplicateGroups.reduce((sum, group) => sum + group.expenses.length, 0),
+        duplicateCount: formattedGroups.reduce((sum, group) => sum + group.expenses.length, 0),
        dateRange: {
          start: startDate.toISOString().split('T')[0],
          end: endDate.toISOString().split('T')[0]
@@ -74,71 +85,6 @@ export default defineEventHandler(async (event) => {
  }
 });

-/**
- * Find duplicate expenses based on multiple criteria
- */
-function findDuplicateExpenses(expenses: any[]) {
-  console.log('[EXPENSES] Starting duplicate detection for', expenses.length, 'expenses');
-  
-  const duplicateGroups: Array<{
-    id: string;
-    expenses: any[];
-    matchReason: string;
-    confidence: number;
-    masterCandidate: any;
-  }> = [];
-
-  const processedIds = new Set<number>();
-  let comparisons = 0;
-
-  for (let i = 0; i < expenses.length; i++) {
-    const expense1 = expenses[i];
-    
-    if (processedIds.has(expense1.Id)) continue;
-
-    const matches = [expense1];
-    let matchReasons = new Set<string>();
-    
-    for (let j = i + 1; j < expenses.length; j++) {
-      const expense2 = expenses[j];
-      
-      if (processedIds.has(expense2.Id)) continue;
-
-      const similarity = calculateExpenseSimilarity(expense1, expense2);
-      comparisons++;
-      
-      console.log(`[EXPENSES] Comparing ${expense1.Id} vs ${expense2.Id}: score=${similarity.score.toFixed(3)}, threshold=0.7`);
-      
-      if (similarity.score >= 0.7) { // Lower threshold for expenses
-        console.log(`[EXPENSES] MATCH FOUND! ${expense1.Id} vs ${expense2.Id} (score: ${similarity.score.toFixed(3)})`);
-        console.log('[EXPENSES] Match reasons:', similarity.reasons);
-        matches.push(expense2);
-        processedIds.add(expense2.Id);
-        similarity.reasons.forEach(r => matchReasons.add(r));
-      }
-    }
-
-    if (matches.length > 1) {
-      // Mark all as processed
-      matches.forEach(match => processedIds.add(match.Id));
-
-      // Determine the best master candidate
-      const masterCandidate = selectMasterExpense(matches);
-      
-      duplicateGroups.push({
-        id: `group_${duplicateGroups.length + 1}`,
-        expenses: matches,
-        matchReason: Array.from(matchReasons).join(', '),
-        confidence: Math.max(...matches.slice(1).map(match => 
-          calculateExpenseSimilarity(masterCandidate, match).score
-        )),
-        masterCandidate
-      });
-    }
-  }
-
-  return duplicateGroups;
-}

 /**
 * Find payer name variations (like "Abbie" vs "abbie")
@@ -181,154 +127,3 @@ function findPayerNameVariations(expenses: any[]) {
  
  return variations.sort((a, b) => b.expenseCount - a.expenseCount);
 }
-
-/**
- * Calculate similarity between two expenses
- */
-function calculateExpenseSimilarity(expense1: any, expense2: any) {
-  const scores: Array<{ type: string; score: number; weight: number }> = [];
-  const reasons: string[] = [];
-
-  // Exact match on establishment, price, and date (highest weight for true duplicates)
-  if (expense1['Establishment Name'] === expense2['Establishment Name'] &&
-      expense1.Price === expense2.Price &&
-      expense1.Time === expense2.Time) {
-    scores.push({ type: 'exact', score: 1.0, weight: 0.5 });
-    reasons.push('Exact match');
-  }
-
-  // Same payer, establishment, and price on same day (likely duplicate)
-  const date1 = expense1.Time?.split('T')[0];
-  const date2 = expense2.Time?.split('T')[0];
-  
-  if (normalizePersonName(expense1.Payer) === normalizePersonName(expense2.Payer) &&
-      expense1['Establishment Name'] === expense2['Establishment Name'] &&
-      expense1.Price === expense2.Price &&
-      date1 === date2) {
-    scores.push({ type: 'same-day', score: 0.95, weight: 0.4 });
-    reasons.push('Same person, place, amount on same day');
-  }
-
-  // Similar establishment names with same price and payer
-  if (expense1['Establishment Name'] && expense2['Establishment Name']) {
-    const nameSimilarity = calculateStringSimilarity(
-      expense1['Establishment Name'],
-      expense2['Establishment Name']
-    );
-    
-    if (nameSimilarity > 0.8 && 
-        expense1.Price === expense2.Price &&
-        normalizePersonName(expense1.Payer) === normalizePersonName(expense2.Payer)) {
-      scores.push({ type: 'similar', score: nameSimilarity, weight: 0.3 });
-      reasons.push('Similar establishment name');
-    }
-  }
-
-  // Time proximity check (within 5 minutes)
-  if (expense1.Time && expense2.Time) {
-    const time1 = new Date(expense1.Time).getTime();
-    const time2 = new Date(expense2.Time).getTime();
-    const timeDiff = Math.abs(time1 - time2);
-    
-    if (timeDiff < 5 * 60 * 1000 && // 5 minutes
-        expense1['Establishment Name'] === expense2['Establishment Name']) {
-      scores.push({ type: 'time-proximity', score: 0.9, weight: 0.2 });
-      reasons.push('Within 5 minutes at same establishment');
-    }
-  }
-
-  // Calculate weighted average
-  const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
-  const weightedScore = totalWeight > 0 
-    ? scores.reduce((sum, s) => sum + (s.score * s.weight), 0) / totalWeight 
-    : 0;
-
-  return {
-    score: weightedScore,
-    reasons,
-    details: scores
-  };
-}
-
-/**
- * Calculate string similarity using Levenshtein distance
- */
-function calculateStringSimilarity(str1: string, str2: string): number {
-  const s1 = str1.toLowerCase().trim();
-  const s2 = str2.toLowerCase().trim();
-  
-  if (s1 === s2) return 1.0;
-  
-  const distance = levenshteinDistance(s1, s2);
-  const maxLength = Math.max(s1.length, s2.length);
-  
-  return maxLength > 0 ? 1 - (distance / maxLength) : 0;
-}
-
-/**
- * Calculate Levenshtein distance between two strings
- */
-function levenshteinDistance(str1: string, str2: string): number {
-  const matrix = Array(str2.length + 1).fill(null).map(() => Array(str1.length + 1).fill(null));
-
-  for (let i = 0; i <= str1.length; i += 1) {
-    matrix[0][i] = i;
-  }
-
-  for (let j = 0; j <= str2.length; j += 1) {
-    matrix[j][0] = j;
-  }
-
-  for (let j = 1; j <= str2.length; j += 1) {
-    for (let i = 1; i <= str1.length; i += 1) {
-      const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1;
-      matrix[j][i] = Math.min(
-        matrix[j][i - 1] + 1, // deletion
-        matrix[j - 1][i] + 1, // insertion
-        matrix[j - 1][i - 1] + indicator // substitution
-      );
-    }
-  }
-
-  return matrix[str2.length][str1.length];
-}
-
-/**
- * Select the best master expense from a group
- */
-function selectMasterExpense(expenses: any[]) {
-  return expenses.reduce((best, current) => {
-    const bestScore = calculateExpenseCompletenessScore(best);
-    const currentScore = calculateExpenseCompletenessScore(current);
-    
-    return currentScore > bestScore ? current : best;
-  });
-}
-
-/**
- * Calculate completeness score for an expense
- */
-function calculateExpenseCompletenessScore(expense: any): number {
-  const fields = ['Establishment Name', 'Price', 'Payer', 'Category', 'Contents', 'Time'];
-  const filledFields = fields.filter(field => 
-    expense[field] && expense[field].toString().trim().length > 0
-  );
-  
-  let score = filledFields.length / fields.length;
-  
-  // Bonus for having contents description
-  if (expense.Contents && expense.Contents.length > 10) {
-    score += 0.2;
-  }
-  
-  // Bonus for recent creation (more likely to be accurate)
-  if (expense.CreatedAt) {
-    const created = new Date(expense.CreatedAt);
-    const now = new Date();
-    const hoursOld = (now.getTime() - created.getTime()) / (1000 * 60 * 60);
-    
-    if (hoursOld < 24) score += 0.1;
-  }
-  
-  return Math.min(score, 1.0);
-}
--- a/server/api/interests/duplicates/find.ts
+++ b/server/api/interests/duplicates/find.ts
@@ -1,6 +1,7 @@
 import { requireSalesOrAdmin } from '~/server/utils/auth';
 import { getNocoDbConfiguration } from '~/server/utils/nocodb';
 import { logAuditEvent } from '~/server/utils/audit-logger';
+import { findDuplicates, createInterestConfig } from '~/server/utils/duplicate-detection';

 export default defineEventHandler(async (event) => {
  console.log('[INTERESTS] Find duplicates request');
@@ -40,16 +41,26 @@ export default defineEventHandler(async (event) => {
    const interests = response.list || [];
    console.log('[INTERESTS] Analyzing', interests.length, 'interests for duplicates');

-    // Find potential duplicates
-    const duplicateGroups = findDuplicateInterests(interests, threshold);
+    // Find duplicate groups using the new centralized utility
+    const duplicateConfig = createInterestConfig();
+    const duplicateGroups = findDuplicates(interests, duplicateConfig);
    
-    console.log('[INTERESTS] Found', duplicateGroups.length, 'duplicate groups');
+    // Convert to the expected format
+    const formattedGroups = duplicateGroups.map(group => ({
+      id: group.id,
+      interests: group.items,
+      matchReason: group.matchReason,
+      confidence: group.confidence,
+      masterCandidate: group.masterCandidate
+    }));
+
+    console.log('[INTERESTS] Found', formattedGroups.length, 'duplicate groups');

    // Log the audit event
    await logAuditEvent(event, 'FIND_INTEREST_DUPLICATES', 'interest', {
      changes: {
        totalInterests: interests.length,
-        duplicateGroups: duplicateGroups.length,
+        duplicateGroups: formattedGroups.length,
        threshold,
        dateRange
      }
@@ -58,9 +69,9 @@ export default defineEventHandler(async (event) => {
    return {
      success: true,
      data: {
-        duplicateGroups,
+        duplicateGroups: formattedGroups,
        totalInterests: interests.length,
-        duplicateCount: duplicateGroups.reduce((sum, group) => sum + group.interests.length, 0),
+        duplicateCount: formattedGroups.reduce((sum, group) => sum + group.interests.length, 0),
        threshold,
        dateRange
      }
@@ -82,288 +93,3 @@ export default defineEventHandler(async (event) => {
    };
  }
 });
-
-/**
- * Find duplicate interests based on multiple criteria
- */
-function findDuplicateInterests(interests: any[], threshold: number = 0.8) {
-  console.log('[INTERESTS] Starting duplicate detection with threshold:', threshold);
-  console.log('[INTERESTS] Total interests to analyze:', interests.length);
-  
-  const duplicateGroups: Array<{
-    id: string;
-    interests: any[];
-    matchReason: string;
-    confidence: number;
-    masterCandidate: any;
-  }> = [];
-
-  const processedIds = new Set<number>();
-  let comparisons = 0;
-
-  for (let i = 0; i < interests.length; i++) {
-    const interest1 = interests[i];
-    
-    if (processedIds.has(interest1.Id)) continue;
-
-    const matches = [interest1];
-    
-    for (let j = i + 1; j < interests.length; j++) {
-      const interest2 = interests[j];
-      
-      if (processedIds.has(interest2.Id)) continue;
-
-      const similarity = calculateSimilarity(interest1, interest2);
-      comparisons++;
-      
-      console.log(`[INTERESTS] Comparing ${interest1.Id} vs ${interest2.Id}: score=${similarity.score.toFixed(3)}, threshold=${threshold}`);
-      
-      if (similarity.score >= threshold) {
-        console.log(`[INTERESTS] MATCH FOUND! ${interest1.Id} vs ${interest2.Id} (score: ${similarity.score.toFixed(3)})`);
-        console.log('[INTERESTS] Match details:', similarity.details);
-        matches.push(interest2);
-        processedIds.add(interest2.Id);
-      }
-    }
-
-    if (matches.length > 1) {
-      console.log(`[INTERESTS] Creating duplicate group with ${matches.length} matches`);
-      
-      // Mark all as processed
-      matches.forEach(match => processedIds.add(match.Id));
-
-      // Determine the best master candidate (most complete record)
-      const masterCandidate = selectMasterCandidate(matches);
-      
-      // Calculate average confidence
-      const avgConfidence = matches.slice(1).reduce((sum, match) => {
-        return sum + calculateSimilarity(masterCandidate, match).score;
-      }, 0) / (matches.length - 1);
-      
-      duplicateGroups.push({
-        id: `group_${duplicateGroups.length + 1}`,
-        interests: matches,
-        matchReason: generateMatchReason(matches),
-        confidence: avgConfidence,
-        masterCandidate
-      });
-    }
-  }
-
-  console.log(`[INTERESTS] Completed ${comparisons} comparisons, found ${duplicateGroups.length} duplicate groups`);
-  return duplicateGroups;
-}
-
-/**
- * Calculate similarity between two interests
- */
-function calculateSimilarity(interest1: any, interest2: any) {
-  const scores: Array<{ type: string; score: number; weight: number }> = [];
-  
-  console.log(`[INTERESTS] Calculating similarity between:`, {
-    id1: interest1.Id,
-    name1: interest1['Full Name'],
-    email1: interest1['Email Address'],
-    phone1: interest1['Phone Number'],
-    id2: interest2.Id,
-    name2: interest2['Full Name'],
-    email2: interest2['Email Address'],
-    phone2: interest2['Phone Number']
-  });
-
-  // Email similarity (highest weight) - exact match required
-  if (interest1['Email Address'] && interest2['Email Address']) {
-    const email1 = normalizeEmail(interest1['Email Address']);
-    const email2 = normalizeEmail(interest2['Email Address']);
-    const emailScore = email1 === email2 ? 1.0 : 0.0;
-    scores.push({ type: 'email', score: emailScore, weight: 0.5 });
-    console.log(`[INTERESTS] Email comparison: "${email1}" vs "${email2}" = ${emailScore}`);
-  }
-
-  // Phone similarity - exact match on normalized numbers
-  if (interest1['Phone Number'] && interest2['Phone Number']) {
-    const phone1 = normalizePhone(interest1['Phone Number']);
-    const phone2 = normalizePhone(interest2['Phone Number']);
-    const phoneScore = phone1 === phone2 && phone1.length >= 8 ? 1.0 : 0.0; // Require at least 8 digits
-    scores.push({ type: 'phone', score: phoneScore, weight: 0.4 });
-    console.log(`[INTERESTS] Phone comparison: "${phone1}" vs "${phone2}" = ${phoneScore}`);
-  }
-
-  // Name similarity - fuzzy matching
-  if (interest1['Full Name'] && interest2['Full Name']) {
-    const nameScore = calculateNameSimilarity(interest1['Full Name'], interest2['Full Name']);
-    scores.push({ type: 'name', score: nameScore, weight: 0.3 });
-    console.log(`[INTERESTS] Name comparison: "${interest1['Full Name']}" vs "${interest2['Full Name']}" = ${nameScore.toFixed(3)}`);
-  }
-
-  // Address similarity
-  if (interest1.Address && interest2.Address) {
-    const addressScore = calculateStringSimilarity(interest1.Address, interest2.Address);
-    scores.push({ type: 'address', score: addressScore, weight: 0.2 });
-    console.log(`[INTERESTS] Address comparison: ${addressScore.toFixed(3)}`);
-  }
-
-  // Special case: if we have exact email OR phone match, give high score regardless of other fields
-  const hasExactEmailMatch = scores.find(s => s.type === 'email' && s.score === 1.0);
-  const hasExactPhoneMatch = scores.find(s => s.type === 'phone' && s.score === 1.0);
-  
-  if (hasExactEmailMatch || hasExactPhoneMatch) {
-    console.log('[INTERESTS] Exact email or phone match found - high confidence');
-    return {
-      score: 0.95, // High confidence for exact email/phone match
-      details: scores
-    };
-  }
-
-  // Calculate weighted average for other cases
-  const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
-  const weightedScore = scores.reduce((sum, s) => sum + (s.score * s.weight), 0) / (totalWeight || 1);
-
-  console.log(`[INTERESTS] Weighted score: ${weightedScore.toFixed(3)} (weights: ${totalWeight})`);
-
-  return {
-    score: weightedScore,
-    details: scores
-  };
-}
-
-/**
- * Normalize email for comparison
- */
-function normalizeEmail(email: string): string {
-  return email.toLowerCase().trim();
-}
-
-/**
- * Normalize phone number for comparison
- */
-function normalizePhone(phone: string): string {
-  return phone.replace(/\D/g, ''); // Remove all non-digits
-}
-
-/**
- * Calculate name similarity using Levenshtein distance
- */
-function calculateNameSimilarity(name1: string, name2: string): number {
-  const str1 = name1.toLowerCase().trim();
-  const str2 = name2.toLowerCase().trim();
-  
-  if (str1 === str2) return 1.0;
-  
-  const distance = levenshteinDistance(str1, str2);
-  const maxLength = Math.max(str1.length, str2.length);
-  
-  return maxLength > 0 ? 1 - (distance / maxLength) : 0;
-}
-
-/**
- * Calculate string similarity using Levenshtein distance
- */
-function calculateStringSimilarity(str1: string, str2: string): number {
-  const s1 = str1.toLowerCase().trim();
-  const s2 = str2.toLowerCase().trim();
-  
-  if (s1 === s2) return 1.0;
-  
-  const distance = levenshteinDistance(s1, s2);
-  const maxLength = Math.max(s1.length, s2.length);
-  
-  return maxLength > 0 ? 1 - (distance / maxLength) : 0;
-}
-
-/**
- * Calculate Levenshtein distance between two strings
- */
-function levenshteinDistance(str1: string, str2: string): number {
-  const matrix = Array(str2.length + 1).fill(null).map(() => Array(str1.length + 1).fill(null));
-
-  for (let i = 0; i <= str1.length; i += 1) {
-    matrix[0][i] = i;
-  }
-
-  for (let j = 0; j <= str2.length; j += 1) {
-    matrix[j][0] = j;
-  }
-
-  for (let j = 1; j <= str2.length; j += 1) {
-    for (let i = 1; i <= str1.length; i += 1) {
-      const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1;
-      matrix[j][i] = Math.min(
-        matrix[j][i - 1] + 1, // deletion
-        matrix[j - 1][i] + 1, // insertion
-        matrix[j - 1][i - 1] + indicator // substitution
-      );
-    }
-  }
-
-  return matrix[str2.length][str1.length];
-}
-
-/**
- * Select the best master candidate from a group of duplicates
- */
-function selectMasterCandidate(interests: any[]) {
-  return interests.reduce((best, current) => {
-    const bestScore = calculateCompletenessScore(best);
-    const currentScore = calculateCompletenessScore(current);
-    
-    return currentScore > bestScore ? current : best;
-  });
-}
-
-/**
- * Calculate completeness score for an interest record
- */
-function calculateCompletenessScore(interest: any): number {
-  const fields = ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired'];
-  const filledFields = fields.filter(field => 
-    interest[field] && interest[field].toString().trim().length > 0
-  );
-  
-  let score = filledFields.length / fields.length;
-  
-  // Bonus for recent creation
-  if (interest['Created At']) {
-    const created = new Date(interest['Created At']);
-    const now = new Date();
-    const daysOld = (now.getTime() - created.getTime()) / (1000 * 60 * 60 * 24);
-    
-    // More recent records get a small bonus
-    if (daysOld < 30) score += 0.1;
-    else if (daysOld < 90) score += 0.05;
-  }
-  
-  return score;
-}
-
-/**
- * Generate a descriptive match reason
- */
-function generateMatchReason(interests: any[]): string {
-  const reasons = [];
-  
-  // Check for exact email matches
-  const emails = interests.map(i => i['Email Address']).filter(Boolean);
-  if (emails.length > 1 && new Set(emails.map(e => normalizeEmail(e))).size === 1) {
-    reasons.push('Same email address');
-  }
-  
-  // Check for exact phone matches
-  const phones = interests.map(i => i['Phone Number']).filter(Boolean);
-  if (phones.length > 1 && new Set(phones.map(p => normalizePhone(p))).size === 1) {
-    reasons.push('Same phone number');
-  }
-  
-  // Check for similar names
-  const names = interests.map(i => i['Full Name']).filter(Boolean);
-  if (names.length > 1) {
-    const normalizedNames = names.map(n => n.toLowerCase().trim());
-    if (new Set(normalizedNames).size === 1) {
-      reasons.push('Same name');
-    } else {
-      reasons.push('Similar names');
-    }
-  }
-  
-  return reasons.length > 0 ? reasons.join(', ') : 'Multiple matching criteria';
-}
--- a/server/utils/duplicate-detection.ts
+++ b/server/utils/duplicate-detection.ts
@@ -0,0 +1,408 @@
+import { normalizePersonName } from './nocodb';
+
+/**
+ * Configuration consumed by `findDuplicates`: how to block, identify and compare items of type `T`.
+ */
+export interface DuplicateDetectionConfig<T> {
+  type: 'expense' | 'interest';          // Selects the field list used for completeness scoring; also appears in log output
+  
+  // Field extractors
+  getKey: (item: T) => string;           // Blocking key: only items that share the same key are ever compared
+  getId: (item: T) => number;            // Unique identifier; used to keep an item out of more than one group
+  
+  // Duplicate detection rules (tried in array order; the first rule whose check passes is the one recorded)
+  rules: DuplicateRule<T>[];
+  
+  // Performance settings (a value of 0 or undefined disables the corresponding limit)
+  maxGroupSize?: number;                 // Blocks with more items than this are skipped entirely
+  maxComparisons?: number;               // Detection stops once this many pairwise comparisons have run
+}
+
+/**
+ * A single pairwise duplicate test; `findDuplicates` treats two items as duplicates when any rule's `check` passes.
+ */
+export interface DuplicateRule<T> {
+  name: string; // Label joined into the resulting group's `matchReason`
+  weight: number; // Averaged over the group's matching pairs to produce its `confidence`
+  check: (item1: T, item2: T) => boolean; // Returns whether the two items match under this rule
+}
+
+/**
+ * One result group from `findDuplicates`: a seed item plus every later item in its block that matched the seed under some rule.
+ */
+export interface DuplicateGroup<T> {
+  id: string; // Sequential label of the form `group_<n>`, numbered from 1
+  items: T[]; // The seed item first, followed by the items that matched it
+  matchReason: string; // Names of the rules that produced the matches, joined with ', '
+  confidence: number; // Mean weight of the first matching rule over the group's matching pairs
+  masterCandidate: T; // Item in the group with the highest completeness score
+}
+
+/**
+ * Find duplicate groups: bucket `items` by `config.getKey`, compare pairs inside each bucket with `config.rules`, and return one `DuplicateGroup` per seed item that had matches (empty array when `items` is empty or nothing matches).
+ */
+export function findDuplicates<T>(
+  items: T[], 
+  config: DuplicateDetectionConfig<T>
+): DuplicateGroup<T>[] {
+  console.log(`[DUPLICATES] Starting detection for ${items.length} ${config.type}s`);
+  
+  if (items.length === 0) return [];
+  
+  // Phase 1: Bucket items by blocking key; items in different buckets are never compared with each other
+  const blocks = new Map<string, T[]>();
+  
+  items.forEach(item => {
+    const key = config.getKey(item);
+    if (!blocks.has(key)) {
+      blocks.set(key, []);
+    }
+    blocks.get(key)!.push(item);
+  });
+  
+  console.log(`[DUPLICATES] Created ${blocks.size} blocks from ${items.length} items`);
+  
+  // Phase 2: Find duplicates within each block
+  const duplicateGroups: DuplicateGroup<T>[] = [];
+  const processedIds = new Set<number>(); // Ids of items already placed in a group; they are neither seeded nor matched again
+  let totalComparisons = 0; // Running count across all blocks, checked against config.maxComparisons
+  
+  for (const [blockKey, blockItems] of blocks) {
+    // Skip large blocks that would be too expensive to process (their items are not checked for duplicates at all)
+    if (config.maxGroupSize && blockItems.length > config.maxGroupSize) {
+      console.log(`[DUPLICATES] Skipping large block "${blockKey}" with ${blockItems.length} items`);
+      continue;
+    }
+    
+    // Skip blocks with only one item
+    if (blockItems.length < 2) continue;
+    
+    console.log(`[DUPLICATES] Processing block "${blockKey}" with ${blockItems.length} items`);
+    
+    // Find duplicates within this block
+    for (let i = 0; i < blockItems.length; i++) {
+      const item1 = blockItems[i];
+      if (processedIds.has(config.getId(item1))) continue;
+      
+      const group = [item1]; // item1 seeds the group; later items are compared against item1 only
+      const matchedRules = new Set<string>(); // Distinct rule names that matched for this seed
+      
+      for (let j = i + 1; j < blockItems.length; j++) {
+        const item2 = blockItems[j];
+        if (processedIds.has(config.getId(item2))) continue;
+        
+        totalComparisons++;
+        
+        // Rules are tried in array order; only the first one that passes is recorded for this pair
+        const matchingRule = config.rules.find(rule => rule.check(item1, item2));
+        
+        if (matchingRule) {
+          console.log(`[DUPLICATES] Match found: ${config.getId(item1)} vs ${config.getId(item2)} (rule: ${matchingRule.name})`);
+          group.push(item2);
+          matchedRules.add(matchingRule.name);
+          processedIds.add(config.getId(item2));
+        }
+        
+        // Stop if we've hit the comparison limit (checked after the comparison, so the pair at the limit is still evaluated)
+        if (config.maxComparisons && totalComparisons >= config.maxComparisons) {
+          console.log(`[DUPLICATES] Hit comparison limit of ${config.maxComparisons}`);
+          break; // Leaves the inner loop only; the same condition below unwinds the outer loops
+        }
+      }
+      
+      // If we found duplicates, create a group (this still runs for a partial group when the limit was hit)
+      if (group.length > 1) {
+        processedIds.add(config.getId(item1));
+        
+        const masterCandidate = selectMasterCandidate(group, config.type);
+        const confidence = calculateGroupConfidence(group, config.rules);
+        
+        duplicateGroups.push({
+          id: `group_${duplicateGroups.length + 1}`,
+          items: group,
+          matchReason: Array.from(matchedRules).join(', '),
+          confidence,
+          masterCandidate
+        });
+      }
+      
+      if (config.maxComparisons && totalComparisons >= config.maxComparisons) { // Limit reached: stop scanning this block
+        break;
+      }
+    }
+    
+    if (config.maxComparisons && totalComparisons >= config.maxComparisons) { // Limit reached: stop scanning remaining blocks
+      break;
+    }
+  }
+  
+  console.log(`[DUPLICATES] Completed ${totalComparisons} comparisons, found ${duplicateGroups.length} duplicate groups`);
+  return duplicateGroups;
+}
+
+/**
+ * Return the item with the highest completeness score; on a tie the earlier item wins. Expects a non-empty array, since `reduce` is called without an initial value.
+ */
+function selectMasterCandidate<T>(items: T[], type: 'expense' | 'interest'): T {
+  return items.reduce((best, current) => {
+    const bestScore = calculateCompletenessScore(best, type);
+    const currentScore = calculateCompletenessScore(current, type);
+    return currentScore > bestScore ? current : best; // Strict '>' keeps the earlier item when scores are equal
+  });
+}
+
+/**
+ * Score how complete a record is: (filled known fields + bonuses) / number of known fields for `type`. Bonuses can push the result above 1.
+ */
+function calculateCompletenessScore(item: any, type: 'expense' | 'interest'): number {
+  let score = 0;
+  let totalFields = 0;
+  
+  if (type === 'expense') {
+    const fields = ['Establishment Name', 'Price', 'Payer', 'Category', 'Contents', 'Time'];
+    fields.forEach(field => {
+      totalFields++;
+      if (item[field] && item[field].toString().trim().length > 0) { // Falsy values (including 0) and blank strings count as empty
+        score++;
+      }
+    });
+    
+    // Bonus for detailed contents (a Contents value whose length exceeds 10)
+    if (item.Contents && item.Contents.length > 10) {
+      score += 0.5;
+    }
+  } else if (type === 'interest') {
+    const fields = ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired'];
+    fields.forEach(field => {
+      totalFields++;
+      if (item[field] && item[field].toString().trim().length > 0) { // Same emptiness test as for expenses
+        score++;
+      }
+    });
+  }
+  
+  // Bonus for recent creation; the timestamp is read from 'Created At' first, then from CreatedAt
+  if (item['Created At'] || item.CreatedAt) {
+    const createdField = item['Created At'] || item.CreatedAt;
+    const created = new Date(createdField);
+    const now = new Date();
+    const daysOld = (now.getTime() - created.getTime()) / (1000 * 60 * 60 * 24); // NaN for an unparseable date, which earns no bonus
+    
+    if (daysOld < 30) score += 0.3;
+    else if (daysOld < 90) score += 0.15;
+  }
+  
+  return totalFields > 0 ? score / totalFields : 0;
+}
+
+/**
+ * Average the weight of the first matching rule over every pair in the group that matches some rule; returns 0 for fewer than two items or when no pair matches.
+ */
+function calculateGroupConfidence<T>(items: T[], rules: DuplicateRule<T>[]): number {
+  if (items.length < 2) return 0;
+  
+  let totalConfidence = 0; // Sum of the weights of the matching rules
+  let comparisons = 0; // Counts only the pairs that matched a rule, not every pair examined
+  
+  for (let i = 0; i < items.length; i++) {
+    for (let j = i + 1; j < items.length; j++) {
+      const matchingRule = rules.find(rule => rule.check(items[i], items[j])); // First rule in array order that passes
+      if (matchingRule) {
+        totalConfidence += matchingRule.weight;
+        comparisons++;
+      }
+    }
+  }
+  
+  return comparisons > 0 ? totalConfidence / comparisons : 0;
+}
+
+/**
+ * Normalize an email for comparison: lower-case it, then trim leading and trailing whitespace.
+ */
+export function normalizeEmail(email: string): string {
+  return email.toLowerCase().trim();
+}
+
+/**
+ * Normalize a phone number for comparison by keeping only its digit characters (an input without digits yields '').
+ */
+export function normalizePhone(phone: string): string {
+  return phone.replace(/\D/g, ''); // Remove all non-digits
+}
+
+/**
+ * Similarity of two strings after lower-casing and trimming: 1.0 when equal, otherwise 1 - (Levenshtein distance / length of the longer string).
+ */
+export function calculateStringSimilarity(str1: string, str2: string): number {
+  const s1 = str1.toLowerCase().trim();
+  const s2 = str2.toLowerCase().trim();
+  
+  if (s1 === s2) return 1.0; // Also covers the case where both strings are empty after trimming
+  
+  const distance = levenshteinDistance(s1, s2);
+  const maxLength = Math.max(s1.length, s2.length);
+  
+  return maxLength > 0 ? 1 - (distance / maxLength) : 0; // maxLength is 0 only when both strings are empty, which already returned above
+}
+
+/**
+ * Levenshtein (edit) distance between two strings via a full (str2.length + 1) x (str1.length + 1) table; characters are compared by string index.
+ */
+function levenshteinDistance(str1: string, str2: string): number {
+  const matrix = Array(str2.length + 1).fill(null).map(() => Array(str1.length + 1).fill(null)); // matrix[j][i]: distance between the first i chars of str1 and the first j chars of str2
+
+  for (let i = 0; i <= str1.length; i += 1) {
+    matrix[0][i] = i; // Turning an empty prefix of str2 into i chars of str1 takes i edits
+  }
+
+  for (let j = 0; j <= str2.length; j += 1) {
+    matrix[j][0] = j; // Likewise for an empty prefix of str1
+  }
+
+  for (let j = 1; j <= str2.length; j += 1) {
+    for (let i = 1; i <= str1.length; i += 1) {
+      const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1; // Substitution is free when the two characters are equal
+      matrix[j][i] = Math.min(
+        matrix[j][i - 1] + 1, // deletion
+        matrix[j - 1][i] + 1, // insertion
+        matrix[j - 1][i - 1] + indicator // substitution
+      );
+    }
+  }
+
+  return matrix[str2.length][str1.length];
+}
+
+/**
+ * Build the expense duplicate-detection config: blocks by normalized payer plus the date part of Time, with three exact-field rules, a block-size cap of 50 and a 10000-comparison cap.
+ */
+export function createExpenseConfig(): DuplicateDetectionConfig<any> {
+  return {
+    type: 'expense',
+    
+    // Group by normalized payer name AND the date part of Time for blocking; 'unknown' / 'nodate' stand in for missing values
+    getKey: (expense) => {
+      const payer = expense.Payer ? normalizePersonName(expense.Payer) : 'unknown';
+      const date = expense.Time ? expense.Time.split('T')[0] : 'nodate'; // assumes Time is an ISO-8601-style string — TODO confirm
+      return `${payer}_${date}`;
+    },
+    
+    getId: (expense) => expense.Id,
+    
+    rules: [ // NOTE(review): findDuplicates only compares items sharing a getKey value, so every rule here effectively also requires same payer and same date
+      {
+        name: 'Exact match',
+        weight: 1.0,
+        check: (exp1, exp2) => {
+          return exp1['Establishment Name'] === exp2['Establishment Name'] && // NOTE(review): fields missing on both sides compare equal (undefined === undefined)
+                 exp1.Price === exp2.Price &&
+                 exp1.Time === exp2.Time;
+        }
+      },
+      {
+        name: 'Same day, same details',
+        weight: 0.95,
+        check: (exp1, exp2) => {
+          const date1 = exp1.Time?.split('T')[0];
+          const date2 = exp2.Time?.split('T')[0];
+          
+          return normalizePersonName(exp1.Payer || '') === normalizePersonName(exp2.Payer || '') && // Same payer after normalization
+                 exp1['Establishment Name'] === exp2['Establishment Name'] &&
+                 exp1.Price === exp2.Price &&
+                 date1 === date2; // Same date part; the time of day may differ
+        }
+      },
+      {
+        name: 'Close time proximity',
+        weight: 0.9,
+        check: (exp1, exp2) => {
+          if (!exp1.Time || !exp2.Time) return false; // Both timestamps are required for this rule
+          
+          const time1 = new Date(exp1.Time).getTime();
+          const time2 = new Date(exp2.Time).getTime();
+          const timeDiff = Math.abs(time1 - time2); // Milliseconds between the two timestamps
+          
+          return timeDiff < 5 * 60 * 1000 && // 5 minutes
+                 exp1['Establishment Name'] === exp2['Establishment Name'] &&
+                 exp1.Price === exp2.Price;
+        }
+      }
+    ],
+    
+    maxGroupSize: 50,
+    maxComparisons: 10000
+  };
+}
+
+/**
+ * Build the interest duplicate-detection config: blocks by email domain (or phone prefix when there is no email), with email, phone and name/address rules, a block-size cap of 50 and a 10000-comparison cap.
+ */
+export function createInterestConfig(): DuplicateDetectionConfig<any> {
+  return {
+    type: 'interest',
+    
+    // Group by normalized email domain, else by phone prefix, for blocking. NOTE(review): records keyed by different domains (or email vs phone) are never compared, even with identical phones — confirm intended
+    getKey: (interest) => {
+      if (interest['Email Address']) { // Email takes precedence over phone when both are present
+        const email = normalizeEmail(interest['Email Address']);
+        const domain = email.split('@')[1] || 'unknown'; // Segment after the first '@'; 'unknown' when that segment is missing or empty
+        return `email_${domain}`;
+      }
+      
+      if (interest['Phone Number']) {
+        const phone = normalizePhone(interest['Phone Number']);
+        const prefix = phone.length >= 4 ? phone.substring(0, 4) : phone; // First four digits, or all digits when fewer than four
+        return `phone_${prefix}`;
+      }
+      
+      return 'unknown'; // Neither email nor phone: all such records share one block
+    },
+    
+    getId: (interest) => interest.Id,
+    
+    rules: [
+      {
+        name: 'Same email',
+        weight: 1.0,
+        check: (int1, int2) => {
+          return int1['Email Address'] && int2['Email Address'] && // Yields a truthy/falsy value rather than a strict boolean when an email is missing
+                 normalizeEmail(int1['Email Address']) === normalizeEmail(int2['Email Address']);
+        }
+      },
+      {
+        name: 'Same phone',
+        weight: 1.0,
+        check: (int1, int2) => {
+          const phone1 = normalizePhone(int1['Phone Number'] || '');
+          const phone2 = normalizePhone(int2['Phone Number'] || '');
+          
+          return phone1 && phone2 && phone1.length >= 8 && phone1 === phone2; // Needs at least 8 digits; yields '' (falsy) rather than false when a phone is empty
+        }
+      },
+      {
+        name: 'Similar name and address',
+        weight: 0.8,
+        check: (int1, int2) => {
+          if (!int1['Full Name'] || !int2['Full Name']) return false; // Both names are required for this rule
+          
+          const nameSimilarity = calculateStringSimilarity(int1['Full Name'], int2['Full Name']);
+          
+          if (nameSimilarity > 0.9) {
+            // If names are very similar, check address too (only when both records have one)
+            if (int1.Address && int2.Address) {
+              const addressSimilarity = calculateStringSimilarity(int1.Address, int2.Address);
+              return addressSimilarity > 0.8;
+            }
+            return true; // Similar names, no address to compare
+          }
+          
+          return false;
+        }
+      }
+    ],
+    
+    maxGroupSize: 50, // NOTE(review): findDuplicates skips any block above this size, so a domain shared by more than 50 records is not checked at all — confirm acceptable
+    maxComparisons: 10000
+  };
+}