// port-nimara-client-portal/server/utils/duplicate-detection.ts

import { normalizePersonName } from './nocodb';

/**
 * Configuration for duplicate detection.
 *
 * Tells `findDuplicates` how to partition items into blocks, how to identify
 * individual items, and which pairwise rules decide that two items are
 * duplicates of each other.
 */
export interface DuplicateDetectionConfig<T> {
  // Record kind. It is interpolated into the start-of-run log line and is
  // passed to master-candidate selection, where it picks the list of fields
  // that count toward a record's completeness score.
  type: 'expense' | 'interest';

  // Field extractors
  // getKey: blocking key. Items are bucketed by this string and only items
  //         in the same bucket are ever compared with each other.
  // getId:  identifier used to remember which items already belong to a
  //         group so they are not grouped a second time.
  getKey: (item: T) => string;           // Primary grouping key for blocking
  getId: (item: T) => number;            // Unique identifier

  // Duplicate detection rules
  // For each compared pair the rules are tried in array order and the first
  // rule whose `check` passes is recorded as the match reason.
  rules: DuplicateRule<T>[];

  // Performance settings
  // Both limits are only applied when set to a truthy value (unset or 0
  // disables them).
  maxGroupSize?: number;                 // Skip groups larger than this
  maxComparisons?: number;               // Limit total comparisons
}

/**
 * A rule for detecting duplicates.
 *
 * A rule is a named pairwise predicate with an associated weight.
 */
export interface DuplicateRule<T> {
  // Label for the rule; it is written to the match log line and collected
  // into the resulting group's `matchReason`.
  name: string;
  // Contribution of this rule to a group's confidence: the confidence is the
  // average weight of the first-matching rule over the group's matching pairs.
  weight: number;
  // Returns true when the two items should be treated as duplicates.
  check: (item1: T, item2: T) => boolean;
}

/**
 * Result of duplicate detection: one set of items judged to be duplicates.
 */
export interface DuplicateGroup<T> {
  // Sequential identifier of the form `group_N`, numbered from 1 in the
  // order groups are discovered.
  id: string;
  // The anchor item followed by every item in its block that matched it.
  items: T[];
  // Distinct names of the rules that produced the matches, joined by ', '.
  matchReason: string;
  // Average weight of the first-matching rule across the pairs of `items`
  // that match at least one rule.
  confidence: number;
  // The item of the group with the highest completeness score (the earlier
  // item wins ties).
  masterCandidate: T;
}

/**
 * Find groups of duplicate items using a blocking strategy.
 *
 * Items are first bucketed by `config.getKey`; pairwise rule checks then run
 * only between items that share a bucket. Within a bucket each not-yet-grouped
 * item acts as an anchor and is compared against every later, not-yet-grouped
 * item. Items that match the anchor join its group and are excluded from any
 * further grouping. Matching is anchor-based, not transitive.
 *
 * Progress is reported through `console.log` with a `[DUPLICATES]` prefix.
 *
 * @param items  Records to scan.
 * @param config Key/id extractors, matching rules and optional limits.
 * @returns One entry per discovered group, in discovery order. Empty when
 *          `items` is empty or nothing matches.
 */
export function findDuplicates<T>(
  items: T[],
  config: DuplicateDetectionConfig<T>
): DuplicateGroup<T>[] {
  console.log(`[DUPLICATES] Starting detection for ${items.length} ${config.type}s`);

  if (items.length === 0) return [];

  // Phase 1: bucket the items by blocking key (insertion order is kept).
  const blocks = new Map<string, T[]>();
  items.forEach(item => {
    const key = config.getKey(item);
    const bucket = blocks.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      blocks.set(key, [item]);
    }
  });

  console.log(`[DUPLICATES] Created ${blocks.size} blocks from ${items.length} items`);

  // Phase 2: look for duplicates inside each bucket.
  const groups: DuplicateGroup<T>[] = [];
  const claimedIds = new Set<number>();
  let comparisonCount = 0;

  // True once a configured (truthy) comparison limit has been reached.
  const budgetSpent = (): boolean => {
    const limit = config.maxComparisons;
    return limit ? comparisonCount >= limit : false;
  };

  for (const [blockKey, members] of blocks) {
    // Oversized buckets are skipped outright to bound the quadratic work.
    if (config.maxGroupSize && members.length > config.maxGroupSize) {
      console.log(`[DUPLICATES] Skipping large block "${blockKey}" with ${members.length} items`);
      continue;
    }

    // A bucket needs at least two items to contain a pair.
    if (members.length < 2) continue;

    console.log(`[DUPLICATES] Processing block "${blockKey}" with ${members.length} items`);

    for (let anchorIdx = 0; anchorIdx < members.length; anchorIdx++) {
      const anchor = members[anchorIdx];
      if (claimedIds.has(config.getId(anchor))) continue;

      const cluster = [anchor];
      const ruleNames = new Set<string>();

      for (let otherIdx = anchorIdx + 1; otherIdx < members.length; otherIdx++) {
        const other = members[otherIdx];
        if (claimedIds.has(config.getId(other))) continue;

        comparisonCount += 1;

        // The first rule that accepts the pair decides the match reason.
        const hit = config.rules.find(rule => rule.check(anchor, other));

        if (hit) {
          console.log(`[DUPLICATES] Match found: ${config.getId(anchor)} vs ${config.getId(other)} (rule: ${hit.name})`);
          cluster.push(other);
          ruleNames.add(hit.name);
          claimedIds.add(config.getId(other));
        }

        if (budgetSpent()) {
          console.log(`[DUPLICATES] Hit comparison limit of ${config.maxComparisons}`);
          break;
        }
      }

      // Matches collected before the budget ran out are still reported.
      if (cluster.length > 1) {
        claimedIds.add(config.getId(anchor));

        const masterCandidate = selectMasterCandidate(cluster, config.type);
        const confidence = calculateGroupConfidence(cluster, config.rules);

        groups.push({
          id: `group_${groups.length + 1}`,
          items: cluster,
          matchReason: [...ruleNames].join(', '),
          confidence,
          masterCandidate
        });
      }

      if (budgetSpent()) break;
    }

    if (budgetSpent()) break;
  }

  console.log(`[DUPLICATES] Completed ${comparisonCount} comparisons, found ${groups.length} duplicate groups`);
  return groups;
}

/**
 * Pick the record that should act as the master of a duplicate group.
 *
 * The winner is the item with the highest completeness score for the given
 * record type. A later item replaces the current choice only when its score
 * is strictly greater, so the earliest item wins ties.
 *
 * @param items Group members; must not be empty (reducing an empty array throws).
 * @param type  Record kind forwarded to the completeness scoring.
 * @returns The chosen item (one of the elements of `items`).
 */
function selectMasterCandidate<T>(items: T[], type: 'expense' | 'interest'): T {
  const keepMoreComplete = (incumbent: T, challenger: T): T => {
    const incumbentScore = calculateCompletenessScore(incumbent, type);
    const challengerScore = calculateCompletenessScore(challenger, type);
    if (challengerScore > incumbentScore) {
      return challenger;
    }
    return incumbent;
  };

  return items.reduce(keepMoreComplete);
}

/**
 * Score how complete a record is, for prioritizing records within a group.
 *
 * Each tracked field of the record type earns one point when its value is
 * truthy and its string form is non-empty after trimming. Expenses earn an
 * extra 0.5 when `Contents` is longer than 10 characters. Records created
 * less than 30 days ago earn 0.3, and those less than 90 days ago earn 0.15
 * (read from `Created At`, falling back to `CreatedAt`). The total is divided
 * by the number of tracked fields, so the bonuses can push it above 1.
 *
 * @param item Record to score; fields are read by name.
 * @param type Record kind, selecting which fields are tracked.
 * @returns The normalized score, or 0 when no fields are tracked for `type`.
 */
function calculateCompletenessScore(item: any, type: 'expense' | 'interest'): number {
  const trackedFields: string[] =
    type === 'expense'
      ? ['Establishment Name', 'Price', 'Payer', 'Category', 'Contents', 'Time']
      : type === 'interest'
        ? ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired']
        : [];

  // A field counts when it is truthy and not just whitespace once stringified.
  const isFilledIn = (field: string): boolean =>
    Boolean(item[field]) && item[field].toString().trim().length > 0;

  let points = trackedFields.filter(isFilledIn).length;

  // Expenses with a detailed description get a small bonus.
  if (type === 'expense' && item.Contents && item.Contents.length > 10) {
    points += 0.5;
  }

  // Recently created records get a small bonus.
  const createdRaw = item['Created At'] || item.CreatedAt;
  if (createdRaw) {
    const MS_PER_DAY = 1000 * 60 * 60 * 24;
    const createdMs = new Date(createdRaw).getTime();
    const nowMs = new Date().getTime();
    const ageInDays = (nowMs - createdMs) / MS_PER_DAY;

    // An unparseable date yields NaN, which fails both comparisons.
    if (ageInDays < 30) {
      points += 0.3;
    } else if (ageInDays < 90) {
      points += 0.15;
    }
  }

  const fieldCount = trackedFields.length;
  return fieldCount > 0 ? points / fieldCount : 0;
}

/**
 * Compute the confidence score of a duplicate group.
 *
 * Every unordered pair of items is tested against the rules in order. A pair
 * that satisfies at least one rule contributes the weight of the first rule
 * it satisfies. The result is the mean of those weights. Pairs that match
 * no rule are ignored.
 *
 * @param items Members of the group.
 * @param rules Rules to evaluate, in priority order.
 * @returns The mean weight, or 0 for fewer than two items or no matching pair.
 */
function calculateGroupConfidence<T>(items: T[], rules: DuplicateRule<T>[]): number {
  if (items.length < 2) return 0;

  // Weight of the first matching rule for each matching pair, in pair order.
  const pairWeights: number[] = [];

  for (let left = 0; left < items.length; left++) {
    for (let right = left + 1; right < items.length; right++) {
      const firstHit = rules.find(rule => rule.check(items[left], items[right]));
      if (firstHit) {
        pairWeights.push(firstHit.weight);
      }
    }
  }

  if (pairWeights.length === 0) return 0;

  const weightSum = pairWeights.reduce((sum, weight) => sum + weight, 0);
  return weightSum / pairWeights.length;
}

/**
 * Put an email address into a canonical form for comparison:
 * lower-cased, with leading and trailing whitespace removed.
 *
 * @param email Raw email address.
 * @returns The normalized address.
 */
export function normalizeEmail(email: string): string {
  const lowered = email.toLowerCase();
  return lowered.trim();
}

/**
 * Reduce a phone number to its digits so that differently formatted
 * numbers can be compared.
 *
 * @param phone Raw phone number, possibly containing spaces, '+', dashes, etc.
 * @returns Only the digit characters of `phone`, in their original order.
 */
export function normalizePhone(phone: string): string {
  const nonDigits = /\D/g; // every character that is not 0-9
  const digitsOnly = phone.replace(nonDigits, '');
  return digitsOnly;
}

/**
 * Measure how similar two strings are on a 0..1 scale.
 *
 * Both inputs are lower-cased and trimmed first. Identical strings score 1.
 * Otherwise the score is one minus the Levenshtein distance divided by the
 * length of the longer string.
 *
 * @param str1 First string.
 * @param str2 Second string.
 * @returns Similarity, where 1 means equal after normalization.
 */
export function calculateStringSimilarity(str1: string, str2: string): number {
  const canonical = (text: string): string => text.toLowerCase().trim();

  const left = canonical(str1);
  const right = canonical(str2);

  if (left === right) {
    return 1.0;
  }

  const edits = levenshteinDistance(left, right);
  const longest = Math.max(left.length, right.length);

  if (longest > 0) {
    return 1 - (edits / longest);
  }
  return 0;
}

/**
 * Calculate the Levenshtein distance between two strings: the minimum number
 * of single-character insertions, deletions and substitutions needed to turn
 * one string into the other.
 *
 * Characters are compared by string index, i.e. per UTF-16 code unit.
 *
 * Only two rows of the dynamic-programming table are kept at a time, so
 * memory use is proportional to the shorter string, not to the product
 * of both lengths.
 *
 * @param str1 First string.
 * @param str2 Second string.
 * @returns The edit distance (0 when the strings are equal).
 */
function levenshteinDistance(str1: string, str2: string): number {
  // Trivial cases need no table at all.
  if (str1 === str2) return 0;
  if (str1.length === 0) return str2.length;
  if (str2.length === 0) return str1.length;

  // The distance is symmetric, so lay the shorter string along the row to
  // keep the two working rows as small as possible.
  const shorter = str1.length <= str2.length ? str1 : str2;
  const longer = str1.length <= str2.length ? str2 : str1;
  const width = shorter.length;

  // previousRow[c] = distance between the first c chars of `shorter` and the
  // prefix of `longer` handled so far (initially the empty prefix).
  let previousRow: number[] = Array.from({ length: width + 1 }, (_, col) => col);
  let currentRow: number[] = new Array<number>(width + 1).fill(0);

  for (let row = 1; row <= longer.length; row += 1) {
    currentRow[0] = row;

    for (let col = 1; col <= width; col += 1) {
      const substitutionCost = shorter[col - 1] === longer[row - 1] ? 0 : 1;
      currentRow[col] = Math.min(
        currentRow[col - 1] + 1,                   // insert into / delete from one side
        previousRow[col] + 1,                      // the opposite single-character edit
        previousRow[col - 1] + substitutionCost    // substitute (free when chars match)
      );
    }

    // Reuse the old row's storage for the next iteration.
    const recycled = previousRow;
    previousRow = currentRow;
    currentRow = recycled;
  }

  return previousRow[width];
}

/**
 * Create configuration for expense duplicate detection.
 *
 * Blocking: expenses are bucketed by normalized payer name plus the calendar
 * date taken from `Time`, so only expenses by the same payer on the same date
 * are compared. A consequence is that the "Close time proximity" rule cannot
 * pair two expenses that fall on different dates (e.g. either side of
 * midnight), because they land in different buckets.
 *
 * The date is the part of `Time` before the first `T` or whitespace, so both
 * "2024-03-01T10:00:00Z" and "2024-03-01 10:00:00" yield "2024-03-01". A
 * missing, blank or non-string `Time` is treated as having no date.
 *
 * Rules, in priority order:
 *  - "Exact match" (1.0): same establishment, price and `Time` value.
 *  - "Same day, same details" (0.95): same normalized payer, establishment,
 *    price and calendar date.
 *  - "Close time proximity" (0.9): same establishment and price, with
 *    timestamps less than five minutes apart.
 *
 * @returns A configuration with `maxGroupSize` 50 and `maxComparisons` 10000.
 */
export function createExpenseConfig(): DuplicateDetectionConfig<any> {
  // Calendar-date portion of a timestamp string, or undefined when there is
  // none. Accepts either 'T' or whitespace between the date and the time.
  const datePart = (time: unknown): string | undefined => {
    if (typeof time !== 'string') return undefined;
    const trimmed = time.trim();
    return trimmed.length > 0 ? trimmed.split(/[T\s]/)[0] : undefined;
  };

  return {
    type: 'expense',

    // Group by normalized payer name and calendar date for blocking
    getKey: (expense) => {
      const payer = expense.Payer ? normalizePersonName(expense.Payer) : 'unknown';
      const date = datePart(expense.Time) ?? 'nodate';
      return `${payer}_${date}`;
    },

    getId: (expense) => expense.Id,

    rules: [
      {
        name: 'Exact match',
        weight: 1.0,
        check: (exp1, exp2) => {
          return exp1['Establishment Name'] === exp2['Establishment Name'] &&
                 exp1.Price === exp2.Price &&
                 exp1.Time === exp2.Time;
        }
      },
      {
        name: 'Same day, same details',
        weight: 0.95,
        check: (exp1, exp2) => {
          return normalizePersonName(exp1.Payer || '') === normalizePersonName(exp2.Payer || '') &&
                 exp1['Establishment Name'] === exp2['Establishment Name'] &&
                 exp1.Price === exp2.Price &&
                 datePart(exp1.Time) === datePart(exp2.Time);
        }
      },
      {
        name: 'Close time proximity',
        weight: 0.9,
        check: (exp1, exp2) => {
          if (!exp1.Time || !exp2.Time) return false;

          const time1 = new Date(exp1.Time).getTime();
          const time2 = new Date(exp2.Time).getTime();
          // NaN (unparseable timestamp) fails the comparison below.
          const timeDiff = Math.abs(time1 - time2);

          return timeDiff < 5 * 60 * 1000 && // 5 minutes
                 exp1['Establishment Name'] === exp2['Establishment Name'] &&
                 exp1.Price === exp2.Price;
        }
      }
    ],

    maxGroupSize: 50,
    maxComparisons: 10000
  };
}

/**
 * Create configuration for interest duplicate detection.
 *
 * Blocking key, first applicable wins:
 *  1. `name_` + first three characters of the lower-cased, trimmed full name
 *     (skipped when the name is blank);
 *  2. `email_` + the domain of the normalized email address;
 *  3. `phone_` + the first four digits of the phone number;
 *  4. `unknown`.
 * Because each record gets exactly one key, two records are only compared
 * when they share it. For example, records with the same email but names that
 * start differently fall into different blocks.
 *
 * Rules, in priority order (each returns a strict boolean):
 *  - "Same email" (1.0): both emails present and equal after normalization.
 *  - "Same phone" (1.0): both numbers have the same digits, at least 8 of them.
 *  - "Similar name and address" (0.8): both names non-blank with similarity
 *    above 0.9. When both records have an address, the addresses must also
 *    be more than 0.8 similar.
 *
 * @returns A configuration with `maxGroupSize` 50 and `maxComparisons` 10000.
 */
export function createInterestConfig(): DuplicateDetectionConfig<any> {
  return {
    type: 'interest',

    // Group by normalized name prefix for blocking to catch name-based duplicates
    getKey: (interest) => {
      // Priority 1: Use normalized name prefix (first 3 chars) to catch name duplicates.
      // A whitespace-only name carries no information, so fall through to the other keys.
      const name = interest['Full Name']
        ? String(interest['Full Name']).toLowerCase().trim()
        : '';
      if (name.length > 0) {
        return `name_${name.substring(0, 3)}`;
      }

      // Priority 2: Use email domain for email-based grouping
      if (interest['Email Address']) {
        const email = normalizeEmail(interest['Email Address']);
        const domain = email.split('@')[1] || 'unknown';
        return `email_${domain}`;
      }

      // Priority 3: Use phone prefix
      if (interest['Phone Number']) {
        const phone = normalizePhone(interest['Phone Number']);
        const prefix = phone.length >= 4 ? phone.substring(0, 4) : phone;
        return `phone_${prefix}`;
      }

      return 'unknown';
    },

    getId: (interest) => interest.Id,

    rules: [
      {
        name: 'Same email',
        weight: 1.0,
        check: (int1, int2) => {
          if (!int1['Email Address'] || !int2['Email Address']) return false;

          const email1 = normalizeEmail(int1['Email Address']);
          const email2 = normalizeEmail(int2['Email Address']);

          // Whitespace-only values normalize to '' and must not count as a match.
          return email1.length > 0 && email1 === email2;
        }
      },
      {
        name: 'Same phone',
        weight: 1.0,
        check: (int1, int2) => {
          const phone1 = normalizePhone(int1['Phone Number'] || '');
          const phone2 = normalizePhone(int2['Phone Number'] || '');

          // Require a reasonably complete number so short fragments don't collide.
          return phone1.length >= 8 && phone1 === phone2;
        }
      },
      {
        name: 'Similar name and address',
        weight: 0.8,
        check: (int1, int2) => {
          const name1 = int1['Full Name'];
          const name2 = int2['Full Name'];
          if (!name1 || !name2) return false;

          // Blank names would otherwise compare as identical empty strings.
          if (String(name1).trim().length === 0 || String(name2).trim().length === 0) return false;

          const nameSimilarity = calculateStringSimilarity(name1, name2);
          if (nameSimilarity <= 0.9) return false;

          // If names are very similar, check address too
          if (int1.Address && int2.Address) {
            const addressSimilarity = calculateStringSimilarity(int1.Address, int2.Address);
            return addressSimilarity > 0.8;
          }

          return true; // Similar names, no address to compare
        }
      }
    ],

    maxGroupSize: 50,
    maxComparisons: 10000
  };
}