feat: Implement centralized duplicate detection utility for expenses and interests
This commit is contained in:
408
server/utils/duplicate-detection.ts
Normal file
408
server/utils/duplicate-detection.ts
Normal file
@@ -0,0 +1,408 @@
|
||||
import { normalizePersonName } from './nocodb';
|
||||
|
||||
/**
 * Settings that drive one duplicate-detection run over records of type `T`.
 */
export interface DuplicateDetectionConfig<T> {
  /** Which record family is being scanned. */
  type: 'expense' | 'interest';

  /**
   * Produces the blocking key for a record. Only records that share a key
   * are ever compared with each other.
   */
  getKey: (record: T) => string;

  /** Produces the record's unique numeric identifier. */
  getId: (record: T) => number;

  /** Duplicate rules, tried in array order; the first rule that matches a pair wins. */
  rules: DuplicateRule<T>[];

  /** Blocks holding more records than this are skipped entirely. */
  maxGroupSize?: number;

  /** Upper bound on the total number of pairwise comparisons in one run. */
  maxComparisons?: number;
}
|
||||
|
||||
/**
 * One named test that decides whether two records are duplicates.
 */
export interface DuplicateRule<T> {
  /** Human-readable label; it is reported as the match reason of a group. */
  name: string;

  /** Confidence contributed by a pair that matches through this rule. */
  weight: number;

  /** Returns true when the two records count as duplicates under this rule. */
  check: (first: T, second: T) => boolean;
}
|
||||
|
||||
/**
 * A set of records that were judged to be duplicates of one another.
 */
export interface DuplicateGroup<T> {
  /** Group label, unique within a single detection run. */
  id: string;

  /** Every record that belongs to the group. */
  items: T[];

  /** Comma-separated names of the rules that produced the matches. */
  matchReason: string;

  /** Average weight of the rules that matched pairs inside the group. */
  confidence: number;

  /** The record chosen as the one to keep. */
  masterCandidate: T;
}
|
||||
|
||||
/**
 * Detect groups of duplicate records using a blocking strategy.
 *
 * Records are first bucketed by `config.getKey`; comparisons only happen
 * between records of the same bucket. Inside a bucket each not-yet-grouped
 * record acts as an anchor and is compared against the records after it.
 * Every record that matches the anchor through some rule joins the anchor's
 * group and is excluded from later comparisons, so a record ends up in at
 * most one group.
 *
 * @param items - Records to scan.
 * @param config - Key/id extractors, rules and optional limits.
 * @returns One entry per group that contains at least two records.
 */
export function findDuplicates<T>(
  items: T[],
  config: DuplicateDetectionConfig<T>
): DuplicateGroup<T>[] {
  console.log(`[DUPLICATES] Starting detection for ${items.length} ${config.type}s`);

  if (items.length === 0) return [];

  // Phase 1: bucket the records by blocking key.
  const blocks = new Map<string, T[]>();
  for (const item of items) {
    const key = config.getKey(item);
    const bucket = blocks.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      blocks.set(key, [item]);
    }
  }

  console.log(`[DUPLICATES] Created ${blocks.size} blocks from ${items.length} items`);

  // Phase 2: look for duplicates inside each bucket.
  const { maxGroupSize, maxComparisons, rules } = config;
  const duplicateGroups: DuplicateGroup<T>[] = [];
  const processedIds = new Set<number>();
  let totalComparisons = 0;

  // A missing or zero limit means "unlimited".
  const limitReached = (): boolean =>
    maxComparisons ? totalComparisons >= maxComparisons : false;

  for (const [blockKey, blockItems] of blocks) {
    // Oversized buckets are too expensive to compare pairwise.
    if (maxGroupSize && blockItems.length > maxGroupSize) {
      console.log(`[DUPLICATES] Skipping large block "${blockKey}" with ${blockItems.length} items`);
      continue;
    }

    // A single record has nothing to be compared with.
    if (blockItems.length < 2) continue;

    console.log(`[DUPLICATES] Processing block "${blockKey}" with ${blockItems.length} items`);

    for (const [anchorIndex, anchor] of blockItems.entries()) {
      const anchorId = config.getId(anchor);
      if (processedIds.has(anchorId)) continue;

      const group: T[] = [anchor];
      const matchedRules = new Set<string>();

      for (const candidate of blockItems.slice(anchorIndex + 1)) {
        const candidateId = config.getId(candidate);
        if (processedIds.has(candidateId)) continue;

        totalComparisons += 1;

        // The first rule that accepts the pair decides the match reason.
        const matchingRule = rules.find(rule => rule.check(anchor, candidate));

        if (matchingRule) {
          console.log(`[DUPLICATES] Match found: ${anchorId} vs ${candidateId} (rule: ${matchingRule.name})`);
          group.push(candidate);
          matchedRules.add(matchingRule.name);
          processedIds.add(candidateId);
        }

        if (limitReached()) {
          console.log(`[DUPLICATES] Hit comparison limit of ${maxComparisons}`);
          break;
        }
      }

      // Matches collected before the limit was hit are still reported.
      if (group.length > 1) {
        processedIds.add(anchorId);

        const masterCandidate = selectMasterCandidate(group, config.type);
        const confidence = calculateGroupConfidence(group, rules);

        duplicateGroups.push({
          id: `group_${duplicateGroups.length + 1}`,
          items: group,
          matchReason: [...matchedRules].join(', '),
          confidence,
          masterCandidate
        });
      }

      if (limitReached()) break;
    }

    if (limitReached()) break;
  }

  console.log(`[DUPLICATES] Completed ${totalComparisons} comparisons, found ${duplicateGroups.length} duplicate groups`);
  return duplicateGroups;
}
|
||||
|
||||
/**
 * Pick the record of a group that should be kept as the master.
 *
 * The record with the highest completeness score wins; on a tie the record
 * that appears earlier in `items` is kept.
 *
 * @param items - Group members.
 * @param type - Record family, forwarded to the completeness scorer.
 * @returns The chosen record.
 */
function selectMasterCandidate<T>(items: T[], type: 'expense' | 'interest'): T {
  const keepMoreComplete = (leader: T, challenger: T): T => {
    const leaderScore = calculateCompletenessScore(leader, type);
    const challengerScore = calculateCompletenessScore(challenger, type);
    if (challengerScore > leaderScore) {
      return challenger;
    }
    return leader;
  };

  return items.reduce(keepMoreComplete);
}
|
||||
|
||||
/**
 * Score how completely a record is filled in, so that the most complete
 * record of a duplicate group can be preferred.
 *
 * A fixed list of fields is inspected per record type. Each filled field adds
 * one point; expenses with `Contents` longer than 10 characters get +0.5, and
 * records created less than 30 / 90 days ago get +0.3 / +0.15. The sum is
 * divided by the number of inspected fields, so the result can exceed 1.
 *
 * @param item - Raw record; fields are read by their column names.
 * @param type - Selects which field list is inspected.
 * @returns The normalised score, or 0 when `type` selects no field list.
 */
function calculateCompletenessScore(item: any, type: 'expense' | 'interest'): number {
  const trackedFields: string[] =
    type === 'expense'
      ? ['Establishment Name', 'Price', 'Payer', 'Category', 'Contents', 'Time']
      : type === 'interest'
        ? ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired']
        : [];

  // A plain truthiness test would treat a numeric 0 (e.g. a price of 0) as
  // "missing"; numbers count as filled unless they are NaN.
  const isFilled = (value: any): boolean => {
    if (typeof value === 'number') return !Number.isNaN(value);
    return Boolean(value) && value.toString().trim().length > 0;
  };

  let score = trackedFields.filter(field => isFilled(item[field])).length;

  // Bonus for detailed contents (expenses only).
  if (type === 'expense' && item.Contents && item.Contents.length > 10) {
    score += 0.5;
  }

  // Bonus for recent creation; both column spellings are accepted.
  const createdField = item['Created At'] || item.CreatedAt;
  if (createdField) {
    const millisPerDay = 1000 * 60 * 60 * 24;
    const daysOld = (Date.now() - new Date(createdField).getTime()) / millisPerDay;

    // An unparseable date yields NaN, which fails both comparisons: no bonus.
    if (daysOld < 30) score += 0.3;
    else if (daysOld < 90) score += 0.15;
  }

  return trackedFields.length > 0 ? score / trackedFields.length : 0;
}
|
||||
|
||||
/**
 * Compute the confidence of a duplicate group.
 *
 * Every unordered pair of group members is checked against the rules; for
 * each pair the first matching rule contributes its weight. The result is the
 * mean of those weights. Pairs that match no rule are ignored.
 *
 * @param items - Group members.
 * @param rules - Rules, tried in array order.
 * @returns The mean weight, or 0 for fewer than two items or no matching pair.
 */
function calculateGroupConfidence<T>(items: T[], rules: DuplicateRule<T>[]): number {
  if (items.length < 2) return 0;

  const pairWeights: number[] = [];

  items.forEach((left, leftIndex) => {
    for (const right of items.slice(leftIndex + 1)) {
      const firstMatch = rules.find(rule => rule.check(left, right));
      if (firstMatch) {
        pairWeights.push(firstMatch.weight);
      }
    }
  });

  if (pairWeights.length === 0) return 0;

  const weightSum = pairWeights.reduce((sum, weight) => sum + weight, 0);
  return weightSum / pairWeights.length;
}
|
||||
|
||||
/**
 * Bring an email address into a canonical form for comparison:
 * lower-cased, with surrounding whitespace removed.
 *
 * @param email - Address as entered.
 * @returns The canonical address.
 */
export function normalizeEmail(email: string): string {
  const lowered = email.toLowerCase();
  return lowered.trim();
}
|
||||
|
||||
/**
 * Reduce a phone number to its digits so that differently formatted
 * spellings of the same number compare equal.
 *
 * @param phone - Number as entered (may contain spaces, dashes, "+", ...).
 * @returns Only the digit characters, in their original order.
 */
export function normalizePhone(phone: string): string {
  const everythingButDigits = /\D/g;
  return phone.replace(everythingButDigits, '');
}
|
||||
|
||||
/**
 * Similarity of two strings in the range 0..1, based on Levenshtein distance.
 *
 * Both inputs are lower-cased and trimmed first. Equal strings (including two
 * empty ones) score 1; otherwise the score is one minus the edit distance
 * divided by the length of the longer string.
 *
 * @param str1 - First string.
 * @param str2 - Second string.
 * @returns 1 for identical strings, lower values for more differing ones.
 */
export function calculateStringSimilarity(str1: string, str2: string): number {
  const canonical = (text: string): string => text.toLowerCase().trim();
  const left = canonical(str1);
  const right = canonical(str2);

  if (left === right) {
    return 1.0;
  }

  const edits = levenshteinDistance(left, right);
  const longest = Math.max(left.length, right.length);

  if (longest > 0) {
    return 1 - (edits / longest);
  }
  return 0;
}
|
||||
|
||||
/**
 * Levenshtein edit distance between two strings: the minimum number of
 * single-character insertions, deletions and substitutions that turn one
 * string into the other. Characters are compared by string index.
 *
 * Only two rows of the dynamic-programming table are kept at a time.
 *
 * @param str1 - First string (table columns).
 * @param str2 - Second string (table rows).
 * @returns The edit distance.
 */
function levenshteinDistance(str1: string, str2: string): number {
  // Row 0: turning an empty prefix of str2 into each prefix of str1.
  let previousRow: number[] = Array.from({ length: str1.length + 1 }, (_, column) => column);

  for (let row = 1; row <= str2.length; row++) {
    // Column 0: turning a prefix of str2 into the empty string.
    const currentRow: number[] = [row];

    for (let column = 1; column <= str1.length; column++) {
      const substitutionCost = str1[column - 1] === str2[row - 1] ? 0 : 1;
      currentRow.push(
        Math.min(
          currentRow[column - 1] + 1, // deletion
          previousRow[column] + 1, // insertion
          previousRow[column - 1] + substitutionCost // substitution
        )
      );
    }

    previousRow = currentRow;
  }

  return previousRow[str1.length];
}
|
||||
|
||||
/**
 * Build the duplicate-detection configuration for expense records.
 *
 * Expenses are blocked by normalised payer name plus calendar day (the part
 * of `Time` before the "T"), and matched by three rules of decreasing weight.
 *
 * @returns A fresh configuration object.
 */
export function createExpenseConfig(): DuplicateDetectionConfig<any> {
  const FIVE_MINUTES_MS = 5 * 60 * 1000;

  // Part of the timestamp before "T"; undefined when Time is null/undefined.
  const calendarDay = (expense: any) => expense.Time?.split('T')[0];

  const sameVendorAndPrice = (a: any, b: any): boolean =>
    a['Establishment Name'] === b['Establishment Name'] && a.Price === b.Price;

  const expenseConfig: DuplicateDetectionConfig<any> = {
    type: 'expense',

    // Blocking key: "<payer>_<day>", with placeholders for missing values.
    getKey: (expense) => {
      const payerPart = expense.Payer ? normalizePersonName(expense.Payer) : 'unknown';
      const dayPart = expense.Time ? expense.Time.split('T')[0] : 'nodate';
      return [payerPart, dayPart].join('_');
    },

    getId: (expense) => expense.Id,

    rules: [
      {
        // Same establishment, price and full timestamp.
        name: 'Exact match',
        weight: 1.0,
        check: (exp1, exp2) => sameVendorAndPrice(exp1, exp2) && exp1.Time === exp2.Time
      },
      {
        // Same payer (after normalisation), establishment, price and day.
        name: 'Same day, same details',
        weight: 0.95,
        check: (exp1, exp2) => {
          const day1 = calendarDay(exp1);
          const day2 = calendarDay(exp2);
          const samePayer =
            normalizePersonName(exp1.Payer || '') === normalizePersonName(exp2.Payer || '');

          return samePayer && sameVendorAndPrice(exp1, exp2) && day1 === day2;
        }
      },
      {
        // Same establishment and price, recorded less than five minutes apart.
        name: 'Close time proximity',
        weight: 0.9,
        check: (exp1, exp2) => {
          if (!exp1.Time || !exp2.Time) return false;

          const gapMs = Math.abs(new Date(exp1.Time).getTime() - new Date(exp2.Time).getTime());
          return gapMs < FIVE_MINUTES_MS && sameVendorAndPrice(exp1, exp2);
        }
      }
    ],

    maxGroupSize: 50,
    maxComparisons: 10000
  };

  return expenseConfig;
}
|
||||
|
||||
/**
 * Build the duplicate-detection configuration for interest records.
 *
 * Interests are blocked by email domain when an email is present, otherwise
 * by the first four digits of the phone number, otherwise under a shared
 * "unknown" key. Three rules decide whether two records of a block match.
 *
 * Every rule returns a strict boolean, as the `check` contract declares.
 *
 * @returns A fresh configuration object.
 */
export function createInterestConfig(): DuplicateDetectionConfig<any> {
  return {
    type: 'interest',

    // Group by normalized email domain or phone prefix for blocking.
    // NOTE(review): records are only compared inside one block, and blocks
    // larger than maxGroupSize are skipped, so a domain shared by more than
    // 50 records is never checked, and a phone match between records with
    // different email domains cannot be found. Confirm this is acceptable.
    getKey: (interest) => {
      if (interest['Email Address']) {
        const email = normalizeEmail(interest['Email Address']);
        const domain = email.split('@')[1] || 'unknown';
        return `email_${domain}`;
      }

      if (interest['Phone Number']) {
        const phone = normalizePhone(interest['Phone Number']);
        const prefix = phone.length >= 4 ? phone.substring(0, 4) : phone;
        return `phone_${prefix}`;
      }

      return 'unknown';
    },

    getId: (interest) => interest.Id,

    rules: [
      {
        name: 'Same email',
        weight: 1.0,
        check: (int1, int2) => {
          const email1 = int1['Email Address'];
          const email2 = int2['Email Address'];

          // Missing addresses never match (and must yield false, not undefined/'').
          if (!email1 || !email2) return false;

          return normalizeEmail(email1) === normalizeEmail(email2);
        }
      },
      {
        name: 'Same phone',
        weight: 1.0,
        check: (int1, int2) => {
          const phone1 = normalizePhone(int1['Phone Number'] || '');
          const phone2 = normalizePhone(int2['Phone Number'] || '');

          // Require at least 8 digits so short fragments do not count as a match;
          // the length test also rules out empty numbers.
          return phone1.length >= 8 && phone1 === phone2;
        }
      },
      {
        name: 'Similar name and address',
        weight: 0.8,
        check: (int1, int2) => {
          if (!int1['Full Name'] || !int2['Full Name']) return false;

          const nameSimilarity = calculateStringSimilarity(int1['Full Name'], int2['Full Name']);
          if (nameSimilarity <= 0.9) return false;

          // Names are very similar: confirm with the address when both exist.
          if (int1.Address && int2.Address) {
            return calculateStringSimilarity(int1.Address, int2.Address) > 0.8;
          }

          return true; // Similar names, no address to compare
        }
      }
    ],

    maxGroupSize: 50,
    maxComparisons: 10000
  };
}
|
||||
Reference in New Issue
Block a user