feat: Implement centralized duplicate detection utility for expenses and interests
This commit is contained in:
parent
b8a6a52417
commit
080cb60d71
|
|
@ -1,5 +1,6 @@
|
||||||
import { requireSalesOrAdmin } from '~/server/utils/auth';
|
import { requireSalesOrAdmin } from '~/server/utils/auth';
|
||||||
import { getNocoDbConfiguration, normalizePersonName } from '~/server/utils/nocodb';
|
import { getNocoDbConfiguration, normalizePersonName } from '~/server/utils/nocodb';
|
||||||
|
import { findDuplicates, createExpenseConfig } from '~/server/utils/duplicate-detection';
|
||||||
import type { Expense } from '~/utils/types';
|
import type { Expense } from '~/utils/types';
|
||||||
|
|
||||||
export default defineEventHandler(async (event) => {
|
export default defineEventHandler(async (event) => {
|
||||||
|
|
@ -35,21 +36,31 @@ export default defineEventHandler(async (event) => {
|
||||||
const expenses = response.list || [];
|
const expenses = response.list || [];
|
||||||
console.log('[EXPENSES] Analyzing', expenses.length, 'expenses for duplicates');
|
console.log('[EXPENSES] Analyzing', expenses.length, 'expenses for duplicates');
|
||||||
|
|
||||||
// Find duplicate groups
|
// Find duplicate groups using the new centralized utility
|
||||||
const duplicateGroups = findDuplicateExpenses(expenses);
|
const duplicateConfig = createExpenseConfig();
|
||||||
|
const duplicateGroups = findDuplicates(expenses, duplicateConfig);
|
||||||
|
|
||||||
|
// Convert to the expected format
|
||||||
|
const formattedGroups = duplicateGroups.map(group => ({
|
||||||
|
id: group.id,
|
||||||
|
expenses: group.items,
|
||||||
|
matchReason: group.matchReason,
|
||||||
|
confidence: group.confidence,
|
||||||
|
masterCandidate: group.masterCandidate
|
||||||
|
}));
|
||||||
|
|
||||||
// Also find payer name variations
|
// Also find payer name variations
|
||||||
const payerVariations = findPayerNameVariations(expenses);
|
const payerVariations = findPayerNameVariations(expenses);
|
||||||
|
|
||||||
console.log('[EXPENSES] Found', duplicateGroups.length, 'duplicate groups and', payerVariations.length, 'payer variations');
|
console.log('[EXPENSES] Found', formattedGroups.length, 'duplicate groups and', payerVariations.length, 'payer variations');
|
||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
data: {
|
data: {
|
||||||
duplicateGroups,
|
duplicateGroups: formattedGroups,
|
||||||
payerVariations,
|
payerVariations,
|
||||||
totalExpenses: expenses.length,
|
totalExpenses: expenses.length,
|
||||||
duplicateCount: duplicateGroups.reduce((sum, group) => sum + group.expenses.length, 0),
|
duplicateCount: formattedGroups.reduce((sum, group) => sum + group.expenses.length, 0),
|
||||||
dateRange: {
|
dateRange: {
|
||||||
start: startDate.toISOString().split('T')[0],
|
start: startDate.toISOString().split('T')[0],
|
||||||
end: endDate.toISOString().split('T')[0]
|
end: endDate.toISOString().split('T')[0]
|
||||||
|
|
@ -74,71 +85,6 @@ export default defineEventHandler(async (event) => {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
/**
 * Find duplicate expenses based on multiple criteria.
 *
 * Greedy single pass: each not-yet-grouped expense is compared with every later
 * not-yet-grouped expense, and any pair whose similarity score is at least 0.7
 * is placed in the same group. An expense belongs to at most one group.
 *
 * @param expenses Expense records; each is expected to carry a numeric `Id`.
 * @returns One entry per duplicate group: the grouped expenses, the union of
 *   match reasons, a confidence score and the suggested master record.
 */
function findDuplicateExpenses(expenses: any[]) {
  console.log('[EXPENSES] Starting duplicate detection for', expenses.length, 'expenses');

  const duplicateGroups: Array<{
    id: string;
    expenses: any[];
    matchReason: string;
    confidence: number;
    masterCandidate: any;
  }> = [];

  const processedIds = new Set<number>();
  let comparisons = 0;

  for (let i = 0; i < expenses.length; i++) {
    const expense1 = expenses[i];

    if (processedIds.has(expense1.Id)) continue;

    const matches = [expense1];
    const matchReasons = new Set<string>();

    for (let j = i + 1; j < expenses.length; j++) {
      const expense2 = expenses[j];

      if (processedIds.has(expense2.Id)) continue;

      const similarity = calculateExpenseSimilarity(expense1, expense2);
      comparisons++;

      console.log(`[EXPENSES] Comparing ${expense1.Id} vs ${expense2.Id}: score=${similarity.score.toFixed(3)}, threshold=0.7`);

      if (similarity.score >= 0.7) { // Lower threshold for expenses
        console.log(`[EXPENSES] MATCH FOUND! ${expense1.Id} vs ${expense2.Id} (score: ${similarity.score.toFixed(3)})`);
        console.log('[EXPENSES] Match reasons:', similarity.reasons);
        matches.push(expense2);
        processedIds.add(expense2.Id);
        similarity.reasons.forEach(r => matchReasons.add(r));
      }
    }

    if (matches.length > 1) {
      // Mark all as processed
      matches.forEach(match => processedIds.add(match.Id));

      // Determine the best master candidate
      const masterCandidate = selectMasterExpense(matches);

      // Score the master against every OTHER member of the group. Excluding by
      // position (not `slice(1)`) avoids comparing the master with itself when
      // it is not the first element, which would inflate the confidence.
      const masterIndex = matches.indexOf(masterCandidate);
      const others = matches.filter((_, index) => index !== masterIndex);

      duplicateGroups.push({
        id: `group_${duplicateGroups.length + 1}`,
        expenses: matches,
        matchReason: Array.from(matchReasons).join(', '),
        confidence: Math.max(...others.map(match =>
          calculateExpenseSimilarity(masterCandidate, match).score
        )),
        masterCandidate
      });
    }
  }

  console.log(`[EXPENSES] Completed ${comparisons} comparisons, found ${duplicateGroups.length} duplicate groups`);
  return duplicateGroups;
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find payer name variations (like "Abbie" vs "abbie")
|
* Find payer name variations (like "Abbie" vs "abbie")
|
||||||
|
|
@ -181,154 +127,3 @@ function findPayerNameVariations(expenses: any[]) {
|
||||||
|
|
||||||
return variations.sort((a, b) => b.expenseCount - a.expenseCount);
|
return variations.sort((a, b) => b.expenseCount - a.expenseCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Calculate similarity between two expenses.
 *
 * Up to four rules can fire; the result is the weight-averaged score of the
 * rules that fired (0 when none did):
 *  - exact:          same establishment, price and timestamp (1.0, weight 0.5)
 *  - same-day:       same payer, establishment and price on one calendar day (0.95, weight 0.4)
 *  - similar:        establishment-name similarity > 0.8 with same price and payer (weight 0.3)
 *  - time-proximity: same establishment within 5 minutes (0.9, weight 0.2)
 *
 * A field only counts as "equal" when it is actually present on both records,
 * so two records that merely share missing values are not reported as duplicates.
 *
 * @param expense1 First expense record.
 * @param expense2 Second expense record.
 * @returns `score` in [0, 1], human-readable `reasons`, and per-rule `details`.
 */
function calculateExpenseSimilarity(expense1: any, expense2: any) {
  const scores: Array<{ type: string; score: number; weight: number }> = [];
  const reasons: string[] = [];

  const name1 = expense1['Establishment Name'];
  const name2 = expense2['Establishment Name'];
  const hasNames = Boolean(name1) && Boolean(name2);
  const sameName = hasNames && name1 === name2;

  // `!= null` keeps a genuine price of 0 comparable while rejecting missing prices.
  const samePrice = expense1.Price != null && expense1.Price === expense2.Price;

  const hasTimes = Boolean(expense1.Time) && Boolean(expense2.Time);

  // NOTE(review): behaviour of normalizePersonName for a missing payer is not
  // visible here — confirm it does not make two payer-less records "equal".
  const samePayer = normalizePersonName(expense1.Payer) === normalizePersonName(expense2.Payer);

  // Exact match on establishment, price, and date (highest weight for true duplicates)
  if (sameName && samePrice && hasTimes && expense1.Time === expense2.Time) {
    scores.push({ type: 'exact', score: 1.0, weight: 0.5 });
    reasons.push('Exact match');
  }

  // Same payer, establishment, and price on same day (likely duplicate).
  // Only string timestamps are split, so a non-string Time no longer throws.
  const date1 = typeof expense1.Time === 'string' ? expense1.Time.split('T')[0] : undefined;
  const date2 = typeof expense2.Time === 'string' ? expense2.Time.split('T')[0] : undefined;

  if (samePayer && sameName && samePrice && date1 !== undefined && date1 === date2) {
    scores.push({ type: 'same-day', score: 0.95, weight: 0.4 });
    reasons.push('Same person, place, amount on same day');
  }

  // Similar establishment names with same price and payer
  if (hasNames) {
    const nameSimilarity = calculateStringSimilarity(name1, name2);

    if (nameSimilarity > 0.8 && samePrice && samePayer) {
      scores.push({ type: 'similar', score: nameSimilarity, weight: 0.3 });
      reasons.push('Similar establishment name');
    }
  }

  // Time proximity check (within 5 minutes)
  if (hasTimes) {
    const time1 = new Date(expense1.Time).getTime();
    const time2 = new Date(expense2.Time).getTime();
    // An unparseable timestamp yields NaN, which fails the `<` test below.
    const timeDiff = Math.abs(time1 - time2);

    if (timeDiff < 5 * 60 * 1000 && sameName) { // 5 minutes
      scores.push({ type: 'time-proximity', score: 0.9, weight: 0.2 });
      reasons.push('Within 5 minutes at same establishment');
    }
  }

  // Calculate weighted average
  const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
  const weightedScore = totalWeight > 0
    ? scores.reduce((sum, s) => sum + (s.score * s.weight), 0) / totalWeight
    : 0;

  return {
    score: weightedScore,
    reasons,
    details: scores
  };
}
|
|
||||||
|
|
||||||
/**
 * Score how alike two strings are on a 0–1 scale.
 *
 * Both inputs are lower-cased and trimmed first. Identical normalized strings
 * score 1; otherwise the score is one minus the Levenshtein distance divided
 * by the length of the longer normalized string.
 *
 * @param str1 First string.
 * @param str2 Second string.
 * @returns Similarity in [0, 1].
 */
function calculateStringSimilarity(str1: string, str2: string): number {
  const left = str1.toLowerCase().trim();
  const right = str2.toLowerCase().trim();

  if (left === right) {
    return 1.0;
  }

  const edits = levenshteinDistance(left, right);
  const longest = Math.max(left.length, right.length);

  if (longest > 0) {
    return 1 - (edits / longest);
  }
  return 0;
}
|
|
||||||
|
|
||||||
/**
 * Calculate Levenshtein distance between two strings.
 *
 * The distance is the minimum number of single-character insertions, deletions
 * and substitutions needed to turn one string into the other. Characters are
 * compared as UTF-16 code units.
 *
 * Only two rows of the dynamic-programming table are kept, so memory is
 * proportional to the shorter input instead of the product of both lengths.
 *
 * @param str1 First string.
 * @param str2 Second string.
 * @returns The edit distance (0 when the strings are equal).
 */
function levenshteinDistance(str1: string, str2: string): number {
  if (str1 === str2) return 0;
  if (str1.length === 0) return str2.length;
  if (str2.length === 0) return str1.length;

  // The distance is symmetric, so lay the shorter string along the row.
  const shorter = str1.length <= str2.length ? str1 : str2;
  const longer = str1.length <= str2.length ? str2 : str1;

  let previous: number[] = Array.from({ length: shorter.length + 1 }, (_, index) => index);
  let current: number[] = new Array<number>(shorter.length + 1).fill(0);

  for (let row = 1; row <= longer.length; row += 1) {
    current[0] = row;

    for (let col = 1; col <= shorter.length; col += 1) {
      const indicator = shorter[col - 1] === longer[row - 1] ? 0 : 1;
      current[col] = Math.min(
        current[col - 1] + 1,            // deletion
        previous[col] + 1,               // insertion
        previous[col - 1] + indicator    // substitution
      );
    }

    // Reuse the two buffers instead of allocating a new row each time.
    [previous, current] = [current, previous];
  }

  return previous[shorter.length];
}
|
|
||||||
|
|
||||||
/**
 * Select the best master expense from a group.
 *
 * The record with the highest completeness score wins; on a tie the earliest
 * record in the array is kept. Each record is scored exactly once.
 *
 * @param expenses Non-empty list of expense records.
 * @returns The chosen record (the same object reference that was passed in).
 * @throws TypeError when `expenses` is empty.
 */
function selectMasterExpense(expenses: any[]) {
  if (expenses.length === 0) {
    throw new TypeError('selectMasterExpense requires at least one expense');
  }

  let best = expenses[0];
  let bestScore = calculateExpenseCompletenessScore(best);

  for (let index = 1; index < expenses.length; index++) {
    const currentScore = calculateExpenseCompletenessScore(expenses[index]);
    // Strictly greater, so earlier records win ties.
    if (currentScore > bestScore) {
      best = expenses[index];
      bestScore = currentScore;
    }
  }

  return best;
}

/**
 * Calculate completeness score for an expense.
 *
 * The base score is the fraction of the six tracked fields that are filled.
 * A `Contents` value longer than 10 characters adds 0.2 and a `CreatedAt`
 * less than 24 hours old adds 0.1; the total is capped at 1.0.
 *
 * @param expense Expense record.
 * @returns Score in [0, 1].
 */
function calculateExpenseCompletenessScore(expense: any): number {
  const fields = ['Establishment Name', 'Price', 'Payer', 'Category', 'Contents', 'Time'];

  // A numeric 0 (e.g. a zero price) is real data, so it counts as filled;
  // a plain truthiness test would treat it as missing.
  const isFilled = (value: any): boolean => {
    if (value === null || value === undefined) return false;
    if (typeof value === 'number') return !Number.isNaN(value);
    return Boolean(value) && value.toString().trim().length > 0;
  };

  const filledFields = fields.filter(field => isFilled(expense[field]));

  let score = filledFields.length / fields.length;

  // Bonus for having contents description
  if (expense.Contents && expense.Contents.length > 10) {
    score += 0.2;
  }

  // Bonus for recent creation (more likely to be accurate)
  if (expense.CreatedAt) {
    const created = new Date(expense.CreatedAt);
    const now = new Date();
    // An unparseable CreatedAt gives NaN here, which simply earns no bonus.
    const hoursOld = (now.getTime() - created.getTime()) / (1000 * 60 * 60);

    if (hoursOld < 24) score += 0.1;
  }

  return Math.min(score, 1.0);
}
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import { requireSalesOrAdmin } from '~/server/utils/auth';
|
import { requireSalesOrAdmin } from '~/server/utils/auth';
|
||||||
import { getNocoDbConfiguration } from '~/server/utils/nocodb';
|
import { getNocoDbConfiguration } from '~/server/utils/nocodb';
|
||||||
import { logAuditEvent } from '~/server/utils/audit-logger';
|
import { logAuditEvent } from '~/server/utils/audit-logger';
|
||||||
|
import { findDuplicates, createInterestConfig } from '~/server/utils/duplicate-detection';
|
||||||
|
|
||||||
export default defineEventHandler(async (event) => {
|
export default defineEventHandler(async (event) => {
|
||||||
console.log('[INTERESTS] Find duplicates request');
|
console.log('[INTERESTS] Find duplicates request');
|
||||||
|
|
@ -40,16 +41,26 @@ export default defineEventHandler(async (event) => {
|
||||||
const interests = response.list || [];
|
const interests = response.list || [];
|
||||||
console.log('[INTERESTS] Analyzing', interests.length, 'interests for duplicates');
|
console.log('[INTERESTS] Analyzing', interests.length, 'interests for duplicates');
|
||||||
|
|
||||||
// Find potential duplicates
|
// Find duplicate groups using the new centralized utility
|
||||||
const duplicateGroups = findDuplicateInterests(interests, threshold);
|
const duplicateConfig = createInterestConfig();
|
||||||
|
const duplicateGroups = findDuplicates(interests, duplicateConfig);
|
||||||
|
|
||||||
|
// Convert to the expected format
|
||||||
|
const formattedGroups = duplicateGroups.map(group => ({
|
||||||
|
id: group.id,
|
||||||
|
interests: group.items,
|
||||||
|
matchReason: group.matchReason,
|
||||||
|
confidence: group.confidence,
|
||||||
|
masterCandidate: group.masterCandidate
|
||||||
|
}));
|
||||||
|
|
||||||
console.log('[INTERESTS] Found', duplicateGroups.length, 'duplicate groups');
|
console.log('[INTERESTS] Found', formattedGroups.length, 'duplicate groups');
|
||||||
|
|
||||||
// Log the audit event
|
// Log the audit event
|
||||||
await logAuditEvent(event, 'FIND_INTEREST_DUPLICATES', 'interest', {
|
await logAuditEvent(event, 'FIND_INTEREST_DUPLICATES', 'interest', {
|
||||||
changes: {
|
changes: {
|
||||||
totalInterests: interests.length,
|
totalInterests: interests.length,
|
||||||
duplicateGroups: duplicateGroups.length,
|
duplicateGroups: formattedGroups.length,
|
||||||
threshold,
|
threshold,
|
||||||
dateRange
|
dateRange
|
||||||
}
|
}
|
||||||
|
|
@ -58,9 +69,9 @@ export default defineEventHandler(async (event) => {
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
data: {
|
data: {
|
||||||
duplicateGroups,
|
duplicateGroups: formattedGroups,
|
||||||
totalInterests: interests.length,
|
totalInterests: interests.length,
|
||||||
duplicateCount: duplicateGroups.reduce((sum, group) => sum + group.interests.length, 0),
|
duplicateCount: formattedGroups.reduce((sum, group) => sum + group.interests.length, 0),
|
||||||
threshold,
|
threshold,
|
||||||
dateRange
|
dateRange
|
||||||
}
|
}
|
||||||
|
|
@ -82,288 +93,3 @@ export default defineEventHandler(async (event) => {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
/**
 * Find duplicate interests based on multiple criteria.
 *
 * Greedy single pass: each not-yet-grouped interest is compared with every
 * later not-yet-grouped interest, and any pair scoring at least `threshold`
 * lands in the same group. An interest belongs to at most one group.
 *
 * @param interests Interest records; each is expected to carry a numeric `Id`.
 * @param threshold Minimum similarity score for a pair to count as duplicates.
 * @returns One entry per duplicate group, with the average similarity between
 *   the chosen master record and the remaining members as `confidence`.
 */
function findDuplicateInterests(interests: any[], threshold: number = 0.8) {
  console.log('[INTERESTS] Starting duplicate detection with threshold:', threshold);
  console.log('[INTERESTS] Total interests to analyze:', interests.length);

  const duplicateGroups: Array<{
    id: string;
    interests: any[];
    matchReason: string;
    confidence: number;
    masterCandidate: any;
  }> = [];

  const processedIds = new Set<number>();
  let comparisons = 0;

  for (let i = 0; i < interests.length; i++) {
    const interest1 = interests[i];

    if (processedIds.has(interest1.Id)) continue;

    const matches = [interest1];

    for (let j = i + 1; j < interests.length; j++) {
      const interest2 = interests[j];

      if (processedIds.has(interest2.Id)) continue;

      const similarity = calculateSimilarity(interest1, interest2);
      comparisons++;

      console.log(`[INTERESTS] Comparing ${interest1.Id} vs ${interest2.Id}: score=${similarity.score.toFixed(3)}, threshold=${threshold}`);

      if (similarity.score >= threshold) {
        console.log(`[INTERESTS] MATCH FOUND! ${interest1.Id} vs ${interest2.Id} (score: ${similarity.score.toFixed(3)})`);
        console.log('[INTERESTS] Match details:', similarity.details);
        matches.push(interest2);
        processedIds.add(interest2.Id);
      }
    }

    if (matches.length > 1) {
      console.log(`[INTERESTS] Creating duplicate group with ${matches.length} matches`);

      // Mark all as processed
      matches.forEach(match => processedIds.add(match.Id));

      // Determine the best master candidate (most complete record)
      const masterCandidate = selectMasterCandidate(matches);

      // Average the master's similarity to every OTHER member. Excluding by
      // position (not `slice(1)`) keeps the master from being scored against
      // itself when it is not the first element of the group.
      const masterIndex = matches.indexOf(masterCandidate);
      const others = matches.filter((_, index) => index !== masterIndex);
      const avgConfidence = others.reduce((sum, match) => {
        return sum + calculateSimilarity(masterCandidate, match).score;
      }, 0) / others.length;

      duplicateGroups.push({
        id: `group_${duplicateGroups.length + 1}`,
        interests: matches,
        matchReason: generateMatchReason(matches),
        confidence: avgConfidence,
        masterCandidate
      });
    }
  }

  console.log(`[INTERESTS] Completed ${comparisons} comparisons, found ${duplicateGroups.length} duplicate groups`);
  return duplicateGroups;
}
|
|
||||||
|
|
||||||
/**
 * Score how likely two interest records describe the same person.
 *
 * Email, phone, name and address are each compared only when both records
 * carry a value. An exact email match, or an exact match on phone numbers of
 * at least 8 digits, short-circuits to a fixed score of 0.95. Otherwise the
 * result is the weight-averaged score of the fields that were compared
 * (email 0.5, phone 0.4, name 0.3, address 0.2), or 0 when none were.
 *
 * @param interest1 First interest record.
 * @param interest2 Second interest record.
 * @returns `score` in [0, 1] and the per-field `details` behind it.
 */
function calculateSimilarity(interest1: any, interest2: any) {
  const scores: Array<{ type: string; score: number; weight: number }> = [];

  console.log(`[INTERESTS] Calculating similarity between:`, {
    id1: interest1.Id,
    name1: interest1['Full Name'],
    email1: interest1['Email Address'],
    phone1: interest1['Phone Number'],
    id2: interest2.Id,
    name2: interest2['Full Name'],
    email2: interest2['Email Address'],
    phone2: interest2['Phone Number']
  });

  // Email: all-or-nothing on the normalized address.
  if (interest1['Email Address'] && interest2['Email Address']) {
    const email1 = normalizeEmail(interest1['Email Address']);
    const email2 = normalizeEmail(interest2['Email Address']);
    let emailScore = 0.0;
    if (email1 === email2) {
      emailScore = 1.0;
    }
    scores.push({ type: 'email', score: emailScore, weight: 0.5 });
    console.log(`[INTERESTS] Email comparison: "${email1}" vs "${email2}" = ${emailScore}`);
  }

  // Phone: all-or-nothing on the digit string, ignoring numbers shorter than 8 digits.
  if (interest1['Phone Number'] && interest2['Phone Number']) {
    const phone1 = normalizePhone(interest1['Phone Number']);
    const phone2 = normalizePhone(interest2['Phone Number']);
    const phonesAgree = phone1.length >= 8 && phone1 === phone2;
    const phoneScore = phonesAgree ? 1.0 : 0.0;
    scores.push({ type: 'phone', score: phoneScore, weight: 0.4 });
    console.log(`[INTERESTS] Phone comparison: "${phone1}" vs "${phone2}" = ${phoneScore}`);
  }

  // Name: fuzzy comparison.
  if (interest1['Full Name'] && interest2['Full Name']) {
    const nameScore = calculateNameSimilarity(interest1['Full Name'], interest2['Full Name']);
    scores.push({ type: 'name', score: nameScore, weight: 0.3 });
    console.log(`[INTERESTS] Name comparison: "${interest1['Full Name']}" vs "${interest2['Full Name']}" = ${nameScore.toFixed(3)}`);
  }

  // Address: fuzzy comparison.
  if (interest1.Address && interest2.Address) {
    const addressScore = calculateStringSimilarity(interest1.Address, interest2.Address);
    scores.push({ type: 'address', score: addressScore, weight: 0.2 });
    console.log(`[INTERESTS] Address comparison: ${addressScore.toFixed(3)}`);
  }

  // A perfect email or phone hit outweighs whatever the other fields say.
  const hasExactContactMatch = scores.some(
    entry => (entry.type === 'email' || entry.type === 'phone') && entry.score === 1.0
  );

  if (hasExactContactMatch) {
    console.log('[INTERESTS] Exact email or phone match found - high confidence');
    return {
      score: 0.95,
      details: scores
    };
  }

  // Otherwise fall back to the weighted average of the compared fields.
  let totalWeight = 0;
  let weightedSum = 0;
  for (const entry of scores) {
    totalWeight += entry.weight;
    weightedSum += entry.score * entry.weight;
  }
  const weightedScore = weightedSum / (totalWeight || 1);

  console.log(`[INTERESTS] Weighted score: ${weightedScore.toFixed(3)} (weights: ${totalWeight})`);

  return {
    score: weightedScore,
    details: scores
  };
}
|
|
||||||
|
|
||||||
/**
 * Put an email address into canonical form for equality checks:
 * lower-cased, with leading and trailing whitespace removed.
 *
 * @param email Raw email address.
 * @returns The normalized address.
 */
function normalizeEmail(email: string): string {
  const lowered = email.toLowerCase();
  return lowered.trim();
}
|
|
||||||
|
|
||||||
/**
 * Reduce a phone number to its ASCII digits so that differently formatted
 * numbers can be compared directly.
 *
 * @param phone Raw phone number.
 * @returns The digits of `phone` in order, or an empty string if it has none.
 */
function normalizePhone(phone: string): string {
  const digits = phone.match(/\d/g);
  if (digits === null) {
    return '';
  }
  return digits.join('');
}
|
|
||||||
|
|
||||||
/**
 * Calculate name similarity using Levenshtein distance.
 *
 * Names are compared exactly like any other free-text field, so this delegates
 * to `calculateStringSimilarity` instead of repeating its implementation.
 *
 * @param name1 First name.
 * @param name2 Second name.
 * @returns Similarity in [0, 1]; 1 means equal after lower-casing and trimming.
 */
function calculateNameSimilarity(name1: string, name2: string): number {
  return calculateStringSimilarity(name1, name2);
}

/**
 * Calculate string similarity using Levenshtein distance.
 *
 * Both inputs are lower-cased and trimmed. Identical normalized strings score
 * 1; otherwise the score is one minus the edit distance divided by the length
 * of the longer normalized string.
 *
 * @param str1 First string.
 * @param str2 Second string.
 * @returns Similarity in [0, 1].
 */
function calculateStringSimilarity(str1: string, str2: string): number {
  const s1 = str1.toLowerCase().trim();
  const s2 = str2.toLowerCase().trim();

  if (s1 === s2) return 1.0;

  const distance = levenshteinDistance(s1, s2);
  const maxLength = Math.max(s1.length, s2.length);

  return maxLength > 0 ? 1 - (distance / maxLength) : 0;
}

/**
 * Calculate Levenshtein distance between two strings.
 *
 * Characters are compared as UTF-16 code units. A single rolling row of the
 * dynamic-programming table is kept, so memory grows with the length of
 * `str1` rather than with the product of both lengths.
 *
 * @param str1 First string.
 * @param str2 Second string.
 * @returns Minimum number of insertions, deletions and substitutions needed
 *   to turn one string into the other.
 */
function levenshteinDistance(str1: string, str2: string): number {
  if (str1.length === 0) return str2.length;
  if (str2.length === 0) return str1.length;

  // row[i] holds the distance between the first i chars of str1 and the
  // prefix of str2 processed so far.
  const row: number[] = [];
  for (let i = 0; i <= str1.length; i += 1) {
    row.push(i);
  }

  for (let j = 1; j <= str2.length; j += 1) {
    // `diagonal` carries the previous row's value one column to the left,
    // which would otherwise be overwritten before it is needed.
    let diagonal = row[0];
    row[0] = j;

    for (let i = 1; i <= str1.length; i += 1) {
      const above = row[i];
      const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1;
      row[i] = Math.min(
        row[i - 1] + 1,       // deletion
        above + 1,            // insertion
        diagonal + indicator  // substitution
      );
      diagonal = above;
    }
  }

  return row[str1.length];
}
|
|
||||||
|
|
||||||
/**
 * Pick the record that should survive a merge: the one with the highest
 * completeness score. When scores tie, the record that appears first wins.
 *
 * @param interests Group of duplicate interest records (must not be empty).
 * @returns The winning record from `interests`.
 */
function selectMasterCandidate(interests: any[]) {
  return interests.reduce((leader, challenger) => {
    const leaderScore = calculateCompletenessScore(leader);
    const challengerScore = calculateCompletenessScore(challenger);

    if (challengerScore > leaderScore) {
      return challenger;
    }
    return leader;
  });
}
|
|
||||||
|
|
||||||
/**
 * Rate how complete an interest record is.
 *
 * The base score is the share of six tracked fields that hold a non-blank
 * value. Records with a `Created At` under 30 days old gain 0.1, and those
 * under 90 days old gain 0.05. The result is not capped, so it can exceed 1.
 *
 * @param interest Interest record.
 * @returns Completeness score (0 for an empty record).
 */
function calculateCompletenessScore(interest: any): number {
  const fields = ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired'];

  let filledCount = 0;
  for (const field of fields) {
    const value = interest[field];
    if (value && value.toString().trim().length > 0) {
      filledCount += 1;
    }
  }

  const baseScore = filledCount / fields.length;

  const createdAt = interest['Created At'];
  if (!createdAt) {
    return baseScore;
  }

  // Age in days; an unparseable date yields NaN and therefore no bonus.
  const created = new Date(createdAt);
  const now = new Date();
  const daysOld = (now.getTime() - created.getTime()) / (1000 * 60 * 60 * 24);

  if (daysOld < 30) {
    return baseScore + 0.1;
  }
  if (daysOld < 90) {
    return baseScore + 0.05;
  }
  return baseScore;
}
|
|
||||||
|
|
||||||
/**
 * Build a short human-readable explanation of why a group of interests was
 * flagged as duplicates.
 *
 * For email and phone, a reason is added when at least two records have a
 * value and all of those values normalize to the same string. For names,
 * "Same name" is reported when all present names are equal ignoring case and
 * surrounding whitespace, and "Similar names" otherwise.
 *
 * @param interests Records belonging to one duplicate group.
 * @returns Comma-separated reasons, or a generic fallback when none apply.
 */
function generateMatchReason(interests: any[]): string {
  // Values of one field across the group, skipping records where it is empty.
  const presentValues = (field: string): any[] =>
    interests.map(record => record[field]).filter(Boolean);
  const allIdentical = (values: string[]): boolean => new Set(values).size === 1;

  const reasons: string[] = [];

  // Check for exact email matches
  const emails = presentValues('Email Address');
  if (emails.length > 1 && allIdentical(emails.map(value => normalizeEmail(value)))) {
    reasons.push('Same email address');
  }

  // Check for exact phone matches
  const phones = presentValues('Phone Number');
  if (phones.length > 1 && allIdentical(phones.map(value => normalizePhone(value)))) {
    reasons.push('Same phone number');
  }

  // Check for similar names
  const names = presentValues('Full Name');
  if (names.length > 1) {
    const normalizedNames = names.map(value => value.toLowerCase().trim());
    reasons.push(allIdentical(normalizedNames) ? 'Same name' : 'Similar names');
  }

  if (reasons.length === 0) {
    return 'Multiple matching criteria';
  }
  return reasons.join(', ');
}
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,408 @@
|
||||||
|
import { normalizePersonName } from './nocodb';
|
||||||
|
|
||||||
|
/**
 * Configuration for duplicate detection.
 *
 * Describes how `findDuplicates` should partition, identify and compare
 * items of type `T`.
 */
export interface DuplicateDetectionConfig<T> {
  // Record kind; used in log output and passed on when choosing the master candidate.
  type: 'expense' | 'interest';

  // Field extractors
  // Items are only ever compared with other items that produce the same key.
  getKey: (item: T) => string; // Primary grouping key for blocking
  // Used to remember which items have already been placed in a group.
  getId: (item: T) => number; // Unique identifier

  // Duplicate detection rules
  // Tried in array order for each pair; the first rule whose check passes is recorded.
  rules: DuplicateRule<T>[];

  // Performance settings
  // Both limits are optional; an unset (or 0) value disables the limit.
  maxGroupSize?: number; // Skip groups larger than this
  maxComparisons?: number; // Limit total comparisons
}
|
||||||
|
|
||||||
|
/**
 * A rule for detecting duplicates.
 *
 * A pair of items is treated as a match as soon as one rule's `check`
 * returns true; that rule's `name` is reported as part of the match reason.
 */
export interface DuplicateRule<T> {
  // Label shown in logs and joined into the group's match reason.
  name: string;
  // NOTE(review): presumably feeds the group confidence calculation, which
  // receives the rule list — the consumer is not visible here; confirm.
  weight: number;
  // Returns true when the two items should be considered duplicates under this rule.
  check: (item1: T, item2: T) => boolean;
}
|
||||||
|
|
||||||
|
/**
 * Result of duplicate detection: one set of items judged to be duplicates
 * of each other.
 */
export interface DuplicateGroup<T> {
  // Sequential identifier of the form `group_<n>`, numbered from 1.
  id: string;
  // The item that started the group followed by every item that matched it.
  items: T[];
  // Names of the rules that produced the matches, joined with ", ".
  matchReason: string;
  // Confidence score assigned to the group as a whole.
  confidence: number;
  // The member of `items` suggested as the record to keep.
  masterCandidate: T;
}
|
||||||
|
|
||||||
|
/**
 * Detect groups of duplicate items using a blocking strategy.
 *
 * Items are first bucketed by `config.getKey`; only items in the same bucket
 * are compared. Within a bucket, each ungrouped item is checked against every
 * later ungrouped item, and a pair matches when any rule in `config.rules`
 * accepts it. Buckets larger than `config.maxGroupSize` are skipped, and the
 * whole scan stops once `config.maxComparisons` comparisons have been made
 * (a group already being built at that point is still emitted).
 *
 * @param items  Records to analyze.
 * @param config Extractors, rules and limits that drive the detection.
 * @returns The duplicate groups found, in discovery order.
 */
export function findDuplicates<T>(
  items: T[],
  config: DuplicateDetectionConfig<T>
): DuplicateGroup<T>[] {
  console.log(`[DUPLICATES] Starting detection for ${items.length} ${config.type}s`);

  if (items.length === 0) return [];

  // Phase 1: bucket items by blocking key.
  const blocks = new Map<string, T[]>();
  for (const item of items) {
    const key = config.getKey(item);
    const bucket = blocks.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      blocks.set(key, [item]);
    }
  }

  console.log(`[DUPLICATES] Created ${blocks.size} blocks from ${items.length} items`);

  // Phase 2: pairwise comparison inside each bucket.
  const duplicateGroups: DuplicateGroup<T>[] = [];
  const processedIds = new Set<number>();
  let totalComparisons = 0;

  // True once the optional comparison budget has been used up.
  const limitReached = (): boolean =>
    Boolean(config.maxComparisons && totalComparisons >= config.maxComparisons);

  scanBlocks:
  for (const [blockKey, blockItems] of blocks) {
    // Oversized buckets are too expensive to compare pairwise.
    if (config.maxGroupSize && blockItems.length > config.maxGroupSize) {
      console.log(`[DUPLICATES] Skipping large block "${blockKey}" with ${blockItems.length} items`);
      continue;
    }

    // A lone item has nothing to be compared with.
    if (blockItems.length < 2) continue;

    console.log(`[DUPLICATES] Processing block "${blockKey}" with ${blockItems.length} items`);

    for (let i = 0; i < blockItems.length; i++) {
      const item1 = blockItems[i];
      if (processedIds.has(config.getId(item1))) continue;

      const group = [item1];
      const matchedRules = new Set<string>();

      for (let j = i + 1; j < blockItems.length; j++) {
        const item2 = blockItems[j];
        if (processedIds.has(config.getId(item2))) continue;

        totalComparisons++;

        // The first rule that accepts the pair decides the match.
        const matchingRule = config.rules.find(rule => rule.check(item1, item2));

        if (matchingRule) {
          console.log(`[DUPLICATES] Match found: ${config.getId(item1)} vs ${config.getId(item2)} (rule: ${matchingRule.name})`);
          group.push(item2);
          matchedRules.add(matchingRule.name);
          processedIds.add(config.getId(item2));
        }

        if (limitReached()) {
          console.log(`[DUPLICATES] Hit comparison limit of ${config.maxComparisons}`);
          break;
        }
      }

      // Emit whatever was collected, even if the budget ran out mid-scan.
      if (group.length > 1) {
        processedIds.add(config.getId(item1));

        const masterCandidate = selectMasterCandidate(group, config.type);
        const confidence = calculateGroupConfidence(group, config.rules);

        duplicateGroups.push({
          id: `group_${duplicateGroups.length + 1}`,
          items: group,
          matchReason: Array.from(matchedRules).join(', '),
          confidence,
          masterCandidate
        });
      }

      // Once the budget is spent, abandon the remaining items and buckets.
      if (limitReached()) {
        break scanBlocks;
      }
    }
  }

  console.log(`[DUPLICATES] Completed ${totalComparisons} comparisons, found ${duplicateGroups.length} duplicate groups`);
  return duplicateGroups;
}
|
||||||
|
|
||||||
|
/**
 * Select the best master candidate from a group.
 *
 * The winner is the item with the highest completeness score; on a tie the
 * item that appears first in `items` is kept.
 *
 * @param items - Records that were grouped together as duplicates.
 * @param type - Record kind, forwarded to the completeness scorer.
 * @returns The highest-scoring item.
 * @throws TypeError if `items` is empty.
 */
function selectMasterCandidate<T>(items: T[], type: 'expense' | 'interest'): T {
  if (items.length === 0) {
    throw new TypeError('selectMasterCandidate requires at least one item');
  }

  // Score every item exactly once and carry the running best score along,
  // instead of re-scoring the current best on every step.
  let best = items[0];
  let bestScore = calculateCompletenessScore(best, type);

  for (let index = 1; index < items.length; index++) {
    const candidate = items[index];
    const candidateScore = calculateCompletenessScore(candidate, type);
    if (candidateScore > bestScore) {
      best = candidate;
      bestScore = candidateScore;
    }
  }

  return best;
}
|
||||||
|
|
||||||
|
/**
 * Calculate completeness score for prioritizing records.
 *
 * One point is awarded per tracked field that holds a value. Expenses get an
 * extra 0.5 when `Contents` is longer than 10 characters, and any record gets
 * 0.3 / 0.15 when its creation timestamp is less than 30 / 90 days old. The
 * sum is divided by the number of tracked fields, so the result can exceed 1
 * once bonuses are applied.
 *
 * @param item - Raw record, read by field name.
 * @param type - Selects which field list is scored.
 * @returns The normalized score, or 0 when no fields are tracked for `type`.
 */
function calculateCompletenessScore(item: any, type: 'expense' | 'interest'): number {
  const trackedFields: Record<string, readonly string[]> = {
    expense: ['Establishment Name', 'Price', 'Payer', 'Category', 'Contents', 'Time'],
    interest: ['Full Name', 'Email Address', 'Phone Number', 'Address', 'Extra Comments', 'Berth Size Desired']
  };

  // A field counts as filled when it carries real content. A numeric 0
  // (e.g. a zero price) is a value, not a missing field.
  const hasValue = (value: unknown): boolean => {
    if (value === null || value === undefined) return false;
    if (typeof value === 'number') return !Number.isNaN(value);
    if (typeof value === 'boolean') return value;
    return String(value).trim().length > 0;
  };

  const fields = trackedFields[type] ?? [];
  const totalFields = fields.length;
  let score = 0;

  for (const field of fields) {
    if (hasValue(item[field])) {
      score++;
    }
  }

  // Bonus for detailed contents
  if (type === 'expense' && item.Contents && item.Contents.length > 10) {
    score += 0.5;
  }

  // Bonus for recent creation. An unparsable timestamp yields NaN, which
  // fails both comparisons and therefore earns no bonus.
  const createdField = item['Created At'] || item.CreatedAt;
  if (createdField) {
    const millisPerDay = 1000 * 60 * 60 * 24;
    const daysOld = (Date.now() - new Date(createdField).getTime()) / millisPerDay;

    if (daysOld < 30) score += 0.3;
    else if (daysOld < 90) score += 0.15;
  }

  return totalFields > 0 ? score / totalFields : 0;
}
|
||||||
|
|
||||||
|
/**
 * Calculate confidence score for a duplicate group.
 *
 * Every unordered pair of items is checked against `rules` in order; the
 * weight of the first rule that matches the pair is recorded. Pairs that
 * match no rule are ignored. The result is the mean recorded weight.
 *
 * @param items - Members of one duplicate group.
 * @param rules - Rules to test pairs against, in priority order.
 * @returns The mean weight over matching pairs, or 0 when the group has
 *          fewer than two items or no pair matches any rule.
 */
function calculateGroupConfidence<T>(items: T[], rules: DuplicateRule<T>[]): number {
  if (items.length < 2) return 0;

  let weightSum = 0;
  let matchedPairs = 0;

  for (let first = 0; first < items.length - 1; first++) {
    for (let second = first + 1; second < items.length; second++) {
      const rule = rules.find(candidate => candidate.check(items[first], items[second]));
      if (!rule) continue;

      weightSum += rule.weight;
      matchedPairs++;
    }
  }

  return matchedPairs > 0 ? weightSum / matchedPairs : 0;
}
|
||||||
|
|
||||||
|
/**
 * Normalize email for comparison.
 *
 * Surrounding whitespace is removed and the address is lower-cased.
 * `null`/`undefined` normalize to an empty string and other non-string values
 * are stringified first, so untyped record fields cannot make this throw.
 *
 * @param email - Raw email value.
 * @returns The trimmed, lower-cased address.
 */
export function normalizeEmail(email: string): string {
  return String(email ?? '').trim().toLowerCase();
}
|
||||||
|
|
||||||
|
/**
 * Normalize phone number for comparison.
 *
 * Everything that is not a decimal digit is stripped. `null`/`undefined`
 * normalize to an empty string and other non-string values (e.g. a number
 * stored in the record) are stringified first, so this cannot throw.
 *
 * @param phone - Raw phone value.
 * @returns The digits of the input, in order.
 */
export function normalizePhone(phone: string): string {
  return String(phone ?? '').replace(/\D/g, '');
}
|
||||||
|
|
||||||
|
/**
 * Calculate string similarity using Levenshtein distance.
 *
 * Both inputs are lower-cased and trimmed before comparison. The result is
 * `1 - distance / max(length)`, so 1.0 means the normalized strings are equal
 * (including two empty strings) and 0 means every character differs.
 *
 * @param str1 - First string.
 * @param str2 - Second string.
 * @returns A similarity ratio between 0 and 1 inclusive.
 */
export function calculateStringSimilarity(str1: string, str2: string): number {
  const left = str1.toLowerCase().trim();
  const right = str2.toLowerCase().trim();

  if (left === right) return 1.0;

  // The strings differ here, so at least one is non-empty and the divisor
  // is always positive.
  const longest = Math.max(left.length, right.length);

  return 1 - levenshteinDistance(left, right) / longest;
}
|
||||||
|
|
||||||
|
/**
 * Calculate Levenshtein distance between two strings.
 *
 * Counts the minimum number of single-character insertions, deletions and
 * substitutions needed to turn one string into the other. Characters are
 * compared as UTF-16 code units. Only two rows of the edit table are kept,
 * sized by the shorter string.
 *
 * @param str1 - First string.
 * @param str2 - Second string.
 * @returns The edit distance (0 when the strings are equal).
 */
function levenshteinDistance(str1: string, str2: string): number {
  if (str1 === str2) return 0;
  if (str1.length === 0) return str2.length;
  if (str2.length === 0) return str1.length;

  // The distance is symmetric, so lay the shorter string along the row to
  // keep the two working rows as small as possible.
  const [shorter, longer] = str1.length <= str2.length ? [str1, str2] : [str2, str1];
  const width = shorter.length;

  let previous: number[] = Array.from({ length: width + 1 }, (_, index) => index);
  let current: number[] = new Array<number>(width + 1).fill(0);

  for (let row = 1; row <= longer.length; row++) {
    const rowChar = longer.charCodeAt(row - 1);
    current[0] = row;

    for (let col = 1; col <= width; col++) {
      const substitutionCost = shorter.charCodeAt(col - 1) === rowChar ? 0 : 1;
      current[col] = Math.min(
        current[col - 1] + 1, // deletion
        previous[col] + 1, // insertion
        previous[col - 1] + substitutionCost // substitution
      );
    }

    // Reuse the old row as the next scratch row.
    [previous, current] = [current, previous];
  }

  return previous[width];
}
|
||||||
|
|
||||||
|
/**
 * Create configuration for expense duplicate detection.
 *
 * Expenses are keyed by normalized payer name plus calendar day, and compared
 * with three rules in descending weight: identical establishment/price/time,
 * same payer/establishment/price on the same day, and same
 * establishment/price within five minutes of each other.
 *
 * NOTE(review): the key includes the calendar day, so two expenses a few
 * minutes apart on either side of midnight get different keys. If keys are
 * used to partition comparisons, the "Close time proximity" rule can never
 * see such a pair — confirm whether that is acceptable.
 *
 * @returns The expense detection configuration.
 */
export function createExpenseConfig(): DuplicateDetectionConfig<any> {
  // Date part of an ISO-like timestamp ("2024-01-31T10:00" -> "2024-01-31").
  // Non-string values yield undefined instead of throwing on `.split`.
  const dayOf = (time: unknown): string | undefined =>
    typeof time === 'string' ? time.split('T')[0] : undefined;

  const sameEstablishmentAndPrice = (exp1: any, exp2: any): boolean =>
    exp1['Establishment Name'] === exp2['Establishment Name'] &&
    exp1.Price === exp2.Price;

  const fiveMinutesMs = 5 * 60 * 1000;

  return {
    type: 'expense',

    // Group by normalized payer name and day for blocking
    getKey: (expense) => {
      const payer = expense.Payer ? normalizePersonName(expense.Payer) : 'unknown';
      const date = dayOf(expense.Time) || 'nodate';
      return `${payer}_${date}`;
    },

    getId: (expense) => expense.Id,

    rules: [
      {
        name: 'Exact match',
        weight: 1.0,
        check: (exp1, exp2) =>
          sameEstablishmentAndPrice(exp1, exp2) && exp1.Time === exp2.Time
      },
      {
        name: 'Same day, same details',
        weight: 0.95,
        check: (exp1, exp2) =>
          normalizePersonName(exp1.Payer || '') === normalizePersonName(exp2.Payer || '') &&
          sameEstablishmentAndPrice(exp1, exp2) &&
          dayOf(exp1.Time) === dayOf(exp2.Time)
      },
      {
        name: 'Close time proximity',
        weight: 0.9,
        check: (exp1, exp2) => {
          if (!exp1.Time || !exp2.Time) return false;

          // An unparsable timestamp gives NaN, which fails the comparison.
          const timeDiff = Math.abs(
            new Date(exp1.Time).getTime() - new Date(exp2.Time).getTime()
          );

          return timeDiff < fiveMinutesMs && sameEstablishmentAndPrice(exp1, exp2);
        }
      }
    ],

    maxGroupSize: 50,
    maxComparisons: 10000
  };
}
|
||||||
|
|
||||||
|
/**
 * Create configuration for interest duplicate detection.
 *
 * Interests are keyed by email domain when an email is present, otherwise by
 * the first four digits of the phone number, otherwise `'unknown'`. Pairs are
 * compared with three rules: same normalized email, same normalized phone
 * (at least 8 digits), and near-identical name with a similar address when
 * both addresses are present.
 *
 * NOTE(review): keys are coarse (a whole email domain). If keys partition the
 * comparisons and oversized partitions are skipped via `maxGroupSize`, a
 * popular domain with more than 50 records would not be checked at all, and a
 * person recorded once with an email and once with only a phone would never
 * be compared — confirm against the detection loop.
 *
 * @returns The interest detection configuration.
 */
export function createInterestConfig(): DuplicateDetectionConfig<any> {
  return {
    type: 'interest',

    // Group by normalized email domain or phone prefix for blocking
    getKey: (interest) => {
      if (interest['Email Address']) {
        const email = normalizeEmail(interest['Email Address']);
        const domain = email.split('@')[1] || 'unknown';
        return `email_${domain}`;
      }

      if (interest['Phone Number']) {
        const phone = normalizePhone(interest['Phone Number']);
        const prefix = phone.length >= 4 ? phone.substring(0, 4) : phone;
        return `phone_${prefix}`;
      }

      return 'unknown';
    },

    getId: (interest) => interest.Id,

    rules: [
      {
        name: 'Same email',
        weight: 1.0,
        check: (int1, int2) => {
          const email1 = int1['Email Address'];
          const email2 = int2['Email Address'];

          // Return a real boolean rather than leaking an operand of `&&`.
          if (!email1 || !email2) return false;
          return normalizeEmail(email1) === normalizeEmail(email2);
        }
      },
      {
        name: 'Same phone',
        weight: 1.0,
        check: (int1, int2) => {
          const phone1 = normalizePhone(int1['Phone Number'] || '');
          const phone2 = normalizePhone(int2['Phone Number'] || '');

          // Equality with an 8+ digit number implies both are non-empty.
          return phone1.length >= 8 && phone1 === phone2;
        }
      },
      {
        name: 'Similar name and address',
        weight: 0.8,
        check: (int1, int2) => {
          if (!int1['Full Name'] || !int2['Full Name']) return false;

          const nameSimilarity = calculateStringSimilarity(int1['Full Name'], int2['Full Name']);
          if (nameSimilarity <= 0.9) return false;

          // If names are very similar, check address too
          if (int1.Address && int2.Address) {
            return calculateStringSimilarity(int1.Address, int2.Address) > 0.8;
          }

          return true; // Similar names, no address to compare
        }
      }
    ],

    maxGroupSize: 50,
    maxComparisons: 10000
  };
}
|
||||||
Loading…
Reference in New Issue