/**
 * Expense duplicate detection — heuristic match on
 * (port + vendor + amount + date ± 3d). PR1 ships the function shape;
 * PR8 wires the BullMQ trigger and the merge service.
 */

import { and, between, eq, ne, sql } from 'drizzle-orm';

import { db } from '@/lib/db';
import { expenses } from '@/lib/db/schema/financial';

/** Half-width, in days, of the date window searched around the target expense. */
const DEDUP_WINDOW_DAYS = 3;

export interface DedupCandidate {
  /** Existing expense that the new one likely duplicates. */
  candidateId: string;
  /** 0..1 confidence; 1.0 = exact vendor + amount + same day. */
  confidence: number;
}

/**
 * Find existing expenses that look like duplicates of `expenseId`.
 *
 * A candidate must be in the same port, have the same establishment name
 * (case-insensitive), the same amount, an expense date within
 * ±`DEDUP_WINDOW_DAYS` of the target's, and must not be archived.
 *
 * @param expenseId - Id of the expense to check.
 * @returns Up to 5 candidates. Empty when the expense does not exist or is
 *   missing its establishment name, amount or date.
 */
export async function scanForDuplicates(expenseId: string): Promise<DedupCandidate[]> {
  const target = await db.query.expenses.findFirst({ where: eq(expenses.id, expenseId) });
  if (!target) return [];

  const { portId, establishmentName, amount, expenseDate } = target;
  // Without all three match keys the heuristic has nothing to compare on.
  if (!establishmentName || !amount || !expenseDate) return [];

  const lo = new Date(expenseDate);
  lo.setDate(lo.getDate() - DEDUP_WINDOW_DAYS);
  const hi = new Date(expenseDate);
  hi.setDate(hi.getDate() + DEDUP_WINDOW_DAYS);

  const matches = await db.query.expenses.findMany({
    where: and(
      eq(expenses.portId, portId),
      sql`lower(${expenses.establishmentName}) = lower(${establishmentName})`,
      eq(expenses.amount, amount),
      between(expenses.expenseDate, lo, hi),
      ne(expenses.id, expenseId),
      // mergeDuplicate archives the source row without touching its
      // vendor/amount/date, so without this filter a merged-away row would
      // be reported as a duplicate again on the next scan.
      sql`${expenses.archivedAt} is null`,
    ),
    limit: 5,
  });

  return matches.map((m) => ({
    candidateId: m.id,
    confidence: dayDiff(m.expenseDate, expenseDate) === 0 ? 1.0 : 0.85,
  }));
}

/** Absolute distance between two instants, rounded to the nearest whole day. */
function dayDiff(a: Date, b: Date): number {
  const ms = Math.abs(a.getTime() - b.getTime());
  return Math.round(ms / 86_400_000);
}

/**
 * Mark an expense as a duplicate of the candidate with the highest score.
 *
 * `dedupScannedAt` is stamped whether or not a candidate is found. When
 * several candidates share the top confidence, the first one returned by
 * the scan wins.
 *
 * @param expenseId - Id of the expense to scan and flag.
 * @returns The id written to `duplicateOf`, or `null` when the scan found
 *   no candidates.
 */
export async function markBestDuplicate(expenseId: string): Promise<string | null> {
  const candidates = await scanForDuplicates(expenseId);

  if (candidates.length === 0) {
    await db
      .update(expenses)
      .set({ dedupScannedAt: sql`now()` })
      .where(eq(expenses.id, expenseId));
    return null;
  }

  const best = candidates.reduce((a, b) => (a.confidence >= b.confidence ? a : b));

  await db
    .update(expenses)
    .set({ duplicateOf: best.candidateId, dedupScannedAt: sql`now()` })
    .where(eq(expenses.id, expenseId));

  return best.candidateId;
}

/**
 * Clear the duplicate flag — operator confirmed this is a real expense.
 * Leaves `dedupScannedAt` populated so the engine doesn't re-flag it.
 *
 * @param expenseId - Id of the expense to un-flag.
 * @param portId - Port the expense must belong to; a row in another port is
 *   not matched by the update.
 */
export async function clearDuplicate(expenseId: string, portId: string): Promise<void> {
  await db
    .update(expenses)
    .set({ duplicateOf: null, dedupScannedAt: sql`now()` })
    .where(and(eq(expenses.id, expenseId), eq(expenses.portId, portId)));
}

/**
 * Merge `sourceId` into `targetId`: combine receipt files, archive the
 * source, and clear the source's duplicate-of pointer. Both rows must belong
 * to the same port; runs inside a single transaction so a partial failure
 * leaves both rows untouched.
 *
 * @param sourceId - Expense to archive after its receipts are copied over.
 * @param targetId - Expense that receives the combined receipt list.
 * @param portId - Port both expenses must belong to.
 * @throws Error when `sourceId === targetId`, or when either expense is not
 *   found in `portId`.
 */
export async function mergeDuplicate(
  sourceId: string,
  targetId: string,
  portId: string,
): Promise<void> {
  if (sourceId === targetId) {
    throw new Error('Cannot merge an expense into itself');
  }

  await db.transaction(async (tx) => {
    const [source] = await tx
      .select()
      .from(expenses)
      .where(and(eq(expenses.id, sourceId), eq(expenses.portId, portId)));
    const [target] = await tx
      .select()
      .from(expenses)
      .where(and(eq(expenses.id, targetId), eq(expenses.portId, portId)));

    if (!source || !target) {
      throw new Error('Source or target expense not found in this port');
    }

    // Target's receipts come first; the Set drops ids present on both rows.
    const mergedReceipts = Array.from(
      new Set([...(target.receiptFileIds ?? []), ...(source.receiptFileIds ?? [])]),
    );

    // Writes carry the same port predicate as the reads above, matching
    // clearDuplicate, so every statement in this module is port-scoped.
    await tx
      .update(expenses)
      .set({ receiptFileIds: mergedReceipts })
      .where(and(eq(expenses.id, targetId), eq(expenses.portId, portId)));

    // Archive the source — preserves audit history, keeps any FKs alive.
    await tx
      .update(expenses)
      .set({ archivedAt: sql`now()`, duplicateOf: null })
      .where(and(eq(expenses.id, sourceId), eq(expenses.portId, portId)));
  });
}