pn-new-crm/src/lib/services/expense-dedup.service.ts
feat(insights): Phase B schema + service skeletons

PR1 of Phase B per docs/superpowers/specs/2026-04-28-phase-b-insights-alerts-design.md.
Lays the foundation that PRs 2-10 will fill in with behaviour.

Schema (migration 0014):
- alerts table with rule-engine fields (rule_id, severity, link, entity_type/id,
  fingerprint, fired/dismissed/acknowledged/resolved timestamps, jsonb metadata).
  Partial-unique fingerprint index keeps one open row per (port, rule, entity);
  separate indexes power severity-filtered and time-ordered queries.
- analytics_snapshots: (port_id, metric_id) -> jsonb cache + computedAt for the
  15-min recurring refresh.
- expenses: duplicate_of self-FK, dedup_scanned_at, ocr_status/raw/confidence;
  partial index on (port, vendor, amount, date) where duplicate_of IS NULL
  drives the dedup heuristic.
- audit_logs.search_text: GENERATED ALWAYS tsvector over
  action + entity_type + entity_id + user_id, GIN-indexed (drizzle can't model
  GENERATED ALWAYS in TS yet, so the migration appends a manual ALTER + the
  GIN index).

Service skeletons in src/lib/services/:
- alerts.service.ts: fingerprintFor, reconcileAlertsForPort (upsert +
  auto-resolve), dismiss, acknowledge, listAlertsForPort.
- alert-rules.ts: RULE_REGISTRY of 10 rule evaluators (currently no-op); PR2
  fills in the bodies.
- analytics.service.ts: readSnapshot/writeSnapshot with a 15-min TTL + no-op
  compute* stubs for the four chart series; PR3 fills in the behaviour.
- expense-dedup.service.ts: scanForDuplicates + markBestDuplicate using the
  partial dedup index. PR8 wires the BullMQ trigger.
- expense-ocr.service.ts: OcrResult/OcrLineItem types + ocrReceipt stub. PR9
  wires Claude Vision (Haiku 4.5 + ephemeral system-prompt cache).
- audit-search.service.ts: tsvector @@ plainto_tsquery + cursor pagination on
  (createdAt, id). PR10 wires the admin UI.

tsc clean, lint clean, vitest 675/675 (one unrelated AES random-output flake
passes solo).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 14:43:01 +02:00
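As a sketch of what migration 0014 adds around this service, the partial dedup index could be modeled in drizzle pg-core roughly as below. Column and index names here are assumptions for illustration, not the repo's actual schema file, and the generated tsvector is shown only as the kind of raw SQL the migration appends.

import { sql } from 'drizzle-orm';
import { date, index, numeric, pgTable, text, timestamp, uuid } from 'drizzle-orm/pg-core';

// Hypothetical sketch of the expenses columns this service touches; the real
// definitions live in src/lib/db/schema/financial.ts and differ in detail.
export const expensesSketch = pgTable(
  'expenses',
  {
    id: uuid('id').primaryKey().defaultRandom(),
    portId: uuid('port_id').notNull(),
    establishmentName: text('establishment_name'),
    amount: numeric('amount'),
    expenseDate: date('expense_date', { mode: 'date' }),
    duplicateOf: uuid('duplicate_of'), // self-FK to expenses.id in the migration
    dedupScannedAt: timestamp('dedup_scanned_at'),
  },
  (t) => ({
    // Partial index behind the (port, vendor, amount, date) dedup lookup;
    // WHERE duplicate_of IS NULL keeps already-merged rows out of every scan.
    dedupIdx: index('expenses_dedup_idx')
      .on(t.portId, t.establishmentName, t.amount, t.expenseDate)
      .where(sql`${t.duplicateOf} IS NULL`),
  }),
);

// audit_logs.search_text can't be expressed in TS, so the migration appends
// raw SQL along these lines (a sketch, not the shipped migration):
//   ALTER TABLE audit_logs ADD COLUMN search_text tsvector GENERATED ALWAYS AS
//     (to_tsvector('english', coalesce(action, '') || ' ' ||
//      coalesce(entity_type, '') || ' ' || coalesce(entity_id::text, '') ||
//      ' ' || coalesce(user_id::text, ''))) STORED;
//   CREATE INDEX audit_logs_search_text_idx ON audit_logs USING gin (search_text);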
/**
 * Expense duplicate detection: heuristic match on
 * (port + vendor + amount + date ± 3d). PR1 ships the function shape;
 * PR8 wires the BullMQ trigger and the merge service.
 */
import { and, between, eq, ne, sql } from 'drizzle-orm';
import { db } from '@/lib/db';
import { expenses } from '@/lib/db/schema/financial';

const DEDUP_WINDOW_DAYS = 3;

export interface DedupCandidate {
  /** Existing expense that the new one likely duplicates. */
  candidateId: string;
  /** 0..1 confidence; 1.0 = exact vendor + amount + same day. */
  confidence: number;
}

export async function scanForDuplicates(expenseId: string): Promise<DedupCandidate[]> {
  const target = await db.query.expenses.findFirst({ where: eq(expenses.id, expenseId) });
  if (!target) return [];

  const { portId, establishmentName, amount, expenseDate } = target;
  if (!establishmentName || !amount || !expenseDate) return [];

  const lo = new Date(expenseDate);
  lo.setDate(lo.getDate() - DEDUP_WINDOW_DAYS);
  const hi = new Date(expenseDate);
  hi.setDate(hi.getDate() + DEDUP_WINDOW_DAYS);

  const matches = await db.query.expenses.findMany({
    where: and(
      eq(expenses.portId, portId),
      sql`lower(${expenses.establishmentName}) = lower(${establishmentName})`,
      eq(expenses.amount, amount),
      between(expenses.expenseDate, lo, hi),
      ne(expenses.id, expenseId),
    ),
    limit: 5,
  });

  return matches.map((m) => ({
    candidateId: m.id,
    // Same-day exact match scores 1.0; anything else in the ±3d window 0.85.
    confidence: dayDiff(m.expenseDate, expenseDate) === 0 ? 1.0 : 0.85,
  }));
}

function dayDiff(a: Date, b: Date): number {
  const ms = Math.abs(a.getTime() - b.getTime());
  return Math.round(ms / 86_400_000);
}

/** Mark an expense as a duplicate of the candidate with the highest score. */
export async function markBestDuplicate(expenseId: string): Promise<string | null> {
  const candidates = await scanForDuplicates(expenseId);
  if (candidates.length === 0) {
    await db
      .update(expenses)
      .set({ dedupScannedAt: sql`now()` })
      .where(eq(expenses.id, expenseId));
    return null;
  }

  const best = candidates.reduce((a, b) => (a.confidence >= b.confidence ? a : b));
  await db
    .update(expenses)
    .set({ duplicateOf: best.candidateId, dedupScannedAt: sql`now()` })
    .where(eq(expenses.id, expenseId));
  return best.candidateId;
}
feat(phase-b): ship analytics dashboard, alerts, scanner PWA, dedup, audit view

Phase B (Insights & Alerts) PR4-11 in one drop. Builds on the schema + service
skeletons committed in PRs 1-3.

- PR4 Analytics dashboard — 4 chart types (funnel/timeline/breakdown/source),
  date-range picker (today/7d/30d/90d), CSV + PNG export per card.
- PR5 Alert rail UI + /alerts page — topbar bell w/ live count, dashboard
  right-rail, three-tab page (active/dismissed/resolved), socket-driven
  invalidation. Bell lazy-loads its list on popover open to keep cold pages
  fast on non-dashboard routes.
- PR6 EOI queue tab on the documents hub — filters to in-flight EOIs; the
  count surfaces in the tab label.
- PR7 Interests-by-berth tab on berth detail — replaces the stub.
- PR8 Expense duplicate detection — BullMQ job runs the scan on create, yellow
  banner on detail w/ Merge / Not-a-duplicate actions, transactional merge
  consolidates receipts and archives the source.
- PR9 Receipt scanner PWA + multi-provider AI — port-scoped /scan route in its
  own (scanner) group with no dashboard chrome, dynamic per-port manifest,
  OpenAI + Claude provider abstraction, admin OCR settings page (port-level +
  super-admin global default w/ opt-in fallback), test-connection endpoint,
  manual-entry fallback when no key is configured. Verify form is always shown
  before save — no ghost rows.
- PR10 Audit log read view — swaps to tsvector full-text search on the
  existing GIN index, cursor pagination, filters for entity/action/user/date
  range, batched actor-email resolution.
- PR11 Real-API tests — opt-in receipt-ocr.spec (admin save + test, optional
  real-receipt parse via REALAPI_RECEIPT_FIXTURE) and an alert-engine
  socket-fanout spec gated behind RUN_ALERT_ENGINE_REALAPI. Both skip cleanly
  without their gate envs so CI stays green.

Test totals: vitest 690 -> 713, smoke 130 -> 138, realapi +2 opt-in.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 17:21:55 +02:00
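For PR10's audit view, a minimal sketch of the tsvector + keyset query the commit describes, assuming a hypothetical auditLogs table whose searchText column maps the generated search_text (import path, column, and function names are assumptions):

import { and, desc, eq, lt, or, sql } from 'drizzle-orm';
import { db } from '@/lib/db';
import { auditLogs } from '@/lib/db/schema/audit';

// Cursor = the (createdAt, id) pair of the last row on the previous page;
// the predicate selects rows strictly earlier in (createdAt DESC, id DESC).
export async function searchAuditLogs(
  query: string,
  cursor?: { createdAt: Date; id: string },
  pageSize = 50,
) {
  return db
    .select()
    .from(auditLogs)
    .where(
      and(
        // plainto_tsquery over the GIN-indexed generated column.
        sql`${auditLogs.searchText} @@ plainto_tsquery('english', ${query})`,
        cursor
          ? or(
              lt(auditLogs.createdAt, cursor.createdAt),
              and(eq(auditLogs.createdAt, cursor.createdAt), lt(auditLogs.id, cursor.id)),
            )
          : undefined,
      ),
    )
    .orderBy(desc(auditLogs.createdAt), desc(auditLogs.id))
    .limit(pageSize);
}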

/**
 * Clear the duplicate flag: the operator confirmed this is a real expense.
 * Leaves `dedupScannedAt` populated so the engine doesn't re-flag it.
 */
export async function clearDuplicate(expenseId: string, portId: string): Promise<void> {
  await db
    .update(expenses)
    .set({ duplicateOf: null, dedupScannedAt: sql`now()` })
    .where(and(eq(expenses.id, expenseId), eq(expenses.portId, portId)));
}

/**
 * Merge `sourceId` into `targetId`: combine receipt files, archive the
 * source, and clear the duplicate-of pointer. Both rows must belong to
 * the same port; runs inside a single transaction so a partial failure
 * leaves both rows untouched.
 */
export async function mergeDuplicate(
  sourceId: string,
  targetId: string,
  portId: string,
): Promise<void> {
  if (sourceId === targetId) {
    throw new Error('Cannot merge an expense into itself');
  }

  await db.transaction(async (tx) => {
    const [source] = await tx
      .select()
      .from(expenses)
      .where(and(eq(expenses.id, sourceId), eq(expenses.portId, portId)));
    const [target] = await tx
      .select()
      .from(expenses)
      .where(and(eq(expenses.id, targetId), eq(expenses.portId, portId)));
    if (!source || !target) {
      throw new Error('Source or target expense not found in this port');
    }

    // Union of receipt file IDs, target's files first, duplicates dropped.
    const mergedReceipts = Array.from(
      new Set([...(target.receiptFileIds ?? []), ...(source.receiptFileIds ?? [])]),
    );
    await tx
      .update(expenses)
      .set({ receiptFileIds: mergedReceipts })
      .where(eq(expenses.id, targetId));

    // Archive the source — preserves audit history, keeps any FKs alive.
    await tx
      .update(expenses)
      .set({ archivedAt: sql`now()`, duplicateOf: null })
      .where(eq(expenses.id, sourceId));
  });
}
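
How the PR8 banner's Merge / Not-a-duplicate actions map onto these two functions, as a hypothetical handler (the name resolveDuplicateFlag and its shape are illustrative; the real route lives elsewhere):

import { clearDuplicate, mergeDuplicate } from '@/lib/services/expense-dedup.service';

// 'merge' folds the flagged row into its duplicateOf target;
// 'not-a-duplicate' clears the flag and stamps dedupScannedAt so the
// scanner won't re-flag the pair.
export async function resolveDuplicateFlag(
  action: 'merge' | 'not-a-duplicate',
  expenseId: string,
  duplicateOf: string | null,
  portId: string,
): Promise<void> {
  if (action === 'merge' && duplicateOf) {
    await mergeDuplicate(expenseId, duplicateOf, portId);
  } else {
    await clearDuplicate(expenseId, portId);
  }
}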