// pn-new-crm/tests/unit/dedup/find-matches.test.ts

/**
 * Match-finding library - unit tests.
 *
 * Each duplicate cluster from the legacy NocoDB Interests audit (see
 * docs/superpowers/specs/2026-05-03-dedup-and-migration-design.md §1.2)
 * is encoded as a fixture here. The expected scoring tier (high / medium
 * / low) is the design contract - if the algorithm starts returning
 * "high" for a Pattern F case (Etiennette / Bruno+Bruce) it has lost
 * the false-positive guard and we'll know immediately.
 */
import { describe, expect, it } from 'vitest';

import { findClientMatches, type MatchCandidate } from '@/lib/dedup/find-matches';

/**
 * Score cut-offs passed to every `findClientMatches` call in this suite.
 * Sensible defaults for tests; they mirror the thresholds the dedup design
 * recommends.
 */
const THRESHOLDS = { highScore: 90, mediumScore: 50 };

/**
 * Fixture builder: expands a partial record into a full `MatchCandidate`.
 *
 * Only `id` is required. Missing (or nullish) scalar fields become `null`,
 * and missing (or nullish) list fields become a fresh empty array.
 *
 * @param partial - The fields the test cares about, plus a mandatory `id`.
 * @returns A candidate with every property populated.
 */
function candidate(partial: Partial<MatchCandidate> & { id: string }): MatchCandidate {
  const { id, fullName, surnameToken, emails, phonesE164, countryIso } = partial;

  return {
    id,
    fullName: fullName ?? null,
    surnameToken: surnameToken ?? null,
    emails: emails ?? [],
    phonesE164: phonesE164 ?? [],
    countryIso: countryIso ?? null,
  };
}

describe('findClientMatches', () => {
  describe('Pattern A - pure double-submit (high confidence)', () => {
    it('flags identical email + phone as high', () => {
      // Real-data cluster: Deepak Ramchandani #624/#625 - both rows carry
      // exactly the same name, email and phone. The factory builds a fresh
      // record (with fresh arrays) per id so the two sides share no state.
      const submission = (id: string) =>
        candidate({
          id,
          fullName: 'Deepak Ramchandani',
          surnameToken: 'ramchandani',
          emails: ['dannyrams8888@gmail.com'],
          phonesE164: ['+17215868888'],
        });

      const results = findClientMatches(submission('b'), [submission('a')], THRESHOLDS);

      expect(results).toHaveLength(1);

      const top = results[0]!;
      expect(top.candidate.id).toBe('a');
      expect(top.score).toBeGreaterThanOrEqual(THRESHOLDS.highScore);
      expect(top.confidence).toBe('high');
      expect(top.reasons).toEqual(expect.arrayContaining(['email match', 'phone match']));
    });
  });

  describe('Pattern B - same email, different phone format (high)', () => {
    it('high confidence when phones already normalize-equal', () => {
      // From real data: Howard Wiarda #236/#536, "574-274-0548" vs "+15742740548".
      // Both fixtures below carry the same E.164 value, i.e. this test covers
      // the comparison *after* phone normalization, not the normalization itself.
      const incoming = candidate({
        id: 'b',
        fullName: 'Howard Wiarda',
        surnameToken: 'wiarda',
        emails: ['hwiarda@hotmail.com'],
        phonesE164: ['+15742740548'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Howard Wiarda',
          surnameToken: 'wiarda',
          emails: ['hwiarda@hotmail.com'],
          phonesE164: ['+15742740548'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Assert the length before indexing so an empty result fails as a
      // readable length mismatch rather than a TypeError on `undefined`.
      expect(matches).toHaveLength(1);
      expect(matches[0]!.confidence).toBe('high');
      // Compare against the configured threshold instead of a magic number.
      expect(matches[0]!.score).toBeGreaterThanOrEqual(THRESHOLDS.highScore);
    });
  });

  describe('Pattern C - name capitalization variant (high)', () => {
    it('treats lowercase + uppercase as the same person when surname-token + email + phone all match', () => {
      // From real data: Nicolas Ruiz #681/#682/#683, email differs only by case.
      // NOTE(review): the two fixtures below are identical, so case-folding
      // itself is not exercised here - presumably it happens before the
      // candidates are built; confirm against the normalization tests.
      const incoming = candidate({
        id: 'b',
        fullName: 'Nicolas Ruiz',
        surnameToken: 'ruiz',
        emails: ['ruiz.nicolas@ufl.edu'],
        phonesE164: ['+17862006617'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Nicolas Ruiz',
          surnameToken: 'ruiz',
          emails: ['ruiz.nicolas@ufl.edu'],
          phonesE164: ['+17862006617'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Assert the length before indexing so an empty result fails as a
      // readable length mismatch rather than a TypeError on `undefined`.
      expect(matches).toHaveLength(1);
      expect(matches[0]!.confidence).toBe('high');
    });
  });

  describe('Pattern D - name shortening (high)', () => {
    it('Chris vs Christopher with same email + phone scores high', () => {
      // From real data: Chris Allen #700 vs Christopher Allen #534.
      // Given names differ ("Chris" / "Christopher"); surname token, email
      // and phone are the same on both sides.
      const incoming = candidate({
        id: 'b',
        fullName: 'Chris Allen',
        surnameToken: 'allen',
        emails: ['chris@thundercatsports.com'],
        phonesE164: ['+17814548950'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Christopher Allen',
          surnameToken: 'allen',
          emails: ['chris@thundercatsports.com'],
          phonesE164: ['+17814548950'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Assert the length before indexing so an empty result fails as a
      // readable length mismatch rather than a TypeError on `undefined`.
      expect(matches).toHaveLength(1);
      expect(matches[0]!.confidence).toBe('high');
    });
  });

  describe('Pattern E - typo on resubmit', () => {
    // Title corrected: the assertions below accept "medium" as well as
    // "high", so the test must not claim the pair "scores high".
    it('same email + nearly-identical phone (typo in last digits) scores at least medium', () => {
      // Christopher Camazou #649/#650 - phone differs in last 4 digits but
      // everything else matches. Exact phone equality fails; per the design,
      // email exact match alone (60) + name-token match (20) puts us in the
      // medium tier. The user can confirm the merge.
      const incoming = candidate({
        id: 'b',
        fullName: 'Christopher Camazou',
        surnameToken: 'camazou',
        emails: ['camazou11@gmail.com'],
        phonesE164: ['+33608334455'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Christopher Camazou',
          surnameToken: 'camazou',
          emails: ['camazou11@gmail.com'],
          phonesE164: ['+33608336549'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      expect(matches).toHaveLength(1);
      // Email + name match without phone match - strong but not certain,
      // so either of the two upper tiers is acceptable.
      expect(matches[0]!.confidence).toMatch(/^(high|medium)$/);
      expect(matches[0]!.score).toBeGreaterThanOrEqual(70);
    });

    it('Constanzo / Costanzo surname typo with same email + phone scores high', () => {
      // Gianfranco Di Constanzo #585 vs Di Costanzo #336 - same email + phone
      // and only a 1-letter surname typo. This is a strong "same client,
      // multiple yachts" signal - the design's signature win.
      const incoming = candidate({
        id: 'b',
        fullName: 'Gianfranco Di Constanzo',
        surnameToken: 'constanzo',
        emails: ['gdc@nauticall.com'],
        phonesE164: ['+17542628669'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Gianfranco Di Costanzo',
          surnameToken: 'costanzo',
          emails: ['gdc@nauticall.com'],
          phonesE164: ['+17542628669'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Assert the length before indexing so an empty result fails as a
      // readable length mismatch rather than a TypeError on `undefined`.
      expect(matches).toHaveLength(1);
      expect(matches[0]!.confidence).toBe('high');
      // Compare against the configured threshold instead of a magic number.
      expect(matches[0]!.score).toBeGreaterThanOrEqual(THRESHOLDS.highScore);
    });
  });

  describe('Pattern F - hard cases (must NOT auto-merge)', () => {
    it('same name with different country phone + different email scores at most medium', () => {
      // Etiennette Clamouze #188/#717 - same name but completely different
      // email + phone (and the phones are in different country codes,
      // suggesting either a relative, a coworker, or a name-collision).
      // We must NOT classify this as "high" or it would force-merge two
      // distinct people.
      const incoming = candidate({
        id: 'b',
        fullName: 'Etiennette Clamouze',
        surnameToken: 'clamouze',
        emails: ['etiennette@the-manoah.com'],
        phonesE164: ['+12645815607'],
        countryIso: 'AI',
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Etiennette Clamouze',
          surnameToken: 'clamouze',
          emails: ['clamouze.etiennette@gmail.com'],
          phonesE164: ['+33767780640'],
          countryIso: 'FR',
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Surname-token + name-exact match may land the pair in the review
      // queue, but nothing that is returned may reach the auto-merge tier.
      // An empty result is also acceptable, hence a loop with no length check.
      for (const match of matches) {
        expect(match.confidence).not.toBe('high');
        expect(match.score).toBeLessThan(THRESHOLDS.highScore);
      }
    });

    // Title corrected: the fixtures share neither email nor name, and the
    // assertion requires "low" (or no match at all), not "medium".
    it('unrelated name, email and phone surface as low confidence at most', () => {
      // Bruno Joyerot #18 vs Bruce Hearn #19 - Bruno's row shows email
      // belonging to "catherine elaine hearn" (Bruce's spouse). Same
      // household phone area code. Name overlap is partial. Don't merge.
      const incoming = candidate({
        id: 'b',
        fullName: 'Bruce Hearn',
        surnameToken: 'hearn',
        emails: ['bhearn1063@gmail.com'],
        phonesE164: ['+12642358840'],
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Bruno Joyerot',
          surnameToken: 'joyerot',
          emails: ['catherineelainehearn@gmail.com'],
          phonesE164: ['+12642352816'],
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Names don't match, emails don't match, phones differ - there's
      // no reason for this to surface at all. Either no match or low.
      for (const match of matches) {
        expect(match.confidence).toBe('low');
      }
    });
  });

  describe('Negative evidence - same email but different country phone', () => {
    it('reduces score when email matches but phone country differs', () => {
      // Constructed: same email, but one phone is +33 (FR) and the other
      // is +1 (US). Likely a shared-inbox spouse situation. We want
      // medium tier so it lands in review, not high tier.
      const incoming = candidate({
        id: 'b',
        fullName: 'Test User',
        surnameToken: 'user',
        emails: ['shared@example.com'],
        phonesE164: ['+15551234567'],
        countryIso: 'US',
      });
      const pool = [
        candidate({
          id: 'a',
          fullName: 'Test User',
          surnameToken: 'user',
          emails: ['shared@example.com'],
          phonesE164: ['+33611111111'],
          countryIso: 'FR',
        }),
      ];

      const matches = findClientMatches(incoming, pool, THRESHOLDS);

      // Assert the length before indexing so an empty result fails as a
      // readable length mismatch rather than a TypeError on `undefined`.
      expect(matches).toHaveLength(1);
      // Per the design: email match alone would be 60 + name token match
      // 20 = 80 (medium). Negative evidence (different phone country)
      // brings it down further, so the pair must stay in the medium tier.
      expect(matches[0]!.confidence).toBe('medium');
    });
  });

  describe('Blocking - only relevant candidates are scored', () => {
    it('does not score candidates with no shared emails / phones / surname token', () => {
      // The incoming record overlaps with neither pool entry on any of the
      // three keys named in the title (email, phone, surname token).
      const incoming = candidate({
        id: 'newbie',
        fullName: 'Alice Smith',
        surnameToken: 'smith',
        emails: ['alice@example.com'],
        phonesE164: ['+15551234567'],
      });
      const bob = candidate({
        id: 'unrelated1',
        fullName: 'Bob Jones',
        surnameToken: 'jones',
        emails: ['bob@example.org'],
        phonesE164: ['+33611111111'],
      });
      const carol = candidate({
        id: 'unrelated2',
        fullName: 'Carol White',
        surnameToken: 'white',
        emails: ['carol@example.net'],
        phonesE164: ['+447700900111'],
      });

      const results = findClientMatches(incoming, [bob, carol], THRESHOLDS);

      expect(results).toHaveLength(0);
    });
  });

  describe('Empty pool', () => {
    it('returns no matches when the pool is empty', () => {
      // With nothing to compare against, the result must be an empty list.
      const loneRecord = candidate({
        id: 'a',
        fullName: 'Alice',
        emails: ['alice@example.com'],
      });

      const results = findClientMatches(loneRecord, [], THRESHOLDS);

      expect(results).toEqual([]);
    });
  });

  describe('Sort order', () => {
    it('returns matches sorted by score descending', () => {
      const incoming = candidate({
        id: 'incoming',
        fullName: 'John Smith',
        surnameToken: 'smith',
        emails: ['john@example.com'],
        phonesE164: ['+15551234567'],
      });
      // Expected to rank first: shares both email and phone with `incoming`.
      const strongDuplicate = candidate({
        id: 'high-match',
        fullName: 'John Smith',
        surnameToken: 'smith',
        emails: ['john@example.com'],
        phonesE164: ['+15551234567'],
      });
      // Expected to rank second: shares only the email.
      const emailOnlyDuplicate = candidate({
        id: 'medium-match',
        fullName: 'Different Person',
        surnameToken: 'person',
        emails: ['john@example.com'],
        phonesE164: ['+33611111111'],
      });

      const ranked = findClientMatches(
        incoming,
        [strongDuplicate, emailOnlyDuplicate],
        THRESHOLDS,
      );

      expect(ranked.length).toBeGreaterThanOrEqual(2);

      const [best, runnerUp] = ranked;
      expect(best!.candidate.id).toBe('high-match');
      expect(best!.score).toBeGreaterThan(runnerUp!.score);
    });
  });
});