// pn-new-crm/src/lib/services/document-field-detector.ts

/**
 * Phase 4c — Auto-detect anchor scanner.
 *
 * Scans a PDF for common signing-block keywords ("Signature:", "Date:",
 * "Initials", a long run of underscores, etc.) and proposes Documenso
 * field placements positioned right after the matched anchor. Output
 * is in PERCENT coordinates so it lines up with the existing
 * `DocumensoFieldPlacement` shape consumed by the Phase 3 service.
 *
 * Confidence calculation is conservative: an explicit keyword match
 * scores higher than a generic underscore-run; the field-type-specific
 * regexes are tried in priority order so a `"Date of Signature:"`
 * anchor doesn't double-place as both DATE and SIGNATURE.
 *
 * This is intentionally pdf-content driven (text-extraction based) —
 * the alternative (image-of-PDF + OCR) is the bigger berth-PDF parser
 * tier-3 path; we keep this lightweight so it runs in <500ms on a
 * 10-page contract.
 */

import type { DocumensoFieldType } from '@/lib/services/documenso-client';

/**
 * One proposed field placement. The detector emits at most one entry per
 * PDF text item — the first anchor pattern that matches the item wins.
 */
export interface DetectedField {
  /** Field type taken from the anchor pattern that matched. */
  type: DocumensoFieldType;
  /** 1-indexed page number. */
  pageNumber: number;
  /**
   * Box geometry, all four values expressed as a percentage of the page
   * dimensions. `pageX`/`pageY` are derived from the anchor text item's
   * position in PDF user space, so `pageY` is measured from the text
   * baseline relative to the PDF origin rather than from the top of the
   * page. The detector drops placements whose `pageX` or `pageY` falls
   * outside 0–95.
   */
  pageX: number;
  pageY: number;
  pageWidth: number;
  pageHeight: number;
  /**
   * 0..1 — how sure the scanner is. Computed as a 0.5 base plus the
   * matching pattern's `confidenceBoost`.
   */
  confidence: number;
  /**
   * Trimmed text of the whole text item that triggered the detection
   * (display + debug). This is the full item string, not only the
   * portion the pattern matched.
   */
  anchorText: string;
  /**
   * Recipient label inferred from text near the anchor: one of the
   * `label` values declared in `RECIPIENT_LABELS` ("Notary", "Witness",
   * "Developer", "Approver", "Buyer", "Seller", "Client"), or `null`
   * when no nearby text mentions a known label. Phase 4d maps these to
   * recipients by role/name.
   */
  inferredRecipientLabel?: string | null;
}

/**
 * Shape of one row in the anchor → field-type pattern table
 * (`ANCHOR_PATTERNS`). Row order matters: the detector stops at the
 * first row whose `match` succeeds for a text item, so earlier rows win
 * when two anchors overlap on the same item (e.g. "Date of Signature"
 * matches both DATE and SIGNATURE — DATE goes first because it's the
 * more specific pattern).
 */
interface AnchorPattern {
  /** Field type proposed when this row matches. */
  type: DocumensoFieldType;
  /**
   * Tested against the LOWER-CASED text of a PDF text item, so a
   * pattern must either be case-insensitive or written in lower case —
   * an upper-case-only literal can never match.
   */
  match: RegExp;
  /** Suggested field box in PDF points (72 dpi). Converted to percent
   *  per-page after extraction. */
  widthPt: number;
  heightPt: number;
  /** Bias added to the 0.5 base confidence. Specific keywords get a
   *  bump over the generic underscore catch-all. */
  confidenceBoost: number;
}

/**
 * Anchor → field-type table, scanned top to bottom; the first row whose
 * `match` succeeds for a text item decides the field type.
 *
 * Every pattern is case-insensitive because the detector tests them
 * against lower-cased text, and every keyword pattern is anchored on a
 * word boundary so keywords embedded in longer words ("update",
 * "candidate", "fax") are not mistaken for signing-block labels.
 */
const ANCHOR_PATTERNS: AnchorPattern[] = [
  // DATE — more specific than SIGNATURE for the common "Date of
  // Signature:" case, so listed first. The leading boundary keeps
  // "Update:" / "candidate " / "mandated " from matching.
  {
    type: 'DATE',
    match: /(?:^|\b)(?:dated|date(?:\s+of\s+signature)?)[:\s_-]+/i,
    widthPt: 80,
    heightPt: 20,
    confidenceBoost: 0.2,
  },
  // INITIALS — pre-empts NAME because "Initial:" is short and unique.
  {
    type: 'INITIALS',
    match: /(?:^|\b)(?:initials?)[:\s_-]+/i,
    widthPt: 50,
    heightPt: 30,
    confidenceBoost: 0.2,
  },
  // EMAIL — explicit email anchor.
  {
    type: 'EMAIL',
    match: /(?:^|\b)e-?mail[:\s_-]+/i,
    widthPt: 200,
    heightPt: 20,
    confidenceBoost: 0.2,
  },
  // NAME — printed/full name labels.
  {
    type: 'NAME',
    match: /(?:^|\b)(?:printed\s*)?(?:full\s+)?name[:\s_-]+/i,
    widthPt: 150,
    heightPt: 20,
    confidenceBoost: 0.15,
  },
  // SIGNATURE — broadest of the signing-block patterns.
  {
    type: 'SIGNATURE',
    match: /(?:^|\b)(?:signature|sign\s*here|signed\s*by|signed\s*at)[:\s_-]+/i,
    widthPt: 150,
    heightPt: 30,
    confidenceBoost: 0.2,
  },
  // SIGNATURE — explicit "X" mark followed by a blank line. Must be
  // case-insensitive: the input is lower-cased before testing, so a
  // literal upper-case "X" would never match. The boundary stops words
  // that merely end in "x" (e.g. "Fax ______") from counting as a mark.
  {
    type: 'SIGNATURE',
    match: /(?:^|\b)x\s*_{4,}/i,
    widthPt: 150,
    heightPt: 30,
    confidenceBoost: 0.15,
  },
  // Catch-all: a run of underscores not preceded by a more specific
  // keyword (which would have matched above). Defaults to TEXT.
  {
    type: 'TEXT',
    match: /_{8,}/,
    widthPt: 200,
    heightPt: 20,
    confidenceBoost: 0,
  },
];

/**
 * Recipient labels we know how to match against. Lookup is first-match,
 * so the table is kept in priority order: "Buyer Notary" wins NOTARY
 * (more specific than BUYER on a notary-block tail).
 *
 * Each alias is lower-cased and belongs to exactly ONE label. An alias
 * repeated under an earlier entry would shadow the later entry and make
 * its label unreachable (previously "witness" always resolved to Notary
 * and "seller"/"vendor" always resolved to Developer).
 */
const RECIPIENT_LABELS: Array<{ label: string; aliases: string[] }> = [
  { label: 'Notary', aliases: ['notary'] },
  { label: 'Witness', aliases: ['witness'] },
  { label: 'Developer', aliases: ['developer'] },
  { label: 'Approver', aliases: ['approver', 'manager'] },
  { label: 'Buyer', aliases: ['buyer', 'purchaser'] },
  { label: 'Seller', aliases: ['seller', 'vendor'] },
  { label: 'Client', aliases: ['client', 'customer'] },
];

/**
 * A single text item returned by pdfjs-dist, reduced to the members the
 * detector reads. The transform array encodes the position + scale of
 * the text via PDF's affine matrix:
 * `[scaleX, skewY, skewX, scaleY, translateX, translateY]`. We use
 * `(translateX, translateY)` as the anchor's lower-left corner.
 *
 * Only `str` and the array-ness of `transform` are verified when items
 * are extracted; consumers re-check the transform length and that the
 * coordinates are finite before using them.
 */
interface PdfTextItem {
  /** Text content of the item. */
  str: string;
  /** PDF affine [a, b, c, d, e, f]. (e, f) is position. */
  transform: number[];
  /** Item width in PDF user-space units. When absent the detector
   *  estimates it from the character count. */
  width?: number;
  /** Item height — usually equals scaleY. */
  height?: number;
}

/**
 * Text content and dimensions of one PDF page, as produced by
 * `extractPdfPages`.
 */
interface PdfPageView {
  /** 1-indexed page number. */
  pageNumber: number;
  /** Page width taken from the pdfjs viewport at scale 1. */
  widthPt: number;
  /** Page height taken from the pdfjs viewport at scale 1. */
  heightPt: number;
  /** Text items on the page that passed the `isPdfTextItem` check. */
  items: PdfTextItem[];
}

/**
 * Detect signing-block fields in a PDF. Each detection points at the
 * position immediately after the matched anchor's text item and is
 * offset 5pt to the right so the placeholder doesn't visually overlap
 * the keyword.
 *
 * At most one field is proposed per text item: the first entry of
 * `ANCHOR_PATTERNS` that matches the item's lower-cased text wins.
 *
 * @param pdfBuffer Raw PDF bytes.
 * @returns Proposed placements in percent-of-page coordinates. Empty
 *   when the PDF has no extractable text (image-only scans) or cannot
 *   be parsed; the caller should fall back to drag-place-manual in that
 *   case. Every returned coordinate is a finite number.
 */
export async function detectFields(pdfBuffer: Buffer): Promise<DetectedField[]> {
  const pages = await extractPdfPages(pdfBuffer);
  const detected: DetectedField[] = [];

  const isUsableDimension = (value: number): boolean => Number.isFinite(value) && value > 0;

  for (const page of pages) {
    // Percent conversion divides by the page size. A zero, negative or
    // non-finite dimension would yield NaN/Infinity placements (and NaN
    // slips through `<` / `>` range checks), so skip such pages.
    if (!isUsableDimension(page.widthPt) || !isUsableDimension(page.heightPt)) continue;

    for (const item of page.items) {
      // Skip if the item has no positional data — defensive against
      // exotic PDF encodings.
      if (!Array.isArray(item.transform) || item.transform.length < 6) continue;
      const translateX = Number(item.transform[4]);
      const translateY = Number(item.transform[5]);
      if (!Number.isFinite(translateX) || !Number.isFinite(translateY)) continue;

      // First matching pattern wins for this item — earlier
      // (more-specific) patterns shadow later ones.
      const lower = item.str.toLowerCase();
      const pattern = ANCHOR_PATTERNS.find((candidate) => candidate.match.test(lower));
      if (!pattern) continue;

      // Place the field immediately after the anchor with a 5pt
      // horizontal offset. The anchor's width is approximate; pdfjs
      // sometimes gives a too-small width for short tokens so we floor
      // at 30pt to avoid the field landing on top of the text. A missing
      // or non-finite width falls back to a ~5pt-per-character estimate.
      const measuredWidth = item.width;
      const rawWidthPt =
        typeof measuredWidth === 'number' && Number.isFinite(measuredWidth)
          ? measuredWidth
          : lower.length * 5;
      const anchorWidthPt = Math.max(30, rawWidthPt);
      const fieldXPt = translateX + anchorWidthPt + 5;
      // PDF user-space origin is the lower-left; transform[5] is the
      // baseline of the text so the field's lower-left also lives
      // there. CSS/web origin is top-left — we keep the percent in
      // PDF coordinates here because Documenso accepts both (the
      // existing placeFields helper handles the conversion).
      const fieldYPt = translateY;

      const pageX = (fieldXPt / page.widthPt) * 100;
      const pageY = (fieldYPt / page.heightPt) * 100;
      const pageWidth = (pattern.widthPt / page.widthPt) * 100;
      const pageHeight = (pattern.heightPt / page.heightPt) * 100;

      // Never emit NaN/Infinity geometry, whatever its source.
      if (![pageX, pageY, pageWidth, pageHeight].every((value) => Number.isFinite(value))) {
        continue;
      }
      // Hard-skip fields that would land off-page (defensive — a
      // misparsed transform can blow up the coordinate space).
      if (pageX < 0 || pageX > 95 || pageY < 0 || pageY > 95) continue;
      if (pageWidth <= 0 || pageHeight <= 0) continue;

      detected.push({
        type: pattern.type,
        pageNumber: page.pageNumber,
        pageX,
        pageY,
        pageWidth,
        pageHeight,
        confidence: 0.5 + pattern.confidenceBoost,
        anchorText: item.str.trim(),
        inferredRecipientLabel: inferRecipient(page.items, item, translateX, translateY),
      });
    }
  }
  return detected;
}

/**
 * Look at the page's other text items within ±100pt (on both axes) of
 * the anchor and return the recipient label of the NEAREST one that
 * mentions a known recipient keyword. Used to seed the recipient
 * assignment side-panel; the rep can override.
 *
 * Choosing the nearest labelled item — rather than the first one in
 * content-stream order — matters for stacked signing blocks, where the
 * previous party's heading is often still inside the search box of the
 * next party's fields.
 *
 * @param items All text items on the anchor's page.
 * @param anchor The item that triggered the detection; never considered
 *   as a label source itself.
 * @param anchorX Anchor x position in PDF user-space units.
 * @param anchorY Anchor y position in PDF user-space units.
 * @returns The matching label, or `null` when no nearby item names a
 *   known recipient. Equidistant candidates resolve to the earlier item.
 */
function inferRecipient(
  items: PdfTextItem[],
  anchor: PdfTextItem,
  anchorX: number,
  anchorY: number,
): string | null {
  const RADIUS = 100;
  let nearestLabel: string | null = null;
  let nearestDistance = Number.POSITIVE_INFINITY;

  for (const candidate of items) {
    if (candidate === anchor) continue;
    if (!Array.isArray(candidate.transform) || candidate.transform.length < 6) continue;

    const dx = Number(candidate.transform[4]) - anchorX;
    const dy = Number(candidate.transform[5]) - anchorY;
    if (!Number.isFinite(dx) || !Number.isFinite(dy)) continue;
    if (Math.abs(dx) > RADIUS || Math.abs(dy) > RADIUS) continue;

    // Only a strictly closer item can replace the current best, so skip
    // the keyword scan for anything farther away.
    const distance = Math.hypot(dx, dy);
    if (distance >= nearestDistance) continue;

    const lower = candidate.str.toLowerCase();
    // Table order is the priority order when one string names several roles.
    const hit = RECIPIENT_LABELS.find(({ aliases }) =>
      aliases.some((alias) => lower.includes(alias)),
    );
    if (hit) {
      nearestLabel = hit.label;
      nearestDistance = distance;
    }
  }
  return nearestLabel;
}

/**
 * Extract per-page text + page dimensions from a PDF buffer. Uses
 * pdfjs-dist (the same library powering react-pdf in the dialog). We
 * import it dynamically so the heavy dependency only loads when the
 * detector actually runs.
 *
 * The pdfjs loading task is always destroyed before returning so the
 * parsed document does not stay resident in a long-lived server
 * process.
 *
 * @param pdfBuffer Raw PDF bytes. The bytes are copied before being
 *   handed to pdfjs, so the caller's buffer is left untouched.
 * @returns One entry per page, in page order. An empty array if pdfjs
 *   fails to load or parse — the rep gets the manual placement flow
 *   without an error toast.
 */
export async function extractPdfPages(pdfBuffer: Buffer): Promise<PdfPageView[]> {
  try {
    // pdfjs-dist 5.x ships a legacy ESM build that works in Node + Next
    // server bundles without the worker wiring needed in the browser.
    const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');
    const data = new Uint8Array(pdfBuffer);
    const loadingTask = pdfjsLib.getDocument({ data });
    try {
      const pdf = await loadingTask.promise;
      const pages: PdfPageView[] = [];
      for (let i = 1; i <= pdf.numPages; i++) {
        const page = await pdf.getPage(i);
        const viewport = page.getViewport({ scale: 1 });
        const content = await page.getTextContent();
        const items = (content.items as Array<unknown>).filter(isPdfTextItem);
        pages.push({
          pageNumber: i,
          widthPt: viewport.width,
          heightPt: viewport.height,
          items,
        });
      }
      return pages;
    } finally {
      // Release the document on success and failure alike.
      try {
        await loadingTask.destroy();
      } catch {
        // A teardown failure must not discard pages already extracted.
      }
    }
  } catch {
    // Corrupt or unparseable PDFs (or a failed pdfjs load) land here.
    // Image-only scans typically parse fine and simply yield pages with
    // no text items. Either way the dialog falls back to manual
    // placement — no rep-facing error needed.
    return [];
  }
}

/**
 * Type guard for the entries of a pdfjs text-content `items` array:
 * accepts any non-null object that has a string `str` and an array
 * `transform`. The transform's length and contents are not inspected
 * here.
 */
function isPdfTextItem(item: unknown): item is PdfTextItem {
  if (typeof item !== 'object' || item === null) return false;
  const candidate = item as Record<string, unknown>;
  if (typeof candidate.str !== 'string') return false;
  return Array.isArray(candidate.transform);
}