/**
 * Phase 4c - Auto-detect anchor scanner.
 *
 * Scans a PDF for common signing-block keywords ("Signature:", "Date:",
 * "Initials", a long run of underscores, etc.) and proposes Documenso
 * field placements positioned right after the matched anchor. Output
 * is in PERCENT coordinates so it lines up with the existing
 * `DocumensoFieldPlacement` shape consumed by the Phase 3 service.
 *
 * Confidence calculation is conservative: an explicit keyword match
 * scores higher than a generic underscore-run; the field-type-specific
 * regexes are tried in priority order so a `"Date of Signature:"`
 * anchor doesn't double-place as both DATE and SIGNATURE.
 *
 * This is intentionally pdf-content driven (text-extraction based) -
 * the alternative (image-of-PDF + OCR) is the bigger berth-PDF parser
 * tier-3 path; we keep this lightweight so it runs in <500ms on a
 * 10-page contract.
 */

import type { DocumensoFieldType } from '@/lib/services/documenso-client';

/** Result of detection, one entry per matched anchor. */
export interface DetectedField {
  type: DocumensoFieldType;
  /** 1-indexed page number. */
  pageNumber: number;
  /**
   * All four values are 0-100 percent of page dimensions. `pageY` is
   * derived from the PDF-space Y of the anchor (origin lower-left), not
   * from a top-left web origin.
   */
  pageX: number;
  pageY: number;
  pageWidth: number;
  pageHeight: number;
  /** 0..1 - how sure the scanner is. */
  confidence: number;
  /** Verbatim (trimmed) anchor that triggered the detection (display + debug). */
  anchorText: string;
  /**
   * Inferred recipient label - one of the `label` values in
   * `RECIPIENT_LABELS`, or null when nothing nearby matched. Phase 4d
   * maps these to recipients by role/name.
   */
  inferredRecipientLabel?: string | null;
}

/**
 * Anchor → field-type pattern table entry. Order matters: earlier
 * patterns win when two anchors overlap on the same text item (e.g.
 * "Date of Signature" matches both DATE and SIGNATURE - DATE goes first
 * because it's the more specific pattern).
 */
interface AnchorPattern {
  type: DocumensoFieldType;
  /** Tested against the LOWER-CASED anchor text. */
  match: RegExp;
  /**
   * Suggested field box in PDF points (72 dpi). Converted to percent
   * per-page after extraction.
   */
  widthPt: number;
  heightPt: number;
  /**
   * Bias added to the base confidence. Specific keywords get a bump
   * over the generic underscore catch-all.
   */
  confidenceBoost: number;
}

/** Confidence every detection starts from before the per-pattern boost. */
const BASE_CONFIDENCE = 0.5;
/** Floor for the anchor width - pdfjs sometimes reports a too-small width for short tokens. */
const MIN_ANCHOR_WIDTH_PT = 30;
/** Width estimate per character, used only when the text item carries no width. */
const FALLBACK_CHAR_WIDTH_PT = 5;
/** Horizontal gap between the end of the anchor and the proposed field. */
const ANCHOR_GAP_PT = 5;
/** Fields whose origin lies beyond this percentage of the page are dropped. */
const MAX_ORIGIN_PERCENT = 95;
/** Half-size (in points) of the box searched for a recipient keyword. */
const RECIPIENT_SEARCH_RADIUS_PT = 100;

const ANCHOR_PATTERNS: AnchorPattern[] = [
  // DATE - more specific than SIGNATURE for the common "Date of
  // Signature:" case, so listed first. The leading boundary keeps words
  // that merely END in "date" ("update", "candidate") from matching.
  {
    type: 'DATE',
    match: /(?:^|\b)(?:dated|date(?:\s+of\s+signature)?)[:\s_-]+/i,
    widthPt: 80,
    heightPt: 20,
    confidenceBoost: 0.2,
  },
  // INITIALS - pre-empts NAME because "Initial:" is short and unique.
  {
    type: 'INITIALS',
    match: /(?:^|\b)(?:initials?)[:\s_-]+/i,
    widthPt: 50,
    heightPt: 30,
    confidenceBoost: 0.2,
  },
  // EMAIL - explicit email anchor.
  {
    type: 'EMAIL',
    match: /(?:^|\b)e-?mail[:\s_-]+/i,
    widthPt: 200,
    heightPt: 20,
    confidenceBoost: 0.2,
  },
  // NAME - printed/full name labels.
  {
    type: 'NAME',
    match: /(?:^|\b)(?:printed\s*)?(?:full\s+)?name[:\s_-]+/i,
    widthPt: 150,
    heightPt: 20,
    confidenceBoost: 0.15,
  },
  // SIGNATURE - broadest of the signing-block patterns.
  {
    type: 'SIGNATURE',
    match: /(?:^|\b)(?:signature|sign\s*here|signed\s*by|signed\s*at)[:\s_-]+/i,
    widthPt: 150,
    heightPt: 30,
    confidenceBoost: 0.2,
  },
  // SIGNATURE - explicit "X" mark followed by a blank line. The text is
  // lower-cased before testing, so the pattern must be case-insensitive;
  // the left guard stops words ending in "x" ("fax ____") from counting
  // as an X mark.
  {
    type: 'SIGNATURE',
    match: /(?:^|[^a-z0-9])x\s*_{4,}/i,
    widthPt: 150,
    heightPt: 30,
    confidenceBoost: 0.15,
  },
  // Catch-all: a run of underscores not preceded by a more specific
  // keyword (which would have matched above). Defaults to TEXT.
  {
    type: 'TEXT',
    match: /_{8,}/,
    widthPt: 200,
    heightPt: 20,
    confidenceBoost: 0,
  },
];

/**
 * Recipient labels we know how to match against. Kept in priority
 * order so "Buyer Notary" wins NOTARY (more specific than BUYER on a
 * notary-block tail). Each alias is lower-cased and matched as a
 * substring of the lower-cased candidate text; the first entry with a
 * matching alias wins.
 *
 * NOTE(review): because the first match wins, the `Witness` entry is
 * fully shadowed by `Notary` (alias "witness") and the `Seller` entry by
 * `Developer` (aliases "seller"/"vendor"); `Client` is only reachable via
 * "customer" since `Buyer` claims "client". Confirm this mapping is
 * intended before relying on those labels.
 */
const RECIPIENT_LABELS: Array<{ label: string; aliases: string[] }> = [
  { label: 'Notary', aliases: ['notary', 'witness'] },
  { label: 'Witness', aliases: ['witness'] },
  { label: 'Developer', aliases: ['developer', 'seller', 'vendor'] },
  { label: 'Approver', aliases: ['approver', 'manager'] },
  { label: 'Buyer', aliases: ['buyer', 'purchaser', 'client'] },
  { label: 'Seller', aliases: ['seller', 'vendor'] },
  { label: 'Client', aliases: ['client', 'customer'] },
];

/**
 * A single text item returned by pdfjs-dist. The transform array
 * encodes the position + scale of the text via PDF's affine matrix:
 * `[scaleX, skewY, skewX, scaleY, translateX, translateY]`. We use
 * `(translateX, translateY)` as the anchor's lower-left corner.
 */
interface PdfTextItem {
  str: string;
  /** PDF affine [a, b, c, d, e, f]. (e, f) is position. */
  transform: number[];
  /** Item width in PDF user-space units. */
  width?: number;
  /** Item height - usually equals scaleY. */
  height?: number;
}

/** Text items of one page together with the page's size at scale 1. */
interface PdfPageView {
  pageNumber: number;
  widthPt: number;
  heightPt: number;
  items: PdfTextItem[];
}

/**
 * Read the `(translateX, translateY)` origin out of a text item's
 * transform. Returns null when the transform is missing, too short, or
 * holds non-finite values - defensive against exotic PDF encodings.
 */
function readItemOrigin(item: PdfTextItem): { x: number; y: number } | null {
  if (!Array.isArray(item.transform) || item.transform.length < 6) return null;
  const x = Number(item.transform[4]);
  const y = Number(item.transform[5]);
  if (!Number.isFinite(x) || !Number.isFinite(y)) return null;
  return { x, y };
}

/**
 * Detect signing-block fields on a single extracted page. Pure: no I/O,
 * depends only on the page's items and dimensions.
 */
function detectFieldsOnPage(page: PdfPageView): DetectedField[] {
  const found: DetectedField[] = [];

  for (const item of page.items) {
    const origin = readItemOrigin(item);
    if (origin === null) continue;

    // First matching pattern wins for this item - earlier
    // (more-specific) patterns shadow later ones.
    const lower = item.str.toLowerCase();
    const pattern = ANCHOR_PATTERNS.find((candidate) => candidate.match.test(lower));
    if (pattern === undefined) continue;

    // Place the field immediately after the anchor with a small
    // horizontal gap. The anchor's width is approximate; pdfjs
    // sometimes gives a too-small width for short tokens so it is
    // floored to avoid the field landing on top of the text.
    const anchorWidthPt = Math.max(
      MIN_ANCHOR_WIDTH_PT,
      item.width ?? lower.length * FALLBACK_CHAR_WIDTH_PT,
    );
    const fieldXPt = origin.x + anchorWidthPt + ANCHOR_GAP_PT;
    // PDF user-space origin is the lower-left; transform[5] is the
    // baseline of the text so the field's lower-left also lives
    // there. The percent stays in PDF coordinates here.
    // NOTE(review): the original author states the existing placeFields
    // helper handles the conversion to a top-left origin - confirm.
    const fieldYPt = origin.y;

    const pageX = (fieldXPt / page.widthPt) * 100;
    const pageY = (fieldYPt / page.heightPt) * 100;
    const pageWidth = (pattern.widthPt / page.widthPt) * 100;
    const pageHeight = (pattern.heightPt / page.heightPt) * 100;

    // NaN compares false against every bound below, so a NaN width or a
    // zero-sized page would otherwise slip through the range checks.
    const allFinite = [pageX, pageY, pageWidth, pageHeight].every((value) =>
      Number.isFinite(value),
    );
    if (!allFinite) continue;

    // Hard-skip fields that would land off-page (defensive - a
    // misparsed transform can blow up the coordinate space).
    if (pageX < 0 || pageX > MAX_ORIGIN_PERCENT) continue;
    if (pageY < 0 || pageY > MAX_ORIGIN_PERCENT) continue;
    if (pageWidth <= 0 || pageHeight <= 0) continue;

    found.push({
      type: pattern.type,
      pageNumber: page.pageNumber,
      pageX,
      pageY,
      pageWidth,
      pageHeight,
      confidence: BASE_CONFIDENCE + pattern.confidenceBoost,
      anchorText: item.str.trim(),
      inferredRecipientLabel: inferRecipient(page.items, item, origin.x, origin.y),
    });
  }

  return found;
}

/**
 * Detect signing-block fields in a PDF. Each detection is positioned
 * just past the end of the text item containing the anchor, offset 5pt
 * to the right so the placeholder doesn't visually overlap the keyword.
 *
 * @param pdfBuffer Raw bytes of the PDF to scan.
 * @returns One entry per text item that matched an anchor pattern, in
 *   page then item order. Empty when the PDF has no extractable text
 *   (image-only scans) or cannot be parsed; the caller should fall back
 *   to drag-place-manual in that case.
 */
export async function detectFields(pdfBuffer: Buffer): Promise<DetectedField[]> {
  const pages = await extractPdfPages(pdfBuffer);
  return pages.flatMap((page) => detectFieldsOnPage(page));
}

/**
 * Walk the page's other text items within ±100pt (on both axes) of the
 * anchor and find a recipient-label keyword. Used to seed the recipient
 * assignment side-panel; the rep can override.
 *
 * The first item (in `items` order) inside the box that contains any
 * alias decides the result - not the nearest one. The anchor item itself
 * is never inspected.
 *
 * @param items All text items of the anchor's page.
 * @param anchor The matched anchor item (skipped by identity).
 * @param anchorX PDF-space X of the anchor.
 * @param anchorY PDF-space Y of the anchor.
 * @returns The matching label from `RECIPIENT_LABELS`, or null.
 */
function inferRecipient(
  items: PdfTextItem[],
  anchor: PdfTextItem,
  anchorX: number,
  anchorY: number,
): string | null {
  for (const candidate of items) {
    if (candidate === anchor) continue;

    const origin = readItemOrigin(candidate);
    if (origin === null) continue;
    if (Math.abs(origin.x - anchorX) > RECIPIENT_SEARCH_RADIUS_PT) continue;
    if (Math.abs(origin.y - anchorY) > RECIPIENT_SEARCH_RADIUS_PT) continue;

    const text = candidate.str.toLowerCase();
    const hit = RECIPIENT_LABELS.find(({ aliases }) =>
      aliases.some((alias) => text.includes(alias)),
    );
    if (hit !== undefined) return hit.label;
  }
  return null;
}

/**
 * Extract per-page text + page dimensions from a PDF buffer. Uses
 * pdfjs-dist (the same library powering react-pdf in the dialog). We
 * import it dynamically so the heavy dependency only loads when the
 * detector actually runs.
 *
 * @param pdfBuffer Raw bytes of the PDF.
 * @returns One `PdfPageView` per page, 1-indexed. An image-only scan
 *   parses fine and yields pages with empty `items`. Returns an empty
 *   array if pdfjs cannot be loaded or fails to parse - the rep gets the
 *   manual placement flow without an error toast.
 */
export async function extractPdfPages(pdfBuffer: Buffer): Promise<PdfPageView[]> {
  try {
    // pdfjs-dist 5.x ships a legacy ESM build that works in Node + Next
    // server bundles without the worker wiring needed in the browser.
    const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');
    // Hand pdfjs its own copy of the bytes rather than the caller's Buffer.
    const data = new Uint8Array(pdfBuffer);
    const loadingTask = pdfjsLib.getDocument({ data });

    try {
      const pdf = await loadingTask.promise;
      const pages: PdfPageView[] = [];

      for (let i = 1; i <= pdf.numPages; i++) {
        const page = await pdf.getPage(i);
        const viewport = page.getViewport({ scale: 1 });
        const content = await page.getTextContent();
        // pdfjs also emits marked-content entries without `str`; keep
        // only the entries that look like positioned text.
        const items = (content.items as Array<unknown>).filter(isPdfTextItem);
        pages.push({
          pageNumber: i,
          widthPt: viewport.width,
          heightPt: viewport.height,
          items,
        });
      }

      return pages;
    } finally {
      // Release the document (and its worker) whether or not extraction
      // succeeded. A failing teardown must not discard good results.
      try {
        await loadingTask.destroy();
      } catch {
        // Best-effort cleanup only.
      }
    }
  } catch {
    // Corrupt/unparseable PDFs (or a failure to load pdfjs itself) land
    // here. The dialog falls back to manual placement - no rep-facing
    // error needed.
    return [];
  }
}

/** Runtime guard: true when `item` has a string `str` and an array `transform`. */
function isPdfTextItem(item: unknown): item is PdfTextItem {
  if (!item || typeof item !== 'object') return false;
  const i = item as Record<string, unknown>;
  return typeof i.str === 'string' && Array.isArray(i.transform);
}