diff --git a/src/jobs/processors/imap-bounce-poller.ts b/src/jobs/processors/imap-bounce-poller.ts index ba408721..b3d04581 100644 --- a/src/jobs/processors/imap-bounce-poller.ts +++ b/src/jobs/processors/imap-bounce-poller.ts @@ -2,24 +2,58 @@ * Phase 6 - IMAP bounce poller. * * Polls the configured IMAP inbox for delivery-status notifications, runs - * each through `parseBounce()`, and matches the original recipient against - * a recent `document_sends` row. When matched, updates the send row's + * each through `parseBounce()`, and matches the bounce back to the exact + * originating `document_sends` row. When matched, updates the send row's * bounce_* columns and fires an `email_bounced` notification to the rep * who originated the send (hard/soft only - out-of-office is logged but * not surfaced as an actionable alert). * - * The job runs globally (no per-port context). IMAP creds are read from - * environment variables (`IMAP_HOST` / `IMAP_PORT` / `IMAP_USER` / - * `IMAP_PASS`) - when any is missing the poll is a no-op so the worker - * boots happily in dev. Run cadence is set in `src/lib/queue/scheduler.ts` - * (every 15 minutes). + * ── Matching strategy (audit M8) ───────────────────────────────────────── + * The job runs globally against a SINGLE env-configured IMAP inbox shared + * by every port (`IMAP_HOST` / `IMAP_PORT` / `IMAP_USER` / `IMAP_PASS`) - + * there is no per-port IMAP config today. A naive "most recent send to this + * recipient address within N days" match is cross-tenant unsafe: if two + * ports both emailed `victim@x.com`, the bounce would be pinned to whichever + * sent most recently, mutating the wrong port's row and leaking the bounce + * reason into the wrong tenant's notification feed. A forged NDR (the body + * is attacker-controllable) could likewise mark an arbitrary cross-port send + * bounced. + * + * To avoid that we match in two tiers: + * 1. **By Message-ID (precise, port-unambiguous).** Every outbound send + * stores the SMTP `Message-ID` on `document_sends.message_id`. A + * conformant NDR echoes that id back via `In-Reply-To` / `References` + * or in its returned-headers block; `parseBounce` surfaces them as + * `originalMessageIds`. An exact id match identifies one specific send + * row (and therefore one specific port) with no guessing. + * 2. **By recipient + time window (fallback, single-port only).** Used + * only when no Message-ID could be extracted (older/odd NDR shapes). + * If the recipient+window query returns sends spanning MORE THAN ONE + * port, we DO NOT guess - the message is skipped/flagged rather than + * mis-attributed. In the common single-tenant deployment every match is + * the same port, so this fallback still works there. + * + * LIMITATION: with one global inbox and a recipient-only fallback we cannot + * attribute a bounce that (a) carries no usable Message-ID AND (b) matches + * sends from more than one port. Those are logged (`skippedAmbiguous`) and + * left untouched. Wiring per-port IMAP (a `getSalesImapConfig(portId)` + + * `eq(documentSends.portId, portId)` scope) would remove the limitation + * entirely; tracked alongside audit M8. + * + * The `originalRecipient` value is parsed from the untrusted inbound body + * and is already validated against the RFC5322 regex inside `parseBounce`, + * so by the time it reaches the query/notification here it is a + * syntactically valid address (audit L16a). * * State (last-run timestamp) is persisted to `system_settings` under * `bounce_poller_state` with `port_id = NULL`, so concurrent worker * instances see the same checkpoint. On first run the lookback is 24 h. + * When any IMAP_* env var is missing the poll is a no-op so the worker + * boots happily in dev. Run cadence is set in `src/lib/queue/scheduler.ts` + * (every 15 minutes). */ -import { and, desc, eq, gte, isNull } from 'drizzle-orm'; +import { and, desc, eq, gte, inArray, isNull } from 'drizzle-orm'; import { db } from '@/lib/db'; import { documentSends } from '@/lib/db/schema/brochures'; @@ -118,6 +152,7 @@ export async function processImapBouncePoll(): Promise { let matched = 0; let skippedNoMatch = 0; let skippedNonBounce = 0; + let skippedAmbiguous = 0; try { await client.connect(); @@ -134,27 +169,85 @@ export async function processImapBouncePoll(): Promise { try { if (!message.source) continue; const parsed = await parseBounce(message.source); - if (!parsed.originalRecipient || parsed.bounceClass === 'unknown') { + if (parsed.bounceClass === 'unknown') { skippedNonBounce++; continue; } const lookback = new Date(Date.now() - SEND_MATCH_WINDOW_DAYS * 86_400_000); - // Most-recent matching send to this recipient; the recipient - // may have been sent multiple files in the same window - the - // bounce always refers to the latest. - const candidates = await db - .select() - .from(documentSends) - .where( - and( - eq(documentSends.recipientEmail, parsed.originalRecipient), - gte(documentSends.sentAt, lookback), - ), - ) - .orderBy(desc(documentSends.sentAt)) - .limit(1); - const target = candidates[0]; + + // ── Tier 1: exact Message-ID match (port-unambiguous). ────────── + // The NDR echoed back one or more original Message-IDs; each maps + // to at most one send row, so a hit identifies the exact + // originating send (and its port) with zero guessing. This works + // even cross-tenant on the shared inbox. + let target: typeof documentSends.$inferSelect | undefined; + if (parsed.originalMessageIds.length > 0) { + const byMessageId = await db + .select() + .from(documentSends) + .where(inArray(documentSends.messageId, parsed.originalMessageIds)) + .orderBy(desc(documentSends.sentAt)) + .limit(2); + if (byMessageId.length > 0) { + // Message-IDs are unique per send; >1 hit would mean a stored + // collision - still safe to take the most recent, but log it. + if (byMessageId.length > 1) { + logger.warn( + { messageIds: parsed.originalMessageIds, uid: message.uid }, + 'IMAP bounce: multiple sends share a Message-ID; using most recent', + ); + } + target = byMessageId[0]; + } + } + + // ── Tier 2: recipient + window fallback (single-port only). ───── + // Only when no Message-ID could be matched. We MUST NOT guess + // across tenants on the shared inbox, so we refuse to attribute + // when candidate sends span more than one port (audit M8). + if (!target) { + if (!parsed.originalRecipient) { + skippedNoMatch++; + continue; + } + // Pull a small set (not just the latest) so we can detect a + // cross-port collision before committing to any one row. + const candidates = await db + .select() + .from(documentSends) + .where( + and( + eq(documentSends.recipientEmail, parsed.originalRecipient), + gte(documentSends.sentAt, lookback), + ), + ) + .orderBy(desc(documentSends.sentAt)) + .limit(10); + if (candidates.length === 0) { + skippedNoMatch++; + continue; + } + const distinctPorts = new Set(candidates.map((c) => c.portId)); + if (distinctPorts.size > 1) { + // Ambiguous: this recipient was emailed by multiple ports in + // the window and we have no Message-ID to disambiguate. Skip + // rather than mis-attribute to the wrong tenant. + skippedAmbiguous++; + logger.warn( + { + recipient: parsed.originalRecipient, + ports: [...distinctPorts], + uid: message.uid, + }, + 'IMAP bounce: recipient matched sends across multiple ports with no Message-ID; skipping to avoid cross-tenant misattribution', + ); + continue; + } + // Single port: the most-recent send is the bounce target. + target = candidates[0]; + } + if (!target) { skippedNoMatch++; continue; @@ -185,7 +278,10 @@ export async function processImapBouncePoll(): Promise { userId: target.sentByUserId, type: 'email_bounced', title: 'Email bounced', - description: `Your email to ${parsed.originalRecipient} bounced - ${parsed.reason}`, + // Use the trusted stored recipient from the matched send row, + // not the parsed-from-untrusted-body value (which can be null + // on a Message-ID-only match and is attacker-influenced). + description: `Your email to ${target.recipientEmail} bounced - ${parsed.reason}`, link: target.interestId ? `/interests/${target.interestId}` : undefined, entityType: 'document_send', entityId: target.id, @@ -200,7 +296,14 @@ export async function processImapBouncePoll(): Promise { await savePollerState({ lastRunAt: runStartedAt.toISOString() }); logger.info( - { scanned, matched, skippedNoMatch, skippedNonBounce, sinceISO: since.toISOString() }, + { + scanned, + matched, + skippedNoMatch, + skippedNonBounce, + skippedAmbiguous, + sinceISO: since.toISOString(), + }, 'IMAP bounce poll complete', ); } finally { diff --git a/src/lib/email/bounce-parser.ts b/src/lib/email/bounce-parser.ts index ac37655b..24350cd3 100644 --- a/src/lib/email/bounce-parser.ts +++ b/src/lib/email/bounce-parser.ts @@ -34,10 +34,87 @@ export interface ParsedBounce { reason: string; /** Inbound message-id (or in-reply-to header) for cross-reference. */ inReplyTo: string | null; + /** + * Candidate Message-IDs of the *original* (bounced) message, gathered + * from the NDR's `In-Reply-To` + `References` headers AND from the + * returned-headers block embedded in the DSN body (RFC 3464 carries the + * failed message's headers in a `message/rfc822` part). The poller + * matches these against `document_sends.message_id` to pin a bounce to + * the exact originating send (and therefore the correct port) without + * having to guess by recipient + time window. Angle brackets/whitespace + * are stripped and values lowercased so comparison is normalization-safe. + */ + originalMessageIds: string[]; /** SMTP status code (e.g. "5.1.1") if the NDR carried one. */ statusCode: string | null; } +/** + * RFC 5322-ish address validator. Deliberately identical to the strict + * regex used on the outbound side (`document-sends.service.ts`) so a + * recipient that was rejected at send time can't slip through here either. + * The `originalRecipient` we parse comes from an attacker-controllable + * inbound NDR body, so it MUST be validated before it lands in a DB query + * or a user-facing notification string (audit L16a). + */ +const RFC5322_EMAIL = /^[^\s@]+@[^\s@]+\.[^\s@]+$/; + +function isValidEmail(email: string | null | undefined): email is string { + return Boolean(email) && email!.length <= 254 && RFC5322_EMAIL.test(email!); +} + +/** Strip angle brackets + whitespace and lowercase a Message-ID for + * normalization-safe equality. Returns null for empty/garbage input. */ +function normalizeMessageId(raw: string | null | undefined): string | null { + if (!raw) return null; + const trimmed = raw + .trim() + .replace(/^<+|>+$/g, '') + .trim() + .toLowerCase(); + // A Message-ID must contain an '@' and have no internal whitespace; reject + // anything that doesn't look like one so we never query on noise. + if (!trimmed || /\s/.test(trimmed) || !trimmed.includes('@')) return null; + return trimmed; +} + +/** + * Collect every plausible original-message-id from the parsed NDR: + * 1. `In-Reply-To` (single id) and `References` (one or many) headers. + * 2. The returned-headers block in the DSN body — many MTAs inline the + * failed message's `Message-ID:` header inside the human-readable or + * `message/rfc822` part rather than setting In-Reply-To on the NDR. + * Deduplicated + normalized. + */ +function collectOriginalMessageIds( + inReplyTo: string | null, + references: string[] | string | undefined, + bodyText: string, +): string[] { + const out = new Set(); + const push = (v: string | null | undefined): void => { + const n = normalizeMessageId(v); + if (n) out.add(n); + }; + + push(inReplyTo); + if (Array.isArray(references)) { + for (const r of references) push(r); + } else if (typeof references === 'string') { + // A single `References` header may carry space-separated ids. + for (const r of references.split(/\s+/)) push(r); + } + + // Returned `Message-ID:` header(s) embedded in the DSN body. + const re = /^\s*Message-ID:\s*(<[^>\r\n]+>)/gim; + let m: RegExpExecArray | null; + while ((m = re.exec(bodyText)) !== null) { + push(m[1]); + } + + return [...out]; +} + const HARD_BOUNCE_STATUSES = new Set([ '5.0.0', '5.1.1', // mailbox does not exist @@ -116,6 +193,7 @@ export async function parseBounce(raw: string | Buffer): Promise { bounceClass: 'unknown', reason: 'Failed to parse message', inReplyTo: null, + originalMessageIds: [], statusCode: null, }; } @@ -123,13 +201,20 @@ export async function parseBounce(raw: string | Buffer): Promise { const subject = parsed.subject ?? ''; const bodyText = parsed.text ?? ''; const inReplyTo = (parsed.inReplyTo as string | undefined) ?? null; + const originalMessageIds = collectOriginalMessageIds(inReplyTo, parsed.references, bodyText); if (looksLikeOoo(subject, bodyText)) { + const oooFrom = parsed.from?.value[0]?.address ?? null; return { - originalRecipient: parsed.from?.value[0]?.address ?? null, + // OOO auto-replies come *from* the recipient, so the From address is + // the "original recipient". Validate it the same way as a bounce + // recipient (audit L16a) - an invalid value would only ever pollute a + // string we don't act on for OOO, but we keep the contract uniform. + originalRecipient: isValidEmail(oooFrom) ? oooFrom : null, bounceClass: 'ooo', reason: 'Out-of-office auto-reply', inReplyTo, + originalMessageIds, statusCode: null, }; } @@ -137,7 +222,11 @@ export async function parseBounce(raw: string | Buffer): Promise { // Try to walk the multipart/report DSN structure first; falls back to // plain-text heuristics for non-RFC-compliant Outlook NDRs. const statusCode = extractStatusFromBody(bodyText); - const originalRecipient = extractRecipientFromBody(bodyText); + const rawRecipient = extractRecipientFromBody(bodyText); + // The recipient is parsed out of an attacker-controllable inbound body; + // reject anything that isn't a syntactically valid address before it can + // reach a DB query or a notification string (audit L16a). + const originalRecipient = isValidEmail(rawRecipient) ? rawRecipient : null; const cls = classifyByStatus(statusCode); @@ -154,6 +243,7 @@ export async function parseBounce(raw: string | Buffer): Promise { bounceClass: 'unknown', reason: 'No bounce indicators detected', inReplyTo, + originalMessageIds, statusCode, }; } @@ -163,6 +253,7 @@ export async function parseBounce(raw: string | Buffer): Promise { bounceClass: cls ?? 'hard', reason: deriveReason(statusCode, bodyText, subject), inReplyTo, + originalMessageIds, statusCode, }; } diff --git a/src/lib/email/shell.ts b/src/lib/email/shell.ts index 311ac388..c42f27f0 100644 --- a/src/lib/email/shell.ts +++ b/src/lib/email/shell.ts @@ -50,6 +50,17 @@ export function renderShell({ title, body, branding }: ShellOpts): string { // any host). Mail clients have no app origin, so re-absolutize here. const logoUrl = absolutizeBrandingUrl(branding?.logoUrl ?? DEFAULT_LOGO_URL); const backgroundUrl = absolutizeBrandingUrl(branding?.backgroundUrl ?? DEFAULT_BACKGROUND_URL); + // SECURITY / trust boundary (audit L16b): `emailHeaderHtml` / `emailFooterHtml` + // are admin-authored branding HTML and are interpolated RAW into the email + // body below (intentional - admins legitimately need to paste a styled + // legal footer / marketing strip). Authoring them is gated on the + // `manage_settings` permission, so the worst case is a self-XSS by the + // highest-privileged user on a tenant they already control - not a + // cross-tenant or privilege-escalation vector (each port reads only its + // own settings rows). If a future tenant model has MULTIPLE admins with + // mutually-distrusting `manage_settings` holders, allowlist-sanitize these + // two fields here (e.g. via a sanitize-html allowlist of safe tags/attrs) + // before interpolation. Until then we keep them raw by design. const headerHtml = branding?.emailHeaderHtml ?? ''; const footerHtml = branding?.emailFooterHtml ?? ''; diff --git a/src/lib/email/template-overrides.ts b/src/lib/email/template-overrides.ts index a8fca990..36e0ba2b 100644 --- a/src/lib/email/template-overrides.ts +++ b/src/lib/email/template-overrides.ts @@ -28,14 +28,32 @@ export async function loadSubjectOverride( return typeof value === 'string' && value.trim() ? value : null; } -/** Synchronous client-side helper for substituting {{token}} placeholders. */ +/** + * Synchronous helper for substituting {{token}} placeholders in an email + * subject line. + * + * Defensive CRLF neutralization (audit L16c): a subject is an email *header*, + * and a CR/LF inside a substituted token value is the classic header-injection + * primitive (smuggle a `Bcc:` / second header / a fake body). nodemailer + * already strips CR/LF from header values before transmission, so this is not + * exploitable in practice through our send path - but a token value can flow + * from user-controlled data (client name, berth label, …), so we strip + * CR/LF (and the rest of the C0/DEL control range) from each substituted value + * here too, in depth, rather than relying solely on the transport. The static + * template text is admin-authored and left untouched. + */ export function applySubjectTokens( template: string, tokens: Record, ): string { return template.replace(/\{\{(\w+)\}\}/g, (match, name: string) => { const v = tokens[name]; - return v === undefined || v === null ? match : String(v); + if (v === undefined || v === null) return match; + // Replace CR/LF (and the rest of the C0/DEL control range) with a single + // space so a multi-line token value can never break the subject onto a + // new header line. + + return String(v).replace(/[\x00-\x1f\x7f]+/g, ' '); }); }