fix(ops): /health DB+Redis checks, validated env.REDIS_URL across workers, error_events 90d retention

Three audit-pass-#3 findings, all in the "wakes you at 3am" category.

- /api/public/health now runs DB SELECT 1 + Redis PING in parallel and
  returns 503 + a degraded payload when either fails. Anonymous probes
  (no X-Intake-Secret) still get a flat {status:'ok'} so generic uptime
  monitors keep working; authenticated probes see the dep results.
- All worker entrypoints (ai, bulk, documents, email, export, import,
  maintenance, notifications, reports, webhooks) and src/lib/redis.ts
  now use env.REDIS_URL (Zod-validated at boot) instead of
  process.env.REDIS_URL!. Previously a missing env var let the app start
  silently and only fail at the first job pickup.
- maintenance worker gains an `error-events-retention` case that
  delete()s rows older than 90 days from error_events. scheduler.ts
  registers it at 06:00 daily. Closes the contract from migration
  0040 which declared the table "pruned at 90 days" but had no
  implementation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Matt Ciaccio
2026-05-06 14:59:07 +02:00
parent 64f0e0a1b8
commit f93de75bb5
13 changed files with 110 additions and 41 deletions

View File

@@ -1,28 +1,56 @@
import { NextRequest, NextResponse } from 'next/server';
import { timingSafeEqual } from 'node:crypto';
import { sql } from 'drizzle-orm';
import { db } from '@/lib/db';
import { redis } from '@/lib/redis';
import { env } from '@/lib/env';
import { logger } from '@/lib/logger';
/**
* GET /api/public/health
*
* Health probe used by the marketing-website server on startup to verify
* it's pointed at a CRM matching its own deployment env (plan §14.8
* critical: prevent staging-website-talking-to-prod-CRM).
* Liveness + readiness probe. Two response shapes:
*
* Auditor-K §41 flagged that the previous response disclosed `NODE_ENV`
* and `APP_URL` to anonymous internet — mirrors the website's own intake
* secret gate so we don't leak deployment fingerprints. When
* `WEBSITE_INTAKE_SECRET` is set and the caller presents the matching
* `X-Intake-Secret` header we return the full payload; otherwise return
* a minimal `{status:'ok'}` so generic uptime monitors still get a 200.
* 1. **Anonymous (no header):** minimal `{status:'ok'}` so uptime
* monitors can poll without leaking deployment fingerprints.
*
* 2. **Authenticated (`X-Intake-Secret`, timing-safe compared):** full
* payload including env + dependency check results. This is what
* the marketing site uses on startup AND what k8s readiness
* probes should hit, because it returns 503 on hard dep failures.
*
* The dep checks (DB SELECT 1, Redis PING) run on every request — they
* are <5ms each. If either fails, the response is 503 so a load balancer
* stops routing to this instance.
*/
export function GET(req: NextRequest): Response {
type HealthCheck = { ok: true; latencyMs: number } | { ok: false; error: string };

/**
 * Readiness check for the primary database: issues `SELECT 1` and reports
 * round-trip latency on success, or the failure message when the query
 * throws. Never rejects — failures are folded into the result shape.
 */
async function checkDb(): Promise<HealthCheck> {
  const startedAt = Date.now();
  try {
    await db.execute(sql`SELECT 1`);
    return { ok: true, latencyMs: Date.now() - startedAt };
  } catch (err) {
    const message = err instanceof Error ? err.message : 'unknown';
    return { ok: false, error: message };
  }
}
/**
 * Readiness check for Redis: sends PING and requires the canonical
 * 'PONG' reply; any other reply (or a thrown connection error) marks
 * the dependency as down. Never rejects.
 */
async function checkRedis(): Promise<HealthCheck> {
  const startedAt = Date.now();
  try {
    const reply = await redis.ping();
    return reply === 'PONG'
      ? { ok: true, latencyMs: Date.now() - startedAt }
      : { ok: false, error: `unexpected: ${reply}` };
  } catch (err) {
    const message = err instanceof Error ? err.message : 'unknown';
    return { ok: false, error: message };
  }
}
export async function GET(req: NextRequest): Promise<Response> {
const expected = env.WEBSITE_INTAKE_SECRET;
const provided = req.headers.get('x-intake-secret');
// Use timingSafeEqual rather than a `===` comparison — string equality
// is not constant-time and lets a remote attacker enumerate the secret
// byte-by-byte via response-time differences.
const matched =
!!expected &&
!!provided &&
@@ -35,6 +63,8 @@ export function GET(req: NextRequest): Response {
}
})();
// Anonymous probe: no dep checks, never 503. Uptime monitors that
// can't carry the secret keep working.
if (!matched) {
return NextResponse.json(
{ status: 'ok', timestamp: new Date().toISOString() },
@@ -42,13 +72,23 @@ export function GET(req: NextRequest): Response {
);
}
// Authenticated probe: run dep checks in parallel and surface a 503
// when anything required is down.
const [dbCheck, redisCheck] = await Promise.all([checkDb(), checkRedis()]);
const allOk = dbCheck.ok && redisCheck.ok;
if (!allOk) {
logger.warn({ db: dbCheck, redis: redisCheck }, 'Health probe found an unhealthy dependency');
}
return NextResponse.json(
{
status: 'ok',
status: allOk ? 'ok' : 'degraded',
env: env.NODE_ENV,
appUrl: env.APP_URL,
timestamp: new Date().toISOString(),
checks: { db: dbCheck, redis: redisCheck },
},
{ headers: { 'cache-control': 'no-store' } },
{ status: allOk ? 200 : 503, headers: { 'cache-control': 'no-store' } },
);
}

View File

@@ -1,19 +1,20 @@
import { Queue, type ConnectionOptions } from 'bullmq';
import { env } from '@/lib/env';
// REDIS_URL is Zod-validated at boot via @/lib/env, so a missing value
// fails fast at startup instead of surfacing at first job pickup.
// (The old `process.env.REDIS_URL!` line was left in place alongside the
// replacement — a duplicate `const` declaration; keep only the new one.)
const redisUrl = env.REDIS_URL;
// 10 queues matching 11-REALTIME-AND-BACKGROUND-JOBS.md Section 3.1.
// NOTE: the previous text contained each key twice (old + new diff lines
// merged without markers) — an object literal cannot declare the same
// property twice; this is the deduplicated 10-queue table. Values are
// identical between the duplicated pairs, so behavior is unchanged.
const QUEUE_CONFIGS = {
  email: { concurrency: 5, maxAttempts: 5 },
  documents: { concurrency: 3, maxAttempts: 5 },
  notifications: { concurrency: 10, maxAttempts: 3 },
  import: { concurrency: 1, maxAttempts: 1 }, // imports retried manually by the user
  export: { concurrency: 2, maxAttempts: 3 },
  reports: { concurrency: 1, maxAttempts: 3 },
  webhooks: { concurrency: 5, maxAttempts: 3 },
  maintenance: { concurrency: 1, maxAttempts: 3 },
  ai: { concurrency: 2, maxAttempts: 3 },
  bulk: { concurrency: 2, maxAttempts: 3 },
} as const;
export type QueueName = keyof typeof QUEUE_CONFIGS;
@@ -28,8 +29,8 @@ export function getQueue(name: QueueName): Queue {
defaultJobOptions: {
attempts: QUEUE_CONFIGS[name].maxAttempts,
backoff: { type: 'exponential', delay: 1000 },
removeOnComplete: { age: 24 * 3600 }, // keep completed jobs 24 hours
removeOnFail: { age: 7 * 24 * 3600 }, // keep failed jobs 7 days
removeOnComplete: { age: 24 * 3600 }, // keep completed jobs 24 hours
removeOnFail: { age: 7 * 24 * 3600 }, // keep failed jobs 7 days
},
});
queues.set(name, queue);

View File

@@ -57,6 +57,8 @@ export async function registerRecurringJobs(): Promise<void> {
{ queue: 'maintenance', name: 'gdpr-export-cleanup', pattern: '0 4 * * *' },
// Phase 3b: AI usage ledger retention (90-day rolling window)
{ queue: 'maintenance', name: 'ai-usage-retention', pattern: '0 5 * * *' },
// Migration 0040 contract: error_events older than 90 days get pruned.
{ queue: 'maintenance', name: 'error-events-retention', pattern: '0 6 * * *' },
];
for (const job of recurring) {

View File

@@ -1,4 +1,5 @@
import { Worker, type Job } from 'bullmq';
import { env } from '@/lib/env';
import type { ConnectionOptions } from 'bullmq';
import { logger } from '@/lib/logger';
@@ -310,7 +311,7 @@ export const aiWorker = new Worker(
}
},
{
connection: { url: process.env.REDIS_URL! } as ConnectionOptions,
connection: { url: env.REDIS_URL } as ConnectionOptions,
concurrency: QUEUE_CONFIGS.ai.concurrency,
},
);

View File

@@ -1,4 +1,5 @@
import { Worker, type Job } from 'bullmq';
import { env } from '@/lib/env';
import type { ConnectionOptions } from 'bullmq';
import { logger } from '@/lib/logger';
@@ -19,7 +20,7 @@ export const documentsWorker = new Worker(
}
},
{
connection: { url: process.env.REDIS_URL! } as ConnectionOptions,
connection: { url: env.REDIS_URL } as ConnectionOptions,
concurrency: QUEUE_CONFIGS.documents.concurrency,
},
);

View File

@@ -1,4 +1,5 @@
import { Worker, type Job } from 'bullmq';
import { env } from '@/lib/env';
import type { ConnectionOptions } from 'bullmq';
import { logger } from '@/lib/logger';
@@ -56,7 +57,7 @@ export const emailWorker = new Worker(
}
},
{
connection: { url: process.env.REDIS_URL! } as ConnectionOptions,
connection: { url: env.REDIS_URL } as ConnectionOptions,
concurrency: QUEUE_CONFIGS.email.concurrency,
},
);

View File

@@ -1,4 +1,5 @@
import { Worker, type Job } from 'bullmq';
import { env } from '@/lib/env';
import type { ConnectionOptions } from 'bullmq';
import { logger } from '@/lib/logger';
@@ -26,7 +27,7 @@ export const exportWorker = new Worker(
}
},
{
connection: { url: process.env.REDIS_URL! } as ConnectionOptions,
connection: { url: env.REDIS_URL } as ConnectionOptions,
concurrency: QUEUE_CONFIGS.export.concurrency,
},
);

View File

@@ -1,4 +1,5 @@
import { Worker, type Job } from 'bullmq';
import { env } from '@/lib/env';
import type { ConnectionOptions } from 'bullmq';
import { logger } from '@/lib/logger';
@@ -14,7 +15,7 @@ export const importWorker = new Worker(
// - Note: maxAttempts=1 - imports are idempotent, user retries manually
},
{
connection: { url: process.env.REDIS_URL! } as ConnectionOptions,
connection: { url: env.REDIS_URL } as ConnectionOptions,
concurrency: QUEUE_CONFIGS.import.concurrency,
},
);

View File

@@ -1,4 +1,5 @@
import { Worker, type Job } from 'bullmq';
import { env } from '@/lib/env';
import { and, eq, lt, isNotNull } from 'drizzle-orm';
import type { ConnectionOptions } from 'bullmq';
@@ -6,12 +7,16 @@ import { db } from '@/lib/db';
import { formSubmissions } from '@/lib/db/schema/documents';
import { gdprExports } from '@/lib/db/schema/gdpr';
import { aiUsageLedger } from '@/lib/db/schema/ai-usage';
import { errorEvents } from '@/lib/db/schema/system';
import { logger } from '@/lib/logger';
import { getStorageBackend } from '@/lib/storage';
import { QUEUE_CONFIGS } from '@/lib/queue';
/** AI usage ledger rows older than this many days are deleted by the
 * `ai-usage-retention` maintenance job (90-day rolling window). */
const AI_USAGE_RETENTION_DAYS = 90;
/** Retention horizon for error_events rows, enforced by the
 * `error-events-retention` maintenance job. Migration 0040 declared the
 * table "pruned at 90 days"; this worker is the implementation. */
const ERROR_EVENTS_RETENTION_DAYS = 90;
export const maintenanceWorker = new Worker(
'maintenance',
@@ -113,12 +118,27 @@ export const maintenanceWorker = new Worker(
);
break;
}
case 'error-events-retention': {
// Honor the contract from migration 0040: error_events older than
// ERROR_EVENTS_RETENTION_DAYS get dropped. Otherwise the table
// grows unbounded and the admin error log becomes unusable.
const cutoff = new Date(Date.now() - ERROR_EVENTS_RETENTION_DAYS * 24 * 60 * 60 * 1000);
const result = await db
.delete(errorEvents)
.where(lt(errorEvents.createdAt, cutoff))
.returning({ requestId: errorEvents.requestId });
logger.info(
{ deleted: result.length, retentionDays: ERROR_EVENTS_RETENTION_DAYS },
'Error events retention sweep complete',
);
break;
}
default:
logger.warn({ jobName: job.name }, 'Unknown maintenance job');
}
},
{
connection: { url: process.env.REDIS_URL! } as ConnectionOptions,
connection: { url: env.REDIS_URL } as ConnectionOptions,
concurrency: QUEUE_CONFIGS.maintenance.concurrency,
},
);

View File

@@ -1,4 +1,5 @@
import { Worker, type Job } from 'bullmq';
import { env } from '@/lib/env';
import type { ConnectionOptions } from 'bullmq';
import { logger } from '@/lib/logger';
@@ -79,7 +80,7 @@ export const notificationsWorker = new Worker(
}
},
{
connection: { url: process.env.REDIS_URL! } as ConnectionOptions,
connection: { url: env.REDIS_URL } as ConnectionOptions,
concurrency: QUEUE_CONFIGS.notifications.concurrency,
},
);

View File

@@ -1,4 +1,5 @@
import { Worker, type Job } from 'bullmq';
import { env } from '@/lib/env';
import type { ConnectionOptions } from 'bullmq';
import { logger } from '@/lib/logger';
@@ -21,10 +22,7 @@ export const reportsWorker = new Worker(
.select()
.from(scheduledReports)
.where(
and(
eq(scheduledReports.isActive, true),
lte(scheduledReports.nextRunAt, new Date()),
),
and(eq(scheduledReports.isActive, true), lte(scheduledReports.nextRunAt, new Date())),
);
for (const report of dueReports) {
@@ -64,7 +62,7 @@ export const reportsWorker = new Worker(
}
},
{
connection: { url: process.env.REDIS_URL! } as ConnectionOptions,
connection: { url: env.REDIS_URL } as ConnectionOptions,
concurrency: QUEUE_CONFIGS.reports.concurrency,
},
);

View File

@@ -1,4 +1,5 @@
import { Worker, type Job } from 'bullmq';
import { env } from '@/lib/env';
import { createHmac } from 'node:crypto';
import { lookup } from 'node:dns/promises';
@@ -277,7 +278,7 @@ export const webhooksWorker = new Worker(
}
},
{
connection: { url: process.env.REDIS_URL! } as ConnectionOptions,
connection: { url: env.REDIS_URL } as ConnectionOptions,
concurrency: QUEUE_CONFIGS.webhooks.concurrency,
},
);

View File

@@ -1,8 +1,9 @@
import Redis from 'ioredis';
import { env } from '@/lib/env';
import { logger } from '@/lib/logger';
// REDIS_URL comes from the Zod-validated env module, so boot fails loudly
// when it is unset rather than this client connecting to `undefined`.
// (The old `process.env.REDIS_URL!` line was duplicated alongside the
// replacement — a redeclaration error; keep only the validated one.)
const redisUrl = env.REDIS_URL;
export const redis = new Redis(redisUrl, {
maxRetriesPerRequest: 3,