fix(ops): /health DB+Redis checks, validated env.REDIS_URL across workers, error_events 90d retention
Three audit-pass-#3 findings, all in the "wakes you at 3am" category.
- /api/public/health now runs DB SELECT 1 + Redis PING in parallel and
returns 503 + a degraded payload when either fails. Anonymous probes
(no X-Intake-Secret) still get a flat {status:'ok'} so generic uptime
monitors keep working; authenticated probes see the dep results.
- All worker entrypoints (ai, bulk, documents, email, export, import,
maintenance, notifications, reports, webhooks) and src/lib/redis.ts
now use env.REDIS_URL (Zod-validated at boot) instead of
process.env.REDIS_URL!. Previously a missing REDIS_URL let the app start
silently and fail only at first job pickup.
- maintenance worker gains an `error-events-retention` case that
delete()s rows older than 90 days from error_events. scheduler.ts
registers it at 06:00 daily. Closes the contract from migration
0040 which declared the table "pruned at 90 days" but had no
implementation.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,28 +1,56 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
import { timingSafeEqual } from 'node:crypto';
|
||||
import { sql } from 'drizzle-orm';
|
||||
|
||||
import { db } from '@/lib/db';
|
||||
import { redis } from '@/lib/redis';
|
||||
import { env } from '@/lib/env';
|
||||
import { logger } from '@/lib/logger';
|
||||
|
||||
/**
|
||||
* GET /api/public/health
|
||||
*
|
||||
* Health probe used by the marketing-website server on startup to verify
|
||||
* it's pointed at a CRM matching its own deployment env (plan §14.8
|
||||
* critical: prevent staging-website-talking-to-prod-CRM).
|
||||
* Liveness + readiness probe. Two response shapes:
|
||||
*
|
||||
* Auditor-K §41 flagged that the previous response disclosed `NODE_ENV`
|
||||
* and `APP_URL` to anonymous internet — mirrors the website's own intake
|
||||
* secret gate so we don't leak deployment fingerprints. When
|
||||
* `WEBSITE_INTAKE_SECRET` is set and the caller presents the matching
|
||||
* `X-Intake-Secret` header we return the full payload; otherwise return
|
||||
* a minimal `{status:'ok'}` so generic uptime monitors still get a 200.
|
||||
* 1. **Anonymous (no header):** minimal `{status:'ok'}` so uptime
|
||||
* monitors can poll without leaking deployment fingerprints.
|
||||
*
|
||||
* 2. **Authenticated (`X-Intake-Secret`, timing-safe compared):** full
|
||||
* payload including env + dependency check results. This is what
|
||||
* the marketing site uses on startup AND what k8s readiness
|
||||
* probes should hit, because it returns 503 on hard dep failures.
|
||||
*
|
||||
* The dep checks (DB SELECT 1, Redis PING) run on every request — they
|
||||
* are <5ms each. If either fails, the response is 503 so a load balancer
|
||||
* stops routing to this instance.
|
||||
*/
|
||||
export function GET(req: NextRequest): Response {
|
||||
|
||||
/**
 * Result of a single dependency probe, discriminated on `ok`:
 * a success carries the measured `latencyMs`, a failure carries an
 * `error` description string.
 */
type HealthCheck = { ok: true; latencyMs: number } | { ok: false; error: string };
|
||||
|
||||
/**
 * Probes the database by executing `SELECT 1` through `db`.
 *
 * @returns `{ ok: true, latencyMs }` with the elapsed milliseconds (measured
 *   via `Date.now()`) when the query resolves, or `{ ok: false, error }`
 *   carrying the thrown error's message — `'unknown'` when the thrown value
 *   is not an `Error`. Failures are reported in the result, not rethrown.
 */
async function checkDb(): Promise<HealthCheck> {
  const startedAt = Date.now();
  try {
    await db.execute(sql`SELECT 1`);
  } catch (cause) {
    // Fold the failure into the result so the caller can report it.
    const error = cause instanceof Error ? cause.message : 'unknown';
    return { ok: false, error };
  }
  return { ok: true, latencyMs: Date.now() - startedAt };
}
|
||||
|
||||
/**
 * Probes Redis by issuing a PING through `redis`.
 *
 * @returns `{ ok: true, latencyMs }` with the elapsed milliseconds (measured
 *   via `Date.now()`) when the reply is exactly `'PONG'`;
 *   `{ ok: false, error: 'unexpected: <reply>' }` for any other reply; or
 *   `{ ok: false, error }` carrying the thrown error's message — `'unknown'`
 *   when the thrown value is not an `Error`. Failures are reported in the
 *   result, not rethrown.
 */
async function checkRedis(): Promise<HealthCheck> {
  const startedAt = Date.now();
  try {
    const reply = await redis.ping();
    // Any reply other than the literal PONG counts as a failed probe.
    return reply === 'PONG'
      ? { ok: true, latencyMs: Date.now() - startedAt }
      : { ok: false, error: `unexpected: ${reply}` };
  } catch (cause) {
    const error = cause instanceof Error ? cause.message : 'unknown';
    return { ok: false, error };
  }
}
|
||||
|
||||
export async function GET(req: NextRequest): Promise<Response> {
|
||||
const expected = env.WEBSITE_INTAKE_SECRET;
|
||||
const provided = req.headers.get('x-intake-secret');
|
||||
// Use timingSafeEqual rather than a `===` comparison — string equality
|
||||
// is not constant-time and lets a remote attacker enumerate the secret
|
||||
// byte-by-byte via response-time differences.
|
||||
const matched =
|
||||
!!expected &&
|
||||
!!provided &&
|
||||
@@ -35,6 +63,8 @@ export function GET(req: NextRequest): Response {
|
||||
}
|
||||
})();
|
||||
|
||||
// Anonymous probe: no dep checks, never 503. Uptime monitors that
|
||||
// can't carry the secret keep working.
|
||||
if (!matched) {
|
||||
return NextResponse.json(
|
||||
{ status: 'ok', timestamp: new Date().toISOString() },
|
||||
@@ -42,13 +72,23 @@ export function GET(req: NextRequest): Response {
|
||||
);
|
||||
}
|
||||
|
||||
// Authenticated probe: run dep checks in parallel and surface a 503
|
||||
// when anything required is down.
|
||||
const [dbCheck, redisCheck] = await Promise.all([checkDb(), checkRedis()]);
|
||||
const allOk = dbCheck.ok && redisCheck.ok;
|
||||
|
||||
if (!allOk) {
|
||||
logger.warn({ db: dbCheck, redis: redisCheck }, 'Health probe found an unhealthy dependency');
|
||||
}
|
||||
|
||||
return NextResponse.json(
|
||||
{
|
||||
status: 'ok',
|
||||
status: allOk ? 'ok' : 'degraded',
|
||||
env: env.NODE_ENV,
|
||||
appUrl: env.APP_URL,
|
||||
timestamp: new Date().toISOString(),
|
||||
checks: { db: dbCheck, redis: redisCheck },
|
||||
},
|
||||
{ headers: { 'cache-control': 'no-store' } },
|
||||
{ status: allOk ? 200 : 503, headers: { 'cache-control': 'no-store' } },
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user