docs(ops): backup/restore + email deliverability runbooks
Two new runbooks under docs/runbooks/ plus the automation scripts the
backup runbook references. Both are written so an operator who has only
the off-site backup credentials and the runbook can recover the system
unaided.
Backup/restore (Phase 4a):
- docs/runbooks/backup-and-restore.md — covers what gets backed up
  (Postgres / MinIO / .env+ENCRYPTION_KEY), schedule (hourly DB +
  hourly MinIO mirror, 7-day hourly + 30-day daily retention),
  cold-restore procedure with row-count verification, weekly drill
  (one possible cron layout for this schedule is sketched after this list)
- scripts/backup/pg-backup.sh — pg_dump → gzip → optional GPG → mc
upload, fails loud
- scripts/backup/minio-mirror.sh — incremental mc mirror, no --remove
flag so accidental deletes on the live bucket can't cascade
- scripts/backup/restore.sh — interactive prod restore + --drill mode
that runs against a sandbox DB and diffs row counts
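One plausible shape for the schedule above, as a cron.d sketch — paths,
minute offsets, and the MAILTO address are illustrative, not taken from
the repo:

    # /etc/cron.d/crm-backups: hourly dump + mirror, weekly restore drill.
    # MAILTO makes cron surface the scripts' fail-loud non-zero exits.
    MAILTO=ops@example.com
    0  * * * *  crm  /opt/crm/scripts/backup/pg-backup.sh
    5  * * * *  crm  /opt/crm/scripts/backup/minio-mirror.sh
    30 3 * * 1  crm  /opt/crm/scripts/backup/restore.sh --drill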
Email deliverability (Phase 4b):
- docs/runbooks/email-deliverability.md — what the CRM sends, DNS
  records (SPF/DKIM/DMARC/MX), per-port override implications,
  diagnosis flow ("didn't arrive" → 4-step checklist starting with
  EMAIL_REDIRECT_TO), provider migration plan, realapi suite as the
  end-to-end probe (a dig spot-check for the DNS records is sketched
  after this list)
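A quick way to eyeball those DNS records with dig; the domain and the
DKIM selector below are placeholders (the real selector comes from the
sending provider):

    # SPF is a TXT record on the sending domain
    dig +short TXT crm.example.com
    # DKIM lives under <selector>._domainkey; selector is provider-specific
    dig +short TXT default._domainkey.crm.example.com
    # DMARC policy
    dig +short TXT _dmarc.crm.example.com
    # MX: where inbound mail (and bounces) land
    dig +short MX crm.example.com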
Tests still 778/778 vitest, tsc/lint clean — these phases are docs +
shell scripts, no code changes.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

scripts/backup/minio-mirror.sh (new file, 51 lines)
#!/usr/bin/env bash
# Hourly MinIO mirror for Port Nimara CRM.
#
# Mirrors the live `MINIO_BUCKET` to the backup destination. `mc mirror`
# is incremental — only changed objects transfer — so this is cheap.
#
# Versioning on the destination bucket is what protects against object
# deletes / overwrites; we don't try to roll our own.

set -euo pipefail

: "${MINIO_ENDPOINT:?MINIO_ENDPOINT not set}"
: "${MINIO_ACCESS_KEY:?MINIO_ACCESS_KEY not set}"
: "${MINIO_SECRET_KEY:?MINIO_SECRET_KEY not set}"
: "${MINIO_BUCKET:?MINIO_BUCKET not set}"
: "${BACKUP_S3_BUCKET:?BACKUP_S3_BUCKET not set}"
: "${BACKUP_S3_ENDPOINT:?BACKUP_S3_ENDPOINT not set}"
: "${BACKUP_S3_ACCESS_KEY:?BACKUP_S3_ACCESS_KEY not set}"
: "${BACKUP_S3_SECRET_KEY:?BACKUP_S3_SECRET_KEY not set}"

# Default scheme: live MinIO is plain HTTP unless MINIO_USE_SSL=true.
if [[ "${MINIO_USE_SSL:-false}" == "true" ]]; then
  LIVE_URL="https://${MINIO_ENDPOINT}:${MINIO_PORT:-443}"
else
  LIVE_URL="http://${MINIO_ENDPOINT}:${MINIO_PORT:-9000}"
fi

# Per-run aliases ($$ is this shell's PID) so concurrent runs can't
# clobber each other's mc config; the trap tears them down on any exit.
LIVE_ALIAS="live-$$"
BACKUP_ALIAS="bk-$$"
trap 'mc alias remove "$LIVE_ALIAS" 2>/dev/null || true; mc alias remove "$BACKUP_ALIAS" 2>/dev/null || true' EXIT

mc alias set "$LIVE_ALIAS" "$LIVE_URL" \
  "$MINIO_ACCESS_KEY" "$MINIO_SECRET_KEY" --api S3v4 >/dev/null
mc alias set "$BACKUP_ALIAS" "$BACKUP_S3_ENDPOINT" \
  "$BACKUP_S3_ACCESS_KEY" "$BACKUP_S3_SECRET_KEY" --api S3v4 >/dev/null

SOURCE="${LIVE_ALIAS}/${MINIO_BUCKET}/"
DEST="${BACKUP_ALIAS}/${BACKUP_S3_BUCKET}/minio/"

echo "[$(date -u +%FT%TZ)] Mirroring $SOURCE → $DEST"

# `--remove` would delete objects from the destination that no longer
# exist in source — we DON'T pass it, because that would let an
# accidental delete on the live bucket cascade into permanent loss on
# the backup side. Versioning + lifecycle handle stale-object cleanup.
mc mirror --quiet --overwrite "$SOURCE" "$DEST"

# Print a size / object-count summary for the operator.
echo "[$(date -u +%FT%TZ)] Done. Destination summary:"
mc du "$DEST"
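The comments above lean on destination-side versioning and lifecycle
rules without showing them. A minimal one-time setup sketch, assuming a
MinIO-compatible backup target and a recent mc; the 30-day window is
illustrative and the exact ilm flag names vary across mc releases, so
check `mc ilm rule add --help` before relying on this:

    mc alias set bk "$BACKUP_S3_ENDPOINT" "$BACKUP_S3_ACCESS_KEY" "$BACKUP_S3_SECRET_KEY"
    # Keep every overwritten/deleted object around as a noncurrent version...
    mc version enable "bk/${BACKUP_S3_BUCKET}"
    # ...and let a lifecycle rule expire those noncurrent versions later.
    mc ilm rule add --noncurrent-expire-days 30 "bk/${BACKUP_S3_BUCKET}"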

scripts/backup/pg-backup.sh (new file, 63 lines)
#!/usr/bin/env bash
# Hourly PostgreSQL backup for Port Nimara CRM.
#
# Reads DATABASE_URL and BACKUP_S3_* from the environment. Dumps to a
# tmpfile, gzips, optionally GPG-encrypts to BACKUP_GPG_RECIPIENT, and
# uploads to s3://${BACKUP_S3_BUCKET}/pg/<hostname>/<UTC-date>/<hour>.dump.gz[.gpg].
#
# Designed to fail loud: any non-zero exit halts the script and propagates
# to the cron / CI runner so the operator sees the failure.

set -euo pipefail

: "${DATABASE_URL:?DATABASE_URL not set}"
: "${BACKUP_S3_BUCKET:?BACKUP_S3_BUCKET not set}"
: "${BACKUP_S3_ENDPOINT:?BACKUP_S3_ENDPOINT not set}"
: "${BACKUP_S3_ACCESS_KEY:?BACKUP_S3_ACCESS_KEY not set}"
: "${BACKUP_S3_SECRET_KEY:?BACKUP_S3_SECRET_KEY not set}"

HOST="${BACKUP_HOST_OVERRIDE:-$(hostname -s)}"
DATE_UTC="$(date -u +%Y-%m-%d)"
HOUR_UTC="$(date -u +%H)"
WORKDIR="$(mktemp -d)"
trap 'rm -rf "$WORKDIR"' EXIT

DUMP_FILE="$WORKDIR/${HOUR_UTC}.dump"
ARCHIVE_NAME="${HOUR_UTC}.dump.gz"

# Deliberately don't echo DATABASE_URL itself: it normally embeds
# credentials, and this output lands in cron logs.
echo "[$(date -u +%FT%TZ)] Dumping database → $DUMP_FILE"
pg_dump --format=custom --compress=9 --no-owner --no-privileges \
  --file="$DUMP_FILE" "$DATABASE_URL"

# pg_dump's `custom` format is already compressed, but we wrap in gzip so
# every archive has the same .dump.gz shape regardless of the dump format
# on disk.
gzip -n "$DUMP_FILE"
GZ_FILE="${DUMP_FILE}.gz"

# Optional GPG layer. Only encrypt if the recipient is configured.
if [[ -n "${BACKUP_GPG_RECIPIENT:-}" ]]; then
  echo "[$(date -u +%FT%TZ)] Encrypting for $BACKUP_GPG_RECIPIENT"
  gpg --batch --yes --trust-model always \
    --recipient "$BACKUP_GPG_RECIPIENT" \
    --encrypt --output "${GZ_FILE}.gpg" "$GZ_FILE"
  rm "$GZ_FILE"
  GZ_FILE="${GZ_FILE}.gpg"
  ARCHIVE_NAME="${ARCHIVE_NAME}.gpg"
fi

# Configure mc client for the backup destination ($$-scoped alias,
# removed again after the upload).
MC_ALIAS="bk-$$"
mc alias set "$MC_ALIAS" "$BACKUP_S3_ENDPOINT" \
  "$BACKUP_S3_ACCESS_KEY" "$BACKUP_S3_SECRET_KEY" \
  --api S3v4 >/dev/null

REMOTE_PATH="${MC_ALIAS}/${BACKUP_S3_BUCKET}/pg/${HOST}/${DATE_UTC}/${ARCHIVE_NAME}"
echo "[$(date -u +%FT%TZ)] Uploading → $REMOTE_PATH"
mc cp --quiet "$GZ_FILE" "$REMOTE_PATH"

# Tag with retention metadata so lifecycle rules can decide what to expire.
mc tag set "$REMOTE_PATH" "kind=hourly&host=${HOST}&date=${DATE_UTC}" >/dev/null

mc alias remove "$MC_ALIAS" >/dev/null

echo "[$(date -u +%FT%TZ)] OK ${ARCHIVE_NAME} ($(du -h "$GZ_FILE" | cut -f1))"
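Not something pg-backup.sh does itself, but a cheap way for an operator
to confirm a pulled archive is readable before trusting it. The filename
is illustrative, and `gunzip -k` needs GNU gzip 1.6+:

    gunzip -k 14.dump.gz   # keep the .gz, produce 14.dump alongside it
    # --list prints the dump's table of contents without touching any
    # database, and exits non-zero on a truncated or corrupt archive.
    pg_restore --list 14.dump >/dev/null && echo "archive readable"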

scripts/backup/restore.sh (new file, 121 lines)
#!/usr/bin/env bash
# Cold-restore script for Port Nimara CRM.
#
# Two modes:
#   --drill       Restore to a sandbox DB ($DRILL_DATABASE_URL) and diff
#                 row counts against the live producer. Used by the weekly
#                 cron drill so the runbook stays accurate.
#   (no --drill)  Interactive production restore. Prompts once for explicit
#                 confirmation, then drops and recreates the target database
#                 so no half-state from a prior run survives.
#
# Common args:
#   --snapshot YYYY-MM-DD/HH   Specific dump to restore. Defaults to "latest".

set -euo pipefail

DRILL=0
SNAPSHOT="latest"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --drill)    DRILL=1; shift ;;
    --snapshot) SNAPSHOT="$2"; shift 2 ;;
    *) echo "unknown arg: $1" >&2; exit 2 ;;
  esac
done

: "${BACKUP_S3_BUCKET:?BACKUP_S3_BUCKET not set}"
: "${BACKUP_S3_ENDPOINT:?BACKUP_S3_ENDPOINT not set}"
: "${BACKUP_S3_ACCESS_KEY:?BACKUP_S3_ACCESS_KEY not set}"
: "${BACKUP_S3_SECRET_KEY:?BACKUP_S3_SECRET_KEY not set}"

if [[ "$DRILL" -eq 1 ]]; then
  : "${DRILL_DATABASE_URL:?DRILL_DATABASE_URL not set}"
  TARGET_DB="$DRILL_DATABASE_URL"
  # The row-count diff at the end needs a live DB to compare against;
  # fail here, not mid-drill, if neither URL is set.
  LIVE_DB="${LIVE_DATABASE_URL:-${DATABASE_URL:?LIVE_DATABASE_URL or DATABASE_URL needed for the drill diff}}"
  echo "[drill] target DB = $TARGET_DB"
else
  : "${DATABASE_URL:?DATABASE_URL not set}"
  TARGET_DB="$DATABASE_URL"
  read -rp "About to overwrite $TARGET_DB. Type 'restore' to continue: " confirm
  [[ "$confirm" == "restore" ]] || { echo "aborted"; exit 1; }
fi

HOST="${BACKUP_HOST_OVERRIDE:-$(hostname -s)}"
WORKDIR="$(mktemp -d)"
trap 'rm -rf "$WORKDIR"' EXIT

MC_ALIAS="bk-$$"
mc alias set "$MC_ALIAS" "$BACKUP_S3_ENDPOINT" \
  "$BACKUP_S3_ACCESS_KEY" "$BACKUP_S3_SECRET_KEY" --api S3v4 >/dev/null
# Re-arm the EXIT trap so it also tears down the alias we just created.
trap 'rm -rf "$WORKDIR"; mc alias remove "$MC_ALIAS" 2>/dev/null || true' EXIT

# Resolve the snapshot path.
if [[ "$SNAPSHOT" == "latest" ]]; then
  # Paths sort chronologically (YYYY-MM-DD/HH), so the lexicographically
  # last entry is the newest dump.
  REMOTE=$(mc ls --recursive "${MC_ALIAS}/${BACKUP_S3_BUCKET}/pg/${HOST}/" \
    | awk '{print $NF}' | sort | tail -1)
  if [[ -z "$REMOTE" ]]; then
    echo "no snapshots found under ${BACKUP_S3_BUCKET}/pg/${HOST}/" >&2
    exit 1
  fi
  REMOTE="${MC_ALIAS}/${BACKUP_S3_BUCKET}/pg/${HOST}/${REMOTE}"
else
  REMOTE="${MC_ALIAS}/${BACKUP_S3_BUCKET}/pg/${HOST}/${SNAPSHOT}.dump.gz"
  # If GPG was used, the file lives at .dump.gz.gpg. Try both.
  if ! mc stat "$REMOTE" >/dev/null 2>&1; then
    REMOTE="${REMOTE}.gpg"
  fi
fi

echo "[$(date -u +%FT%TZ)] Pulling $REMOTE"
LOCAL="$WORKDIR/$(basename "$REMOTE")"
mc cp --quiet "$REMOTE" "$LOCAL"

# Decrypt if needed.
if [[ "$LOCAL" == *.gpg ]]; then
  echo "[$(date -u +%FT%TZ)] Decrypting"
  gpg --batch --yes --decrypt --output "${LOCAL%.gpg}" "$LOCAL"
  rm "$LOCAL"
  LOCAL="${LOCAL%.gpg}"
fi

# Decompress.
gunzip "$LOCAL"
LOCAL="${LOCAL%.gz}"

echo "[$(date -u +%FT%TZ)] Restoring into $TARGET_DB"

# Drop & recreate to guarantee no half-state from a prior run. The admin
# connection goes to the `postgres` maintenance DB on the same server.
DB_NAME=$(echo "$TARGET_DB" | sed -E 's|.*/([^?]+).*|\1|')
ADMIN_URL=$(echo "$TARGET_DB" | sed -E "s|/${DB_NAME}|/postgres|")

psql "$ADMIN_URL" -v ON_ERROR_STOP=1 <<SQL
SELECT pg_terminate_backend(pid) FROM pg_stat_activity
  WHERE datname = '${DB_NAME}' AND pid <> pg_backend_pid();
DROP DATABASE IF EXISTS "${DB_NAME}";
CREATE DATABASE "${DB_NAME}";
SQL

pg_restore --no-owner --no-privileges --dbname "$TARGET_DB" "$LOCAL"

# Drill mode: compare row counts vs the live producer for parity. Rows
# written to the live DB after the snapshot was taken show up as deltas;
# the drill surfaces them for operator judgment rather than failing.
if [[ "$DRILL" -eq 1 ]]; then
  echo "[$(date -u +%FT%TZ)] Drill row-count diff (live vs restored):"
  TABLES=$(psql -At "$TARGET_DB" -c \
    "SELECT tablename FROM pg_tables WHERE schemaname='public' ORDER BY tablename;")
  diff_count=0
  while IFS= read -r tbl; do
    [[ -z "$tbl" ]] && continue
    live=$(psql -At "$LIVE_DB" -c "SELECT count(*) FROM \"$tbl\";")
    restored=$(psql -At "$TARGET_DB" -c "SELECT count(*) FROM \"$tbl\";")
    delta=$((live - restored))
    if [[ "$delta" -ne 0 ]]; then
      echo "  ⚠ $tbl: live=$live restored=$restored delta=$delta"
      diff_count=$((diff_count + 1))
    fi
  done <<< "$TABLES"
  if [[ "$diff_count" -eq 0 ]]; then
    echo "  ✓ row counts match across all tables"
  fi
fi

echo "[$(date -u +%FT%TZ)] Restore complete."
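Typical invocations, matching the arg parser above (the snapshot value
is illustrative):

    scripts/backup/restore.sh --drill                    # weekly sandbox drill
    scripts/backup/restore.sh --snapshot 2025-01-15/14   # restore a specific hourly dump
    scripts/backup/restore.sh                            # interactive prod restore of "latest"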