# letsbe-hub/app/routes/telemetry.py
# (164 lines, 5.4 KiB, Python)

"""Telemetry endpoint for receiving metrics from orchestrators.
This endpoint receives aggregated telemetry from orchestrator instances.
It authenticates the caller by hub API key, checks the instance's license
state, stores the telemetry sample, and updates the instance's last-seen
timestamp.
"""
import hashlib
import logging
import secrets
from fastapi import APIRouter, Header, HTTPException, status
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.exc import IntegrityError
from app.db import AsyncSessionDep
from app.models.base import utc_now
from app.models.instance import Instance
from app.models.telemetry_sample import TelemetrySample
from app.schemas.telemetry import TelemetryPayload, TelemetryResponse
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router for the telemetry routes; every path below is served under
# the /api/v1/instances prefix and tagged "Telemetry".
router = APIRouter(prefix="/api/v1/instances", tags=["Telemetry"])
@router.post("/{instance_id}/telemetry", response_model=TelemetryResponse)
async def receive_telemetry(
    instance_id: str,
    payload: TelemetryPayload,
    db: AsyncSessionDep,
    hub_api_key: str = Header(..., alias="X-Hub-Api-Key"),
) -> TelemetryResponse:
    """
    Receive telemetry from an orchestrator instance.

    Authentication:
    - Requires an X-Hub-Api-Key header whose SHA-256 hex digest matches the
      hash stored on the instance (compared in constant time)

    Validation:
    - instance_id in path must match payload.instance_id (prevents spoofing)
    - Instance must exist, must not be suspended or revoked, and its license
      must not be expired
    - Unknown payload fields are expected to be rejected by the
      TelemetryPayload schema (extra="forbid"; declared outside this module)

    De-duplication:
    - Uses the (instance_id, window_start) unique constraint
      "uq_telemetry_instance_window" with INSERT ... ON CONFLICT DO NOTHING
    - Duplicate submissions are silently accepted (idempotent)

    HTTP Semantics:
    - 200 OK: Telemetry accepted
    - 400 Bad Request: instance_id mismatch between path and payload
    - 401 Unauthorized: Invalid hub_api_key, or instance has no key configured
    - 403 Forbidden: Instance suspended or revoked, or license expired
    - 404 Not Found: Instance not found

    Note: a missing header or malformed payload is rejected by request
    validation before this function runs; the status code for that case is
    decided by the framework / application handlers, not by this function.

    Args:
        instance_id: Instance identifier string taken from the URL path.
        payload: Telemetry body (window bounds, uptime and metrics).
        db: Async database session for this request.
        hub_api_key: Raw API key from the X-Hub-Api-Key header.

    Returns:
        TelemetryResponse with received=True and the next reporting interval.

    Raises:
        HTTPException: With one of the 400/401/403/404 statuses listed above.
    """
    # Validate instance_id in path matches payload
    if instance_id != payload.instance_id:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail={
                "error": "instance_id mismatch between path and payload",
                "code": "instance_id_mismatch",
            },
        )

    # Find instance by instance_id string (e.g., "letsbe-orchestrator")
    result = await db.execute(
        select(Instance).where(Instance.instance_id == instance_id)
    )
    instance = result.scalar_one_or_none()
    if instance is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail={"error": "Instance not found", "code": "instance_not_found"},
        )

    # An instance without a stored key hash can never authenticate.
    if not instance.hub_api_key_hash:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail={
                "error": "Instance has no hub_api_key configured",
                "code": "no_hub_key",
            },
        )

    # Only the hash is stored, so hash the provided key and compare the two
    # hex digests in constant time to avoid leaking information via timing.
    provided_hash = hashlib.sha256(hub_api_key.encode()).hexdigest()
    if not secrets.compare_digest(provided_hash, instance.hub_api_key_hash):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail={"error": "Invalid hub_api_key", "code": "invalid_hub_key"},
        )

    # Check instance status
    if instance.license_status == "suspended":
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={"error": "Instance suspended", "code": "suspended"},
        )
    if instance.license_status == "revoked":
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={"error": "Instance revoked", "code": "revoked"},
        )

    # Check license expiry. `now` is reused below for last_seen_at so both
    # refer to the same instant.
    now = utc_now()
    if instance.license_expires_at and instance.license_expires_at < now:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={"error": "License expired", "code": "expired"},
        )

    # Store telemetry sample.
    # Note: instance_id in DB is the UUID (instance.id), not the string instance_id
    telemetry_data = {
        "instance_id": instance.id,
        "window_start": payload.window_start,
        "window_end": payload.window_end,
        "uptime_seconds": payload.uptime_seconds,
        "metrics": payload.metrics.model_dump(),
    }

    # PostgreSQL INSERT ... ON CONFLICT DO NOTHING:
    # a duplicate (instance_id, window_start) is silently ignored.
    stmt = (
        pg_insert(TelemetrySample)
        .values(**telemetry_data)
        .on_conflict_do_nothing(constraint="uq_telemetry_instance_window")
    )

    try:
        # Run the insert inside a SAVEPOINT. PostgreSQL aborts the whole
        # transaction after a failed statement, so without the savepoint a
        # swallowed IntegrityError would still make the commit below fail.
        # Leaving the block with an exception rolls back only the savepoint.
        async with db.begin_nested():
            await db.execute(stmt)
    except IntegrityError:
        # Best-effort fallback kept from the original behavior: the request is
        # still acknowledged. NOTE(review): ON CONFLICT already covers the
        # duplicate-window case, so an IntegrityError here may be a different
        # constraint violation - confirm whether it should stay swallowed.
        logger.warning(
            "telemetry_duplicate_submission",
            extra={
                "instance_id": instance_id,
                "window_start": payload.window_start.isoformat(),
            },
        )

    # Update instance last_seen_at
    instance.last_seen_at = now
    await db.commit()

    logger.info(
        "telemetry_received",
        extra={
            "instance_id": instance_id,
            "window_start": payload.window_start.isoformat(),
            "window_end": payload.window_end.isoformat(),
            "uptime_seconds": payload.uptime_seconds,
        },
    )

    return TelemetryResponse(
        received=True,
        next_interval_seconds=60,
        message=None,
    )