"""Telemetry endpoint for receiving metrics from orchestrators.

This endpoint receives aggregated telemetry from orchestrator instances.
It validates authentication, stores metrics, and updates instance state.
"""

import hashlib
import logging
import secrets
from uuid import UUID

from fastapi import APIRouter, Header, HTTPException, status
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.exc import IntegrityError

from app.db import AsyncSessionDep
from app.models.base import utc_now
from app.models.instance import Instance
from app.models.telemetry_sample import TelemetrySample
from app.schemas.telemetry import TelemetryPayload, TelemetryResponse

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/instances", tags=["Telemetry"])


@router.post("/{instance_id}/telemetry", response_model=TelemetryResponse)
async def receive_telemetry(
    instance_id: UUID,
    payload: TelemetryPayload,
    db: AsyncSessionDep,
    hub_api_key: str = Header(..., alias="X-Hub-Api-Key"),
) -> TelemetryResponse:
    """
    Receive telemetry from an orchestrator instance.

    Authentication:
    - Requires an X-Hub-Api-Key header whose SHA-256 hex digest matches the
      instance's stored hub_api_key_hash

    Validation:
    - instance_id in path must match payload.instance_id (prevents spoofing)
    - Instance must exist, must not be suspended or revoked, and its license
      must not be expired
    - Schema uses extra="forbid" to reject unknown fields

    De-duplication:
    - Uses (instance_id, window_start) unique constraint
    - Duplicate submissions are silently accepted (idempotent)

    HTTP Semantics:
    - 200 OK: Telemetry accepted (including duplicate submissions)
    - 400 Bad Request: instance_id mismatch between path and payload
    - 401 Unauthorized: Invalid hub_api_key, or the instance has no key
      configured
    - 403 Forbidden: Instance suspended or revoked, or license expired
    - 404 Not Found: Instance not found
    - A missing X-Hub-Api-Key header or a malformed payload is rejected by
      FastAPI request validation before this handler runs (422 by default,
      unless the application overrides the validation error handler)

    Args:
        instance_id: Instance UUID taken from the URL path.
        payload: Telemetry window submitted by the orchestrator.
        db: Async database session for this request.
        hub_api_key: Raw API key from the X-Hub-Api-Key header.

    Returns:
        TelemetryResponse with received=True and the interval (in seconds)
        after which the next submission is expected.

    Raises:
        HTTPException: With one of the 400/401/403/404 codes listed above.
    """
    # Validate instance_id in path matches payload
    if instance_id != payload.instance_id:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail={
                "error": "instance_id mismatch between path and payload",
                "code": "instance_id_mismatch",
            },
        )

    # Find instance by UUID (id column, not instance_id string)
    # NOTE(review): the 404 below is returned before the key is checked, so an
    # unauthenticated caller can tell whether an instance UUID exists —
    # confirm this is acceptable.
    result = await db.execute(select(Instance).where(Instance.id == instance_id))
    instance = result.scalar_one_or_none()

    if instance is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail={"error": "Instance not found", "code": "instance_not_found"},
        )

    # Validate hub_api_key using constant-time comparison
    if not instance.hub_api_key_hash:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail={
                "error": "Instance has no hub_api_key configured",
                "code": "no_hub_key",
            },
        )

    provided_hash = hashlib.sha256(hub_api_key.encode()).hexdigest()
    if not secrets.compare_digest(provided_hash, instance.hub_api_key_hash):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail={"error": "Invalid hub_api_key", "code": "invalid_hub_key"},
        )

    # Check instance status
    if instance.license_status == "suspended":
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={"error": "Instance suspended", "code": "suspended"},
        )

    if instance.license_status == "revoked":
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={"error": "Instance revoked", "code": "revoked"},
        )

    # Check license expiry
    now = utc_now()
    if instance.license_expires_at and instance.license_expires_at < now:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={"error": "License expired", "code": "expired"},
        )

    # Store telemetry sample
    # Use PostgreSQL upsert to handle duplicates gracefully
    telemetry_data = {
        "instance_id": instance_id,
        "window_start": payload.window_start,
        "window_end": payload.window_end,
        "uptime_seconds": payload.uptime_seconds,
        "metrics": payload.metrics.model_dump(),
    }

    # PostgreSQL INSERT ... ON CONFLICT DO NOTHING
    # If duplicate (instance_id, window_start), silently ignore
    stmt = (
        pg_insert(TelemetrySample)
        .values(**telemetry_data)
        .on_conflict_do_nothing(constraint="uq_telemetry_instance_window")
    )

    try:
        # Run the insert inside a SAVEPOINT. On PostgreSQL a failed statement
        # aborts the enclosing transaction, which would make the
        # last_seen_at update and commit below fail as well; with a SAVEPOINT
        # only the insert is rolled back and the outer transaction stays
        # usable.
        async with db.begin_nested():
            await db.execute(stmt)
    except IntegrityError:
        # Duplicates on uq_telemetry_instance_window are already absorbed by
        # ON CONFLICT DO NOTHING, so reaching this branch means a different
        # integrity constraint rejected the row. The request is still
        # accepted (best-effort), matching the original intent.
        logger.warning(
            "telemetry_duplicate_submission",
            extra={
                "instance_id": str(instance_id),
                "window_start": payload.window_start.isoformat(),
            },
        )

    # Update instance last_seen_at
    instance.last_seen_at = now

    await db.commit()

    logger.info(
        "telemetry_received",
        extra={
            "instance_id": str(instance_id),
            "window_start": payload.window_start.isoformat(),
            "window_end": payload.window_end.isoformat(),
            "uptime_seconds": payload.uptime_seconds,
        },
    )

    return TelemetryResponse(
        received=True,
        next_interval_seconds=60,
        message=None,
    )