164 lines
5.3 KiB
Python
164 lines
5.3 KiB
Python
"""Telemetry endpoint for receiving metrics from orchestrators.
|
|
|
|
This endpoint receives aggregated telemetry from orchestrator instances.
|
|
It validates authentication, stores metrics, and updates instance state.
|
|
"""
|
|
|
|
import hashlib
|
|
import logging
|
|
import secrets
|
|
from uuid import UUID
|
|
|
|
from fastapi import APIRouter, Header, HTTPException, status
|
|
from sqlalchemy import select
|
|
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
|
from sqlalchemy.exc import IntegrityError
|
|
|
|
from app.db import AsyncSessionDep
|
|
from app.models.base import utc_now
|
|
from app.models.instance import Instance
|
|
from app.models.telemetry_sample import TelemetrySample
|
|
from app.schemas.telemetry import TelemetryPayload, TelemetryResponse
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(prefix="/api/v1/instances", tags=["Telemetry"])
|
|
|
|
|
|
@router.post("/{instance_id}/telemetry", response_model=TelemetryResponse)
async def receive_telemetry(
    instance_id: UUID,
    payload: TelemetryPayload,
    db: AsyncSessionDep,
    hub_api_key: str = Header(..., alias="X-Hub-Api-Key"),
) -> TelemetryResponse:
    """
    Receive telemetry from an orchestrator instance.

    Authentication:
    - Requires valid X-Hub-Api-Key header matching the instance
      (the SHA-256 hex digest of the header is compared, in constant time,
      against the instance's stored hub_api_key_hash)

    Validation:
    - instance_id in path must match payload.instance_id (prevents spoofing)
    - Instance must exist and must not be suspended, revoked, or expired
    - Schema uses extra="forbid" to reject unknown fields

    De-duplication:
    - Uses (instance_id, window_start) unique constraint
    - Duplicate submissions are silently accepted (idempotent)

    Side effects:
    - Inserts one TelemetrySample row (unless it is a duplicate)
    - Sets instance.last_seen_at to the current time and commits

    HTTP Semantics:
    - 200 OK: Telemetry accepted
    - 400 Bad Request: instance_id mismatch between path and payload
    - 401 Unauthorized: Invalid hub_api_key, or instance has no key configured
    - 403 Forbidden: Instance suspended or revoked, or license expired
    - 404 Not Found: Instance not found

    NOTE(review): a missing X-Hub-Api-Key header or a malformed payload is
    rejected by FastAPI's request validation before this function runs; the
    status code for that case depends on the application's validation error
    handler (FastAPI's default is 422) - confirm against the app setup.
    """
    # Validate instance_id in path matches payload
    if instance_id != payload.instance_id:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail={
                "error": "instance_id mismatch between path and payload",
                "code": "instance_id_mismatch",
            },
        )

    # Find instance by UUID (id column, not instance_id string)
    result = await db.execute(select(Instance).where(Instance.id == instance_id))
    instance = result.scalar_one_or_none()

    if instance is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail={"error": "Instance not found", "code": "instance_not_found"},
        )

    # An instance without a stored key hash can never authenticate.
    if not instance.hub_api_key_hash:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail={
                "error": "Instance has no hub_api_key configured",
                "code": "no_hub_key",
            },
        )

    # Validate hub_api_key using constant-time comparison of the hex digests
    # so the check does not leak how many leading characters matched.
    provided_hash = hashlib.sha256(hub_api_key.encode()).hexdigest()
    if not secrets.compare_digest(provided_hash, instance.hub_api_key_hash):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail={"error": "Invalid hub_api_key", "code": "invalid_hub_key"},
        )

    # Check instance status
    if instance.license_status == "suspended":
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={"error": "Instance suspended", "code": "suspended"},
        )

    if instance.license_status == "revoked":
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={"error": "Instance revoked", "code": "revoked"},
        )

    # Check license expiry (an unset expiry is treated as "never expires")
    now = utc_now()
    if instance.license_expires_at and instance.license_expires_at < now:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail={"error": "License expired", "code": "expired"},
        )

    # Store telemetry sample
    # Use PostgreSQL upsert to handle duplicates gracefully
    telemetry_data = {
        "instance_id": instance_id,
        "window_start": payload.window_start,
        "window_end": payload.window_end,
        "uptime_seconds": payload.uptime_seconds,
        "metrics": payload.metrics.model_dump(),
    }

    # PostgreSQL INSERT ... ON CONFLICT DO NOTHING
    # If duplicate (instance_id, window_start), silently ignore
    stmt = (
        pg_insert(TelemetrySample)
        .values(**telemetry_data)
        .on_conflict_do_nothing(constraint="uq_telemetry_instance_window")
    )

    try:
        # Run the insert inside a SAVEPOINT. If it raises IntegrityError, only
        # the savepoint is rolled back; without it the enclosing transaction
        # would be left in a failed state and the last_seen_at update and
        # commit below could not succeed.
        async with db.begin_nested():
            await db.execute(stmt)
    except IntegrityError:
        # Best-effort fallback: ON CONFLICT DO NOTHING should already absorb
        # duplicates, so this is not expected in production.
        logger.warning(
            "telemetry_duplicate_submission",
            extra={
                "instance_id": str(instance_id),
                "window_start": payload.window_start.isoformat(),
            },
        )

    # Update instance last_seen_at (also done for duplicate submissions)
    instance.last_seen_at = now

    await db.commit()

    logger.info(
        "telemetry_received",
        extra={
            "instance_id": str(instance_id),
            "window_start": payload.window_start.isoformat(),
            "window_end": payload.window_end.isoformat(),
            "uptime_seconds": payload.uptime_seconds,
        },
    )

    return TelemetryResponse(
        received=True,
        next_interval_seconds=60,
        message=None,
    )
|