feat: Initial Hub implementation

Complete LetsBe Hub service for license management and telemetry:

- Client and Instance CRUD APIs
- License key generation and validation (lb_inst_ format)
- Hub API key generation (hk_ format) for telemetry auth
- Instance activation endpoint
- Telemetry collection with privacy-first redactor
- Key rotation and suspend/reactivate functionality
- Alembic migrations for PostgreSQL
- Docker Compose deployment ready
- Comprehensive test suite

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-22 14:09:32 +01:00
commit adc02e176b
39 changed files with 2968 additions and 0 deletions

3
app/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
"""LetsBe Hub - Central licensing and telemetry service."""
__version__ = "0.1.0"

63
app/config.py Normal file
View File

@@ -0,0 +1,63 @@
"""Hub configuration via environment variables."""
from functools import lru_cache
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
"""Hub settings loaded from environment variables."""
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
frozen=True,
)
# Application
APP_NAME: str = Field(default="LetsBe Hub", description="Application name")
APP_VERSION: str = Field(default="0.1.0", description="Application version")
DEBUG: bool = Field(default=False, description="Debug mode")
# Database
DATABASE_URL: str = Field(
default="postgresql+asyncpg://hub:hub@db:5432/hub",
description="PostgreSQL connection URL"
)
DB_POOL_SIZE: int = Field(default=5, ge=1, le=20, description="Connection pool size")
DB_MAX_OVERFLOW: int = Field(default=10, ge=0, le=50, description="Max overflow connections")
DB_POOL_TIMEOUT: int = Field(default=30, ge=5, le=120, description="Pool timeout in seconds")
DB_POOL_RECYCLE: int = Field(default=1800, ge=300, le=7200, description="Connection recycle time")
# Admin authentication
ADMIN_API_KEY: str = Field(
default="change-me-in-production",
min_length=16,
description="Admin API key for management endpoints"
)
# Telemetry settings
TELEMETRY_RETENTION_DAYS: int = Field(
default=90,
ge=7,
le=365,
description="Days to retain telemetry data"
)
# Rate limiting for activation endpoint
ACTIVATION_RATE_LIMIT_PER_MINUTE: int = Field(
default=10,
ge=1,
le=100,
description="Max activation attempts per instance per minute"
)
@lru_cache
def get_settings() -> Settings:
"""Get cached settings instance."""
return Settings()
settings = get_settings()

52
app/db.py Normal file
View File

@@ -0,0 +1,52 @@
"""Database configuration and session management."""
from collections.abc import AsyncGenerator
from typing import Annotated
from fastapi import Depends
from sqlalchemy.ext.asyncio import (
AsyncSession,
async_sessionmaker,
create_async_engine,
)
from app.config import settings
# Create async engine with connection pooling
engine = create_async_engine(
settings.DATABASE_URL,
pool_size=settings.DB_POOL_SIZE,
max_overflow=settings.DB_MAX_OVERFLOW,
pool_timeout=settings.DB_POOL_TIMEOUT,
pool_recycle=settings.DB_POOL_RECYCLE,
echo=settings.DEBUG,
)
# Create async session factory
async_session_maker = async_sessionmaker(
engine,
class_=AsyncSession,
expire_on_commit=False,
autocommit=False,
autoflush=False,
)
async def get_db() -> AsyncGenerator[AsyncSession, None]:
"""
Dependency that provides an async database session.
Yields a session and ensures proper cleanup via finally block.
"""
async with async_session_maker() as session:
try:
yield session
except Exception:
await session.rollback()
raise
finally:
await session.close()
# Type alias for dependency injection
AsyncSessionDep = Annotated[AsyncSession, Depends(get_db)]

View File

@@ -0,0 +1,5 @@
"""Hub dependencies."""
from app.dependencies.admin_auth import validate_admin_key
__all__ = ["validate_admin_key"]

View File

@@ -0,0 +1,28 @@
"""Admin authentication dependency."""
import secrets
from typing import Annotated
from fastapi import Header, HTTPException, status
from app.config import settings
def validate_admin_key(
x_admin_api_key: Annotated[str, Header(description="Admin API key")],
) -> str:
"""
Validate the admin API key.
Uses constant-time comparison to prevent timing attacks.
"""
if not secrets.compare_digest(x_admin_api_key, settings.ADMIN_API_KEY):
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid admin API key",
)
return x_admin_api_key
# Type alias for dependency injection
AdminKeyDep = Annotated[str, validate_admin_key]

51
app/main.py Normal file
View File

@@ -0,0 +1,51 @@
"""LetsBe Hub - Central licensing and telemetry service."""
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app import __version__
from app.config import settings
from app.db import engine
from app.routes import activation_router, admin_router, health_router, telemetry_router
# Configure logging
logging.basicConfig(
level=logging.DEBUG if settings.DEBUG else logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan handler."""
logger.info(f"Starting LetsBe Hub v{__version__}")
yield
logger.info("Shutting down LetsBe Hub")
await engine.dispose()
app = FastAPI(
title="LetsBe Hub",
description="Central licensing and telemetry service for LetsBe Cloud",
version=__version__,
lifespan=lifespan,
)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Configure appropriately for production
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
app.include_router(health_router)
app.include_router(admin_router)
app.include_router(activation_router)
app.include_router(telemetry_router)

16
app/models/__init__.py Normal file
View File

@@ -0,0 +1,16 @@
"""Hub database models."""
from app.models.base import Base, TimestampMixin, UUIDMixin, utc_now
from app.models.client import Client
from app.models.instance import Instance
from app.models.usage_sample import UsageSample
__all__ = [
"Base",
"UUIDMixin",
"TimestampMixin",
"utc_now",
"Client",
"Instance",
"UsageSample",
]

44
app/models/base.py Normal file
View File

@@ -0,0 +1,44 @@
"""Base model and mixins for SQLAlchemy ORM."""
import uuid
from datetime import datetime, timezone
from sqlalchemy import DateTime
from sqlalchemy.ext.asyncio import AsyncAttrs
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
def utc_now() -> datetime:
"""Return current UTC datetime."""
return datetime.now(timezone.utc)
class Base(AsyncAttrs, DeclarativeBase):
"""Base class for all SQLAlchemy models."""
pass
class UUIDMixin:
"""Mixin that adds a UUID primary key."""
id: Mapped[uuid.UUID] = mapped_column(
primary_key=True,
default=uuid.uuid4,
)
class TimestampMixin:
"""Mixin that adds created_at and updated_at timestamps."""
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
default=utc_now,
nullable=False,
)
updated_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
default=utc_now,
onupdate=utc_now,
nullable=False,
)

38
app/models/client.py Normal file
View File

@@ -0,0 +1,38 @@
"""Client model - represents a company/organization using LetsBe."""
from typing import TYPE_CHECKING, Optional
from sqlalchemy import String
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.models.base import Base, TimestampMixin, UUIDMixin
if TYPE_CHECKING:
from app.models.instance import Instance
class Client(UUIDMixin, TimestampMixin, Base):
"""
A client is a company or organization using LetsBe.
Clients can have multiple instances (orchestrator deployments).
"""
__tablename__ = "clients"
# Client identification
name: Mapped[str] = mapped_column(String(255), nullable=False)
contact_email: Mapped[Optional[str]] = mapped_column(String(255), nullable=True)
# Billing/plan info (for future use)
billing_plan: Mapped[str] = mapped_column(String(50), default="free")
# Status
status: Mapped[str] = mapped_column(String(50), default="active")
# "active", "suspended", "archived"
# Relationships
instances: Mapped[list["Instance"]] = relationship(
back_populates="client",
cascade="all, delete-orphan",
)

137
app/models/instance.py Normal file
View File

@@ -0,0 +1,137 @@
"""Instance model - represents a deployed orchestrator with licensing."""
from datetime import datetime
from typing import TYPE_CHECKING, Optional
from uuid import UUID
from sqlalchemy import DateTime, ForeignKey, Integer, String
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.models.base import Base, TimestampMixin, UUIDMixin
if TYPE_CHECKING:
from app.models.client import Client
class Instance(UUIDMixin, TimestampMixin, Base):
"""
A deployed orchestrator instance with licensing.
Each instance is tied to a client and requires a valid license to operate.
The Hub issues license keys and tracks activation status.
"""
__tablename__ = "instances"
# Client relationship
client_id: Mapped[UUID] = mapped_column(
ForeignKey("clients.id", ondelete="CASCADE"),
nullable=False,
)
# Instance identification
instance_id: Mapped[str] = mapped_column(
String(255),
unique=True,
nullable=False,
index=True,
)
# e.g., "acme-orchestrator"
# === LICENSING ===
license_key_hash: Mapped[str] = mapped_column(
String(64),
nullable=False,
)
# SHA-256 hash of the license key (lb_inst_...)
license_key_prefix: Mapped[str] = mapped_column(
String(12),
nullable=False,
)
# First 12 chars for display: "lb_inst_abc1"
license_status: Mapped[str] = mapped_column(
String(50),
default="active",
nullable=False,
)
# "active", "suspended", "expired", "revoked"
license_issued_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
nullable=False,
)
license_expires_at: Mapped[Optional[datetime]] = mapped_column(
DateTime(timezone=True),
nullable=True,
)
# None = no expiry (perpetual)
# === ACTIVATION STATE ===
activated_at: Mapped[Optional[datetime]] = mapped_column(
DateTime(timezone=True),
nullable=True,
)
# Set when instance first calls /activate
last_activation_at: Mapped[Optional[datetime]] = mapped_column(
DateTime(timezone=True),
nullable=True,
)
# Updated on each activation call
activation_count: Mapped[int] = mapped_column(
Integer,
default=0,
nullable=False,
)
# === TELEMETRY ===
hub_api_key_hash: Mapped[Optional[str]] = mapped_column(
String(64),
nullable=True,
)
# Generated on activation, used for telemetry auth
# === METADATA ===
region: Mapped[Optional[str]] = mapped_column(
String(50),
nullable=True,
)
# e.g., "eu-west-1"
version: Mapped[Optional[str]] = mapped_column(
String(50),
nullable=True,
)
# Last reported orchestrator version
last_seen_at: Mapped[Optional[datetime]] = mapped_column(
DateTime(timezone=True),
nullable=True,
)
# Last telemetry or heartbeat
status: Mapped[str] = mapped_column(
String(50),
default="pending",
nullable=False,
)
# "pending" (created, not yet activated), "active", "inactive", "suspended"
# Relationships
client: Mapped["Client"] = relationship(back_populates="instances")
def is_license_valid(self) -> bool:
"""Check if the license is currently valid."""
from app.models.base import utc_now
if self.license_status not in ("active",):
return False
if self.license_expires_at and self.license_expires_at < utc_now():
return False
return True

View File

@@ -0,0 +1,93 @@
"""Telemetry sample model - stores aggregated metrics from orchestrators.
PRIVACY GUARANTEE: This model contains NO sensitive data fields.
Only aggregated counts, tool names, durations, and status metrics.
"""
from datetime import datetime
from uuid import UUID
from sqlalchemy import DateTime, ForeignKey, Integer, JSON, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column
from app.models.base import Base, UUIDMixin
class TelemetrySample(UUIDMixin, Base):
"""
Aggregated telemetry from an orchestrator instance.
PRIVACY: This model deliberately stores ONLY:
- Instance reference
- Time window boundaries
- Uptime counter
- Aggregated metrics (counts, durations, statuses)
It NEVER stores:
- Task payloads or results
- Environment variable values
- File contents
- Error messages or stack traces
- Any PII
De-duplication: The unique constraint on (instance_id, window_start)
prevents double-counting if the orchestrator retries submissions.
"""
__tablename__ = "telemetry_samples"
# Instance reference (FK to instances.id, not instance_id string)
instance_id: Mapped[UUID] = mapped_column(
ForeignKey("instances.id", ondelete="CASCADE"),
nullable=False,
index=True,
)
# Time window for this sample
window_start: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
nullable=False,
)
window_end: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
nullable=False,
)
# Orchestrator uptime at time of submission
uptime_seconds: Mapped[int] = mapped_column(
Integer,
nullable=False,
)
# Aggregated metrics (stored as JSON for flexibility)
# Uses generic JSON type for SQLite test compatibility
# PostgreSQL will use native JSON support in production
# Structure matches TelemetryMetrics schema:
# {
# "agents": {"online_count": 1, "offline_count": 0, "total_count": 1},
# "tasks": {
# "by_status": {"completed": 10, "failed": 1},
# "by_type": {"SHELL": {"count": 5, "avg_duration_ms": 1200}}
# },
# "servers": {"total_count": 1}
# }
metrics: Mapped[dict] = mapped_column(
JSON,
nullable=False,
)
# Unique constraint for de-duplication
# If orchestrator retries a failed submission, this prevents duplicates
__table_args__ = (
UniqueConstraint(
"instance_id",
"window_start",
name="uq_telemetry_instance_window",
),
)
def __repr__(self) -> str:
return (
f"<TelemetrySample(instance_id={self.instance_id}, "
f"window_start={self.window_start})>"
)

View File

@@ -0,0 +1,72 @@
"""Usage sample model - aggregated telemetry data.
PRIVACY GUARANTEE: This model contains NO sensitive data fields.
Only tool names, durations, and counts are stored.
"""
from datetime import datetime
from uuid import UUID
from sqlalchemy import DateTime, ForeignKey, Integer, String
from sqlalchemy.orm import Mapped, mapped_column
from app.models.base import Base, UUIDMixin
class UsageSample(UUIDMixin, Base):
"""
Aggregated usage statistics for an instance.
PRIVACY: This model deliberately has NO fields for:
- Environment values
- File contents
- Request/response payloads
- Screenshots
- Credentials
- Error messages or stack traces
Only metadata fields are allowed.
"""
__tablename__ = "usage_samples"
# Instance reference
instance_id: Mapped[UUID] = mapped_column(
ForeignKey("instances.id", ondelete="CASCADE"),
nullable=False,
index=True,
)
# Time window
window_start: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
nullable=False,
)
window_end: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
nullable=False,
)
window_type: Mapped[str] = mapped_column(
String(20),
nullable=False,
)
# "minute", "hour", "day"
# Tool (ONLY name, never payloads)
tool_name: Mapped[str] = mapped_column(
String(255),
nullable=False,
index=True,
)
# e.g., "sysadmin.env_update"
# Counts (aggregated)
call_count: Mapped[int] = mapped_column(Integer, default=0)
success_count: Mapped[int] = mapped_column(Integer, default=0)
error_count: Mapped[int] = mapped_column(Integer, default=0)
rate_limited_count: Mapped[int] = mapped_column(Integer, default=0)
# Duration stats (milliseconds)
total_duration_ms: Mapped[int] = mapped_column(Integer, default=0)
min_duration_ms: Mapped[int] = mapped_column(Integer, default=0)
max_duration_ms: Mapped[int] = mapped_column(Integer, default=0)

8
app/routes/__init__.py Normal file
View File

@@ -0,0 +1,8 @@
"""Hub API routes."""
from app.routes.activation import router as activation_router
from app.routes.admin import router as admin_router
from app.routes.health import router as health_router
from app.routes.telemetry import router as telemetry_router
__all__ = ["admin_router", "activation_router", "health_router", "telemetry_router"]

107
app/routes/activation.py Normal file
View File

@@ -0,0 +1,107 @@
"""Instance activation endpoint.
This is the PUBLIC endpoint that client instances call to validate their license
and activate with the Hub.
"""
import hashlib
import secrets
from fastapi import APIRouter, HTTPException, status
from sqlalchemy import select
from app.db import AsyncSessionDep
from app.models.base import utc_now
from app.models.instance import Instance
from app.schemas.instance import ActivationRequest, ActivationResponse
router = APIRouter(prefix="/api/v1/instances", tags=["Activation"])
@router.post("/activate", response_model=ActivationResponse)
async def activate_instance(
request: ActivationRequest,
db: AsyncSessionDep,
) -> ActivationResponse:
"""
Activate an instance with its license key.
Called by local_bootstrap.sh before running migrations.
Returns:
- 200 + ActivationResponse on success
- 400 with error details on failure
Privacy guarantee:
- Only receives license_key and instance_id
- Never receives sensitive client data
"""
# Find instance by instance_id
result = await db.execute(
select(Instance).where(Instance.instance_id == request.instance_id)
)
instance = result.scalar_one_or_none()
if instance is None:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail={"error": "Instance not found", "code": "instance_not_found"},
)
# Validate license key using constant-time comparison
provided_hash = hashlib.sha256(request.license_key.encode()).hexdigest()
if not secrets.compare_digest(provided_hash, instance.license_key_hash):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail={"error": "Invalid license key", "code": "invalid_license"},
)
# Check license status
if instance.license_status == "suspended":
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail={"error": "License suspended", "code": "suspended"},
)
if instance.license_status == "revoked":
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail={"error": "License revoked", "code": "revoked"},
)
# Check expiry
now = utc_now()
if instance.license_expires_at and instance.license_expires_at < now:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail={"error": "License expired", "code": "expired"},
)
# Update activation state
if instance.activated_at is None:
instance.activated_at = now
instance.last_activation_at = now
instance.activation_count += 1
instance.status = "active"
# Generate hub_api_key if not already set
hub_api_key: str
if instance.hub_api_key_hash:
# Key was pre-generated, client should use existing key
hub_api_key = "USE_EXISTING"
else:
# Generate new hub_api_key
hub_api_key = f"hk_{secrets.token_hex(24)}"
instance.hub_api_key_hash = hashlib.sha256(hub_api_key.encode()).hexdigest()
await db.commit()
return ActivationResponse(
status="ok",
instance_id=instance.instance_id,
hub_api_key=hub_api_key,
config={
"telemetry_enabled": True,
"telemetry_interval_seconds": 60,
},
)

400
app/routes/admin.py Normal file
View File

@@ -0,0 +1,400 @@
"""Admin routes for client and instance management."""
import hashlib
import secrets
from typing import Annotated
from uuid import UUID
from fastapi import APIRouter, Depends, Header, HTTPException, status
from sqlalchemy import select
from sqlalchemy.orm import selectinload
from app.config import settings
from app.db import AsyncSessionDep
from app.models.base import utc_now
from app.models.client import Client
from app.models.instance import Instance
from app.schemas.client import ClientCreate, ClientResponse, ClientUpdate
from app.schemas.instance import InstanceBriefResponse, InstanceCreate, InstanceResponse
def validate_admin_key(
x_admin_api_key: Annotated[str, Header(description="Admin API key")],
) -> str:
"""Validate the admin API key with constant-time comparison."""
if not secrets.compare_digest(x_admin_api_key, settings.ADMIN_API_KEY):
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid admin API key",
)
return x_admin_api_key
AdminKeyDep = Annotated[str, Depends(validate_admin_key)]
router = APIRouter(prefix="/api/v1/admin", tags=["Admin"])
# ============ CLIENT MANAGEMENT ============
@router.post("/clients", response_model=ClientResponse, status_code=status.HTTP_201_CREATED)
async def create_client(
client: ClientCreate,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> Client:
"""Create a new client (company/organization)."""
db_client = Client(
name=client.name,
contact_email=client.contact_email,
billing_plan=client.billing_plan,
)
db.add(db_client)
await db.commit()
await db.refresh(db_client)
return db_client
@router.get("/clients", response_model=list[ClientResponse])
async def list_clients(
db: AsyncSessionDep,
_: AdminKeyDep,
) -> list[Client]:
"""List all clients."""
result = await db.execute(select(Client).order_by(Client.created_at.desc()))
return list(result.scalars().all())
@router.get("/clients/{client_id}", response_model=ClientResponse)
async def get_client(
client_id: UUID,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> Client:
"""Get a specific client by ID."""
result = await db.execute(select(Client).where(Client.id == client_id))
client = result.scalar_one_or_none()
if client is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Client not found",
)
return client
@router.patch("/clients/{client_id}", response_model=ClientResponse)
async def update_client(
client_id: UUID,
update: ClientUpdate,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> Client:
"""Update a client."""
result = await db.execute(select(Client).where(Client.id == client_id))
client = result.scalar_one_or_none()
if client is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Client not found",
)
update_data = update.model_dump(exclude_unset=True)
for field, value in update_data.items():
setattr(client, field, value)
await db.commit()
await db.refresh(client)
return client
@router.delete("/clients/{client_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_client(
client_id: UUID,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> None:
"""Delete a client and all associated instances."""
result = await db.execute(select(Client).where(Client.id == client_id))
client = result.scalar_one_or_none()
if client is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Client not found",
)
await db.delete(client)
await db.commit()
# ============ INSTANCE MANAGEMENT ============
@router.post(
"/clients/{client_id}/instances",
response_model=InstanceResponse,
status_code=status.HTTP_201_CREATED,
)
async def create_instance(
client_id: UUID,
instance: InstanceCreate,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> dict:
"""
Create a new instance for a client.
Returns the license_key and hub_api_key in PLAINTEXT - this is the only time
they are visible. Store them securely and provide to client for their config.json.
"""
# Verify client exists
client_result = await db.execute(select(Client).where(Client.id == client_id))
client = client_result.scalar_one_or_none()
if client is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Client not found",
)
# Check instance_id uniqueness
existing = await db.execute(
select(Instance).where(Instance.instance_id == instance.instance_id)
)
if existing.scalar_one_or_none():
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail=f"Instance with id '{instance.instance_id}' already exists",
)
# Generate license key
license_key = f"lb_inst_{secrets.token_hex(32)}"
license_key_hash = hashlib.sha256(license_key.encode()).hexdigest()
license_key_prefix = license_key[:12]
# Generate hub API key
hub_api_key = f"hk_{secrets.token_hex(24)}"
hub_api_key_hash = hashlib.sha256(hub_api_key.encode()).hexdigest()
now = utc_now()
db_instance = Instance(
client_id=client_id,
instance_id=instance.instance_id,
license_key_hash=license_key_hash,
license_key_prefix=license_key_prefix,
license_status="active",
license_issued_at=now,
license_expires_at=instance.license_expires_at,
hub_api_key_hash=hub_api_key_hash,
region=instance.region,
status="pending",
)
db.add(db_instance)
await db.commit()
await db.refresh(db_instance)
# Return instance with plaintext keys (only time visible)
return {
"id": db_instance.id,
"instance_id": db_instance.instance_id,
"client_id": db_instance.client_id,
"license_key": license_key, # Plaintext, only time visible
"license_key_prefix": db_instance.license_key_prefix,
"license_status": db_instance.license_status,
"license_issued_at": db_instance.license_issued_at,
"license_expires_at": db_instance.license_expires_at,
"hub_api_key": hub_api_key, # Plaintext, only time visible
"activated_at": db_instance.activated_at,
"last_activation_at": db_instance.last_activation_at,
"activation_count": db_instance.activation_count,
"region": db_instance.region,
"version": db_instance.version,
"last_seen_at": db_instance.last_seen_at,
"status": db_instance.status,
"created_at": db_instance.created_at,
"updated_at": db_instance.updated_at,
}
@router.get("/clients/{client_id}/instances", response_model=list[InstanceBriefResponse])
async def list_client_instances(
client_id: UUID,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> list[Instance]:
"""List all instances for a client."""
# Verify client exists
client_result = await db.execute(select(Client).where(Client.id == client_id))
if client_result.scalar_one_or_none() is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Client not found",
)
result = await db.execute(
select(Instance)
.where(Instance.client_id == client_id)
.order_by(Instance.created_at.desc())
)
return list(result.scalars().all())
@router.get("/instances/{instance_id}", response_model=InstanceBriefResponse)
async def get_instance(
instance_id: str,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> Instance:
"""Get a specific instance by its instance_id."""
result = await db.execute(
select(Instance).where(Instance.instance_id == instance_id)
)
instance = result.scalar_one_or_none()
if instance is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Instance not found",
)
return instance
@router.post("/instances/{instance_id}/rotate-license", response_model=dict)
async def rotate_license_key(
instance_id: str,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> dict:
"""
Generate a new license key for an instance.
Invalidates the old key. Returns new key in plaintext (only time visible).
"""
result = await db.execute(
select(Instance).where(Instance.instance_id == instance_id)
)
instance = result.scalar_one_or_none()
if instance is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Instance not found",
)
new_license_key = f"lb_inst_{secrets.token_hex(32)}"
instance.license_key_hash = hashlib.sha256(new_license_key.encode()).hexdigest()
instance.license_key_prefix = new_license_key[:12]
instance.license_issued_at = utc_now()
await db.commit()
return {
"instance_id": instance.instance_id,
"license_key": new_license_key,
"license_key_prefix": instance.license_key_prefix,
"license_issued_at": instance.license_issued_at,
}
@router.post("/instances/{instance_id}/rotate-hub-key", response_model=dict)
async def rotate_hub_api_key(
instance_id: str,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> dict:
"""
Generate a new Hub API key for telemetry.
Invalidates the old key. Returns new key in plaintext (only time visible).
"""
result = await db.execute(
select(Instance).where(Instance.instance_id == instance_id)
)
instance = result.scalar_one_or_none()
if instance is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Instance not found",
)
new_hub_api_key = f"hk_{secrets.token_hex(24)}"
instance.hub_api_key_hash = hashlib.sha256(new_hub_api_key.encode()).hexdigest()
await db.commit()
return {
"instance_id": instance.instance_id,
"hub_api_key": new_hub_api_key,
}
@router.post("/instances/{instance_id}/suspend", response_model=dict)
async def suspend_instance(
instance_id: str,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> dict:
"""Suspend an instance license (blocks future activations)."""
result = await db.execute(
select(Instance).where(Instance.instance_id == instance_id)
)
instance = result.scalar_one_or_none()
if instance is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Instance not found",
)
instance.license_status = "suspended"
instance.status = "suspended"
await db.commit()
return {"instance_id": instance.instance_id, "status": "suspended"}
@router.post("/instances/{instance_id}/reactivate", response_model=dict)
async def reactivate_instance(
instance_id: str,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> dict:
"""Reactivate a suspended instance license."""
result = await db.execute(
select(Instance).where(Instance.instance_id == instance_id)
)
instance = result.scalar_one_or_none()
if instance is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Instance not found",
)
if instance.license_status == "revoked":
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Cannot reactivate a revoked license",
)
instance.license_status = "active"
instance.status = "active" if instance.activated_at else "pending"
await db.commit()
return {"instance_id": instance.instance_id, "status": instance.status}
@router.delete("/instances/{instance_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_instance(
instance_id: str,
db: AsyncSessionDep,
_: AdminKeyDep,
) -> None:
"""Delete an instance."""
result = await db.execute(
select(Instance).where(Instance.instance_id == instance_id)
)
instance = result.scalar_one_or_none()
if instance is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Instance not found",
)
await db.delete(instance)
await db.commit()

11
app/routes/health.py Normal file
View File

@@ -0,0 +1,11 @@
"""Health check endpoints."""
from fastapi import APIRouter
router = APIRouter(tags=["Health"])
@router.get("/health")
async def health_check() -> dict:
"""Basic health check endpoint."""
return {"status": "healthy"}

163
app/routes/telemetry.py Normal file
View File

@@ -0,0 +1,163 @@
"""Telemetry endpoint for receiving metrics from orchestrators.
This endpoint receives aggregated telemetry from orchestrator instances.
It validates authentication, stores metrics, and updates instance state.
"""
import hashlib
import logging
import secrets
from uuid import UUID
from fastapi import APIRouter, Header, HTTPException, status
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.exc import IntegrityError
from app.db import AsyncSessionDep
from app.models.base import utc_now
from app.models.instance import Instance
from app.models.telemetry_sample import TelemetrySample
from app.schemas.telemetry import TelemetryPayload, TelemetryResponse
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/instances", tags=["Telemetry"])
@router.post("/{instance_id}/telemetry", response_model=TelemetryResponse)
async def receive_telemetry(
instance_id: UUID,
payload: TelemetryPayload,
db: AsyncSessionDep,
hub_api_key: str = Header(..., alias="X-Hub-Api-Key"),
) -> TelemetryResponse:
"""
Receive telemetry from an orchestrator instance.
Authentication:
- Requires valid X-Hub-Api-Key header matching the instance
Validation:
- instance_id in path must match payload.instance_id (prevents spoofing)
- Instance must exist and be active
- Schema uses extra="forbid" to reject unknown fields
De-duplication:
- Uses (instance_id, window_start) unique constraint
- Duplicate submissions are silently accepted (idempotent)
HTTP Semantics:
- 200 OK: Telemetry accepted
- 400 Bad Request: instance_id mismatch or invalid payload
- 401 Unauthorized: Invalid or missing hub_api_key
- 403 Forbidden: Instance suspended
- 404 Not Found: Instance not found
"""
# Validate instance_id in path matches payload
if instance_id != payload.instance_id:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail={
"error": "instance_id mismatch between path and payload",
"code": "instance_id_mismatch",
},
)
# Find instance by UUID (id column, not instance_id string)
result = await db.execute(select(Instance).where(Instance.id == instance_id))
instance = result.scalar_one_or_none()
if instance is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail={"error": "Instance not found", "code": "instance_not_found"},
)
# Validate hub_api_key using constant-time comparison
if not instance.hub_api_key_hash:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail={
"error": "Instance has no hub_api_key configured",
"code": "no_hub_key",
},
)
provided_hash = hashlib.sha256(hub_api_key.encode()).hexdigest()
if not secrets.compare_digest(provided_hash, instance.hub_api_key_hash):
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail={"error": "Invalid hub_api_key", "code": "invalid_hub_key"},
)
# Check instance status
if instance.license_status == "suspended":
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail={"error": "Instance suspended", "code": "suspended"},
)
if instance.license_status == "revoked":
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail={"error": "Instance revoked", "code": "revoked"},
)
# Check license expiry
now = utc_now()
if instance.license_expires_at and instance.license_expires_at < now:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail={"error": "License expired", "code": "expired"},
)
# Store telemetry sample
# Use PostgreSQL upsert to handle duplicates gracefully
telemetry_data = {
"instance_id": instance_id,
"window_start": payload.window_start,
"window_end": payload.window_end,
"uptime_seconds": payload.uptime_seconds,
"metrics": payload.metrics.model_dump(),
}
try:
# PostgreSQL INSERT ... ON CONFLICT DO NOTHING
# If duplicate (instance_id, window_start), silently ignore
stmt = (
pg_insert(TelemetrySample)
.values(**telemetry_data)
.on_conflict_do_nothing(constraint="uq_telemetry_instance_window")
)
await db.execute(stmt)
except IntegrityError:
# Fallback for non-PostgreSQL (shouldn't happen in production)
logger.warning(
"telemetry_duplicate_submission",
extra={
"instance_id": str(instance_id),
"window_start": payload.window_start.isoformat(),
},
)
# Update instance last_seen_at
instance.last_seen_at = now
await db.commit()
logger.info(
"telemetry_received",
extra={
"instance_id": str(instance_id),
"window_start": payload.window_start.isoformat(),
"window_end": payload.window_end.isoformat(),
"uptime_seconds": payload.uptime_seconds,
},
)
return TelemetryResponse(
received=True,
next_interval_seconds=60,
message=None,
)

21
app/schemas/__init__.py Normal file
View File

@@ -0,0 +1,21 @@
"""Hub API schemas."""
from app.schemas.client import ClientCreate, ClientResponse, ClientUpdate
from app.schemas.instance import (
ActivationError,
ActivationRequest,
ActivationResponse,
InstanceCreate,
InstanceResponse,
)
__all__ = [
"ClientCreate",
"ClientResponse",
"ClientUpdate",
"InstanceCreate",
"InstanceResponse",
"ActivationRequest",
"ActivationResponse",
"ActivationError",
]

38
app/schemas/client.py Normal file
View File

@@ -0,0 +1,38 @@
"""Client schemas for API serialization."""
from datetime import datetime
from typing import Optional
from uuid import UUID
from pydantic import BaseModel, ConfigDict, EmailStr, Field
class ClientCreate(BaseModel):
"""Schema for creating a new client."""
name: str = Field(..., min_length=1, max_length=255, description="Client/company name")
contact_email: Optional[EmailStr] = Field(None, description="Primary contact email")
billing_plan: str = Field("free", description="Billing plan")
class ClientUpdate(BaseModel):
"""Schema for updating a client."""
name: Optional[str] = Field(None, min_length=1, max_length=255)
contact_email: Optional[EmailStr] = None
billing_plan: Optional[str] = None
status: Optional[str] = Field(None, pattern="^(active|suspended|archived)$")
class ClientResponse(BaseModel):
"""Schema for client API responses."""
model_config = ConfigDict(from_attributes=True)
id: UUID
name: str
contact_email: Optional[str]
billing_plan: str
status: str
created_at: datetime
updated_at: datetime

127
app/schemas/instance.py Normal file
View File

@@ -0,0 +1,127 @@
"""Instance schemas for API serialization."""
from datetime import datetime
from typing import Optional
from uuid import UUID
from pydantic import BaseModel, ConfigDict, Field
class InstanceCreate(BaseModel):
"""Schema for creating a new instance."""
instance_id: str = Field(
...,
min_length=1,
max_length=255,
description="Unique instance identifier (e.g., 'acme-orchestrator')",
)
region: Optional[str] = Field(None, max_length=50, description="Deployment region")
license_expires_at: Optional[datetime] = Field(
None,
description="License expiry date (None = perpetual)",
)
class InstanceResponse(BaseModel):
"""Schema for instance API responses.
Note: license_key and hub_api_key are ONLY returned on creation.
"""
model_config = ConfigDict(from_attributes=True)
id: UUID
instance_id: str
client_id: UUID
# License info
license_key: Optional[str] = Field(
None,
description="ONLY returned on creation - store securely!",
)
license_key_prefix: str
license_status: str
license_issued_at: datetime
license_expires_at: Optional[datetime]
# Hub API key
hub_api_key: Optional[str] = Field(
None,
description="ONLY returned on creation - store securely!",
)
# Activation state
activated_at: Optional[datetime]
last_activation_at: Optional[datetime]
activation_count: int
# Metadata
region: Optional[str]
version: Optional[str]
last_seen_at: Optional[datetime]
status: str
created_at: datetime
updated_at: datetime
class InstanceBriefResponse(BaseModel):
"""Brief instance response for listings (no secrets)."""
model_config = ConfigDict(from_attributes=True)
id: UUID
instance_id: str
client_id: UUID
license_key_prefix: str
license_status: str
license_expires_at: Optional[datetime]
activated_at: Optional[datetime]
activation_count: int
region: Optional[str]
status: str
created_at: datetime
# === ACTIVATION SCHEMAS ===
class ActivationRequest(BaseModel):
"""
Activation request from a client instance.
PRIVACY: This schema ONLY accepts:
- license_key (credential for validation)
- instance_id (identifier)
It NEVER accepts sensitive data fields.
"""
license_key: str = Field(..., description="License key (lb_inst_...)")
instance_id: str = Field(..., description="Instance identifier")
class ActivationResponse(BaseModel):
"""Response to a successful activation."""
status: str = Field("ok", description="Activation status")
instance_id: str
hub_api_key: str = Field(
...,
description="API key for telemetry auth (or 'USE_EXISTING')",
)
config: dict = Field(
default_factory=dict,
description="Optional configuration from Hub",
)
class ActivationError(BaseModel):
"""Error response for failed activation."""
error: str = Field(..., description="Human-readable error message")
code: str = Field(
...,
description="Error code: invalid_license, expired, suspended, instance_not_found",
)

105
app/schemas/telemetry.py Normal file
View File

@@ -0,0 +1,105 @@
"""Telemetry schemas for orchestrator metrics collection.
PRIVACY GUARANTEE: These schemas use extra="forbid" to reject
unknown fields, preventing accidental PII leaks.
"""
from datetime import datetime
from typing import Optional
from uuid import UUID
from pydantic import BaseModel, ConfigDict, Field
# === Nested Metrics Schemas ===
class AgentMetrics(BaseModel):
"""Agent status counts."""
model_config = ConfigDict(extra="forbid")
online_count: int = Field(ge=0, description="Agents currently online")
offline_count: int = Field(ge=0, description="Agents currently offline")
total_count: int = Field(ge=0, description="Total registered agents")
class TaskTypeMetrics(BaseModel):
"""Per-task-type metrics."""
model_config = ConfigDict(extra="forbid")
count: int = Field(ge=0, description="Number of tasks of this type")
avg_duration_ms: Optional[float] = Field(
None,
ge=0,
description="Average duration in milliseconds",
)
class TaskMetrics(BaseModel):
"""Task execution metrics."""
model_config = ConfigDict(extra="forbid")
by_status: dict[str, int] = Field(
default_factory=dict,
description="Task counts by status (completed, failed, running, pending)",
)
by_type: dict[str, TaskTypeMetrics] = Field(
default_factory=dict,
description="Task metrics by type (SHELL, FILE_WRITE, etc.)",
)
class ServerMetrics(BaseModel):
"""Server metrics."""
model_config = ConfigDict(extra="forbid")
total_count: int = Field(ge=0, description="Total registered servers")
class TelemetryMetrics(BaseModel):
"""Top-level metrics container."""
model_config = ConfigDict(extra="forbid")
agents: AgentMetrics
tasks: TaskMetrics
servers: ServerMetrics
# === Request/Response Schemas ===
class TelemetryPayload(BaseModel):
"""
Telemetry payload from an orchestrator instance.
PRIVACY: This schema deliberately uses extra="forbid" to reject
any fields not explicitly defined. This prevents accidental
transmission of PII or sensitive data.
De-duplication: The Hub uses (instance_id, window_start) as a
unique constraint to handle duplicate submissions.
"""
model_config = ConfigDict(extra="forbid")
instance_id: UUID = Field(..., description="Instance UUID (must match path)")
window_start: datetime = Field(..., description="Start of telemetry window")
window_end: datetime = Field(..., description="End of telemetry window")
uptime_seconds: int = Field(ge=0, description="Orchestrator uptime in seconds")
metrics: TelemetryMetrics = Field(..., description="Aggregated metrics")
class TelemetryResponse(BaseModel):
"""Response to telemetry submission."""
received: bool = Field(True, description="Whether telemetry was accepted")
next_interval_seconds: int = Field(
60,
description="Suggested interval for next submission",
)
message: Optional[str] = Field(None, description="Optional status message")

5
app/services/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
"""Hub services."""
from app.services.redactor import redact_metadata, validate_tool_name
__all__ = ["redact_metadata", "validate_tool_name"]

142
app/services/redactor.py Normal file
View File

@@ -0,0 +1,142 @@
"""
Strict ALLOW-LIST redaction for telemetry data.
PRIVACY GUARANTEE: If a field is not explicitly allowed, it is removed.
This module ensures NO sensitive data ever reaches the Hub database.
"""
from typing import Any
# ONLY these fields can be stored in metadata
ALLOWED_METADATA_FIELDS = frozenset({
"tool_name",
"duration_ms",
"status",
"error_code",
"component",
"version",
})
# Patterns that indicate sensitive data (defense in depth)
SENSITIVE_PATTERNS = frozenset({
"password",
"secret",
"token",
"key",
"credential",
"auth",
"cookie",
"session",
"bearer",
"content",
"body",
"payload",
"data",
"file",
"env",
"environment",
"config",
"setting",
"screenshot",
"image",
"base64",
"binary",
"private",
"cert",
"certificate",
})
def redact_metadata(metadata: dict[str, Any] | None) -> dict[str, Any]:
"""
Filter metadata to ONLY allowed fields.
Uses allow-list approach: if not explicitly allowed, it's removed.
This provides defense against accidentally storing sensitive data.
Args:
metadata: Raw metadata from telemetry
Returns:
Filtered metadata with only safe fields
"""
if metadata is None:
return {}
redacted: dict[str, Any] = {}
for key, value in metadata.items():
# Must be in allow list
if key not in ALLOWED_METADATA_FIELDS:
continue
# Defense in depth: reject if key contains sensitive pattern
key_lower = key.lower()
if any(pattern in key_lower for pattern in SENSITIVE_PATTERNS):
continue
# Only primitive types (no nested objects that could hide data)
if isinstance(value, (str, int, float, bool)):
# String length limit to prevent large data blobs
if isinstance(value, str) and len(value) > 100:
continue
redacted[key] = value
return redacted
def validate_tool_name(tool_name: str) -> bool:
"""
Validate tool name format.
Tool names must:
- Start with a known prefix (sysadmin., browser., gateway.)
- Be reasonably short
- Not contain suspicious characters
Args:
tool_name: Tool name to validate
Returns:
True if valid, False otherwise
"""
# Must match known prefixes
valid_prefixes = ("sysadmin.", "browser.", "gateway.", "llm.")
if not tool_name.startswith(valid_prefixes):
return False
# Length limit
if len(tool_name) > 100:
return False
# No suspicious content
suspicious_chars = {";", "'", '"', "\\", "\n", "\r", "\t", "\0"}
if any(c in tool_name for c in suspicious_chars):
return False
return True
def sanitize_error_code(error_code: str | None) -> str | None:
"""
Sanitize an error code to ensure it doesn't contain sensitive data.
Args:
error_code: Raw error code
Returns:
Sanitized error code or None if invalid
"""
if error_code is None:
return None
# Length limit
if len(error_code) > 50:
return None
# Must be alphanumeric with underscores/dashes
allowed = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-")
if not all(c in allowed for c in error_code):
return None
return error_code