Include full contents of all nested repositories
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
5
letsbe-orchestrator/app/services/__init__.py
Normal file
5
letsbe-orchestrator/app/services/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Service layer for the Orchestrator."""
|
||||
|
||||
from app.services.local_bootstrap import LocalBootstrapService
|
||||
|
||||
__all__ = ["LocalBootstrapService"]
|
||||
270
letsbe-orchestrator/app/services/hub_telemetry.py
Normal file
270
letsbe-orchestrator/app/services/hub_telemetry.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""Hub Telemetry Service - sends aggregated metrics to Hub.
|
||||
|
||||
This background service periodically collects metrics from the local database
|
||||
and sends them to the central Hub for license compliance and usage analytics.
|
||||
|
||||
Key design choices:
|
||||
- Since-last-send windowing (avoids double-counting)
|
||||
- SQL aggregates (never loads task objects into Python)
|
||||
- Reusable httpx.AsyncClient (single connection pool)
|
||||
- Jitter ±15% (prevents thundering herd)
|
||||
- Exponential backoff on errors (1s → 2s → 4s → ... → 60s max)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import func, select
|
||||
|
||||
from app.config import get_settings
|
||||
from app.db import async_session_maker
|
||||
from app.models.agent import Agent, AgentStatus
|
||||
from app.models.server import Server
|
||||
from app.models.task import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
class HubTelemetryService:
    """Background service that sends telemetry to Hub."""

    # All state is class-level: the service is a process-wide singleton
    # driven by the start()/stop() classmethods.
    _task: Optional[asyncio.Task] = None  # background loop task
    _shutdown_event: Optional[asyncio.Event] = None  # signals the loop to exit
    _start_time: Optional[datetime] = None  # service start, for uptime reporting
    _last_sent_at: Optional[datetime] = None  # start of the next telemetry window
    _client: Optional[httpx.AsyncClient] = None  # reused HTTP connection pool to Hub
    _consecutive_failures: int = 0  # drives exponential backoff in the loop
|
||||
|
||||
@classmethod
async def start(cls) -> None:
    """Start the telemetry background task. Never blocks startup."""
    # Each guard logs and returns instead of raising: telemetry must
    # never prevent the application itself from starting.
    if not settings.HUB_TELEMETRY_ENABLED:
        logger.info("hub_telemetry_disabled")
        return

    if not settings.HUB_URL:
        logger.warning("hub_telemetry_missing_hub_url")
        return

    if not settings.HUB_API_KEY:
        logger.warning("hub_telemetry_missing_hub_api_key")
        return

    if not settings.INSTANCE_ID:
        logger.warning("hub_telemetry_missing_instance_id")
        return

    now = datetime.now(timezone.utc)
    cls._start_time = now
    # Initialize window to (now - interval) so first send isn't empty
    cls._last_sent_at = now - timedelta(
        seconds=settings.HUB_TELEMETRY_INTERVAL_SECONDS
    )
    cls._shutdown_event = asyncio.Event()
    cls._consecutive_failures = 0
    # One AsyncClient for the lifetime of the service: a single shared
    # connection pool instead of a new client per send.
    cls._client = httpx.AsyncClient(timeout=30.0)
    cls._task = asyncio.create_task(cls._telemetry_loop())

    logger.info(
        "hub_telemetry_started",
        extra={
            "interval_seconds": settings.HUB_TELEMETRY_INTERVAL_SECONDS,
            "hub_url": settings.HUB_URL,
            "instance_id": settings.INSTANCE_ID,
        },
    )
|
||||
|
||||
@classmethod
async def stop(cls) -> None:
    """Stop the telemetry background task.

    Signals the loop to exit, waits up to 5s for a graceful finish,
    cancels it on timeout, then closes the shared HTTP client.
    """
    if cls._shutdown_event:
        cls._shutdown_event.set()

    if cls._task:
        try:
            # Give the loop a chance to notice the shutdown event.
            await asyncio.wait_for(cls._task, timeout=5.0)
        except asyncio.TimeoutError:
            # Loop did not exit in time — force-cancel and swallow
            # the resulting CancelledError.
            cls._task.cancel()
            try:
                await cls._task
            except asyncio.CancelledError:
                pass

    if cls._client:
        await cls._client.aclose()
        cls._client = None

    logger.info("hub_telemetry_stopped")
|
||||
|
||||
@classmethod
async def _telemetry_loop(cls) -> None:
    """Main telemetry loop with jitter and backoff.

    Runs until the shutdown event is set. Send failures are logged and
    never re-raised; they stretch the next interval via exponential
    backoff instead.
    """
    base_interval = settings.HUB_TELEMETRY_INTERVAL_SECONDS

    while not cls._shutdown_event.is_set():
        try:
            await cls._send_telemetry()
            cls._consecutive_failures = 0  # Reset on success
        except Exception as e:
            cls._consecutive_failures += 1
            logger.warning(
                "hub_telemetry_send_failed",
                extra={
                    "error": str(e),
                    "error_type": type(e).__name__,
                    "consecutive_failures": cls._consecutive_failures,
                },
            )

        # Calculate interval: base ± 15% jitter, with backoff on failures
        jitter = random.uniform(-0.15, 0.15) * base_interval
        # Extra 2s, 4s, 8s, ... capped at 60s while failures persist.
        backoff = (
            min(2**cls._consecutive_failures, 60)
            if cls._consecutive_failures
            else 0
        )
        interval = base_interval + jitter + backoff

        try:
            # Interruptible sleep: wakes immediately when shutdown is set.
            await asyncio.wait_for(
                cls._shutdown_event.wait(), timeout=interval
            )
            break  # Shutdown requested
        except asyncio.TimeoutError:
            pass  # Normal timeout, continue loop
|
||||
|
||||
@classmethod
async def _send_telemetry(cls) -> None:
    """Collect and send one telemetry payload to Hub.

    Raises on any collection/HTTP error; the caller (_telemetry_loop)
    handles logging and backoff.
    """
    # Window is [last successful send, now) — advancing it only after a
    # successful POST is what prevents double-counting.
    window_start = cls._last_sent_at
    window_end = datetime.now(timezone.utc)

    payload = await cls._collect_metrics(window_start, window_end)

    response = await cls._client.post(
        f"{settings.HUB_URL}/api/v1/instances/{settings.INSTANCE_ID}/telemetry",
        json=payload,
        headers={"X-Hub-Api-Key": settings.HUB_API_KEY},
    )
    response.raise_for_status()

    # Only update window on success
    cls._last_sent_at = window_end

    logger.debug(
        "hub_telemetry_sent",
        extra={
            "window_seconds": (window_end - window_start).total_seconds(),
            "status_code": response.status_code,
        },
    )
|
||||
|
||||
@classmethod
async def _collect_metrics(
    cls, window_start: datetime, window_end: datetime
) -> dict[str, Any]:
    """Collect metrics using SQL aggregates (never load objects).

    Returns the JSON-serializable telemetry payload for the
    [window_start, window_end) interval.
    """
    async with async_session_maker() as db:
        # Agent counts by status (all agents, not windowed)
        agent_result = await db.execute(
            select(Agent.status, func.count(Agent.id).label("count")).group_by(
                Agent.status
            )
        )
        agent_rows = agent_result.all()

        # Task counts by status and type (windowed by updated_at)
        # Duration approximated as (updated_at - created_at) for completed/failed tasks
        task_result = await db.execute(
            select(
                Task.status,
                Task.type,
                func.count(Task.id).label("count"),
                func.avg(
                    # epoch seconds -> milliseconds
                    func.extract("epoch", Task.updated_at - Task.created_at) * 1000
                ).label("avg_duration_ms"),
            )
            .where(Task.updated_at.between(window_start, window_end))
            .group_by(Task.status, Task.type)
        )
        task_rows = task_result.all()

        # Server count (simple count, not windowed)
        server_count = await db.scalar(select(func.count(Server.id)))

        return {
            "instance_id": str(settings.INSTANCE_ID),
            "window_start": window_start.isoformat(),
            "window_end": window_end.isoformat(),
            "uptime_seconds": int((window_end - cls._start_time).total_seconds()),
            "metrics": {
                "agents": cls._format_agent_counts(agent_rows),
                "tasks": cls._format_task_counts(task_rows),
                "servers": {"total_count": server_count or 0},
            },
        }
|
||||
|
||||
@classmethod
def _format_agent_counts(cls, rows: list) -> dict[str, int]:
    """Build the agents section of the payload from GROUP BY status rows."""
    online = 0
    offline = 0
    total = 0

    for row in rows:
        total += row.count
        # ONLINE/OFFLINE get dedicated buckets; any other status
        # (e.g. INVALID) contributes to the total only.
        if row.status == AgentStatus.ONLINE:
            online = row.count
        elif row.status == AgentStatus.OFFLINE:
            offline = row.count

    return {
        "online_count": online,
        "offline_count": offline,
        "total_count": total,
    }
|
||||
|
||||
@classmethod
|
||||
def _format_task_counts(cls, rows: list) -> dict[str, Any]:
|
||||
"""Format task count rows into response structure."""
|
||||
by_status: dict[str, int] = {}
|
||||
by_type: dict[str, dict[str, Any]] = {}
|
||||
|
||||
for row in rows:
|
||||
status_str = row.status.value if hasattr(row.status, "value") else str(row.status)
|
||||
type_str = row.type.value if hasattr(row.type, "value") else str(row.type)
|
||||
count = row.count
|
||||
avg_duration_ms = row.avg_duration_ms
|
||||
|
||||
# Aggregate by status
|
||||
by_status[status_str] = by_status.get(status_str, 0) + count
|
||||
|
||||
# Aggregate by type
|
||||
if type_str not in by_type:
|
||||
by_type[type_str] = {"count": 0, "avg_duration_ms": 0}
|
||||
|
||||
# Weighted average for duration when merging
|
||||
existing = by_type[type_str]
|
||||
total_count = existing["count"] + count
|
||||
if total_count > 0 and avg_duration_ms is not None:
|
||||
existing_weighted = existing["avg_duration_ms"] * existing["count"]
|
||||
new_weighted = avg_duration_ms * count
|
||||
by_type[type_str]["avg_duration_ms"] = (
|
||||
existing_weighted + new_weighted
|
||||
) / total_count
|
||||
by_type[type_str]["count"] = total_count
|
||||
|
||||
# Round durations for cleaner output
|
||||
for type_data in by_type.values():
|
||||
type_data["avg_duration_ms"] = round(type_data["avg_duration_ms"], 2)
|
||||
|
||||
return {
|
||||
"by_status": by_status,
|
||||
"by_type": by_type,
|
||||
}
|
||||
167
letsbe-orchestrator/app/services/local_bootstrap.py
Normal file
167
letsbe-orchestrator/app/services/local_bootstrap.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""
|
||||
Local bootstrap service for single-tenant mode.
|
||||
|
||||
Handles automatic tenant creation when LOCAL_MODE is enabled.
|
||||
Designed to be migration-safe: gracefully handles cases where
|
||||
database tables don't exist yet.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Optional
|
||||
from uuid import UUID
|
||||
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.exc import OperationalError, ProgrammingError
|
||||
|
||||
from app.config import settings
|
||||
from app.db import async_session_maker
|
||||
from app.models import Tenant
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LocalBootstrapService:
    """
    Service for bootstrapping local single-tenant mode.

    When LOCAL_MODE=true:
    - Waits for database migrations to complete
    - Creates or retrieves the local tenant
    - Makes tenant_id available for the meta endpoint

    When LOCAL_MODE=false:
    - Does nothing (multi-tenant mode unchanged)
    """

    # Class-level state for meta endpoint access
    _local_tenant_id: Optional[UUID] = None  # set once bootstrap succeeds
    _bootstrap_attempted: bool = False  # makes run() idempotent
    _bootstrap_error: Optional[str] = None  # last failure message, for diagnostics

    # Bootstrap configuration
    MAX_RETRIES = 30  # Max attempts waiting for migrations
    RETRY_DELAY_SECONDS = 2  # Delay between retries
    LOCAL_TENANT_NAME = "local"  # name of the auto-created tenant row
|
||||
|
||||
@classmethod
|
||||
def get_local_tenant_id(cls) -> Optional[UUID]:
|
||||
"""Get the local tenant ID if bootstrap succeeded."""
|
||||
return cls._local_tenant_id
|
||||
|
||||
@classmethod
|
||||
def get_bootstrap_status(cls) -> dict:
|
||||
"""Get bootstrap status for diagnostics."""
|
||||
return {
|
||||
"attempted": cls._bootstrap_attempted,
|
||||
"success": cls._local_tenant_id is not None,
|
||||
"tenant_id": str(cls._local_tenant_id) if cls._local_tenant_id else None,
|
||||
"error": cls._bootstrap_error,
|
||||
}
|
||||
|
||||
@classmethod
async def run(cls) -> None:
    """
    Run the bootstrap process.

    Only executes if LOCAL_MODE is enabled.
    Safe to call multiple times (idempotent).
    """
    if not settings.LOCAL_MODE:
        logger.debug("LOCAL_MODE is disabled, skipping bootstrap")
        return

    if cls._bootstrap_attempted:
        logger.debug("Bootstrap already attempted, skipping")
        return

    # Mark before doing any work so repeat calls bail out above.
    cls._bootstrap_attempted = True
    logger.info("LOCAL_MODE enabled, starting local tenant bootstrap")

    # Validate required settings
    if not settings.INSTANCE_ID:
        cls._bootstrap_error = "INSTANCE_ID is required when LOCAL_MODE is enabled"
        logger.error(cls._bootstrap_error)
        return

    try:
        await cls._bootstrap_with_retry()
    except Exception as e:
        # Record the failure instead of raising: application startup
        # must not crash because the local tenant could not be created.
        cls._bootstrap_error = str(e)
        logger.exception("Bootstrap failed with unexpected error")
|
||||
|
||||
@classmethod
async def _bootstrap_with_retry(cls) -> None:
    """
    Attempt bootstrap with retry logic for migration safety.

    Waits for the tenants table to exist before proceeding.
    """
    for attempt in range(1, cls.MAX_RETRIES + 1):
        try:
            await cls._ensure_local_tenant()
            logger.info(f"Local tenant bootstrap succeeded (tenant_id={cls._local_tenant_id})")
            return
        except (OperationalError, ProgrammingError) as e:
            # These errors typically indicate migrations haven't run yet
            error_msg = str(e).lower()
            # "does not exist" / "no such table" — presumably covering the
            # Postgres and SQLite missing-table wordings; verify against
            # the deployed backends.
            if "does not exist" in error_msg or "no such table" in error_msg:
                if attempt < cls.MAX_RETRIES:
                    logger.warning(
                        f"Database table not ready (attempt {attempt}/{cls.MAX_RETRIES}), "
                        f"retrying in {cls.RETRY_DELAY_SECONDS}s..."
                    )
                    await asyncio.sleep(cls.RETRY_DELAY_SECONDS)
                    continue
                else:
                    # Out of retries: record the failure and give up quietly.
                    cls._bootstrap_error = f"Migrations did not complete after {cls.MAX_RETRIES} attempts"
                    logger.error(cls._bootstrap_error)
                    return
            else:
                # Unexpected database error
                raise
|
||||
|
||||
@classmethod
async def _ensure_local_tenant(cls) -> None:
    """
    Ensure the local tenant exists.

    Creates it if missing, retrieves it if already exists.

    Raises:
        OperationalError / ProgrammingError: if the tenants table does
            not exist yet (migrations still pending) — handled by the
            retry loop in _bootstrap_with_retry.
    """
    async with async_session_maker() as session:
        # First, verify we can query the tenants table
        # This will fail fast if migrations haven't run
        await session.execute(text("SELECT 1 FROM tenants LIMIT 1"))

        # Check if local tenant exists
        result = await session.execute(
            select(Tenant).where(Tenant.name == cls.LOCAL_TENANT_NAME)
        )
        tenant = result.scalar_one_or_none()

        if tenant:
            logger.info(f"Local tenant already exists (id={tenant.id})")
            cls._local_tenant_id = tenant.id
            return

        # Create local tenant
        tenant = Tenant(
            name=cls.LOCAL_TENANT_NAME,
            domain=settings.LOCAL_TENANT_DOMAIN,
        )
        session.add(tenant)
        await session.commit()
        # Refresh to load DB-generated fields (e.g. the new id).
        await session.refresh(tenant)

        cls._local_tenant_id = tenant.id
        logger.info(f"Created local tenant (id={tenant.id}, domain={settings.LOCAL_TENANT_DOMAIN})")
|
||||
|
||||
@classmethod
async def _check_table_exists(cls, table_name: str) -> bool:
    """Return True if *table_name* can be queried (i.e. the table exists).

    Args:
        table_name: name of the table to probe; must be a plain
            identifier.

    Raises:
        ValueError: if table_name is not a valid identifier — the name
            is interpolated into raw SQL (table names cannot be bound
            parameters), so anything else would be an injection vector.
    """
    if not table_name.isidentifier():
        raise ValueError(f"invalid table name: {table_name!r}")

    async with async_session_maker() as session:
        try:
            # Cheap probe: fails with OperationalError/ProgrammingError
            # when the table is missing (migrations not run yet).
            await session.execute(text(f"SELECT 1 FROM {table_name} LIMIT 1"))
            return True
        except (OperationalError, ProgrammingError):
            return False
|
||||
Reference in New Issue
Block a user