Include full contents of all nested repositories
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
5
letsbe-orchestrator/app/services/__init__.py
Normal file
5
letsbe-orchestrator/app/services/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Service layer for the Orchestrator."""
|
||||
|
||||
from app.services.local_bootstrap import LocalBootstrapService
|
||||
|
||||
__all__ = ["LocalBootstrapService"]
|
||||
270
letsbe-orchestrator/app/services/hub_telemetry.py
Normal file
270
letsbe-orchestrator/app/services/hub_telemetry.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""Hub Telemetry Service - sends aggregated metrics to Hub.
|
||||
|
||||
This background service periodically collects metrics from the local database
|
||||
and sends them to the central Hub for license compliance and usage analytics.
|
||||
|
||||
Key design choices:
|
||||
- Since-last-send windowing (avoids double-counting)
|
||||
- SQL aggregates (never loads task objects into Python)
|
||||
- Reusable httpx.AsyncClient (single connection pool)
|
||||
- Jitter ±15% (prevents thundering herd)
|
||||
- Exponential backoff on errors (1s → 2s → 4s → ... → 60s max)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import func, select
|
||||
|
||||
from app.config import get_settings
|
||||
from app.db import async_session_maker
|
||||
from app.models.agent import Agent, AgentStatus
|
||||
from app.models.server import Server
|
||||
from app.models.task import Task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
class HubTelemetryService:
    """Background service that sends telemetry to Hub."""

    # All state is class-level: the service is a process-wide singleton
    # driven by the start()/stop() classmethods.
    _task: Optional[asyncio.Task] = None  # background loop task
    _shutdown_event: Optional[asyncio.Event] = None  # signals the loop to exit
    _start_time: Optional[datetime] = None  # service start, for uptime reporting
    _last_sent_at: Optional[datetime] = None  # start of the next telemetry window
    _client: Optional[httpx.AsyncClient] = None  # reused HTTP connection pool to Hub
    _consecutive_failures: int = 0  # drives exponential backoff in the loop
|
||||
|
||||
@classmethod
async def start(cls) -> None:
    """Start the telemetry background task. Never blocks startup."""
    # Each guard logs and returns instead of raising: telemetry must
    # never prevent the application itself from starting.
    if not settings.HUB_TELEMETRY_ENABLED:
        logger.info("hub_telemetry_disabled")
        return

    if not settings.HUB_URL:
        logger.warning("hub_telemetry_missing_hub_url")
        return

    if not settings.HUB_API_KEY:
        logger.warning("hub_telemetry_missing_hub_api_key")
        return

    if not settings.INSTANCE_ID:
        logger.warning("hub_telemetry_missing_instance_id")
        return

    now = datetime.now(timezone.utc)
    cls._start_time = now
    # Initialize window to (now - interval) so first send isn't empty
    cls._last_sent_at = now - timedelta(
        seconds=settings.HUB_TELEMETRY_INTERVAL_SECONDS
    )
    cls._shutdown_event = asyncio.Event()
    cls._consecutive_failures = 0
    # One AsyncClient for the lifetime of the service: a single shared
    # connection pool instead of a new client per send.
    cls._client = httpx.AsyncClient(timeout=30.0)
    cls._task = asyncio.create_task(cls._telemetry_loop())

    logger.info(
        "hub_telemetry_started",
        extra={
            "interval_seconds": settings.HUB_TELEMETRY_INTERVAL_SECONDS,
            "hub_url": settings.HUB_URL,
            "instance_id": settings.INSTANCE_ID,
        },
    )
|
||||
|
||||
@classmethod
async def stop(cls) -> None:
    """Stop the telemetry background task.

    Signals the loop to exit, waits up to 5s for a graceful finish,
    cancels it on timeout, then closes the shared HTTP client.
    """
    if cls._shutdown_event:
        cls._shutdown_event.set()

    if cls._task:
        try:
            # Give the loop a chance to notice the shutdown event.
            await asyncio.wait_for(cls._task, timeout=5.0)
        except asyncio.TimeoutError:
            # Loop did not exit in time — force-cancel and swallow
            # the resulting CancelledError.
            cls._task.cancel()
            try:
                await cls._task
            except asyncio.CancelledError:
                pass

    if cls._client:
        await cls._client.aclose()
        cls._client = None

    logger.info("hub_telemetry_stopped")
|
||||
|
||||
@classmethod
async def _telemetry_loop(cls) -> None:
    """Main telemetry loop with jitter and backoff.

    Runs until the shutdown event is set. Send failures are logged and
    never re-raised; they stretch the next interval via exponential
    backoff instead.
    """
    base_interval = settings.HUB_TELEMETRY_INTERVAL_SECONDS

    while not cls._shutdown_event.is_set():
        try:
            await cls._send_telemetry()
            cls._consecutive_failures = 0  # Reset on success
        except Exception as e:
            cls._consecutive_failures += 1
            logger.warning(
                "hub_telemetry_send_failed",
                extra={
                    "error": str(e),
                    "error_type": type(e).__name__,
                    "consecutive_failures": cls._consecutive_failures,
                },
            )

        # Calculate interval: base ± 15% jitter, with backoff on failures
        jitter = random.uniform(-0.15, 0.15) * base_interval
        # Extra 2s, 4s, 8s, ... capped at 60s while failures persist.
        backoff = (
            min(2**cls._consecutive_failures, 60)
            if cls._consecutive_failures
            else 0
        )
        interval = base_interval + jitter + backoff

        try:
            # Interruptible sleep: wakes immediately when shutdown is set.
            await asyncio.wait_for(
                cls._shutdown_event.wait(), timeout=interval
            )
            break  # Shutdown requested
        except asyncio.TimeoutError:
            pass  # Normal timeout, continue loop
|
||||
|
||||
@classmethod
async def _send_telemetry(cls) -> None:
    """Collect and send one telemetry payload to Hub.

    Raises on any collection/HTTP error; the caller (_telemetry_loop)
    handles logging and backoff.
    """
    # Window is [last successful send, now) — advancing it only after a
    # successful POST is what prevents double-counting.
    window_start = cls._last_sent_at
    window_end = datetime.now(timezone.utc)

    payload = await cls._collect_metrics(window_start, window_end)

    response = await cls._client.post(
        f"{settings.HUB_URL}/api/v1/instances/{settings.INSTANCE_ID}/telemetry",
        json=payload,
        headers={"X-Hub-Api-Key": settings.HUB_API_KEY},
    )
    response.raise_for_status()

    # Only update window on success
    cls._last_sent_at = window_end

    logger.debug(
        "hub_telemetry_sent",
        extra={
            "window_seconds": (window_end - window_start).total_seconds(),
            "status_code": response.status_code,
        },
    )
|
||||
|
||||
@classmethod
async def _collect_metrics(
    cls, window_start: datetime, window_end: datetime
) -> dict[str, Any]:
    """Collect metrics using SQL aggregates (never load objects).

    Returns the JSON-serializable telemetry payload for the
    [window_start, window_end) interval.
    """
    async with async_session_maker() as db:
        # Agent counts by status (all agents, not windowed)
        agent_result = await db.execute(
            select(Agent.status, func.count(Agent.id).label("count")).group_by(
                Agent.status
            )
        )
        agent_rows = agent_result.all()

        # Task counts by status and type (windowed by updated_at)
        # Duration approximated as (updated_at - created_at) for completed/failed tasks
        task_result = await db.execute(
            select(
                Task.status,
                Task.type,
                func.count(Task.id).label("count"),
                func.avg(
                    # epoch seconds -> milliseconds
                    func.extract("epoch", Task.updated_at - Task.created_at) * 1000
                ).label("avg_duration_ms"),
            )
            .where(Task.updated_at.between(window_start, window_end))
            .group_by(Task.status, Task.type)
        )
        task_rows = task_result.all()

        # Server count (simple count, not windowed)
        server_count = await db.scalar(select(func.count(Server.id)))

        return {
            "instance_id": str(settings.INSTANCE_ID),
            "window_start": window_start.isoformat(),
            "window_end": window_end.isoformat(),
            "uptime_seconds": int((window_end - cls._start_time).total_seconds()),
            "metrics": {
                "agents": cls._format_agent_counts(agent_rows),
                "tasks": cls._format_task_counts(task_rows),
                "servers": {"total_count": server_count or 0},
            },
        }
|
||||
|
||||
@classmethod
def _format_agent_counts(cls, rows: list) -> dict[str, int]:
    """Build the agents section of the payload from GROUP BY status rows."""
    online = 0
    offline = 0
    total = 0

    for row in rows:
        total += row.count
        # ONLINE/OFFLINE get dedicated buckets; any other status
        # (e.g. INVALID) contributes to the total only.
        if row.status == AgentStatus.ONLINE:
            online = row.count
        elif row.status == AgentStatus.OFFLINE:
            offline = row.count

    return {
        "online_count": online,
        "offline_count": offline,
        "total_count": total,
    }
|
||||
|
||||
@classmethod
|
||||
def _format_task_counts(cls, rows: list) -> dict[str, Any]:
|
||||
"""Format task count rows into response structure."""
|
||||
by_status: dict[str, int] = {}
|
||||
by_type: dict[str, dict[str, Any]] = {}
|
||||
|
||||
for row in rows:
|
||||
status_str = row.status.value if hasattr(row.status, "value") else str(row.status)
|
||||
type_str = row.type.value if hasattr(row.type, "value") else str(row.type)
|
||||
count = row.count
|
||||
avg_duration_ms = row.avg_duration_ms
|
||||
|
||||
# Aggregate by status
|
||||
by_status[status_str] = by_status.get(status_str, 0) + count
|
||||
|
||||
# Aggregate by type
|
||||
if type_str not in by_type:
|
||||
by_type[type_str] = {"count": 0, "avg_duration_ms": 0}
|
||||
|
||||
# Weighted average for duration when merging
|
||||
existing = by_type[type_str]
|
||||
total_count = existing["count"] + count
|
||||
if total_count > 0 and avg_duration_ms is not None:
|
||||
existing_weighted = existing["avg_duration_ms"] * existing["count"]
|
||||
new_weighted = avg_duration_ms * count
|
||||
by_type[type_str]["avg_duration_ms"] = (
|
||||
existing_weighted + new_weighted
|
||||
) / total_count
|
||||
by_type[type_str]["count"] = total_count
|
||||
|
||||
# Round durations for cleaner output
|
||||
for type_data in by_type.values():
|
||||
type_data["avg_duration_ms"] = round(type_data["avg_duration_ms"], 2)
|
||||
|
||||
return {
|
||||
"by_status": by_status,
|
||||
"by_type": by_type,
|
||||
}
|
||||
167
letsbe-orchestrator/app/services/local_bootstrap.py
Normal file
167
letsbe-orchestrator/app/services/local_bootstrap.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""
|
||||
Local bootstrap service for single-tenant mode.
|
||||
|
||||
Handles automatic tenant creation when LOCAL_MODE is enabled.
|
||||
Designed to be migration-safe: gracefully handles cases where
|
||||
database tables don't exist yet.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Optional
|
||||
from uuid import UUID
|
||||
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.exc import OperationalError, ProgrammingError
|
||||
|
||||
from app.config import settings
|
||||
from app.db import async_session_maker
|
||||
from app.models import Tenant
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LocalBootstrapService:
    """
    Service for bootstrapping local single-tenant mode.

    When LOCAL_MODE=true:
    - Waits for database migrations to complete
    - Creates or retrieves the local tenant
    - Makes tenant_id available for the meta endpoint

    When LOCAL_MODE=false:
    - Does nothing (multi-tenant mode unchanged)
    """

    # Class-level state for meta endpoint access
    _local_tenant_id: Optional[UUID] = None  # set once bootstrap succeeds
    _bootstrap_attempted: bool = False  # makes run() idempotent
    _bootstrap_error: Optional[str] = None  # last failure message, for diagnostics

    # Bootstrap configuration
    MAX_RETRIES = 30  # Max attempts waiting for migrations
    RETRY_DELAY_SECONDS = 2  # Delay between retries
    LOCAL_TENANT_NAME = "local"  # name of the auto-created tenant row
|
||||
|
||||
@classmethod
|
||||
def get_local_tenant_id(cls) -> Optional[UUID]:
|
||||
"""Get the local tenant ID if bootstrap succeeded."""
|
||||
return cls._local_tenant_id
|
||||
|
||||
@classmethod
|
||||
def get_bootstrap_status(cls) -> dict:
|
||||
"""Get bootstrap status for diagnostics."""
|
||||
return {
|
||||
"attempted": cls._bootstrap_attempted,
|
||||
"success": cls._local_tenant_id is not None,
|
||||
"tenant_id": str(cls._local_tenant_id) if cls._local_tenant_id else None,
|
||||
"error": cls._bootstrap_error,
|
||||
}
|
||||
|
||||
@classmethod
async def run(cls) -> None:
    """
    Run the bootstrap process.

    Only executes if LOCAL_MODE is enabled.
    Safe to call multiple times (idempotent).
    """
    if not settings.LOCAL_MODE:
        logger.debug("LOCAL_MODE is disabled, skipping bootstrap")
        return

    if cls._bootstrap_attempted:
        logger.debug("Bootstrap already attempted, skipping")
        return

    # Mark before doing any work so repeat calls bail out above.
    cls._bootstrap_attempted = True
    logger.info("LOCAL_MODE enabled, starting local tenant bootstrap")

    # Validate required settings
    if not settings.INSTANCE_ID:
        cls._bootstrap_error = "INSTANCE_ID is required when LOCAL_MODE is enabled"
        logger.error(cls._bootstrap_error)
        return

    try:
        await cls._bootstrap_with_retry()
    except Exception as e:
        # Record the failure instead of raising: application startup
        # must not crash because the local tenant could not be created.
        cls._bootstrap_error = str(e)
        logger.exception("Bootstrap failed with unexpected error")
|
||||
|
||||
@classmethod
async def _bootstrap_with_retry(cls) -> None:
    """
    Attempt bootstrap with retry logic for migration safety.

    Waits for the tenants table to exist before proceeding.
    """
    for attempt in range(1, cls.MAX_RETRIES + 1):
        try:
            await cls._ensure_local_tenant()
            logger.info(f"Local tenant bootstrap succeeded (tenant_id={cls._local_tenant_id})")
            return
        except (OperationalError, ProgrammingError) as e:
            # These errors typically indicate migrations haven't run yet
            error_msg = str(e).lower()
            # "does not exist" / "no such table" — presumably covering the
            # Postgres and SQLite missing-table wordings; verify against
            # the deployed backends.
            if "does not exist" in error_msg or "no such table" in error_msg:
                if attempt < cls.MAX_RETRIES:
                    logger.warning(
                        f"Database table not ready (attempt {attempt}/{cls.MAX_RETRIES}), "
                        f"retrying in {cls.RETRY_DELAY_SECONDS}s..."
                    )
                    await asyncio.sleep(cls.RETRY_DELAY_SECONDS)
                    continue
                else:
                    # Out of retries: record the failure and give up quietly.
                    cls._bootstrap_error = f"Migrations did not complete after {cls.MAX_RETRIES} attempts"
                    logger.error(cls._bootstrap_error)
                    return
            else:
                # Unexpected database error
                raise
|
||||
|
||||
@classmethod
async def _ensure_local_tenant(cls) -> None:
    """
    Ensure the local tenant exists.

    Creates it if missing, retrieves it if already exists.

    Raises:
        OperationalError / ProgrammingError: if the tenants table does
            not exist yet (migrations still pending) — handled by the
            retry loop in _bootstrap_with_retry.
    """
    async with async_session_maker() as session:
        # First, verify we can query the tenants table
        # This will fail fast if migrations haven't run
        await session.execute(text("SELECT 1 FROM tenants LIMIT 1"))

        # Check if local tenant exists
        result = await session.execute(
            select(Tenant).where(Tenant.name == cls.LOCAL_TENANT_NAME)
        )
        tenant = result.scalar_one_or_none()

        if tenant:
            logger.info(f"Local tenant already exists (id={tenant.id})")
            cls._local_tenant_id = tenant.id
            return

        # Create local tenant
        tenant = Tenant(
            name=cls.LOCAL_TENANT_NAME,
            domain=settings.LOCAL_TENANT_DOMAIN,
        )
        session.add(tenant)
        await session.commit()
        # Refresh to load DB-generated fields (e.g. the new id).
        await session.refresh(tenant)

        cls._local_tenant_id = tenant.id
        logger.info(f"Created local tenant (id={tenant.id}, domain={settings.LOCAL_TENANT_DOMAIN})")
|
||||
|
||||
@classmethod
async def _check_table_exists(cls, table_name: str) -> bool:
    """Return True if *table_name* can be queried (i.e. the table exists).

    Args:
        table_name: name of the table to probe; must be a plain
            identifier.

    Raises:
        ValueError: if table_name is not a valid identifier — the name
            is interpolated into raw SQL (table names cannot be bound
            parameters), so anything else would be an injection vector.
    """
    if not table_name.isidentifier():
        raise ValueError(f"invalid table name: {table_name!r}")

    async with async_session_maker() as session:
        try:
            # Cheap probe: fails with OperationalError/ProgrammingError
            # when the table is missing (migrations not run yet).
            await session.execute(text(f"SELECT 1 FROM {table_name} LIMIT 1"))
            return True
        except (OperationalError, ProgrammingError):
            return False
|
||||
Reference in New Issue
Block a user