"""
Operations phase handlers: monitor, incident_response
"""
import json
import logging
import asyncio
import time

logger = logging.getLogger(__name__)


async def handle_monitor(tenant_id, task_id, step_name, input_data, config):
    """Run one round of health checks against the deployed application.

    The target host is read from
    ``input_data["deploy_env_collect"]["output"]["production_env"]["host"]``
    (falling back to ``"localhost"``) and the port from
    ``config["production_app_port"]`` (default 80). Every path listed in
    ``config["monitor_endpoints"]`` (default ``["/health", "/index.ui"]``)
    is probed with an HTTP GET.

    Args:
        tenant_id: Not used by this handler.
        task_id: Not used by this handler.
        step_name: Not used by this handler.
        input_data: Mapping of earlier step names to their results.
        config: Mapping of handler settings; reads ``production_app_port``,
            ``monitor_endpoints`` and ``response_threshold_ms``
            (default 2000).

    Returns:
        A dict with ``health_status`` (``"healthy"``, ``"degraded"`` or
        ``"down"``), the per-endpoint ``checks`` in the same order as the
        configured endpoints, ``avg_response_ms``, the ``alerts`` list,
        ``alert_count`` and a wall-clock ``timestamp``. An empty endpoint
        list yields ``"healthy"`` with no checks.
    """
    # A key may be present with an explicit None value, in which case
    # ``.get(key, {})`` would hand back None and the next ``.get`` would
    # raise; ``or {}`` treats missing and None alike.
    deploy_step = input_data.get("deploy_env_collect") or {}
    env_output = deploy_step.get("output") or {}
    prod_env = env_output.get("production_env") or {}
    host = prod_env.get("host") or "localhost"
    port = config.get("production_app_port", 80)
    base_url = f"http://{host}:{port}"

    # Probe all endpoints concurrently so the total wait is bounded by a
    # single probe timeout rather than one timeout per endpoint.
    # gather() returns results in argument order.
    endpoints = config.get("monitor_endpoints", ["/health", "/index.ui"])
    checks = list(
        await asyncio.gather(
            *(_monitor_endpoint(f"{base_url}{ep}", ep) for ep in endpoints)
        )
    )

    # Evaluate health: no failures -> healthy, some -> degraded, all -> down.
    failed = [c for c in checks if c.get("status") != "ok"]
    if not failed:
        health_status = "healthy"
    elif len(failed) < len(checks):
        health_status = "degraded"
    else:
        health_status = "down"

    # Compare against None so a legitimate 0 ms sample still counts
    # towards the average.
    response_times = [
        c["response_ms"] for c in checks if c.get("response_ms") is not None
    ]
    avg_response = (
        sum(response_times) / len(response_times) if response_times else 0.0
    )

    alerts = []
    if health_status != "healthy":
        failed_endpoints = [c["endpoint"] for c in failed]
        alerts.append({
            "level": "critical" if health_status == "down" else "warning",
            "message": (
                f"Application health: {health_status}. "
                f"Failed checks: {failed_endpoints}"
            ),
        })
    if avg_response > config.get("response_threshold_ms", 2000):
        alerts.append({
            "level": "warning",
            "message": (
                f"Average response time {avg_response:.0f}ms exceeds threshold"
            ),
        })

    return {
        "health_status": health_status,
        "checks": checks,
        "avg_response_ms": round(avg_response, 2),
        "alerts": alerts,
        "alert_count": len(alerts),
        "timestamp": time.time(),
    }


async def _monitor_endpoint(url, endpoint_name):
    """Probe a single endpoint with an HTTP GET and report the outcome.

    Args:
        url: Full URL to request.
        endpoint_name: Label stored under ``"endpoint"`` in the result.

    Returns:
        A dict with ``endpoint``, ``url``, ``status`` (``"ok"`` for HTTP
        status codes below 400, otherwise ``"fail"``) and ``response_ms``.
        Completed requests also carry ``status_code``; requests that raised
        carry ``error`` (at most 200 characters) instead.
    """
    # Function-local import: aiohttp is only needed once a probe runs.
    import aiohttp

    # perf_counter() is monotonic, so the measured duration cannot be
    # distorted by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    try:
        timeout = aiohttp.ClientTimeout(total=15)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as resp:
                elapsed_ms = int((time.perf_counter() - start) * 1000)
                return {
                    "endpoint": endpoint_name,
                    "url": url,
                    "status_code": resp.status,
                    "status": "ok" if resp.status < 400 else "fail",
                    "response_ms": elapsed_ms,
                }
    except Exception as exc:
        # Deliberately broad: any failure to obtain a response is reported
        # as a failed check rather than aborting the whole monitoring run.
        elapsed_ms = int((time.perf_counter() - start) * 1000)
        # Some exceptions (timeouts in particular) stringify to "", so fall
        # back to the exception class name to keep the error informative.
        detail = str(exc) or type(exc).__name__
        return {
            "endpoint": endpoint_name,
            "url": url,
            "status": "fail",
            "error": detail[:200],
            "response_ms": elapsed_ms,
        }