84 lines
2.8 KiB
Python
84 lines
2.8 KiB
Python
"""
Operations phase handlers: monitor, incident_response
"""
|
|
import json
|
|
import logging
|
|
import asyncio
|
|
import time
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def handle_monitor(tenant_id, task_id, step_name, input_data, config):
    """Probe the deployed application's endpoints and summarize its health.

    Args:
        tenant_id: Not used by this handler.
        task_id: Not used by this handler.
        step_name: Not used by this handler.
        input_data: Mapping of earlier step results. The target host is read
            from ``deploy_env_collect -> output -> production_env -> host``;
            any missing or ``None`` level falls back to ``"localhost"``.
        config: Mapping of settings. Reads ``production_app_port``
            (default 80), ``monitor_endpoints`` (default
            ``["/health", "/index.ui"]``) and ``response_threshold_ms``
            (default 2000).

    Returns:
        A dict with ``health_status`` ("healthy", "degraded" or "down"),
        ``checks`` (one result dict per endpoint, in configured order),
        ``avg_response_ms``, ``alerts``, ``alert_count`` and ``timestamp``
        (``time.time()`` at completion).

    Note:
        With an empty endpoint list nothing is probed and the status is
        reported as "healthy", because no check failed.
    """
    # Tolerate None at any level of the nested step output, not only absence.
    env_output = (input_data.get("deploy_env_collect") or {}).get("output") or {}
    prod_env = env_output.get("production_env") or {}

    host = prod_env.get("host", "localhost")
    port = config.get("production_app_port", 80)
    base_url = f"http://{host}:{port}"

    # Run health checks concurrently; gather() keeps the results in the same
    # order as the configured endpoints.
    endpoints = config.get("monitor_endpoints", ["/health", "/index.ui"])
    checks = list(
        await asyncio.gather(
            *(_monitor_endpoint(f"{base_url}{ep}", ep) for ep in endpoints)
        )
    )

    # Evaluate health: none failed -> healthy, some failed -> degraded,
    # all failed -> down.
    failed = [c for c in checks if c.get("status") != "ok"]
    if not failed:
        health_status = "healthy"
    elif len(failed) < len(checks):
        health_status = "degraded"
    else:
        health_status = "down"

    # Check thresholds. A measured 0 ms is a valid sample and must count
    # toward the average; only checks without a measurement are skipped.
    response_times = [
        c["response_ms"] for c in checks if c.get("response_ms") is not None
    ]
    avg_response = sum(response_times) / max(len(response_times), 1)

    alerts = []
    if health_status != "healthy":
        alerts.append({
            "level": "critical" if health_status == "down" else "warning",
            "message": f"Application health: {health_status}. Failed checks: {[c['endpoint'] for c in failed]}",
        })
    if avg_response > config.get("response_threshold_ms", 2000):
        alerts.append({
            "level": "warning",
            "message": f"Average response time {avg_response:.0f}ms exceeds threshold",
        })

    return {
        "health_status": health_status,
        "checks": checks,
        "avg_response_ms": round(avg_response, 2),
        "alerts": alerts,
        "alert_count": len(alerts),
        "timestamp": time.time(),
    }
|
|
|
|
|
|
async def _monitor_endpoint(url, endpoint_name):
    """Issue a single GET request to ``url`` and report the outcome.

    Args:
        url: Full URL to request.
        endpoint_name: Label stored under ``"endpoint"`` in the result.

    Returns:
        When a response is received: a dict with ``endpoint``, ``url``,
        ``status_code``, ``status`` ("ok" for status codes below 400,
        otherwise "fail") and ``response_ms``.
        When any exception is raised: a dict with ``endpoint``, ``url``,
        ``status`` set to "fail", ``error`` (at most 200 characters) and
        ``response_ms``.
    """
    import aiohttp

    # Use the monotonic clock for durations: unlike time.time(), it cannot
    # jump when the system clock is adjusted.
    start = time.monotonic()
    try:
        timeout = aiohttp.ClientTimeout(total=15)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as resp:
                # Measured once the response object is available; the body
                # is not read here.
                elapsed_ms = int((time.monotonic() - start) * 1000)
                return {
                    "endpoint": endpoint_name,
                    "url": url,
                    "status_code": resp.status,
                    "status": "ok" if resp.status < 400 else "fail",
                    "response_ms": elapsed_ms,
                }
    except Exception as e:
        elapsed_ms = int((time.monotonic() - start) * 1000)
        # Exceptions raised without a message (e.g. a bare
        # asyncio.TimeoutError()) stringify to ""; fall back to the class
        # name so the failure reason is never blank.
        return {
            "endpoint": endpoint_name,
            "url": url,
            "status": "fail",
            "error": (str(e) or type(e).__name__)[:200],
            "response_ms": elapsed_ms,
        }
|