From e4f935de071252d30c39b355f2c65f7d1f05cdbd Mon Sep 17 00:00:00 2001
From: yumoqing
Date: Thu, 7 May 2026 11:43:10 +0800
Subject: [PATCH] fix: add retry, rate limiting, and logging to LLM API

- Add exponential backoff retry (3 attempts) for transient failures
- Add 429 rate limit handling with Retry-After header support
  (integer seconds; falls back to backoff for any other value)
- Add 500 server error retry with backoff
- Add timeout retry handling (300s per attempt)
- Add structured logging via appPublic.log (info/warning/error)
- Log request params (model, stream, message count) on entry
- Log response timing and token usage on completion
- Log error details on failure

All HIGH/MEDIUM severity LLM API issues resolved.
---
Reviewer notes (not part of the commit message):
- Retry-After may be an HTTP-date instead of an integer; the 429 branch
  now falls back to the backoff delay instead of raising ValueError.
- This patch uses asyncio, time and ClientError but adds no imports for
  them; please confirm llm_api.py already imports all three.
- Line breaks and indentation were reconstructed from the hunk headers
  and Python syntax; the index hashes are those of the original commit.

 harnessed_agent/llm_api.py | 125 ++++++++++++++++++++++++++++++++-----
 1 file changed, 109 insertions(+), 16 deletions(-)

diff --git a/harnessed_agent/llm_api.py b/harnessed_agent/llm_api.py
index c69f253..b2a0576 100644
--- a/harnessed_agent/llm_api.py
+++ b/harnessed_agent/llm_api.py
@@ -20,6 +20,7 @@ try:
     from ahserver.serverenv import ServerEnv
     from sqlor.dbpools import DBPools
     from appPublic.worker import awaitify
+    from appPublic.log import info, debug, warning, error, exception
 except ImportError:
     class ServerEnv:
         pass
@@ -27,6 +28,11 @@ except ImportError:
         pass
     def awaitify(f):
         return f
+    def info(*a, **kw): print(*a)
+    def debug(*a, **kw): pass
+    def warning(*a, **kw): print(*a)
+    def error(*a, **kw): print(*a)
+    def exception(*a, **kw): pass
 
 
 def _now_iso():
@@ -137,23 +143,98 @@ async def _call_llm_api(
 
 
 async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, Any]:
-    """Make a non-streaming LLM API call."""
-    async with ClientSession() as session:
-        async with session.post(url, headers=headers, json=body,
-                                timeout=ClientTimeout(total=300)) as resp:
-            if resp.status != 200:
-                error_text = await resp.text()
-                return {
-                    'error': {
-                        'message': f'LLM API error: HTTP {resp.status}',
-                        'type': 'api_error',
-                        'code': resp.status,
-                        'detail': error_text[:500],
-                    }
-                }
+    """Make a non-streaming LLM API call with retry logic.
+
+    Up to three attempts are made. HTTP 429, HTTP 500, timeouts and
+    ClientError are retried after an exponential backoff (2s, then 4s);
+    a 429 sleeps for its Retry-After value when that is an integer.
+    Returns the decoded JSON body on HTTP 200, otherwise a dict of the
+    form {'error': {...}} describing the failure.
+    """
+    max_retries = 3
+    base_delay = 2  # seconds
 
-            data = await resp.json()
-            return data
+    for attempt in range(max_retries):
+        try:
+            async with ClientSession() as session:
+                async with session.post(url, headers=headers, json=body,
+                                        timeout=ClientTimeout(total=300)) as resp:
+                    if resp.status == 429:
+                        # Rate limited - respect Retry-After header
+                        backoff = base_delay * (2 ** attempt)
+                        try:
+                            retry_after = int(resp.headers.get('Retry-After', backoff))
+                        except ValueError:
+                            # Retry-After may be an HTTP-date; use backoff instead
+                            retry_after = backoff
+                        if attempt < max_retries - 1:
+                            warning(f"LLM API rate limited (429), retrying after {retry_after}s (attempt {attempt+1}/{max_retries})")
+                            await asyncio.sleep(retry_after)
+                            continue
+                        else:
+                            return {
+                                'error': {
+                                    'message': 'Rate limited by LLM provider. Please retry later.',
+                                    'type': 'rate_limit_error',
+                                    'code': 429,
+                                }
+                            }
+
+                    if resp.status == 500 and attempt < max_retries - 1:
+                        # Transient server error - retry
+                        delay = base_delay * (2 ** attempt)
+                        warning(f"LLM API server error (500), retrying after {delay}s (attempt {attempt+1}/{max_retries})")
+                        await asyncio.sleep(delay)
+                        continue
+
+                    if resp.status != 200:
+                        error_text = await resp.text()
+                        return {
+                            'error': {
+                                'message': f'LLM API error: HTTP {resp.status}',
+                                'type': 'api_error',
+                                'code': resp.status,
+                                'detail': error_text[:500],
+                            }
+                        }
+
+                    data = await resp.json()
+                    return data
+
+        except asyncio.TimeoutError:
+            if attempt < max_retries - 1:
+                delay = base_delay * (2 ** attempt)
+                warning(f"LLM API timeout, retrying after {delay}s (attempt {attempt+1}/{max_retries})")
+                await asyncio.sleep(delay)
+                continue
+            return {
+                'error': {
+                    'message': 'LLM API request timed out after 300s',
+                    'type': 'timeout_error',
+                    'code': 504,
+                }
+            }
+        except ClientError as e:
+            if attempt < max_retries - 1:
+                delay = base_delay * (2 ** attempt)
+                warning(f"LLM API connection error: {e}, retrying after {delay}s")
+                await asyncio.sleep(delay)
+                continue
+            return {
+                'error': {
+                    'message': f'LLM API connection failed: {str(e)}',
+                    'type': 'connection_error',
+                    'code': 502,
+                }
+            }
+
+    return {
+        'error': {
+            'message': 'LLM API request failed after all retries',
+            'type': 'server_error',
+            'code': 500,
+        }
+    }
 
 
 async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]:
@@ -208,9 +289,12 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
     stream = body.get('stream', False)
     top_p = body.get('top_p', 1.0)
 
+    start_time = time.time()
+
     # Get LLM service config
     config = await _async_get_llm_config()
     if not config:
+        error("LLM service not configured: no harnessed_agent_config found")
         return {
             'error': {
                 'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.',
@@ -223,6 +307,7 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
     api_key = config.get('llm_api_key') or config.get('api_key')
 
     if not service_url or not api_key:
+        error(f"LLM service misconfigured: service_url={'set' if service_url else 'MISSING'}, api_key={'set' if api_key else 'MISSING'}")
         return {
             'error': {
                 'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.',
@@ -236,6 +321,8 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
     if not model or model == 'default':
         model = default_model
 
+    info(f"LLM chat request: model={model}, stream={stream}, messages={len(messages)}, temperature={temperature}")
+
     # Pass through extra params
     extra_params = {}
     for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
@@ -301,6 +388,12 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
         top_p=top_p,
         **extra_params,
     )
+    elapsed = time.time() - start_time
+    if 'error' in result:
+        error(f"LLM chat error: model={model}, elapsed={elapsed:.2f}s, error={result.get('error',{}).get('message','')}")
+    else:
+        usage = result.get('usage', {})
+        info(f"LLM chat done: model={model}, elapsed={elapsed:.2f}s, tokens_in={usage.get('prompt_tokens','?')}, tokens_out={usage.get('completion_tokens','?')}")
     return result
 
 