From e4f935de071252d30c39b355f2c65f7d1f05cdbd Mon Sep 17 00:00:00 2001
From: yumoqing <yumoqing@gmail.com>
Date: Thu, 7 May 2026 11:43:10 +0800
Subject: [PATCH] fix: add retry, rate limiting, and logging to LLM API

- Add exponential backoff retry (3 attempts) for transient failures
- Add 429 rate limit handling with Retry-After header support
- Add 500 server error retry with backoff
- Add timeout retry handling (300s per attempt)
- Add structured logging via appPublic.log (info/warning/error)
- Log request params (model, stream, message count) on entry
- Log response timing and token usage on completion
- Log error details on failure

All HIGH/MEDIUM severity LLM API issues resolved.
---
 harnessed_agent/llm_api.py | 113 +++++++++++++++++++++++++++++++------
 1 file changed, 97 insertions(+), 16 deletions(-)

diff --git a/harnessed_agent/llm_api.py b/harnessed_agent/llm_api.py
index c69f253..b2a0576 100644
--- a/harnessed_agent/llm_api.py
+++ b/harnessed_agent/llm_api.py
@@ -20,6 +20,7 @@ try:
     from ahserver.serverenv import ServerEnv
     from sqlor.dbpools import DBPools
     from appPublic.worker import awaitify
+    from appPublic.log import info, debug, warning, error, exception
 except ImportError:
     class ServerEnv:
         pass
@@ -27,6 +28,11 @@ except ImportError:
         pass
     def awaitify(f):
         return f
+    def info(*a, **kw): print(*a)
+    def debug(*a, **kw): pass
+    def warning(*a, **kw): print(*a)
+    def error(*a, **kw): print(*a)
+    def exception(*a, **kw): pass
 
 
 def _now_iso():
@@ -137,23 +143,86 @@ async def _call_llm_api(
 
 
 async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, Any]:
-    """Make a non-streaming LLM API call."""
-    async with ClientSession() as session:
-        async with session.post(url, headers=headers, json=body,
-                                timeout=ClientTimeout(total=300)) as resp:
-            if resp.status != 200:
-                error_text = await resp.text()
-                return {
-                    'error': {
-                        'message': f'LLM API error: HTTP {resp.status}',
-                        'type': 'api_error',
-                        'code': resp.status,
-                        'detail': error_text[:500],
-                    }
-                }
+    """Make a non-streaming LLM API call, retrying HTTP 429/500, timeouts and connection errors with backoff."""
+    max_retries = 3
+    base_delay = 2  # seconds
 
-            data = await resp.json()
-            return data
+    for attempt in range(max_retries):
+        try:
+            async with ClientSession() as session:
+                async with session.post(url, headers=headers, json=body,
+                                        timeout=ClientTimeout(total=300)) as resp:
+                    if resp.status == 429:
+                        # Retry-After may be delta-seconds or an HTTP-date; only the numeric form is
+                        # honored here, anything else falls back to exponential backoff (no ValueError).
+                        header_value = resp.headers.get('Retry-After', '').strip()
+                        retry_after = int(header_value) if header_value.isdecimal() else base_delay * (2 ** attempt)
+                        if attempt < max_retries - 1:
+                            warning(f"LLM API rate limited (429), retrying after {retry_after}s (attempt {attempt+1}/{max_retries})")
+                            await asyncio.sleep(retry_after)
+                            continue
+                        return {
+                            'error': {
+                                'message': 'Rate limited by LLM provider. Please retry later.',
+                                'type': 'rate_limit_error',
+                                'code': 429,
+                            }
+                        }
+
+                    if resp.status == 500 and attempt < max_retries - 1:
+                        # Transient server error - retry
+                        delay = base_delay * (2 ** attempt)
+                        warning(f"LLM API server error (500), retrying after {delay}s (attempt {attempt+1}/{max_retries})")
+                        await asyncio.sleep(delay)
+                        continue
+
+                    if resp.status != 200:
+                        error_text = await resp.text()
+                        return {
+                            'error': {
+                                'message': f'LLM API error: HTTP {resp.status}',
+                                'type': 'api_error',
+                                'code': resp.status,
+                                'detail': error_text[:500],
+                            }
+                        }
+
+                    return await resp.json()
+
+        except asyncio.TimeoutError:
+            if attempt < max_retries - 1:
+                delay = base_delay * (2 ** attempt)
+                warning(f"LLM API timeout, retrying after {delay}s (attempt {attempt+1}/{max_retries})")
+                await asyncio.sleep(delay)
+                continue
+            return {
+                'error': {
+                    'message': 'LLM API request timed out after 300s',
+                    'type': 'timeout_error',
+                    'code': 504,
+                }
+            }
+        except ClientError as e:
+            if attempt < max_retries - 1:
+                delay = base_delay * (2 ** attempt)
+                warning(f"LLM API connection error: {e}, retrying after {delay}s")
+                await asyncio.sleep(delay)
+                continue
+            return {
+                'error': {
+                    'message': f'LLM API connection failed: {str(e)}',
+                    'type': 'connection_error',
+                    'code': 502,
+                }
+            }
+
+    return {
+        'error': {
+            'message': 'LLM API request failed after all retries',
+            'type': 'server_error',
+            'code': 500,
+        }
+    }
 
 
 async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]:
@@ -208,9 +277,12 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
     stream = body.get('stream', False)
     top_p = body.get('top_p', 1.0)
 
+    start_time = time.time()
+
     # Get LLM service config
     config = await _async_get_llm_config()
     if not config:
+        error("LLM service not configured: no harnessed_agent_config found")
         return {
             'error': {
                 'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.',
@@ -223,6 +295,7 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
     api_key = config.get('llm_api_key') or config.get('api_key')
 
     if not service_url or not api_key:
+        error(f"LLM service misconfigured: service_url={'set' if service_url else 'MISSING'}, api_key={'set' if api_key else 'MISSING'}")
         return {
             'error': {
                 'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.',
@@ -236,6 +309,8 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
     if not model or model == 'default':
         model = default_model
 
+    info(f"LLM chat request: model={model}, stream={stream}, messages={len(messages)}, temperature={temperature}")
+
     # Pass through extra params
     extra_params = {}
     for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
@@ -301,6 +376,12 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
             top_p=top_p,
             **extra_params,
         )
+        elapsed = time.time() - start_time
+        if 'error' in result:
+            error(f"LLM chat error: model={model}, elapsed={elapsed:.2f}s, error={result.get('error',{}).get('message','')}")
+        else:
+            usage = result.get('usage', {})
+            info(f"LLM chat done: model={model}, elapsed={elapsed:.2f}s, tokens_in={usage.get('prompt_tokens','?')}, tokens_out={usage.get('completion_tokens','?')}")
         return result