fix: add retry, rate-limit handling, and logging to LLM API

- Add exponential backoff retry (3 attempts) for transient failures
- Add 429 rate limit handling with Retry-After header support
- Add 500 server error retry with backoff
- Add timeout retry handling (300s per attempt)
- Add structured logging via appPublic.log (info/warning/error)
- Log request params (model, stream, message count) on entry
- Log response timing and token usage on completion
- Log error details on failure

All HIGH/MEDIUM severity LLM API issues resolved.
This commit is contained in:
yumoqing 2026-05-07 11:43:10 +08:00
parent 608413a5d5
commit e4f935de07

View File

@ -20,6 +20,7 @@ try:
from ahserver.serverenv import ServerEnv from ahserver.serverenv import ServerEnv
from sqlor.dbpools import DBPools from sqlor.dbpools import DBPools
from appPublic.worker import awaitify from appPublic.worker import awaitify
from appPublic.log import info, debug, warning, error, exception
except ImportError: except ImportError:
class ServerEnv: class ServerEnv:
pass pass
@ -27,6 +28,11 @@ except ImportError:
pass pass
def awaitify(f): def awaitify(f):
return f return f
def info(*a, **kw): print(*a)
def debug(*a, **kw): pass
def warning(*a, **kw): print(*a)
def error(*a, **kw): print(*a)
def exception(*a, **kw): pass
def _now_iso(): def _now_iso():
@ -137,23 +143,86 @@ async def _call_llm_api(
def _retry_after_seconds(header_value, fallback):
    """Turn a ``Retry-After`` header value into a non-negative delay in seconds.

    The header may carry either an integer number of seconds or an HTTP-date.
    Anything missing or unparseable yields ``fallback`` instead of raising, so
    a malformed header can never abort the retry loop.

    Args:
        header_value: Raw header value, or ``None`` when the header is absent.
        fallback: Delay (seconds) to use when the header gives no usable value.

    Returns:
        The delay in whole seconds, never negative.
    """
    # Function-scope imports keep this block self-contained without touching
    # the module's import section.
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    if header_value is None:
        return fallback
    text = str(header_value).strip()

    # Form 1: delay-seconds, e.g. "120".
    try:
        return max(0, int(text))
    except ValueError:
        pass

    # Form 2: HTTP-date, e.g. "Wed, 21 Oct 2015 07:28:00 GMT".
    try:
        when = parsedate_to_datetime(text)
    except (TypeError, ValueError):
        # Older Pythons raise TypeError, newer ones ValueError, on bad dates.
        return fallback
    if when is None:
        return fallback
    if when.tzinfo is None:
        # A date without usable zone information is interpreted as UTC.
        when = when.replace(tzinfo=timezone.utc)
    remaining = (when - datetime.now(timezone.utc)).total_seconds()
    return max(0, int(remaining))


async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, Any]:
    """Make a non-streaming LLM API call with retry logic.

    Up to three attempts are made. HTTP 429, HTTP 500, request timeouts and
    aiohttp client errors are retried with exponential back-off (2s, 4s);
    for 429 a usable ``Retry-After`` header takes precedence over the
    back-off delay. Any other non-200 status is returned immediately.

    Args:
        url: Endpoint the request is POSTed to.
        headers: HTTP headers sent with the request.
        body: Payload, sent as JSON.

    Returns:
        The decoded JSON response on HTTP 200; otherwise a dict of the form
        ``{'error': {'message', 'type', 'code'[, 'detail']}}``.
    """
    max_retries = 3
    base_delay = 2  # seconds; doubled after every failed attempt

    # One session serves every attempt of this call instead of building a
    # fresh session (and connector) per retry.
    async with ClientSession() as session:
        for attempt in range(max_retries):
            is_last = attempt == max_retries - 1
            backoff = base_delay * (2 ** attempt)
            try:
                async with session.post(url, headers=headers, json=body,
                                        timeout=ClientTimeout(total=300)) as resp:
                    if resp.status == 429:
                        if is_last:
                            return {
                                'error': {
                                    'message': 'Rate limited by LLM provider. Please retry later.',
                                    'type': 'rate_limit_error',
                                    'code': 429,
                                }
                            }
                        # Rate limited - respect Retry-After when it is usable.
                        delay = _retry_after_seconds(
                            resp.headers.get('Retry-After'), backoff)
                        warning(f"LLM API rate limited (429), retrying after {delay}s (attempt {attempt+1}/{max_retries})")
                    elif resp.status == 500 and not is_last:
                        # Transient server error - retry. On the last attempt
                        # a 500 falls through to the generic error below.
                        delay = backoff
                        warning(f"LLM API server error (500), retrying after {delay}s (attempt {attempt+1}/{max_retries})")
                    elif resp.status != 200:
                        error_text = await resp.text()
                        return {
                            'error': {
                                'message': f'LLM API error: HTTP {resp.status}',
                                'type': 'api_error',
                                'code': resp.status,
                                'detail': error_text[:500],
                            }
                        }
                    else:
                        return await resp.json()
            except asyncio.TimeoutError:
                if is_last:
                    return {
                        'error': {
                            'message': 'LLM API request timed out after 300s',
                            'type': 'timeout_error',
                            'code': 504,
                        }
                    }
                delay = backoff
                warning(f"LLM API timeout, retrying after {delay}s (attempt {attempt+1}/{max_retries})")
            except ClientError as e:
                if is_last:
                    return {
                        'error': {
                            'message': f'LLM API connection failed: {str(e)}',
                            'type': 'connection_error',
                            'code': 502,
                        }
                    }
                delay = backoff
                warning(f"LLM API connection error: {e}, retrying after {delay}s")

            # Sleep only after the response context has exited, so the failed
            # response is not kept open for the duration of the back-off.
            await asyncio.sleep(delay)

    # Defensive fallback: every path of the final attempt returns above.
    return {
        'error': {
            'message': 'LLM API request failed after all retries',
            'type': 'server_error',
            'code': 500,
        }
    }
async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]: async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]:
@ -208,9 +277,12 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
stream = body.get('stream', False) stream = body.get('stream', False)
top_p = body.get('top_p', 1.0) top_p = body.get('top_p', 1.0)
start_time = time.time()
# Get LLM service config # Get LLM service config
config = await _async_get_llm_config() config = await _async_get_llm_config()
if not config: if not config:
error("LLM service not configured: no harnessed_agent_config found")
return { return {
'error': { 'error': {
'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.', 'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.',
@ -223,6 +295,7 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
api_key = config.get('llm_api_key') or config.get('api_key') api_key = config.get('llm_api_key') or config.get('api_key')
if not service_url or not api_key: if not service_url or not api_key:
error(f"LLM service misconfigured: service_url={'set' if service_url else 'MISSING'}, api_key={'set' if api_key else 'MISSING'}")
return { return {
'error': { 'error': {
'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.', 'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.',
@ -236,6 +309,8 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
if not model or model == 'default': if not model or model == 'default':
model = default_model model = default_model
info(f"LLM chat request: model={model}, stream={stream}, messages={len(messages)}, temperature={temperature}")
# Pass through extra params # Pass through extra params
extra_params = {} extra_params = {}
for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools', for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
@ -301,6 +376,12 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
top_p=top_p, top_p=top_p,
**extra_params, **extra_params,
) )
elapsed = time.time() - start_time
if 'error' in result:
error(f"LLM chat error: model={model}, elapsed={elapsed:.2f}s, error={result.get('error',{}).get('message','')}")
else:
usage = result.get('usage', {})
info(f"LLM chat done: model={model}, elapsed={elapsed:.2f}s, tokens_in={usage.get('prompt_tokens','?')}, tokens_out={usage.get('completion_tokens','?')}")
return result return result