fix: add retry, rate limiting, and logging to LLM API
- Add exponential backoff retry (3 attempts) for transient failures
- Add 429 rate limit handling with Retry-After header support
- Add 500 server error retry with backoff
- Add timeout retry handling (300s per attempt)
- Add structured logging via appPublic.log (info/warning/error)
- Log request params (model, stream, message count) on entry
- Log response timing and token usage on completion
- Log error details on failure

All HIGH/MEDIUM severity LLM API issues resolved.
This commit is contained in:
parent
608413a5d5
commit
e4f935de07
@ -20,6 +20,7 @@ try:
|
|||||||
from ahserver.serverenv import ServerEnv
|
from ahserver.serverenv import ServerEnv
|
||||||
from sqlor.dbpools import DBPools
|
from sqlor.dbpools import DBPools
|
||||||
from appPublic.worker import awaitify
|
from appPublic.worker import awaitify
|
||||||
|
from appPublic.log import info, debug, warning, error, exception
|
||||||
except ImportError:
|
except ImportError:
|
||||||
class ServerEnv:
|
class ServerEnv:
|
||||||
pass
|
pass
|
||||||
@ -27,6 +28,11 @@ except ImportError:
|
|||||||
pass
|
pass
|
||||||
def awaitify(f):
|
def awaitify(f):
|
||||||
return f
|
return f
|
||||||
|
def info(*a, **kw): print(*a)
|
||||||
|
def debug(*a, **kw): pass
|
||||||
|
def warning(*a, **kw): print(*a)
|
||||||
|
def error(*a, **kw): print(*a)
|
||||||
|
def exception(*a, **kw): pass
|
||||||
|
|
||||||
|
|
||||||
def _now_iso():
|
def _now_iso():
|
||||||
@ -137,10 +143,38 @@ async def _call_llm_api(
|
|||||||
|
|
||||||
|
|
||||||
async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, Any]:
|
async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, Any]:
|
||||||
"""Make a non-streaming LLM API call."""
|
"""Make a non-streaming LLM API call with retry logic."""
|
||||||
|
max_retries = 3
|
||||||
|
base_delay = 2 # seconds
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
|
try:
|
||||||
async with ClientSession() as session:
|
async with ClientSession() as session:
|
||||||
async with session.post(url, headers=headers, json=body,
|
async with session.post(url, headers=headers, json=body,
|
||||||
timeout=ClientTimeout(total=300)) as resp:
|
timeout=ClientTimeout(total=300)) as resp:
|
||||||
|
if resp.status == 429:
|
||||||
|
# Rate limited - respect Retry-After header
|
||||||
|
retry_after = int(resp.headers.get('Retry-After', base_delay * (2 ** attempt)))
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
warning(f"LLM API rate limited (429), retrying after {retry_after}s (attempt {attempt+1}/{max_retries})")
|
||||||
|
await asyncio.sleep(retry_after)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
return {
|
||||||
|
'error': {
|
||||||
|
'message': 'Rate limited by LLM provider. Please retry later.',
|
||||||
|
'type': 'rate_limit_error',
|
||||||
|
'code': 429,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if resp.status == 500 and attempt < max_retries - 1:
|
||||||
|
# Transient server error - retry
|
||||||
|
delay = base_delay * (2 ** attempt)
|
||||||
|
warning(f"LLM API server error (500), retrying after {delay}s (attempt {attempt+1}/{max_retries})")
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
continue
|
||||||
|
|
||||||
if resp.status != 200:
|
if resp.status != 200:
|
||||||
error_text = await resp.text()
|
error_text = await resp.text()
|
||||||
return {
|
return {
|
||||||
@ -155,6 +189,41 @@ async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, A
|
|||||||
data = await resp.json()
|
data = await resp.json()
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
delay = base_delay * (2 ** attempt)
|
||||||
|
warning(f"LLM API timeout, retrying after {delay}s (attempt {attempt+1}/{max_retries})")
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
continue
|
||||||
|
return {
|
||||||
|
'error': {
|
||||||
|
'message': 'LLM API request timed out after 300s',
|
||||||
|
'type': 'timeout_error',
|
||||||
|
'code': 504,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
except ClientError as e:
|
||||||
|
if attempt < max_retries - 1:
|
||||||
|
delay = base_delay * (2 ** attempt)
|
||||||
|
warning(f"LLM API connection error: {e}, retrying after {delay}s")
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
continue
|
||||||
|
return {
|
||||||
|
'error': {
|
||||||
|
'message': f'LLM API connection failed: {str(e)}',
|
||||||
|
'type': 'connection_error',
|
||||||
|
'code': 502,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
'error': {
|
||||||
|
'message': 'LLM API request failed after all retries',
|
||||||
|
'type': 'server_error',
|
||||||
|
'code': 500,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]:
|
async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]:
|
||||||
"""Make a streaming LLM API call, yielding SSE chunks."""
|
"""Make a streaming LLM API call, yielding SSE chunks."""
|
||||||
@ -208,9 +277,12 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
|
|||||||
stream = body.get('stream', False)
|
stream = body.get('stream', False)
|
||||||
top_p = body.get('top_p', 1.0)
|
top_p = body.get('top_p', 1.0)
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
# Get LLM service config
|
# Get LLM service config
|
||||||
config = await _async_get_llm_config()
|
config = await _async_get_llm_config()
|
||||||
if not config:
|
if not config:
|
||||||
|
error("LLM service not configured: no harnessed_agent_config found")
|
||||||
return {
|
return {
|
||||||
'error': {
|
'error': {
|
||||||
'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.',
|
'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.',
|
||||||
@ -223,6 +295,7 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
|
|||||||
api_key = config.get('llm_api_key') or config.get('api_key')
|
api_key = config.get('llm_api_key') or config.get('api_key')
|
||||||
|
|
||||||
if not service_url or not api_key:
|
if not service_url or not api_key:
|
||||||
|
error(f"LLM service misconfigured: service_url={'set' if service_url else 'MISSING'}, api_key={'set' if api_key else 'MISSING'}")
|
||||||
return {
|
return {
|
||||||
'error': {
|
'error': {
|
||||||
'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.',
|
'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.',
|
||||||
@ -236,6 +309,8 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
|
|||||||
if not model or model == 'default':
|
if not model or model == 'default':
|
||||||
model = default_model
|
model = default_model
|
||||||
|
|
||||||
|
info(f"LLM chat request: model={model}, stream={stream}, messages={len(messages)}, temperature={temperature}")
|
||||||
|
|
||||||
# Pass through extra params
|
# Pass through extra params
|
||||||
extra_params = {}
|
extra_params = {}
|
||||||
for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
|
for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
|
||||||
@ -301,6 +376,12 @@ async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
|
|||||||
top_p=top_p,
|
top_p=top_p,
|
||||||
**extra_params,
|
**extra_params,
|
||||||
)
|
)
|
||||||
|
elapsed = time.time() - start_time
|
||||||
|
if 'error' in result:
|
||||||
|
error(f"LLM chat error: model={model}, elapsed={elapsed:.2f}s, error={result.get('error',{}).get('message','')}")
|
||||||
|
else:
|
||||||
|
usage = result.get('usage', {})
|
||||||
|
info(f"LLM chat done: model={model}, elapsed={elapsed:.2f}s, tokens_in={usage.get('prompt_tokens','?')}, tokens_out={usage.get('completion_tokens','?')}")
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user