refactor: LLM client for calling supplier LLM APIs (not server)
Replaces wrong-direction llm_api.py (which served OpenAI endpoints) with llm_client.py -- a client that calls supplier LLM APIs. New module: llm_client.py - llm_chat(messages, model, temperature, ...) -> OpenAI response dict - llm_chat_stream(messages, ...) -> async generator of SSE chunks - llm_simple(prompt, system) -> plain text response - llm_list_models() -> list available models from provider - llm_get_config() -> show current config (key masked) - Supports provider presets: openai, dashscope, deepseek, siliconflow - Retry with exponential backoff (3 attempts) - 429 rate limit handling with Retry-After - Structured logging via appPublic.log Model changes (harnessed_agent_config): - Add llm_provider (preset name: dashscope/openai/deepseek/siliconflow) - Add top_p field - llm_service_url defaults to DashScope compatible endpoint Other: - Remove wrong-direction /v1/ endpoints - Fix pyproject.toml deps: only sqlor + bricks_for_python - Update init/data.json seed data with LLM config fields - Update CRUD view with llm_provider dropdown
This commit is contained in:
parent
e4f935de07
commit
4e65ff8fe4
@ -20,10 +20,12 @@ from .config_functions import (
|
||||
harnessed_get_agent_config,
|
||||
harnessed_save_agent_config
|
||||
)
|
||||
from .llm_api import (
|
||||
harnessed_llm_chat_completions,
|
||||
harnessed_llm_models,
|
||||
harnessed_llm_completions,
|
||||
from .llm_client import (
|
||||
llm_chat,
|
||||
llm_chat_stream,
|
||||
llm_list_models,
|
||||
llm_simple,
|
||||
llm_get_config,
|
||||
)
|
||||
|
||||
def load_harnessed_agent():
|
||||
@ -49,7 +51,9 @@ def load_harnessed_agent():
|
||||
env.harnessed_get_agent_config = harnessed_get_agent_config
|
||||
env.harnessed_save_agent_config = harnessed_save_agent_config
|
||||
|
||||
# OpenAI-compatible LLM API
|
||||
env.harnessed_llm_chat_completions = harnessed_llm_chat_completions
|
||||
env.harnessed_llm_models = harnessed_llm_models
|
||||
env.harnessed_llm_completions = harnessed_llm_completions
|
||||
# LLM client -- calls supplier LLM APIs (OpenAI-compatible)
|
||||
env.llm_chat = llm_chat
|
||||
env.llm_chat_stream = llm_chat_stream
|
||||
env.llm_list_models = llm_list_models
|
||||
env.llm_simple = llm_simple
|
||||
env.llm_get_config = llm_get_config
|
||||
@ -1,463 +0,0 @@
|
||||
"""
|
||||
OpenAI-compatible LLM API for Hermes Agent
|
||||
Provides /v1/chat/completions and /v1/models endpoints compatible with OpenAI API spec.
|
||||
"""
|
||||
import json
|
||||
import uuid
|
||||
import time
|
||||
import asyncio
|
||||
from typing import Dict, Any, List, Optional, AsyncGenerator
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
from aiohttp.client_exceptions import ClientError
|
||||
HAS_AIOHTTP = True
|
||||
except ImportError:
|
||||
HAS_AIOHTTP = False
|
||||
|
||||
try:
|
||||
from ahserver.serverenv import ServerEnv
|
||||
from sqlor.dbpools import DBPools
|
||||
from appPublic.worker import awaitify
|
||||
from appPublic.log import info, debug, warning, error, exception
|
||||
except ImportError:
|
||||
class ServerEnv:
|
||||
pass
|
||||
class DBPools:
|
||||
pass
|
||||
def awaitify(f):
|
||||
return f
|
||||
def info(*a, **kw): print(*a)
|
||||
def debug(*a, **kw): pass
|
||||
def warning(*a, **kw): print(*a)
|
||||
def error(*a, **kw): print(*a)
|
||||
def exception(*a, **kw): pass
|
||||
|
||||
|
||||
def _now_iso():
|
||||
return datetime.utcnow().isoformat() + 'Z'
|
||||
|
||||
|
||||
def _now_ts():
|
||||
return int(time.time())
|
||||
|
||||
|
||||
def _get_default_llm_config(user_id: str = None) -> Dict[str, Any]:
|
||||
"""Get LLM service config from harnessed_agent_config table."""
|
||||
try:
|
||||
dbname = ServerEnv().get_module_dbname('harnessed_agent')
|
||||
except Exception:
|
||||
dbname = 'default'
|
||||
|
||||
try:
|
||||
import asyncio
|
||||
async def _fetch():
|
||||
async with DBPools().sqlorContext(dbname) as sor:
|
||||
filters = {}
|
||||
if user_id:
|
||||
filters['user_id'] = user_id
|
||||
rows = await sor.R('harnessed_agent_config', filters,
|
||||
orderby='updated_at DESC', limit=1)
|
||||
if rows:
|
||||
return rows[0]
|
||||
return None
|
||||
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
if loop.is_running():
|
||||
# Already in async context, need to use create_task pattern
|
||||
# but for simplicity in .dspy context we return None and let caller handle
|
||||
return None
|
||||
else:
|
||||
config = loop.run_until_complete(_fetch())
|
||||
return config
|
||||
except RuntimeError:
|
||||
return None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
async def _async_get_llm_config(user_id: str = None) -> Dict[str, Any]:
|
||||
"""Async version to get LLM service config."""
|
||||
try:
|
||||
env = ServerEnv()
|
||||
dbname = env.get_module_dbname('harnessed_agent')
|
||||
except Exception:
|
||||
dbname = 'default'
|
||||
|
||||
try:
|
||||
async with DBPools().sqlorContext(dbname) as sor:
|
||||
filters = {}
|
||||
if user_id:
|
||||
filters['user_id'] = user_id
|
||||
rows = await sor.R('harnessed_agent_config', filters,
|
||||
orderby='updated_at DESC', limit=1)
|
||||
if rows:
|
||||
return rows[0]
|
||||
except Exception as e:
|
||||
print(f"Error fetching LLM config: {e}")
|
||||
return None
|
||||
|
||||
|
||||
async def _call_llm_api(
|
||||
service_url: str,
|
||||
api_key: str,
|
||||
model: str,
|
||||
messages: List[Dict[str, str]],
|
||||
temperature: float = 0.7,
|
||||
max_tokens: Optional[int] = None,
|
||||
stream: bool = False,
|
||||
top_p: float = 1.0,
|
||||
**kwargs
|
||||
) -> Any:
|
||||
"""Call an OpenAI-compatible LLM API endpoint."""
|
||||
url = service_url.rstrip('/') + '/chat/completions'
|
||||
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {api_key}',
|
||||
}
|
||||
|
||||
body = {
|
||||
'model': model,
|
||||
'messages': messages,
|
||||
'temperature': temperature,
|
||||
'top_p': top_p,
|
||||
'stream': stream,
|
||||
}
|
||||
|
||||
if max_tokens is not None:
|
||||
body['max_tokens'] = max_tokens
|
||||
|
||||
# Pass through any additional OpenAI-compatible parameters
|
||||
for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools', 'tool_choice',
|
||||
'response_format', 'seed'):
|
||||
if key in kwargs and kwargs[key] is not None:
|
||||
body[key] = kwargs[key]
|
||||
|
||||
if stream:
|
||||
return await _stream_llm_response(url, headers, body)
|
||||
else:
|
||||
return await _sync_llm_response(url, headers, body)
|
||||
|
||||
|
||||
async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, Any]:
|
||||
"""Make a non-streaming LLM API call with retry logic."""
|
||||
max_retries = 3
|
||||
base_delay = 2 # seconds
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
async with ClientSession() as session:
|
||||
async with session.post(url, headers=headers, json=body,
|
||||
timeout=ClientTimeout(total=300)) as resp:
|
||||
if resp.status == 429:
|
||||
# Rate limited - respect Retry-After header
|
||||
retry_after = int(resp.headers.get('Retry-After', base_delay * (2 ** attempt)))
|
||||
if attempt < max_retries - 1:
|
||||
warning(f"LLM API rate limited (429), retrying after {retry_after}s (attempt {attempt+1}/{max_retries})")
|
||||
await asyncio.sleep(retry_after)
|
||||
continue
|
||||
else:
|
||||
return {
|
||||
'error': {
|
||||
'message': 'Rate limited by LLM provider. Please retry later.',
|
||||
'type': 'rate_limit_error',
|
||||
'code': 429,
|
||||
}
|
||||
}
|
||||
|
||||
if resp.status == 500 and attempt < max_retries - 1:
|
||||
# Transient server error - retry
|
||||
delay = base_delay * (2 ** attempt)
|
||||
warning(f"LLM API server error (500), retrying after {delay}s (attempt {attempt+1}/{max_retries})")
|
||||
await asyncio.sleep(delay)
|
||||
continue
|
||||
|
||||
if resp.status != 200:
|
||||
error_text = await resp.text()
|
||||
return {
|
||||
'error': {
|
||||
'message': f'LLM API error: HTTP {resp.status}',
|
||||
'type': 'api_error',
|
||||
'code': resp.status,
|
||||
'detail': error_text[:500],
|
||||
}
|
||||
}
|
||||
|
||||
data = await resp.json()
|
||||
return data
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
if attempt < max_retries - 1:
|
||||
delay = base_delay * (2 ** attempt)
|
||||
warning(f"LLM API timeout, retrying after {delay}s (attempt {attempt+1}/{max_retries})")
|
||||
await asyncio.sleep(delay)
|
||||
continue
|
||||
return {
|
||||
'error': {
|
||||
'message': 'LLM API request timed out after 300s',
|
||||
'type': 'timeout_error',
|
||||
'code': 504,
|
||||
}
|
||||
}
|
||||
except ClientError as e:
|
||||
if attempt < max_retries - 1:
|
||||
delay = base_delay * (2 ** attempt)
|
||||
warning(f"LLM API connection error: {e}, retrying after {delay}s")
|
||||
await asyncio.sleep(delay)
|
||||
continue
|
||||
return {
|
||||
'error': {
|
||||
'message': f'LLM API connection failed: {str(e)}',
|
||||
'type': 'connection_error',
|
||||
'code': 502,
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
'error': {
|
||||
'message': 'LLM API request failed after all retries',
|
||||
'type': 'server_error',
|
||||
'code': 500,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]:
|
||||
"""Make a streaming LLM API call, yielding SSE chunks."""
|
||||
async with ClientSession() as session:
|
||||
async with session.post(url, headers=headers, json=body,
|
||||
timeout=ClientTimeout(total=600)) as resp:
|
||||
if resp.status != 200:
|
||||
error_text = await resp.text()
|
||||
error_data = {
|
||||
'error': {
|
||||
'message': f'LLM API error: HTTP {resp.status}',
|
||||
'type': 'api_error',
|
||||
'code': resp.status,
|
||||
'detail': error_text[:500],
|
||||
}
|
||||
}
|
||||
yield f"data: {json.dumps(error_data, ensure_ascii=False)}\n\n"
|
||||
yield "data: [DONE]\n\n"
|
||||
return
|
||||
|
||||
async for line in resp.content:
|
||||
line = line.decode('utf-8').strip()
|
||||
if not line:
|
||||
continue
|
||||
if line.startswith('data: '):
|
||||
yield line + '\n\n'
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Public API functions (registered to ServerEnv via init.py)
|
||||
# ============================================================
|
||||
|
||||
async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
|
||||
"""
|
||||
OpenAI-compatible /v1/chat/completions endpoint.
|
||||
|
||||
Args:
|
||||
body: dict with keys: model, messages, temperature, max_tokens,
|
||||
stream, top_p, and other OpenAI-compatible params.
|
||||
|
||||
Returns:
|
||||
If stream=False: dict matching OpenAI chat completion response.
|
||||
If stream=True: aiohttp.StreamResponse with SSE.
|
||||
"""
|
||||
from aiohttp import web
|
||||
|
||||
model = body.get('model', 'default')
|
||||
messages = body.get('messages', [])
|
||||
temperature = body.get('temperature', 0.7)
|
||||
max_tokens = body.get('max_tokens', None)
|
||||
stream = body.get('stream', False)
|
||||
top_p = body.get('top_p', 1.0)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
# Get LLM service config
|
||||
config = await _async_get_llm_config()
|
||||
if not config:
|
||||
error("LLM service not configured: no harnessed_agent_config found")
|
||||
return {
|
||||
'error': {
|
||||
'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.',
|
||||
'type': 'configuration_error',
|
||||
'code': 503,
|
||||
}
|
||||
}
|
||||
|
||||
service_url = config.get('llm_service_url') or config.get('api_endpoint')
|
||||
api_key = config.get('llm_api_key') or config.get('api_key')
|
||||
|
||||
if not service_url or not api_key:
|
||||
error(f"LLM service misconfigured: service_url={'set' if service_url else 'MISSING'}, api_key={'set' if api_key else 'MISSING'}")
|
||||
return {
|
||||
'error': {
|
||||
'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.',
|
||||
'type': 'configuration_error',
|
||||
'code': 503,
|
||||
}
|
||||
}
|
||||
|
||||
# Use default model from config if request model is 'default' or empty
|
||||
default_model = config.get('default_model', 'qwen3-max')
|
||||
if not model or model == 'default':
|
||||
model = default_model
|
||||
|
||||
info(f"LLM chat request: model={model}, stream={stream}, messages={len(messages)}, temperature={temperature}")
|
||||
|
||||
# Pass through extra params
|
||||
extra_params = {}
|
||||
for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
|
||||
'tool_choice', 'response_format', 'seed'):
|
||||
if key in body:
|
||||
extra_params[key] = body[key]
|
||||
|
||||
if stream:
|
||||
resp = web.StreamResponse(
|
||||
status=200,
|
||||
reason='OK',
|
||||
headers={
|
||||
'Content-Type': 'text/event-stream',
|
||||
'Cache-Control': 'no-cache',
|
||||
'Connection': 'keep-alive',
|
||||
'X-Accel-Buffering': 'no',
|
||||
}
|
||||
)
|
||||
await resp.prepare(body.get('_request'))
|
||||
|
||||
request_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
|
||||
created = _now_ts()
|
||||
|
||||
try:
|
||||
async for chunk_line in _stream_llm_response(
|
||||
service_url, {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'},
|
||||
{
|
||||
'model': model,
|
||||
'messages': messages,
|
||||
'temperature': temperature,
|
||||
'top_p': top_p,
|
||||
'stream': True,
|
||||
'max_tokens': max_tokens,
|
||||
**extra_params,
|
||||
}
|
||||
):
|
||||
await resp.write(chunk_line.encode('utf-8') if isinstance(chunk_line, str) else chunk_line)
|
||||
await resp.drain()
|
||||
except Exception as e:
|
||||
error_data = {
|
||||
'error': {
|
||||
'message': f'Streaming error: {str(e)}',
|
||||
'type': 'server_error',
|
||||
'code': 500,
|
||||
}
|
||||
}
|
||||
await resp.write(f"data: {json.dumps(error_data)}\n\n".encode('utf-8'))
|
||||
await resp.drain()
|
||||
|
||||
await resp.write(b"data: [DONE]\n\n")
|
||||
await resp.drain()
|
||||
return resp
|
||||
|
||||
else:
|
||||
result = await _call_llm_api(
|
||||
service_url=service_url,
|
||||
api_key=api_key,
|
||||
model=model,
|
||||
messages=messages,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
stream=False,
|
||||
top_p=top_p,
|
||||
**extra_params,
|
||||
)
|
||||
elapsed = time.time() - start_time
|
||||
if 'error' in result:
|
||||
error(f"LLM chat error: model={model}, elapsed={elapsed:.2f}s, error={result.get('error',{}).get('message','')}")
|
||||
else:
|
||||
usage = result.get('usage', {})
|
||||
info(f"LLM chat done: model={model}, elapsed={elapsed:.2f}s, tokens_in={usage.get('prompt_tokens','?')}, tokens_out={usage.get('completion_tokens','?')}")
|
||||
return result
|
||||
|
||||
|
||||
async def harnessed_llm_models() -> Dict[str, Any]:
|
||||
"""
|
||||
OpenAI-compatible /v1/models endpoint.
|
||||
Returns list of available models from config.
|
||||
"""
|
||||
config = await _async_get_llm_config()
|
||||
|
||||
models_list = []
|
||||
|
||||
# Add default model from config
|
||||
default_model = 'qwen3-max'
|
||||
if config:
|
||||
default_model = config.get('default_model', 'qwen3-max')
|
||||
|
||||
# If available_models is configured as JSON string, parse it
|
||||
if config and config.get('available_models'):
|
||||
try:
|
||||
models_str = config['available_models']
|
||||
if isinstance(models_str, str):
|
||||
model_names = json.loads(models_str)
|
||||
else:
|
||||
model_names = models_str
|
||||
for m in model_names:
|
||||
if isinstance(m, str):
|
||||
models_list.append({
|
||||
'id': m,
|
||||
'object': 'model',
|
||||
'created': _now_ts(),
|
||||
'owned_by': 'harnessed_agent',
|
||||
})
|
||||
elif isinstance(m, dict):
|
||||
models_list.append(m)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Always include the default model if not already listed
|
||||
existing_ids = {m['id'] for m in models_list}
|
||||
if default_model not in existing_ids:
|
||||
models_list.insert(0, {
|
||||
'id': default_model,
|
||||
'object': 'model',
|
||||
'created': _now_ts(),
|
||||
'owned_by': 'harnessed_agent',
|
||||
})
|
||||
|
||||
return {
|
||||
'object': 'list',
|
||||
'data': models_list,
|
||||
}
|
||||
|
||||
|
||||
async def harnessed_llm_completions(body: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
OpenAI-compatible /v1/completions endpoint (legacy, non-chat).
|
||||
Converts prompt to messages format internally.
|
||||
"""
|
||||
prompt = body.get('prompt', '')
|
||||
model = body.get('model', 'default')
|
||||
temperature = body.get('temperature', 0.7)
|
||||
max_tokens = body.get('max_tokens', None)
|
||||
stream = body.get('stream', False)
|
||||
|
||||
# Convert to chat format
|
||||
messages = [{'role': 'user', 'content': prompt}]
|
||||
|
||||
# Build request body for chat completions
|
||||
chat_body = {
|
||||
'model': model,
|
||||
'messages': messages,
|
||||
'temperature': temperature,
|
||||
'max_tokens': max_tokens,
|
||||
'stream': stream,
|
||||
'_request': body.get('_request'),
|
||||
}
|
||||
|
||||
return await harnessed_llm_chat_completions(chat_body)
|
||||
475
harnessed_agent/llm_client.py
Normal file
475
harnessed_agent/llm_client.py
Normal file
@ -0,0 +1,475 @@
|
||||
"""
|
||||
LLM Client for Hermes Agent - Calls supplier LLM APIs via OpenAI-compatible interface.
|
||||
|
||||
This module provides a client-side implementation for calling external LLM providers
|
||||
(OpenAI, DashScope, DeepSeek, etc.) through their OpenAI-compatible /v1/chat/completions API.
|
||||
|
||||
Usage:
|
||||
# From .dspy files:
|
||||
result = await llm_chat(messages=[{"role": "user", "content": "Hello"}])
|
||||
|
||||
# With explicit config:
|
||||
result = await llm_chat(
|
||||
messages=[{"role": "user", "content": "Hello"}],
|
||||
model="qwen3-max",
|
||||
temperature=0.7,
|
||||
stream=False
|
||||
)
|
||||
|
||||
# Stream mode:
|
||||
async for chunk in llm_chat_stream(messages=[...]):
|
||||
print(chunk.get("delta", ""))
|
||||
"""
|
||||
import json
|
||||
import time
|
||||
import asyncio
|
||||
from typing import Dict, Any, List, Optional, AsyncGenerator
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
from aiohttp import ClientSession, ClientTimeout, ClientError
|
||||
except ImportError:
|
||||
ClientError = Exception
|
||||
|
||||
try:
|
||||
from ahserver.serverenv import ServerEnv
|
||||
from sqlor.dbpools import DBPools
|
||||
from appPublic.log import info, debug, warning, error, exception
|
||||
except ImportError:
|
||||
class ServerEnv:
|
||||
pass
|
||||
class DBPools:
|
||||
pass
|
||||
def info(*a, **kw): print(*a)
|
||||
def debug(*a, **kw): pass
|
||||
def warning(*a, **kw): print(*a)
|
||||
def error(*a, **kw): print(*a)
|
||||
def exception(*a, **kw): pass
|
||||
|
||||
|
||||
# ============================================================
|
||||
# LLM Provider configurations
|
||||
# ============================================================
|
||||
|
||||
LLM_PROVIDERS = {
|
||||
'openai': {
|
||||
'url': 'https://api.openai.com/v1',
|
||||
'model_default': 'gpt-4o',
|
||||
},
|
||||
'dashscope': {
|
||||
'url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
||||
'model_default': 'qwen-plus',
|
||||
},
|
||||
'deepseek': {
|
||||
'url': 'https://api.deepseek.com/v1',
|
||||
'model_default': 'deepseek-chat',
|
||||
},
|
||||
'siliconflow': {
|
||||
'url': 'https://api.siliconflow.cn/v1',
|
||||
'model_default': 'Qwen/Qwen2.5-72B-Instruct',
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Config retrieval
|
||||
# ============================================================
|
||||
|
||||
async def _get_llm_config() -> Dict[str, Any]:
|
||||
"""Get LLM client configuration from harnessed_agent_config table."""
|
||||
try:
|
||||
env = ServerEnv()
|
||||
dbname = env.get_module_dbname('harnessed_agent')
|
||||
except Exception:
|
||||
dbname = 'default'
|
||||
|
||||
try:
|
||||
async with DBPools().sqlorContext(dbname) as sor:
|
||||
rows = await sor.R('harnessed_agent_config', {},
|
||||
orderby='updated_at DESC', limit=1)
|
||||
if rows:
|
||||
return rows[0]
|
||||
except Exception as e:
|
||||
error(f"Failed to fetch LLM config: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
def _resolve_provider(config: Dict[str, Any]) -> Dict[str, str]:
|
||||
"""Resolve base URL and model from config, with provider presets."""
|
||||
provider = (config.get('llm_provider') or '').lower()
|
||||
service_url = config.get('llm_service_url', '')
|
||||
api_key = config.get('llm_api_key', '')
|
||||
model = config.get('default_model', '')
|
||||
|
||||
# If provider name is set, use preset URL
|
||||
if provider and provider in LLM_PROVIDERS:
|
||||
preset = LLM_PROVIDERS[provider]
|
||||
if not service_url:
|
||||
service_url = preset['url']
|
||||
if not model:
|
||||
model = preset['model_default']
|
||||
|
||||
return {
|
||||
'service_url': service_url.rstrip('/'),
|
||||
'api_key': api_key,
|
||||
'model': model,
|
||||
'provider': provider,
|
||||
}
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Core LLM client
|
||||
# ============================================================
|
||||
|
||||
async def _post_chat_completions(
|
||||
service_url: str,
|
||||
api_key: str,
|
||||
model: str,
|
||||
messages: List[Dict[str, str]],
|
||||
temperature: float = 0.7,
|
||||
max_tokens: Optional[int] = None,
|
||||
stream: bool = False,
|
||||
top_p: float = 1.0,
|
||||
extra: Optional[Dict] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Call OpenAI-compatible /v1/chat/completions endpoint.
|
||||
|
||||
Returns dict with OpenAI response format or error dict.
|
||||
"""
|
||||
if not service_url:
|
||||
return {'error': {'message': 'llm_service_url not configured', 'type': 'configuration_error'}}
|
||||
if not api_key:
|
||||
return {'error': {'message': 'llm_api_key not configured', 'type': 'configuration_error'}}
|
||||
|
||||
url = f"{service_url}/chat/completions"
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {api_key}',
|
||||
}
|
||||
|
||||
body = {
|
||||
'model': model,
|
||||
'messages': messages,
|
||||
'temperature': temperature,
|
||||
'top_p': top_p,
|
||||
'stream': stream,
|
||||
}
|
||||
|
||||
if max_tokens is not None:
|
||||
body['max_tokens'] = max_tokens
|
||||
|
||||
if extra:
|
||||
for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
|
||||
'tool_choice', 'response_format', 'seed'):
|
||||
if key in extra and extra[key] is not None:
|
||||
body[key] = extra[key]
|
||||
|
||||
max_retries = 3
|
||||
base_delay = 2
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
async with ClientSession() as session:
|
||||
async with session.post(url, headers=headers, json=body,
|
||||
timeout=ClientTimeout(total=300)) as resp:
|
||||
|
||||
if resp.status == 429:
|
||||
retry_after = int(resp.headers.get('Retry-After', base_delay * (2 ** attempt)))
|
||||
if attempt < max_retries - 1:
|
||||
warning(f"LLM rate limited (429), retrying in {retry_after}s (attempt {attempt+1}/{max_retries})")
|
||||
await asyncio.sleep(retry_after)
|
||||
continue
|
||||
return {'error': {'message': 'Rate limited by LLM provider', 'type': 'rate_limit_error', 'code': 429}}
|
||||
|
||||
if resp.status == 500 and attempt < max_retries - 1:
|
||||
delay = base_delay * (2 ** attempt)
|
||||
warning(f"LLM server error (500), retrying in {delay}s (attempt {attempt+1}/{max_retries})")
|
||||
await asyncio.sleep(delay)
|
||||
continue
|
||||
|
||||
if resp.status != 200:
|
||||
err_text = await resp.text()
|
||||
return {
|
||||
'error': {
|
||||
'message': f'LLM API error: HTTP {resp.status}',
|
||||
'type': 'api_error',
|
||||
'code': resp.status,
|
||||
'detail': err_text[:500],
|
||||
}
|
||||
}
|
||||
|
||||
return await resp.json()
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
if attempt < max_retries - 1:
|
||||
delay = base_delay * (2 ** attempt)
|
||||
warning(f"LLM API timeout, retrying in {delay}s (attempt {attempt+1}/{max_retries})")
|
||||
await asyncio.sleep(delay)
|
||||
continue
|
||||
return {'error': {'message': 'LLM API request timed out', 'type': 'timeout_error', 'code': 504}}
|
||||
|
||||
except ClientError as e:
|
||||
if attempt < max_retries - 1:
|
||||
delay = base_delay * (2 ** attempt)
|
||||
warning(f"LLM connection error: {e}, retrying in {delay}s")
|
||||
await asyncio.sleep(delay)
|
||||
continue
|
||||
return {'error': {'message': f'LLM connection failed: {str(e)}', 'type': 'connection_error', 'code': 502}}
|
||||
|
||||
return {'error': {'message': 'LLM API failed after all retries', 'type': 'server_error', 'code': 500}}
|
||||
|
||||
|
||||
async def _stream_chat_completions(
|
||||
service_url: str,
|
||||
api_key: str,
|
||||
model: str,
|
||||
messages: List[Dict[str, str]],
|
||||
temperature: float = 0.7,
|
||||
max_tokens: Optional[int] = None,
|
||||
top_p: float = 1.0,
|
||||
extra: Optional[Dict] = None,
|
||||
) -> AsyncGenerator[Dict[str, Any], None]:
|
||||
"""
|
||||
Stream LLM response via SSE. Yields parsed chunk dicts.
|
||||
|
||||
Each yielded dict contains:
|
||||
- delta: str (the text chunk)
|
||||
- finish_reason: str or None
|
||||
- raw: dict (the raw SSE chunk data)
|
||||
"""
|
||||
if not service_url:
|
||||
yield {'delta': '', 'finish_reason': 'error', 'raw': {'error': 'llm_service_url not configured'}}
|
||||
return
|
||||
if not api_key:
|
||||
yield {'delta': '', 'finish_reason': 'error', 'raw': {'error': 'llm_api_key not configured'}}
|
||||
return
|
||||
|
||||
url = f"{service_url}/chat/completions"
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {api_key}',
|
||||
}
|
||||
|
||||
body = {
|
||||
'model': model,
|
||||
'messages': messages,
|
||||
'temperature': temperature,
|
||||
'top_p': top_p,
|
||||
'stream': True,
|
||||
}
|
||||
|
||||
if max_tokens is not None:
|
||||
body['max_tokens'] = max_tokens
|
||||
|
||||
if extra:
|
||||
for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
|
||||
'tool_choice', 'response_format', 'seed'):
|
||||
if key in extra and extra[key] is not None:
|
||||
body[key] = extra[key]
|
||||
|
||||
async with ClientSession() as session:
|
||||
async with session.post(url, headers=headers, json=body,
|
||||
timeout=ClientTimeout(total=600)) as resp:
|
||||
if resp.status != 200:
|
||||
err_text = await resp.text()
|
||||
yield {
|
||||
'delta': '',
|
||||
'finish_reason': 'error',
|
||||
'raw': {'error': f'HTTP {resp.status}', 'detail': err_text[:300]},
|
||||
}
|
||||
return
|
||||
|
||||
async for line in resp.content:
|
||||
line = line.decode('utf-8').strip()
|
||||
if not line or not line.startswith('data:'):
|
||||
continue
|
||||
data_str = line[5:].strip()
|
||||
if data_str == '[DONE]':
|
||||
break
|
||||
try:
|
||||
chunk = json.loads(data_str)
|
||||
choices = chunk.get('choices', [])
|
||||
if choices:
|
||||
choice = choices[0]
|
||||
delta = choice.get('delta', {})
|
||||
text = delta.get('content', '') or ''
|
||||
finish_reason = choice.get('finish_reason')
|
||||
yield {
|
||||
'delta': text,
|
||||
'finish_reason': finish_reason,
|
||||
'raw': chunk,
|
||||
}
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Public API functions (registered to ServerEnv)
|
||||
# ============================================================
|
||||
|
||||
async def llm_chat(
|
||||
messages: List[Dict[str, str]],
|
||||
model: str = None,
|
||||
temperature: float = None,
|
||||
max_tokens: int = None,
|
||||
stream: bool = False,
|
||||
top_p: float = None,
|
||||
**extra,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Call LLM provider and get chat completion.
|
||||
|
||||
This is the primary function for AI calls within harnessed_agent.
|
||||
Reads provider config from harnessed_agent_config table automatically.
|
||||
|
||||
Args:
|
||||
messages: List of {role, content} dicts (OpenAI format)
|
||||
model: Override model name (uses config default_model if not set)
|
||||
temperature: Override temperature (uses config default_temperature if not set)
|
||||
max_tokens: Max response tokens
|
||||
stream: If True, uses streaming
|
||||
top_p: Override top_p
|
||||
**extra: Other OpenAI-compatible params (stop, tools, etc.)
|
||||
|
||||
Returns:
|
||||
Dict matching OpenAI chat completion response, or error dict.
|
||||
"""
|
||||
config = await _get_llm_config()
|
||||
provider = _resolve_provider(config)
|
||||
|
||||
resolved_model = model or provider['model']
|
||||
resolved_temp = temperature if temperature is not None else config.get('default_temperature', 0.7)
|
||||
resolved_top_p = top_p if top_p is not None else config.get('top_p', 1.0)
|
||||
|
||||
start_time = time.time()
|
||||
info(f"LLM chat: model={resolved_model}, temp={resolved_temp}, messages={len(messages)}")
|
||||
|
||||
result = await _post_chat_completions(
|
||||
service_url=provider['service_url'],
|
||||
api_key=provider['api_key'],
|
||||
model=resolved_model,
|
||||
messages=messages,
|
||||
temperature=resolved_temp,
|
||||
max_tokens=max_tokens,
|
||||
stream=False,
|
||||
top_p=resolved_top_p,
|
||||
extra=extra if extra else None,
|
||||
)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
if 'error' in result:
|
||||
error(f"LLM chat error: model={resolved_model}, elapsed={elapsed:.2f}s, error={result['error'].get('message')}")
|
||||
else:
|
||||
usage = result.get('usage', {})
|
||||
info(f"LLM chat done: model={resolved_model}, elapsed={elapsed:.2f}s, prompt_tokens={usage.get('prompt_tokens')}, completion_tokens={usage.get('completion_tokens')}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def llm_chat_stream(
|
||||
messages: List[Dict[str, str]],
|
||||
model: str = None,
|
||||
temperature: float = None,
|
||||
max_tokens: int = None,
|
||||
top_p: float = None,
|
||||
**extra,
|
||||
) -> AsyncGenerator[Dict[str, Any], None]:
|
||||
"""
|
||||
Stream LLM chat response.
|
||||
|
||||
Yields dicts with {delta: str, finish_reason: str|None, raw: dict}.
|
||||
|
||||
Example:
|
||||
async for chunk in llm_chat_stream(messages=[{"role": "user", "content": "Hello"}]):
|
||||
print(chunk['delta'], end='', flush=True)
|
||||
"""
|
||||
config = await _get_llm_config()
|
||||
provider = _resolve_provider(config)
|
||||
|
||||
resolved_model = model or provider['model']
|
||||
resolved_temp = temperature if temperature is not None else config.get('default_temperature', 0.7)
|
||||
resolved_top_p = top_p if top_p is not None else config.get('top_p', 1.0)
|
||||
|
||||
info(f"LLM chat stream: model={resolved_model}, temp={resolved_temp}, messages={len(messages)}")
|
||||
|
||||
async for chunk in _stream_chat_completions(
|
||||
service_url=provider['service_url'],
|
||||
api_key=provider['api_key'],
|
||||
model=resolved_model,
|
||||
messages=messages,
|
||||
temperature=resolved_temp,
|
||||
max_tokens=max_tokens,
|
||||
top_p=resolved_top_p,
|
||||
extra=extra if extra else None,
|
||||
):
|
||||
yield chunk
|
||||
|
||||
|
||||
async def llm_list_models() -> Dict[str, Any]:
|
||||
"""List models available from the configured LLM provider."""
|
||||
config = await _get_llm_config()
|
||||
provider = _resolve_provider(config)
|
||||
|
||||
if not provider['service_url'] or not provider['api_key']:
|
||||
return {'error': 'LLM not configured'}
|
||||
|
||||
# Try to call /v1/models endpoint
|
||||
url = f"{provider['service_url']}/models"
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {provider["api_key"]}',
|
||||
}
|
||||
|
||||
try:
|
||||
async with ClientSession() as session:
|
||||
async with session.get(url, headers=headers,
|
||||
timeout=ClientTimeout(total=30)) as resp:
|
||||
if resp.status == 200:
|
||||
return await resp.json()
|
||||
else:
|
||||
return {'error': f'HTTP {resp.status}'}
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
|
||||
|
||||
async def llm_simple(prompt: str, system: str = None, **kwargs) -> str:
|
||||
"""
|
||||
Simplified LLM call: returns just the response text.
|
||||
|
||||
Args:
|
||||
prompt: User message text
|
||||
system: Optional system prompt
|
||||
**kwargs: Passed to llm_chat
|
||||
|
||||
Returns:
|
||||
Response content string, or error message.
|
||||
"""
|
||||
messages = []
|
||||
if system:
|
||||
messages.append({'role': 'system', 'content': system})
|
||||
messages.append({'role': 'user', 'content': prompt})
|
||||
|
||||
result = await llm_chat(messages=messages, **kwargs)
|
||||
|
||||
if 'error' in result:
|
||||
return f"Error: {result['error'].get('message', 'Unknown error')}"
|
||||
|
||||
choices = result.get('choices', [])
|
||||
if choices:
|
||||
return choices[0].get('message', {}).get('content', '')
|
||||
return ''
|
||||
|
||||
|
||||
async def llm_get_config() -> Dict[str, Any]:
|
||||
"""Get current LLM client configuration (with api_key masked)."""
|
||||
config = await _get_llm_config()
|
||||
provider = _resolve_provider(config)
|
||||
|
||||
return {
|
||||
'provider': provider['provider'],
|
||||
'service_url': provider['service_url'],
|
||||
'api_key': '***' + provider['api_key'][-4:] if provider['api_key'] else '(not set)',
|
||||
'default_model': provider['model'],
|
||||
'default_temperature': config.get('default_temperature', 0.7),
|
||||
}
|
||||
@ -9,7 +9,7 @@
|
||||
"updated_at": "2026-04-15 21:06:00"
|
||||
},
|
||||
{
|
||||
"id": "default_memory_notes_1",
|
||||
"id": "default_memory_notes_1",
|
||||
"user_id": "user_1",
|
||||
"target": "memory",
|
||||
"content": "Default memory notes for Hermes Agent module - User 1",
|
||||
@ -25,7 +25,7 @@
|
||||
"updated_at": "2026-04-15 21:06:00"
|
||||
},
|
||||
{
|
||||
"id": "default_memory_notes_2",
|
||||
"id": "default_memory_notes_2",
|
||||
"user_id": "user_2",
|
||||
"target": "memory",
|
||||
"content": "Default memory notes for Hermes Agent module - User 2",
|
||||
@ -70,7 +70,15 @@
|
||||
"auto_cleanup_enabled": "1",
|
||||
"min_retention_days": 30,
|
||||
"created_at": "2026-04-20 10:48:00",
|
||||
"updated_at": "2026-04-20 10:48:00"
|
||||
"updated_at": "2026-04-20 10:48:00",
|
||||
"llm_provider": "dashscope",
|
||||
"llm_service_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
|
||||
"llm_api_key": "",
|
||||
"available_models": "[\"qwen3-max\", \"qwen-plus\", \"qwen-turbo\"]",
|
||||
"default_model": "qwen-plus",
|
||||
"default_temperature": 0.7,
|
||||
"top_p": 1.0,
|
||||
"enable_streaming": "1"
|
||||
},
|
||||
{
|
||||
"id": "default_agent_config_user_2",
|
||||
@ -84,7 +92,15 @@
|
||||
"auto_cleanup_enabled": "1",
|
||||
"min_retention_days": 30,
|
||||
"created_at": "2026-04-20 10:48:00",
|
||||
"updated_at": "2026-04-20 10:48:00"
|
||||
"updated_at": "2026-04-20 10:48:00",
|
||||
"llm_provider": "dashscope",
|
||||
"llm_service_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
|
||||
"llm_api_key": "",
|
||||
"available_models": "[\"qwen3-max\", \"qwen-plus\", \"qwen-turbo\"]",
|
||||
"default_model": "qwen-plus",
|
||||
"default_temperature": 0.7,
|
||||
"top_p": 1.0,
|
||||
"enable_streaming": "1"
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -21,6 +21,16 @@
|
||||
{"value": "1", "text": "Enabled"},
|
||||
{"value": "0", "text": "Disabled"}
|
||||
]
|
||||
},
|
||||
"llm_provider": {
|
||||
"uitype": "code",
|
||||
"data": [
|
||||
{"value": "dashscope", "text": "阿里云 DashScope"},
|
||||
{"value": "openai", "text": "OpenAI"},
|
||||
{"value": "deepseek", "text": "DeepSeek"},
|
||||
{"value": "siliconflow", "text": "SiliconFlow"},
|
||||
{"value": "", "text": "自定义 (Custom URL)"}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@ -97,6 +97,15 @@
|
||||
"nullable": "no",
|
||||
"default": "0.7"
|
||||
},
|
||||
{
|
||||
"name": "top_p",
|
||||
"title": "Default top_p for LLM calls",
|
||||
"type": "float",
|
||||
"length": 5,
|
||||
"dec": 2,
|
||||
"nullable": "no",
|
||||
"default": "1.00"
|
||||
},
|
||||
{
|
||||
"name": "enable_streaming",
|
||||
"title": "Enable streaming response for LLM calls",
|
||||
@ -105,6 +114,15 @@
|
||||
"nullable": "no",
|
||||
"default": "1"
|
||||
},
|
||||
{
|
||||
"name": "llm_provider",
|
||||
"title": "LLM provider preset name",
|
||||
"type": "str",
|
||||
"length": 32,
|
||||
"nullable": "yes",
|
||||
"default": "dashscope",
|
||||
"comments": "Provider preset: openai, dashscope, deepseek, siliconflow, or custom (empty)"
|
||||
},
|
||||
{
|
||||
"name": "llm_service_url",
|
||||
"title": "LLM service base URL (OpenAI-compatible endpoint)",
|
||||
|
||||
@ -8,11 +8,8 @@ version = "1.0.0"
|
||||
description = "Hermes Agent module - multi-user AI agent with memory, skills, workflows, and remote skill deployment"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"ahserver",
|
||||
"sqlor",
|
||||
"apppublic",
|
||||
"appbase",
|
||||
"rbac",
|
||||
"bricks_for_python",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@ -1,46 +0,0 @@
|
||||
"""
|
||||
OpenAI-compatible /v1/chat/completions endpoint
|
||||
Accepts POST with JSON body matching OpenAI API format.
|
||||
"""
|
||||
import json
|
||||
|
||||
async def main():
|
||||
# Read request body
|
||||
body = {}
|
||||
try:
|
||||
raw_body = await request.read()
|
||||
if raw_body:
|
||||
body = json.loads(raw_body)
|
||||
except Exception as e:
|
||||
result = {
|
||||
'error': {
|
||||
'message': f'Invalid JSON body: {str(e)}',
|
||||
'type': 'invalid_request_error',
|
||||
'code': 400,
|
||||
}
|
||||
}
|
||||
return json.dumps(result, ensure_ascii=False)
|
||||
|
||||
# Pass the request object for streaming support
|
||||
body['_request'] = request
|
||||
|
||||
# Call the LLM API handler
|
||||
result = await harnessed_llm_chat_completions(body)
|
||||
|
||||
# Handle streaming response (StreamResponse)
|
||||
from aiohttp.web_response import StreamResponse
|
||||
if isinstance(result, StreamResponse):
|
||||
return result
|
||||
|
||||
# Handle error response
|
||||
if 'error' in result:
|
||||
status_code = result.get('error', {}).get('code', 500)
|
||||
resp = web.Response(
|
||||
status=status_code,
|
||||
body=json.dumps(result, ensure_ascii=False),
|
||||
content_type='application/json'
|
||||
)
|
||||
return resp
|
||||
|
||||
# Return successful response
|
||||
return json.dumps(result, ensure_ascii=False)
|
||||
@ -1,41 +0,0 @@
|
||||
"""
|
||||
OpenAI-compatible /v1/completions endpoint (legacy)
|
||||
Accepts POST with JSON body matching OpenAI completions API format.
|
||||
"""
|
||||
import json
|
||||
|
||||
async def main():
|
||||
# Read request body
|
||||
body = {}
|
||||
try:
|
||||
raw_body = await request.read()
|
||||
if raw_body:
|
||||
body = json.loads(raw_body)
|
||||
except Exception as e:
|
||||
result = {
|
||||
'error': {
|
||||
'message': f'Invalid JSON body: {str(e)}',
|
||||
'type': 'invalid_request_error',
|
||||
'code': 400,
|
||||
}
|
||||
}
|
||||
return json.dumps(result, ensure_ascii=False)
|
||||
|
||||
body['_request'] = request
|
||||
|
||||
result = await harnessed_llm_completions(body)
|
||||
|
||||
from aiohttp.web_response import StreamResponse
|
||||
if isinstance(result, StreamResponse):
|
||||
return result
|
||||
|
||||
if 'error' in result:
|
||||
status_code = result.get('error', {}).get('code', 500)
|
||||
resp = web.Response(
|
||||
status=status_code,
|
||||
body=json.dumps(result, ensure_ascii=False),
|
||||
content_type='application/json'
|
||||
)
|
||||
return resp
|
||||
|
||||
return json.dumps(result, ensure_ascii=False)
|
||||
@ -1,9 +0,0 @@
|
||||
"""
|
||||
OpenAI-compatible /v1/models endpoint
|
||||
Returns list of available models.
|
||||
"""
|
||||
import json
|
||||
|
||||
async def main():
|
||||
result = await harnessed_llm_models()
|
||||
return json.dumps(result, ensure_ascii=False)
|
||||
Loading…
x
Reference in New Issue
Block a user