refactor: LLM client for calling supplier LLM APIs (not server)

Replaces the wrong-direction llm_api.py (which exposed OpenAI-compatible
server endpoints) with llm_client.py, a client that calls supplier LLM APIs.

New module: llm_client.py
  - llm_chat(messages, model, temperature, ...) -> OpenAI response dict
  - llm_chat_stream(messages, ...) -> async generator of SSE chunks
  - llm_simple(prompt, system) -> plain text response
  - llm_list_models() -> list available models from provider
  - llm_get_config() -> show current config (key masked)
  - Supports provider presets: openai, dashscope, deepseek, siliconflow
  - Retry with exponential backoff (3 attempts)
  - 429 rate limit handling with Retry-After
  - Structured logging via appPublic.log

Model changes (harnessed_agent_config):
  - Add llm_provider (preset name: dashscope/openai/deepseek/siliconflow)
  - Add top_p field
  - llm_service_url defaults to DashScope compatible endpoint

Other:
  - Remove wrong-direction /v1/ endpoints
  - Fix pyproject.toml deps: only sqlor + bricks_for_python
  - Update init/data.json seed data with LLM config fields
  - Update CRUD view with llm_provider dropdown
This commit is contained in:
yumoqing 2026-05-07 11:57:04 +08:00
parent e4f935de07
commit 4e65ff8fe4
10 changed files with 536 additions and 575 deletions

View File

@ -20,10 +20,12 @@ from .config_functions import (
harnessed_get_agent_config, harnessed_get_agent_config,
harnessed_save_agent_config harnessed_save_agent_config
) )
from .llm_api import ( from .llm_client import (
harnessed_llm_chat_completions, llm_chat,
harnessed_llm_models, llm_chat_stream,
harnessed_llm_completions, llm_list_models,
llm_simple,
llm_get_config,
) )
def load_harnessed_agent(): def load_harnessed_agent():
@ -49,7 +51,9 @@ def load_harnessed_agent():
env.harnessed_get_agent_config = harnessed_get_agent_config env.harnessed_get_agent_config = harnessed_get_agent_config
env.harnessed_save_agent_config = harnessed_save_agent_config env.harnessed_save_agent_config = harnessed_save_agent_config
# OpenAI-compatible LLM API # LLM client -- calls supplier LLM APIs (OpenAI-compatible)
env.harnessed_llm_chat_completions = harnessed_llm_chat_completions env.llm_chat = llm_chat
env.harnessed_llm_models = harnessed_llm_models env.llm_chat_stream = llm_chat_stream
env.harnessed_llm_completions = harnessed_llm_completions env.llm_list_models = llm_list_models
env.llm_simple = llm_simple
env.llm_get_config = llm_get_config

View File

@ -1,463 +0,0 @@
"""
OpenAI-compatible LLM API for Hermes Agent
Provides /v1/chat/completions and /v1/models endpoints compatible with OpenAI API spec.
"""
import json
import uuid
import time
import asyncio
from typing import Dict, Any, List, Optional, AsyncGenerator
from datetime import datetime
try:
from aiohttp import ClientSession, ClientTimeout
from aiohttp.client_exceptions import ClientError
HAS_AIOHTTP = True
except ImportError:
HAS_AIOHTTP = False
try:
from ahserver.serverenv import ServerEnv
from sqlor.dbpools import DBPools
from appPublic.worker import awaitify
from appPublic.log import info, debug, warning, error, exception
except ImportError:
class ServerEnv:
pass
class DBPools:
pass
def awaitify(f):
return f
def info(*a, **kw): print(*a)
def debug(*a, **kw): pass
def warning(*a, **kw): print(*a)
def error(*a, **kw): print(*a)
def exception(*a, **kw): pass
def _now_iso():
return datetime.utcnow().isoformat() + 'Z'
def _now_ts():
return int(time.time())
def _get_default_llm_config(user_id: str = None) -> Dict[str, Any]:
    """Synchronously fetch the newest ``harnessed_agent_config`` row.

    Args:
        user_id: When given, only rows of this user are considered.

    Returns:
        The most recently updated row, or ``None`` when no row exists, when an
        event loop is already running (the query cannot be driven from here),
        or when anything fails along the way.
    """
    try:
        db_name = ServerEnv().get_module_dbname('harnessed_agent')
    except Exception:
        db_name = 'default'

    async def _load_latest():
        async with DBPools().sqlorContext(db_name) as sor:
            criteria = {'user_id': user_id} if user_id else {}
            matches = await sor.R('harnessed_agent_config', criteria,
                                  orderby='updated_at DESC', limit=1)
            return matches[0] if matches else None

    try:
        loop = asyncio.get_event_loop()
        if loop.is_running():
            # Already inside an async context: the caller has to use the
            # async variant instead, so signal "no config" here.
            return None
        return loop.run_until_complete(_load_latest())
    except Exception:
        # Covers RuntimeError from the loop as well as any database failure.
        return None
async def _async_get_llm_config(user_id: str = None) -> Dict[str, Any]:
    """Fetch the newest ``harnessed_agent_config`` row.

    Args:
        user_id: When given, only rows of this user are considered.

    Returns:
        The most recently updated row, or ``None`` when no row exists or the
        lookup fails. Failures are reported through the module's ``error``
        logger rather than ``print`` so they land in the application log.
    """
    try:
        env = ServerEnv()
        dbname = env.get_module_dbname('harnessed_agent')
    except Exception:
        # Fall back to the default database when the module mapping is missing.
        dbname = 'default'
    try:
        async with DBPools().sqlorContext(dbname) as sor:
            filters = {}
            if user_id:
                filters['user_id'] = user_id
            rows = await sor.R('harnessed_agent_config', filters,
                               orderby='updated_at DESC', limit=1)
            if rows:
                return rows[0]
    except Exception as e:
        error(f"Error fetching LLM config: {e}")
    return None
async def _call_llm_api(
    service_url: str,
    api_key: str,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
    stream: bool = False,
    top_p: float = 1.0,
    **kwargs
) -> Any:
    """Call an OpenAI-compatible ``/chat/completions`` endpoint.

    Args:
        service_url: Provider base URL; a trailing slash is tolerated.
        api_key: Bearer token sent in the ``Authorization`` header.
        model: Model identifier.
        messages: Chat messages in OpenAI format.
        temperature: Sampling temperature.
        max_tokens: Optional completion token limit (omitted when ``None``).
        stream: Select the streaming or the buffered code path.
        top_p: Nucleus sampling parameter.
        **kwargs: Optional OpenAI parameters; only the whitelisted keys below
            are forwarded, and only when they are not ``None``.

    Returns:
        ``stream=False``: the response/error dict from ``_sync_llm_response``.
        ``stream=True``: the async generator from ``_stream_llm_response``,
        to be consumed with ``async for``.
    """
    url = service_url.rstrip('/') + '/chat/completions'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    }
    body = {
        'model': model,
        'messages': messages,
        'temperature': temperature,
        'top_p': top_p,
        'stream': stream,
    }
    if max_tokens is not None:
        body['max_tokens'] = max_tokens
    # Pass through any additional OpenAI-compatible parameters
    for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools', 'tool_choice',
                'response_format', 'seed'):
        if key in kwargs and kwargs[key] is not None:
            body[key] = kwargs[key]
    if stream:
        # _stream_llm_response is an async generator function: calling it
        # returns the generator, which must NOT be awaited (awaiting an async
        # generator raises TypeError).
        return _stream_llm_response(url, headers, body)
    return await _sync_llm_response(url, headers, body)
async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, Any]:
    """Make a non-streaming LLM API call with retry logic.

    Up to three attempts are made. HTTP 429, HTTP 500, timeouts and connection
    errors are retried with exponential backoff (2s, 4s); a numeric
    ``Retry-After`` header overrides the backoff for 429 responses.

    Args:
        url: Full ``/chat/completions`` URL.
        headers: Request headers, including authorization.
        body: JSON-serialisable request payload.

    Returns:
        The provider's decoded JSON response on HTTP 200, otherwise a dict of
        the form ``{'error': {'message', 'type', 'code', ...}}``.
    """
    max_retries = 3
    base_delay = 2  # seconds
    for attempt in range(max_retries):
        backoff = base_delay * (2 ** attempt)
        can_retry = attempt < max_retries - 1
        try:
            async with ClientSession() as session:
                async with session.post(url, headers=headers, json=body,
                                        timeout=ClientTimeout(total=300)) as resp:
                    if resp.status == 429:
                        # Retry-After may also be an HTTP-date or otherwise
                        # non-integer; fall back to the computed backoff
                        # instead of crashing on int().
                        try:
                            retry_after = max(0, int(resp.headers.get('Retry-After', backoff)))
                        except (TypeError, ValueError):
                            retry_after = backoff
                        if can_retry:
                            warning(f"LLM API rate limited (429), retrying after {retry_after}s (attempt {attempt+1}/{max_retries})")
                            await asyncio.sleep(retry_after)
                            continue
                        return {
                            'error': {
                                'message': 'Rate limited by LLM provider. Please retry later.',
                                'type': 'rate_limit_error',
                                'code': 429,
                            }
                        }
                    if resp.status == 500 and can_retry:
                        # Transient server error - retry
                        warning(f"LLM API server error (500), retrying after {backoff}s (attempt {attempt+1}/{max_retries})")
                        await asyncio.sleep(backoff)
                        continue
                    if resp.status != 200:
                        error_text = await resp.text()
                        return {
                            'error': {
                                'message': f'LLM API error: HTTP {resp.status}',
                                'type': 'api_error',
                                'code': resp.status,
                                # Truncated so a huge error page does not flood the caller.
                                'detail': error_text[:500],
                            }
                        }
                    return await resp.json()
        except asyncio.TimeoutError:
            if can_retry:
                warning(f"LLM API timeout, retrying after {backoff}s (attempt {attempt+1}/{max_retries})")
                await asyncio.sleep(backoff)
                continue
            return {
                'error': {
                    'message': 'LLM API request timed out after 300s',
                    'type': 'timeout_error',
                    'code': 504,
                }
            }
        except ClientError as e:
            if can_retry:
                warning(f"LLM API connection error: {e}, retrying after {backoff}s")
                await asyncio.sleep(backoff)
                continue
            return {
                'error': {
                    'message': f'LLM API connection failed: {str(e)}',
                    'type': 'connection_error',
                    'code': 502,
                }
            }
    # Defensive fallback; every loop path above returns or continues.
    return {
        'error': {
            'message': 'LLM API request failed after all retries',
            'type': 'server_error',
            'code': 500,
        }
    }
async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]:
    """POST a streaming request and relay the provider's SSE ``data:`` lines.

    On a non-200 status a single error event followed by ``[DONE]`` is
    yielded. Otherwise each non-empty upstream line that starts with
    ``"data: "`` is yielded, terminated by a blank line.
    """
    async with ClientSession() as session:
        async with session.post(url, headers=headers, json=body,
                                timeout=ClientTimeout(total=600)) as upstream:
            if upstream.status != 200:
                failure_body = await upstream.text()
                failure = {
                    'error': {
                        'message': f'LLM API error: HTTP {upstream.status}',
                        'type': 'api_error',
                        'code': upstream.status,
                        'detail': failure_body[:500],
                    }
                }
                yield f"data: {json.dumps(failure, ensure_ascii=False)}\n\n"
                yield "data: [DONE]\n\n"
                return
            async for raw_line in upstream.content:
                event_line = raw_line.decode('utf-8').strip()
                # Blank keep-alive lines and non-data fields are not relayed.
                if event_line and event_line.startswith('data: '):
                    yield event_line + '\n\n'
# ============================================================
# Public API functions (registered to ServerEnv via init.py)
# ============================================================
async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
    """
    OpenAI-compatible /v1/chat/completions endpoint.

    Args:
        body: dict with keys: model, messages, temperature, max_tokens,
            stream, top_p, and other OpenAI-compatible params. For streaming,
            ``body['_request']`` must hold the incoming aiohttp request.

    Returns:
        If stream=False: dict matching OpenAI chat completion response
        (or an ``{'error': {...}}`` dict).
        If stream=True: aiohttp.StreamResponse with SSE.
    """
    from aiohttp import web

    model = body.get('model', 'default')
    messages = body.get('messages', [])
    temperature = body.get('temperature', 0.7)
    max_tokens = body.get('max_tokens', None)
    stream = body.get('stream', False)
    top_p = body.get('top_p', 1.0)
    start_time = time.time()

    # Get LLM service config
    config = await _async_get_llm_config()
    if not config:
        error("LLM service not configured: no harnessed_agent_config found")
        return {
            'error': {
                'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.',
                'type': 'configuration_error',
                'code': 503,
            }
        }
    service_url = config.get('llm_service_url') or config.get('api_endpoint')
    api_key = config.get('llm_api_key') or config.get('api_key')
    if not service_url or not api_key:
        error(f"LLM service misconfigured: service_url={'set' if service_url else 'MISSING'}, api_key={'set' if api_key else 'MISSING'}")
        return {
            'error': {
                'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.',
                'type': 'configuration_error',
                'code': 503,
            }
        }

    # Use default model from config if request model is 'default' or empty
    default_model = config.get('default_model', 'qwen3-max')
    if not model or model == 'default':
        model = default_model
    info(f"LLM chat request: model={model}, stream={stream}, messages={len(messages)}, temperature={temperature}")

    # Pass through extra params; explicit nulls are dropped so both code paths
    # send the same payload (the non-stream helper already filters None).
    extra_params = {}
    for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
                'tool_choice', 'response_format', 'seed'):
        if body.get(key) is not None:
            extra_params[key] = body[key]

    if stream:
        # The streaming helper expects the full endpoint URL, not the base URL.
        upstream_url = service_url.rstrip('/') + '/chat/completions'
        upstream_headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {api_key}',
        }
        upstream_body = {
            'model': model,
            'messages': messages,
            'temperature': temperature,
            'top_p': top_p,
            'stream': True,
            **extra_params,
        }
        if max_tokens is not None:
            # Do not send "max_tokens": null upstream.
            upstream_body['max_tokens'] = max_tokens

        resp = web.StreamResponse(
            status=200,
            reason='OK',
            headers={
                'Content-Type': 'text/event-stream',
                'Cache-Control': 'no-cache',
                'Connection': 'keep-alive',
                'X-Accel-Buffering': 'no',
            }
        )
        await resp.prepare(body.get('_request'))
        done_sent = False
        try:
            async for chunk_line in _stream_llm_response(upstream_url, upstream_headers, upstream_body):
                payload = chunk_line.encode('utf-8') if isinstance(chunk_line, str) else chunk_line
                await resp.write(payload)
                await resp.drain()
                if payload.strip() == b'data: [DONE]':
                    done_sent = True
        except Exception as e:
            error_data = {
                'error': {
                    'message': f'Streaming error: {str(e)}',
                    'type': 'server_error',
                    'code': 500,
                }
            }
            await resp.write(f"data: {json.dumps(error_data)}\n\n".encode('utf-8'))
            await resp.drain()
        if not done_sent:
            # Terminate the stream exactly once, even when upstream failed.
            await resp.write(b"data: [DONE]\n\n")
            await resp.drain()
        return resp

    result = await _call_llm_api(
        service_url=service_url,
        api_key=api_key,
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        stream=False,
        top_p=top_p,
        **extra_params,
    )
    elapsed = time.time() - start_time
    if 'error' in result:
        error(f"LLM chat error: model={model}, elapsed={elapsed:.2f}s, error={result.get('error',{}).get('message','')}")
    else:
        usage = result.get('usage', {})
        info(f"LLM chat done: model={model}, elapsed={elapsed:.2f}s, tokens_in={usage.get('prompt_tokens','?')}, tokens_out={usage.get('completion_tokens','?')}")
    return result
async def harnessed_llm_models() -> Dict[str, Any]:
    """
    OpenAI-compatible /v1/models endpoint.

    Builds the model list from the stored config: ``available_models`` (a JSON
    string or an already-decoded list of names / model dicts) plus the default
    model, which is always listed first when it is not already present.

    Returns:
        ``{'object': 'list', 'data': [...]}``.
    """
    config = await _async_get_llm_config() or {}
    # A missing or NULL default falls back to the built-in default model.
    default_model = config.get('default_model') or 'qwen3-max'

    models_list = []
    configured = config.get('available_models')
    if configured:
        try:
            model_names = json.loads(configured) if isinstance(configured, str) else configured
        except ValueError as e:
            # Report the malformed setting instead of silently ignoring it.
            warning(f"Ignoring invalid available_models setting: {e}")
            model_names = []
        if not isinstance(model_names, (list, tuple)):
            warning("Ignoring available_models setting: expected a list")
            model_names = []
        for m in model_names:
            if isinstance(m, str):
                models_list.append({
                    'id': m,
                    'object': 'model',
                    'created': _now_ts(),
                    'owned_by': 'harnessed_agent',
                })
            elif isinstance(m, dict) and m.get('id'):
                # Entries without an id cannot be addressed by clients and
                # previously crashed the id lookup below.
                models_list.append(m)

    # Always include the default model if not already listed
    existing_ids = {m['id'] for m in models_list}
    if default_model not in existing_ids:
        models_list.insert(0, {
            'id': default_model,
            'object': 'model',
            'created': _now_ts(),
            'owned_by': 'harnessed_agent',
        })
    return {
        'object': 'list',
        'data': models_list,
    }
async def harnessed_llm_completions(body: Dict[str, Any]) -> Dict[str, Any]:
    """
    OpenAI-compatible /v1/completions endpoint (legacy, non-chat).

    Wraps the plain ``prompt`` in a single user message and delegates to the
    chat-completions handler, forwarding model, temperature, max_tokens,
    stream and the request object.
    """
    chat_request = {
        'model': body.get('model', 'default'),
        # The legacy prompt becomes the only message of the conversation.
        'messages': [{'role': 'user', 'content': body.get('prompt', '')}],
        'temperature': body.get('temperature', 0.7),
        'max_tokens': body.get('max_tokens', None),
        'stream': body.get('stream', False),
        '_request': body.get('_request'),
    }
    return await harnessed_llm_chat_completions(chat_request)

View File

@ -0,0 +1,475 @@
"""
LLM Client for Hermes Agent - Calls supplier LLM APIs via OpenAI-compatible interface.
This module provides a client-side implementation for calling external LLM providers
(OpenAI, DashScope, DeepSeek, etc.) through their OpenAI-compatible /v1/chat/completions API.
Usage:
# From .dspy files:
result = await llm_chat(messages=[{"role": "user", "content": "Hello"}])
# With explicit config:
result = await llm_chat(
messages=[{"role": "user", "content": "Hello"}],
model="qwen3-max",
temperature=0.7,
stream=False
)
# Stream mode:
async for chunk in llm_chat_stream(messages=[...]):
print(chunk.get("delta", ""))
"""
import json
import time
import asyncio
from typing import Dict, Any, List, Optional, AsyncGenerator
from datetime import datetime
try:
from aiohttp import ClientSession, ClientTimeout, ClientError
except ImportError:
ClientError = Exception
try:
from ahserver.serverenv import ServerEnv
from sqlor.dbpools import DBPools
from appPublic.log import info, debug, warning, error, exception
except ImportError:
class ServerEnv:
pass
class DBPools:
pass
def info(*a, **kw): print(*a)
def debug(*a, **kw): pass
def warning(*a, **kw): print(*a)
def error(*a, **kw): print(*a)
def exception(*a, **kw): pass
# ============================================================
# LLM Provider configurations
# ============================================================
# Preset table keyed by lower-case provider name (matched against the
# `llm_provider` config value in _resolve_provider).
#   url           - base URL used when the config has no llm_service_url;
#                   "/chat/completions" and "/models" are appended to it.
#   model_default - model used when the config has no default_model.
LLM_PROVIDERS = {
'openai': {
'url': 'https://api.openai.com/v1',
'model_default': 'gpt-4o',
},
'dashscope': {
'url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
'model_default': 'qwen-plus',
},
'deepseek': {
'url': 'https://api.deepseek.com/v1',
'model_default': 'deepseek-chat',
},
'siliconflow': {
'url': 'https://api.siliconflow.cn/v1',
'model_default': 'Qwen/Qwen2.5-72B-Instruct',
},
}
# ============================================================
# Config retrieval
# ============================================================
async def _get_llm_config() -> Dict[str, Any]:
    """Load the most recently updated ``harnessed_agent_config`` row.

    Returns:
        The newest row, or an empty dict when the table has no rows or the
        lookup fails (the failure is logged, not raised).
    """
    try:
        db_name = ServerEnv().get_module_dbname('harnessed_agent')
    except Exception:
        # No module-specific database mapping available: use the default.
        db_name = 'default'
    try:
        async with DBPools().sqlorContext(db_name) as sor:
            found = await sor.R('harnessed_agent_config', {},
                                orderby='updated_at DESC', limit=1)
            if not found:
                return {}
            return found[0]
    except Exception as exc:
        error(f"Failed to fetch LLM config: {exc}")
    return {}
def _resolve_provider(config: Dict[str, Any]) -> Dict[str, str]:
"""Resolve base URL and model from config, with provider presets."""
provider = (config.get('llm_provider') or '').lower()
service_url = config.get('llm_service_url', '')
api_key = config.get('llm_api_key', '')
model = config.get('default_model', '')
# If provider name is set, use preset URL
if provider and provider in LLM_PROVIDERS:
preset = LLM_PROVIDERS[provider]
if not service_url:
service_url = preset['url']
if not model:
model = preset['model_default']
return {
'service_url': service_url.rstrip('/'),
'api_key': api_key,
'model': model,
'provider': provider,
}
# ============================================================
# Core LLM client
# ============================================================
async def _post_chat_completions(
service_url: str,
api_key: str,
model: str,
messages: List[Dict[str, str]],
temperature: float = 0.7,
max_tokens: Optional[int] = None,
stream: bool = False,
top_p: float = 1.0,
extra: Optional[Dict] = None,
) -> Dict[str, Any]:
"""
Call OpenAI-compatible /v1/chat/completions endpoint.
Returns dict with OpenAI response format or error dict.
"""
if not service_url:
return {'error': {'message': 'llm_service_url not configured', 'type': 'configuration_error'}}
if not api_key:
return {'error': {'message': 'llm_api_key not configured', 'type': 'configuration_error'}}
url = f"{service_url}/chat/completions"
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}',
}
body = {
'model': model,
'messages': messages,
'temperature': temperature,
'top_p': top_p,
'stream': stream,
}
if max_tokens is not None:
body['max_tokens'] = max_tokens
if extra:
for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
'tool_choice', 'response_format', 'seed'):
if key in extra and extra[key] is not None:
body[key] = extra[key]
max_retries = 3
base_delay = 2
for attempt in range(max_retries):
try:
async with ClientSession() as session:
async with session.post(url, headers=headers, json=body,
timeout=ClientTimeout(total=300)) as resp:
if resp.status == 429:
retry_after = int(resp.headers.get('Retry-After', base_delay * (2 ** attempt)))
if attempt < max_retries - 1:
warning(f"LLM rate limited (429), retrying in {retry_after}s (attempt {attempt+1}/{max_retries})")
await asyncio.sleep(retry_after)
continue
return {'error': {'message': 'Rate limited by LLM provider', 'type': 'rate_limit_error', 'code': 429}}
if resp.status == 500 and attempt < max_retries - 1:
delay = base_delay * (2 ** attempt)
warning(f"LLM server error (500), retrying in {delay}s (attempt {attempt+1}/{max_retries})")
await asyncio.sleep(delay)
continue
if resp.status != 200:
err_text = await resp.text()
return {
'error': {
'message': f'LLM API error: HTTP {resp.status}',
'type': 'api_error',
'code': resp.status,
'detail': err_text[:500],
}
}
return await resp.json()
except asyncio.TimeoutError:
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt)
warning(f"LLM API timeout, retrying in {delay}s (attempt {attempt+1}/{max_retries})")
await asyncio.sleep(delay)
continue
return {'error': {'message': 'LLM API request timed out', 'type': 'timeout_error', 'code': 504}}
except ClientError as e:
if attempt < max_retries - 1:
delay = base_delay * (2 ** attempt)
warning(f"LLM connection error: {e}, retrying in {delay}s")
await asyncio.sleep(delay)
continue
return {'error': {'message': f'LLM connection failed: {str(e)}', 'type': 'connection_error', 'code': 502}}
return {'error': {'message': 'LLM API failed after all retries', 'type': 'server_error', 'code': 500}}
async def _stream_chat_completions(
service_url: str,
api_key: str,
model: str,
messages: List[Dict[str, str]],
temperature: float = 0.7,
max_tokens: Optional[int] = None,
top_p: float = 1.0,
extra: Optional[Dict] = None,
) -> AsyncGenerator[Dict[str, Any], None]:
"""
Stream LLM response via SSE. Yields parsed chunk dicts.
Each yielded dict contains:
- delta: str (the text chunk)
- finish_reason: str or None
- raw: dict (the raw SSE chunk data)
"""
if not service_url:
yield {'delta': '', 'finish_reason': 'error', 'raw': {'error': 'llm_service_url not configured'}}
return
if not api_key:
yield {'delta': '', 'finish_reason': 'error', 'raw': {'error': 'llm_api_key not configured'}}
return
url = f"{service_url}/chat/completions"
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {api_key}',
}
body = {
'model': model,
'messages': messages,
'temperature': temperature,
'top_p': top_p,
'stream': True,
}
if max_tokens is not None:
body['max_tokens'] = max_tokens
if extra:
for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
'tool_choice', 'response_format', 'seed'):
if key in extra and extra[key] is not None:
body[key] = extra[key]
async with ClientSession() as session:
async with session.post(url, headers=headers, json=body,
timeout=ClientTimeout(total=600)) as resp:
if resp.status != 200:
err_text = await resp.text()
yield {
'delta': '',
'finish_reason': 'error',
'raw': {'error': f'HTTP {resp.status}', 'detail': err_text[:300]},
}
return
async for line in resp.content:
line = line.decode('utf-8').strip()
if not line or not line.startswith('data:'):
continue
data_str = line[5:].strip()
if data_str == '[DONE]':
break
try:
chunk = json.loads(data_str)
choices = chunk.get('choices', [])
if choices:
choice = choices[0]
delta = choice.get('delta', {})
text = delta.get('content', '') or ''
finish_reason = choice.get('finish_reason')
yield {
'delta': text,
'finish_reason': finish_reason,
'raw': chunk,
}
except json.JSONDecodeError:
continue
# ============================================================
# Public API functions (registered to ServerEnv)
# ============================================================
async def llm_chat(
    messages: List[Dict[str, str]],
    model: str = None,
    temperature: float = None,
    max_tokens: int = None,
    stream: bool = False,
    top_p: float = None,
    **extra,
) -> Dict[str, Any]:
    """
    Call LLM provider and get chat completion.

    This is the primary function for AI calls within harnessed_agent.
    Reads provider config from harnessed_agent_config table automatically.

    Args:
        messages: List of {role, content} dicts (OpenAI format)
        model: Override model name (uses config default_model if not set)
        temperature: Override temperature (uses config default_temperature if not set)
        max_tokens: Max response tokens
        stream: Accepted for call compatibility only. This function always
            performs a buffered (non-streaming) request and returns one dict;
            use ``llm_chat_stream`` for streaming. A warning is logged when
            ``True`` is passed.
        top_p: Override top_p
        **extra: Other OpenAI-compatible params (stop, tools, etc.)

    Returns:
        Dict matching OpenAI chat completion response, or error dict.
    """
    if stream:
        warning("llm_chat ignores stream=True and returns a complete response; use llm_chat_stream for streaming")

    config = await _get_llm_config()
    provider = _resolve_provider(config)
    resolved_model = model or provider['model']

    # NULL config values fall back to the built-in defaults as well.
    config_temp = config.get('default_temperature')
    config_top_p = config.get('top_p')
    if temperature is not None:
        resolved_temp = temperature
    else:
        resolved_temp = config_temp if config_temp is not None else 0.7
    if top_p is not None:
        resolved_top_p = top_p
    else:
        resolved_top_p = config_top_p if config_top_p is not None else 1.0

    start_time = time.time()
    info(f"LLM chat: model={resolved_model}, temp={resolved_temp}, messages={len(messages)}")
    result = await _post_chat_completions(
        service_url=provider['service_url'],
        api_key=provider['api_key'],
        model=resolved_model,
        messages=messages,
        temperature=resolved_temp,
        max_tokens=max_tokens,
        stream=False,
        top_p=resolved_top_p,
        extra=extra if extra else None,
    )
    elapsed = time.time() - start_time

    if isinstance(result, dict) and 'error' in result:
        failure = result['error']
        # Providers may report "error" as a plain string instead of an object.
        message = failure.get('message') if isinstance(failure, dict) else failure
        error(f"LLM chat error: model={resolved_model}, elapsed={elapsed:.2f}s, error={message}")
    elif isinstance(result, dict):
        usage = result.get('usage') or {}
        info(f"LLM chat done: model={resolved_model}, elapsed={elapsed:.2f}s, prompt_tokens={usage.get('prompt_tokens')}, completion_tokens={usage.get('completion_tokens')}")
    return result
async def llm_chat_stream(
    messages: List[Dict[str, str]],
    model: str = None,
    temperature: float = None,
    max_tokens: int = None,
    top_p: float = None,
    **extra,
) -> AsyncGenerator[Dict[str, Any], None]:
    """
    Stream a chat completion from the configured LLM provider.

    Model, temperature and top_p default to the stored configuration when not
    passed explicitly. Each yielded item is a dict with
    {delta: str, finish_reason: str|None, raw: dict}.

    Example:
        async for chunk in llm_chat_stream(messages=[{"role": "user", "content": "Hello"}]):
            print(chunk['delta'], end='', flush=True)
    """
    config = await _get_llm_config()
    provider = _resolve_provider(config)

    chosen_model = model if model else provider['model']
    if temperature is None:
        chosen_temp = config.get('default_temperature', 0.7)
    else:
        chosen_temp = temperature
    if top_p is None:
        chosen_top_p = config.get('top_p', 1.0)
    else:
        chosen_top_p = top_p

    info(f"LLM chat stream: model={chosen_model}, temp={chosen_temp}, messages={len(messages)}")

    call_args = {
        'service_url': provider['service_url'],
        'api_key': provider['api_key'],
        'model': chosen_model,
        'messages': messages,
        'temperature': chosen_temp,
        'max_tokens': max_tokens,
        'top_p': chosen_top_p,
        # An empty kwargs dict is passed on as None.
        'extra': extra if extra else None,
    }
    async for piece in _stream_chat_completions(**call_args):
        yield piece
async def llm_list_models() -> Dict[str, Any]:
    """Query the configured provider's ``/models`` endpoint.

    Returns:
        The provider's decoded JSON response on HTTP 200; otherwise a dict
        ``{'error': <text>}`` describing the missing configuration, the HTTP
        status, or the exception that occurred.
    """
    provider = _resolve_provider(await _get_llm_config())
    base_url = provider['service_url']
    key = provider['api_key']
    if not (base_url and key):
        return {'error': 'LLM not configured'}

    # Try to call /v1/models endpoint
    models_url = f"{base_url}/models"
    request_headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {key}',
    }
    try:
        async with ClientSession() as session:
            async with session.get(models_url, headers=request_headers,
                                   timeout=ClientTimeout(total=30)) as resp:
                if resp.status != 200:
                    return {'error': f'HTTP {resp.status}'}
                return await resp.json()
    except Exception as exc:
        return {'error': str(exc)}
async def llm_simple(prompt: str, system: str = None, **kwargs) -> str:
    """
    Simplified LLM call: returns just the response text.

    Args:
        prompt: User message text
        system: Optional system prompt
        **kwargs: Passed to llm_chat

    Returns:
        Response content string, or ``"Error: <message>"`` on failure. Always
        a string: a reply without text content (e.g. ``content: null`` for a
        tool-call response) yields ``''``.
    """
    messages = []
    if system:
        messages.append({'role': 'system', 'content': system})
    messages.append({'role': 'user', 'content': prompt})

    result = await llm_chat(messages=messages, **kwargs)
    if 'error' in result:
        failure = result['error']
        # Providers may report "error" as a plain string instead of an object.
        if isinstance(failure, dict):
            detail = failure.get('message', 'Unknown error')
        else:
            detail = failure or 'Unknown error'
        return f"Error: {detail}"

    choices = result.get('choices') or []
    if not choices:
        return ''
    message = choices[0].get('message') or {}
    return message.get('content') or ''
async def llm_get_config() -> Dict[str, Any]:
    """Get current LLM client configuration (with api_key masked).

    Returns:
        Dict with ``provider``, ``service_url``, ``api_key`` (masked),
        ``default_model`` and ``default_temperature``. The key is shown as
        ``'***'`` plus its last four characters only when it is longer than
        eight characters; shorter keys are fully masked so the "mask" never
        reveals most or all of the secret. An unset key reads ``'(not set)'``.
    """
    config = await _get_llm_config()
    provider = _resolve_provider(config)

    api_key = provider['api_key']
    if not api_key:
        masked_key = '(not set)'
    elif len(api_key) > 8:
        masked_key = '***' + api_key[-4:]
    else:
        masked_key = '***'

    return {
        'provider': provider['provider'],
        'service_url': provider['service_url'],
        'api_key': masked_key,
        'default_model': provider['model'],
        'default_temperature': config.get('default_temperature', 0.7),
    }

View File

@ -9,7 +9,7 @@
"updated_at": "2026-04-15 21:06:00" "updated_at": "2026-04-15 21:06:00"
}, },
{ {
"id": "default_memory_notes_1", "id": "default_memory_notes_1",
"user_id": "user_1", "user_id": "user_1",
"target": "memory", "target": "memory",
"content": "Default memory notes for Hermes Agent module - User 1", "content": "Default memory notes for Hermes Agent module - User 1",
@ -25,7 +25,7 @@
"updated_at": "2026-04-15 21:06:00" "updated_at": "2026-04-15 21:06:00"
}, },
{ {
"id": "default_memory_notes_2", "id": "default_memory_notes_2",
"user_id": "user_2", "user_id": "user_2",
"target": "memory", "target": "memory",
"content": "Default memory notes for Hermes Agent module - User 2", "content": "Default memory notes for Hermes Agent module - User 2",
@ -70,7 +70,15 @@
"auto_cleanup_enabled": "1", "auto_cleanup_enabled": "1",
"min_retention_days": 30, "min_retention_days": 30,
"created_at": "2026-04-20 10:48:00", "created_at": "2026-04-20 10:48:00",
"updated_at": "2026-04-20 10:48:00" "updated_at": "2026-04-20 10:48:00",
"llm_provider": "dashscope",
"llm_service_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"llm_api_key": "",
"available_models": "[\"qwen3-max\", \"qwen-plus\", \"qwen-turbo\"]",
"default_model": "qwen-plus",
"default_temperature": 0.7,
"top_p": 1.0,
"enable_streaming": "1"
}, },
{ {
"id": "default_agent_config_user_2", "id": "default_agent_config_user_2",
@ -84,7 +92,15 @@
"auto_cleanup_enabled": "1", "auto_cleanup_enabled": "1",
"min_retention_days": 30, "min_retention_days": 30,
"created_at": "2026-04-20 10:48:00", "created_at": "2026-04-20 10:48:00",
"updated_at": "2026-04-20 10:48:00" "updated_at": "2026-04-20 10:48:00",
"llm_provider": "dashscope",
"llm_service_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"llm_api_key": "",
"available_models": "[\"qwen3-max\", \"qwen-plus\", \"qwen-turbo\"]",
"default_model": "qwen-plus",
"default_temperature": 0.7,
"top_p": 1.0,
"enable_streaming": "1"
} }
] ]
} }

View File

@ -21,6 +21,16 @@
{"value": "1", "text": "Enabled"}, {"value": "1", "text": "Enabled"},
{"value": "0", "text": "Disabled"} {"value": "0", "text": "Disabled"}
] ]
},
"llm_provider": {
"uitype": "code",
"data": [
{"value": "dashscope", "text": "阿里云 DashScope"},
{"value": "openai", "text": "OpenAI"},
{"value": "deepseek", "text": "DeepSeek"},
{"value": "siliconflow", "text": "SiliconFlow"},
{"value": "", "text": "自定义 (Custom URL)"}
]
} }
} }
}, },

View File

@ -97,6 +97,15 @@
"nullable": "no", "nullable": "no",
"default": "0.7" "default": "0.7"
}, },
{
"name": "top_p",
"title": "Default top_p for LLM calls",
"type": "float",
"length": 5,
"dec": 2,
"nullable": "no",
"default": "1.00"
},
{ {
"name": "enable_streaming", "name": "enable_streaming",
"title": "Enable streaming response for LLM calls", "title": "Enable streaming response for LLM calls",
@ -105,6 +114,15 @@
"nullable": "no", "nullable": "no",
"default": "1" "default": "1"
}, },
{
"name": "llm_provider",
"title": "LLM provider preset name",
"type": "str",
"length": 32,
"nullable": "yes",
"default": "dashscope",
"comments": "Provider preset: openai, dashscope, deepseek, siliconflow, or custom (empty)"
},
{ {
"name": "llm_service_url", "name": "llm_service_url",
"title": "LLM service base URL (OpenAI-compatible endpoint)", "title": "LLM service base URL (OpenAI-compatible endpoint)",

View File

@ -8,11 +8,8 @@ version = "1.0.0"
description = "Hermes Agent module - multi-user AI agent with memory, skills, workflows, and remote skill deployment" description = "Hermes Agent module - multi-user AI agent with memory, skills, workflows, and remote skill deployment"
requires-python = ">=3.10" requires-python = ">=3.10"
dependencies = [ dependencies = [
"ahserver",
"sqlor", "sqlor",
"apppublic", "bricks_for_python",
"appbase",
"rbac",
] ]
[project.optional-dependencies] [project.optional-dependencies]

View File

@ -1,46 +0,0 @@
"""
OpenAI-compatible /v1/chat/completions endpoint
Accepts POST with JSON body matching OpenAI API format.
"""
import json
async def main():
    """Handle a POST to the chat-completions endpoint.

    Parses the JSON request body, hands it (together with the request object)
    to ``harnessed_llm_chat_completions`` and converts the outcome into a
    stream response, an error response, or a JSON string.
    """
    payload = {}
    try:
        raw = await request.read()
        if raw:
            payload = json.loads(raw)
    except Exception as exc:
        invalid = {
            'error': {
                'message': f'Invalid JSON body: {str(exc)}',
                'type': 'invalid_request_error',
                'code': 400,
            }
        }
        return json.dumps(invalid, ensure_ascii=False)

    # The handler needs the request object to prepare a streaming response.
    payload['_request'] = request
    outcome = await harnessed_llm_chat_completions(payload)

    from aiohttp.web_response import StreamResponse
    if isinstance(outcome, StreamResponse):
        # Streaming: the SSE response has already been written.
        return outcome

    if 'error' not in outcome:
        return json.dumps(outcome, ensure_ascii=False)

    # Error: mirror the error code as the HTTP status.
    http_status = outcome.get('error', {}).get('code', 500)
    return web.Response(
        status=http_status,
        body=json.dumps(outcome, ensure_ascii=False),
        content_type='application/json'
    )

View File

@ -1,41 +0,0 @@
"""
OpenAI-compatible /v1/completions endpoint (legacy)
Accepts POST with JSON body matching OpenAI completions API format.
"""
import json
async def main():
    """Handle a POST to the legacy completions endpoint.

    Parses the JSON request body, hands it (together with the request object)
    to ``harnessed_llm_completions`` and converts the outcome into a stream
    response, an error response, or a JSON string.
    """
    payload = {}
    try:
        raw = await request.read()
        if raw:
            payload = json.loads(raw)
    except Exception as exc:
        invalid = {
            'error': {
                'message': f'Invalid JSON body: {str(exc)}',
                'type': 'invalid_request_error',
                'code': 400,
            }
        }
        return json.dumps(invalid, ensure_ascii=False)

    payload['_request'] = request
    outcome = await harnessed_llm_completions(payload)

    from aiohttp.web_response import StreamResponse
    if isinstance(outcome, StreamResponse):
        return outcome

    if 'error' not in outcome:
        return json.dumps(outcome, ensure_ascii=False)

    # Error: mirror the error code as the HTTP status.
    http_status = outcome.get('error', {}).get('code', 500)
    return web.Response(
        status=http_status,
        body=json.dumps(outcome, ensure_ascii=False),
        content_type='application/json'
    )

View File

@ -1,9 +0,0 @@
"""
OpenAI-compatible /v1/models endpoint
Returns list of available models.
"""
import json
async def main():
    """Return the model list from ``harnessed_llm_models`` as a JSON string."""
    return json.dumps(await harnessed_llm_models(), ensure_ascii=False)