refactor: LLM client for calling supplier LLM APIs (not server)

Replaces wrong-direction llm_api.py (which served OpenAI endpoints) with llm_client.py -- a client that calls supplier LLM APIs. New module: llm_client.py - llm_chat(messages, model, temperature, ...) -> OpenAI response dict - llm_chat_stream(messages, ...) -> async generator of SSE chunks - llm_simple(prompt, system) -> plain text response - llm_list_models() -> list available models from provider - llm_get_config() -> show current config (key masked) - Supports provider presets: openai, dashscope, deepseek, siliconflow - Retry with exponential backoff (3 attempts) - 429 rate limit handling with Retry-After - Structured logging via appPublic.log Model changes (harnessed_agent_config): - Add llm_provider (preset name: dashscope/openai/deepseek/siliconflow) - Add top_p field - llm_service_url defaults to DashScope compatible endpoint Other: - Remove wrong-direction /v1/ endpoints - Fix pyproject.toml deps: only sqlor + bricks_for_python - Update init/data.json seed data with LLM config fields - Update CRUD view with llm_provider dropdown
2026-05-07 11:57:04 +08:00 · 2026-05-07 11:57:04 +08:00 · 4e65ff8fe4
commit 4e65ff8fe4
parent e4f935de07
10 changed files with 536 additions and 575 deletions
--- a/harnessed_agent/init.py
+++ b/harnessed_agent/init.py
@ -20,10 +20,12 @@ from .config_functions import (
    harnessed_get_agent_config,
    harnessed_save_agent_config
 )
-from .llm_api import (
+from .llm_client import (
-    harnessed_llm_chat_completions,
+    llm_chat,
-    harnessed_llm_models,
+    llm_chat_stream,
-    harnessed_llm_completions,
+    llm_list_models,
    llm_simple,
    llm_get_config,
 )
 def load_harnessed_agent():
@ -49,7 +51,9 @@ def load_harnessed_agent():
    env.harnessed_get_agent_config = harnessed_get_agent_config
    env.harnessed_save_agent_config = harnessed_save_agent_config
-    # OpenAI-compatible LLM API
+    # LLM client -- calls supplier LLM APIs (OpenAI-compatible)
-    env.harnessed_llm_chat_completions = harnessed_llm_chat_completions
+    env.llm_chat = llm_chat
-    env.harnessed_llm_models = harnessed_llm_models
+    env.llm_chat_stream = llm_chat_stream
-    env.harnessed_llm_completions = harnessed_llm_completions
+    env.llm_list_models = llm_list_models
    env.llm_simple = llm_simple
    env.llm_get_config = llm_get_config
--- a/harnessed_agent/llm_api.py
+++ b/harnessed_agent/llm_api.py
@ -1,463 +0,0 @@
 """
 OpenAI-compatible LLM API for Hermes Agent
 Provides /v1/chat/completions and /v1/models endpoints compatible with OpenAI API spec.
 """
 import json
 import uuid
 import time
 import asyncio
 from typing import Dict, Any, List, Optional, AsyncGenerator
 from datetime import datetime
 try:
    from aiohttp import ClientSession, ClientTimeout
    from aiohttp.client_exceptions import ClientError
    HAS_AIOHTTP = True
 except ImportError:
    HAS_AIOHTTP = False
 try:
    from ahserver.serverenv import ServerEnv
    from sqlor.dbpools import DBPools
    from appPublic.worker import awaitify
    from appPublic.log import info, debug, warning, error, exception
 except ImportError:
    class ServerEnv:
        pass
    class DBPools:
        pass
    def awaitify(f):
        return f
    def info(*a, **kw): print(*a)
    def debug(*a, **kw): pass
    def warning(*a, **kw): print(*a)
    def error(*a, **kw): print(*a)
    def exception(*a, **kw): pass
 def _now_iso():
    return datetime.utcnow().isoformat() + 'Z'
 def _now_ts():
    return int(time.time())
 def _get_default_llm_config(user_id: str = None) -> Dict[str, Any]:
    """Get LLM service config from harnessed_agent_config table."""
    try:
        dbname = ServerEnv().get_module_dbname('harnessed_agent')
    except Exception:
        dbname = 'default'
    try:
        import asyncio
        async def _fetch():
            async with DBPools().sqlorContext(dbname) as sor:
                filters = {}
                if user_id:
                    filters['user_id'] = user_id
                rows = await sor.R('harnessed_agent_config', filters,
                                   orderby='updated_at DESC', limit=1)
                if rows:
                    return rows[0]
            return None
        try:
            loop = asyncio.get_event_loop()
            if loop.is_running():
                # Already in async context, need to use create_task pattern
                # but for simplicity in .dspy context we return None and let caller handle
                return None
            else:
                config = loop.run_until_complete(_fetch())
                return config
        except RuntimeError:
            return None
    except Exception:
        return None
 async def _async_get_llm_config(user_id: str = None) -> Dict[str, Any]:
    """Async version to get LLM service config."""
    try:
        env = ServerEnv()
        dbname = env.get_module_dbname('harnessed_agent')
    except Exception:
        dbname = 'default'
    try:
        async with DBPools().sqlorContext(dbname) as sor:
            filters = {}
            if user_id:
                filters['user_id'] = user_id
            rows = await sor.R('harnessed_agent_config', filters,
                               orderby='updated_at DESC', limit=1)
            if rows:
                return rows[0]
    except Exception as e:
        print(f"Error fetching LLM config: {e}")
    return None
 async def _call_llm_api(
    service_url: str,
    api_key: str,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
    stream: bool = False,
    top_p: float = 1.0,
    **kwargs
 ) -> Any:
    """Call an OpenAI-compatible LLM API endpoint."""
    url = service_url.rstrip('/') + '/chat/completions'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    }
    body = {
        'model': model,
        'messages': messages,
        'temperature': temperature,
        'top_p': top_p,
        'stream': stream,
    }
    if max_tokens is not None:
        body['max_tokens'] = max_tokens
    # Pass through any additional OpenAI-compatible parameters
    for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools', 'tool_choice',
                'response_format', 'seed'):
        if key in kwargs and kwargs[key] is not None:
            body[key] = kwargs[key]
    if stream:
        return await _stream_llm_response(url, headers, body)
    else:
        return await _sync_llm_response(url, headers, body)
 async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, Any]:
    """Make a non-streaming LLM API call with retry logic."""
    max_retries = 3
    base_delay = 2  # seconds
    for attempt in range(max_retries):
        try:
            async with ClientSession() as session:
                async with session.post(url, headers=headers, json=body,
                                        timeout=ClientTimeout(total=300)) as resp:
                    if resp.status == 429:
                        # Rate limited - respect Retry-After header
                        retry_after = int(resp.headers.get('Retry-After', base_delay * (2 ** attempt)))
                        if attempt < max_retries - 1:
                            warning(f"LLM API rate limited (429), retrying after {retry_after}s (attempt {attempt+1}/{max_retries})")
                            await asyncio.sleep(retry_after)
                            continue
                        else:
                            return {
                                'error': {
                                    'message': 'Rate limited by LLM provider. Please retry later.',
                                    'type': 'rate_limit_error',
                                    'code': 429,
                                }
                            }
                    if resp.status == 500 and attempt < max_retries - 1:
                        # Transient server error - retry
                        delay = base_delay * (2 ** attempt)
                        warning(f"LLM API server error (500), retrying after {delay}s (attempt {attempt+1}/{max_retries})")
                        await asyncio.sleep(delay)
                        continue
                    if resp.status != 200:
                        error_text = await resp.text()
                        return {
                            'error': {
                                'message': f'LLM API error: HTTP {resp.status}',
                                'type': 'api_error',
                                'code': resp.status,
                                'detail': error_text[:500],
                            }
                        }
                    data = await resp.json()
                    return data
        except asyncio.TimeoutError:
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)
                warning(f"LLM API timeout, retrying after {delay}s (attempt {attempt+1}/{max_retries})")
                await asyncio.sleep(delay)
                continue
            return {
                'error': {
                    'message': 'LLM API request timed out after 300s',
                    'type': 'timeout_error',
                    'code': 504,
                }
            }
        except ClientError as e:
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)
                warning(f"LLM API connection error: {e}, retrying after {delay}s")
                await asyncio.sleep(delay)
                continue
            return {
                'error': {
                    'message': f'LLM API connection failed: {str(e)}',
                    'type': 'connection_error',
                    'code': 502,
                }
            }
    return {
        'error': {
            'message': 'LLM API request failed after all retries',
            'type': 'server_error',
            'code': 500,
        }
    }
 async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]:
    """Make a streaming LLM API call, yielding SSE chunks."""
    async with ClientSession() as session:
        async with session.post(url, headers=headers, json=body,
                                timeout=ClientTimeout(total=600)) as resp:
            if resp.status != 200:
                error_text = await resp.text()
                error_data = {
                    'error': {
                        'message': f'LLM API error: HTTP {resp.status}',
                        'type': 'api_error',
                        'code': resp.status,
                        'detail': error_text[:500],
                    }
                }
                yield f"data: {json.dumps(error_data, ensure_ascii=False)}\n\n"
                yield "data: [DONE]\n\n"
                return
            async for line in resp.content:
                line = line.decode('utf-8').strip()
                if not line:
                    continue
                if line.startswith('data: '):
                    yield line + '\n\n'
 # ============================================================
 # Public API functions (registered to ServerEnv via init.py)
 # ============================================================
 async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
    """
    OpenAI-compatible /v1/chat/completions endpoint.
    Args:
        body: dict with keys: model, messages, temperature, max_tokens,
              stream, top_p, and other OpenAI-compatible params.
    Returns:
        If stream=False: dict matching OpenAI chat completion response.
        If stream=True: aiohttp.StreamResponse with SSE.
    """
    from aiohttp import web
    model = body.get('model', 'default')
    messages = body.get('messages', [])
    temperature = body.get('temperature', 0.7)
    max_tokens = body.get('max_tokens', None)
    stream = body.get('stream', False)
    top_p = body.get('top_p', 1.0)
    start_time = time.time()
    # Get LLM service config
    config = await _async_get_llm_config()
    if not config:
        error("LLM service not configured: no harnessed_agent_config found")
        return {
            'error': {
                'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.',
                'type': 'configuration_error',
                'code': 503,
            }
        }
    service_url = config.get('llm_service_url') or config.get('api_endpoint')
    api_key = config.get('llm_api_key') or config.get('api_key')
    if not service_url or not api_key:
        error(f"LLM service misconfigured: service_url={'set' if service_url else 'MISSING'}, api_key={'set' if api_key else 'MISSING'}")
        return {
            'error': {
                'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.',
                'type': 'configuration_error',
                'code': 503,
            }
        }
    # Use default model from config if request model is 'default' or empty
    default_model = config.get('default_model', 'qwen3-max')
    if not model or model == 'default':
        model = default_model
    info(f"LLM chat request: model={model}, stream={stream}, messages={len(messages)}, temperature={temperature}")
    # Pass through extra params
    extra_params = {}
    for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
                'tool_choice', 'response_format', 'seed'):
        if key in body:
            extra_params[key] = body[key]
    if stream:
        resp = web.StreamResponse(
            status=200,
            reason='OK',
            headers={
                'Content-Type': 'text/event-stream',
                'Cache-Control': 'no-cache',
                'Connection': 'keep-alive',
                'X-Accel-Buffering': 'no',
            }
        )
        await resp.prepare(body.get('_request'))
        request_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
        created = _now_ts()
        try:
            async for chunk_line in _stream_llm_response(
                service_url, {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'},
                {
                    'model': model,
                    'messages': messages,
                    'temperature': temperature,
                    'top_p': top_p,
                    'stream': True,
                    'max_tokens': max_tokens,
                    **extra_params,
                }
            ):
                await resp.write(chunk_line.encode('utf-8') if isinstance(chunk_line, str) else chunk_line)
                await resp.drain()
        except Exception as e:
            error_data = {
                'error': {
                    'message': f'Streaming error: {str(e)}',
                    'type': 'server_error',
                    'code': 500,
                }
            }
            await resp.write(f"data: {json.dumps(error_data)}\n\n".encode('utf-8'))
            await resp.drain()
        await resp.write(b"data: [DONE]\n\n")
        await resp.drain()
        return resp
    else:
        result = await _call_llm_api(
            service_url=service_url,
            api_key=api_key,
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=False,
            top_p=top_p,
            **extra_params,
        )
        elapsed = time.time() - start_time
        if 'error' in result:
            error(f"LLM chat error: model={model}, elapsed={elapsed:.2f}s, error={result.get('error',{}).get('message','')}")
        else:
            usage = result.get('usage', {})
            info(f"LLM chat done: model={model}, elapsed={elapsed:.2f}s, tokens_in={usage.get('prompt_tokens','?')}, tokens_out={usage.get('completion_tokens','?')}")
        return result
 async def harnessed_llm_models() -> Dict[str, Any]:
    """
    OpenAI-compatible /v1/models endpoint.
    Returns list of available models from config.
    """
    config = await _async_get_llm_config()
    models_list = []
    # Add default model from config
    default_model = 'qwen3-max'
    if config:
        default_model = config.get('default_model', 'qwen3-max')
    # If available_models is configured as JSON string, parse it
    if config and config.get('available_models'):
        try:
            models_str = config['available_models']
            if isinstance(models_str, str):
                model_names = json.loads(models_str)
            else:
                model_names = models_str
            for m in model_names:
                if isinstance(m, str):
                    models_list.append({
                        'id': m,
                        'object': 'model',
                        'created': _now_ts(),
                        'owned_by': 'harnessed_agent',
                    })
                elif isinstance(m, dict):
                    models_list.append(m)
        except Exception:
            pass
    # Always include the default model if not already listed
    existing_ids = {m['id'] for m in models_list}
    if default_model not in existing_ids:
        models_list.insert(0, {
            'id': default_model,
            'object': 'model',
            'created': _now_ts(),
            'owned_by': 'harnessed_agent',
        })
    return {
        'object': 'list',
        'data': models_list,
    }
 async def harnessed_llm_completions(body: Dict[str, Any]) -> Dict[str, Any]:
    """
    OpenAI-compatible /v1/completions endpoint (legacy, non-chat).
    Converts prompt to messages format internally.
    """
    prompt = body.get('prompt', '')
    model = body.get('model', 'default')
    temperature = body.get('temperature', 0.7)
    max_tokens = body.get('max_tokens', None)
    stream = body.get('stream', False)
    # Convert to chat format
    messages = [{'role': 'user', 'content': prompt}]
    # Build request body for chat completions
    chat_body = {
        'model': model,
        'messages': messages,
        'temperature': temperature,
        'max_tokens': max_tokens,
        'stream': stream,
        '_request': body.get('_request'),
    }
    return await harnessed_llm_chat_completions(chat_body)
--- a/harnessed_agent/llm_client.py
+++ b/harnessed_agent/llm_client.py
@ -0,0 +1,475 @@
 """
 LLM Client for Hermes Agent - Calls supplier LLM APIs via OpenAI-compatible interface.
 This module provides a client-side implementation for calling external LLM providers
 (OpenAI, DashScope, DeepSeek, etc.) through their OpenAI-compatible /v1/chat/completions API.
 Usage:
    # From .dspy files:
    result = await llm_chat(messages=[{"role": "user", "content": "Hello"}])
    # With explicit config:
    result = await llm_chat(
        messages=[{"role": "user", "content": "Hello"}],
        model="qwen3-max",
        temperature=0.7,
        stream=False
    )
    # Stream mode:
    async for chunk in llm_chat_stream(messages=[...]):
        print(chunk.get("delta", ""))
 """
 import json
 import time
 import asyncio
 from typing import Dict, Any, List, Optional, AsyncGenerator
 from datetime import datetime
 try:
    from aiohttp import ClientSession, ClientTimeout, ClientError
 except ImportError:
    ClientError = Exception
 try:
    from ahserver.serverenv import ServerEnv
    from sqlor.dbpools import DBPools
    from appPublic.log import info, debug, warning, error, exception
 except ImportError:
    class ServerEnv:
        pass
    class DBPools:
        pass
    def info(*a, **kw): print(*a)
    def debug(*a, **kw): pass
    def warning(*a, **kw): print(*a)
    def error(*a, **kw): print(*a)
    def exception(*a, **kw): pass
 # ============================================================
 # LLM Provider configurations
 # ============================================================
 LLM_PROVIDERS = {
    'openai': {
        'url': 'https://api.openai.com/v1',
        'model_default': 'gpt-4o',
    },
    'dashscope': {
        'url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'model_default': 'qwen-plus',
    },
    'deepseek': {
        'url': 'https://api.deepseek.com/v1',
        'model_default': 'deepseek-chat',
    },
    'siliconflow': {
        'url': 'https://api.siliconflow.cn/v1',
        'model_default': 'Qwen/Qwen2.5-72B-Instruct',
    },
 }
 # ============================================================
 # Config retrieval
 # ============================================================
 async def _get_llm_config() -> Dict[str, Any]:
    """Get LLM client configuration from harnessed_agent_config table."""
    try:
        env = ServerEnv()
        dbname = env.get_module_dbname('harnessed_agent')
    except Exception:
        dbname = 'default'
    try:
        async with DBPools().sqlorContext(dbname) as sor:
            rows = await sor.R('harnessed_agent_config', {},
                               orderby='updated_at DESC', limit=1)
            if rows:
                return rows[0]
    except Exception as e:
        error(f"Failed to fetch LLM config: {e}")
    return {}
 def _resolve_provider(config: Dict[str, Any]) -> Dict[str, str]:
    """Resolve base URL and model from config, with provider presets."""
    provider = (config.get('llm_provider') or '').lower()
    service_url = config.get('llm_service_url', '')
    api_key = config.get('llm_api_key', '')
    model = config.get('default_model', '')
    # If provider name is set, use preset URL
    if provider and provider in LLM_PROVIDERS:
        preset = LLM_PROVIDERS[provider]
        if not service_url:
            service_url = preset['url']
        if not model:
            model = preset['model_default']
    return {
        'service_url': service_url.rstrip('/'),
        'api_key': api_key,
        'model': model,
        'provider': provider,
    }
 # ============================================================
 # Core LLM client
 # ============================================================
 async def _post_chat_completions(
    service_url: str,
    api_key: str,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
    stream: bool = False,
    top_p: float = 1.0,
    extra: Optional[Dict] = None,
 ) -> Dict[str, Any]:
    """
    Call OpenAI-compatible /v1/chat/completions endpoint.
    Returns dict with OpenAI response format or error dict.
    """
    if not service_url:
        return {'error': {'message': 'llm_service_url not configured', 'type': 'configuration_error'}}
    if not api_key:
        return {'error': {'message': 'llm_api_key not configured', 'type': 'configuration_error'}}
    url = f"{service_url}/chat/completions"
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    }
    body = {
        'model': model,
        'messages': messages,
        'temperature': temperature,
        'top_p': top_p,
        'stream': stream,
    }
    if max_tokens is not None:
        body['max_tokens'] = max_tokens
    if extra:
        for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
                    'tool_choice', 'response_format', 'seed'):
            if key in extra and extra[key] is not None:
                body[key] = extra[key]
    max_retries = 3
    base_delay = 2
    for attempt in range(max_retries):
        try:
            async with ClientSession() as session:
                async with session.post(url, headers=headers, json=body,
                                        timeout=ClientTimeout(total=300)) as resp:
                    if resp.status == 429:
                        retry_after = int(resp.headers.get('Retry-After', base_delay * (2 ** attempt)))
                        if attempt < max_retries - 1:
                            warning(f"LLM rate limited (429), retrying in {retry_after}s (attempt {attempt+1}/{max_retries})")
                            await asyncio.sleep(retry_after)
                            continue
                        return {'error': {'message': 'Rate limited by LLM provider', 'type': 'rate_limit_error', 'code': 429}}
                    if resp.status == 500 and attempt < max_retries - 1:
                        delay = base_delay * (2 ** attempt)
                        warning(f"LLM server error (500), retrying in {delay}s (attempt {attempt+1}/{max_retries})")
                        await asyncio.sleep(delay)
                        continue
                    if resp.status != 200:
                        err_text = await resp.text()
                        return {
                            'error': {
                                'message': f'LLM API error: HTTP {resp.status}',
                                'type': 'api_error',
                                'code': resp.status,
                                'detail': err_text[:500],
                            }
                        }
                    return await resp.json()
        except asyncio.TimeoutError:
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)
                warning(f"LLM API timeout, retrying in {delay}s (attempt {attempt+1}/{max_retries})")
                await asyncio.sleep(delay)
                continue
            return {'error': {'message': 'LLM API request timed out', 'type': 'timeout_error', 'code': 504}}
        except ClientError as e:
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)
                warning(f"LLM connection error: {e}, retrying in {delay}s")
                await asyncio.sleep(delay)
                continue
            return {'error': {'message': f'LLM connection failed: {str(e)}', 'type': 'connection_error', 'code': 502}}
    return {'error': {'message': 'LLM API failed after all retries', 'type': 'server_error', 'code': 500}}
 async def _stream_chat_completions(
    service_url: str,
    api_key: str,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
    top_p: float = 1.0,
    extra: Optional[Dict] = None,
 ) -> AsyncGenerator[Dict[str, Any], None]:
    """
    Stream LLM response via SSE. Yields parsed chunk dicts.
    Each yielded dict contains:
        - delta: str (the text chunk)
        - finish_reason: str or None
        - raw: dict (the raw SSE chunk data)
    """
    if not service_url:
        yield {'delta': '', 'finish_reason': 'error', 'raw': {'error': 'llm_service_url not configured'}}
        return
    if not api_key:
        yield {'delta': '', 'finish_reason': 'error', 'raw': {'error': 'llm_api_key not configured'}}
        return
    url = f"{service_url}/chat/completions"
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    }
    body = {
        'model': model,
        'messages': messages,
        'temperature': temperature,
        'top_p': top_p,
        'stream': True,
    }
    if max_tokens is not None:
        body['max_tokens'] = max_tokens
    if extra:
        for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
                    'tool_choice', 'response_format', 'seed'):
            if key in extra and extra[key] is not None:
                body[key] = extra[key]
    async with ClientSession() as session:
        async with session.post(url, headers=headers, json=body,
                                timeout=ClientTimeout(total=600)) as resp:
            if resp.status != 200:
                err_text = await resp.text()
                yield {
                    'delta': '',
                    'finish_reason': 'error',
                    'raw': {'error': f'HTTP {resp.status}', 'detail': err_text[:300]},
                }
                return
            async for line in resp.content:
                line = line.decode('utf-8').strip()
                if not line or not line.startswith('data:'):
                    continue
                data_str = line[5:].strip()
                if data_str == '[DONE]':
                    break
                try:
                    chunk = json.loads(data_str)
                    choices = chunk.get('choices', [])
                    if choices:
                        choice = choices[0]
                        delta = choice.get('delta', {})
                        text = delta.get('content', '') or ''
                        finish_reason = choice.get('finish_reason')
                        yield {
                            'delta': text,
                            'finish_reason': finish_reason,
                            'raw': chunk,
                        }
                except json.JSONDecodeError:
                    continue
 # ============================================================
 # Public API functions (registered to ServerEnv)
 # ============================================================
 async def llm_chat(
    messages: List[Dict[str, str]],
    model: str = None,
    temperature: float = None,
    max_tokens: int = None,
    stream: bool = False,
    top_p: float = None,
    **extra,
 ) -> Dict[str, Any]:
    """
    Call LLM provider and get chat completion.
    This is the primary function for AI calls within harnessed_agent.
    Reads provider config from harnessed_agent_config table automatically.
    Args:
        messages: List of {role, content} dicts (OpenAI format)
        model: Override model name (uses config default_model if not set)
        temperature: Override temperature (uses config default_temperature if not set)
        max_tokens: Max response tokens
        stream: If True, uses streaming
        top_p: Override top_p
        **extra: Other OpenAI-compatible params (stop, tools, etc.)
    Returns:
        Dict matching OpenAI chat completion response, or error dict.
    """
    config = await _get_llm_config()
    provider = _resolve_provider(config)
    resolved_model = model or provider['model']
    resolved_temp = temperature if temperature is not None else config.get('default_temperature', 0.7)
    resolved_top_p = top_p if top_p is not None else config.get('top_p', 1.0)
    start_time = time.time()
    info(f"LLM chat: model={resolved_model}, temp={resolved_temp}, messages={len(messages)}")
    result = await _post_chat_completions(
        service_url=provider['service_url'],
        api_key=provider['api_key'],
        model=resolved_model,
        messages=messages,
        temperature=resolved_temp,
        max_tokens=max_tokens,
        stream=False,
        top_p=resolved_top_p,
        extra=extra if extra else None,
    )
    elapsed = time.time() - start_time
    if 'error' in result:
        error(f"LLM chat error: model={resolved_model}, elapsed={elapsed:.2f}s, error={result['error'].get('message')}")
    else:
        usage = result.get('usage', {})
        info(f"LLM chat done: model={resolved_model}, elapsed={elapsed:.2f}s, prompt_tokens={usage.get('prompt_tokens')}, completion_tokens={usage.get('completion_tokens')}")
    return result
 async def llm_chat_stream(
    messages: List[Dict[str, str]],
    model: str = None,
    temperature: float = None,
    max_tokens: int = None,
    top_p: float = None,
    **extra,
 ) -> AsyncGenerator[Dict[str, Any], None]:
    """
    Stream LLM chat response.
    Yields dicts with {delta: str, finish_reason: str|None, raw: dict}.
    Example:
        async for chunk in llm_chat_stream(messages=[{"role": "user", "content": "Hello"}]):
            print(chunk['delta'], end='', flush=True)
    """
    config = await _get_llm_config()
    provider = _resolve_provider(config)
    resolved_model = model or provider['model']
    resolved_temp = temperature if temperature is not None else config.get('default_temperature', 0.7)
    resolved_top_p = top_p if top_p is not None else config.get('top_p', 1.0)
    info(f"LLM chat stream: model={resolved_model}, temp={resolved_temp}, messages={len(messages)}")
    async for chunk in _stream_chat_completions(
        service_url=provider['service_url'],
        api_key=provider['api_key'],
        model=resolved_model,
        messages=messages,
        temperature=resolved_temp,
        max_tokens=max_tokens,
        top_p=resolved_top_p,
        extra=extra if extra else None,
    ):
        yield chunk
 async def llm_list_models() -> Dict[str, Any]:
    """List models available from the configured LLM provider."""
    config = await _get_llm_config()
    provider = _resolve_provider(config)
    if not provider['service_url'] or not provider['api_key']:
        return {'error': 'LLM not configured'}
    # Try to call /v1/models endpoint
    url = f"{provider['service_url']}/models"
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {provider["api_key"]}',
    }
    try:
        async with ClientSession() as session:
            async with session.get(url, headers=headers,
                                   timeout=ClientTimeout(total=30)) as resp:
                if resp.status == 200:
                    return await resp.json()
                else:
                    return {'error': f'HTTP {resp.status}'}
    except Exception as e:
        return {'error': str(e)}
 async def llm_simple(prompt: str, system: str = None, **kwargs) -> str:
    """
    Simplified LLM call: returns just the response text.
    Args:
        prompt: User message text
        system: Optional system prompt
        **kwargs: Passed to llm_chat
    Returns:
        Response content string, or error message.
    """
    messages = []
    if system:
        messages.append({'role': 'system', 'content': system})
    messages.append({'role': 'user', 'content': prompt})
    result = await llm_chat(messages=messages, **kwargs)
    if 'error' in result:
        return f"Error: {result['error'].get('message', 'Unknown error')}"
    choices = result.get('choices', [])
    if choices:
        return choices[0].get('message', {}).get('content', '')
    return ''
 async def llm_get_config() -> Dict[str, Any]:
    """Get current LLM client configuration (with api_key masked)."""
    config = await _get_llm_config()
    provider = _resolve_provider(config)
    return {
        'provider': provider['provider'],
        'service_url': provider['service_url'],
        'api_key': '***' + provider['api_key'][-4:] if provider['api_key'] else '(not set)',
        'default_model': provider['model'],
        'default_temperature': config.get('default_temperature', 0.7),
    }
--- a/init/data.json
+++ b/init/data.json
@ -9,7 +9,7 @@
      "updated_at": "2026-04-15 21:06:00"
    },
    {
-      "id": "default_memory_notes_1", 
+      "id": "default_memory_notes_1",
      "user_id": "user_1",
      "target": "memory",
      "content": "Default memory notes for Hermes Agent module - User 1",
@ -25,7 +25,7 @@
      "updated_at": "2026-04-15 21:06:00"
    },
    {
-      "id": "default_memory_notes_2", 
+      "id": "default_memory_notes_2",
      "user_id": "user_2",
      "target": "memory",
      "content": "Default memory notes for Hermes Agent module - User 2",
@ -70,7 +70,15 @@
      "auto_cleanup_enabled": "1",
      "min_retention_days": 30,
      "created_at": "2026-04-20 10:48:00",
-      "updated_at": "2026-04-20 10:48:00"
+      "updated_at": "2026-04-20 10:48:00",
      "llm_provider": "dashscope",
      "llm_service_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
      "llm_api_key": "",
      "available_models": "[\"qwen3-max\", \"qwen-plus\", \"qwen-turbo\"]",
      "default_model": "qwen-plus",
      "default_temperature": 0.7,
      "top_p": 1.0,
      "enable_streaming": "1"
    },
    {
      "id": "default_agent_config_user_2",
@ -84,7 +92,15 @@
      "auto_cleanup_enabled": "1",
      "min_retention_days": 30,
      "created_at": "2026-04-20 10:48:00",
-      "updated_at": "2026-04-20 10:48:00"
+      "updated_at": "2026-04-20 10:48:00",
      "llm_provider": "dashscope",
      "llm_service_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
      "llm_api_key": "",
      "available_models": "[\"qwen3-max\", \"qwen-plus\", \"qwen-turbo\"]",
      "default_model": "qwen-plus",
      "default_temperature": 0.7,
      "top_p": 1.0,
      "enable_streaming": "1"
    }
  ]
 }
--- a/json/harnessed_agent_config_view.json
+++ b/json/harnessed_agent_config_view.json
@ -21,6 +21,16 @@
                        {"value": "1", "text": "Enabled"},
                        {"value": "0", "text": "Disabled"}
                    ]
                },
                "llm_provider": {
                    "uitype": "code",
                    "data": [
                        {"value": "dashscope", "text": "阿里云 DashScope"},
                        {"value": "openai", "text": "OpenAI"},
                        {"value": "deepseek", "text": "DeepSeek"},
                        {"value": "siliconflow", "text": "SiliconFlow"},
                        {"value": "", "text": "自定义 (Custom URL)"}
                    ]
                }
            }
        },
--- a/models/harnessed_agent_config.json
+++ b/models/harnessed_agent_config.json
@ -97,6 +97,15 @@
            "nullable": "no",
            "default": "0.7"
        },
        {
            "name": "top_p",
            "title": "Default top_p for LLM calls",
            "type": "float",
            "length": 5,
            "dec": 2,
            "nullable": "no",
            "default": "1.00"
        },
        {
            "name": "enable_streaming",
            "title": "Enable streaming response for LLM calls",
@ -105,6 +114,15 @@
            "nullable": "no",
            "default": "1"
        },
        {
            "name": "llm_provider",
            "title": "LLM provider preset name",
            "type": "str",
            "length": 32,
            "nullable": "yes",
            "default": "dashscope",
            "comments": "Provider preset: openai, dashscope, deepseek, siliconflow, or custom (empty)"
        },
        {
            "name": "llm_service_url",
            "title": "LLM service base URL (OpenAI-compatible endpoint)",
--- a/pyproject.toml
+++ b/pyproject.toml
@ -8,11 +8,8 @@ version = "1.0.0"
 description = "Hermes Agent module - multi-user AI agent with memory, skills, workflows, and remote skill deployment"
 requires-python = ">=3.10"
 dependencies = [
    "ahserver",
    "sqlor",
-    "apppublic",
+    "bricks_for_python",
    "appbase",
    "rbac",
 ]
 [project.optional-dependencies]
--- a/wwwroot/v1/chat/completions.dspy
+++ b/wwwroot/v1/chat/completions.dspy
@ -1,46 +0,0 @@
 """
 OpenAI-compatible /v1/chat/completions endpoint
 Accepts POST with JSON body matching OpenAI API format.
 """
 import json
 async def main():
    # Read request body
    body = {}
    try:
        raw_body = await request.read()
        if raw_body:
            body = json.loads(raw_body)
    except Exception as e:
        result = {
            'error': {
                'message': f'Invalid JSON body: {str(e)}',
                'type': 'invalid_request_error',
                'code': 400,
            }
        }
        return json.dumps(result, ensure_ascii=False)
    # Pass the request object for streaming support
    body['_request'] = request
    # Call the LLM API handler
    result = await harnessed_llm_chat_completions(body)
    # Handle streaming response (StreamResponse)
    from aiohttp.web_response import StreamResponse
    if isinstance(result, StreamResponse):
        return result
    # Handle error response
    if 'error' in result:
        status_code = result.get('error', {}).get('code', 500)
        resp = web.Response(
            status=status_code,
            body=json.dumps(result, ensure_ascii=False),
            content_type='application/json'
        )
        return resp
    # Return successful response
    return json.dumps(result, ensure_ascii=False)
--- a/wwwroot/v1/completions.dspy
+++ b/wwwroot/v1/completions.dspy
@ -1,41 +0,0 @@
 """
 OpenAI-compatible /v1/completions endpoint (legacy)
 Accepts POST with JSON body matching OpenAI completions API format.
 """
 import json
 async def main():
    # Read request body
    body = {}
    try:
        raw_body = await request.read()
        if raw_body:
            body = json.loads(raw_body)
    except Exception as e:
        result = {
            'error': {
                'message': f'Invalid JSON body: {str(e)}',
                'type': 'invalid_request_error',
                'code': 400,
            }
        }
        return json.dumps(result, ensure_ascii=False)
    body['_request'] = request
    result = await harnessed_llm_completions(body)
    from aiohttp.web_response import StreamResponse
    if isinstance(result, StreamResponse):
        return result
    if 'error' in result:
        status_code = result.get('error', {}).get('code', 500)
        resp = web.Response(
            status=status_code,
            body=json.dumps(result, ensure_ascii=False),
            content_type='application/json'
        )
        return resp
    return json.dumps(result, ensure_ascii=False)
--- a/wwwroot/v1/models.dspy
+++ b/wwwroot/v1/models.dspy
@ -1,9 +0,0 @@
 """
 OpenAI-compatible /v1/models endpoint
 Returns list of available models.
 """
 import json
 async def main():
    result = await harnessed_llm_models()
    return json.dumps(result, ensure_ascii=False)