refactor: LLM client for calling supplier LLM APIs (not server)

Replaces wrong-direction llm_api.py (which served OpenAI endpoints) with llm_client.py -- a client that calls supplier LLM APIs. New module: llm_client.py - llm_chat(messages, model, temperature, ...) -> OpenAI response dict - llm_chat_stream(messages, ...) -> async generator of SSE chunks - llm_simple(prompt, system) -> plain text response - llm_list_models() -> list available models from provider - llm_get_config() -> show current config (key masked) - Supports provider presets: openai, dashscope, deepseek, siliconflow - Retry with exponential backoff (3 attempts) - 429 rate limit handling with Retry-After - Structured logging via appPublic.log Model changes (harnessed_agent_config): - Add llm_provider (preset name: dashscope/openai/deepseek/siliconflow) - Add top_p field - llm_service_url defaults to DashScope compatible endpoint Other: - Remove wrong-direction /v1/ endpoints - Fix pyproject.toml deps: only sqlor + bricks_for_python - Update init/data.json seed data with LLM config fields - Update CRUD view with llm_provider dropdown
2026-05-07 11:57:04 +08:00 · 2026-05-07 11:57:04 +08:00 · 4e65ff8fe4
commit 4e65ff8fe4
parent e4f935de07
10 changed files with 536 additions and 575 deletions
--- a/harnessed_agent/init.py
+++ b/harnessed_agent/init.py
@ -20,10 +20,12 @@ from .config_functions import (
    harnessed_get_agent_config,
    harnessed_save_agent_config
 )
-from .llm_api import (
-    harnessed_llm_chat_completions,
-    harnessed_llm_models,
-    harnessed_llm_completions,
+from .llm_client import (
+    llm_chat,
+    llm_chat_stream,
+    llm_list_models,
+    llm_simple,
+    llm_get_config,
 )

 def load_harnessed_agent():
@ -49,7 +51,9 @@ def load_harnessed_agent():
    env.harnessed_get_agent_config = harnessed_get_agent_config
    env.harnessed_save_agent_config = harnessed_save_agent_config

-    # OpenAI-compatible LLM API
-    env.harnessed_llm_chat_completions = harnessed_llm_chat_completions
-    env.harnessed_llm_models = harnessed_llm_models
-    env.harnessed_llm_completions = harnessed_llm_completions
+    # LLM client -- calls supplier LLM APIs (OpenAI-compatible)
+    env.llm_chat = llm_chat
+    env.llm_chat_stream = llm_chat_stream
+    env.llm_list_models = llm_list_models
+    env.llm_simple = llm_simple
+    env.llm_get_config = llm_get_config
--- a/harnessed_agent/llm_api.py
+++ b/harnessed_agent/llm_api.py
@ -1,463 +0,0 @@
-"""
-OpenAI-compatible LLM API for Hermes Agent
-Provides /v1/chat/completions and /v1/models endpoints compatible with OpenAI API spec.
-"""
-import json
-import uuid
-import time
-import asyncio
-from typing import Dict, Any, List, Optional, AsyncGenerator
-from datetime import datetime
-
-try:
-    from aiohttp import ClientSession, ClientTimeout
-    from aiohttp.client_exceptions import ClientError
-    HAS_AIOHTTP = True
-except ImportError:
-    HAS_AIOHTTP = False
-
-try:
-    from ahserver.serverenv import ServerEnv
-    from sqlor.dbpools import DBPools
-    from appPublic.worker import awaitify
-    from appPublic.log import info, debug, warning, error, exception
-except ImportError:
-    class ServerEnv:
-        pass
-    class DBPools:
-        pass
-    def awaitify(f):
-        return f
-    def info(*a, **kw): print(*a)
-    def debug(*a, **kw): pass
-    def warning(*a, **kw): print(*a)
-    def error(*a, **kw): print(*a)
-    def exception(*a, **kw): pass
-
-
-def _now_iso():
-    return datetime.utcnow().isoformat() + 'Z'
-
-
-def _now_ts():
-    return int(time.time())
-
-
-def _get_default_llm_config(user_id: str = None) -> Dict[str, Any]:
-    """Get LLM service config from harnessed_agent_config table."""
-    try:
-        dbname = ServerEnv().get_module_dbname('harnessed_agent')
-    except Exception:
-        dbname = 'default'
-
-    try:
-        import asyncio
-        async def _fetch():
-            async with DBPools().sqlorContext(dbname) as sor:
-                filters = {}
-                if user_id:
-                    filters['user_id'] = user_id
-                rows = await sor.R('harnessed_agent_config', filters,
-                                   orderby='updated_at DESC', limit=1)
-                if rows:
-                    return rows[0]
-            return None
-
-        try:
-            loop = asyncio.get_event_loop()
-            if loop.is_running():
-                # Already in async context, need to use create_task pattern
-                # but for simplicity in .dspy context we return None and let caller handle
-                return None
-            else:
-                config = loop.run_until_complete(_fetch())
-                return config
-        except RuntimeError:
-            return None
-    except Exception:
-        return None
-
-
-async def _async_get_llm_config(user_id: str = None) -> Dict[str, Any]:
-    """Async version to get LLM service config."""
-    try:
-        env = ServerEnv()
-        dbname = env.get_module_dbname('harnessed_agent')
-    except Exception:
-        dbname = 'default'
-
-    try:
-        async with DBPools().sqlorContext(dbname) as sor:
-            filters = {}
-            if user_id:
-                filters['user_id'] = user_id
-            rows = await sor.R('harnessed_agent_config', filters,
-                               orderby='updated_at DESC', limit=1)
-            if rows:
-                return rows[0]
-    except Exception as e:
-        print(f"Error fetching LLM config: {e}")
-    return None
-
-
-async def _call_llm_api(
-    service_url: str,
-    api_key: str,
-    model: str,
-    messages: List[Dict[str, str]],
-    temperature: float = 0.7,
-    max_tokens: Optional[int] = None,
-    stream: bool = False,
-    top_p: float = 1.0,
-    **kwargs
-) -> Any:
-    """Call an OpenAI-compatible LLM API endpoint."""
-    url = service_url.rstrip('/') + '/chat/completions'
-
-    headers = {
-        'Content-Type': 'application/json',
-        'Authorization': f'Bearer {api_key}',
-    }
-
-    body = {
-        'model': model,
-        'messages': messages,
-        'temperature': temperature,
-        'top_p': top_p,
-        'stream': stream,
-    }
-
-    if max_tokens is not None:
-        body['max_tokens'] = max_tokens
-
-    # Pass through any additional OpenAI-compatible parameters
-    for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools', 'tool_choice',
-                'response_format', 'seed'):
-        if key in kwargs and kwargs[key] is not None:
-            body[key] = kwargs[key]
-
-    if stream:
-        return await _stream_llm_response(url, headers, body)
-    else:
-        return await _sync_llm_response(url, headers, body)
-
-
-async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, Any]:
-    """Make a non-streaming LLM API call with retry logic."""
-    max_retries = 3
-    base_delay = 2  # seconds
-
-    for attempt in range(max_retries):
-        try:
-            async with ClientSession() as session:
-                async with session.post(url, headers=headers, json=body,
-                                        timeout=ClientTimeout(total=300)) as resp:
-                    if resp.status == 429:
-                        # Rate limited - respect Retry-After header
-                        retry_after = int(resp.headers.get('Retry-After', base_delay * (2 ** attempt)))
-                        if attempt < max_retries - 1:
-                            warning(f"LLM API rate limited (429), retrying after {retry_after}s (attempt {attempt+1}/{max_retries})")
-                            await asyncio.sleep(retry_after)
-                            continue
-                        else:
-                            return {
-                                'error': {
-                                    'message': 'Rate limited by LLM provider. Please retry later.',
-                                    'type': 'rate_limit_error',
-                                    'code': 429,
-                                }
-                            }
-
-                    if resp.status == 500 and attempt < max_retries - 1:
-                        # Transient server error - retry
-                        delay = base_delay * (2 ** attempt)
-                        warning(f"LLM API server error (500), retrying after {delay}s (attempt {attempt+1}/{max_retries})")
-                        await asyncio.sleep(delay)
-                        continue
-
-                    if resp.status != 200:
-                        error_text = await resp.text()
-                        return {
-                            'error': {
-                                'message': f'LLM API error: HTTP {resp.status}',
-                                'type': 'api_error',
-                                'code': resp.status,
-                                'detail': error_text[:500],
-                            }
-                        }
-
-                    data = await resp.json()
-                    return data
-
-        except asyncio.TimeoutError:
-            if attempt < max_retries - 1:
-                delay = base_delay * (2 ** attempt)
-                warning(f"LLM API timeout, retrying after {delay}s (attempt {attempt+1}/{max_retries})")
-                await asyncio.sleep(delay)
-                continue
-            return {
-                'error': {
-                    'message': 'LLM API request timed out after 300s',
-                    'type': 'timeout_error',
-                    'code': 504,
-                }
-            }
-        except ClientError as e:
-            if attempt < max_retries - 1:
-                delay = base_delay * (2 ** attempt)
-                warning(f"LLM API connection error: {e}, retrying after {delay}s")
-                await asyncio.sleep(delay)
-                continue
-            return {
-                'error': {
-                    'message': f'LLM API connection failed: {str(e)}',
-                    'type': 'connection_error',
-                    'code': 502,
-                }
-            }
-
-    return {
-        'error': {
-            'message': 'LLM API request failed after all retries',
-            'type': 'server_error',
-            'code': 500,
-        }
-    }
-
-
-async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]:
-    """Make a streaming LLM API call, yielding SSE chunks."""
-    async with ClientSession() as session:
-        async with session.post(url, headers=headers, json=body,
-                                timeout=ClientTimeout(total=600)) as resp:
-            if resp.status != 200:
-                error_text = await resp.text()
-                error_data = {
-                    'error': {
-                        'message': f'LLM API error: HTTP {resp.status}',
-                        'type': 'api_error',
-                        'code': resp.status,
-                        'detail': error_text[:500],
-                    }
-                }
-                yield f"data: {json.dumps(error_data, ensure_ascii=False)}\n\n"
-                yield "data: [DONE]\n\n"
-                return
-
-            async for line in resp.content:
-                line = line.decode('utf-8').strip()
-                if not line:
-                    continue
-                if line.startswith('data: '):
-                    yield line + '\n\n'
-
-
-# ============================================================
-# Public API functions (registered to ServerEnv via init.py)
-# ============================================================
-
-async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any:
-    """
-    OpenAI-compatible /v1/chat/completions endpoint.
-    
-    Args:
-        body: dict with keys: model, messages, temperature, max_tokens,
-              stream, top_p, and other OpenAI-compatible params.
-    
-    Returns:
-        If stream=False: dict matching OpenAI chat completion response.
-        If stream=True: aiohttp.StreamResponse with SSE.
-    """
-    from aiohttp import web
-
-    model = body.get('model', 'default')
-    messages = body.get('messages', [])
-    temperature = body.get('temperature', 0.7)
-    max_tokens = body.get('max_tokens', None)
-    stream = body.get('stream', False)
-    top_p = body.get('top_p', 1.0)
-
-    start_time = time.time()
-
-    # Get LLM service config
-    config = await _async_get_llm_config()
-    if not config:
-        error("LLM service not configured: no harnessed_agent_config found")
-        return {
-            'error': {
-                'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.',
-                'type': 'configuration_error',
-                'code': 503,
-            }
-        }
-
-    service_url = config.get('llm_service_url') or config.get('api_endpoint')
-    api_key = config.get('llm_api_key') or config.get('api_key')
-
-    if not service_url or not api_key:
-        error(f"LLM service misconfigured: service_url={'set' if service_url else 'MISSING'}, api_key={'set' if api_key else 'MISSING'}")
-        return {
-            'error': {
-                'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.',
-                'type': 'configuration_error',
-                'code': 503,
-            }
-        }
-
-    # Use default model from config if request model is 'default' or empty
-    default_model = config.get('default_model', 'qwen3-max')
-    if not model or model == 'default':
-        model = default_model
-
-    info(f"LLM chat request: model={model}, stream={stream}, messages={len(messages)}, temperature={temperature}")
-
-    # Pass through extra params
-    extra_params = {}
-    for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
-                'tool_choice', 'response_format', 'seed'):
-        if key in body:
-            extra_params[key] = body[key]
-
-    if stream:
-        resp = web.StreamResponse(
-            status=200,
-            reason='OK',
-            headers={
-                'Content-Type': 'text/event-stream',
-                'Cache-Control': 'no-cache',
-                'Connection': 'keep-alive',
-                'X-Accel-Buffering': 'no',
-            }
-        )
-        await resp.prepare(body.get('_request'))
-
-        request_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
-        created = _now_ts()
-
-        try:
-            async for chunk_line in _stream_llm_response(
-                service_url, {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'},
-                {
-                    'model': model,
-                    'messages': messages,
-                    'temperature': temperature,
-                    'top_p': top_p,
-                    'stream': True,
-                    'max_tokens': max_tokens,
-                    **extra_params,
-                }
-            ):
-                await resp.write(chunk_line.encode('utf-8') if isinstance(chunk_line, str) else chunk_line)
-                await resp.drain()
-        except Exception as e:
-            error_data = {
-                'error': {
-                    'message': f'Streaming error: {str(e)}',
-                    'type': 'server_error',
-                    'code': 500,
-                }
-            }
-            await resp.write(f"data: {json.dumps(error_data)}\n\n".encode('utf-8'))
-            await resp.drain()
-
-        await resp.write(b"data: [DONE]\n\n")
-        await resp.drain()
-        return resp
-
-    else:
-        result = await _call_llm_api(
-            service_url=service_url,
-            api_key=api_key,
-            model=model,
-            messages=messages,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            stream=False,
-            top_p=top_p,
-            **extra_params,
-        )
-        elapsed = time.time() - start_time
-        if 'error' in result:
-            error(f"LLM chat error: model={model}, elapsed={elapsed:.2f}s, error={result.get('error',{}).get('message','')}")
-        else:
-            usage = result.get('usage', {})
-            info(f"LLM chat done: model={model}, elapsed={elapsed:.2f}s, tokens_in={usage.get('prompt_tokens','?')}, tokens_out={usage.get('completion_tokens','?')}")
-        return result
-
-
-async def harnessed_llm_models() -> Dict[str, Any]:
-    """
-    OpenAI-compatible /v1/models endpoint.
-    Returns list of available models from config.
-    """
-    config = await _async_get_llm_config()
-
-    models_list = []
-
-    # Add default model from config
-    default_model = 'qwen3-max'
-    if config:
-        default_model = config.get('default_model', 'qwen3-max')
-
-    # If available_models is configured as JSON string, parse it
-    if config and config.get('available_models'):
-        try:
-            models_str = config['available_models']
-            if isinstance(models_str, str):
-                model_names = json.loads(models_str)
-            else:
-                model_names = models_str
-            for m in model_names:
-                if isinstance(m, str):
-                    models_list.append({
-                        'id': m,
-                        'object': 'model',
-                        'created': _now_ts(),
-                        'owned_by': 'harnessed_agent',
-                    })
-                elif isinstance(m, dict):
-                    models_list.append(m)
-        except Exception:
-            pass
-
-    # Always include the default model if not already listed
-    existing_ids = {m['id'] for m in models_list}
-    if default_model not in existing_ids:
-        models_list.insert(0, {
-            'id': default_model,
-            'object': 'model',
-            'created': _now_ts(),
-            'owned_by': 'harnessed_agent',
-        })
-
-    return {
-        'object': 'list',
-        'data': models_list,
-    }
-
-
-async def harnessed_llm_completions(body: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    OpenAI-compatible /v1/completions endpoint (legacy, non-chat).
-    Converts prompt to messages format internally.
-    """
-    prompt = body.get('prompt', '')
-    model = body.get('model', 'default')
-    temperature = body.get('temperature', 0.7)
-    max_tokens = body.get('max_tokens', None)
-    stream = body.get('stream', False)
-
-    # Convert to chat format
-    messages = [{'role': 'user', 'content': prompt}]
-
-    # Build request body for chat completions
-    chat_body = {
-        'model': model,
-        'messages': messages,
-        'temperature': temperature,
-        'max_tokens': max_tokens,
-        'stream': stream,
-        '_request': body.get('_request'),
-    }
-
-    return await harnessed_llm_chat_completions(chat_body)
--- a/harnessed_agent/llm_client.py
+++ b/harnessed_agent/llm_client.py
@ -0,0 +1,475 @@
+"""
+LLM Client for Hermes Agent - Calls supplier LLM APIs via OpenAI-compatible interface.
+
+This module provides a client-side implementation for calling external LLM providers
+(OpenAI, DashScope, DeepSeek, etc.) through their OpenAI-compatible /v1/chat/completions API.
+
+Usage:
+    # From .dspy files:
+    result = await llm_chat(messages=[{"role": "user", "content": "Hello"}])
+    
+    # With explicit config:
+    result = await llm_chat(
+        messages=[{"role": "user", "content": "Hello"}],
+        model="qwen3-max",
+        temperature=0.7,
+        stream=False
+    )
+    
+    # Stream mode:
+    async for chunk in llm_chat_stream(messages=[...]):
+        print(chunk.get("delta", ""))
+"""
+import json
+import time
+import asyncio
+from typing import Dict, Any, List, Optional, AsyncGenerator
+from datetime import datetime
+
+try:
+    from aiohttp import ClientSession, ClientTimeout, ClientError
+except ImportError:
+    ClientError = Exception
+
+try:
+    from ahserver.serverenv import ServerEnv
+    from sqlor.dbpools import DBPools
+    from appPublic.log import info, debug, warning, error, exception
+except ImportError:
+    class ServerEnv:
+        pass
+    class DBPools:
+        pass
+    def info(*a, **kw): print(*a)
+    def debug(*a, **kw): pass
+    def warning(*a, **kw): print(*a)
+    def error(*a, **kw): print(*a)
+    def exception(*a, **kw): pass
+
+
+# ============================================================
+# LLM Provider configurations
+# ============================================================
+
+LLM_PROVIDERS = {
+    'openai': {
+        'url': 'https://api.openai.com/v1',
+        'model_default': 'gpt-4o',
+    },
+    'dashscope': {
+        'url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+        'model_default': 'qwen-plus',
+    },
+    'deepseek': {
+        'url': 'https://api.deepseek.com/v1',
+        'model_default': 'deepseek-chat',
+    },
+    'siliconflow': {
+        'url': 'https://api.siliconflow.cn/v1',
+        'model_default': 'Qwen/Qwen2.5-72B-Instruct',
+    },
+}
+
+
+# ============================================================
+# Config retrieval
+# ============================================================
+
+async def _get_llm_config() -> Dict[str, Any]:
+    """Get LLM client configuration from harnessed_agent_config table."""
+    try:
+        env = ServerEnv()
+        dbname = env.get_module_dbname('harnessed_agent')
+    except Exception:
+        dbname = 'default'
+
+    try:
+        async with DBPools().sqlorContext(dbname) as sor:
+            rows = await sor.R('harnessed_agent_config', {},
+                               orderby='updated_at DESC', limit=1)
+            if rows:
+                return rows[0]
+    except Exception as e:
+        error(f"Failed to fetch LLM config: {e}")
+    return {}
+
+
+def _resolve_provider(config: Dict[str, Any]) -> Dict[str, str]:
+    """Resolve base URL and model from config, with provider presets."""
+    provider = (config.get('llm_provider') or '').lower()
+    service_url = config.get('llm_service_url', '')
+    api_key = config.get('llm_api_key', '')
+    model = config.get('default_model', '')
+
+    # If provider name is set, use preset URL
+    if provider and provider in LLM_PROVIDERS:
+        preset = LLM_PROVIDERS[provider]
+        if not service_url:
+            service_url = preset['url']
+        if not model:
+            model = preset['model_default']
+
+    return {
+        'service_url': service_url.rstrip('/'),
+        'api_key': api_key,
+        'model': model,
+        'provider': provider,
+    }
+
+
+# ============================================================
+# Core LLM client
+# ============================================================
+
+async def _post_chat_completions(
+    service_url: str,
+    api_key: str,
+    model: str,
+    messages: List[Dict[str, str]],
+    temperature: float = 0.7,
+    max_tokens: Optional[int] = None,
+    stream: bool = False,
+    top_p: float = 1.0,
+    extra: Optional[Dict] = None,
+) -> Dict[str, Any]:
+    """
+    Call OpenAI-compatible /v1/chat/completions endpoint.
+    
+    Returns dict with OpenAI response format or error dict.
+    """
+    if not service_url:
+        return {'error': {'message': 'llm_service_url not configured', 'type': 'configuration_error'}}
+    if not api_key:
+        return {'error': {'message': 'llm_api_key not configured', 'type': 'configuration_error'}}
+
+    url = f"{service_url}/chat/completions"
+    headers = {
+        'Content-Type': 'application/json',
+        'Authorization': f'Bearer {api_key}',
+    }
+
+    body = {
+        'model': model,
+        'messages': messages,
+        'temperature': temperature,
+        'top_p': top_p,
+        'stream': stream,
+    }
+
+    if max_tokens is not None:
+        body['max_tokens'] = max_tokens
+
+    if extra:
+        for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
+                    'tool_choice', 'response_format', 'seed'):
+            if key in extra and extra[key] is not None:
+                body[key] = extra[key]
+
+    max_retries = 3
+    base_delay = 2
+
+    for attempt in range(max_retries):
+        try:
+            async with ClientSession() as session:
+                async with session.post(url, headers=headers, json=body,
+                                        timeout=ClientTimeout(total=300)) as resp:
+
+                    if resp.status == 429:
+                        retry_after = int(resp.headers.get('Retry-After', base_delay * (2 ** attempt)))
+                        if attempt < max_retries - 1:
+                            warning(f"LLM rate limited (429), retrying in {retry_after}s (attempt {attempt+1}/{max_retries})")
+                            await asyncio.sleep(retry_after)
+                            continue
+                        return {'error': {'message': 'Rate limited by LLM provider', 'type': 'rate_limit_error', 'code': 429}}
+
+                    if resp.status == 500 and attempt < max_retries - 1:
+                        delay = base_delay * (2 ** attempt)
+                        warning(f"LLM server error (500), retrying in {delay}s (attempt {attempt+1}/{max_retries})")
+                        await asyncio.sleep(delay)
+                        continue
+
+                    if resp.status != 200:
+                        err_text = await resp.text()
+                        return {
+                            'error': {
+                                'message': f'LLM API error: HTTP {resp.status}',
+                                'type': 'api_error',
+                                'code': resp.status,
+                                'detail': err_text[:500],
+                            }
+                        }
+
+                    return await resp.json()
+
+        except asyncio.TimeoutError:
+            if attempt < max_retries - 1:
+                delay = base_delay * (2 ** attempt)
+                warning(f"LLM API timeout, retrying in {delay}s (attempt {attempt+1}/{max_retries})")
+                await asyncio.sleep(delay)
+                continue
+            return {'error': {'message': 'LLM API request timed out', 'type': 'timeout_error', 'code': 504}}
+
+        except ClientError as e:
+            if attempt < max_retries - 1:
+                delay = base_delay * (2 ** attempt)
+                warning(f"LLM connection error: {e}, retrying in {delay}s")
+                await asyncio.sleep(delay)
+                continue
+            return {'error': {'message': f'LLM connection failed: {str(e)}', 'type': 'connection_error', 'code': 502}}
+
+    return {'error': {'message': 'LLM API failed after all retries', 'type': 'server_error', 'code': 500}}
+
+
+async def _stream_chat_completions(
+    service_url: str,
+    api_key: str,
+    model: str,
+    messages: List[Dict[str, str]],
+    temperature: float = 0.7,
+    max_tokens: Optional[int] = None,
+    top_p: float = 1.0,
+    extra: Optional[Dict] = None,
+) -> AsyncGenerator[Dict[str, Any], None]:
+    """
+    Stream LLM response via SSE. Yields parsed chunk dicts.
+    
+    Each yielded dict contains:
+        - delta: str (the text chunk)
+        - finish_reason: str or None
+        - raw: dict (the raw SSE chunk data)
+    """
+    if not service_url:
+        yield {'delta': '', 'finish_reason': 'error', 'raw': {'error': 'llm_service_url not configured'}}
+        return
+    if not api_key:
+        yield {'delta': '', 'finish_reason': 'error', 'raw': {'error': 'llm_api_key not configured'}}
+        return
+
+    url = f"{service_url}/chat/completions"
+    headers = {
+        'Content-Type': 'application/json',
+        'Authorization': f'Bearer {api_key}',
+    }
+
+    body = {
+        'model': model,
+        'messages': messages,
+        'temperature': temperature,
+        'top_p': top_p,
+        'stream': True,
+    }
+
+    if max_tokens is not None:
+        body['max_tokens'] = max_tokens
+
+    if extra:
+        for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools',
+                    'tool_choice', 'response_format', 'seed'):
+            if key in extra and extra[key] is not None:
+                body[key] = extra[key]
+
+    async with ClientSession() as session:
+        async with session.post(url, headers=headers, json=body,
+                                timeout=ClientTimeout(total=600)) as resp:
+            if resp.status != 200:
+                err_text = await resp.text()
+                yield {
+                    'delta': '',
+                    'finish_reason': 'error',
+                    'raw': {'error': f'HTTP {resp.status}', 'detail': err_text[:300]},
+                }
+                return
+
+            async for line in resp.content:
+                line = line.decode('utf-8').strip()
+                if not line or not line.startswith('data:'):
+                    continue
+                data_str = line[5:].strip()
+                if data_str == '[DONE]':
+                    break
+                try:
+                    chunk = json.loads(data_str)
+                    choices = chunk.get('choices', [])
+                    if choices:
+                        choice = choices[0]
+                        delta = choice.get('delta', {})
+                        text = delta.get('content', '') or ''
+                        finish_reason = choice.get('finish_reason')
+                        yield {
+                            'delta': text,
+                            'finish_reason': finish_reason,
+                            'raw': chunk,
+                        }
+                except json.JSONDecodeError:
+                    continue
+
+
+# ============================================================
+# Public API functions (registered to ServerEnv)
+# ============================================================
+
+async def llm_chat(
+    messages: List[Dict[str, str]],
+    model: str = None,
+    temperature: float = None,
+    max_tokens: int = None,
+    stream: bool = False,
+    top_p: float = None,
+    **extra,
+) -> Dict[str, Any]:
+    """
+    Call LLM provider and get chat completion.
+    
+    This is the primary function for AI calls within harnessed_agent.
+    Reads provider config from harnessed_agent_config table automatically.
+    
+    Args:
+        messages: List of {role, content} dicts (OpenAI format)
+        model: Override model name (uses config default_model if not set)
+        temperature: Override temperature (uses config default_temperature if not set)
+        max_tokens: Max response tokens
+        stream: If True, uses streaming
+        top_p: Override top_p
+        **extra: Other OpenAI-compatible params (stop, tools, etc.)
+    
+    Returns:
+        Dict matching OpenAI chat completion response, or error dict.
+    """
+    config = await _get_llm_config()
+    provider = _resolve_provider(config)
+
+    resolved_model = model or provider['model']
+    resolved_temp = temperature if temperature is not None else config.get('default_temperature', 0.7)
+    resolved_top_p = top_p if top_p is not None else config.get('top_p', 1.0)
+
+    start_time = time.time()
+    info(f"LLM chat: model={resolved_model}, temp={resolved_temp}, messages={len(messages)}")
+
+    result = await _post_chat_completions(
+        service_url=provider['service_url'],
+        api_key=provider['api_key'],
+        model=resolved_model,
+        messages=messages,
+        temperature=resolved_temp,
+        max_tokens=max_tokens,
+        stream=False,
+        top_p=resolved_top_p,
+        extra=extra if extra else None,
+    )
+
+    elapsed = time.time() - start_time
+    if 'error' in result:
+        error(f"LLM chat error: model={resolved_model}, elapsed={elapsed:.2f}s, error={result['error'].get('message')}")
+    else:
+        usage = result.get('usage', {})
+        info(f"LLM chat done: model={resolved_model}, elapsed={elapsed:.2f}s, prompt_tokens={usage.get('prompt_tokens')}, completion_tokens={usage.get('completion_tokens')}")
+
+    return result
+
+
+async def llm_chat_stream(
+    messages: List[Dict[str, str]],
+    model: str = None,
+    temperature: float = None,
+    max_tokens: int = None,
+    top_p: float = None,
+    **extra,
+) -> AsyncGenerator[Dict[str, Any], None]:
+    """
+    Stream LLM chat response.
+    
+    Yields dicts with {delta: str, finish_reason: str|None, raw: dict}.
+    
+    Example:
+        async for chunk in llm_chat_stream(messages=[{"role": "user", "content": "Hello"}]):
+            print(chunk['delta'], end='', flush=True)
+    """
+    config = await _get_llm_config()
+    provider = _resolve_provider(config)
+
+    resolved_model = model or provider['model']
+    resolved_temp = temperature if temperature is not None else config.get('default_temperature', 0.7)
+    resolved_top_p = top_p if top_p is not None else config.get('top_p', 1.0)
+
+    info(f"LLM chat stream: model={resolved_model}, temp={resolved_temp}, messages={len(messages)}")
+
+    async for chunk in _stream_chat_completions(
+        service_url=provider['service_url'],
+        api_key=provider['api_key'],
+        model=resolved_model,
+        messages=messages,
+        temperature=resolved_temp,
+        max_tokens=max_tokens,
+        top_p=resolved_top_p,
+        extra=extra if extra else None,
+    ):
+        yield chunk
+
+
+async def llm_list_models() -> Dict[str, Any]:
+    """List models available from the configured LLM provider."""
+    config = await _get_llm_config()
+    provider = _resolve_provider(config)
+
+    if not provider['service_url'] or not provider['api_key']:
+        return {'error': 'LLM not configured'}
+
+    # Try to call /v1/models endpoint
+    url = f"{provider['service_url']}/models"
+    headers = {
+        'Content-Type': 'application/json',
+        'Authorization': f'Bearer {provider["api_key"]}',
+    }
+
+    try:
+        async with ClientSession() as session:
+            async with session.get(url, headers=headers,
+                                   timeout=ClientTimeout(total=30)) as resp:
+                if resp.status == 200:
+                    return await resp.json()
+                else:
+                    return {'error': f'HTTP {resp.status}'}
+    except Exception as e:
+        return {'error': str(e)}
+
+
+async def llm_simple(prompt: str, system: str = None, **kwargs) -> str:
+    """
+    Simplified LLM call: returns just the response text.
+    
+    Args:
+        prompt: User message text
+        system: Optional system prompt
+        **kwargs: Passed to llm_chat
+    
+    Returns:
+        Response content string, or error message.
+    """
+    messages = []
+    if system:
+        messages.append({'role': 'system', 'content': system})
+    messages.append({'role': 'user', 'content': prompt})
+
+    result = await llm_chat(messages=messages, **kwargs)
+
+    if 'error' in result:
+        return f"Error: {result['error'].get('message', 'Unknown error')}"
+
+    choices = result.get('choices', [])
+    if choices:
+        return choices[0].get('message', {}).get('content', '')
+    return ''
+
+
+async def llm_get_config() -> Dict[str, Any]:
+    """Get current LLM client configuration (with api_key masked)."""
+    config = await _get_llm_config()
+    provider = _resolve_provider(config)
+
+    return {
+        'provider': provider['provider'],
+        'service_url': provider['service_url'],
+        'api_key': '***' + provider['api_key'][-4:] if provider['api_key'] else '(not set)',
+        'default_model': provider['model'],
+        'default_temperature': config.get('default_temperature', 0.7),
+    }
--- a/init/data.json
+++ b/init/data.json
@ -9,7 +9,7 @@
      "updated_at": "2026-04-15 21:06:00"
    },
    {
-      "id": "default_memory_notes_1", 
+      "id": "default_memory_notes_1",
      "user_id": "user_1",
      "target": "memory",
      "content": "Default memory notes for Hermes Agent module - User 1",
@ -25,7 +25,7 @@
      "updated_at": "2026-04-15 21:06:00"
    },
    {
-      "id": "default_memory_notes_2", 
+      "id": "default_memory_notes_2",
      "user_id": "user_2",
      "target": "memory",
      "content": "Default memory notes for Hermes Agent module - User 2",
@ -70,7 +70,15 @@
      "auto_cleanup_enabled": "1",
      "min_retention_days": 30,
      "created_at": "2026-04-20 10:48:00",
-      "updated_at": "2026-04-20 10:48:00"
+      "updated_at": "2026-04-20 10:48:00",
+      "llm_provider": "dashscope",
+      "llm_service_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+      "llm_api_key": "",
+      "available_models": "[\"qwen3-max\", \"qwen-plus\", \"qwen-turbo\"]",
+      "default_model": "qwen-plus",
+      "default_temperature": 0.7,
+      "top_p": 1.0,
+      "enable_streaming": "1"
    },
    {
      "id": "default_agent_config_user_2",
@ -84,7 +92,15 @@
      "auto_cleanup_enabled": "1",
      "min_retention_days": 30,
      "created_at": "2026-04-20 10:48:00",
-      "updated_at": "2026-04-20 10:48:00"
+      "updated_at": "2026-04-20 10:48:00",
+      "llm_provider": "dashscope",
+      "llm_service_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+      "llm_api_key": "",
+      "available_models": "[\"qwen3-max\", \"qwen-plus\", \"qwen-turbo\"]",
+      "default_model": "qwen-plus",
+      "default_temperature": 0.7,
+      "top_p": 1.0,
+      "enable_streaming": "1"
    }
  ]
 }
--- a/json/harnessed_agent_config_view.json
+++ b/json/harnessed_agent_config_view.json
@ -21,6 +21,16 @@
                        {"value": "1", "text": "Enabled"},
                        {"value": "0", "text": "Disabled"}
                    ]
+                },
+                "llm_provider": {
+                    "uitype": "code",
+                    "data": [
+                        {"value": "dashscope", "text": "阿里云 DashScope"},
+                        {"value": "openai", "text": "OpenAI"},
+                        {"value": "deepseek", "text": "DeepSeek"},
+                        {"value": "siliconflow", "text": "SiliconFlow"},
+                        {"value": "", "text": "自定义 (Custom URL)"}
+                    ]
                }
            }
        },
--- a/models/harnessed_agent_config.json
+++ b/models/harnessed_agent_config.json
@ -97,6 +97,15 @@
            "nullable": "no",
            "default": "0.7"
        },
+        {
+            "name": "top_p",
+            "title": "Default top_p for LLM calls",
+            "type": "float",
+            "length": 5,
+            "dec": 2,
+            "nullable": "no",
+            "default": "1.00"
+        },
        {
            "name": "enable_streaming",
            "title": "Enable streaming response for LLM calls",
@ -105,6 +114,15 @@
            "nullable": "no",
            "default": "1"
        },
+        {
+            "name": "llm_provider",
+            "title": "LLM provider preset name",
+            "type": "str",
+            "length": 32,
+            "nullable": "yes",
+            "default": "dashscope",
+            "comments": "Provider preset: openai, dashscope, deepseek, siliconflow, or custom (empty)"
+        },
        {
            "name": "llm_service_url",
            "title": "LLM service base URL (OpenAI-compatible endpoint)",
--- a/pyproject.toml
+++ b/pyproject.toml
@ -8,11 +8,8 @@ version = "1.0.0"
 description = "Hermes Agent module - multi-user AI agent with memory, skills, workflows, and remote skill deployment"
 requires-python = ">=3.10"
 dependencies = [
-    "ahserver",
    "sqlor",
-    "apppublic",
-    "appbase",
-    "rbac",
+    "bricks_for_python",
 ]

 [project.optional-dependencies]
--- a/wwwroot/v1/chat/completions.dspy
+++ b/wwwroot/v1/chat/completions.dspy
@ -1,46 +0,0 @@
-"""
-OpenAI-compatible /v1/chat/completions endpoint
-Accepts POST with JSON body matching OpenAI API format.
-"""
-import json
-
-async def main():
-    # Read request body
-    body = {}
-    try:
-        raw_body = await request.read()
-        if raw_body:
-            body = json.loads(raw_body)
-    except Exception as e:
-        result = {
-            'error': {
-                'message': f'Invalid JSON body: {str(e)}',
-                'type': 'invalid_request_error',
-                'code': 400,
-            }
-        }
-        return json.dumps(result, ensure_ascii=False)
-
-    # Pass the request object for streaming support
-    body['_request'] = request
-
-    # Call the LLM API handler
-    result = await harnessed_llm_chat_completions(body)
-
-    # Handle streaming response (StreamResponse)
-    from aiohttp.web_response import StreamResponse
-    if isinstance(result, StreamResponse):
-        return result
-
-    # Handle error response
-    if 'error' in result:
-        status_code = result.get('error', {}).get('code', 500)
-        resp = web.Response(
-            status=status_code,
-            body=json.dumps(result, ensure_ascii=False),
-            content_type='application/json'
-        )
-        return resp
-
-    # Return successful response
-    return json.dumps(result, ensure_ascii=False)
--- a/wwwroot/v1/completions.dspy
+++ b/wwwroot/v1/completions.dspy
@ -1,41 +0,0 @@
-"""
-OpenAI-compatible /v1/completions endpoint (legacy)
-Accepts POST with JSON body matching OpenAI completions API format.
-"""
-import json
-
-async def main():
-    # Read request body
-    body = {}
-    try:
-        raw_body = await request.read()
-        if raw_body:
-            body = json.loads(raw_body)
-    except Exception as e:
-        result = {
-            'error': {
-                'message': f'Invalid JSON body: {str(e)}',
-                'type': 'invalid_request_error',
-                'code': 400,
-            }
-        }
-        return json.dumps(result, ensure_ascii=False)
-
-    body['_request'] = request
-
-    result = await harnessed_llm_completions(body)
-
-    from aiohttp.web_response import StreamResponse
-    if isinstance(result, StreamResponse):
-        return result
-
-    if 'error' in result:
-        status_code = result.get('error', {}).get('code', 500)
-        resp = web.Response(
-            status=status_code,
-            body=json.dumps(result, ensure_ascii=False),
-            content_type='application/json'
-        )
-        return resp
-
-    return json.dumps(result, ensure_ascii=False)
--- a/wwwroot/v1/models.dspy
+++ b/wwwroot/v1/models.dspy
@ -1,9 +0,0 @@
-"""
-OpenAI-compatible /v1/models endpoint
-Returns list of available models.
-"""
-import json
-
-async def main():
-    result = await harnessed_llm_models()
-    return json.dumps(result, ensure_ascii=False)