diff --git a/harnessed_agent/init.py b/harnessed_agent/init.py index bfe0a5d..c4f8441 100644 --- a/harnessed_agent/init.py +++ b/harnessed_agent/init.py @@ -20,10 +20,12 @@ from .config_functions import ( harnessed_get_agent_config, harnessed_save_agent_config ) -from .llm_api import ( - harnessed_llm_chat_completions, - harnessed_llm_models, - harnessed_llm_completions, +from .llm_client import ( + llm_chat, + llm_chat_stream, + llm_list_models, + llm_simple, + llm_get_config, ) def load_harnessed_agent(): @@ -49,7 +51,9 @@ def load_harnessed_agent(): env.harnessed_get_agent_config = harnessed_get_agent_config env.harnessed_save_agent_config = harnessed_save_agent_config - # OpenAI-compatible LLM API - env.harnessed_llm_chat_completions = harnessed_llm_chat_completions - env.harnessed_llm_models = harnessed_llm_models - env.harnessed_llm_completions = harnessed_llm_completions \ No newline at end of file + # LLM client -- calls supplier LLM APIs (OpenAI-compatible) + env.llm_chat = llm_chat + env.llm_chat_stream = llm_chat_stream + env.llm_list_models = llm_list_models + env.llm_simple = llm_simple + env.llm_get_config = llm_get_config \ No newline at end of file diff --git a/harnessed_agent/llm_api.py b/harnessed_agent/llm_api.py deleted file mode 100644 index b2a0576..0000000 --- a/harnessed_agent/llm_api.py +++ /dev/null @@ -1,463 +0,0 @@ -""" -OpenAI-compatible LLM API for Hermes Agent -Provides /v1/chat/completions and /v1/models endpoints compatible with OpenAI API spec. -""" -import json -import uuid -import time -import asyncio -from typing import Dict, Any, List, Optional, AsyncGenerator -from datetime import datetime - -try: - from aiohttp import ClientSession, ClientTimeout - from aiohttp.client_exceptions import ClientError - HAS_AIOHTTP = True -except ImportError: - HAS_AIOHTTP = False - -try: - from ahserver.serverenv import ServerEnv - from sqlor.dbpools import DBPools - from appPublic.worker import awaitify - from appPublic.log import info, debug, warning, error, exception -except ImportError: - class ServerEnv: - pass - class DBPools: - pass - def awaitify(f): - return f - def info(*a, **kw): print(*a) - def debug(*a, **kw): pass - def warning(*a, **kw): print(*a) - def error(*a, **kw): print(*a) - def exception(*a, **kw): pass - - -def _now_iso(): - return datetime.utcnow().isoformat() + 'Z' - - -def _now_ts(): - return int(time.time()) - - -def _get_default_llm_config(user_id: str = None) -> Dict[str, Any]: - """Get LLM service config from harnessed_agent_config table.""" - try: - dbname = ServerEnv().get_module_dbname('harnessed_agent') - except Exception: - dbname = 'default' - - try: - import asyncio - async def _fetch(): - async with DBPools().sqlorContext(dbname) as sor: - filters = {} - if user_id: - filters['user_id'] = user_id - rows = await sor.R('harnessed_agent_config', filters, - orderby='updated_at DESC', limit=1) - if rows: - return rows[0] - return None - - try: - loop = asyncio.get_event_loop() - if loop.is_running(): - # Already in async context, need to use create_task pattern - # but for simplicity in .dspy context we return None and let caller handle - return None - else: - config = loop.run_until_complete(_fetch()) - return config - except RuntimeError: - return None - except Exception: - return None - - -async def _async_get_llm_config(user_id: str = None) -> Dict[str, Any]: - """Async version to get LLM service config.""" - try: - env = ServerEnv() - dbname = env.get_module_dbname('harnessed_agent') - except Exception: - dbname = 'default' - - try: - async with DBPools().sqlorContext(dbname) as sor: - filters = {} - if user_id: - filters['user_id'] = user_id - rows = await sor.R('harnessed_agent_config', filters, - orderby='updated_at DESC', limit=1) - if rows: - return rows[0] - except Exception as e: - print(f"Error fetching LLM config: {e}") - return None - - -async def _call_llm_api( - service_url: str, - api_key: str, - model: str, - messages: List[Dict[str, str]], - temperature: float = 0.7, - max_tokens: Optional[int] = None, - stream: bool = False, - top_p: float = 1.0, - **kwargs -) -> Any: - """Call an OpenAI-compatible LLM API endpoint.""" - url = service_url.rstrip('/') + '/chat/completions' - - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {api_key}', - } - - body = { - 'model': model, - 'messages': messages, - 'temperature': temperature, - 'top_p': top_p, - 'stream': stream, - } - - if max_tokens is not None: - body['max_tokens'] = max_tokens - - # Pass through any additional OpenAI-compatible parameters - for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools', 'tool_choice', - 'response_format', 'seed'): - if key in kwargs and kwargs[key] is not None: - body[key] = kwargs[key] - - if stream: - return await _stream_llm_response(url, headers, body) - else: - return await _sync_llm_response(url, headers, body) - - -async def _sync_llm_response(url: str, headers: Dict, body: Dict) -> Dict[str, Any]: - """Make a non-streaming LLM API call with retry logic.""" - max_retries = 3 - base_delay = 2 # seconds - - for attempt in range(max_retries): - try: - async with ClientSession() as session: - async with session.post(url, headers=headers, json=body, - timeout=ClientTimeout(total=300)) as resp: - if resp.status == 429: - # Rate limited - respect Retry-After header - retry_after = int(resp.headers.get('Retry-After', base_delay * (2 ** attempt))) - if attempt < max_retries - 1: - warning(f"LLM API rate limited (429), retrying after {retry_after}s (attempt {attempt+1}/{max_retries})") - await asyncio.sleep(retry_after) - continue - else: - return { - 'error': { - 'message': 'Rate limited by LLM provider. Please retry later.', - 'type': 'rate_limit_error', - 'code': 429, - } - } - - if resp.status == 500 and attempt < max_retries - 1: - # Transient server error - retry - delay = base_delay * (2 ** attempt) - warning(f"LLM API server error (500), retrying after {delay}s (attempt {attempt+1}/{max_retries})") - await asyncio.sleep(delay) - continue - - if resp.status != 200: - error_text = await resp.text() - return { - 'error': { - 'message': f'LLM API error: HTTP {resp.status}', - 'type': 'api_error', - 'code': resp.status, - 'detail': error_text[:500], - } - } - - data = await resp.json() - return data - - except asyncio.TimeoutError: - if attempt < max_retries - 1: - delay = base_delay * (2 ** attempt) - warning(f"LLM API timeout, retrying after {delay}s (attempt {attempt+1}/{max_retries})") - await asyncio.sleep(delay) - continue - return { - 'error': { - 'message': 'LLM API request timed out after 300s', - 'type': 'timeout_error', - 'code': 504, - } - } - except ClientError as e: - if attempt < max_retries - 1: - delay = base_delay * (2 ** attempt) - warning(f"LLM API connection error: {e}, retrying after {delay}s") - await asyncio.sleep(delay) - continue - return { - 'error': { - 'message': f'LLM API connection failed: {str(e)}', - 'type': 'connection_error', - 'code': 502, - } - } - - return { - 'error': { - 'message': 'LLM API request failed after all retries', - 'type': 'server_error', - 'code': 500, - } - } - - -async def _stream_llm_response(url: str, headers: Dict, body: Dict) -> AsyncGenerator[str, None]: - """Make a streaming LLM API call, yielding SSE chunks.""" - async with ClientSession() as session: - async with session.post(url, headers=headers, json=body, - timeout=ClientTimeout(total=600)) as resp: - if resp.status != 200: - error_text = await resp.text() - error_data = { - 'error': { - 'message': f'LLM API error: HTTP {resp.status}', - 'type': 'api_error', - 'code': resp.status, - 'detail': error_text[:500], - } - } - yield f"data: {json.dumps(error_data, ensure_ascii=False)}\n\n" - yield "data: [DONE]\n\n" - return - - async for line in resp.content: - line = line.decode('utf-8').strip() - if not line: - continue - if line.startswith('data: '): - yield line + '\n\n' - - -# ============================================================ -# Public API functions (registered to ServerEnv via init.py) -# ============================================================ - -async def harnessed_llm_chat_completions(body: Dict[str, Any]) -> Any: - """ - OpenAI-compatible /v1/chat/completions endpoint. - - Args: - body: dict with keys: model, messages, temperature, max_tokens, - stream, top_p, and other OpenAI-compatible params. - - Returns: - If stream=False: dict matching OpenAI chat completion response. - If stream=True: aiohttp.StreamResponse with SSE. - """ - from aiohttp import web - - model = body.get('model', 'default') - messages = body.get('messages', []) - temperature = body.get('temperature', 0.7) - max_tokens = body.get('max_tokens', None) - stream = body.get('stream', False) - top_p = body.get('top_p', 1.0) - - start_time = time.time() - - # Get LLM service config - config = await _async_get_llm_config() - if not config: - error("LLM service not configured: no harnessed_agent_config found") - return { - 'error': { - 'message': 'LLM service not configured. Please configure llm_service_url and llm_api_key in agent settings.', - 'type': 'configuration_error', - 'code': 503, - } - } - - service_url = config.get('llm_service_url') or config.get('api_endpoint') - api_key = config.get('llm_api_key') or config.get('api_key') - - if not service_url or not api_key: - error(f"LLM service misconfigured: service_url={'set' if service_url else 'MISSING'}, api_key={'set' if api_key else 'MISSING'}") - return { - 'error': { - 'message': 'LLM service URL or API key not configured. Set llm_service_url and llm_api_key in harnessed_agent_config.', - 'type': 'configuration_error', - 'code': 503, - } - } - - # Use default model from config if request model is 'default' or empty - default_model = config.get('default_model', 'qwen3-max') - if not model or model == 'default': - model = default_model - - info(f"LLM chat request: model={model}, stream={stream}, messages={len(messages)}, temperature={temperature}") - - # Pass through extra params - extra_params = {} - for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools', - 'tool_choice', 'response_format', 'seed'): - if key in body: - extra_params[key] = body[key] - - if stream: - resp = web.StreamResponse( - status=200, - reason='OK', - headers={ - 'Content-Type': 'text/event-stream', - 'Cache-Control': 'no-cache', - 'Connection': 'keep-alive', - 'X-Accel-Buffering': 'no', - } - ) - await resp.prepare(body.get('_request')) - - request_id = f"chatcmpl-{uuid.uuid4().hex[:12]}" - created = _now_ts() - - try: - async for chunk_line in _stream_llm_response( - service_url, {'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'}, - { - 'model': model, - 'messages': messages, - 'temperature': temperature, - 'top_p': top_p, - 'stream': True, - 'max_tokens': max_tokens, - **extra_params, - } - ): - await resp.write(chunk_line.encode('utf-8') if isinstance(chunk_line, str) else chunk_line) - await resp.drain() - except Exception as e: - error_data = { - 'error': { - 'message': f'Streaming error: {str(e)}', - 'type': 'server_error', - 'code': 500, - } - } - await resp.write(f"data: {json.dumps(error_data)}\n\n".encode('utf-8')) - await resp.drain() - - await resp.write(b"data: [DONE]\n\n") - await resp.drain() - return resp - - else: - result = await _call_llm_api( - service_url=service_url, - api_key=api_key, - model=model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - stream=False, - top_p=top_p, - **extra_params, - ) - elapsed = time.time() - start_time - if 'error' in result: - error(f"LLM chat error: model={model}, elapsed={elapsed:.2f}s, error={result.get('error',{}).get('message','')}") - else: - usage = result.get('usage', {}) - info(f"LLM chat done: model={model}, elapsed={elapsed:.2f}s, tokens_in={usage.get('prompt_tokens','?')}, tokens_out={usage.get('completion_tokens','?')}") - return result - - -async def harnessed_llm_models() -> Dict[str, Any]: - """ - OpenAI-compatible /v1/models endpoint. - Returns list of available models from config. - """ - config = await _async_get_llm_config() - - models_list = [] - - # Add default model from config - default_model = 'qwen3-max' - if config: - default_model = config.get('default_model', 'qwen3-max') - - # If available_models is configured as JSON string, parse it - if config and config.get('available_models'): - try: - models_str = config['available_models'] - if isinstance(models_str, str): - model_names = json.loads(models_str) - else: - model_names = models_str - for m in model_names: - if isinstance(m, str): - models_list.append({ - 'id': m, - 'object': 'model', - 'created': _now_ts(), - 'owned_by': 'harnessed_agent', - }) - elif isinstance(m, dict): - models_list.append(m) - except Exception: - pass - - # Always include the default model if not already listed - existing_ids = {m['id'] for m in models_list} - if default_model not in existing_ids: - models_list.insert(0, { - 'id': default_model, - 'object': 'model', - 'created': _now_ts(), - 'owned_by': 'harnessed_agent', - }) - - return { - 'object': 'list', - 'data': models_list, - } - - -async def harnessed_llm_completions(body: Dict[str, Any]) -> Dict[str, Any]: - """ - OpenAI-compatible /v1/completions endpoint (legacy, non-chat). - Converts prompt to messages format internally. - """ - prompt = body.get('prompt', '') - model = body.get('model', 'default') - temperature = body.get('temperature', 0.7) - max_tokens = body.get('max_tokens', None) - stream = body.get('stream', False) - - # Convert to chat format - messages = [{'role': 'user', 'content': prompt}] - - # Build request body for chat completions - chat_body = { - 'model': model, - 'messages': messages, - 'temperature': temperature, - 'max_tokens': max_tokens, - 'stream': stream, - '_request': body.get('_request'), - } - - return await harnessed_llm_chat_completions(chat_body) diff --git a/harnessed_agent/llm_client.py b/harnessed_agent/llm_client.py new file mode 100644 index 0000000..6692e95 --- /dev/null +++ b/harnessed_agent/llm_client.py @@ -0,0 +1,475 @@ +""" +LLM Client for Hermes Agent - Calls supplier LLM APIs via OpenAI-compatible interface. + +This module provides a client-side implementation for calling external LLM providers +(OpenAI, DashScope, DeepSeek, etc.) through their OpenAI-compatible /v1/chat/completions API. + +Usage: + # From .dspy files: + result = await llm_chat(messages=[{"role": "user", "content": "Hello"}]) + + # With explicit config: + result = await llm_chat( + messages=[{"role": "user", "content": "Hello"}], + model="qwen3-max", + temperature=0.7, + stream=False + ) + + # Stream mode: + async for chunk in llm_chat_stream(messages=[...]): + print(chunk.get("delta", "")) +""" +import json +import time +import asyncio +from typing import Dict, Any, List, Optional, AsyncGenerator +from datetime import datetime + +try: + from aiohttp import ClientSession, ClientTimeout, ClientError +except ImportError: + ClientError = Exception + +try: + from ahserver.serverenv import ServerEnv + from sqlor.dbpools import DBPools + from appPublic.log import info, debug, warning, error, exception +except ImportError: + class ServerEnv: + pass + class DBPools: + pass + def info(*a, **kw): print(*a) + def debug(*a, **kw): pass + def warning(*a, **kw): print(*a) + def error(*a, **kw): print(*a) + def exception(*a, **kw): pass + + +# ============================================================ +# LLM Provider configurations +# ============================================================ + +LLM_PROVIDERS = { + 'openai': { + 'url': 'https://api.openai.com/v1', + 'model_default': 'gpt-4o', + }, + 'dashscope': { + 'url': 'https://dashscope.aliyuncs.com/compatible-mode/v1', + 'model_default': 'qwen-plus', + }, + 'deepseek': { + 'url': 'https://api.deepseek.com/v1', + 'model_default': 'deepseek-chat', + }, + 'siliconflow': { + 'url': 'https://api.siliconflow.cn/v1', + 'model_default': 'Qwen/Qwen2.5-72B-Instruct', + }, +} + + +# ============================================================ +# Config retrieval +# ============================================================ + +async def _get_llm_config() -> Dict[str, Any]: + """Get LLM client configuration from harnessed_agent_config table.""" + try: + env = ServerEnv() + dbname = env.get_module_dbname('harnessed_agent') + except Exception: + dbname = 'default' + + try: + async with DBPools().sqlorContext(dbname) as sor: + rows = await sor.R('harnessed_agent_config', {}, + orderby='updated_at DESC', limit=1) + if rows: + return rows[0] + except Exception as e: + error(f"Failed to fetch LLM config: {e}") + return {} + + +def _resolve_provider(config: Dict[str, Any]) -> Dict[str, str]: + """Resolve base URL and model from config, with provider presets.""" + provider = (config.get('llm_provider') or '').lower() + service_url = config.get('llm_service_url', '') + api_key = config.get('llm_api_key', '') + model = config.get('default_model', '') + + # If provider name is set, use preset URL + if provider and provider in LLM_PROVIDERS: + preset = LLM_PROVIDERS[provider] + if not service_url: + service_url = preset['url'] + if not model: + model = preset['model_default'] + + return { + 'service_url': service_url.rstrip('/'), + 'api_key': api_key, + 'model': model, + 'provider': provider, + } + + +# ============================================================ +# Core LLM client +# ============================================================ + +async def _post_chat_completions( + service_url: str, + api_key: str, + model: str, + messages: List[Dict[str, str]], + temperature: float = 0.7, + max_tokens: Optional[int] = None, + stream: bool = False, + top_p: float = 1.0, + extra: Optional[Dict] = None, +) -> Dict[str, Any]: + """ + Call OpenAI-compatible /v1/chat/completions endpoint. + + Returns dict with OpenAI response format or error dict. + """ + if not service_url: + return {'error': {'message': 'llm_service_url not configured', 'type': 'configuration_error'}} + if not api_key: + return {'error': {'message': 'llm_api_key not configured', 'type': 'configuration_error'}} + + url = f"{service_url}/chat/completions" + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {api_key}', + } + + body = { + 'model': model, + 'messages': messages, + 'temperature': temperature, + 'top_p': top_p, + 'stream': stream, + } + + if max_tokens is not None: + body['max_tokens'] = max_tokens + + if extra: + for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools', + 'tool_choice', 'response_format', 'seed'): + if key in extra and extra[key] is not None: + body[key] = extra[key] + + max_retries = 3 + base_delay = 2 + + for attempt in range(max_retries): + try: + async with ClientSession() as session: + async with session.post(url, headers=headers, json=body, + timeout=ClientTimeout(total=300)) as resp: + + if resp.status == 429: + retry_after = int(resp.headers.get('Retry-After', base_delay * (2 ** attempt))) + if attempt < max_retries - 1: + warning(f"LLM rate limited (429), retrying in {retry_after}s (attempt {attempt+1}/{max_retries})") + await asyncio.sleep(retry_after) + continue + return {'error': {'message': 'Rate limited by LLM provider', 'type': 'rate_limit_error', 'code': 429}} + + if resp.status == 500 and attempt < max_retries - 1: + delay = base_delay * (2 ** attempt) + warning(f"LLM server error (500), retrying in {delay}s (attempt {attempt+1}/{max_retries})") + await asyncio.sleep(delay) + continue + + if resp.status != 200: + err_text = await resp.text() + return { + 'error': { + 'message': f'LLM API error: HTTP {resp.status}', + 'type': 'api_error', + 'code': resp.status, + 'detail': err_text[:500], + } + } + + return await resp.json() + + except asyncio.TimeoutError: + if attempt < max_retries - 1: + delay = base_delay * (2 ** attempt) + warning(f"LLM API timeout, retrying in {delay}s (attempt {attempt+1}/{max_retries})") + await asyncio.sleep(delay) + continue + return {'error': {'message': 'LLM API request timed out', 'type': 'timeout_error', 'code': 504}} + + except ClientError as e: + if attempt < max_retries - 1: + delay = base_delay * (2 ** attempt) + warning(f"LLM connection error: {e}, retrying in {delay}s") + await asyncio.sleep(delay) + continue + return {'error': {'message': f'LLM connection failed: {str(e)}', 'type': 'connection_error', 'code': 502}} + + return {'error': {'message': 'LLM API failed after all retries', 'type': 'server_error', 'code': 500}} + + +async def _stream_chat_completions( + service_url: str, + api_key: str, + model: str, + messages: List[Dict[str, str]], + temperature: float = 0.7, + max_tokens: Optional[int] = None, + top_p: float = 1.0, + extra: Optional[Dict] = None, +) -> AsyncGenerator[Dict[str, Any], None]: + """ + Stream LLM response via SSE. Yields parsed chunk dicts. + + Each yielded dict contains: + - delta: str (the text chunk) + - finish_reason: str or None + - raw: dict (the raw SSE chunk data) + """ + if not service_url: + yield {'delta': '', 'finish_reason': 'error', 'raw': {'error': 'llm_service_url not configured'}} + return + if not api_key: + yield {'delta': '', 'finish_reason': 'error', 'raw': {'error': 'llm_api_key not configured'}} + return + + url = f"{service_url}/chat/completions" + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {api_key}', + } + + body = { + 'model': model, + 'messages': messages, + 'temperature': temperature, + 'top_p': top_p, + 'stream': True, + } + + if max_tokens is not None: + body['max_tokens'] = max_tokens + + if extra: + for key in ('stop', 'presence_penalty', 'frequency_penalty', 'tools', + 'tool_choice', 'response_format', 'seed'): + if key in extra and extra[key] is not None: + body[key] = extra[key] + + async with ClientSession() as session: + async with session.post(url, headers=headers, json=body, + timeout=ClientTimeout(total=600)) as resp: + if resp.status != 200: + err_text = await resp.text() + yield { + 'delta': '', + 'finish_reason': 'error', + 'raw': {'error': f'HTTP {resp.status}', 'detail': err_text[:300]}, + } + return + + async for line in resp.content: + line = line.decode('utf-8').strip() + if not line or not line.startswith('data:'): + continue + data_str = line[5:].strip() + if data_str == '[DONE]': + break + try: + chunk = json.loads(data_str) + choices = chunk.get('choices', []) + if choices: + choice = choices[0] + delta = choice.get('delta', {}) + text = delta.get('content', '') or '' + finish_reason = choice.get('finish_reason') + yield { + 'delta': text, + 'finish_reason': finish_reason, + 'raw': chunk, + } + except json.JSONDecodeError: + continue + + +# ============================================================ +# Public API functions (registered to ServerEnv) +# ============================================================ + +async def llm_chat( + messages: List[Dict[str, str]], + model: str = None, + temperature: float = None, + max_tokens: int = None, + stream: bool = False, + top_p: float = None, + **extra, +) -> Dict[str, Any]: + """ + Call LLM provider and get chat completion. + + This is the primary function for AI calls within harnessed_agent. + Reads provider config from harnessed_agent_config table automatically. + + Args: + messages: List of {role, content} dicts (OpenAI format) + model: Override model name (uses config default_model if not set) + temperature: Override temperature (uses config default_temperature if not set) + max_tokens: Max response tokens + stream: If True, uses streaming + top_p: Override top_p + **extra: Other OpenAI-compatible params (stop, tools, etc.) + + Returns: + Dict matching OpenAI chat completion response, or error dict. + """ + config = await _get_llm_config() + provider = _resolve_provider(config) + + resolved_model = model or provider['model'] + resolved_temp = temperature if temperature is not None else config.get('default_temperature', 0.7) + resolved_top_p = top_p if top_p is not None else config.get('top_p', 1.0) + + start_time = time.time() + info(f"LLM chat: model={resolved_model}, temp={resolved_temp}, messages={len(messages)}") + + result = await _post_chat_completions( + service_url=provider['service_url'], + api_key=provider['api_key'], + model=resolved_model, + messages=messages, + temperature=resolved_temp, + max_tokens=max_tokens, + stream=False, + top_p=resolved_top_p, + extra=extra if extra else None, + ) + + elapsed = time.time() - start_time + if 'error' in result: + error(f"LLM chat error: model={resolved_model}, elapsed={elapsed:.2f}s, error={result['error'].get('message')}") + else: + usage = result.get('usage', {}) + info(f"LLM chat done: model={resolved_model}, elapsed={elapsed:.2f}s, prompt_tokens={usage.get('prompt_tokens')}, completion_tokens={usage.get('completion_tokens')}") + + return result + + +async def llm_chat_stream( + messages: List[Dict[str, str]], + model: str = None, + temperature: float = None, + max_tokens: int = None, + top_p: float = None, + **extra, +) -> AsyncGenerator[Dict[str, Any], None]: + """ + Stream LLM chat response. + + Yields dicts with {delta: str, finish_reason: str|None, raw: dict}. + + Example: + async for chunk in llm_chat_stream(messages=[{"role": "user", "content": "Hello"}]): + print(chunk['delta'], end='', flush=True) + """ + config = await _get_llm_config() + provider = _resolve_provider(config) + + resolved_model = model or provider['model'] + resolved_temp = temperature if temperature is not None else config.get('default_temperature', 0.7) + resolved_top_p = top_p if top_p is not None else config.get('top_p', 1.0) + + info(f"LLM chat stream: model={resolved_model}, temp={resolved_temp}, messages={len(messages)}") + + async for chunk in _stream_chat_completions( + service_url=provider['service_url'], + api_key=provider['api_key'], + model=resolved_model, + messages=messages, + temperature=resolved_temp, + max_tokens=max_tokens, + top_p=resolved_top_p, + extra=extra if extra else None, + ): + yield chunk + + +async def llm_list_models() -> Dict[str, Any]: + """List models available from the configured LLM provider.""" + config = await _get_llm_config() + provider = _resolve_provider(config) + + if not provider['service_url'] or not provider['api_key']: + return {'error': 'LLM not configured'} + + # Try to call /v1/models endpoint + url = f"{provider['service_url']}/models" + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {provider["api_key"]}', + } + + try: + async with ClientSession() as session: + async with session.get(url, headers=headers, + timeout=ClientTimeout(total=30)) as resp: + if resp.status == 200: + return await resp.json() + else: + return {'error': f'HTTP {resp.status}'} + except Exception as e: + return {'error': str(e)} + + +async def llm_simple(prompt: str, system: str = None, **kwargs) -> str: + """ + Simplified LLM call: returns just the response text. + + Args: + prompt: User message text + system: Optional system prompt + **kwargs: Passed to llm_chat + + Returns: + Response content string, or error message. + """ + messages = [] + if system: + messages.append({'role': 'system', 'content': system}) + messages.append({'role': 'user', 'content': prompt}) + + result = await llm_chat(messages=messages, **kwargs) + + if 'error' in result: + return f"Error: {result['error'].get('message', 'Unknown error')}" + + choices = result.get('choices', []) + if choices: + return choices[0].get('message', {}).get('content', '') + return '' + + +async def llm_get_config() -> Dict[str, Any]: + """Get current LLM client configuration (with api_key masked).""" + config = await _get_llm_config() + provider = _resolve_provider(config) + + return { + 'provider': provider['provider'], + 'service_url': provider['service_url'], + 'api_key': '***' + provider['api_key'][-4:] if provider['api_key'] else '(not set)', + 'default_model': provider['model'], + 'default_temperature': config.get('default_temperature', 0.7), + } diff --git a/init/data.json b/init/data.json index 25a86f2..b07049e 100644 --- a/init/data.json +++ b/init/data.json @@ -9,7 +9,7 @@ "updated_at": "2026-04-15 21:06:00" }, { - "id": "default_memory_notes_1", + "id": "default_memory_notes_1", "user_id": "user_1", "target": "memory", "content": "Default memory notes for Hermes Agent module - User 1", @@ -25,7 +25,7 @@ "updated_at": "2026-04-15 21:06:00" }, { - "id": "default_memory_notes_2", + "id": "default_memory_notes_2", "user_id": "user_2", "target": "memory", "content": "Default memory notes for Hermes Agent module - User 2", @@ -70,7 +70,15 @@ "auto_cleanup_enabled": "1", "min_retention_days": 30, "created_at": "2026-04-20 10:48:00", - "updated_at": "2026-04-20 10:48:00" + "updated_at": "2026-04-20 10:48:00", + "llm_provider": "dashscope", + "llm_service_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", + "llm_api_key": "", + "available_models": "[\"qwen3-max\", \"qwen-plus\", \"qwen-turbo\"]", + "default_model": "qwen-plus", + "default_temperature": 0.7, + "top_p": 1.0, + "enable_streaming": "1" }, { "id": "default_agent_config_user_2", @@ -84,7 +92,15 @@ "auto_cleanup_enabled": "1", "min_retention_days": 30, "created_at": "2026-04-20 10:48:00", - "updated_at": "2026-04-20 10:48:00" + "updated_at": "2026-04-20 10:48:00", + "llm_provider": "dashscope", + "llm_service_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", + "llm_api_key": "", + "available_models": "[\"qwen3-max\", \"qwen-plus\", \"qwen-turbo\"]", + "default_model": "qwen-plus", + "default_temperature": 0.7, + "top_p": 1.0, + "enable_streaming": "1" } ] } \ No newline at end of file diff --git a/json/harnessed_agent_config_view.json b/json/harnessed_agent_config_view.json index 0ec981f..daf21fc 100644 --- a/json/harnessed_agent_config_view.json +++ b/json/harnessed_agent_config_view.json @@ -21,6 +21,16 @@ {"value": "1", "text": "Enabled"}, {"value": "0", "text": "Disabled"} ] + }, + "llm_provider": { + "uitype": "code", + "data": [ + {"value": "dashscope", "text": "阿里云 DashScope"}, + {"value": "openai", "text": "OpenAI"}, + {"value": "deepseek", "text": "DeepSeek"}, + {"value": "siliconflow", "text": "SiliconFlow"}, + {"value": "", "text": "自定义 (Custom URL)"} + ] } } }, diff --git a/models/harnessed_agent_config.json b/models/harnessed_agent_config.json index 238e0d5..d19a9ab 100644 --- a/models/harnessed_agent_config.json +++ b/models/harnessed_agent_config.json @@ -97,6 +97,15 @@ "nullable": "no", "default": "0.7" }, + { + "name": "top_p", + "title": "Default top_p for LLM calls", + "type": "float", + "length": 5, + "dec": 2, + "nullable": "no", + "default": "1.00" + }, { "name": "enable_streaming", "title": "Enable streaming response for LLM calls", @@ -105,6 +114,15 @@ "nullable": "no", "default": "1" }, + { + "name": "llm_provider", + "title": "LLM provider preset name", + "type": "str", + "length": 32, + "nullable": "yes", + "default": "dashscope", + "comments": "Provider preset: openai, dashscope, deepseek, siliconflow, or custom (empty)" + }, { "name": "llm_service_url", "title": "LLM service base URL (OpenAI-compatible endpoint)", diff --git a/pyproject.toml b/pyproject.toml index 4546283..572c4bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,11 +8,8 @@ version = "1.0.0" description = "Hermes Agent module - multi-user AI agent with memory, skills, workflows, and remote skill deployment" requires-python = ">=3.10" dependencies = [ - "ahserver", "sqlor", - "apppublic", - "appbase", - "rbac", + "bricks_for_python", ] [project.optional-dependencies] diff --git a/wwwroot/v1/chat/completions.dspy b/wwwroot/v1/chat/completions.dspy deleted file mode 100644 index b1360b2..0000000 --- a/wwwroot/v1/chat/completions.dspy +++ /dev/null @@ -1,46 +0,0 @@ -""" -OpenAI-compatible /v1/chat/completions endpoint -Accepts POST with JSON body matching OpenAI API format. -""" -import json - -async def main(): - # Read request body - body = {} - try: - raw_body = await request.read() - if raw_body: - body = json.loads(raw_body) - except Exception as e: - result = { - 'error': { - 'message': f'Invalid JSON body: {str(e)}', - 'type': 'invalid_request_error', - 'code': 400, - } - } - return json.dumps(result, ensure_ascii=False) - - # Pass the request object for streaming support - body['_request'] = request - - # Call the LLM API handler - result = await harnessed_llm_chat_completions(body) - - # Handle streaming response (StreamResponse) - from aiohttp.web_response import StreamResponse - if isinstance(result, StreamResponse): - return result - - # Handle error response - if 'error' in result: - status_code = result.get('error', {}).get('code', 500) - resp = web.Response( - status=status_code, - body=json.dumps(result, ensure_ascii=False), - content_type='application/json' - ) - return resp - - # Return successful response - return json.dumps(result, ensure_ascii=False) diff --git a/wwwroot/v1/completions.dspy b/wwwroot/v1/completions.dspy deleted file mode 100644 index c5e5fc4..0000000 --- a/wwwroot/v1/completions.dspy +++ /dev/null @@ -1,41 +0,0 @@ -""" -OpenAI-compatible /v1/completions endpoint (legacy) -Accepts POST with JSON body matching OpenAI completions API format. -""" -import json - -async def main(): - # Read request body - body = {} - try: - raw_body = await request.read() - if raw_body: - body = json.loads(raw_body) - except Exception as e: - result = { - 'error': { - 'message': f'Invalid JSON body: {str(e)}', - 'type': 'invalid_request_error', - 'code': 400, - } - } - return json.dumps(result, ensure_ascii=False) - - body['_request'] = request - - result = await harnessed_llm_completions(body) - - from aiohttp.web_response import StreamResponse - if isinstance(result, StreamResponse): - return result - - if 'error' in result: - status_code = result.get('error', {}).get('code', 500) - resp = web.Response( - status=status_code, - body=json.dumps(result, ensure_ascii=False), - content_type='application/json' - ) - return resp - - return json.dumps(result, ensure_ascii=False) diff --git a/wwwroot/v1/models.dspy b/wwwroot/v1/models.dspy deleted file mode 100644 index b7c178a..0000000 --- a/wwwroot/v1/models.dspy +++ /dev/null @@ -1,9 +0,0 @@ -""" -OpenAI-compatible /v1/models endpoint -Returns list of available models. -""" -import json - -async def main(): - result = await harnessed_llm_models() - return json.dumps(result, ensure_ascii=False)