From df8aafe1d8c7fdb8d0197b3f5268b0d22755d386 Mon Sep 17 00:00:00 2001
From: yumoqing
Date: Thu, 4 Jun 2026 13:58:26 +0800
Subject: [PATCH] feat: add TTS and ASR audio API endpoints

- POST /v1/audio/speech (TTS): MiniMax Speech 2.6 Turbo/HD, 2.5 HD, F5-TTS local
- POST /v1/audio/transcriptions (ASR): qwen3-asr-flash, Nvidia parakeet
- Add comprehensive docs for both endpoints in API.md
- Update load_path.py RBAC (logined + customer roles)
---
 docs/API.md                                | 129 +++++++++++++++++++++
 scripts/load_path.py                       |   4 +
 wwwroot/v1/audio/speech/index.dspy         |  74 ++++++++++++
 wwwroot/v1/audio/transcriptions/index.dspy |  71 ++++++++++++
 4 files changed, 278 insertions(+)
 create mode 100644 wwwroot/v1/audio/speech/index.dspy
 create mode 100644 wwwroot/v1/audio/transcriptions/index.dspy

diff --git a/docs/API.md b/docs/API.md
index 9e0b810..d2f3f17 100644
--- a/docs/API.md
+++ b/docs/API.md
@@ -511,6 +511,135 @@ Music 2.0 能根据文本描述和歌词直接生成包含人声的完整歌曲
 
 ---
 
+## POST /v1/audio/speech
+
+文本转语音(TTS)接口。
+
+### 必填参数
+
+| 参数 | 类型 | 说明 |
+|------|------|------|
+| `model` | string | 模型名称,如 `"speech-2.6-turbo"`, `"speech-2.6-hd"` |
+| `catelogid` | string | 目录类型ID,固定为 `"tts"` |
+| `prompt` | string | 需要合成的文本内容,最长 10,000 字符 |
+
+### 可选参数
+
+| 参数 | 类型 | 说明 |
+|------|------|------|
+| `speaker` | string | 说话人/音色ID,如 `"female-tianmei"` |
+| `speed` | float | 语速,默认 `1.0` |
+| `emotion` | string | 情感,如 `"happy"`, `"sad"` |
+| `transno` | string | 交易流水号 |
+
+### 请求示例
+
+```json
+{
+  "model": "speech-2.6-turbo",
+  "catelogid": "tts",
+  "prompt": "你好,欢迎使用语音合成服务",
+  "speaker": "female-tianmei",
+  "speed": 1.0,
+  "emotion": "happy"
+}
+```
+
+### 响应格式
+
+MiniMax TTS 为流式接口,逐块返回音频数据(hex编码自动转base64):
+
+```json
+{
+  "status": "SUCCEEDED",
+  "audio": "base64_encoded_audio_data"
+}
+```
+
+### 可用模型
+
+| 模型名称 | model 参数 | 说明 |
+|---------|-----------|------|
+| MiniMax Speech 2.6 Turbo | `speech-2.6-turbo` | 极速版,更快更优惠,适用于语音聊天和数字人 |
+| MiniMax Speech 2.6 HD | `speech-2.6-hd` | 高清版,超低延时,更高自然度 |
+| MiniMax Speech 2.5 HD | `speech-2.5-hd-preview` | Preview版本 |
+| F5-TTS 本地 | `f5tts` | 本地部署,零样本声音克隆,多语言支持 |
+
+### 错误响应
+
+| 状态码 | 说明 |
+|--------|------|
+| 400 | 缺少必填参数或模型不存在 |
+| 403 | 未登录 |
+| 429 | 账户余额不足 |
+
+---
+
+## POST /v1/audio/transcriptions
+
+语音识别(ASR)接口,将音频转为文本。
+
+### 必填参数
+
+| 参数 | 类型 | 说明 |
+|------|------|------|
+| `model` | string | 模型名称,如 `"qwen3-asr-flash"`, `"parakeet-tdt-0.6b-v2"` |
+| `catelogid` | string | 目录类型ID,固定为 `"asr"` |
+| `audio_file` | string | 音频文件URL |
+
+### 可选参数
+
+| 参数 | 类型 | 说明 |
+|------|------|------|
+| `language` | string | 语言代码(部分模型支持) |
+| `transno` | string | 交易流水号 |
+
+### 请求示例
+
+```json
+{
+  "model": "qwen3-asr-flash",
+  "catelogid": "asr",
+  "audio_file": "https://example.com/audio.wav"
+}
+```
+
+### 响应格式
+
+```json
+{
+  "text": "识别出的文本内容",
+  "usage": {
+    "duration_seconds": 5.2
+  }
+}
+```
+
+### 可用模型
+
+| 模型名称 | model 参数 | 说明 |
+|---------|-----------|------|
+| 通义千问 ASR | `qwen3-asr-flash` | 多语种识别、歌唱识别、情感识别、噪声拒识,0.00026元/秒 |
+| Nvidia ASR | `parakeet-tdt-0.6b-v2` | 仅支持英文,6亿参数,支持标点/大小写/时间戳 |
+
+### 通义千问 ASR 核心功能
+
+- 多语种识别:涵盖普通话及多种方言(粤语、四川话等)
+- 复杂环境适应:自动语种检测与智能非人声过滤
+- 歌唱识别:伴随BGM下也能实现整首歌曲转写
+- 上下文增强:通过配置上下文提高识别准确率
+- 情感识别:支持惊讶、平静、愉快、悲伤、厌恶、愤怒、恐惧
+
+### 错误响应
+
+| 状态码 | 说明 |
+|--------|------|
+| 400 | 缺少必填参数或模型不存在 |
+| 403 | 未登录 |
+| 429 | 账户余额不足 |
+
+---
+
 ## GET /v1/tasks
 
 查询异步任务状态。
diff --git a/scripts/load_path.py b/scripts/load_path.py
index 14f8f49..1a6b477 100644
--- a/scripts/load_path.py
+++ b/scripts/load_path.py
@@ -164,6 +164,8 @@ PATHS_LOGINED = [
     f"/{MOD}/v1/tasks/index.dspy",
     f"/{MOD}/v1/video/generations/index.dspy",
     f"/{MOD}/v1/music/generations/index.dspy",
+    f"/{MOD}/v1/audio/speech/index.dspy",
+    f"/{MOD}/v1/audio/transcriptions/index.dspy",
 
     # 其他子目录
     f"/{MOD}/list_llmcatelogs/index.dspy",
@@ -184,6 +186,8 @@ PATHS_V1_CUSTOMER = [
     f"/{MOD}/v1/video/generations/index.dspy",
     f"/{MOD}/v1/image/generations/index.dspy",
     f"/{MOD}/v1/music/generations/index.dspy",
+    f"/{MOD}/v1/audio/speech/index.dspy",
+    f"/{MOD}/v1/audio/transcriptions/index.dspy",
     f"/{MOD}/v1/models/index.dspy",
     f"/{MOD}/v1/tasks/index.dspy",
 ]
diff --git a/wwwroot/v1/audio/speech/index.dspy b/wwwroot/v1/audio/speech/index.dspy
new file mode 100644
index 0000000..6e528c3
--- /dev/null
+++ b/wwwroot/v1/audio/speech/index.dspy
@@ -0,0 +1,74 @@
+# OpenAI-compatible Text-to-Speech API
+# POST /v1/audio/speech
+# Required params: model, catelogid, prompt (text to synthesize)
+# Optional params: speaker (voice_id), speed, emotion, transno
+#
+# Example request:
+# {
+#   "model": "speech-2.6-turbo",
+#   "catelogid": "tts",
+#   "prompt": "你好,欢迎使用语音合成服务",
+#   "speaker": "female-tianmei",
+#   "speed": 1.0,
+#   "emotion": "happy"
+# }
+#
+# Response (streamed in chunks; hex audio is converted to base64):
+# {
+#   "status": "SUCCEEDED",
+#   "audio": "base64_encoded_audio_data"
+# }
+
+userid = await get_user()
+if userid is None:
+    debug('need login')
+    return openai_403()
+userorgid = await get_userorgid()  # looked up only after the login check passes
+
+# Reject with 400 as soon as a required parameter is missing or empty
+if not params_kw.model:
+    d = return_error('Missing required parameter: model')
+    return json_response(d, status=400)
+
+if not params_kw.catelogid:
+    d = return_error('Missing required parameter: catelogid')
+    return json_response(d, status=400)
+
+if not params_kw.prompt:
+    d = return_error('Missing required parameter: prompt (text to synthesize)')
+    return json_response(d, status=400)
+
+lctype = params_kw.catelogid
+
+env = request._run_ns
+async with get_sor_context(env, 'llmage') as sor:
+    # Resolve the published llm via llm_api_map; catelogid matches catalog id or name
+    sql = """select distinct a.* from llm a
+join llm_api_map m on a.id = m.llmid
+join llmcatelog b on m.llmcatelogid = b.id
+where (b.id = ${lctype}$ OR b.name = ${lctype}$)
+  and a.model=${model}$
+  and a.status = 'published'"""
+    recs = await sor.sqlExe(sql, {
+        'lctype': lctype,
+        'model': params_kw.model
+    })
+    if not recs:
+        debug(f'{params_kw.model=} not found for catalog {lctype}')
+        return openai_400()
+    params_kw.llmid = recs[0].id
+
+debug(f'{params_kw.llmid=}')
+
+# Reject with 429 when the balance check for this model and customer fails
+f = await checkCustomerBalance(params_kw.llmid, userid, userorgid)
+if not f:
+    debug(f'{userid=} balance not enough')
+    return openai_429()
+
+# Keep the caller-supplied transno; otherwise generate one with getID()
+if not params_kw.transno:
+    params_kw.transno = getID()
+
+# Call inference (TTS can be stream or sync depending on model)
+return await inference(request, env=env)
diff --git a/wwwroot/v1/audio/transcriptions/index.dspy b/wwwroot/v1/audio/transcriptions/index.dspy
new file mode 100644
index 0000000..787b473
--- /dev/null
+++ b/wwwroot/v1/audio/transcriptions/index.dspy
@@ -0,0 +1,71 @@
+# OpenAI-compatible Audio Transcription API (ASR)
+# POST /v1/audio/transcriptions
+# Required params: model, catelogid, audio_file (audio URL or base64)
+# Optional params: language, transno
+#
+# Example request:
+# {
+#   "model": "qwen3-asr-flash",
+#   "catelogid": "asr",
+#   "audio_file": "https://example.com/audio.wav"
+# }
+#
+# Response:
+# {
+#   "text": "识别出的文本内容",
+#   "usage": { "duration_seconds": 5.2 }
+# }
+
+userid = await get_user()
+if userid is None:
+    debug('need login')
+    return openai_403()
+userorgid = await get_userorgid()  # looked up only after the login check passes
+
+# Reject with 400 as soon as a required parameter is missing or empty
+if not params_kw.model:
+    d = return_error('Missing required parameter: model')
+    return json_response(d, status=400)
+
+if not params_kw.catelogid:
+    d = return_error('Missing required parameter: catelogid')
+    return json_response(d, status=400)
+
+if not params_kw.audio_file:
+    d = return_error('Missing required parameter: audio_file')
+    return json_response(d, status=400)
+
+lctype = params_kw.catelogid
+
+env = request._run_ns
+async with get_sor_context(env, 'llmage') as sor:
+    # Resolve the published llm via llm_api_map; catelogid matches catalog id or name
+    sql = """select distinct a.* from llm a
+join llm_api_map m on a.id = m.llmid
+join llmcatelog b on m.llmcatelogid = b.id
+where (b.id = ${lctype}$ OR b.name = ${lctype}$)
+  and a.model=${model}$
+  and a.status = 'published'"""
+    recs = await sor.sqlExe(sql, {
+        'lctype': lctype,
+        'model': params_kw.model
+    })
+    if not recs:
+        debug(f'{params_kw.model=} not found for catalog {lctype}')
+        return openai_400()
+    params_kw.llmid = recs[0].id
+
+debug(f'{params_kw.llmid=}')
+
+# Reject with 429 when the balance check for this model and customer fails
+f = await checkCustomerBalance(params_kw.llmid, userid, userorgid)
+if not f:
+    debug(f'{userid=} balance not enough')
+    return openai_429()
+
+# Keep the caller-supplied transno; otherwise generate one with getID()
+if not params_kw.transno:
+    params_kw.transno = getID()
+
+# Call inference (ASR is synchronous)
+return await inference(request, env=env)