commit e18aac65955d49a32ff88097992906420ccfe69d Author: Hermes Agent Date: Sun Jun 14 14:46:20 2026 +0800 Initial: faster-whisper ASR HTTP service (ahserver+longtasks+Redis) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..33b4bf7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +__pycache__/ +*.pyc +nohup*.out +*.egg-info +.env +py3/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..c06198a --- /dev/null +++ b/README.md @@ -0,0 +1,182 @@ +# ASR Service + +Speech-to-text service powered by [faster-whisper](https://github.com/SYSTRAN/faster-whisper) (CTranslate2 backend). Uses the `large-v3-turbo` model for fast, high-quality transcription with word-level timestamps. + +## Architecture + +``` +Client --> Redis Queue ("asr") --> ASRTasks (LongTasks worker) + | + v + faster-whisper (GPU) + | + v + Result (JSON) +``` + +- **ahserver**: Web framework serving HTTP on port 9925 +- **longtasks**: Redis-backed async task queue with worker management +- **Redis**: Task queue broker (queue name: `asr`) +- **faster-whisper**: ASR engine running on GPU (CUDA, float16) + +The service follows the same ahserver+longtasks pattern as wan22-service and realesrgan-service. + +## Model + +- **Model**: faster-whisper-large-v3-turbo-ct2 +- **Path**: `/data/ymq/models/deepdml/faster-whisper-large-v3-turbo-ct2` +- **Device**: CUDA (float16) +- **GPU**: Isolated via `CUDA_VISIBLE_DEVICES` (default GPU 5) + +The model is lazy-loaded on first transcription request and stays in GPU memory for subsequent requests. 
+ +## Deployment + +### Prerequisites + +- Python venv with faster-whisper 1.2.1: `/data/ymq/demucs_venv` +- Redis server running on 127.0.0.1:6379 +- CUDA-capable GPU + +### Start + +```bash +cd /data/ymq/asr-service +bash start.sh +``` + +### Stop + +```bash +cd /data/ymq/asr-service +bash stop.sh +``` + +### Health Check + +```bash +curl http://localhost:9925/health +``` + +Returns: +```json +{ + "status": "ok", + "service": "asr-service", + "model": "faster-whisper-large-v3-turbo-ct2" +} +``` + +## API Usage + +Tasks are submitted via Redis, same pattern as wan22-service. + +### Submit a Transcription Task + +```python +import redis +import json +import uuid + +r = redis.Redis(host='127.0.0.1', port=6379) + +task_id = str(uuid.uuid4()) +payload = { + "task_id": task_id, + "task_type": "transcribe", + "audio_path": "/path/to/audio.wav", + "language": "zh", + "word_timestamps": True, + "vad_filter": True, + "output_path": "/tmp/asr-outputs/result.json" +} + +# Push to the Redis queue +r.lpush('asr:queue', json.dumps(payload)) +print(f"Task submitted: {task_id}") +``` + +### Check Task Status + +```python +# Task status is stored in Redis by longtasks +status = r.get(f'asr:status:{task_id}') +result = r.get(f'asr:result:{task_id}') +``` + +## Task Payload Format + +| Field | Type | Required | Default | Description | +|------------------|--------|----------|---------|--------------------------------------| +| task_type | string | Yes | - | Must be `"transcribe"` | +| audio_path | string | Yes | - | Path to input audio file | +| language | string | No | `"zh"` | Language code (zh, en, ja, etc.) 
| +| word_timestamps | bool | No | `True` | Enable word-level timestamps | +| vad_filter | bool | No | `True` | Enable voice activity detection | +| output_path | string | No | - | If set, save result JSON to this path| + +## Output Format + +```json +{ + "status": "ok", + "text": "Full transcription text...", + "language": "zh", + "language_probability": 0.9876, + "duration": 125.340, + "segments": [ + { + "text": "Segment text", + "start": 0.000, + "end": 5.120, + "words": [ + { + "word": "你好", + "start": 0.000, + "end": 0.800, + "probability": 0.9523 + } + ] + } + ], + "processing_time": 3.45, + "audio_path": "/path/to/audio.wav" +} +``` + +## Configuration + +Config file: `conf/config.json` + +| Setting | Value | Description | +|-----------------------|------------------------------|--------------------------------| +| website.port | 9925 | HTTP listen port | +| website.host | 0.0.0.0 | Bind address | +| session_redis | 127.0.0.1:6379 db=1 | Session storage | +| password_key | ASRService2026Key | Auth key | +| filesroot | /tmp/asr-outputs | Output files directory | + +### Environment Variables + +| Variable | Default | Description | +|----------------------|---------|---------------------------------------| +| ASR_GPU_ID | 5 | GPU device ID (for logging) | +| CUDA_VISIBLE_DEVICES | 5 | CUDA device isolation | +| PYTHONPATH | . 
class ASRTasks(LongTasks):
    """LongTasks worker that dispatches queued ASR tasks by ``task_type``."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to LongTasks and record the configured GPU id."""
        super().__init__(*args, **kwargs)
        # Taken from ASR_GPU_ID (default '5'). In this file the value is only
        # used for the startup log line.
        self.gpu_id = int(os.environ.get('ASR_GPU_ID', '5'))

    async def process_task(self, payload, workid=None):
        """Decode one queued task and run the matching worker.

        Args:
            payload: Task description, either an already-decoded dict or its
                JSON encoding as ``str``/``bytes``/``bytearray``.
            workid: Accepted for the LongTasks interface; not used here.

        Returns:
            Whatever the selected worker coroutine returns.

        Raises:
            ValueError: If the payload is not a JSON object or its
                ``task_type`` is not supported.
        """
        import json

        # NOTE(review): how longtasks decodes queue entries is not visible
        # here. Accept raw bytes as well as str so an undecoded Redis value
        # does not fail later with an AttributeError on `.get`.
        if isinstance(payload, (str, bytes, bytearray)):
            payload = json.loads(payload)
        if not isinstance(payload, dict):
            raise ValueError(
                f'Task payload must be a JSON object, got {type(payload).__name__}'
            )

        task_type = payload.get('task_type', '')
        if task_type == 'transcribe':
            # Imported on demand, as in the original code.
            from workers.transcribe import run_transcribe
            return await run_transcribe(self, payload)
        raise ValueError(f'Unknown task_type: {task_type}')


async def on_app_built(app):
    """Startup hook: schedule the longtasks worker if one is registered.

    Args:
        app: Application object passed by the startup hook; not used here.
    """
    env = ServerEnv()
    lt = env.longtasks
    if lt:
        # presumably runs lt.run after a 0.1 s delay — confirm against
        # longtasks.schedule_once
        schedule_once(0.1, lt.run)
        debug(f'ASR longtasks worker started, GPU: {lt.gpu_id}')


def init():
    """Create the ASRTasks worker, publish it on ServerEnv, register the hook."""
    env = ServerEnv()
    env.longtasks = ASRTasks(
        'redis://127.0.0.1:6379', 'asr',
        worker_cnt=1, stuck_seconds=600, max_age_hours=24
    )
    add_startup(on_app_built)


if __name__ == '__main__':
    webapp(init)
#!/bin/bash
# Stop the asr-service.
#
# start.sh launches the service as ".../python ah.py" after cd-ing into
# SERVICE_DIR. The command line alone is not unique to this service: any other
# process started as "python ah.py" matches the same pgrep pattern. Each
# candidate is therefore also checked against its working directory before it
# is signalled. This relies on /proc/<pid>/cwd, i.e. Linux.

SERVICE_DIR=/data/ymq/asr-service
RESOLVED_DIR=$(readlink -f "$SERVICE_DIR" 2>/dev/null)
RESOLVED_DIR=${RESOLVED_DIR:-$SERVICE_DIR}

PID=""
for candidate in $(pgrep -f "python ah.py"); do
    cwd=$(readlink -f "/proc/$candidate/cwd" 2>/dev/null)
    # An empty cwd means the process vanished or is not readable; never treat
    # that as a match.
    if [ -n "$cwd" ] && [ "$cwd" = "$RESOLVED_DIR" ]; then
        PID=$candidate
        break
    fi
done

if [ -z "$PID" ]; then
    echo "asr-service is not running"
    exit 0
fi

echo "Stopping asr-service (PID: $PID)..."
kill "$PID"

# Wait up to 10 seconds for graceful shutdown
for i in $(seq 1 10); do
    if ! kill -0 "$PID" 2>/dev/null; then
        echo "asr-service stopped"
        exit 0
    fi
    sleep 1
done

# Force kill if still running
echo "Force killing asr-service (PID: $PID)..."
kill -9 "$PID"
echo "asr-service killed"
# Module-level model cache (lazy-loaded, stays in memory)
_model = None
_model_lock = None

MODEL_PATH = '/data/ymq/models/deepdml/faster-whisper-large-v3-turbo-ct2'


def _get_lock():
    """Return the module-wide asyncio lock that guards model loading.

    The lock is created on first use rather than at import time.
    """
    global _model_lock
    if _model_lock is None:
        _model_lock = asyncio.Lock()
    return _model_lock


def _create_model():
    """Build the WhisperModel. Blocking; intended to run in a worker thread."""
    from faster_whisper import WhisperModel

    # CUDA device 0 — CUDA_VISIBLE_DEVICES already isolates the GPU
    return WhisperModel(
        MODEL_PATH,
        device='cuda',
        device_index=0,
        compute_type='float16',
        num_workers=1,
    )


async def load_model():
    """Lazy-load the faster-whisper model once and return the cached instance.

    Concurrent coroutines on the same event loop are serialised by an
    asyncio.Lock, so the model is constructed at most once. The lock is not a
    thread lock; call this from the event loop only.

    Returns:
        The shared WhisperModel instance.
    """
    global _model
    if _model is not None:
        return _model

    async with _get_lock():
        # Re-check: another coroutine may have finished loading while this one
        # was waiting for the lock.
        if _model is not None:
            return _model

        debug(f'Loading faster-whisper model from {MODEL_PATH}...')
        t0 = time.time()

        # Model construction is synchronous and slow; keep it off the event
        # loop so the HTTP side stays responsive while it loads.
        loop = asyncio.get_running_loop()
        _model = await loop.run_in_executor(None, _create_model)

        elapsed = time.time() - t0
        debug(f'faster-whisper model loaded in {elapsed:.1f}s')
        return _model


def _segment_to_dict(seg, word_timestamps):
    """Convert one transcription segment into a JSON-serialisable dict."""
    seg_data = {
        'text': seg.text,
        'start': round(seg.start, 3),
        'end': round(seg.end, 3),
    }
    if word_timestamps and seg.words:
        seg_data['words'] = [
            {
                'word': w.word,
                'start': round(w.start, 3),
                'end': round(w.end, 3),
                'probability': round(w.probability, 4),
            }
            for w in seg.words
        ]
    return seg_data


def _transcribe_sync(model, audio_path, language, word_timestamps, vad_filter):
    """Run a full transcription synchronously and return ``(segments, info)``."""
    segments_gen, info = model.transcribe(
        audio_path,
        language=language,
        word_timestamps=word_timestamps,
        vad_filter=vad_filter,
    )
    # transcribe() hands back a lazy generator: the decoding work happens while
    # it is iterated. Drain it here, in the worker thread, instead of in the
    # coroutine where it would stall the event loop.
    segments = [_segment_to_dict(seg, word_timestamps) for seg in segments_gen]
    return segments, info


def _save_result(result, output_path):
    """Write *result* as UTF-8 JSON to *output_path*, creating parent dirs."""
    out_dir = os.path.dirname(output_path)
    # A bare filename has an empty dirname, and os.makedirs('') raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)


async def run_transcribe(tasks, payload):
    """
    Run transcription on an audio file.

    Args:
        tasks: The task manager that dispatched this call; not used here.
        payload (dict): Task fields:
            audio_path (str): Path to the audio file (required)
            language (str): Language code, default 'zh'
            word_timestamps (bool): Enable word-level timestamps, default True
            vad_filter (bool): Enable VAD filter, default True
            output_path (str): Optional path to save result JSON

    Returns:
        dict with keys status, text, language, language_probability,
        duration, segments, processing_time and audio_path.

    Raises:
        ValueError: If ``audio_path`` is missing or empty.
        FileNotFoundError: If ``audio_path`` does not exist.
    """
    audio_path = payload.get('audio_path')
    if not audio_path:
        raise ValueError('audio_path is required')

    if not os.path.exists(audio_path):
        raise FileNotFoundError(f'Audio file not found: {audio_path}')

    language = payload.get('language', 'zh')
    word_timestamps = payload.get('word_timestamps', True)
    vad_filter = payload.get('vad_filter', True)
    output_path = payload.get('output_path')

    debug(f'Transcribing: {audio_path} (lang={language}, vad={vad_filter}, words={word_timestamps})')
    t0 = time.time()

    model = await load_model()

    # The whole transcription, including segment iteration, runs in a worker
    # thread so the event loop is not blocked.
    loop = asyncio.get_running_loop()
    segments, info = await loop.run_in_executor(
        None,
        _transcribe_sync,
        model,
        audio_path,
        language,
        word_timestamps,
        vad_filter,
    )

    elapsed = time.time() - t0
    result = {
        'status': 'ok',
        'text': ' '.join(s['text'] for s in segments),
        'language': info.language,
        'language_probability': round(info.language_probability, 4),
        'duration': round(info.duration, 3),
        'segments': segments,
        'processing_time': round(elapsed, 2),
        'audio_path': audio_path,
    }

    debug(f'Transcription done in {elapsed:.1f}s: {len(segments)} segments, '
          f'duration={info.duration:.1f}s, lang={info.language}')

    # Save result if output_path specified
    if output_path:
        _save_result(result, output_path)
        debug(f'Result saved to {output_path}')

    return result