first commit

2026-04-17 15:16:12 +08:00 · 2026-04-17 15:16:12 +08:00 · ebe895f7fb
commit ebe895f7fb
7 changed files with 244 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,31 @@
 # 音频文本对齐服务
 本服务部署在有GPU的主机上
 ## api
 请求格式
 ```
 curl -X POST https://server:port/align \
 	-F "text=音频中的文字" \
 	-F "audio_file=@/path/to/音频文件"
 ```
 输出：
 ```
 [
 	{
 		"sentence": "世界啊你好",
 		"start": 0.123,
 		"end": 1.45,
 		"chars":[
 			{
 				"char": "世",
 				"start":0.123,
 				"end": 0.543
 			},
 			...
 		]
 	}
 	...
 ]
 ```
--- a/app/align.py
+++ b/app/align.py
@ -0,0 +1,133 @@
 import torch
 import torchaudio
 import numpy as np
 from transformers import AutoProcessor, Wav2Vec2ForCTC
 from ctc_segmentation import (
    ctc_segmentation,
    CtcSegmentationParameters,
    prepare_text
 )
 class AlignEngine:
    """Character-level forced alignment of a transcript against an audio file.

    Wraps a locally stored Wav2Vec2 CTC model together with the
    ``ctc_segmentation`` package. The processor and model are loaded once in
    ``__init__`` and reused by every ``align`` call.
    """

    def __init__(
        self,
        model_path: str,
        device: str = "cuda",
        dtype: str = "float16"
    ):
        """Load the processor and CTC model from a local directory.

        Args:
            model_path: Local model directory (for example /models/mms-aligner).
            device: Torch device string such as "cuda" or "cpu".
            dtype: "float16" selects half precision, any other value selects
                float32. Half precision is only applied on CUDA devices; on
                every other device the model stays in float32.
        """
        self.device = device
        # Effective compute dtype: honour the requested dtype, but never run
        # half precision outside CUDA (the model was never halved on CPU).
        use_half = dtype == "float16" and str(device).startswith("cuda")
        self.dtype = torch.float16 if use_half else torch.float32

        # Load processor + model from the local path.
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_path)
        self.model.to(self.device)
        self.model.eval()
        if use_half:
            self.model = self.model.half()

        # Cache the id -> token table once so align() does not rebuild it.
        vocab = self.processor.tokenizer.get_vocab()
        self.inv_vocab = {v: k for k, v in vocab.items()}
        self.labels = [self.inv_vocab[i] for i in range(len(self.inv_vocab))]
        self.sample_rate = 16000

    # -----------------------------
    # Audio loading
    # -----------------------------
    def load_audio(self, audio_path):
        """Read an audio file and return a mono 1-D waveform at ``sample_rate``.

        Args:
            audio_path: Path of the audio file, passed to ``torchaudio.load``.

        Returns:
            A 1-D tensor of samples resampled to ``self.sample_rate``.
        """
        speech, sr = torchaudio.load(audio_path)
        # torchaudio returns (channels, samples); average the channels so that
        # stereo files also yield a 1-D waveform (squeeze() only handled mono).
        if speech.dim() > 1:
            speech = speech.mean(dim=0)
        if sr != self.sample_rate:
            speech = torchaudio.functional.resample(
                speech, sr, self.sample_rate
            )
        return speech

    # -----------------------------
    # Logits computation
    # -----------------------------
    def get_logits(self, speech):
        """Run the CTC model on a waveform.

        Args:
            speech: Mono waveform sampled at ``self.sample_rate``.

        Returns:
            A float32 numpy array holding the logits of the first (only)
            batch item, one row per model output frame.
        """
        inputs = self.processor(
            speech,
            sampling_rate=self.sample_rate,
            return_tensors="pt"
        )
        # The input must share the model's dtype, otherwise a half-precision
        # model rejects the float32 features produced by the processor.
        input_values = inputs.input_values.to(self.device, dtype=self.dtype)
        with torch.no_grad():
            logits = self.model(input_values).logits
        # Always hand float32 to numpy, even when the model ran in float16.
        return logits[0].detach().float().cpu().numpy()

    # -----------------------------
    # Per-character result assembly
    # -----------------------------
    @staticmethod
    def _collect_char_timings(chars, timings, char_probs, base, index_duration):
        """Build one result record per character from ctc_segmentation output.

        Args:
            chars: The characters that were aligned, in order.
            timings: Start time in seconds of every ground-truth symbol, as
                returned by ``ctc_segmentation`` (indexed by symbol position).
            char_probs: Per-frame log-probabilities along the aligned path, as
                returned by ``ctc_segmentation`` (indexed by frame).
            base: Ground-truth index of the separator symbol that precedes the
                first character (``utt_begin_indices[0]``).
            index_duration: Duration in seconds of one model output frame.

        Returns:
            A list of ``{"char", "start", "end", "prob"}`` dicts. ``prob`` is
            the probability of the character at the frame where it starts.

        Raises:
            ValueError: If ``timings`` holds fewer symbols than ``chars``
                needs, which happens when characters were dropped from the
                ground truth (e.g. they are not in the model vocabulary).
        """
        # Needed symbols: leading separator + every char + trailing separator.
        if len(timings) < base + len(chars) + 2:
            raise ValueError(
                "alignment has fewer symbols than the text; some characters "
                "are probably missing from the model vocabulary"
            )
        last_frame = len(char_probs) - 1
        results = []
        for i, c in enumerate(chars):
            # +1 skips the separator symbol stored at ``base``.
            pos = base + 1 + i
            start = float(timings[pos])
            # A character ends where the next ground-truth symbol starts.
            end = float(timings[pos + 1])
            frame = min(max(int(round(start / index_duration)), 0), last_frame)
            results.append({
                "char": c,
                "start": start,
                "end": end,
                "prob": float(np.exp(char_probs[frame]))
            })
        return results

    # -----------------------------
    # Main alignment entry point (per character)
    # -----------------------------
    def align(self, audio_path: str, text: str):
        """Align every character of ``text`` to the audio.

        All whitespace (spaces, tabs, newlines) is removed from ``text``
        before alignment, so the result has one entry per remaining character.

        Args:
            audio_path: Path of the audio file to align against.
            text: Transcript of the audio.

        Returns:
            A list such as::

                [
                  {"char": "你", "start": 0.1, "end": 0.2, "prob": 0.9},
                  ...
                ]

            with times in seconds. An empty list is returned when ``text``
            contains no non-whitespace character.

        Raises:
            ValueError: If characters of ``text`` could not be placed in the
                alignment (see ``_collect_char_timings``).
        """
        # Multilingual / Chinese text is aligned strictly character by character.
        chars = list("".join(text.split()))
        if not chars:
            return []

        speech = self.load_audio(audio_path)
        logits = self.get_logits(speech)

        # ctc_segmentation adds scores along a path, so it is given per-frame
        # log-probabilities (log-softmax) rather than unnormalised logits.
        log_probs = logits.astype(np.float32)
        log_probs = log_probs - np.logaddexp.reduce(
            log_probs, axis=-1, keepdims=True
        )

        config = CtcSegmentationParameters()
        config.char_list = self.labels
        # Seconds covered by one output frame; the library scales the
        # timings it returns by this value.
        audio_duration = speech.shape[-1] / self.sample_rate
        config.index_duration = audio_duration / log_probs.shape[0]

        ground_truth_mat, utt_begin_indices = prepare_text(
            config, [chars]
        )
        timings, char_probs, _ = ctc_segmentation(
            config,
            log_probs,
            ground_truth_mat
        )
        return self._collect_char_timings(
            chars,
            timings,
            char_probs,
            utt_begin_indices[0],
            config.index_duration
        )
--- a/app/aligner.py
+++ b/app/aligner.py
@ -0,0 +1,39 @@
 from align import AlignEngine
 from ahserver.serverenv import ServerEnv
 from ahserver.webapp import webapp
 from ahserver.filestorage import FileStorage
 from appPublic.worker import awaitify
 from appPublic.jsonConfig import getConfig
 async def align(audio_webpath, text):
    """Align ``text`` to an uploaded audio file and group the result by line.

    Args:
        audio_webpath: Web path of the audio file; it is resolved to a real
            filesystem path through ``FileStorage.realPath``.
        text: Transcript; each non-empty line becomes one sentence segment.

    Returns:
        A list of segments, one per line that has alignable characters::

            {"sentence": <line>, "start": <sec>, "chars": [...], "end": <sec>}

        where ``chars`` holds the engine's per-character records.

    Raises:
        ValueError: If the engine returned fewer character records than the
            text contains.
    """
    env = ServerEnv()
    fs = FileStorage()
    audio_path = fs.realPath(audio_webpath)
    # NOTE(review): awaitify presumably wraps the blocking engine call so it
    # can be awaited without stalling the event loop -- confirm in appPublic.
    run_align = awaitify(env.align_engine.align)
    char_items = await run_align(audio_path, text)
    return split_alignment_by_line(text, char_items)


 def split_alignment_by_line(text, char_items):
    """Group per-character alignment records into one segment per text line.

    Whitespace inside a line has no alignment record and is skipped. Records
    whose ``char`` is itself whitespace (for example a newline emitted by the
    engine) are skipped as well, so both sequences stay in step.

    Args:
        text: The transcript; lines are separated by ``'\\n'``.
        char_items: Per-character dicts with at least ``char``, ``start`` and
            ``end`` keys, in transcript order.

    Returns:
        A list of segment dicts (see ``align``). Lines without any alignable
        character are omitted.

    Raises:
        ValueError: If ``char_items`` runs out before the text does.
    """
    sentences = []
    total = len(char_items)
    pos = 0
    for line in text.split('\n'):
        picked = []
        for ch in line:
            if ch.isspace():
                continue
            # Step over records that describe whitespace, not a character.
            while pos < total and char_items[pos]['char'].isspace():
                pos += 1
            if pos >= total:
                raise ValueError(
                    'alignment result is shorter than the text'
                )
            picked.append(char_items[pos])
            pos += 1
        if not picked:
            continue
        sentences.append({
            'sentence': line,
            'start': picked[0]['start'],
            'chars': picked,
            'end': picked[-1]['end'],
        })
    return sentences
 def init():
    """Attach the alignment engine and the ``align`` coroutine to the server env.

    Reads ``align_model`` from the application configuration, builds one
    ``AlignEngine`` from it and stores it as ``align_engine`` on the
    ``ServerEnv`` instance; the module-level ``align`` function is stored on
    the same object as ``align``.
    """
    server_env = ServerEnv()
    settings = getConfig()
    engine = AlignEngine(settings.align_model)
    server_env.align_engine = engine
    server_env.align = align
 if __name__ == '__main__':
    # Script entry point: start the web application, passing init as its
    # setup callback. (The guard previously read ``__name_``, an undefined
    # name that raised NameError as soon as the module was executed.)
    webapp(init)
--- a/conf/config.json
+++ b/conf/config.json
@ -0,0 +1,36 @@
 {
    "align_model": "/data/ymq/models/MahmoudAshraf/mms-300m-1130-forced-aligner",
    "website":{
        "paths":[
            ["$[workdir]$/wwwroot",""]
        ],
        "client_max_size":1000000000,
        "host":"0.0.0.0",
        "port":8080,
        "coding":"utf-8",
        "indexes":[
            "index.html",
            "index.ui",
            "index.dspy"
        ],
        "processors":[
            [".proxy","proxy"],
            [".tmpl.js","tmpl"],
            [".tmpl.css","tmpl"],
            [".html.tmpl","tmpl"],
            [".tmpl","tmpl"],
            [".app","app"],
            [".ui","bui"],
            [".dspy","dspy"]
        ],
        "startswiths":[
            {
                "leading":"/idfile",
                "registerfunction":"idfile"
            },{
                "leading":"/i18n_getmsgs",
                "registerfunction":"i18n"
            }
        ]
    }
 }
--- a/files/README.md
+++ b/files/README.md
--- a/logs/README.md
+++ b/logs/README.md
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,5 @@
 torch
 torchaudio
 transformers
 librosa
 ctc-segmentation