yumoqing 2025-07-18 15:50:49 +08:00
commit 3d2f799eee
55 changed files with 5901 additions and 0 deletions

0
README.md Normal file

Binary file not shown.

Binary file not shown.

246
llmengine/base_chat_llm.py Normal file

@ -0,0 +1,246 @@
import threading
import asyncio
import json
import torch
from time import time
from transformers import TextIteratorStreamer
from appPublic.log import debug
from appPublic.worker import awaitify
from appPublic.uniqueID import getID
model_pathMap = {
}
def llm_register(model_key, Klass):
model_pathMap[model_key] = Klass
def get_llm_class(model_path):
for k,klass in model_pathMap.items():
if len(model_path.split(k)) > 1:
return klass
print(f'{model_pathMap=}')
return None
class BaseChatLLM:
def use_mps_if_prosible(self):
if torch.backends.mps.is_available():
device = torch.device("mps")
self.model = self.model.to(device)
def get_session_key(self):
return self.model_id + ':messages'
def _get_session_messages(self, session):
key = self.get_session_key()
messages = session.get(key) or []
return messages
def _set_session_messages(self, session, messages):
key = self.get_session_key()
session[key] = messages
def get_streamer(self):
return TextIteratorStreamer(
tokenizer=self.tokenizer,
skip_special_tokens=True,
skip_prompt=True
)
def output_generator(self, streamer):
all_txt = ''
t1 = time()
t2 = t1
i = 0
id = f'chatllm-{getID()}'
for txt in streamer:
if txt == '':
continue
if i == 0:
t2 = time()
i += 1
all_txt += txt
yield {
"id":id,
"object":"chat.completion.chunk",
"created":time(),
"model":self.model_id,
"choices":[
{
"index":0,
"delta":{
"content":txt
},
"logprobs":None,
"finish_reason":None
}
]
}
t3 = time()
t = all_txt
unk = self.tokenizer(t, return_tensors="pt")
output_tokens = len(unk["input_ids"][0])
yield {
"id":id,
"object":"chat.completion.chunk",
"created":time(),
"model":self.model_id,
"response_time": t2 - t1,
"finish_time": t3 - t1,
"output_token": output_tokens,
"choices":[
{
"index":0,
"delta":{
"content":""
},
"logprobs":None,
"finish_reason":"stop"
}
]
}
def _generator(self, session, prompt, image_path, video_path, audio_path, sys_prompt):
messages = self._get_session_messages(session)
if sys_prompt:
messages.append(self._build_sys_message(sys_prompt))
messages.append(self._build_user_message(prompt, image_path=image_path, video_path=video_path, audio_path=audio_path))
# debug(f'{messages=}')
all_txt = ''
for d in self._gen(messages):
if d['choices'][0]['finish_reason'] == 'stop':
messages.append(self._build_assistant_message(all_txt))
else:
all_txt += d['choices'][0]['delta']['content']
yield d
self._set_session_messages(session, messages)
async def _async_generator(self, session, prompt, image_path, video_path, audio_path, sys_prompt):
for d in self._generator(session, prompt, image_path, video_path, audio_path, sys_prompt):
await asyncio.sleep(0)
yield d
def generate(self, session, prompt,
image_path=None,
video_path=None,
audio_path=None,
sys_prompt=None):
for d in self._generator(session, prompt, image_path, video_path, audio_path, sys_prompt):
if d['choices'][0]['finish_reason'] == 'stop':
return d
def stream_generate(self, session, prompt,
image_path=None,
video_path=None,
audio_path=None,
sys_prompt=None):
for d in self._generator(session, prompt, image_path, video_path, audio_path, sys_prompt):
s = f'data: {json.dumps(d)}\n'
yield s
async def async_generate(self, session, prompt,
image_path=None,
video_path=None,
audio_path=None,
sys_prompt=None):
async for d in self._async_generator(session, prompt, image_path, video_path, audio_path, sys_prompt):
await asyncio.sleep(0)
if d['choices'][0]['finish_reason'] == 'stop':
return d
async def async_stream_generate(self, session, prompt,
image_path=None,
video_path=None,
audio_path=None,
sys_prompt=None):
async for d in self._async_generator(session, prompt, image_path, video_path, audio_path, sys_prompt):
s = f'data: {json.dumps(d)}\n'
yield s
yield 'data: [DONE]'
def build_kwargs(self, inputs, streamer):
generate_kwargs = dict(
**inputs,
streamer=streamer,
max_new_tokens=512,
do_sample=True,
eos_token_id=self.tokenizer.eos_token_id
)
return generate_kwargs
def _messages2inputs(self, messages):
return self.processor.apply_chat_template(
messages, add_generation_prompt=True,
tokenize=True,
return_dict=True, return_tensors="pt"
).to(self.model.device, dtype=torch.bfloat16)
def _gen(self, messages):
inputs = self._messages2inputs(messages)
input_len = inputs["input_ids"].shape[-1]
streamer = self.get_streamer()
kwargs = self.build_kwargs(inputs, streamer)
thread = threading.Thread(target=self.model.generate,
kwargs=kwargs)
thread.start()
for d in self.output_generator(streamer):
if d['choices'][0]['finish_reason'] == 'stop':
d['input_tokens'] = input_len
yield d
class T2TChatLLM(BaseChatLLM):
def _build_assistant_message(self, prompt):
return {
"role":"assistant",
"content":prompt
}
def _build_sys_message(self, prompt):
return {
"role":"system",
"content": prompt
}
def _build_user_message(self, prompt, **kw):
return {
"role":"user",
"content": prompt
}
class MMChatLLM(BaseChatLLM):
""" multiple modal chat LLM """
def _build_assistant_message(self, prompt):
return {
"role":"assistant",
"content":[{"type": "text", "text": prompt}]
}
def _build_sys_message(self, prompt):
return {
"role":"system",
"content":[{"type": "text", "text": prompt}]
}
def _build_user_message(self, prompt, image_path=None,
video_path=None, audio_path=None):
contents = [
{
"type":"text", "text": prompt
}
]
if image_path:
contents.append({
"type": "image",
"image": image_path
})
if video_path:
contents.append({
"type": "video",
"video":video_path
})
if audio_path:
contents.append({
"tyoe": "audio",
"audio": audio_path
})
return {
"role": "user",
"content": contents
}
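A minimal consumption sketch (not part of this commit) of the chunk protocol that stream_generate produces, assuming a concrete subclass such as Qwen3LLM from llmengine.qwen3 and a hypothetical local model path:

from llmengine.qwen3 import Qwen3LLM

llm = Qwen3LLM('/path/to/Qwen/Qwen3-0.6B')   # hypothetical local model path
llm.use_mps_if_prosible()                    # optional: move the model to Apple MPS when available
session = {}                                 # any mutable mapping works as the session store
for chunk in llm.stream_generate(session, 'who are you', sys_prompt='You are a helpful assistant.'):
    # every chunk is an SSE line: 'data: {"object": "chat.completion.chunk", "choices": [...]}\n'
    print(chunk, end='')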


@ -0,0 +1,46 @@
import torch
model_pathMap = {
}
def llm_register(model_key, Klass):
global model_pathMap
model_pathMap[model_key] = Klass
def get_llm_class(model_path):
for k,klass in model_pathMap.items():
if len(model_path.split(k)) > 1:
return klass
print(f'{model_pathMap=}')
return None
class BaseEmbedding:
def use_mps_if_prosible(self):
if torch.backends.mps.is_available():
device = torch.device("mps")
self.model = self.model.to(device)
def embeddings(self, input):
es = self.model.encode(input)
data = []
for i, e in enumerate(es):
d = {
"object": "embedding",
"index": i,
"embedding": e.tolist()
}
data.append(d)
return {
"object": "list",
"data": data,
"model": self.model_name,
"usage": {
"prompt_tokens": 0,
"total_tokens": 0
}
}
def similarity(self, qvector, docvectors):
s = self.model.similarity([qvector], docvectors)
return s[0]
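A short embed-then-score sketch (not part of this commit), using the Qwen3Embedding subclass that embedding.py imports from llmengine.qwen3embedding; the model path is hypothetical:

from llmengine.qwen3embedding import Qwen3Embedding

emb = Qwen3Embedding('/path/to/Qwen/Qwen3-Embedding-0.6B')   # hypothetical local model path
out = emb.embeddings(['what is quantum computing?',
                      'quantum computers use qubits',
                      'the weather forecast relies on statistical models'])
vectors = [d['embedding'] for d in out['data']]
# score the first sentence against the other two via the underlying SentenceTransformer similarity
print(emb.similarity(vectors[0], vectors[1:]))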


@ -0,0 +1,84 @@
import torch
model_pathMap = {
}
def llm_register(model_key, Klass):
model_pathMap[model_key] = Klass
def get_llm_class(model_path):
for k,klass in model_pathMap.items():
if len(model_path.split(k)) > 1:
return klass
print(f'{model_pathMap=}')
return None
class BaseReranker:
def __init__(self, model_id, **kw):
self.model_id = model_id
def use_mps_if_prosible(self):
if torch.backends.mps.is_available():
device = torch.device("mps")
self.model = self.model.to(device)
def process_inputs(self, pairs):
inputs = self.tokenizer(
pairs, padding=False, truncation='longest_first',
return_attention_mask=False, max_length=self.max_length
)
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=self.max_length)
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
def build_sys_prompt(self, sys_prompt):
return f"<|im_start|>system\n{sys_prompt}\n<|im_end|>"
def build_user_prompt(self, query, doc, instruct=''):
return f'<|im_start|>user\n<Instruct>: {instruct}\n<Query>:{query}\n<Document>:\n{doc}<|im_end|>'
def build_assistant_prompt(self):
return "<|im_start|>assistant\n<think>\n\n</think>\n\n"
def compute_logits(self, inputs, **kwargs):
batch_scores = self.model(**inputs).logits[:, -1, :]
# true_vector = batch_scores[:, token_true_id]
# false_vector = batch_scores[:, token_false_id]
# batch_scores = torch.stack([false_vector, true_vector], dim=1)
batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
scores = batch_scores[:, 1].exp().tolist()
return scores
def build_pairs(self, query, docs, sys_prompt="", task=""):
sys_str = self.build_sys_prompt(sys_prompt)
ass_str = self.build_assistant_prompt()
pairs = [ sys_str + '\n' + self.build_user_prompt(query, doc, task) + '\n' + ass_str for doc in docs ]
return pairs
def rerank(self, query, docs, top_n, sys_prompt="", task=""):
pairs = self.build_pairs(query, docs, sys_prompt=sys_prompt, task=task)
with torch.no_grad():
inputs = self.process_inputs(pairs)
scores = self.compute_logits(inputs)
data = []
for i, s in enumerate(scores):
d = {
'index':i,
'relevance_score': s
}
data.append(d)
data = sorted(data,
key=lambda x: x["relevance_score"],
reverse=True)
if len(data) > top_n:
data = data[:top_n]
ret = {
"data": data,
"object": "rerank.result",
"model": self.model_name,
"usage": {
"prompt_tokens": 0,
"total_tokens": 0
}
}
return ret


@ -0,0 +1,80 @@
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model_pathMap = {
}
def llm_register(model_key, Klass):
model_pathMap[model_key] = Klass
def get_llm_class(model_path):
for k,klass in model_pathMap.items():
if len(model_path.split(k)) > 1:
return klass
print(f'{model_pathMap=}')
return None
class BaseRelationLLM:
def extract_triplets_typed(self, text):
triplets = []
relation = ''
text = text.strip()
current = 'x'
subject, relation, object_, object_type, subject_type = '','','','',''
for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").replace("tp_XX", "").replace("__en__", "").split():
if token == "<triplet>" or token == "<relation>":
current = 't'
if relation != '':
triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(),'tail': object_.strip(), 'tail_type': object_type})
relation = ''
subject = ''
elif token.startswith("<") and token.endswith(">"):
if current == 't' or current == 'o':
current = 's'
if relation != '':
triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(),'tail': object_.strip(), 'tail_type': object_type})
object_ = ''
subject_type = token[1:-1]
else:
current = 'o'
object_type = token[1:-1]
relation = ''
else:
if current == 't':
subject += ' ' + token
elif current == 's':
object_ += ' ' + token
elif current == 'o':
relation += ' ' + token
if subject != '' and relation != '' and object_ != '' and object_type != '' and subject_type != '':
triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(),'tail': object_.strip(), 'tail_type': object_type})
return triplets
def build_inputs(self, text):
# Tokenizer text
return self.tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt')
def gen_preds(self, inputs):
# Generate
generated_tokens = self.model.generate(
inputs['input_ids'].to(self.model.device),
attention_mask=inputs["attention_mask"].to(self.model.device),
decoder_start_token_id = self.tokenizer.convert_tokens_to_ids("tp_XX"),
**self.gen_kwargs
)
# Extract text
decoded_preds = self.tokenizer.batch_decode(generated_tokens,
skip_special_tokens=False)
return decoded_preds
def extract_triplets(self, text):
inputs = self.build_inputs(text)
preds = self.gen_preds(inputs)
# Extract triplets from every decoded prediction
triplets = []
for sentence in preds:
x = self.extract_triplets_typed(sentence)
triplets += x
return triplets
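A worked trace (illustrative only, not actual model output) of the typed-triplet decoding format that extract_triplets_typed parses above:

# The decoder emits sequences of the form
#   <s><triplet> head tokens <head_type> tail tokens <tail_type> relation tokens</s>
sample = "<s><triplet> Paris <loc> France <loc> capital of</s>"
# Fed through extract_triplets_typed, this hand-made string parses to:
# [{'head': 'Paris', 'head_type': 'loc', 'type': 'capital of',
#   'tail': 'France', 'tail_type': 'loc'}]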

31
llmengine/bge_reranker.py Normal file

@ -0,0 +1,31 @@
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from llmengine.base_reranker import BaseReranker, llm_register
class BgeReranker(BaseReranker):
def __init__(self, model_id, max_length=8096):
if 'bge-reranker' not in model_id:
e = Exception(f'{model_id} is not a bge-reranker')
raise e
self.tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
model.eval()
self.model = model
self.model_id = model_id
self.model_name = model_id.split('/')[-1]
def build_pairs(self, query, docs, **kw):
return [[query, doc] for doc in docs]
def process_inputs(self, pairs):
inputs = self.tokenizer(pairs, padding=True,
truncation=True, return_tensors='pt', max_length=512)
return inputs
def compute_logits(self, inputs):
scores = self.model(**inputs,
return_dict=True).logits.view(-1, ).float()
scores = [ s.item() for s in scores ]
return scores
llm_register('bge-reranker', BgeReranker)
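A reranker usage sketch (not part of this commit); the local path is hypothetical, but it must contain 'bge-reranker', which both the constructor check and get_llm_class rely on:

from llmengine.bge_reranker import BgeReranker

rr = BgeReranker('/path/to/BAAI/bge-reranker-v2-m3')   # hypothetical local model path
result = rr.rerank('what is quantum computing?',
                   ['quantum computers use qubits',
                    'the weather forecast relies on statistical models'],
                   top_n=1)
print(result['data'])   # e.g. [{'index': 0, 'relevance_score': ...}]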

212
llmengine/chatllm.py Normal file

@ -0,0 +1,212 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from time import time
import torch
from threading import Thread
def is_chat_model(model_name: str, tokenizer) -> bool:
chat_keywords = ["chat", "chatml", "phi", "llama-chat", "mistral-instruct"]
if any(k in model_name.lower() for k in chat_keywords):
return True
if tokenizer and hasattr(tokenizer, "additional_special_tokens"):
if any(tag in tokenizer.additional_special_tokens for tag in ["<|user|>", "<|system|>", "<|assistant|>"]):
return True
return False
def build_chat_prompt(messages):
prompt = ""
for message in messages:
role = message["role"]
content = message["content"]
prompt += f"<|{role}|>\n{content}\n"
prompt += "<|assistant|>\n" # 生成开始
return prompt
class CountingStreamer(TextIteratorStreamer):
def __init__(self, tokenizer, skip_prompt=True, **kw):
super().__init__(tokenizer, skip_prompt=skip_prompt, **kw)
self.token_count = 0
def put(self, value):
# count generated token ids as model.generate() pushes them into the streamer;
# prompt tokens are skipped here to mirror TextStreamer's skip_prompt handling
if not (self.skip_prompt and self.next_tokens_are_prompt):
self.token_count += value.numel()
super().put(value)
class TransformersChatEngine:
def __init__(self, model_name: str, device: str = None, fp16: bool = True,
output_json=True,
gpus: int = 1):
"""
General-purpose LLM loader with control over the number and selection of GPUs.
:param model_name: model name or path
:param device: target device, e.g. "cuda:0"; auto-selected by default
:param fp16: whether to use fp16 precision (on GPUs that support it)
:param gpus: number of GPUs to use; 1 means single-GPU, >1 enables multi-GPU inference with device_map='auto'
"""
self.output_json = output_json
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
self.is_multi_gpu = gpus > 1 and torch.cuda.device_count() >= gpus
print(f"✅ Using device: {self.device}, GPUs: {gpus}, Multi-GPU: {self.is_multi_gpu}")
# Load the tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Load the model
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16 if fp16 and "cuda" in self.device else torch.float32,
device_map="auto" if self.is_multi_gpu else None
)
if not self.is_multi_gpu:
self.model.to(self.device)
self.model.eval()
self.is_chat = is_chat_model(model_name, self.tokenizer)
if self.is_chat:
self.messages = [ ]
print(f'{self.model.generation_config=}')
def set_system_prompt(self, prompt):
if self.is_chat:
self.messages = [{
'role': 'system',
'content': prompt
}]
def set_assistant_prompt(self, prompt):
if self.is_chat:
self.messages.append({
'role': 'assistant',
'content': prompt
})
def set_user_prompt(self, prompt):
if self.is_chat:
self.messages.append({
'role': 'user',
'content': prompt
})
return build_chat_prompt(self.messages)
return prompt
def generate(self, prompt: str):
t1 = time()
prompt = self.set_user_prompt(prompt)
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
output_ids = self.model.generate(
**inputs,
max_new_tokens=128,
generation_config=self.model.generation_config
)
output_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
t2 = time()
text = output_text[len(prompt):] if output_text.startswith(prompt) else output_text
self.set_assistant_prompt(text)
if not self.output_json:
return text
input_tokens = inputs["input_ids"].shape[1]
output_tokens = len(self.tokenizer(text, return_tensors="pt")["input_ids"][0])
return {
'content':text,
'input_tokens': input_tokens,
'output_tokens': output_tokens,
'finish_time': t2 - t1,
'response_time': t2 - t1
}
def stream_generate(self, prompt: str):
t1 = time()
prompt = self.set_user_prompt(prompt)
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
input_tokens = inputs["input_ids"].shape[1]
streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(
**inputs,
streamer=streamer,
max_new_tokens=16000,
generation_config=self.model.generation_config
)
thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
thread.start()
first = True
all_txt = ''
for new_text in streamer:
all_txt += new_text
if first:
t2 = time()
first = False
if not self.output_json:
yield new_text
continue
yield {
'content': new_text,
'done': False
}
output_tokens = len(self.tokenizer(all_txt, return_tensors="pt")["input_ids"][0])
self.set_assistant_prompt(all_txt)
t3 = time()
if self.output_json:
yield {
'done': True,
'content':'',
'response_time': t2 - t1,
'finish_time': t3 - t1,
'input_tokens': input_tokens,
'output_tokens': output_tokens
}
if __name__ == '__main__':
import os
import sys
import argparse
def parse_args():
parser = argparse.ArgumentParser(description="Transformers Chat CLI")
parser.add_argument("--model", type=str, required=True, help="模型路径或 Hugging Face 名称")
parser.add_argument("--gpus", type=int, default=1, help="使用 GPU 数量")
parser.add_argument("--stream", action="store_true", help="是否流式输出")
return parser.parse_args()
def print_content(outd):
if isinstance(outd, dict):
print(outd['content'], end="", flush=True)
else:
print(outd, end="", flush=True)
def print_info(outd):
if isinstance(outd, dict):
if outd['done']:
print(f"response_time={outd['response_time']}, finish_time={outd['finish_time']}, input_tokens={outd['input_tokens']}, output_tokens={outd['output_tokens']}\n")
else:
print('\n');
def generate(engine, stream):
while True:
print('prompt("q" to exit):')
p = input()
if p == 'q':
break
if not p:
continue
if stream:
for outd in engine.stream_generate(p):
print_content(outd)
print('\n')
print_info(outd)
else:
outd = engine.generate(p)
print_content(outd)
print('\n')
print_info(outd)
def main():
args = parse_args()
print(f'{args=}')
engine = TransformersChatEngine(
model_name=args.model,
gpus=args.gpus
)
generate(engine, args.stream)
main()
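An instantiation sketch (not part of this commit) matching the constructor documented above; the model path is hypothetical:

from llmengine.chatllm import TransformersChatEngine

engine = TransformersChatEngine('/share/models/Qwen/Qwen3-0.6B', fp16=True, gpus=1)
print(engine.generate('who are you')['content'])
for chunk in engine.stream_generate('explain rerankers in one sentence'):
    # with output_json=True (the default) every chunk is a dict carrying 'content' and 'done'
    if not chunk['done']:
        print(chunk['content'], end='', flush=True)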

57
llmengine/client/llmclient Executable file

@ -0,0 +1,57 @@
#!/usr/bin/env python
from traceback import format_exc
import asyncio
import codecs
import json
import argparse
from appPublic.streamhttpclient import liner, StreamHttpClient
from appPublic.log import MyLogger
def user_message(prompt, fn=None):
x = ''
if fn:
x = user_file(fn)
return prompt + x
def user_file(fn):
with codecs.open(fn, 'r', 'utf-8') as f:
return f.read()
async def main():
parser = argparse.ArgumentParser(prog='devops')
parser.add_argument('-f', '--file')
parser.add_argument('-p', '--prompt')
parser.add_argument('-s', '--sys_prompt')
parser.add_argument('-m', '--model')
parser.add_argument('url')
args = parser.parse_args()
d = {
'model': args.model,
'stream': True,
'prompt': user_message(args.prompt, args.file),
'sys_prompt':args.sys_prompt
}
hc = StreamHttpClient()
headers = {
'Content-Type': 'application/json'
}
i = 0
buffer = ''
reco = hc('POST', args.url, headers=headers, data=json.dumps(d))
async for chunk in liner(reco):
chunk = chunk[6:]
if chunk != '[DONE]':
try:
f = json.loads(chunk)
except Exception as e:
print(f'****{chunk=} error {e} {format_exc()}')
continue
if not f['choices'][0]['finish_reason']:
print(f['choices'][0]['delta']['content'], end='', flush=True)
else:
pass
print('\n\n')
if __name__ == '__main__':
MyLogger('null', levelname='error', logfile='/dev/null')
asyncio.new_event_loop().run_until_complete(main())

59
llmengine/devstral.py Normal file

@ -0,0 +1,59 @@
# for model mistralai/Devstral-Small-2505
from appPublic.worker import awaitify
from appPublic.log import debug
from ahserver.serverenv import get_serverenv
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from mistral_common.protocol.instruct.messages import (
SystemMessage, UserMessage, AssistantMessage
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
import torch
from llmengine.base_chat_llm import BaseChatLLM, T2TChatLLM, llm_register
class DevstralLLM(T2TChatLLM):
def __init__(self, model_id):
tekken_file = f'{model_id}/tekken.json'
self.tokenizer = MistralTokenizer.from_file(tekken_file)
self.model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype="auto",
device_map="auto"
)
self.model_id = model_id
def _build_assistant_message(self, prompt):
return AssistantMessage(content=prompt)
def _build_sys_message(self, prompt):
return SystemMessage(content=prompt)
def _build_user_message(self, prompt, **kw):
return UserMessage(content=prompt)
def get_streamer(self):
return TextIteratorStreamer(
tokenizer=self.tokenizer,
skip_prompt=True
)
def build_kwargs(self, inputs, streamer):
generate_kwargs = dict(
**inputs,
streamer=streamer,
max_new_tokens=32768,
do_sample=True
)
return generate_kwargs
def _messages2inputs(self, messages):
tokenized = self.tokenizer.encode_chat_completion(
ChatCompletionRequest(messages=messages)
)
return {
'input_ids': torch.tensor([tokenized.tokens])
}
llm_register('mistralai/Devstral', DevstralLLM)

95
llmengine/embedding.py Normal file

@ -0,0 +1,95 @@
from traceback import format_exc
import os
import sys
import argparse
from llmengine.qwen3embedding import *
from llmengine.base_embedding import get_llm_class
from appPublic.registerfunction import RegisterFunction
from appPublic.worker import awaitify
from appPublic.log import debug, exception
from ahserver.serverenv import ServerEnv
from ahserver.globalEnv import stream_response
from ahserver.webapp import webserver
from aiohttp_session import get_session
helptext = """embeddings api:
path: /v1/embeddings
headers: {
"Content-Type": "application/json"
}
data: {
"input": "this is a test"
}
or {
"input":[
"this is first sentence",
"this is second setence"
]
}
The response is a JSON object:
{
"object": "list",
"data": [
{
"object": "embedding",
"index": 0,
"embedding": [0.0123, -0.0456, ...]
}
],
"model": "text-embedding-3-small",
"usage": {
"prompt_tokens": 0,
"total_tokens": 0
}
}
"""
def init():
rf = RegisterFunction()
rf.register('embeddings', embeddings)
rf.register('docs', docs)
async def docs(request, params_kw, *params, **kw):
return helptext
async def embeddings(request, params_kw, *params, **kw):
debug(f'{params_kw.input=}')
se = ServerEnv()
engine = se.engine
f = awaitify(engine.embeddings)
input = params_kw.input
if input is None:
e = Exception('input is None')
raise e
if isinstance(input, str):
input = [input]
arr = await f(input)
debug(f'{arr=}, {type(arr)=}')
return arr
def main():
parser = argparse.ArgumentParser(prog="Embedding")
parser.add_argument('-w', '--workdir')
parser.add_argument('-p', '--port')
parser.add_argument('model_path')
args = parser.parse_args()
Klass = get_llm_class(args.model_path)
if Klass is None:
e = Exception(f'{args.model_path} has no mapping to a model class')
exception(f'{e}, {format_exc()}')
raise e
se = ServerEnv()
se.engine = Klass(args.model_path)
se.engine.use_mps_if_prosible()
workdir = args.workdir or os.getcwd()
port = args.port
debug(f'{args=}')
webserver(init, workdir, port)
if __name__ == '__main__':
main()
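A request sketch for the embeddings endpoint described in the helptext above (not part of this commit), assuming the service was started on port 9998 as in the test scripts:

import json
import urllib.request

req = urllib.request.Request(
    'http://127.0.0.1:9998/v1/embeddings',
    data=json.dumps({'input': 'this is a test'}).encode('utf-8'),
    headers={'Content-Type': 'application/json'})
with urllib.request.urlopen(req) as resp:
    body = json.loads(resp.read())
    print(len(body['data'][0]['embedding']))   # embedding dimension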

44
llmengine/gemma3_it.py Normal file

@ -0,0 +1,44 @@
#!/share/vllm-0.8.5/bin/python
# pip install accelerate
import threading
from time import time
from appPublic.worker import awaitify
from ahserver.serverenv import get_serverenv
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
from PIL import Image
import requests
import torch
from llmengine.base_chat_llm import MMChatLLM, llm_register
class Gemma3LLM(MMChatLLM):
def __init__(self, model_id):
self.model = Gemma3ForConditionalGeneration.from_pretrained(
model_id, device_map="auto"
).eval()
self.processor = AutoProcessor.from_pretrained(model_id)
self.tokenizer = self.processor.tokenizer
self.messages = []
self.model_id = model_id
llm_register("gemma-3", Gemma3LLM)
if __name__ == '__main__':
gemma3 = Gemma3LLM('/share/models/google/gemma-3-4b-it')
session = {}
while True:
print('input prompt')
p = input()
if p:
if p == 'q':
break;
print('input image path')
imgpath=input()
for d in gemma3.stream_generate(session, p, image_path=imgpath):
# stream_generate yields SSE-formatted strings: 'data: {...}\n'
print(d, end='', flush=True)

53
llmengine/medgemma3_it.py Normal file

@ -0,0 +1,53 @@
# pip install accelerate
import time
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import requests
import torch
from llmengine.base_chat_llm import MMChatLLM, llm_register
model_id = "google/medgemma-4b-it"
class MedgemmaLLM(MMChatLLM):
def __init__(self, model_id):
self.model = AutoModelForImageTextToText.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map="auto",
)
self.processor = AutoProcessor.from_pretrained(model_id)
self.tokenizer = self.processor.tokenizer
self.model_id = model_id
def _messages2inputs(self, messages):
inputs = self.processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt"
).to(self.model.device, dtype=torch.bfloat16)
return inputs
llm_register("google/medgemma", MedgemmaLLM)
if __name__ == '__main__':
med = MedgemmaLLM('/share/models/google/medgemma-4b-it')
session = {}
while True:
print(f'chat with {med.model_id}')
print('input prompt')
p = input()
if p:
if p == 'q':
break;
print('input image path')
imgpath=input()
for d in med.stream_generate(session, p, image_path=imgpath):
# stream_generate yields SSE-formatted strings: 'data: {...}\n'
print(d, end='', flush=True)


@ -0,0 +1,49 @@
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from base_triplets import BaseTriplets, llm_register
class MrebelTriplets(BaseTriplets):
def __init__(self, model_id):
if 'mrebel' not in model_id:
raise Exception(f'{model_id} is not a mrebel model')
# Load model and tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_id,
src_lang="zh_XX", tgt_lang="tp_XX")
# Here we set English ("en_XX") as source language.
# To change the source language swap the first token of the
# input for your desired language or change to supported language.
# For catalan ("ca_XX") or greek ("el_EL")
# (not included in mBART pretraining) you need a workaround:
# tokenizer._src_lang = "ca_XX"
# tokenizer.cur_lang_code_id = tokenizer.convert_tokens_to_ids("ca_XX")
# tokenizer.set_src_lang_special_tokens("ca_XX")
self.model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
self.model_id = model_id
self.model_name = model_id.split('/')[-1]
self.gen_kwargs = {
"max_length": 256,
"length_penalty": 0,
"num_beams": 3,
"num_return_sequences": 3,
"forced_bos_token_id": None,
}
def build_inputs(self, text):
# Tokenizer text
return self.tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt')
def gen_preds(self, inputs):
# Generate
generated_tokens = self.model.generate(
inputs['input_ids'].to(self.model.device),
attention_mask=inputs["attention_mask"].to(self.model.device),
decoder_start_token_id = self.tokenizer.convert_tokens_to_ids("tp_XX"),
**self.gen_kwargs
)
# Extract text
decoded_preds = self.tokenizer.batch_decode(generated_tokens,
skip_special_tokens=False)
return decoded_preds
llm_register('mrebel', MrebelTriplets)

68
llmengine/qwen3.py Normal file

@ -0,0 +1,68 @@
#!/share/vllm-0.8.5/bin/python
# pip install accelerate
from appPublic.worker import awaitify
from appPublic.log import debug
from ahserver.serverenv import get_serverenv
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch
from llmengine.base_chat_llm import BaseChatLLM, T2TChatLLM, llm_register
class Qwen3LLM(T2TChatLLM):
def __init__(self, model_id):
self.tokenizer = AutoTokenizer.from_pretrained(model_id)
self.model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype="auto",
device_map="auto"
)
if torch.backends.mps.is_available():
device = torch.device("mps")
self.model = self.model.to(device)
self.model_id = model_id
def build_kwargs(self, inputs, streamer):
generate_kwargs = dict(
**inputs,
streamer=streamer,
max_new_tokens=32768,
do_sample=True,
eos_token_id=self.tokenizer.eos_token_id
)
return generate_kwargs
def _messages2inputs(self, messages):
debug(f'{messages=}')
text = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
enable_thinking=True
)
return self.tokenizer([text], return_tensors="pt").to(self.model.device)
llm_register("Qwen/Qwen3", Qwen3LLM)
if __name__ == '__main__':
import sys
model_path = sys.argv[1]
q3 = Qwen3LLM(model_path)
session = {}
while True:
print('input prompt')
p = input()
if p:
if p == 'q':
break;
for d in q3.stream_generate(session, p):
print(d)
"""
if not d['done']:
print(d['text'], end='', flush=True)
else:
x = {k:v for k,v in d.items() if k != 'text'}
print(f'\n{x}\n')
"""


@ -0,0 +1,16 @@
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from llmengine.base_reranker import BaseReranker, llm_register
class Qwen3Reranker(BaseReranker):
def __init__(self, model_id, max_length=8096):
if 'Qwen3-Reranker' not in model_id:
e = Exception(f'{model_id} is not a Qwen3-Reranker')
raise e
self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
self.model = AutoModelForCausalLM.from_pretrained(model_id).eval()
self.model_id = model_id
self.model_name = model_id.split('/')[-1]
self.max_length = 8192
llm_register('Qwen3-Reranker', Qwen3Reranker)


@ -0,0 +1,22 @@
# Requires transformers>=4.51.0
# Requires sentence-transformers>=2.7.0
from sentence_transformers import SentenceTransformer
from llmengine.base_embedding import BaseEmbedding, llm_register
class Qwen3Embedding(BaseEmbedding):
def __init__(self, model_id, max_length=8096):
# Load the model
self.model = SentenceTransformer(model_id)
# We recommend enabling flash_attention_2 for better acceleration and memory saving,
# together with setting `padding_side` to "left":
# model = SentenceTransformer(
# "Qwen/Qwen3-Embedding-0.6B",
# model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
# tokenizer_kwargs={"padding_side": "left"},
# )
self.max_length = max_length
self.model_id = model_id
self.model_name = model_id.split('/')[-1]
llm_register('Qwen3-Embedding', Qwen3Embedding)

106
llmengine/rerank.py Normal file

@ -0,0 +1,106 @@
from traceback import format_exc
import os
import sys
import argparse
from llmengine.qwen3_reranker import *
from llmengine.bge_reranker import *
from llmengine.base_reranker import get_llm_class
from appPublic.registerfunction import RegisterFunction
from appPublic.worker import awaitify
from appPublic.log import debug, exception
from ahserver.serverenv import ServerEnv
from ahserver.webapp import webserver
helptext = """rerank api:
path: /v1/rerank
headers: {
"Content-Type": "application/json"
}
data:
{
"model": "rerank-001",
"query": "什么是量子计算?",
"documents": [
"量子计算是一种使用量子比特进行计算的方式。",
"古典计算机使用的是二进制位。",
"天气预报依赖于统计模型。",
"量子计算与物理学密切相关。"
],
"top_n": 2
}
The response is a JSON object:
{
"data": [
{
"index": 0,
"relevance_score": 0.95
},
{
"index": 3,
"relevance_score": 0.89
}
],
"object": "rerank.result",
"model": "rerank-001",
"usage": {
"prompt_tokens": 0,
"total_tokens": 0
}
}
"""
def init():
rf = RegisterFunction()
rf.register('rerank', rerank)
rf.register('docs', docs)
async def docs(request, params_kw, *params, **kw):
return helptext
async def rerank(request, params_kw, *params, **kw):
debug(f'{params_kw.query=}, {params_kw.documents=}, {params_kw.top_n=}')
se = ServerEnv()
engine = se.engine
f = awaitify(engine.rerank)
query = params_kw.query
if query is None:
e = Exception(f'query is None')
raise e
documents = params_kw.documents
if documents is None:
e = Exception(f'documents is None')
raise e
if isinstance(documents, str):
documents = [documents]
top_n = params_kw.top_n
if top_n is None:
top_n = 5
arr = await f(query, documents, top_n)
debug(f'{arr=}, {type(arr)=}')
return arr
def main():
parser = argparse.ArgumentParser(prog="Rerank")
parser.add_argument('-w', '--workdir')
parser.add_argument('-p', '--port')
parser.add_argument('model_path')
args = parser.parse_args()
Klass = get_llm_class(args.model_path)
if Klass is None:
e = Exception(f'{args.model_path} has no mapping to a model class')
exception(f'{e}, {format_exc()}')
raise e
se = ServerEnv()
se.engine = Klass(args.model_path)
se.engine.use_mps_if_prosible()
workdir = args.workdir or os.getcwd()
port = args.port
debug(f'{args=}')
webserver(init, workdir, port)
if __name__ == '__main__':
main()
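A request sketch for the rerank endpoint described in the helptext above (not part of this commit); the port is hypothetical and should match whatever -p value the service was started with:

import json
import urllib.request

payload = {'query': 'what is quantum computing?',
           'documents': ['quantum computers use qubits',
                         'the weather forecast relies on statistical models'],
           'top_n': 1}
req = urllib.request.Request(
    'http://127.0.0.1:9997/v1/rerank',   # hypothetical port
    data=json.dumps(payload).encode('utf-8'),
    headers={'Content-Type': 'application/json'})
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())['data'])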

62
llmengine/server.py Normal file

@ -0,0 +1,62 @@
from traceback import format_exc
import os
import sys
import argparse
from llmengine.base_chat_llm import BaseChatLLM, get_llm_class
from llmengine.gemma3_it import Gemma3LLM
from llmengine.medgemma3_it import MedgemmaLLM
from llmengine.qwen3 import Qwen3LLM
from appPublic.registerfunction import RegisterFunction
from appPublic.log import debug, exception
from ahserver.serverenv import ServerEnv
from ahserver.globalEnv import stream_response
from ahserver.webapp import webserver
from aiohttp_session import get_session
def init():
rf = RegisterFunction()
rf.register('chat_completions', chat_completions)
async def chat_completions(request, params_kw, *params, **kw):
async def gor():
se = ServerEnv()
engine = se.engine
session = await get_session(request)
kwargs = {
}
# NOTE: 'fs' (a file-storage helper used to resolve uploaded file paths) is not defined or imported in this module and must be provided elsewhere
if params_kw.image_path:
kwargs['image_path'] = fs.reapPath(params_kw.image_path)
if params_kw.video_path:
kwargs['video_path'] = fs.reapPath(params_kw.video_path)
if params_kw.audio_path:
kwargs['audio_path'] = fs.reapPath(params_kw.audio_path)
async for d in engine.async_stream_generate(session, params_kw.prompt, **kwargs):
debug(f'{d=}')
yield d
return await stream_response(request, gor)
def main():
parser = argparse.ArgumentParser(prog="Sage")
parser.add_argument('-w', '--workdir')
parser.add_argument('-p', '--port')
parser.add_argument('model_path')
args = parser.parse_args()
Klass = get_llm_class(args.model_path)
if Klass is None:
e = Exception(f'{args.model_path} has no mapping to a model class')
exception(f'{e}, {format_exc()}')
raise e
se = ServerEnv()
se.engine = Klass(args.model_path)
se.engine.use_mps_if_prosible()
workdir = args.workdir or os.getcwd()
port = args.port
webserver(init, workdir, port)
if __name__ == '__main__':
main()
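A streaming-consumption sketch for /v1/chat/completions (not part of this commit), using only the standard library; it assumes the port 9995 from the test conf and mirrors what llmengine/client/llmclient does with StreamHttpClient:

import json
import urllib.request

req = urllib.request.Request(
    'http://127.0.0.1:9995/v1/chat/completions',
    data=json.dumps({'prompt': 'who are you'}).encode('utf-8'),
    headers={'Content-Type': 'application/json'})
with urllib.request.urlopen(req) as resp:
    for raw in resp:   # one SSE line per chunk: b'data: {...}\n'
        line = raw.decode('utf-8').strip()
        if not line.startswith('data: ') or line == 'data: [DONE]':
            continue
        chunk = json.loads(line[6:])
        if not chunk['choices'][0]['finish_reason']:
            print(chunk['choices'][0]['delta']['content'], end='', flush=True)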

24
pyproject.toml Normal file

@ -0,0 +1,24 @@
[project]
name="llmengine"
version = "0.0.1"
description = "Your project description"
authors = [{ name = "yu moqing", email = "yumoqing@gmail.com" }]
readme = "README.md"
requires-python = ">=3.8"
license = {text = "MIT"}
dependencies = [
"torch",
"transformers",
"sentence-transformers>=2.7.0",
# "flash_attention_2",
"mistral-common",
"accelerate"
]
[project.optional-dependencies]
dev = ["pytest", "black", "mypy"]
[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

11
t Normal file

@ -0,0 +1,11 @@
select x.*, 'folder' as rtype,
case when y.id is null then 1
else 0 end as is_leaf
from
(select * from folder
where parentid is null) x left join
(select unique a.* from
folder a left join folder b
on a.id = b.parentid
where b.id is not NULL) y
on x.id = y.id

3
test/chat/Qwen3-0.6B Executable file

@ -0,0 +1,3 @@
#!/bin/bash
~/models/tsfm.env/bin/python -m llmengine.server -p 9999 ~/models/Qwen/Qwen3-0.6B

3
test/chat/Qwen3-Embedding-0.6B Executable file

@ -0,0 +1,3 @@
#!/bin/bash
~/models/tsfm.env/bin/python -m llmengine.embedding -w ~/models/tsfm -p 9998 ~/models/Qwen/Qwen3-Embedding-0.6B


@ -0,0 +1,51 @@
{
"filesroot":"$[workdir]$/files",
"logger":{
"name":"llmengine",
"levelname":"info",
"logfile":"$[workdir]$/logs/llmengine.log"
},
"website":{
"paths":[
["$[workdir]$/wwwroot",""]
],
"client_max_size":10000,
"host":"0.0.0.0",
"port":9995,
"coding":"utf-8",
"ssl_gg":{
"crtfile":"$[workdir]$/conf/www.bsppo.com.pem",
"keyfile":"$[workdir]$/conf/www.bsppo.com.key"
},
"indexes":[
"index.html",
"index.ui"
],
"startswiths":[
{
"leading":"/idfile",
"registerfunction":"idfile"
},{
"leading": "/v1/chat/completions",
"registerfunction": "chat_completions"
}
],
"processors":[
[".tmpl","tmpl"],
[".app","app"],
[".ui","bui"],
[".dspy","dspy"],
[".md","md"]
],
"rsakey_oops":{
"privatekey":"$[workdir]$/conf/rsa_private_key.pem",
"publickey":"$[workdir]$/conf/rsa_public_key.pem"
},
"session_max_time":3000,
"session_issue_time":2500,
"session_redis_notuse":{
"url":"redis://127.0.0.1:6379"
}
}
}


@ -0,0 +1,6 @@
{
"ymq": {
"ref_text": "\u8f7b\u91cf\u5e94\u7528\u670d\u52a1\u5668\u5907\u6848\u6761\u4ef6\uff1a\u8d2d\u4e70\u65f6\u957f\u57283\u4e2a\u6708\u53ca\u4ee5\u4e0a",
"ref_audio": "/data/ymq/py/f5tts/files/87/103/66/49/record.wav"
}
}

3627
test/chat/logs/llmengine.log Normal file

File diff suppressed because one or more lines are too long

27
test/chatllm Executable file

@ -0,0 +1,27 @@
#!/share/vllm-0.8.5/bin/python
import os
import sys
import argparse
def get_args():
parser = argparse.ArgumentParser(description="Example script using argparse")
parser.add_argument('--gpus', '-g', type=str, required=False, default='0', help='Identify GPU id, default is 0, comma split')
parser.add_argument("--stream", action="store_true", help="是否流式输出", default=True)
parser.add_argument('modelpath', type=str, help='Path to model folder')
args = parser.parse_args()
return args
def main():
args = get_args()
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
gpus = args.gpus.split(',')
cnt=len(gpus)
stream=' --stream' if args.stream else ' '
cmdline = f'/share/vllm-0.8.5/bin/python -m llmengine.chatllm --model {args.modelpath} --gpus {cnt} {stream}'
print(args, cmdline)
os.system(cmdline)
if __name__ == '__main__':
main()

0
test/ds-r1-8b Executable file


@ -0,0 +1,3 @@
#!/bin/bash
~/models/tsfm.env/bin/python -m llmengine.embedding -p 9998 ~/models/Qwen/Qwen3-Embedding-0.6B


@ -0,0 +1,50 @@
{
"filesroot":"$[workdir]$/files",
"logger":{
"name":"llmengine",
"levelname":"info",
"logfile":"$[workdir]$/logs/llmengine.log"
},
"website":{
"paths":[
["$[workdir]$/wwwroot",""]
],
"client_max_size":10000,
"host":"0.0.0.0",
"port":9995,
"coding":"utf-8",
"indexes":[
"index.html",
"index.ui"
],
"startswiths":[
{
"leading":"/idfile",
"registerfunction":"idfile"
},{
"leading": "/v1/embeddings",
"registerfunction": "embeddings"
},{
"leading": "/docs",
"registerfunction": "docs"
}
],
"processors":[
[".tmpl","tmpl"],
[".app","app"],
[".ui","bui"],
[".dspy","dspy"],
[".md","md"]
],
"rsakey_oops":{
"privatekey":"$[workdir]$/conf/rsa_private_key.pem",
"publickey":"$[workdir]$/conf/rsa_public_key.pem"
},
"session_max_time":3000,
"session_issue_time":2500,
"session_redis_notuse":{
"url":"redis://127.0.0.1:6379"
}
}
}



@ -0,0 +1,48 @@
[Unit]
Description=A Embedding Service using Qwen3-Embedding-0.6B
# After=network.target DeepSeek70B-kyyds671b-ray.service
# Requires=DeepSeek70B-kyyds671b-ray.service
StartLimitIntervalSec=60
StartLimitBurst=5
[Service]
# Core startup parameters (unchanged from the original configuration)
User=ymq
Group=ymq
WorkingDirectory=/share/ymq/run/embeddings
# Define environment variables; the startup scripts and services on all nodes must be consistent
#Environment="NCCL_SOCKET_IFNAME=enp196s0f0np0"
#ExecStartPre=/data/kyyds671b/ray_check.sh
ExecStart=/share/ymq/run/embeddings/start.sh
ExecStop=/share/ymq/run/embeddings/stop.sh
# Timeout and stop control (newly added)
# Extend startup timeout to 120 seconds
# TimeoutStartSec=120
# Wait up to 30 seconds when stopping
# TimeoutStopSec=30
# Send SIGINT first (better suited to Python programs)
# KillSignal=SIGINT
# Final forced-kill signal
# RestartKillSignal=SIGKILL
# Mixed kill mode
# KillMode=mixed
# Restart policy
# Restart=on-failure
# RestartSec=10s
# Service management (original configuration plus enhancements)
#Restart=always
#RestartSec=10 # restart interval raised from 5 seconds to 10 seconds
# append keeps appending to the file (like >>); file truncates and rewrites it (like >)
StandardOutput=append:/var/log/embeddings/embeddings.log
StandardError=append:/var/log/embeddings/error.log
SyslogIdentifier=embeddings
# 资源限制(保持可选配置)
#LimitNOFILE=65536
#LimitNPROC=65536
# GPU 支持
#Environment=CUDA_VISIBLE_DEVICES=0,1
[Install]
WantedBy=multi-user.target

3
test/embeddings/start.sh Executable file

@ -0,0 +1,3 @@
#!/bin/bash
CUDA_VISIBLE_DEVICES=7 /share/vllm-0.8.5/bin/python -m llmengine.embedding -p 9998 /d/ymq/models/Qwen/Qwen3-Embedding-0.6B

4
test/embeddings/stop.sh Executable file

@ -0,0 +1,4 @@
#!/usr/bin/bash
killname Qwen/Qwen3-Embedding

119
test/gemma-3-4b-it Executable file

@ -0,0 +1,119 @@
#!/share/vllm-0.8.5/bin/python
# pip install accelerate
import torch
from time import time
from appPublic.worker import awaitify
from ahserver.serverenv import get_serverenv
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
from PIL import Image
import requests
import torch
class Gemma3LLM:
def __init__(self, model_id):
self.model = Gemma3ForConditionalGeneration.from_pretrained(
model_id, device_map="auto"
).eval()
self.processor = AutoProcessor.from_pretrained(model_id)
self.messages = []
self.model_id = model_id
def get_session_key(self):
return self.model_id + ':messages'
async def get_session_messages(self, request):
f = get_serverenv('get_session')
session = await f(request)
key = self.get_session_key()
messages = session.get(key) or []
return messages
async def set_session_messages(self, request, messages):
f = get_serverenv('get_session')
session = await f(request)
key = self.get_session_key()
session[key] = messages
def _generate(self, request, prompt, image_path=None, sys_prompt=None):
if sys_prompt:
sys_message = self._build_sys_message(sys_prompt)
self.messages.append(sys_message)
user_message = self._build_user_message(prompt, image_path=image_path)
self.messages.append(user_message)
data = self._gen(self.messages)
self.messages.append(self._build_assistant_message(data['text']))
return data
def _build_assistant_message(self, prompt):
return {
"role":"assistant",
"content":[{"type": "text", "text": prompt}]
}
def _build_sys_message(self, prompt):
return {
"role":"system",
"content":[{"type": "text", "text": prompt}]
}
def _build_user_message(self, prompt, image_path=None):
contents = [
{
"type":"text", "text": prompt
}
]
if image_path:
contents.append({
"type": "image",
"image": image_path
})
return {
"role": "user",
"content": contents
}
def _gen(self, messages):
t1 = time()
inputs = self.processor.apply_chat_template(
messages, add_generation_prompt=True,
tokenize=True,
return_dict=True, return_tensors="pt"
).to(self.model.device, dtype=torch.bfloat16)
input_len = inputs["input_ids"].shape[-1]
with torch.inference_mode():
generation = self.model.generate(**inputs, max_new_tokens=1000, do_sample=True)
generation = generation[0][input_len:]
decoded = self.processor.decode(generation, skip_special_tokens=True)
t2 = time()
return {
"role": "assistant",
"input_tokens": input_len,
"output_token": len(generation),
"timecost": t2 - t1,
"text": decoded
}
async def generate(self, request, prompt, image_path=None, sys_prompt=None):
messages = await self.get_session_messages(request)
if sys_prompt and len(messages) == 0:
messages.append(self._build_sys_message(sys_prompt))
messages.append(self._build_user_message(prompt, image_path=image_path))
f = awaitify(self._gen)
data = await f(messages)
messages.append(self._build_assistant_message(data['text']))
await self.set_session_messages(request, messages)
return data
if __name__ == '__main__':
gemma3 = Gemma3LLM('/share/models/google/gemma-3-4b-it')
while True:
print('input prompt')
p = input()
if p:
if p == 'q':
break;
print('input image path')
imgpath=input()
t = gemma3._generate(None, p, image_path=imgpath)
print(t)

3
test/gemma3.sh Executable file

@ -0,0 +1,3 @@
#!/usr/bin/bash
CUDA_VISIBLE_DEVICES=1 /share/vllm-0.8.5/bin/python -m llmengine.gemma3_it

BIN
test/gemma3/.run.sh.swp Normal file

Binary file not shown.


@ -0,0 +1,51 @@
{
"filesroot":"$[workdir]$/files",
"logger":{
"name":"llmengine",
"levelname":"info",
"logfile":"$[workdir]$/logs/llmengine.log"
},
"website":{
"paths":[
["$[workdir]$/wwwroot",""]
],
"client_max_size":10000,
"host":"0.0.0.0",
"port":9995,
"coding":"utf-8",
"ssl_gg":{
"crtfile":"$[workdir]$/conf/www.bsppo.com.pem",
"keyfile":"$[workdir]$/conf/www.bsppo.com.key"
},
"indexes":[
"index.html",
"index.ui"
],
"startswiths":[
{
"leading":"/idfile",
"registerfunction":"idfile"
},{
"leading": "/v1/chat/completions",
"registerfunction": "chat_completions"
}
],
"processors":[
[".tmpl","tmpl"],
[".app","app"],
[".ui","bui"],
[".dspy","dspy"],
[".md","md"]
],
"rsakey_oops":{
"privatekey":"$[workdir]$/conf/rsa_private_key.pem",
"publickey":"$[workdir]$/conf/rsa_public_key.pem"
},
"session_max_time":3000,
"session_issue_time":2500,
"session_redis_notuse":{
"url":"redis://127.0.0.1:6379"
}
}
}


@ -0,0 +1,6 @@
{
"ymq": {
"ref_text": "\u8f7b\u91cf\u5e94\u7528\u670d\u52a1\u5668\u5907\u6848\u6761\u4ef6\uff1a\u8d2d\u4e70\u65f6\u957f\u57283\u4e2a\u6708\u53ca\u4ee5\u4e0a",
"ref_audio": "/data/ymq/py/f5tts/files/87/103/66/49/record.wav"
}
}


@ -0,0 +1,16 @@
[Unit]
Wants=systemd-networkd.service
[Service]
User=ymq
Group=ymq
Type=forking
WorkingDirectory=/share/ymq/run/gemma3
ExecStart=/share/ymq/run/gemma3/start.sh
ExecStop=/share/ymq/run/gemma3/stop.sh
StandardOutput=append:/var/log/gemma3/gemma3.log
StandardError=append:/var/log/gemma3/gemma3.log
SyslogIdentifier=gemma3
[Install]
WantedBy=multi-user.target

4
test/gemma3/install.sh Executable file

@ -0,0 +1,4 @@
sudo mkdir /var/log/gemma3
sudo cp gemma3.service /etc/systemd/system
sudo systemctl enable gemma3
sudo systemctl start gemma3


@ -0,0 +1,342 @@
2025-06-09 08:13:26.400[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:13:26.411[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'{\n"prompt":"who are you"\n}': ''}, args=[]
2025-06-09 08:13:26.418[llmengine][exception][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:168]Exception=client(127.0.0.1) None access /v1/chat/completions cost 0.008466005325317383, (0.000392913818359375), except=name 'stream_response' is not defined
Traceback (most recent call last):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
ret = await handler(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
ret = await processor.handle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
await self.execute(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
await self.datahandle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
x = await self.path_call(request, self.path)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
return await f(request, params_kw, *args)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 42, in chat_completions
return await stream_response(request, gor)
NameError: name 'stream_response' is not defined
Traceback (most recent call last):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
ret = await handler(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
ret = await processor.handle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
await self.execute(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
await self.datahandle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
x = await self.path_call(request, self.path)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
return await f(request, params_kw, *args)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 42, in chat_completions
return await stream_response(request, gor)
NameError: name 'stream_response' is not defined
2025-06-09 08:15:08.876[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:15:08.884[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'{\n"prompt":"who are you"\n}': ''}, args=[]
2025-06-09 08:15:08.891[llmengine][exception][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:168]Exception=client(127.0.0.1) None access /v1/chat/completions cost 0.005657672882080078, (9.679794311523438e-05), except=get_session() missing 1 required positional argument: 'request'
Traceback (most recent call last):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
ret = await handler(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
ret = await processor.handle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
await self.execute(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
await self.datahandle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
x = await self.path_call(request, self.path)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
return await f(request, params_kw, *args)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 43, in chat_completions
return await stream_response(request, gor)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/globalEnv.py", line 58, in stream_response
async for d in async_data_generator():
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 31, in gor
session = await get_session()
TypeError: get_session() missing 1 required positional argument: 'request'
Traceback (most recent call last):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
ret = await handler(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
ret = await processor.handle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
await self.execute(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
await self.datahandle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
x = await self.path_call(request, self.path)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
return await f(request, params_kw, *args)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 43, in chat_completions
return await stream_response(request, gor)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/globalEnv.py", line 58, in stream_response
async for d in async_data_generator():
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 31, in gor
session = await get_session()
TypeError: get_session() missing 1 required positional argument: 'request'
2025-06-09 08:19:30.169[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:19:30.177[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'{\n"prompt":"who are you"\n}': ''}, args=[]
2025-06-09 08:19:30.223[llmengine][exception][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:168]Exception=client(127.0.0.1) None access /v1/chat/completions cost 0.03934144973754883, (0.00010514259338378906), except='None' has no attribute 'startswith'
Traceback (most recent call last):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
ret = await handler(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
ret = await processor.handle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
await self.execute(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
await self.datahandle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
x = await self.path_call(request, self.path)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
return await f(request, params_kw, *args)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 45, in chat_completions
return await stream_response(request, gor)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/globalEnv.py", line 58, in stream_response
async for d in async_data_generator():
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 41, in gor
async for d in engine.async_stream_generate(session, params_kw.prompt, **kwargs):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 112, in async_stream_generate
for d in self._generator(session, prompt,
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 66, in _generator
for d in self._gen(messages):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 137, in _gen
inputs = self._messages2inputs(messages)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py", line 32, in _messages2inputs
text = self.tokenizer.apply_chat_template(
File "/share/vllm-0.8.5/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 1695, in apply_chat_template
rendered_chat = compiled_template.render(
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 1295, in render
self.environment.handle_exception()
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 942, in handle_exception
raise rewrite_traceback_stack(source=source)
File "<template>", line 20, in top-level template code
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 399, in call
if not __self.is_safe_callable(__obj):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 265, in is_safe_callable
getattr(obj, "unsafe_callable", False) or getattr(obj, "alters_data", False)
jinja2.exceptions.UndefinedError: 'None' has no attribute 'startswith'
2025-06-09 08:28:03.514[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:28:03.522[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'{\n"prompt":"who are you"\n}': ''}, args=[]
2025-06-09 08:28:03.526[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py:33]messages=[{'role': 'user', 'content': None}]
2025-06-09 08:28:03.579[llmengine][exception][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:168]Exception=client(127.0.0.1) None access /v1/chat/completions cost 0.05059671401977539, (0.00011324882507324219), except='None' has no attribute 'startswith'
Traceback (most recent call last):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
ret = await handler(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
ret = await processor.handle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
await self.execute(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
await self.datahandle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
x = await self.path_call(request, self.path)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
return await f(request, params_kw, *args)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 45, in chat_completions
return await stream_response(request, gor)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/globalEnv.py", line 58, in stream_response
async for d in async_data_generator():
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 41, in gor
async for d in engine.async_stream_generate(session, params_kw.prompt, **kwargs):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 112, in async_stream_generate
for d in self._generator(session, prompt,
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 66, in _generator
for d in self._gen(messages):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 137, in _gen
inputs = self._messages2inputs(messages)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py", line 34, in _messages2inputs
text = self.tokenizer.apply_chat_template(
File "/share/vllm-0.8.5/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 1695, in apply_chat_template
rendered_chat = compiled_template.render(
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 1295, in render
self.environment.handle_exception()
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 942, in handle_exception
raise rewrite_traceback_stack(source=source)
File "<template>", line 20, in top-level template code
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 399, in call
if not __self.is_safe_callable(__obj):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 265, in is_safe_callable
getattr(obj, "unsafe_callable", False) or getattr(obj, "alters_data", False)
jinja2.exceptions.UndefinedError: 'None' has no attribute 'startswith'
2025-06-09 08:31:48.954[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:31:48.961[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'{\n"prompt":"who are you"\n}': ''}, args=[]
2025-06-09 08:31:48.964[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py:29]params_kw={'{\n"prompt":"who are you"\n}': ''}, params=(), kw={}
2025-06-09 08:31:48.968[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py:33]messages=[{'role': 'user', 'content': None}]
2025-06-09 08:31:49.009[llmengine][exception][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:168]Exception=client(127.0.0.1) None access /v1/chat/completions cost 0.04324674606323242, (8.392333984375e-05), except='None' has no attribute 'startswith'
Traceback (most recent call last):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
ret = await handler(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
ret = await processor.handle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
await self.execute(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
await self.datahandle(request)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
x = await self.path_call(request, self.path)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
return await f(request, params_kw, *args)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 46, in chat_completions
return await stream_response(request, gor)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/globalEnv.py", line 58, in stream_response
async for d in async_data_generator():
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 42, in gor
async for d in engine.async_stream_generate(session, params_kw.prompt, **kwargs):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 112, in async_stream_generate
for d in self._generator(session, prompt,
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 66, in _generator
for d in self._gen(messages):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 137, in _gen
inputs = self._messages2inputs(messages)
File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py", line 34, in _messages2inputs
text = self.tokenizer.apply_chat_template(
File "/share/vllm-0.8.5/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 1695, in apply_chat_template
rendered_chat = compiled_template.render(
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 1295, in render
self.environment.handle_exception()
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 942, in handle_exception
raise rewrite_traceback_stack(source=source)
File "<template>", line 20, in top-level template code
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 399, in call
if not __self.is_safe_callable(__obj):
File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 265, in is_safe_callable
getattr(obj, "unsafe_callable", False) or getattr(obj, "alters_data", False)
jinja2.exceptions.UndefinedError: 'None' has no attribute 'startswith'
2025-06-09 08:37:22.471[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:37:22.479[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'prompt': 'who are you'}, args=[]
2025-06-09 08:37:22.483[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py:29]params_kw={'prompt': 'who are you'}, params=(), kw={}
2025-06-09 08:37:22.486[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py:33]messages=[{'role': 'user', 'content': 'who are you'}]
2025-06-09 08:48:12.725[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:48:12.735[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'prompt': 'who are you'}, args=[]
2025-06-09 08:48:12.738[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py:29]params_kw={'prompt': 'who are you'}, params=(), kw={}
2025-06-09 08:48:12.742[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py:33]messages=[{'role': 'user', 'content': 'who are you'}]
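The failing requests above all die inside the chat template with 'None' has no attribute 'startswith', and the debug lines show why: the body arrived as a single form-encoded key (params_kw={'{\n"prompt":"who are you"\n}': ''}), apparently because it was not sent with Content-Type: application/json, so params_kw.prompt was None and a message with content None reached apply_chat_template. The 08:37 and 08:48 entries show the same request succeeding once params_kw={'prompt': 'who are you'} arrives intact. The following is a minimal client sketch, standard library only, that posts the prompt as JSON with an explicit Content-Type header and prints the streamed output; the host/port and the exact chunk layout (OpenAI-style "data: {...}" lines with choices[0].delta.content) are assumptions based on these logs, not a documented client API.

# minimal_chat_client.py -- a sketch, not part of the repository.
# The port is an assumption; match whatever -p the server was started with.
import json
import urllib.request

def chat(prompt, url="http://127.0.0.1:9999/v1/chat/completions"):
    body = json.dumps({"prompt": prompt}).encode("utf-8")
    req = urllib.request.Request(
        url,
        data=body,
        # The 08:28 entries suggest that without this header the body is
        # parsed as form data and the prompt ends up as None.
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        for raw in resp:                      # iterate the streamed lines
            line = raw.decode("utf-8").strip()
            if not line.startswith("data: "):
                continue
            chunk = json.loads(line[len("data: "):])
            delta = chunk["choices"][0]["delta"].get("content", "")
            print(delta, end="", flush=True)
    print()

if __name__ == "__main__":
    chat("who are you")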

4
test/gemma3/start.sh Executable file
View File

@ -0,0 +1,4 @@
#!/usr/bin/bash
CUDA_VISIBLE_DEVICES=5 /share/vllm-0.8.5/bin/python -m llmengine.server -p 9999 /share/models/google/gemma-3-4b-it &

3
test/gemma3/stop.sh Normal file
View File

@ -0,0 +1,3 @@
#!/usr/bin/bash
/d/ymq/bin/killname gemma-3-4b-it

3
test/medgemma3.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/bash
CUDA_VISIBLE_DEVICES=0 /share/vllm-0.8.5/bin/python -m llmengine.medgemma3_it

30
test/phi4 Executable file
View File

@ -0,0 +1,30 @@
#!/share/vllm-0.8.5/bin/python
import transformers
pipeline = transformers.pipeline(
    "text-generation",
    model="/share/ymq/models/microsoft/phi-4",
    model_kwargs={"torch_dtype": "auto"},
    device_map="auto",
)
messages = [
    {"role": "system", "content": "You are a medieval knight and must provide explanations to modern people."},
]
while True:
    print('input prompt')
    p = input()
    if not p:
        continue
    if p == 'q':
        break
    messages.append({
        'role': 'user',
        'content': p
    })
    outputs = pipeline(messages, max_new_tokens=1024)
    messages = outputs[0]["generated_text"]
    print(messages[-1]['content'])

3
test/qwen3.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/bash
~/models/tsfm.env/bin/python -m llmengine.server ~/models/Qwen/Qwen3-0.6B

3
test/qwen3_embedding.sh Executable file
View File

@ -0,0 +1,3 @@
#!/bin/bash
~/models/tsfm.env/bin/python -m llmengine.embedding ~/models/Qwen/Qwen3-Embedding-0.6B

View File

@ -0,0 +1,50 @@
{
    "filesroot":"$[workdir]$/files",
    "logger":{
        "name":"llmengine",
        "levelname":"info",
        "logfile":"$[workdir]$/logs/llmengine.log"
    },
    "website":{
        "paths":[
            ["$[workdir]$/wwwroot",""]
        ],
        "client_max_size":10000,
        "host":"0.0.0.0",
        "port":9995,
        "coding":"utf-8",
        "indexes":[
            "index.html",
            "index.ui"
        ],
        "startswiths":[
            {
                "leading":"/idfile",
                "registerfunction":"idfile"
            },{
                "leading": "/v1/rerank",
                "registerfunction": "rerank"
            },{
                "leading": "/docs",
                "registerfunction": "docs"
            }
        ],
        "processors":[
            [".tmpl","tmpl"],
            [".app","app"],
            [".ui","bui"],
            [".dspy","dspy"],
            [".md","md"]
        ],
        "rsakey_oops":{
            "privatekey":"$[workdir]$/conf/rsa_private_key.pem",
            "publickey":"$[workdir]$/conf/rsa_public_key.pem"
        },
        "session_max_time":3000,
        "session_issue_time":2500,
        "session_redis_notuse":{
            "url":"redis://127.0.0.1:6379"
        }
    }
}
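This configuration (its filename is not shown in the diff) wires up the llmengine web service: static files under $[workdir]$/wwwroot, port 9995, URL prefixes in "startswiths" routed to registered functions such as rerank and docs, and suffix-based "processors". A quick standalone check like the sketch below can catch malformed JSON or a missing section before the service is restarted; the path conf/config.json and the list of required keys are illustrative assumptions, not something the server itself is known to enforce.

# check_config.py -- a convenience sketch, not part of llmengine.
import json
import sys

REQUIRED_TOP_LEVEL = ("filesroot", "logger", "website")
REQUIRED_WEBSITE = ("paths", "port", "startswiths", "processors")

def check(path="conf/config.json"):
    with open(path, encoding="utf-8") as f:
        cfg = json.load(f)                       # raises on malformed JSON
    missing = [k for k in REQUIRED_TOP_LEVEL if k not in cfg]
    missing += [f"website.{k}" for k in REQUIRED_WEBSITE
                if k not in cfg.get("website", {})]
    if missing:
        sys.exit(f"config {path} is missing: {', '.join(missing)}")
    print(f"{path}: ok, serving on port {cfg['website']['port']}")

if __name__ == "__main__":
    check(sys.argv[1] if len(sys.argv) > 1 else "conf/config.json")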

View File

@ -0,0 +1,48 @@
[Unit]
Description=A Rerank Service using Qwen3-Reranker-0.6B
# After=network.target DeepSeek70B-kyyds671b-ray.service
# Requires=DeepSeek70B-kyyds671b-ray.service
StartLimitIntervalSec=60
StartLimitBurst=5
[Service]
# Core startup parameters (kept from the original configuration)
User=ymq
Group=ymq
WorkingDirectory=/share/ymq/run/reranker
# Environment variables; startup scripts and services must be consistent across all nodes
#Environment="NCCL_SOCKET_IFNAME=enp196s0f0np0"
#ExecStartPre=/data/kyyds671b/ray_check.sh
ExecStart=/share/ymq/run/reranker/start.sh
ExecStop=/share/ymq/run/reranker/stop.sh
# Timeout and stop control (added section)
# Extend the startup timeout to 120 seconds
# TimeoutStartSec=120
# Allow 30 seconds for the service to stop
# TimeoutStopSec=30
# Send SIGINT first (better suited to Python programs)
# KillSignal=SIGINT
# Final forced-termination signal
# RestartKillSignal=SIGKILL
# Mixed kill mode
# KillMode=mixed
# Restart policy
# Restart=on-failure
# RestartSec=10s
# Service management (original configuration plus enhancements)
#Restart=always
#RestartSec=10 # restart interval raised from 5 seconds to 10 seconds
# "append:" keeps appending to the file (like >>); "file:" truncates and rewrites it (like >)
StandardOutput=append:/var/log/rerank/rerank.log
StandardError=append:/var/log/rerank/error.log
SyslogIdentifier=rerank
# Resource limits (optional)
#LimitNOFILE=65536
#LimitNPROC=65536
# GPU support
#Environment=CUDA_VISIBLE_DEVICES=0,1
[Install]
WantedBy=multi-user.target
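Before enabling a unit like this, the paths it references have to exist: the working directory with executable start.sh/stop.sh, and the /var/log/rerank directory for the append: targets (systemd creates the log files but, in typical setups, not missing parent directories). The sketch below is an optional pre-flight check along those lines; it is not part of the repository and only tests the paths named in the unit, so adjust it if they differ.

# preflight_rerank.py -- a sketch; checks only the paths the unit references.
import os
import sys

CHECKS = [
    ("/share/ymq/run/reranker/start.sh", os.X_OK, "executable"),
    ("/share/ymq/run/reranker/stop.sh", os.X_OK, "executable"),
    ("/var/log/rerank", os.W_OK, "writable"),
]

def main():
    problems = []
    for path, mode, need in CHECKS:
        if not os.path.exists(path):
            problems.append(f"missing ({need} expected): {path}")
        elif not os.access(path, mode):
            problems.append(f"not {need}: {path}")
    if problems:
        sys.exit("\n".join(problems))
    print("rerank unit prerequisites look ok")

if __name__ == "__main__":
    main()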

4
test/reranker/start.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# CUDA_VISIBLE_DEVICES=7 /share/vllm-0.8.5/bin/python -m llmengine.rerank -p 9997 /d/ymq/models/Qwen/Qwen3-Reranker-0___6B
CUDA_VISIBLE_DEVICES=7 /share/vllm-0.8.5/bin/python -m llmengine.rerank -p 9997 /share/models/BAAI/bge-reranker-v2-m3

5
test/reranker/stop.sh Executable file
View File

@ -0,0 +1,5 @@
#!/usr/bin/bash
#killname Qwen/Qwen3-Reranker
killname BAAI/bge-reranker

17
test/reranker/t.sh Executable file
View File

@ -0,0 +1,17 @@
#!/usr/bin/bash
curl http://localhost:9997/v1/rerank \
-H "Content-Type: application/json" \
-d @- <<EOF
{
"model": "rerank-001",
"query": "什么是量子计算?",
"documents": [
"量子计算是一种使用量子比特进行计算的方式。",
"古典计算机使用的是二进制位。",
"天气预报依赖于统计模型。",
"量子计算与物理学密切相关。"
],
"top_n": 5
}
EOF
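The same request can be issued from Python when shell tooling is not convenient; the sketch below mirrors t.sh exactly (endpoint, model name, payload) and simply prints whatever JSON the service returns, since the response schema is not shown here.

# rerank_client.py -- a sketch mirroring test/reranker/t.sh above.
import json
import urllib.request

PAYLOAD = {
    "model": "rerank-001",
    "query": "什么是量子计算?",
    "documents": [
        "量子计算是一种使用量子比特进行计算的方式。",
        "古典计算机使用的是二进制位。",
        "天气预报依赖于统计模型。",
        "量子计算与物理学密切相关。",
    ],
    "top_n": 5,
}

def rerank(url="http://localhost:9997/v1/rerank"):
    req = urllib.request.Request(
        url,
        data=json.dumps(PAYLOAD).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        result = json.loads(resp.read())
        print(json.dumps(result, ensure_ascii=False, indent=2))

if __name__ == "__main__":
    rerank()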