commit 3d2f799eee
    buggix
BIN  llmengine/__pycache__/base_chat_llm.cpython-313.pyc  (new file, binary file not shown)
BIN  llmengine/__pycache__/server.cpython-313.pyc  (new file, binary file not shown)
246  llmengine/base_chat_llm.py  (new file)
@@ -0,0 +1,246 @@
import threading
import asyncio
import json
import torch
from time import time
from transformers import TextIteratorStreamer
from appPublic.log import debug
from appPublic.worker import awaitify
from appPublic.uniqueID import getID

# registry that maps a model-path keyword to its engine class
model_pathMap = {
}
def llm_register(model_key, Klass):
    model_pathMap[model_key] = Klass

def get_llm_class(model_path):
    for k, klass in model_pathMap.items():
        if len(model_path.split(k)) > 1:
            return klass
    print(f'{model_pathMap=}')
    return None

class BaseChatLLM:
    def use_mps_if_prosible(self):
        if torch.backends.mps.is_available():
            device = torch.device("mps")
            self.model = self.model.to(device)

    def get_session_key(self):
        return self.model_id + ':messages'

    def _get_session_messages(self, session):
        key = self.get_session_key()
        messages = session.get(key) or []
        return messages

    def _set_session_messages(self, session, messages):
        key = self.get_session_key()
        session[key] = messages

    def get_streamer(self):
        return TextIteratorStreamer(
            tokenizer=self.tokenizer,
            skip_special_tokens=True,
            skip_prompt=True
        )

    def output_generator(self, streamer):
        # yield OpenAI-style chat.completion.chunk dicts while the model streams text
        all_txt = ''
        t1 = time()
        i = 0
        id = f'chatllm-{getID()}'
        for txt in streamer:
            if txt == '':
                continue
            if i == 0:
                t2 = time()
            i += 1
            all_txt += txt
            yield {
                "id": id,
                "object": "chat.completion.chunk",
                "created": time(),
                "model": self.model_id,
                "choices": [
                    {
                        "index": 0,
                        "delta": {
                            "content": txt
                        },
                        "logprobs": None,
                        "finish_reason": None
                    }
                ]
            }
        t3 = time()
        t = all_txt
        unk = self.tokenizer(t, return_tensors="pt")
        output_tokens = len(unk["input_ids"][0])
        yield {
            "id": id,
            "object": "chat.completion.chunk",
            "created": time(),
            "model": self.model_id,
            "response_time": t2 - t1,
            "finish_time": t3 - t1,
            "output_token": output_tokens,
            "choices": [
                {
                    "index": 0,
                    "delta": {
                        "content": ""
                    },
                    "logprobs": None,
                    "finish_reason": "stop"
                }
            ]
        }

    def _generator(self, session, prompt, image_path, video_path, audio_path, sys_prompt):
        messages = self._get_session_messages(session)
        if sys_prompt:
            messages.append(self._build_sys_message(sys_prompt))
        messages.append(self._build_user_message(prompt, image_path=image_path))
        # debug(f'{messages=}')
        all_txt = ''
        for d in self._gen(messages):
            if d['choices'][0]['finish_reason'] == 'stop':
                messages.append(self._build_assistant_message(all_txt))
            else:
                all_txt += d['choices'][0]['delta']['content']
            yield d
        self._set_session_messages(session, messages)

    async def _async_generator(self, session, prompt, image_path, video_path, audio_path, sys_prompt):
        for d in self._generator(session, prompt, image_path, video_path, audio_path, sys_prompt):
            await asyncio.sleep(0)
            yield d

    def generate(self, session, prompt,
            image_path=None,
            video_path=None,
            audio_path=None,
            sys_prompt=None):
        for d in self._generator(session, prompt, image_path, video_path, audio_path, sys_prompt):
            if d['choices'][0]['finish_reason'] == 'stop':
                return d

    def stream_generate(self, session, prompt,
            image_path=None,
            video_path=None,
            audio_path=None,
            sys_prompt=None):
        for d in self._generator(session, prompt, image_path, video_path, audio_path, sys_prompt):
            s = f'data: {json.dumps(d)}\n'
            yield s

    async def async_generate(self, session, prompt,
            image_path=None,
            video_path=None,
            audio_path=None,
            sys_prompt=None):
        async for d in self._async_generator(session, prompt, image_path, video_path, audio_path, sys_prompt):
            await asyncio.sleep(0)
            if d['choices'][0]['finish_reason'] == 'stop':
                return d

    async def async_stream_generate(self, session, prompt,
            image_path=None,
            video_path=None,
            audio_path=None,
            sys_prompt=None):
        async for d in self._async_generator(session, prompt, image_path, video_path, audio_path, sys_prompt):
            s = f'data: {json.dumps(d)}\n'
            yield s
        yield 'data: [DONE]'

    def build_kwargs(self, inputs, streamer):
        generate_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=512,
            do_sample=True,
            eos_token_id=self.tokenizer.eos_token_id
        )
        return generate_kwargs

    def _messages2inputs(self, messages):
        return self.processor.apply_chat_template(
            messages, add_generation_prompt=True,
            tokenize=True,
            return_dict=True, return_tensors="pt"
        ).to(self.model.device, dtype=torch.bfloat16)

    def _gen(self, messages):
        inputs = self._messages2inputs(messages)
        input_len = inputs["input_ids"].shape[-1]
        streamer = self.get_streamer()
        kwargs = self.build_kwargs(inputs, streamer)
        # run generation in a worker thread so the streamer can be consumed here
        thread = threading.Thread(target=self.model.generate,
                    kwargs=kwargs)
        thread.start()
        for d in self.output_generator(streamer):
            if d['choices'][0]['finish_reason'] == 'stop':
                d['input_tokens'] = input_len
            yield d

class T2TChatLLM(BaseChatLLM):
    def _build_assistant_message(self, prompt):
        return {
            "role": "assistant",
            "content": prompt
        }

    def _build_sys_message(self, prompt):
        return {
            "role": "system",
            "content": prompt
        }

    def _build_user_message(self, prompt, **kw):
        return {
            "role": "user",
            "content": prompt
        }

class MMChatLLM(BaseChatLLM):
    """ multiple modal chat LLM """
    def _build_assistant_message(self, prompt):
        return {
            "role": "assistant",
            "content": [{"type": "text", "text": prompt}]
        }

    def _build_sys_message(self, prompt):
        return {
            "role": "system",
            "content": [{"type": "text", "text": prompt}]
        }

    def _build_user_message(self, prompt, image_path=None,
                video_path=None, audio_path=None):
        contents = [
            {
                "type": "text", "text": prompt
            }
        ]
        if image_path:
            contents.append({
                "type": "image",
                "image": image_path
            })
        if video_path:
            contents.append({
                "type": "video",
                "video": video_path
            })
        if audio_path:
            contents.append({
                "type": "audio",
                "audio": audio_path
            })
        return {
            "role": "user",
            "content": contents
        }
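A minimal usage sketch for the classes above, assuming a registered subclass such as Qwen3LLM (defined later in this commit) and a hypothetical local model path; the session is just a plain dict that holds the chat history:

# Sketch only: the model path is an assumption, not part of this diff.
from llmengine.qwen3 import Qwen3LLM

llm = Qwen3LLM('/path/to/Qwen/Qwen3-0.6B')   # hypothetical local checkpoint
session = {}                                 # per-conversation message history lives here
# stream_generate yields OpenAI-style "data: {...}" SSE lines
for line in llm.stream_generate(session, 'Hello, who are you?'):
    print(line, end='')
# a second call on the same session continues the conversation, because
# _generator writes the messages back through _set_session_messages
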
46  llmengine/base_embedding.py  (new file)
@@ -0,0 +1,46 @@
import torch


model_pathMap = {
}
def llm_register(model_key, Klass):
    global model_pathMap
    model_pathMap[model_key] = Klass

def get_llm_class(model_path):
    for k, klass in model_pathMap.items():
        if len(model_path.split(k)) > 1:
            return klass
    print(f'{model_pathMap=}')
    return None

class BaseEmbedding:

    def use_mps_if_prosible(self):
        if torch.backends.mps.is_available():
            device = torch.device("mps")
            self.model = self.model.to(device)

    def embeddings(self, input):
        # encode one string or a list of strings and wrap the result
        # in an OpenAI-style embeddings response
        es = self.model.encode(input)
        data = []
        for i, e in enumerate(es):
            d = {
                "object": "embedding",
                "index": i,
                "embedding": e.tolist()
            }
            data.append(d)
        return {
            "object": "list",
            "data": data,
            "model": self.model_name,
            "usage": {
                "prompt_tokens": 0,
                "total_tokens": 0
            }
        }

    def similarity(self, qvector, docvectors):
        s = self.model.similarity([qvector], docvectors)
        return s[0]
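A usage sketch for BaseEmbedding through the Qwen3Embedding subclass defined later in this commit; the model path is an assumption:

# Sketch only: the checkpoint path is hypothetical.
from llmengine.qwen3embedding import Qwen3Embedding

emb = Qwen3Embedding('/path/to/Qwen/Qwen3-Embedding-0.6B')
res = emb.embeddings(["first sentence", "second sentence"])
# res follows the OpenAI embeddings shape built above
print(res["model"], len(res["data"]), len(res["data"][0]["embedding"]))
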
84  llmengine/base_reranker.py  (new file)
@@ -0,0 +1,84 @@
import torch


model_pathMap = {
}
def llm_register(model_key, Klass):
    model_pathMap[model_key] = Klass

def get_llm_class(model_path):
    for k, klass in model_pathMap.items():
        if len(model_path.split(k)) > 1:
            return klass
    print(f'{model_pathMap=}')
    return None

class BaseReranker:
    def __init__(self, model_id, **kw):
        self.model_id = model_id

    def use_mps_if_prosible(self):
        if torch.backends.mps.is_available():
            device = torch.device("mps")
            self.model = self.model.to(device)

    def process_inputs(self, pairs):
        inputs = self.tokenizer(
            pairs, padding=False, truncation='longest_first',
            return_attention_mask=False, max_length=self.max_length
        )
        inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=self.max_length)
        for key in inputs:
            inputs[key] = inputs[key].to(self.model.device)
        return inputs

    def build_sys_prompt(self, sys_prompt):
        return f"<|im_start|>system\n{sys_prompt}\n<|im_end|>"

    def build_user_prompt(self, query, doc, instruct=''):
        return f'<|im_start|>user\n<Instruct>: {instruct}\n<Query>:{query}\n<Document>:\n{doc}<|im_end|>'

    def build_assistant_prompt(self):
        return "<|im_start|>assistant\n<think>\n\n</think>\n\n"

    def compute_logits(self, inputs, **kwargs):
        batch_scores = self.model(**inputs).logits[:, -1, :]
        # true_vector = batch_scores[:, token_true_id]
        # false_vector = batch_scores[:, token_false_id]
        # batch_scores = torch.stack([false_vector, true_vector], dim=1)
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        scores = batch_scores[:, 1].exp().tolist()
        return scores

    def build_pairs(self, query, docs, sys_prompt="", task=""):
        sys_str = self.build_sys_prompt(sys_prompt)
        ass_str = self.build_assistant_prompt()
        pairs = [sys_str + '\n' + self.build_user_prompt(query, doc, instruct=task) + '\n' + ass_str for doc in docs]
        return pairs

    def rerank(self, query, docs, top_n, sys_prompt="", task=""):
        pairs = self.build_pairs(query, docs, sys_prompt=sys_prompt, task=task)
        with torch.no_grad():
            inputs = self.process_inputs(pairs)
            scores = self.compute_logits(inputs)
        data = []
        for i, s in enumerate(scores):
            d = {
                'index': i,
                'relevance_score': s
            }
            data.append(d)
        data = sorted(data,
                key=lambda x: x["relevance_score"],
                reverse=True)
        if len(data) > top_n:
            data = data[:top_n]
        ret = {
            "data": data,
            "object": "rerank.result",
            "model": self.model_name,
            "usage": {
                "prompt_tokens": 0,
                "total_tokens": 0
            }
        }
        return ret
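A usage sketch for the reranker interface through the Qwen3Reranker subclass defined later in this commit; the model path, query, and documents are made-up examples:

# Sketch only: the checkpoint path and the sample texts are hypothetical.
from llmengine.qwen3_reranker import Qwen3Reranker

rr = Qwen3Reranker('/path/to/Qwen/Qwen3-Reranker-0.6B')
docs = [
    "Quantum computing uses qubits to perform computation.",
    "Weather forecasts rely on statistical models.",
]
out = rr.rerank("What is quantum computing?", docs, top_n=1)
for item in out["data"]:
    print(item["index"], item["relevance_score"])
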
80  llmengine/base_triplets.py  (new file)
@@ -0,0 +1,80 @@
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


model_pathMap = {
}
def llm_register(model_key, Klass):
    model_pathMap[model_key] = Klass

def get_llm_class(model_path):
    for k, klass in model_pathMap.items():
        if len(model_path.split(k)) > 1:
            return klass
    print(f'{model_pathMap=}')
    return None

class BaseTriplets:

    def extract_triplets_typed(self, text):
        # parse the linearized "<triplet> head <head_type> tail <tail_type> relation ..." output
        triplets = []
        relation = ''
        text = text.strip()
        current = 'x'
        subject, relation, object_, object_type, subject_type = '', '', '', '', ''

        for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").replace("tp_XX", "").replace("__en__", "").split():
            if token == "<triplet>" or token == "<relation>":
                current = 't'
                if relation != '':
                    triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(), 'tail': object_.strip(), 'tail_type': object_type})
                    relation = ''
                subject = ''
            elif token.startswith("<") and token.endswith(">"):
                if current == 't' or current == 'o':
                    current = 's'
                    if relation != '':
                        triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(), 'tail': object_.strip(), 'tail_type': object_type})
                    object_ = ''
                    subject_type = token[1:-1]
                else:
                    current = 'o'
                    object_type = token[1:-1]
                    relation = ''
            else:
                if current == 't':
                    subject += ' ' + token
                elif current == 's':
                    object_ += ' ' + token
                elif current == 'o':
                    relation += ' ' + token
        if subject != '' and relation != '' and object_ != '' and object_type != '' and subject_type != '':
            triplets.append({'head': subject.strip(), 'head_type': subject_type, 'type': relation.strip(), 'tail': object_.strip(), 'tail_type': object_type})
        return triplets

    def build_inputs(self, text):
        # Tokenize text
        return self.tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors='pt')

    def gen_preds(self, inputs):
        # Generate
        generated_tokens = self.model.generate(
            inputs['input_ids'].to(self.model.device),
            attention_mask=inputs["attention_mask"].to(self.model.device),
            decoder_start_token_id=self.tokenizer.convert_tokens_to_ids("tp_XX"),
            **self.gen_kwargs
        )
        # Extract text
        decoded_preds = self.tokenizer.batch_decode(generated_tokens,
                skip_special_tokens=False)
        return decoded_preds

    def extract_triplets(self, text):
        inputs = self.build_inputs(text)
        preds = self.gen_preds(inputs)

        # Extract triplets
        triplets = []
        for idx, sentence in enumerate(preds):
            x = self.extract_triplets_typed(sentence)
            triplets += x
        print(triplets)
        return triplets
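A toy example of the linearized mREBEL-style string the parser above expects; the sentence and entity types are made up, and the class is instantiated bare only to call the pure parsing method:

# Sketch only: the decoded string is a fabricated example of the expected format.
from llmengine.base_triplets import BaseTriplets

decoded = ("<s><triplet> Albert Einstein <per> Ulm <loc> place of birth "
           "<per> Germany <loc> country of citizenship</s>")
print(BaseTriplets().extract_triplets_typed(decoded))
# -> [{'head': 'Albert Einstein', 'head_type': 'per', 'type': 'place of birth',
#      'tail': 'Ulm', 'tail_type': 'loc'}, ...]
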
31  llmengine/bge_reranker.py  (new file)
@@ -0,0 +1,31 @@
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from llmengine.base_reranker import BaseReranker, llm_register

class BgeReranker(BaseReranker):
    def __init__(self, model_id, max_length=8096):
        if 'bge-reranker' not in model_id:
            e = Exception(f'{model_id} is not a bge-reranker')
            raise e
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id)
        model.eval()
        self.model = model
        self.model_id = model_id
        self.model_name = model_id.split('/')[-1]

    def build_pairs(self, query, docs, **kw):
        return [[query, doc] for doc in docs]

    def process_inputs(self, pairs):
        inputs = self.tokenizer(pairs, padding=True,
                truncation=True, return_tensors='pt', max_length=512)
        return inputs

    def compute_logits(self, inputs):
        scores = self.model(**inputs,
                return_dict=True).logits.view(-1, ).float()
        scores = [s.item() for s in scores]
        return scores

llm_register('bge-reranker', BgeReranker)
212  llmengine/chatllm.py  (new file)
@@ -0,0 +1,212 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from time import time
import torch
from threading import Thread

def is_chat_model(model_name: str, tokenizer) -> bool:
    chat_keywords = ["chat", "chatml", "phi", "llama-chat", "mistral-instruct"]
    if any(k in model_name.lower() for k in chat_keywords):
        return True
    if tokenizer and hasattr(tokenizer, "additional_special_tokens"):
        if any(tag in tokenizer.additional_special_tokens for tag in ["<|user|>", "<|system|>", "<|assistant|>"]):
            return True
    return False

def build_chat_prompt(messages):
    prompt = ""
    for message in messages:
        role = message["role"]
        content = message["content"]
        prompt += f"<|{role}|>\n{content}\n"
    prompt += "<|assistant|>\n"  # generation starts here
    return prompt

class CountingStreamer(TextIteratorStreamer):
    def __init__(self, tokenizer, skip_prompt=True, **kw):
        super().__init__(tokenizer, skip_prompt=skip_prompt, **kw)
        self.token_count = 0

    def __next__(self):
        # count the generated tokens (approximately) as text chunks are consumed
        text = super().__next__()
        self.token_count += len(self.tokenizer(text)["input_ids"])
        return text

class TransformersChatEngine:
    def __init__(self, model_name: str, device: str = None, fp16: bool = True,
            output_json=True,
            gpus: int = 1):
        """
        Generic LLM loader with control over GPU count and device selection.
        :param model_name: model name or path
        :param device: device spec such as "cuda:0"; chosen automatically by default
        :param fp16: whether to use fp16 precision (on GPUs that support it)
        :param gpus: number of GPUs; 1 means single-GPU, >1 enables multi-GPU inference (device_map='auto')
        """
        self.output_json = output_json
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.is_multi_gpu = gpus > 1 and torch.cuda.device_count() >= gpus

        print(f"✅ Using device: {self.device}, GPUs: {gpus}, Multi-GPU: {self.is_multi_gpu}")

        # load the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

        # load the model
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if fp16 and "cuda" in self.device else torch.float32,
            device_map="auto" if self.is_multi_gpu else None
        )

        if not self.is_multi_gpu:
            self.model.to(self.device)

        self.model.eval()
        self.is_chat = is_chat_model(model_name, self.tokenizer)
        if self.is_chat:
            self.messages = []

        print(f'{self.model.generation_config=}')

    def set_system_prompt(self, prompt):
        if self.is_chat:
            self.messages = [{
                'role': 'system',
                'content': prompt
            }]

    def set_assistant_prompt(self, prompt):
        if self.is_chat:
            self.messages.append({
                'role': 'assistant',
                'content': prompt
            })

    def set_user_prompt(self, prompt):
        if self.is_chat:
            self.messages.append({
                'role': 'user',
                'content': prompt
            })
            return build_chat_prompt(self.messages)
        return prompt

    def generate(self, prompt: str):
        t1 = time()
        prompt = self.set_user_prompt(prompt)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        output_ids = self.model.generate(
            **inputs,
            max_new_tokens=128,
            generation_config=self.model.generation_config
        )
        output_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        t2 = time()
        text = output_text[len(prompt):] if output_text.startswith(prompt) else output_text
        self.set_assistant_prompt(text)
        if not self.output_json:
            return text
        input_tokens = inputs["input_ids"].shape[1]
        output_tokens = len(self.tokenizer(text, return_tensors="pt")["input_ids"][0])
        return {
            'content': text,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'finish_time': t2 - t1,
            'response_time': t2 - t1
        }

    def stream_generate(self, prompt: str):
        t1 = time()
        prompt = self.set_user_prompt(prompt)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        input_tokens = inputs["input_ids"].shape[1]
        streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)

        generation_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=16000,
            generation_config=self.model.generation_config
        )

        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()
        first = True
        all_txt = ''
        for new_text in streamer:
            all_txt += new_text
            if first:
                t2 = time()
                first = False
            if not self.output_json:
                yield new_text
            else:
                yield {
                    'content': new_text,
                    'done': False
                }
        output_tokens = len(self.tokenizer(all_txt, return_tensors="pt")["input_ids"][0])
        self.set_assistant_prompt(all_txt)
        t3 = time()
        if self.output_json:
            yield {
                'done': True,
                'content': '',
                'response_time': t2 - t1,
                'finish_time': t3 - t1,
                'input_tokens': input_tokens,
                'output_tokens': output_tokens
            }

if __name__ == '__main__':
    import os
    import sys
    import argparse

    def parse_args():
        parser = argparse.ArgumentParser(description="Transformers Chat CLI")
        parser.add_argument("--model", type=str, required=True, help="model path or Hugging Face name")
        parser.add_argument("--gpus", type=int, default=1, help="number of GPUs to use")
        parser.add_argument("--stream", action="store_true", help="enable streaming output")
        return parser.parse_args()

    def print_content(outd):
        if isinstance(outd, dict):
            print(outd['content'], end="", flush=True)
        else:
            print(outd, end="", flush=True)

    def print_info(outd):
        if isinstance(outd, dict):
            if outd['done']:
                print(f"response_time={outd['response_time']}, finish_time={outd['finish_time']}, input_tokens={outd['input_tokens']}, output_tokens={outd['output_tokens']}\n")
            else:
                print('\n')

    def generate(engine, stream):
        while True:
            print('prompt("q" to exit):')
            p = input()
            if p == 'q':
                break
            if not p:
                continue
            if stream:
                for outd in engine.stream_generate(p):
                    print_content(outd)
                print('\n')
                print_info(outd)
            else:
                outd = engine.generate(p)
                print_content(outd)
                print('\n')
                print_info(outd)

    def main():
        args = parse_args()
        print(f'{args=}')
        engine = TransformersChatEngine(
            model_name=args.model,
            gpus=args.gpus
        )
        generate(engine, args.stream)

    main()
57  llmengine/client/llmclient  (new executable file)
@@ -0,0 +1,57 @@
#!/usr/bin/env python
from traceback import format_exc
import asyncio
import codecs
import json
import argparse
from appPublic.streamhttpclient import liner, StreamHttpClient
from appPublic.log import MyLogger

def user_message(prompt, fn=None):
    x = ''
    if fn:
        x = user_file(fn)
    return prompt + x

def user_file(fn):
    with codecs.open(fn, 'r', 'utf-8') as f:
        return f.read()

async def main():
    parser = argparse.ArgumentParser(prog='devops')
    parser.add_argument('-f', '--file')
    parser.add_argument('-p', '--prompt')
    parser.add_argument('-s', '--sys_prompt')
    parser.add_argument('-m', '--model')
    parser.add_argument('url')
    args = parser.parse_args()
    d = {
        'model': args.model,
        'stream': True,
        'prompt': user_message(args.prompt, args.file),
        'sys_prompt': args.sys_prompt
    }
    hc = StreamHttpClient()
    headers = {
        'Content-Type': 'application/json'
    }
    i = 0
    buffer = ''
    reco = hc('POST', args.url, headers=headers, data=json.dumps(d))
    async for chunk in liner(reco):
        chunk = chunk[6:]  # strip the leading "data: " of each SSE line
        if chunk != '[DONE]':
            try:
                f = json.loads(chunk)
            except Exception as e:
                print(f'****{chunk=} error {e} {format_exc()}')
                continue
            if not f['choices'][0]['finish_reason']:
                print(f['choices'][0]['delta']['content'], end='', flush=True)
            else:
                pass
    print('\n\n')

if __name__ == '__main__':
    MyLogger('null', levelname='error', logfile='/dev/null')
    asyncio.new_event_loop().run_until_complete(main())
59  llmengine/devstral.py  (new file)
@@ -0,0 +1,59 @@
# for model mistralai/Devstral-Small-2505
from appPublic.worker import awaitify
from appPublic.log import debug
from ahserver.serverenv import get_serverenv
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from mistral_common.protocol.instruct.messages import (
    SystemMessage, UserMessage, AssistantMessage
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

import torch
from llmengine.base_chat_llm import BaseChatLLM, T2TChatLLM, llm_register

class DevstralLLM(T2TChatLLM):
    def __init__(self, model_id):
        tekken_file = f'{model_id}/tekken.json'
        self.tokenizer = MistralTokenizer.from_file(tekken_file)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype="auto",
            device_map="auto"
        )
        self.model_id = model_id

    def _build_assistant_message(self, prompt):
        return AssistantMessage(content=prompt)

    def _build_sys_message(self, prompt):
        return SystemMessage(content=prompt)

    def _build_user_message(self, prompt, **kw):
        return UserMessage(content=prompt)

    def get_streamer(self):
        return TextIteratorStreamer(
            tokenizer=self.tokenizer,
            skip_prompt=True
        )

    def build_kwargs(self, inputs, streamer):
        generate_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=32768,
            do_sample=True
        )
        return generate_kwargs

    def _messages2inputs(self, messages):
        tokenized = self.tokenizer.encode_chat_completion(
            ChatCompletionRequest(messages=messages)
        )
        return {
            'input_ids': torch.tensor([tokenized.tokens])
        }

llm_register('mistralai/Devstral', DevstralLLM)
95  llmengine/embedding.py  (new file)
@@ -0,0 +1,95 @@
from traceback import format_exc
import os
import sys
import argparse
from llmengine.qwen3embedding import *
from llmengine.base_embedding import get_llm_class

from appPublic.registerfunction import RegisterFunction
from appPublic.worker import awaitify
from appPublic.log import debug, exception
from ahserver.serverenv import ServerEnv
from ahserver.globalEnv import stream_response
from ahserver.webapp import webserver

from aiohttp_session import get_session

helptext = """embeddings api:
path: /v1/embeddings
headers: {
    "Content-Type": "application/json"
}
data: {
    "input": "this is a test"
}
or {
    "input": [
        "this is first sentence",
        "this is second sentence"
    ]
}

response is a json
{
    "object": "list",
    "data": [
        {
            "object": "embedding",
            "index": 0,
            "embedding": [0.0123, -0.0456, ...]
        }
    ],
    "model": "text-embedding-3-small",
    "usage": {
        "prompt_tokens": 0,
        "total_tokens": 0
    }
}
"""


def init():
    rf = RegisterFunction()
    rf.register('embeddings', embeddings)
    rf.register('docs', docs)

async def docs(request, params_kw, *params, **kw):
    return helptext

async def embeddings(request, params_kw, *params, **kw):
    debug(f'{params_kw.input=}')
    se = ServerEnv()
    engine = se.engine
    f = awaitify(engine.embeddings)
    input = params_kw.input
    if input is None:
        e = Exception('input is None')
        exception(f'{e}')
        raise e
    if isinstance(input, str):
        input = [input]
    arr = await f(input)
    debug(f'{arr=}, {type(arr)=}')
    return arr

def main():
    parser = argparse.ArgumentParser(prog="Embedding")
    parser.add_argument('-w', '--workdir')
    parser.add_argument('-p', '--port')
    parser.add_argument('model_path')
    args = parser.parse_args()
    Klass = get_llm_class(args.model_path)
    if Klass is None:
        e = Exception(f'{args.model_path} has no mapping to a model class')
        exception(f'{e}, {format_exc()}')
        raise e
    se = ServerEnv()
    se.engine = Klass(args.model_path)
    se.engine.use_mps_if_prosible()
    workdir = args.workdir or os.getcwd()
    port = args.port
    debug(f'{args=}')
    webserver(init, workdir, port)

if __name__ == '__main__':
    main()
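A client-side sketch against the /v1/embeddings endpoint documented in the helptext above; the host and port follow the test scripts elsewhere in this commit and may differ in your deployment:

# Sketch only: host/port taken from the test scripts in this commit; adjust as needed.
import requests

resp = requests.post(
    "http://127.0.0.1:9998/v1/embeddings",
    headers={"Content-Type": "application/json"},
    json={"input": ["this is first sentence", "this is second sentence"]},
)
vectors = [item["embedding"] for item in resp.json()["data"]]
print(len(vectors), len(vectors[0]))
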
44  llmengine/gemma3_it.py  (new file)
@@ -0,0 +1,44 @@
#!/share/vllm-0.8.5/bin/python

# pip install accelerate
import threading
from time import time
from appPublic.worker import awaitify
from ahserver.serverenv import get_serverenv
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
from PIL import Image
import requests
import torch
from llmengine.base_chat_llm import MMChatLLM, llm_register

class Gemma3LLM(MMChatLLM):
    def __init__(self, model_id):
        self.model = Gemma3ForConditionalGeneration.from_pretrained(
            model_id, device_map="auto"
        ).eval()
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.tokenizer = self.processor.tokenizer
        self.messages = []
        self.model_id = model_id

llm_register("gemma-3", Gemma3LLM)

if __name__ == '__main__':
    gemma3 = Gemma3LLM('/share/models/google/gemma-3-4b-it')
    session = {}
    while True:
        print('input prompt')
        p = input()
        if p:
            if p == 'q':
                break
            print('input image path')
            imgpath = input()
            for d in gemma3.stream_generate(session, p, image_path=imgpath):
                if not d['done']:
                    print(d['text'], end='', flush=True)
                else:
                    x = {k: v for k, v in d.items() if k != 'text'}
                    print(f'\n{x}\n')
53  llmengine/medgemma3_it.py  (new file)
@@ -0,0 +1,53 @@
# pip install accelerate
import time
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import requests
import torch
from llmengine.base_chat_llm import MMChatLLM, llm_register

model_id = "google/medgemma-4b-it"

class MedgemmaLLM(MMChatLLM):
    def __init__(self, model_id):
        self.model = AutoModelForImageTextToText.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.tokenizer = self.processor.tokenizer
        self.model_id = model_id

    def _messages2inputs(self, messages):
        inputs = self.processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        ).to(self.model.device, dtype=torch.bfloat16)
        return inputs

llm_register("google/medgemma", MedgemmaLLM)

if __name__ == '__main__':
    med = MedgemmaLLM('/share/models/google/medgemma-4b-it')
    session = {}
    while True:
        print(f'chat with {med.model_id}')
        print('input prompt')
        p = input()
        if p:
            if p == 'q':
                break
            print('input image path')
            imgpath = input()
            for d in med.stream_generate(session, p, image_path=imgpath):
                if not d['done']:
                    print(d['text'], end='', flush=True)
                else:
                    x = {k: v for k, v in d.items() if k != 'text'}
                    print(f'\n{x}\n')
49  llmengine/mrebel_triplet.py  (new file)
@@ -0,0 +1,49 @@

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from llmengine.base_triplets import BaseTriplets, llm_register

class MrebelTriplets(BaseTriplets):
    def __init__(self, model_id):
        if 'mrebel' not in model_id:
            raise Exception(f'{model_id} is not a mrebel model')

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_id,
                src_lang="zh_XX", tgt_lang="tp_XX")
        # Here we set English ("en_XX") as source language.
        # To change the source language swap the first token of the
        # input for your desired language or change to supported language.
        # For catalan ("ca_XX") or greek ("el_EL")
        # (not included in mBART pretraining) you need a workaround:
        # tokenizer._src_lang = "ca_XX"
        # tokenizer.cur_lang_code_id = tokenizer.convert_tokens_to_ids("ca_XX")
        # tokenizer.set_src_lang_special_tokens("ca_XX")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
        self.model_id = model_id
        self.model_name = model_id.split('/')[-1]
        self.gen_kwargs = {
            "max_length": 256,
            "length_penalty": 0,
            "num_beams": 3,
            "num_return_sequences": 3,
            "forced_bos_token_id": None,
        }

    def build_inputs(self, text):
        # Tokenize text
        return self.tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors='pt')

    def gen_preds(self, inputs):
        # Generate
        generated_tokens = self.model.generate(
            inputs['input_ids'].to(self.model.device),
            attention_mask=inputs["attention_mask"].to(self.model.device),
            decoder_start_token_id=self.tokenizer.convert_tokens_to_ids("tp_XX"),
            **self.gen_kwargs
        )
        # Extract text
        decoded_preds = self.tokenizer.batch_decode(generated_tokens,
                skip_special_tokens=False)
        return decoded_preds

llm_register('mrebel', MrebelTriplets)
68  llmengine/qwen3.py  (new file)
@@ -0,0 +1,68 @@
#!/share/vllm-0.8.5/bin/python

# pip install accelerate
from appPublic.worker import awaitify
from appPublic.log import debug
from ahserver.serverenv import get_serverenv
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch
from llmengine.base_chat_llm import BaseChatLLM, T2TChatLLM, llm_register

class Qwen3LLM(T2TChatLLM):
    def __init__(self, model_id):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype="auto",
            device_map="auto"
        )
        if torch.backends.mps.is_available():
            device = torch.device("mps")
            self.model = self.model.to(device)
        self.model_id = model_id

    def build_kwargs(self, inputs, streamer):
        generate_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=32768,
            do_sample=True,
            eos_token_id=self.tokenizer.eos_token_id
        )
        return generate_kwargs

    def _messages2inputs(self, messages):
        debug(f'{messages=}')
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True
        )
        return self.tokenizer([text], return_tensors="pt").to(self.model.device)

llm_register("Qwen/Qwen3", Qwen3LLM)

if __name__ == '__main__':
    import sys
    model_path = sys.argv[1]
    q3 = Qwen3LLM(model_path)
    session = {}
    while True:
        print('input prompt')
        p = input()
        if p:
            if p == 'q':
                break
            for d in q3.stream_generate(session, p):
                print(d)
            """
            if not d['done']:
                print(d['text'], end='', flush=True)
            else:
                x = {k:v for k,v in d.items() if k != 'text'}
                print(f'\n{x}\n')
            """
16  llmengine/qwen3_reranker.py  (new file)
@@ -0,0 +1,16 @@
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from llmengine.base_reranker import BaseReranker, llm_register

class Qwen3Reranker(BaseReranker):
    def __init__(self, model_id, max_length=8096):
        if 'Qwen3-Reranker' not in model_id:
            e = Exception(f'{model_id} is not a Qwen3-Reranker')
            raise e
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
        self.model = AutoModelForCausalLM.from_pretrained(model_id).eval()
        self.model_id = model_id
        self.model_name = model_id.split('/')[-1]
        self.max_length = 8192

llm_register('Qwen3-Reranker', Qwen3Reranker)
22  llmengine/qwen3embedding.py  (new file)
@@ -0,0 +1,22 @@
# Requires transformers>=4.51.0
# Requires sentence-transformers>=2.7.0

from sentence_transformers import SentenceTransformer
from llmengine.base_embedding import BaseEmbedding, llm_register

class Qwen3Embedding(BaseEmbedding):
    def __init__(self, model_id, max_length=8096):
        # Load the model
        self.model = SentenceTransformer(model_id)
        # We recommend enabling flash_attention_2 for better acceleration and memory saving,
        # together with setting `padding_side` to "left":
        # model = SentenceTransformer(
        #     "Qwen/Qwen3-Embedding-0.6B",
        #     model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
        #     tokenizer_kwargs={"padding_side": "left"},
        # )
        self.max_length = max_length
        self.model_id = model_id
        self.model_name = model_id.split('/')[-1]

llm_register('Qwen3-Embedding', Qwen3Embedding)
106  llmengine/rerank.py  (new file)
@@ -0,0 +1,106 @@
from traceback import format_exc
import os
import sys
import argparse
from llmengine.qwen3_reranker import *
from llmengine.bge_reranker import *
from llmengine.base_reranker import get_llm_class

from appPublic.registerfunction import RegisterFunction
from appPublic.worker import awaitify
from appPublic.log import debug, exception
from ahserver.serverenv import ServerEnv
from ahserver.webapp import webserver

helptext = """rerank api:
path: /v1/rerank
headers: {
    "Content-Type": "application/json"
}
data:
{
    "model": "rerank-001",
    "query": "什么是量子计算?",
    "documents": [
        "量子计算是一种使用量子比特进行计算的方式。",
        "古典计算机使用的是二进制位。",
        "天气预报依赖于统计模型。",
        "量子计算与物理学密切相关。"
    ],
    "top_n": 2
}

response is a json
{
    "data": [
        {
            "index": 0,
            "relevance_score": 0.95
        },
        {
            "index": 3,
            "relevance_score": 0.89
        }
    ],
    "object": "rerank.result",
    "model": "rerank-001",
    "usage": {
        "prompt_tokens": 0,
        "total_tokens": 0
    }
}
"""


def init():
    rf = RegisterFunction()
    rf.register('rerank', rerank)
    rf.register('docs', docs)

async def docs(request, params_kw, *params, **kw):
    return helptext

async def rerank(request, params_kw, *params, **kw):
    debug(f'{params_kw.query=}, {params_kw.documents=}, {params_kw.top_n=}')
    se = ServerEnv()
    engine = se.engine
    f = awaitify(engine.rerank)
    query = params_kw.query
    if query is None:
        e = Exception('query is None')
        raise e
    documents = params_kw.documents
    if documents is None:
        e = Exception('documents is None')
        raise e
    if isinstance(documents, str):
        documents = [documents]
    top_n = params_kw.top_n
    if top_n is None:
        top_n = 5
    arr = await f(query, documents, top_n)
    debug(f'{arr=}, {type(arr)=}')
    return arr

def main():
    parser = argparse.ArgumentParser(prog="Rerank")
    parser.add_argument('-w', '--workdir')
    parser.add_argument('-p', '--port')
    parser.add_argument('model_path')
    args = parser.parse_args()
    Klass = get_llm_class(args.model_path)
    if Klass is None:
        e = Exception(f'{args.model_path} has no mapping to a model class')
        exception(f'{e}, {format_exc()}')
        raise e
    se = ServerEnv()
    se.engine = Klass(args.model_path)
    se.engine.use_mps_if_prosible()
    workdir = args.workdir or os.getcwd()
    port = args.port
    debug(f'{args=}')
    webserver(init, workdir, port)

if __name__ == '__main__':
    main()
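A client-side sketch against the /v1/rerank endpoint; the port and route registration are assumptions (no rerank server config is part of this commit), and the payload mirrors the helptext above:

# Sketch only: host/port are assumptions; the payload follows the documented shape.
import requests

resp = requests.post(
    "http://127.0.0.1:9997/v1/rerank",
    headers={"Content-Type": "application/json"},
    json={
        "query": "What is quantum computing?",
        "documents": [
            "Quantum computing uses qubits to perform computation.",
            "Classical computers use binary bits.",
        ],
        "top_n": 1,
    },
)
print(resp.json()["data"])
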
62  llmengine/server.py  (new file)
@@ -0,0 +1,62 @@
from traceback import format_exc
import os
import sys
import argparse

from llmengine.base_chat_llm import BaseChatLLM, get_llm_class
from llmengine.gemma3_it import Gemma3LLM
from llmengine.medgemma3_it import MedgemmaLLM
from llmengine.qwen3 import Qwen3LLM

from appPublic.registerfunction import RegisterFunction
from appPublic.log import debug, exception
from ahserver.serverenv import ServerEnv
from ahserver.globalEnv import stream_response
from ahserver.webapp import webserver

from aiohttp_session import get_session

def init():
    rf = RegisterFunction()
    rf.register('chat_completions', chat_completions)

async def chat_completions(request, params_kw, *params, **kw):
    async def gor():
        se = ServerEnv()
        engine = se.engine
        session = await get_session(request)
        kwargs = {
        }
        if params_kw.image_path:
            kwargs['image_path'] = fs.reapPath(params_kw.image_path)
        if params_kw.video_path:
            kwargs['video_path'] = fs.reapPath(params_kw.video_path)
        if params_kw.audio_path:
            kwargs['audio_path'] = fs.reapPath(params_kw.audio_path)
        async for d in engine.async_stream_generate(session, params_kw.prompt, **kwargs):
            debug(f'{d=}')
            yield d

    return await stream_response(request, gor)

def main():
    parser = argparse.ArgumentParser(prog="Sage")
    parser.add_argument('-w', '--workdir')
    parser.add_argument('-p', '--port')
    parser.add_argument('model_path')
    args = parser.parse_args()
    Klass = get_llm_class(args.model_path)
    if Klass is None:
        e = Exception(f'{args.model_path} has no mapping to a model class')
        exception(f'{e}, {format_exc()}')
        raise e
    se = ServerEnv()
    se.engine = Klass(args.model_path)
    se.engine.use_mps_if_prosible()
    workdir = args.workdir or os.getcwd()
    port = args.port
    webserver(init, workdir, port)

if __name__ == '__main__':
    main()
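A client-side sketch for the streaming /v1/chat/completions route; port 9999 matches test/chat/Qwen3-0.6B and the route mapping comes from test/chat/conf/config.json in this commit, and the SSE parsing mirrors llmengine/client/llmclient:

# Sketch only: adjust host/port to your deployment.
import json
import requests

resp = requests.post(
    "http://127.0.0.1:9999/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    json={"prompt": "Hello", "stream": True},
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue
    payload = line[6:]
    if payload == "[DONE]":
        break
    chunk = json.loads(payload)
    if not chunk["choices"][0]["finish_reason"]:
        print(chunk["choices"][0]["delta"]["content"], end="", flush=True)
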
24  pyproject.toml  (new file)
@@ -0,0 +1,24 @@
[project]
name = "llmengine"
version = "0.0.1"
description = "Your project description"
authors = [{ name = "yu moqing", email = "yumoqing@gmail.com" }]
readme = "README.md"
requires-python = ">=3.8"
license = {text = "MIT"}
dependencies = [
    "torch",
    "transformers",
    "sentence-transformers>=2.7.0",
    # "flash_attention_2",
    "mistral-common",
    "accelerate"
]

[project.optional-dependencies]
dev = ["pytest", "black", "mypy"]

[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"
11  t  (new file)
@@ -0,0 +1,11 @@
select x.*, 'folder' as rtype,
       case when y.id is null then 1
            else 0 end as is_leaf
from
  (select * from folder
   where parentid is null) x left join
  (select unique a.* from
   folder a left join folder b
   on a.id = b.parentid
   where b.id is not NULL) y
  on x.id = y.id
3  test/chat/Qwen3-0.6B  (new executable file)
@@ -0,0 +1,3 @@
#!/bin/bash

~/models/tsfm.env/bin/python -m llmengine.server -p 9999 ~/models/Qwen/Qwen3-0.6B
3  test/chat/Qwen3-Embedding-0.6B  (new executable file)
@@ -0,0 +1,3 @@
#!/bin/bash

~/models/tsfm.env/bin/python -m llmengine.embedding -w ~/models/tsfm -p 9998 ~/models/Qwen/Qwen3-Embedding-0.6B
51  test/chat/conf/config.json  (new file)
@@ -0,0 +1,51 @@
{
    "filesroot": "$[workdir]$/files",
    "logger": {
        "name": "llmengine",
        "levelname": "info",
        "logfile": "$[workdir]$/logs/llmengine.log"
    },
    "website": {
        "paths": [
            ["$[workdir]$/wwwroot", ""]
        ],
        "client_max_size": 10000,
        "host": "0.0.0.0",
        "port": 9995,
        "coding": "utf-8",
        "ssl_gg": {
            "crtfile": "$[workdir]$/conf/www.bsppo.com.pem",
            "keyfile": "$[workdir]$/conf/www.bsppo.com.key"
        },
        "indexes": [
            "index.html",
            "index.ui"
        ],
        "startswiths": [
            {
                "leading": "/idfile",
                "registerfunction": "idfile"
            }, {
                "leading": "/v1/chat/completions",
                "registerfunction": "chat_completions"
            }
        ],
        "processors": [
            [".tmpl", "tmpl"],
            [".app", "app"],
            [".ui", "bui"],
            [".dspy", "dspy"],
            [".md", "md"]
        ],
        "rsakey_oops": {
            "privatekey": "$[workdir]$/conf/rsa_private_key.pem",
            "publickey": "$[workdir]$/conf/rsa_public_key.pem"
        },
        "session_max_time": 3000,
        "session_issue_time": 2500,
        "session_redis_notuse": {
            "url": "redis://127.0.0.1:6379"
        }
    }
}
6  test/chat/conf/speakers.json  (new file)
@@ -0,0 +1,6 @@
{
    "ymq": {
        "ref_text": "\u8f7b\u91cf\u5e94\u7528\u670d\u52a1\u5668\u5907\u6848\u6761\u4ef6\uff1a\u8d2d\u4e70\u65f6\u957f\u57283\u4e2a\u6708\u53ca\u4ee5\u4e0a",
        "ref_audio": "/data/ymq/py/f5tts/files/87/103/66/49/record.wav"
    }
}
3627  test/chat/logs/llmengine.log  (new file; file diff suppressed because one or more lines are too long)
27  test/chatllm  (new executable file)
@@ -0,0 +1,27 @@
#!/share/vllm-0.8.5/bin/python
import os
import sys
import argparse

def get_args():
    parser = argparse.ArgumentParser(description="Example script using argparse")
    parser.add_argument('--gpus', '-g', type=str, required=False, default='0', help='Identify GPU id, default is 0, comma split')
    parser.add_argument("--stream", action="store_true", help="enable streaming output", default=True)
    parser.add_argument('modelpath', type=str, help='Path to model folder')
    args = parser.parse_args()
    return args

def main():
    args = get_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    gpus = args.gpus.split(',')
    cnt = len(gpus)
    stream = ' --stream' if args.stream else ' '
    cmdline = f'/share/vllm-0.8.5/bin/python -m llmengine.chatllm --model {args.modelpath} --gpus {cnt} {stream}'
    print(args, cmdline)
    os.system(cmdline)

if __name__ == '__main__':
    main()
0  test/ds-r1-8b  (new executable file, empty)
3  test/embeddings/Qwen3-Embedding-0.6B  (new executable file)
@@ -0,0 +1,3 @@
#!/bin/bash

~/models/tsfm.env/bin/python -m llmengine.embedding -p 9998 ~/models/Qwen/Qwen3-Embedding-0.6B
50  test/embeddings/conf/config.json  (new file)
@@ -0,0 +1,50 @@
{
    "filesroot": "$[workdir]$/files",
    "logger": {
        "name": "llmengine",
        "levelname": "info",
        "logfile": "$[workdir]$/logs/llmengine.log"
    },
    "website": {
        "paths": [
            ["$[workdir]$/wwwroot", ""]
        ],
        "client_max_size": 10000,
        "host": "0.0.0.0",
        "port": 9995,
        "coding": "utf-8",
        "indexes": [
            "index.html",
            "index.ui"
        ],
        "startswiths": [
            {
                "leading": "/idfile",
                "registerfunction": "idfile"
            }, {
                "leading": "/v1/embeddings",
                "registerfunction": "embeddings"
            }, {
                "leading": "/docs",
                "registerfunction": "docs"
            }
        ],
        "processors": [
            [".tmpl", "tmpl"],
            [".app", "app"],
            [".ui", "bui"],
            [".dspy", "dspy"],
            [".md", "md"]
        ],
        "rsakey_oops": {
            "privatekey": "$[workdir]$/conf/rsa_private_key.pem",
            "publickey": "$[workdir]$/conf/rsa_public_key.pem"
        },
        "session_max_time": 3000,
        "session_issue_time": 2500,
        "session_redis_notuse": {
            "url": "redis://127.0.0.1:6379"
        }
    }
}
0 test/embeddings/logs/llmengine.log Normal file
48 test/embeddings/qwen3-embedding.service Normal file
@@ -0,0 +1,48 @@
[Unit]
Description=An Embedding Service using Qwen3-Embedding-0.6B
# After=network.target DeepSeek70B-kyyds671b-ray.service
# Requires=DeepSeek70B-kyyds671b-ray.service
StartLimitIntervalSec=60
StartLimitBurst=5

[Service]
# Core startup parameters (original configuration kept)
User=ymq
Group=ymq
WorkingDirectory=/share/ymq/run/embeddings
# Define environment variables; startup scripts and services must be consistent across all nodes
#Environment="NCCL_SOCKET_IFNAME=enp196s0f0np0"
#ExecStartPre=/data/kyyds671b/ray_check.sh
ExecStart=/share/ymq/run/embeddings/start.sh
ExecStop=/share/ymq/run/embeddings/stop.sh

# Timeout and stop control (newly added)
# Extend the startup timeout to 120 seconds
# TimeoutStartSec=120
# Wait 30 seconds on stop
# TimeoutStopSec=30
# Send SIGINT first (better suited to Python programs)
# KillSignal=SIGINT
# Final forced-kill signal
# RestartKillSignal=SIGKILL
# Mixed kill mode
# KillMode=mixed
# Restart policy
# Restart=on-failure
# RestartSec=10s
# Service management (original configuration plus enhancements)
#Restart=always
#RestartSec=10  # restart interval changed from 5 to 10 seconds
# append keeps appending (like >>); file rewrites from scratch (like >)
StandardOutput=append:/var/log/embeddings/embeddings.log
StandardError=append:/var/log/embeddings/error.log
SyslogIdentifier=embeddings
# Resource limits (optional, kept as-is)
#LimitNOFILE=65536
#LimitNPROC=65536
# GPU support
#Environment=CUDA_VISIBLE_DEVICES=0,1

[Install]
WantedBy=multi-user.target
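A hedged install sketch for this unit, mirroring the steps in test/gemma3/install.sh further below; the log directory follows the StandardOutput/StandardError paths above:

sudo mkdir -p /var/log/embeddings
sudo cp qwen3-embedding.service /etc/systemd/system
sudo systemctl daemon-reload
sudo systemctl enable --now qwen3-embedding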
3 test/embeddings/start.sh Executable file
@@ -0,0 +1,3 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=7 /share/vllm-0.8.5/bin/python -m llmengine.embedding -p 9998 /d/ymq/models/Qwen/Qwen3-Embedding-0.6B
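A minimal smoke test against the /v1/embeddings route served by the process started above; the OpenAI-style request body is an assumption about llmengine.embedding's schema, and the port matches the -p 9998 flag in start.sh:

curl http://localhost:9998/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen3-Embedding-0.6B", "input": ["hello world"]}'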
4 test/embeddings/stop.sh Executable file
@@ -0,0 +1,4 @@
#!/usr/bin/bash

killname Qwen/Qwen3-Embedding
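killname is a local helper, not a standard tool; assuming it matches running processes by command line, a rough stand-in would be:

pkill -f 'Qwen/Qwen3-Embedding'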
119 test/gemma-3-4b-it Executable file
@@ -0,0 +1,119 @@
#!/share/vllm-0.8.5/bin/python

# pip install accelerate
import torch
from time import time
from appPublic.worker import awaitify
from ahserver.serverenv import get_serverenv
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
from PIL import Image
import requests

class Gemma3LLM:
    def __init__(self, model_id):
        # load Gemma 3 in eval mode, spread across the available devices
        self.model = Gemma3ForConditionalGeneration.from_pretrained(
            model_id, device_map="auto"
        ).eval()
        self.processor = AutoProcessor.from_pretrained(model_id)
        # conversation history kept on the instance for the CLI test below
        self.messages = []
        self.model_id = model_id

    def get_session_key(self):
        return self.model_id + ':messages'

    async def get_session_messages(self, request):
        f = get_serverenv('get_session')
        session = await f(request)
        key = self.get_session_key()
        messages = session.get(key) or []
        return messages

    async def set_session_messages(self, request, messages):
        f = get_serverenv('get_session')
        session = await f(request)
        key = self.get_session_key()
        session[key] = messages

    def _generate(self, request, prompt, image_path=None, sys_prompt=None):
        if sys_prompt:
            sys_message = self._build_sys_message(sys_prompt)
            self.messages.append(sys_message)
        user_message = self._build_user_message(prompt, image_path=image_path)
        self.messages.append(user_message)
        data = self._gen(self.messages)
        self.messages.append(self._build_assistant_message(data['text']))
        return data

    def _build_assistant_message(self, prompt):
        return {
            "role":"assistant",
            "content":[{"type": "text", "text": prompt}]
        }

    def _build_sys_message(self, prompt):
        return {
            "role":"system",
            "content":[{"type": "text", "text": prompt}]
        }

    def _build_user_message(self, prompt, image_path=None):
        contents = [
            {
                "type":"text", "text": prompt
            }
        ]
        if image_path:
            contents.append({
                "type": "image",
                "image": image_path
            })

        return {
            "role": "user",
            "content": contents
        }

    def _gen(self, messages):
        t1 = time()
        inputs = self.processor.apply_chat_template(
            messages, add_generation_prompt=True,
            tokenize=True,
            return_dict=True, return_tensors="pt"
        ).to(self.model.device, dtype=torch.bfloat16)
        input_len = inputs["input_ids"].shape[-1]
        with torch.inference_mode():
            generation = self.model.generate(**inputs, max_new_tokens=1000, do_sample=True)
            generation = generation[0][input_len:]
        decoded = self.processor.decode(generation, skip_special_tokens=True)
        t2 = time()
        return {
            "role": "assistant",
            "input_tokens": input_len,
            "output_token": len(generation),
            "timecost": t2 - t1,
            "text": decoded
        }

    async def generate(self, request, prompt, image_path=None, sys_prompt=None):
        messages = await self.get_session_messages(request)
        if sys_prompt and len(messages) == 0:
            messages.append(self._build_sys_message(sys_prompt))
        messages.append(self._build_user_message(prompt, image_path=image_path))
        f = awaitify(self._gen)
        data = await f(messages)
        messages.append(self._build_assistant_message(data['text']))
        await self.set_session_messages(request, messages)
        return data

if __name__ == '__main__':
    gemma3 = Gemma3LLM('/share/models/google/gemma-3-4b-it')
    while True:
        print('input prompt')
        p = input()
        if p:
            if p == 'q':
                break
            print('input image path')
            imgpath = input()
            t = gemma3._generate(None, p, image_path=imgpath or None)
            print(t)
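A hypothetical interactive run of the script above on a single GPU; the model path is the one hard-coded in __main__, and entering q at the prompt quits:

CUDA_VISIBLE_DEVICES=0 /share/vllm-0.8.5/bin/python test/gemma-3-4b-it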
3 test/gemma3.sh Executable file
@@ -0,0 +1,3 @@
#!/usr/bin/bash

CUDA_VISIBLE_DEVICES=1 /share/vllm-0.8.5/bin/python -m llmengine.gemma3_it
BIN test/gemma3/.run.sh.swp Normal file
Binary file not shown.
51 test/gemma3/conf/config.json Normal file
@@ -0,0 +1,51 @@
{
	"filesroot":"$[workdir]$/files",
	"logger":{
		"name":"llmengine",
		"levelname":"info",
		"logfile":"$[workdir]$/logs/llmengine.log"
	},
	"website":{
		"paths":[
			["$[workdir]$/wwwroot",""]
		],
		"client_max_size":10000,
		"host":"0.0.0.0",
		"port":9995,
		"coding":"utf-8",
		"ssl_gg":{
			"crtfile":"$[workdir]$/conf/www.bsppo.com.pem",
			"keyfile":"$[workdir]$/conf/www.bsppo.com.key"
		},
		"indexes":[
			"index.html",
			"index.ui"
		],
		"startswiths":[
			{
				"leading":"/idfile",
				"registerfunction":"idfile"
			},{
				"leading": "/v1/chat/completions",
				"registerfunction": "chat_completions"
			}
		],
		"processors":[
			[".tmpl","tmpl"],
			[".app","app"],
			[".ui","bui"],
			[".dspy","dspy"],
			[".md","md"]
		],
		"rsakey_oops":{
			"privatekey":"$[workdir]$/conf/rsa_private_key.pem",
			"publickey":"$[workdir]$/conf/rsa_public_key.pem"
		},
		"session_max_time":3000,
		"session_issue_time":2500,
		"session_redis_notuse":{
			"url":"redis://127.0.0.1:6379"
		}
	}
}
6 test/gemma3/conf/speakers.json Normal file
@@ -0,0 +1,6 @@
{
	"ymq": {
		"ref_text": "\u8f7b\u91cf\u5e94\u7528\u670d\u52a1\u5668\u5907\u6848\u6761\u4ef6\uff1a\u8d2d\u4e70\u65f6\u957f\u57283\u4e2a\u6708\u53ca\u4ee5\u4e0a",
		"ref_audio": "/data/ymq/py/f5tts/files/87/103/66/49/record.wav"
	}
}
16 test/gemma3/gemma3.service Normal file
@@ -0,0 +1,16 @@
[Unit]
Wants=systemd-networkd.service

[Service]
User=ymq
Group=ymq
Type=forking
WorkingDirectory=/share/ymq/run/gemma3
ExecStart=/share/ymq/run/gemma3/start.sh
ExecStop=/share/ymq/run/gemma3/stop.sh
StandardOutput=append:/var/log/gemma3/gemma3.log
StandardError=append:/var/log/gemma3/gemma3.log
SyslogIdentifier=gemma3

[Install]
WantedBy=multi-user.target
4 test/gemma3/install.sh Executable file
@@ -0,0 +1,4 @@
sudo mkdir /var/log/gemma3
sudo cp gemma3.service /etc/systemd/system
sudo systemctl enable gemma3
sudo systemctl start gemma3
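Hedged follow-up checks after running install.sh; the log path comes from the StandardOutput setting in gemma3.service above:

systemctl status gemma3
tail -f /var/log/gemma3/gemma3.log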
342 test/gemma3/logs/llmengine.log Normal file
@@ -0,0 +1,342 @@
2025-06-09 08:13:26.400[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:13:26.411[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'{\n"prompt":"who are you"\n}': ''}, args=[]
2025-06-09 08:13:26.418[llmengine][exception][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:168]Exception=client(127.0.0.1) None access /v1/chat/completions cost 0.008466005325317383, (0.000392913818359375), except=name 'stream_response' is not defined
Traceback (most recent call last):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
    ret = await handler(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
    ret = await processor.handle(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
    await self.execute(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
    await self.datahandle(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
    x = await self.path_call(request, self.path)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
    return await f(request, params_kw, *args)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 42, in chat_completions
    return await stream_response(request, gor)
NameError: name 'stream_response' is not defined

2025-06-09 08:15:08.876[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:15:08.884[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'{\n"prompt":"who are you"\n}': ''}, args=[]
2025-06-09 08:15:08.891[llmengine][exception][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:168]Exception=client(127.0.0.1) None access /v1/chat/completions cost 0.005657672882080078, (9.679794311523438e-05), except=get_session() missing 1 required positional argument: 'request'
Traceback (most recent call last):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
    ret = await handler(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
    ret = await processor.handle(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
    await self.execute(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
    await self.datahandle(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
    x = await self.path_call(request, self.path)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
    return await f(request, params_kw, *args)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 43, in chat_completions
    return await stream_response(request, gor)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/globalEnv.py", line 58, in stream_response
    async for d in async_data_generator():
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 31, in gor
    session = await get_session()
TypeError: get_session() missing 1 required positional argument: 'request'

2025-06-09 08:19:30.169[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:19:30.177[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'{\n"prompt":"who are you"\n}': ''}, args=[]
2025-06-09 08:19:30.223[llmengine][exception][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:168]Exception=client(127.0.0.1) None access /v1/chat/completions cost 0.03934144973754883, (0.00010514259338378906), except='None' has no attribute 'startswith'
Traceback (most recent call last):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
    ret = await handler(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
    ret = await processor.handle(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
    await self.execute(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
    await self.datahandle(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
    x = await self.path_call(request, self.path)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
    return await f(request, params_kw, *args)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 45, in chat_completions
    return await stream_response(request, gor)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/globalEnv.py", line 58, in stream_response
    async for d in async_data_generator():
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 41, in gor
    async for d in engine.async_stream_generate(session, params_kw.prompt, **kwargs):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 112, in async_stream_generate
    for d in self._generator(session, prompt,
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 66, in _generator
    for d in self._gen(messages):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 137, in _gen
    inputs = self._messages2inputs(messages)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py", line 32, in _messages2inputs
    text = self.tokenizer.apply_chat_template(
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 1695, in apply_chat_template
    rendered_chat = compiled_template.render(
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 1295, in render
    self.environment.handle_exception()
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 942, in handle_exception
    raise rewrite_traceback_stack(source=source)
  File "<template>", line 20, in top-level template code
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 399, in call
    if not __self.is_safe_callable(__obj):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 265, in is_safe_callable
    getattr(obj, "unsafe_callable", False) or getattr(obj, "alters_data", False)
jinja2.exceptions.UndefinedError: 'None' has no attribute 'startswith'

2025-06-09 08:28:03.514[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:28:03.522[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'{\n"prompt":"who are you"\n}': ''}, args=[]
2025-06-09 08:28:03.526[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py:33]messages=[{'role': 'user', 'content': None}]
2025-06-09 08:28:03.579[llmengine][exception][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:168]Exception=client(127.0.0.1) None access /v1/chat/completions cost 0.05059671401977539, (0.00011324882507324219), except='None' has no attribute 'startswith'
Traceback (most recent call last):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
    ret = await handler(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
    ret = await processor.handle(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
    await self.execute(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
    await self.datahandle(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
    x = await self.path_call(request, self.path)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
    return await f(request, params_kw, *args)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 45, in chat_completions
    return await stream_response(request, gor)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/globalEnv.py", line 58, in stream_response
    async for d in async_data_generator():
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 41, in gor
    async for d in engine.async_stream_generate(session, params_kw.prompt, **kwargs):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 112, in async_stream_generate
    for d in self._generator(session, prompt,
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 66, in _generator
    for d in self._gen(messages):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 137, in _gen
    inputs = self._messages2inputs(messages)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py", line 34, in _messages2inputs
    text = self.tokenizer.apply_chat_template(
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 1695, in apply_chat_template
    rendered_chat = compiled_template.render(
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 1295, in render
    self.environment.handle_exception()
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 942, in handle_exception
    raise rewrite_traceback_stack(source=source)
  File "<template>", line 20, in top-level template code
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 399, in call
    if not __self.is_safe_callable(__obj):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 265, in is_safe_callable
    getattr(obj, "unsafe_callable", False) or getattr(obj, "alters_data", False)
jinja2.exceptions.UndefinedError: 'None' has no attribute 'startswith'

2025-06-09 08:31:48.954[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:31:48.961[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'{\n"prompt":"who are you"\n}': ''}, args=[]
2025-06-09 08:31:48.964[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py:29]params_kw={'{\n"prompt":"who are you"\n}': ''}, params=(), kw={}
2025-06-09 08:31:48.968[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py:33]messages=[{'role': 'user', 'content': None}]
2025-06-09 08:31:49.009[llmengine][exception][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:168]Exception=client(127.0.0.1) None access /v1/chat/completions cost 0.04324674606323242, (8.392333984375e-05), except='None' has no attribute 'startswith'
Traceback (most recent call last):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py", line 161, in checkAuth
    ret = await handler(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/processorResource.py", line 351, in _handle
    ret = await processor.handle(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 95, in handle
    await self.execute(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/baseProcessor.py", line 86, in execute
    await self.datahandle(request)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 46, in datahandle
    x = await self.path_call(request, self.path)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py", line 42, in path_call
    return await f(request, params_kw, *args)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 46, in chat_completions
    return await stream_response(request, gor)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/globalEnv.py", line 58, in stream_response
    async for d in async_data_generator():
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py", line 42, in gor
    async for d in engine.async_stream_generate(session, params_kw.prompt, **kwargs):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 112, in async_stream_generate
    for d in self._generator(session, prompt,
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 66, in _generator
    for d in self._gen(messages):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/base_chat_llm.py", line 137, in _gen
    inputs = self._messages2inputs(messages)
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py", line 34, in _messages2inputs
    text = self.tokenizer.apply_chat_template(
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 1695, in apply_chat_template
    rendered_chat = compiled_template.render(
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 1295, in render
    self.environment.handle_exception()
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/environment.py", line 942, in handle_exception
    raise rewrite_traceback_stack(source=source)
  File "<template>", line 20, in top-level template code
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 399, in call
    if not __self.is_safe_callable(__obj):
  File "/share/vllm-0.8.5/lib/python3.10/site-packages/jinja2/sandbox.py", line 265, in is_safe_callable
    getattr(obj, "unsafe_callable", False) or getattr(obj, "alters_data", False)
jinja2.exceptions.UndefinedError: 'None' has no attribute 'startswith'

2025-06-09 08:37:22.471[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:37:22.479[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'prompt': 'who are you'}, args=[]
2025-06-09 08:37:22.483[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py:29]params_kw={'prompt': 'who are you'}, params=(), kw={}
2025-06-09 08:37:22.486[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py:33]messages=[{'role': 'user', 'content': 'who are you'}]
2025-06-09 08:48:12.725[llmengine][info][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/auth_api.py:151]checkAuth() called ... request.path='/v1/chat/completions'
2025-06-09 08:48:12.735[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/ahserver/functionProcessor.py:40]params_kw={'prompt': 'who are you'}, args=[]
2025-06-09 08:48:12.738[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/server.py:29]params_kw={'prompt': 'who are you'}, params=(), kw={}
2025-06-09 08:48:12.742[llmengine][debug][/share/vllm-0.8.5/lib/python3.10/site-packages/llmengine/qwen3.py:33]messages=[{'role': 'user', 'content': 'who are you'}]
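The failing entries above show the whole JSON body arriving as a single form key (params_kw={'{\n"prompt":"who are you"\n}': ''}), so prompt was None and the qwen3 chat template failed with 'None' has no attribute 'startswith'; the later successful entries received params_kw={'prompt': 'who are you'}. A request shape consistent with those successful entries, with the Content-Type header as the assumed difference and the port taken from -p 9999 in test/gemma3/start.sh:

curl http://localhost:9999/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"prompt": "who are you"}'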
4 test/gemma3/start.sh Executable file
@@ -0,0 +1,4 @@
#!/usr/bin/bash

CUDA_VISIBLE_DEVICES=5 /share/vllm-0.8.5/bin/python -m llmengine.server -p 9999 /share/models/google/gemma-3-4b-it &
3 test/gemma3/stop.sh Normal file
@@ -0,0 +1,3 @@
#!/usr/bin/bash

/d/ymq/bin/killname gemma-3-4b-it
3 test/medgemma3.sh Executable file
@@ -0,0 +1,3 @@
#!/usr/bin/bash

CUDA_VISIBLE_DEVICES=0 /share/vllm-0.8.5/bin/python -m llmengine.medgemma3_it
30 test/phi4 Executable file
@@ -0,0 +1,30 @@
#!/share/vllm-0.8.5/bin/python

import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model="/share/ymq/models/microsoft/phi-4",
    model_kwargs={"torch_dtype": "auto"},
    device_map="auto",
)
messages = [
    {"role": "system", "content": "You are a medieval knight and must provide explanations to modern people."},
]

while True:
    print('input prompt')
    p = input()
    if not p:
        continue
    if p == 'q':
        break
    messages.append({
        'role':'user',
        'content': p
    })

    outputs = pipeline(messages, max_new_tokens=1024)
    messages = outputs[0]["generated_text"]
    print(messages[-1]['content'])
3 test/qwen3.sh Executable file
@@ -0,0 +1,3 @@
#!/usr/bin/bash

~/models/tsfm.env/bin/python -m llmengine.server ~/models/Qwen/Qwen3-0.6B
3 test/qwen3_embedding.sh Executable file
@@ -0,0 +1,3 @@
#!/bin/bash

~/models/tsfm.env/bin/python -m llmengine.embedding ~/models/Qwen/Qwen3-Embedding-0.6B
50 test/reranker/conf/config.json Normal file
@@ -0,0 +1,50 @@
{
	"filesroot":"$[workdir]$/files",
	"logger":{
		"name":"llmengine",
		"levelname":"info",
		"logfile":"$[workdir]$/logs/llmengine.log"
	},
	"website":{
		"paths":[
			["$[workdir]$/wwwroot",""]
		],
		"client_max_size":10000,
		"host":"0.0.0.0",
		"port":9995,
		"coding":"utf-8",
		"indexes":[
			"index.html",
			"index.ui"
		],
		"startswiths":[
			{
				"leading":"/idfile",
				"registerfunction":"idfile"
			},{
				"leading": "/v1/rerank",
				"registerfunction": "rerank"
			},{
				"leading": "/docs",
				"registerfunction": "docs"
			}
		],
		"processors":[
			[".tmpl","tmpl"],
			[".app","app"],
			[".ui","bui"],
			[".dspy","dspy"],
			[".md","md"]
		],
		"rsakey_oops":{
			"privatekey":"$[workdir]$/conf/rsa_private_key.pem",
			"publickey":"$[workdir]$/conf/rsa_public_key.pem"
		},
		"session_max_time":3000,
		"session_issue_time":2500,
		"session_redis_notuse":{
			"url":"redis://127.0.0.1:6379"
		}
	}
}
0 test/reranker/logs/llmengine.log Normal file
48 test/reranker/qwen3-reranker.service Normal file
@@ -0,0 +1,48 @@
[Unit]
Description=A Rerank Service using Qwen3-Reranker-0.6B
# After=network.target DeepSeek70B-kyyds671b-ray.service
# Requires=DeepSeek70B-kyyds671b-ray.service
StartLimitIntervalSec=60
StartLimitBurst=5

[Service]
# Core startup parameters (original configuration kept)
User=ymq
Group=ymq
WorkingDirectory=/share/ymq/run/reranker
# Define environment variables; startup scripts and services must be consistent across all nodes
#Environment="NCCL_SOCKET_IFNAME=enp196s0f0np0"
#ExecStartPre=/data/kyyds671b/ray_check.sh
ExecStart=/share/ymq/run/reranker/start.sh
ExecStop=/share/ymq/run/reranker/stop.sh

# Timeout and stop control (newly added)
# Extend the startup timeout to 120 seconds
# TimeoutStartSec=120
# Wait 30 seconds on stop
# TimeoutStopSec=30
# Send SIGINT first (better suited to Python programs)
# KillSignal=SIGINT
# Final forced-kill signal
# RestartKillSignal=SIGKILL
# Mixed kill mode
# KillMode=mixed
# Restart policy
# Restart=on-failure
# RestartSec=10s
# Service management (original configuration plus enhancements)
#Restart=always
#RestartSec=10  # restart interval changed from 5 to 10 seconds
# append keeps appending (like >>); file rewrites from scratch (like >)
StandardOutput=append:/var/log/rerank/rerank.log
StandardError=append:/var/log/rerank/error.log
SyslogIdentifier=rerank
# Resource limits (optional, kept as-is)
#LimitNOFILE=65536
#LimitNPROC=65536
# GPU support
#Environment=CUDA_VISIBLE_DEVICES=0,1

[Install]
WantedBy=multi-user.target
4 test/reranker/start.sh Executable file
@@ -0,0 +1,4 @@
#!/bin/bash

# CUDA_VISIBLE_DEVICES=7 /share/vllm-0.8.5/bin/python -m llmengine.rerank -p 9997 /d/ymq/models/Qwen/Qwen3-Reranker-0___6B
CUDA_VISIBLE_DEVICES=7 /share/vllm-0.8.5/bin/python -m llmengine.rerank -p 9997 /share/models/BAAI/bge-reranker-v2-m3
5 test/reranker/stop.sh Executable file
@@ -0,0 +1,5 @@
#!/usr/bin/bash

#killname Qwen/Qwen3-Reranker
killname BAAI/bge-reranker
17 test/reranker/t.sh Executable file
@@ -0,0 +1,17 @@
#!/usr/bin/bash

curl http://localhost:9997/v1/rerank \
  -H "Content-Type: application/json" \
  -d @- <<EOF
{
  "model": "rerank-001",
  "query": "什么是量子计算?",
  "documents": [
    "量子计算是一种使用量子比特进行计算的方式。",
    "古典计算机使用的是二进制位。",
    "天气预报依赖于统计模型。",
    "量子计算与物理学密切相关。"
  ],
  "top_n": 5
}
EOF
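A convenient variant of the test above while debugging, piping the response through a pretty-printer; this assumes the /v1/rerank route answers with a JSON body:

sh test/reranker/t.sh | python3 -m json.tool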