bugfix: run rerankers on CUDA with FP16/BF16 instead of MPS; flesh out Qwen3Reranker with batched prefix/suffix scoring
This commit is contained in:
parent f84f4f14f9
commit da44517b80
@@ -2,9 +2,12 @@ import torch

 model_pathMap = {
 }


 def llm_register(model_key, Klass):
     model_pathMap[model_key] = Klass


 def get_llm_class(model_path):
     for k, klass in model_pathMap.items():
         if len(model_path.split(k)) > 1:
@@ -12,14 +15,17 @@ def get_llm_class(model_path):
     print(f'{model_pathMap=}')
     return None


 class BaseReranker:
     def __init__(self, model_id, **kw):
         self.model_id = model_id

     def use_mps_if_prosible(self):
-        if torch.backends.mps.is_available():
-            device = torch.device("mps")
-            self.model = self.model.to(device)
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+            self.model = self.model.to(device, dtype=torch.float16)
+        else:
+            raise Exception("GPU not available, but required for FP16 inference")

     def process_inputs(self, pairs):
         inputs = self.tokenizer(
@@ -42,9 +48,6 @@ class BaseReranker:

     def compute_logits(self, inputs, **kwargs):
         batch_scores = self.model(**inputs).logits[:, -1, :]
-        # true_vector = batch_scores[:, token_true_id]
-        # false_vector = batch_scores[:, token_false_id]
-        # batch_scores = torch.stack([false_vector, true_vector], dim=1)
         batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
         scores = batch_scores[:, 1].exp().tolist()
         return scores
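For reference, a minimal usage sketch of the registry in the base module above: subclasses call llm_register() at import time, and get_llm_class() picks the registered class whose key occurs in the model path (model_path.split(k) yields more than one piece exactly when k is a substring). DummyReranker and the model paths below are hypothetical, added only for illustration, and the matching return statement itself falls between the two hunks shown.

from llmengine.base_reranker import BaseReranker, llm_register, get_llm_class

class DummyReranker(BaseReranker):      # hypothetical subclass, for illustration only
    pass

llm_register('Dummy-Reranker', DummyReranker)

# 'Dummy-Reranker' is a substring of this made-up path, so split() yields two pieces
# and the lookup should resolve to DummyReranker.
print(get_llm_class('/share/models/acme/Dummy-Reranker-0.6B'))

# An unmatched path falls through the loop, prints model_pathMap and returns None.
print(get_llm_class('/share/models/unknown-model'))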
@@ -1,6 +1,7 @@
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from llmengine.base_reranker import BaseReranker, llm_register
+from torch.cuda.amp import autocast

 class BgeReranker(BaseReranker):
     def __init__(self, model_id, max_length=8096):
@@ -8,8 +9,10 @@ class BgeReranker(BaseReranker):
             e = Exception(f'{model_id} is not a bge-reranker')
             raise e
         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = AutoModelForSequenceClassification.from_pretrained(model_id)
+        model = AutoModelForSequenceClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16)
         model.eval()
+        if torch.cuda.is_available():
+            model = model.to('cuda')
         self.model = model
         self.model_id = model_id
         self.model_name = model_id.split('/')[-1]
@@ -20,11 +23,13 @@ class BgeReranker(BaseReranker):
     def process_inputs(self, pairs):
         inputs = self.tokenizer(pairs, padding=True,
                                 truncation=True, return_tensors='pt', max_length=512)
+        if torch.cuda.is_available():
+            inputs = {k: v.to('cuda') for k, v in inputs.items()}
         return inputs

     def compute_logits(self, inputs):
-        scores = self.model(**inputs,
-                            return_dict=True).logits.view(-1, ).float()
+        with autocast():
+            scores = self.model(**inputs, return_dict=True).logits.view(-1,)
         scores = [s.item() for s in scores]
         return scores

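A hedged usage sketch of the BgeReranker changes above: weights now load in bfloat16 and move to CUDA when available, and compute_logits() wraps the forward pass in autocast. The module path llmengine.bge_reranker is an assumption (only the class name appears in this diff); the model directory is taken from the launch script at the end of this commit, and a CUDA GPU is assumed.

import torch
from llmengine.bge_reranker import BgeReranker   # module name assumed

reranker = BgeReranker('/share/models/BAAI/bge-reranker-v2-m3')
pairs = [['what is a panda?', 'The giant panda is a bear species endemic to China.'],
         ['what is a panda?', 'Paris is the capital of France.']]

with torch.no_grad():
    inputs = reranker.process_inputs(pairs)   # tensors are moved to CUDA when available
    scores = reranker.compute_logits(inputs)  # forward pass runs under autocast (FP16)
print(scores)                                 # one raw relevance logit per query/doc pair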
@@ -1,16 +1,122 @@
 import torch
-from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from llmengine.base_reranker import BaseReranker, llm_register


 class Qwen3Reranker(BaseReranker):
-    def __init__(self, model_id, max_length=8096):
+    def __init__(self, model_id, max_length=1024):  # set max_length to 1024
         if 'Qwen3-Reranker' not in model_id:
-            e = Exception(f'{model_id} is not a Qwen3-Reranker')
-            raise e
+            raise Exception(f'{model_id} is not a Qwen3-Reranker')
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
-        self.model = AutoModelForCausalLM.from_pretrained(model_id).eval()
+        # use FP16 (GPU)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16
+        ).eval()
+
         self.model_id = model_id
         self.model_name = model_id.split('/')[-1]
-        self.max_length = 8192
+        self.max_length = max_length
+
+        # initialize the prefix and suffix tokens
+        self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
+        self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
+        prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
+        suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+        self.prefix_tokens = self.tokenizer.encode(prefix, add_special_tokens=False)
+        self.suffix_tokens = self.tokenizer.encode(suffix, add_special_tokens=False)
+
+    def format_instruction(self, instruction, query, doc):
+        if instruction is None:
+            instruction = 'Given a web search query, retrieve relevant passages that answer the query'
+        output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
+            instruction=instruction, query=query, doc=doc
+        )
+        return output
+
+    def process_inputs(self, pairs, batch_size=8):
+        all_inputs = []
+        for i in range(0, len(pairs), batch_size):
+            batch_pairs = pairs[i:i + batch_size]
+            inputs = self.tokenizer(
+                batch_pairs,
+                padding=False,
+                truncation='longest_first',
+                return_attention_mask=False,
+                max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens)
+            )
+            for j, ele in enumerate(inputs['input_ids']):
+                inputs['input_ids'][j] = self.prefix_tokens + ele + self.suffix_tokens
+            inputs = self.tokenizer.pad(
+                inputs,
+                padding=True,
+                return_tensors="pt",
+                max_length=self.max_length
+            )
+            for key in inputs:
+                inputs[key] = inputs[key].to(self.model.device)
+            all_inputs.append(inputs)
+            # free memory
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        return all_inputs
+
+    def compute_logits(self, inputs, **kwargs):
+        batch_scores = self.model(**inputs).logits[:, -1, :]
+        true_vector = batch_scores[:, self.token_true_id]
+        false_vector = batch_scores[:, self.token_false_id]
+        batch_scores = torch.stack([false_vector, true_vector], dim=1)
+        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
+        scores = batch_scores[:, 1].exp().tolist()
+        return scores
+
+    def build_pairs(self, query, docs, sys_prompt="", task=""):
+        pairs = [self.format_instruction(task, query, doc) for doc in docs]
+        return pairs
+
+    def rerank(self, query, docs, top_n, sys_prompt="", task="", batch_size=8):
+        if query is None:
+            raise Exception("query is None")
+        if docs is None or not docs:
+            raise Exception("documents is None or empty")
+        if not isinstance(docs, list):
+            docs = [docs]
+        if top_n is None or top_n <= 0:
+            top_n = len(docs)
+
+        pairs = self.build_pairs(query, docs, sys_prompt=sys_prompt, task=task)
+        print(f"Number of documents: {len(docs)}")
+        for i, p in enumerate(pairs):
+            print(f"Pair {i} token length: {len(self.tokenizer.encode(p))}")
+        scores = []
+        with torch.no_grad():
+            input_batches = self.process_inputs(pairs, batch_size=batch_size)
+            for inputs in input_batches:
+                batch_scores = self.compute_logits(inputs)
+                scores.extend(batch_scores)
+                # free memory
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+        data = [
+            {'index': i, 'relevance_score': s}
+            for i, s in enumerate(scores)
+        ]
+        data = sorted(data, key=lambda x: x["relevance_score"], reverse=True)
+        if len(data) > top_n:
+            data = data[:top_n]
+
+        return {
+            "data": data,
+            "object": "rerank.result",
+            "model": self.model_name,
+            "usage": {
+                "prompt_tokens": sum(len(self.tokenizer.encode(p)) for p in pairs),
+                "total_tokens": sum(len(self.tokenizer.encode(p)) for p in pairs)
+            }
+        }
+
+
 llm_register('Qwen3-Reranker', Qwen3Reranker)
@@ -1,4 +1,3 @@
 #!/bin/bash

-# CUDA_VISIBLE_DEVICES=7 /share/vllm-0.8.5/bin/python -m llmengine.rerank -p 9997 /d/ymq/models/Qwen/Qwen3-Reranker-0___6B
 CUDA_VISIBLE_DEVICES=7 /share/vllm-0.8.5/bin/python -m llmengine.rerank -p 9997 /share/models/BAAI/bge-reranker-v2-m3
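Finally, a hedged end-to-end sketch of the new Qwen3Reranker.rerank() API. The module path llmengine.qwen3_reranker is an assumption, the model directory comes from the line commented out in the launch script above, and the call signature and response shape follow the Qwen3Reranker code added in this commit.

from llmengine.qwen3_reranker import Qwen3Reranker   # module name assumed

reranker = Qwen3Reranker('/d/ymq/models/Qwen/Qwen3-Reranker-0___6B', max_length=1024)
result = reranker.rerank(
    query='What is the capital of China?',
    docs=['Beijing is the capital of China.',
          'Giant pandas live mainly in Sichuan.'],
    top_n=2,
    task='Given a web search query, retrieve relevant passages that answer the query',
)
# result is shaped like:
# {'data': [{'index': 0, 'relevance_score': ...}, {'index': 1, 'relevance_score': ...}],
#  'object': 'rerank.result', 'model': 'Qwen3-Reranker-0___6B',
#  'usage': {'prompt_tokens': ..., 'total_tokens': ...}}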