ragapi
This commit is contained in:
parent
ea1a9a084f
commit
516edb5b6a
@ -185,7 +185,7 @@ where a.orgid = b.orgid
|
||||
start_load = time.time()
|
||||
text = fileloader(realpath)
|
||||
# debug(f"处理后的文件内容是:{text=}")
|
||||
text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s.;,\n]', '', text)
|
||||
text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s.;,\n/]', '', text)
|
||||
timings["load_file"] = time.time() - start_load
|
||||
debug(f"加载文件耗时: {timings['load_file']:.2f} 秒, 文本长度: {len(text)}")
|
||||
if not text or not text.strip():
|
||||
@ -202,6 +202,7 @@ where a.orgid = b.orgid
|
||||
timings["split_text"] = time.time() - start_split
|
||||
debug(
|
||||
f"文本分片耗时: {timings['split_text']:.2f} 秒, 分片数量: {len(chunks)}, 分片内容: {[chunk.page_content[:50] for chunk in chunks[:5]]}")
|
||||
debug(f"分片内容: {[chunk.page_content[:100] + '...' for chunk in chunks]}")
|
||||
if not chunks:
|
||||
raise ValueError(f"文件 {realpath} 未生成任何文档块")
|
||||
|
||||
@ -269,18 +270,15 @@ where a.orgid = b.orgid
|
||||
try:
|
||||
chunk_texts = [doc.page_content for doc in chunks]
|
||||
debug(f"处理 {len(chunk_texts)} 个分片进行三元组抽取")
|
||||
tasks = [
|
||||
api_service.extract_triples(
|
||||
triples = []
|
||||
for i, chunk in enumerate(chunk_texts):
|
||||
result = await api_service.extract_triples(
|
||||
request=request,
|
||||
text=chunk,
|
||||
upappid=service_params['triples'],
|
||||
apiname="Babelscape/mrebel-large",
|
||||
user=userid
|
||||
) for chunk in chunk_texts
|
||||
]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
triples = []
|
||||
for i, result in enumerate(results):
|
||||
)
|
||||
if isinstance(result, list):
|
||||
triples.extend(result)
|
||||
debug(f"分片 {i + 1} 抽取到 {len(result)} 个三元组: {result[:5]}")
|
||||
@ -506,10 +504,3 @@ async def test_ragfilemgr():
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_ragfilemgr())
|
||||
|
||||
|
||||
## usage
|
||||
# mgr = RagFileMgr(fiid)
|
||||
# await mgr.add_file(request, params_kw)
|
||||
# await mgr.delete_file(request, file_id)
|
||||
##
|
||||
|
||||
@ -5,6 +5,7 @@ from appPublic.log import debug, error, info
|
||||
import time
|
||||
import traceback
|
||||
import json
|
||||
import math
|
||||
|
||||
helptext = """kyrag API:
|
||||
|
||||
@ -81,11 +82,44 @@ async def fusedsearch(request, params_kw, *params, **kw):
|
||||
# orgid = "04J6VbxLqB_9RPMcgOv_8"
|
||||
# userid = "04J6VbxLqB_9RPMcgOv_8"
|
||||
query = params_kw.get('query', '')
|
||||
fiids = params_kw.get('fiids', [])
|
||||
limit = int(params_kw.get('limit', 5))
|
||||
# 统一模式处理 limit 参数
|
||||
raw_limit = params_kw.get('limit') or (
|
||||
params_kw.get('retrieval_setting', {}).get('top_k')
|
||||
if isinstance(params_kw.get('retrieval_setting'), dict)
|
||||
else None
|
||||
)
|
||||
|
||||
# 标准化为整数值
|
||||
if raw_limit is None:
|
||||
limit = 5 # 两个来源都不存在时使用默认值
|
||||
elif isinstance(raw_limit, (int, float)):
|
||||
limit = int(raw_limit) # 数值类型直接转换
|
||||
elif isinstance(raw_limit, str):
|
||||
try:
|
||||
# 字符串转换为整数
|
||||
limit = int(raw_limit)
|
||||
except (TypeError, ValueError):
|
||||
limit = 5 # 转换失败使用默认值
|
||||
else:
|
||||
limit = 5 # 其他意外类型使用默认值
|
||||
debug(f"limit: {limit}")
|
||||
raw_fiids = params_kw.get('fiids') or params_kw.get('knowledge_id')
|
||||
|
||||
# 标准化为列表格式
|
||||
if raw_fiids is None:
|
||||
fiids = [] # 两个参数都不存在
|
||||
elif isinstance(raw_fiids, list):
|
||||
fiids = [str(item).strip() for item in raw_fiids] # 已经是列表
|
||||
elif isinstance(raw_fiids, str):
|
||||
# 处理逗号分隔的字符串或单个ID字符串
|
||||
fiids = [f.strip() for f in raw_fiids.split(',') if f.strip()]
|
||||
elif isinstance(raw_fiids, (int, float)):
|
||||
fiids = [str(int(raw_fiids))] # 数值类型转为字符串列表
|
||||
else:
|
||||
fiids = [] # 其他意外类型
|
||||
|
||||
debug(f"fiids: {fiids}")
|
||||
if isinstance(fiids, str):
|
||||
fiids = [f.strip() for f in fiids.split(',') if f.strip()]
|
||||
|
||||
# 验证 fiids的orgid与orgid = await f()是否一致
|
||||
if fiids:
|
||||
db = DBPools()
|
||||
@ -197,6 +231,7 @@ async def fusedsearch(request, params_kw, *params, **kw):
|
||||
# 调用搜索端点
|
||||
sum = limit + 5
|
||||
search_start = time.time()
|
||||
debug(f"orgid: {orgid}")
|
||||
result = await api_service.milvus_search_query(
|
||||
request=request,
|
||||
query_vector=query_vector,
|
||||
@ -240,8 +275,34 @@ async def fusedsearch(request, params_kw, *params, **kw):
|
||||
|
||||
timing_stats["total_time"] = time.time() - start_time
|
||||
info(f"融合搜索完成,返回 {len(unique_results)} 条结果,总耗时: {timing_stats['total_time']:.3f} 秒")
|
||||
return {"results": unique_results[:limit], "timing": timing_stats}
|
||||
|
||||
# debug(f"results: {unique_results[:limit]},timing: {timing_stats}")
|
||||
# return {"results": unique_results[:limit], "timing": timing_stats}
|
||||
|
||||
|
||||
dify_records = []
|
||||
dify_result = []
|
||||
for res in unique_results[:limit]:
|
||||
rerank_score = res.get('rerank_score', 0)
|
||||
score = 1 / (1 + math.exp(-rerank_score)) if rerank_score is not None else 1 - res.get('distance', 0)
|
||||
score = max(0.0, min(1.0, score))
|
||||
content = res.get('text', '')
|
||||
title = res.get('metadata', {}).get('filename', 'Untitled')
|
||||
document_id = res.get('metadata', {}).get('document_id', '')
|
||||
dify_records.append({
|
||||
"content": content,
|
||||
"score": score,
|
||||
"title": title
|
||||
})
|
||||
dify_result.append({
|
||||
"content": content,
|
||||
"title": title,
|
||||
"metadata": {"document_id": document_id}
|
||||
})
|
||||
|
||||
info(f"融合搜索完成,返回 {len(dify_records)} 条结果,总耗时: {(time.time() - start_time):.3f} 秒")
|
||||
debug(f"records: {dify_records}, result: {dify_result}")
|
||||
return {"records": dify_records, "result": dify_result, "own":{"results": unique_results[:limit], "timing": timing_stats}}
|
||||
except Exception as e:
|
||||
error(f"融合搜索失败: {str(e)}, 堆栈: {traceback.format_exc()}")
|
||||
return {"results": [], "timing": timing_stats}
|
||||
|
||||
@ -321,6 +321,7 @@ class APIService:
|
||||
async def milvus_search_query(self, request, query_vector: List[float], userid: str, knowledge_base_ids: list, limit: int, offset: int, upappid: str, apiname: str, user: str) -> Dict[str, Any]:
|
||||
"""根据用户知识库检索 Milvus"""
|
||||
request_id = str(uuid.uuid4())
|
||||
debug(f"userid:{userid}")
|
||||
debug(f"Request #{request_id} started for Milvus search")
|
||||
try:
|
||||
uapi = UAPI(request, DictObject(**globals()))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user