ragapi
This commit is contained in:
parent
ea1a9a084f
commit
516edb5b6a
@ -185,7 +185,7 @@ where a.orgid = b.orgid
|
|||||||
start_load = time.time()
|
start_load = time.time()
|
||||||
text = fileloader(realpath)
|
text = fileloader(realpath)
|
||||||
# debug(f"处理后的文件内容是:{text=}")
|
# debug(f"处理后的文件内容是:{text=}")
|
||||||
text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s.;,\n]', '', text)
|
text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s.;,\n/]', '', text)
|
||||||
timings["load_file"] = time.time() - start_load
|
timings["load_file"] = time.time() - start_load
|
||||||
debug(f"加载文件耗时: {timings['load_file']:.2f} 秒, 文本长度: {len(text)}")
|
debug(f"加载文件耗时: {timings['load_file']:.2f} 秒, 文本长度: {len(text)}")
|
||||||
if not text or not text.strip():
|
if not text or not text.strip():
|
||||||
@ -202,6 +202,7 @@ where a.orgid = b.orgid
|
|||||||
timings["split_text"] = time.time() - start_split
|
timings["split_text"] = time.time() - start_split
|
||||||
debug(
|
debug(
|
||||||
f"文本分片耗时: {timings['split_text']:.2f} 秒, 分片数量: {len(chunks)}, 分片内容: {[chunk.page_content[:50] for chunk in chunks[:5]]}")
|
f"文本分片耗时: {timings['split_text']:.2f} 秒, 分片数量: {len(chunks)}, 分片内容: {[chunk.page_content[:50] for chunk in chunks[:5]]}")
|
||||||
|
debug(f"分片内容: {[chunk.page_content[:100] + '...' for chunk in chunks]}")
|
||||||
if not chunks:
|
if not chunks:
|
||||||
raise ValueError(f"文件 {realpath} 未生成任何文档块")
|
raise ValueError(f"文件 {realpath} 未生成任何文档块")
|
||||||
|
|
||||||
@ -269,18 +270,15 @@ where a.orgid = b.orgid
|
|||||||
try:
|
try:
|
||||||
chunk_texts = [doc.page_content for doc in chunks]
|
chunk_texts = [doc.page_content for doc in chunks]
|
||||||
debug(f"处理 {len(chunk_texts)} 个分片进行三元组抽取")
|
debug(f"处理 {len(chunk_texts)} 个分片进行三元组抽取")
|
||||||
tasks = [
|
triples = []
|
||||||
api_service.extract_triples(
|
for i, chunk in enumerate(chunk_texts):
|
||||||
|
result = await api_service.extract_triples(
|
||||||
request=request,
|
request=request,
|
||||||
text=chunk,
|
text=chunk,
|
||||||
upappid=service_params['triples'],
|
upappid=service_params['triples'],
|
||||||
apiname="Babelscape/mrebel-large",
|
apiname="Babelscape/mrebel-large",
|
||||||
user=userid
|
user=userid
|
||||||
) for chunk in chunk_texts
|
)
|
||||||
]
|
|
||||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
||||||
triples = []
|
|
||||||
for i, result in enumerate(results):
|
|
||||||
if isinstance(result, list):
|
if isinstance(result, list):
|
||||||
triples.extend(result)
|
triples.extend(result)
|
||||||
debug(f"分片 {i + 1} 抽取到 {len(result)} 个三元组: {result[:5]}")
|
debug(f"分片 {i + 1} 抽取到 {len(result)} 个三元组: {result[:5]}")
|
||||||
@ -506,10 +504,3 @@ async def test_ragfilemgr():
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(test_ragfilemgr())
|
asyncio.run(test_ragfilemgr())
|
||||||
|
|
||||||
|
|
||||||
## usage
|
|
||||||
# mgr = RagFileMgr(fiid)
|
|
||||||
# await mgr.add_file(request, params_kw)
|
|
||||||
# await mgr.delete_file(request, file_id)
|
|
||||||
##
|
|
||||||
|
|||||||
@ -5,6 +5,7 @@ from appPublic.log import debug, error, info
|
|||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
import json
|
import json
|
||||||
|
import math
|
||||||
|
|
||||||
helptext = """kyrag API:
|
helptext = """kyrag API:
|
||||||
|
|
||||||
@ -81,11 +82,44 @@ async def fusedsearch(request, params_kw, *params, **kw):
|
|||||||
# orgid = "04J6VbxLqB_9RPMcgOv_8"
|
# orgid = "04J6VbxLqB_9RPMcgOv_8"
|
||||||
# userid = "04J6VbxLqB_9RPMcgOv_8"
|
# userid = "04J6VbxLqB_9RPMcgOv_8"
|
||||||
query = params_kw.get('query', '')
|
query = params_kw.get('query', '')
|
||||||
fiids = params_kw.get('fiids', [])
|
# 统一模式处理 limit 参数
|
||||||
limit = int(params_kw.get('limit', 5))
|
raw_limit = params_kw.get('limit') or (
|
||||||
|
params_kw.get('retrieval_setting', {}).get('top_k')
|
||||||
|
if isinstance(params_kw.get('retrieval_setting'), dict)
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
# 标准化为整数值
|
||||||
|
if raw_limit is None:
|
||||||
|
limit = 5 # 两个来源都不存在时使用默认值
|
||||||
|
elif isinstance(raw_limit, (int, float)):
|
||||||
|
limit = int(raw_limit) # 数值类型直接转换
|
||||||
|
elif isinstance(raw_limit, str):
|
||||||
|
try:
|
||||||
|
# 字符串转换为整数
|
||||||
|
limit = int(raw_limit)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
limit = 5 # 转换失败使用默认值
|
||||||
|
else:
|
||||||
|
limit = 5 # 其他意外类型使用默认值
|
||||||
|
debug(f"limit: {limit}")
|
||||||
|
raw_fiids = params_kw.get('fiids') or params_kw.get('knowledge_id')
|
||||||
|
|
||||||
|
# 标准化为列表格式
|
||||||
|
if raw_fiids is None:
|
||||||
|
fiids = [] # 两个参数都不存在
|
||||||
|
elif isinstance(raw_fiids, list):
|
||||||
|
fiids = [str(item).strip() for item in raw_fiids] # 已经是列表
|
||||||
|
elif isinstance(raw_fiids, str):
|
||||||
|
# 处理逗号分隔的字符串或单个ID字符串
|
||||||
|
fiids = [f.strip() for f in raw_fiids.split(',') if f.strip()]
|
||||||
|
elif isinstance(raw_fiids, (int, float)):
|
||||||
|
fiids = [str(int(raw_fiids))] # 数值类型转为字符串列表
|
||||||
|
else:
|
||||||
|
fiids = [] # 其他意外类型
|
||||||
|
|
||||||
debug(f"fiids: {fiids}")
|
debug(f"fiids: {fiids}")
|
||||||
if isinstance(fiids, str):
|
|
||||||
fiids = [f.strip() for f in fiids.split(',') if f.strip()]
|
|
||||||
# 验证 fiids的orgid与orgid = await f()是否一致
|
# 验证 fiids的orgid与orgid = await f()是否一致
|
||||||
if fiids:
|
if fiids:
|
||||||
db = DBPools()
|
db = DBPools()
|
||||||
@ -197,6 +231,7 @@ async def fusedsearch(request, params_kw, *params, **kw):
|
|||||||
# 调用搜索端点
|
# 调用搜索端点
|
||||||
sum = limit + 5
|
sum = limit + 5
|
||||||
search_start = time.time()
|
search_start = time.time()
|
||||||
|
debug(f"orgid: {orgid}")
|
||||||
result = await api_service.milvus_search_query(
|
result = await api_service.milvus_search_query(
|
||||||
request=request,
|
request=request,
|
||||||
query_vector=query_vector,
|
query_vector=query_vector,
|
||||||
@ -240,8 +275,34 @@ async def fusedsearch(request, params_kw, *params, **kw):
|
|||||||
|
|
||||||
timing_stats["total_time"] = time.time() - start_time
|
timing_stats["total_time"] = time.time() - start_time
|
||||||
info(f"融合搜索完成,返回 {len(unique_results)} 条结果,总耗时: {timing_stats['total_time']:.3f} 秒")
|
info(f"融合搜索完成,返回 {len(unique_results)} 条结果,总耗时: {timing_stats['total_time']:.3f} 秒")
|
||||||
return {"results": unique_results[:limit], "timing": timing_stats}
|
|
||||||
|
|
||||||
|
# debug(f"results: {unique_results[:limit]},timing: {timing_stats}")
|
||||||
|
# return {"results": unique_results[:limit], "timing": timing_stats}
|
||||||
|
|
||||||
|
|
||||||
|
dify_records = []
|
||||||
|
dify_result = []
|
||||||
|
for res in unique_results[:limit]:
|
||||||
|
rerank_score = res.get('rerank_score', 0)
|
||||||
|
score = 1 / (1 + math.exp(-rerank_score)) if rerank_score is not None else 1 - res.get('distance', 0)
|
||||||
|
score = max(0.0, min(1.0, score))
|
||||||
|
content = res.get('text', '')
|
||||||
|
title = res.get('metadata', {}).get('filename', 'Untitled')
|
||||||
|
document_id = res.get('metadata', {}).get('document_id', '')
|
||||||
|
dify_records.append({
|
||||||
|
"content": content,
|
||||||
|
"score": score,
|
||||||
|
"title": title
|
||||||
|
})
|
||||||
|
dify_result.append({
|
||||||
|
"content": content,
|
||||||
|
"title": title,
|
||||||
|
"metadata": {"document_id": document_id}
|
||||||
|
})
|
||||||
|
|
||||||
|
info(f"融合搜索完成,返回 {len(dify_records)} 条结果,总耗时: {(time.time() - start_time):.3f} 秒")
|
||||||
|
debug(f"records: {dify_records}, result: {dify_result}")
|
||||||
|
return {"records": dify_records, "result": dify_result, "own":{"results": unique_results[:limit], "timing": timing_stats}}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error(f"融合搜索失败: {str(e)}, 堆栈: {traceback.format_exc()}")
|
error(f"融合搜索失败: {str(e)}, 堆栈: {traceback.format_exc()}")
|
||||||
return {"results": [], "timing": timing_stats}
|
return {"results": [], "timing": timing_stats}
|
||||||
|
|||||||
@ -321,6 +321,7 @@ class APIService:
|
|||||||
async def milvus_search_query(self, request, query_vector: List[float], userid: str, knowledge_base_ids: list, limit: int, offset: int, upappid: str, apiname: str, user: str) -> Dict[str, Any]:
|
async def milvus_search_query(self, request, query_vector: List[float], userid: str, knowledge_base_ids: list, limit: int, offset: int, upappid: str, apiname: str, user: str) -> Dict[str, Any]:
|
||||||
"""根据用户知识库检索 Milvus"""
|
"""根据用户知识库检索 Milvus"""
|
||||||
request_id = str(uuid.uuid4())
|
request_id = str(uuid.uuid4())
|
||||||
|
debug(f"userid:{userid}")
|
||||||
debug(f"Request #{request_id} started for Milvus search")
|
debug(f"Request #{request_id} started for Milvus search")
|
||||||
try:
|
try:
|
||||||
uapi = UAPI(request, DictObject(**globals()))
|
uapi = UAPI(request, DictObject(**globals()))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user