This commit is contained in:
wangmeihua 2025-09-09 13:17:22 +08:00
parent ea1a9a084f
commit 516edb5b6a
3 changed files with 481 additions and 428 deletions

View File

@@ -185,7 +185,7 @@ where a.orgid = b.orgid
start_load = time.time()
text = fileloader(realpath)
# debug(f"处理后的文件内容是:{text=}")
text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s.;,\n]', '', text)
text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s.;,\n/]', '', text)
timings["load_file"] = time.time() - start_load
debug(f"加载文件耗时: {timings['load_file']:.2f} 秒, 文本长度: {len(text)}")
if not text or not text.strip():
@@ -202,6 +202,7 @@ where a.orgid = b.orgid
timings["split_text"] = time.time() - start_split
debug(
f"文本分片耗时: {timings['split_text']:.2f} 秒, 分片数量: {len(chunks)}, 分片内容: {[chunk.page_content[:50] for chunk in chunks[:5]]}")
debug(f"分片内容: {[chunk.page_content[:100] + '...' for chunk in chunks]}")
if not chunks:
raise ValueError(f"文件 {realpath} 未生成任何文档块")
@@ -269,18 +270,15 @@ where a.orgid = b.orgid
try:
chunk_texts = [doc.page_content for doc in chunks]
debug(f"处理 {len(chunk_texts)} 个分片进行三元组抽取")
tasks = [
api_service.extract_triples(
triples = []
for i, chunk in enumerate(chunk_texts):
result = await api_service.extract_triples(
request=request,
text=chunk,
upappid=service_params['triples'],
apiname="Babelscape/mrebel-large",
user=userid
) for chunk in chunk_texts
]
results = await asyncio.gather(*tasks, return_exceptions=True)
triples = []
for i, result in enumerate(results):
)
if isinstance(result, list):
triples.extend(result)
debug(f"分片 {i + 1} 抽取到 {len(result)} 个三元组: {result[:5]}")
@@ -506,10 +504,3 @@ async def test_ragfilemgr():
if __name__ == "__main__":
asyncio.run(test_ragfilemgr())
## usage
# mgr = RagFileMgr(fiid)
# await mgr.add_file(request, params_kw)
# await mgr.delete_file(request, file_id)
##

View File

@@ -5,6 +5,7 @@ from appPublic.log import debug, error, info
import time
import traceback
import json
import math
helptext = """kyrag API:
@@ -81,11 +82,44 @@ async def fusedsearch(request, params_kw, *params, **kw):
# orgid = "04J6VbxLqB_9RPMcgOv_8"
# userid = "04J6VbxLqB_9RPMcgOv_8"
query = params_kw.get('query', '')
fiids = params_kw.get('fiids', [])
limit = int(params_kw.get('limit', 5))
# 统一模式处理 limit 参数
raw_limit = params_kw.get('limit') or (
params_kw.get('retrieval_setting', {}).get('top_k')
if isinstance(params_kw.get('retrieval_setting'), dict)
else None
)
# 标准化为整数值
if raw_limit is None:
limit = 5 # 两个来源都不存在时使用默认值
elif isinstance(raw_limit, (int, float)):
limit = int(raw_limit) # 数值类型直接转换
elif isinstance(raw_limit, str):
try:
# 字符串转换为整数
limit = int(raw_limit)
except (TypeError, ValueError):
limit = 5 # 转换失败使用默认值
else:
limit = 5 # 其他意外类型使用默认值
debug(f"limit: {limit}")
raw_fiids = params_kw.get('fiids') or params_kw.get('knowledge_id')
# 标准化为列表格式
if raw_fiids is None:
fiids = [] # 两个参数都不存在
elif isinstance(raw_fiids, list):
fiids = [str(item).strip() for item in raw_fiids] # 已经是列表
elif isinstance(raw_fiids, str):
# 处理逗号分隔的字符串或单个ID字符串
fiids = [f.strip() for f in raw_fiids.split(',') if f.strip()]
elif isinstance(raw_fiids, (int, float)):
fiids = [str(int(raw_fiids))] # 数值类型转为字符串列表
else:
fiids = [] # 其他意外类型
debug(f"fiids: {fiids}")
if isinstance(fiids, str):
fiids = [f.strip() for f in fiids.split(',') if f.strip()]
# 验证 fiids的orgid与orgid = await f()是否一致
if fiids:
db = DBPools()
@@ -197,6 +231,7 @@ async def fusedsearch(request, params_kw, *params, **kw):
# 调用搜索端点
sum = limit + 5
search_start = time.time()
debug(f"orgid: {orgid}")
result = await api_service.milvus_search_query(
request=request,
query_vector=query_vector,
@@ -240,8 +275,34 @@ async def fusedsearch(request, params_kw, *params, **kw):
timing_stats["total_time"] = time.time() - start_time
info(f"融合搜索完成,返回 {len(unique_results)} 条结果,总耗时: {timing_stats['total_time']:.3f}")
return {"results": unique_results[:limit], "timing": timing_stats}
# debug(f"results: {unique_results[:limit]},timing: {timing_stats}")
# return {"results": unique_results[:limit], "timing": timing_stats}
dify_records = []
dify_result = []
for res in unique_results[:limit]:
rerank_score = res.get('rerank_score', 0)
score = 1 / (1 + math.exp(-rerank_score)) if rerank_score is not None else 1 - res.get('distance', 0)
score = max(0.0, min(1.0, score))
content = res.get('text', '')
title = res.get('metadata', {}).get('filename', 'Untitled')
document_id = res.get('metadata', {}).get('document_id', '')
dify_records.append({
"content": content,
"score": score,
"title": title
})
dify_result.append({
"content": content,
"title": title,
"metadata": {"document_id": document_id}
})
info(f"融合搜索完成,返回 {len(dify_records)} 条结果,总耗时: {(time.time() - start_time):.3f}")
debug(f"records: {dify_records}, result: {dify_result}")
return {"records": dify_records, "result": dify_result, "own":{"results": unique_results[:limit], "timing": timing_stats}}
except Exception as e:
error(f"融合搜索失败: {str(e)}, 堆栈: {traceback.format_exc()}")
return {"results": [], "timing": timing_stats}

View File

@@ -321,6 +321,7 @@ class APIService:
async def milvus_search_query(self, request, query_vector: List[float], userid: str, knowledge_base_ids: list, limit: int, offset: int, upappid: str, apiname: str, user: str) -> Dict[str, Any]:
"""根据用户知识库检索 Milvus"""
request_id = str(uuid.uuid4())
debug(f"userid:{userid}")
debug(f"Request #{request_id} started for Milvus search")
try:
uapi = UAPI(request, DictObject(**globals()))