删除相关文件

2025-07-28 10:55:08 +08:00 · 2025-07-28 10:55:08 +08:00 · 22ad6e48fd
commit 22ad6e48fd
parent 08fac45422
15 changed files with 0 additions and 2836 deletions
--- a/rag/allfusedsearch.py
+++ b/rag/allfusedsearch.py
@ -1,290 +0,0 @@
 import os
 import logging
 import yaml
 import numpy as np
 from typing import List, Dict, Any
 from pymilvus import Collection, utility
 from langchain_huggingface import HuggingFaceEmbeddings
 from vector import initialize_milvus_connection
 from searchquery import extract_entities, match_triplets
 from rerank import rerank_results
 import torch
 import time
 # Load the YAML configuration; the path can be overridden through the CONFIG_PATH environment variable.
 CONFIG_PATH = os.getenv('CONFIG_PATH', '/share/wangmeihua/rag/conf/milvusconfig.yaml')
 try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    TEXT_EMBEDDING_MODEL = config['models']['text_embedding_model']
 except Exception as e:
    # Any failure here (unreadable file, invalid YAML, missing key) aborts the import.
    raise RuntimeError(f"无法加载配置文件: {str(e)}")
 # Configure logging: logger name, level and log-file path come from the `logging` section of the config.
 logger = logging.getLogger(config['logging']['name'])
 # Falls back to DEBUG when the configured level is not an attribute of the logging module.
 logger.setLevel(getattr(logging, config['logging']['level'], logging.DEBUG))
 # Remove handlers already attached to this named logger and stop propagation to ancestor loggers.
 logger.handlers.clear()
 logger.propagate = False
 os.makedirs(os.path.dirname(config['logging']['file']), exist_ok=True)
 # Open the log file in append mode once to verify that it is writable before attaching the file handler.
 try:
    with open(config['logging']['file'], 'a', encoding='utf-8') as f:
        pass
 except Exception as e:
    raise RuntimeError(f"日志文件 {config['logging']['file']} 不可写: {str(e)}")
 # The file handler and the stream handler share one format.
 formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 file_handler = logging.FileHandler(config['logging']['file'], encoding='utf-8')
 file_handler.setFormatter(formatter)
 stream_handler = logging.StreamHandler()
 stream_handler.setFormatter(formatter)
 logger.addHandler(file_handler)
 logger.addHandler(stream_handler)
 # Initialise the embedding model; it runs on CUDA when available, otherwise on the CPU.
 embedding = HuggingFaceEmbeddings(
    model_name=TEXT_EMBEDDING_MODEL,
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
 )
 # Import-time probe: embed a test string and require a 1024-dimensional vector.
 try:
    test_vector = embedding.embed_query("test")
    if len(test_vector) != 1024:
        raise ValueError(f"嵌入模型输出维度 {len(test_vector)} 不匹配预期 1024")
    logger.debug("嵌入模型加载成功")
 except Exception as e:
    # Log and abort the import; the dimension ValueError raised above is wrapped here as well.
    logger.error(f"嵌入模型加载失败: {str(e)}")
    raise RuntimeError(f"嵌入模型加载失败: {str(e)}")
 # In-memory cache of parsed triplets, keyed by "<document_id>_<userid>".
 TRIPLET_CACHE = {}
 def load_triplets_to_cache(userid: str, document_id: str) -> List[Dict]:
    """
    Return the triplets stored for one document of one user, caching them in memory.

    The triplet file is tab-separated; the first three columns of each line
    are read as head, type and tail, and lines with fewer than three columns
    are ignored. A successful load is stored in TRIPLET_CACHE. Any error while
    reading is logged and an empty list is returned without being cached.
    """
    cache_key = f"{document_id}_{userid}"
    try:
        cached = TRIPLET_CACHE[cache_key]
    except KeyError:
        pass
    else:
        logger.debug(f"从缓存加载三元组: {cache_key}")
        return cached
    triplet_file = f"/share/wangmeihua/rag/triples/{document_id}_{userid}.txt"
    try:
        with open(triplet_file, 'r', encoding='utf-8') as handle:
            rows = (raw.strip().split('\t') for raw in handle)
            triplets = [
                {'head': fields[0], 'type': fields[1], 'tail': fields[2]}
                for fields in rows
                if len(fields) >= 3
            ]
        TRIPLET_CACHE[cache_key] = triplets
        logger.debug(f"加载三元组文件: {triplet_file}, 数量: {len(triplets)}")
        return triplets
    except Exception as e:
        logger.error(f"加载三元组失败: {triplet_file}, 错误: {str(e)}")
        return []
 def fused_search(
        query: str,
        userid: str,
        db_type: str,
        file_paths: List[str],
        limit: int = 5,
        offset: int = 0,
        use_rerank: bool = True
 ) -> List[Dict[str, Any]]:
    """
    Retrieve text chunks using a query fused from the user query and matched triplets.

    Triplets matched for every input file are appended to the query, the fused
    text is embedded once, and the resulting vector is searched in the Milvus
    collection ``ragdb_{db_type}`` restricted to the user's files. Hits are
    de-duplicated by text and ordered by ``rerank_score`` (when reranking is
    enabled) or by ``distance``; the rerank step also uses the fused text.

    Args:
        query (str): Query text.
        userid (str): User ID; must not contain an underscore.
        db_type (str): Database type (e.g. 'textdb'); must not contain an underscore.
        file_paths (List[str]): File paths; only their base names are used in filters.
        limit (int): Maximum number of results returned.
        offset (int): Offset passed to the Milvus search.
        use_rerank (bool): Whether to rerank the de-duplicated hits.

    Returns:
        List[Dict[str, Any]]: Results with text, distance, source, metadata and,
        when reranked, rerank_score. An empty list is returned when the
        collection or the documents do not exist, and on any error (errors are
        logged here, never raised to the caller).
    """
    try:
        logger.info(f"开始融合搜索: query={query}, userid={userid}, db_type={db_type}")
        start_time = time.time()
        # Parameter validation
        if not query or not userid or not db_type or not file_paths:
            raise ValueError("query、userid、db_type 和 file_paths 不能为空")
        if "_" in userid or "_" in db_type:
            raise ValueError("userid 和 db_type 不能包含下划线")
        # Initialise the Milvus connection (the return value is not needed here)
        initialize_milvus_connection()
        collection_name = f"ragdb_{db_type}"
        if not utility.has_collection(collection_name):
            logger.warning(f"集合 {collection_name} 不存在")
            return []
        collection = Collection(collection_name)
        collection.load()
        logger.debug(f"加载 Milvus 集合: {collection_name}")
        # Extract entities from the query
        entity_start = time.time()
        query_entities = extract_entities(query)
        logger.debug(f"提取实体: {query_entities}, 耗时: {time.time() - entity_start:.3f}s")
        # Collect the document_id and the matched triplets of every file
        doc_id_map = {}
        filenames = []
        all_triplets = []
        for file_path in file_paths:
            filename = os.path.basename(file_path)
            filenames.append(filename)
            logger.debug(f"处理文件: {filename}")
            # Look up the document_id stored for this user and file name
            results_query = collection.query(
                expr=f"userid == '{userid}' and filename == '{filename}'",
                output_fields=["document_id"],
                limit=1
            )
            if not results_query:
                logger.warning(f"未找到 userid {userid} 和 filename {filename} 对应的文档")
                continue
            document_id = results_query[0]["document_id"]
            doc_id_map[filename] = document_id
            load_triplets_to_cache(userid, document_id)
            # Fetch the triplets matching the query for this document
            triplet_start = time.time()
            matched_triplets = match_triplets(query, query_entities, userid, document_id)
            logger.debug(
                f"文件 {filename} 匹配三元组: {len(matched_triplets)} 条, 耗时: {time.time() - triplet_start:.3f}s")
            all_triplets.extend(matched_triplets)
        if not doc_id_map:
            logger.warning("未找到任何有效文档")
            return []
        # Build the fused text from the query and every complete triplet
        triplet_texts = []
        for triplet in all_triplets:
            head = triplet['head']
            type_ = triplet['type']
            tail = triplet['tail']
            if not head or not type_ or not tail:
                logger.debug(f"无效三元组: {triplet}")
                continue
            triplet_texts.append(f"{head} {type_} {tail}")
        fused_text = query if not triplet_texts else f"{query} {' '.join(triplet_texts)}"
        logger.debug(f"融合文本: {fused_text}, 三元组数量: {len(triplet_texts)}")
        # Embed the fused text and rescale it to unit length
        embed_start = time.time()
        query_vector = embedding.embed_query(fused_text)
        query_vector = np.array(query_vector) / np.linalg.norm(query_vector)
        logger.debug(f"生成融合向量，维度: {len(query_vector)}, 耗时: {time.time() - embed_start:.3f}s")
        # Milvus search: a fixed pool of 100 candidates restricted to this user's files
        expr = f"userid == '{userid}' and filename in {filenames}"
        search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}}
        milvus_start = time.time()
        milvus_results = collection.search(
            data=[query_vector],
            anns_field="vector",
            param=search_params,
            limit=100,
            expr=expr,
            output_fields=["text", "userid", "document_id", "filename", "file_path", "upload_time", "file_type"],
            offset=offset
        )
        logger.debug(f"Milvus 搜索耗时: {time.time() - milvus_start:.3f}s")
        # The source label is identical for every hit, so compute it once
        source = "fused_query" if not triplet_texts else f"fused_triplets_{len(triplet_texts)}"
        results = []
        for hits in milvus_results:
            for hit in hits:
                result = {
                    "text": hit.entity.get("text"),
                    "distance": hit.distance,
                    "source": source,
                    "metadata": {
                        "userid": hit.entity.get("userid"),
                        "document_id": hit.entity.get("document_id"),
                        "filename": hit.entity.get("filename"),
                        "file_path": hit.entity.get("file_path"),
                        "upload_time": hit.entity.get("upload_time"),
                        "file_type": hit.entity.get("file_type")
                    }
                }
                results.append(result)
                # The text may be missing; empty texts are only filtered out in the
                # de-duplication step below, so the preview must tolerate None instead
                # of raising and aborting the whole search.
                logger.debug(
                    f"召回: text={(result['text'] or '')[:100]}..., distance={result['distance']}, filename={result['metadata']['filename']}")
        # De-duplicate by text and drop hits without text
        unique_results = []
        seen_texts = set()
        for result in results:
            text = result['text']
            if not text:
                logger.warning(f"发现空文本结果: {result['metadata']}")
                continue
            if text in seen_texts:
                logger.debug(f"移除重复文本: text={text[:100]}..., filename={result['metadata']['filename']}")
                continue
            seen_texts.add(text)
            unique_results.append(result)
        logger.info(f"去重后结果数量: {len(unique_results)} (原始数量: {len(results)})")
        # Optional: rerank with the fused text
        if use_rerank and unique_results:
            logger.debug("开始重排序")
            logger.debug(f"重排序查询: {fused_text}")
            rerank_start = time.time()
            reranked_results = rerank_results(fused_text, unique_results)
            reranked_results = sorted(reranked_results, key=lambda x: x.get('rerank_score', 0), reverse=True)
            logger.debug(f"重排序分数分布: {[round(r.get('rerank_score', 0), 3) for r in reranked_results]}")
            logger.debug(f"重排序耗时: {time.time() - rerank_start:.3f}s")
            for i, result in enumerate(reranked_results):
                logger.debug(
                    f"排序结果 {i + 1}: text={result['text'][:100]}..., distance={result['distance']}, rerank_score={result.get('rerank_score', 'N/A')}")
            logger.info(f"总耗时: {time.time() - start_time:.3f}s")
            return reranked_results[:limit]
        # Without reranking, sort by distance in descending order
        sorted_results = sorted(unique_results, key=lambda x: x['distance'], reverse=True)
        for i, result in enumerate(sorted_results):
            logger.debug(f"排序结果 {i + 1}: text={result['text'][:100]}..., distance={result['distance']}")
        logger.info(f"总耗时: {time.time() - start_time:.3f}s")
        return sorted_results[:limit]
    except Exception as e:
        # Best-effort API: log the failure with its traceback and report "no results"
        logger.error(f"融合搜索失败: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        return []
 if __name__ == "__main__":
    # Manual smoke test: run one fused search against sample documents and print the hits.
    query = "什么是知识抽取？"
    userid = "testuser1"
    db_type = "textdb"
    file_paths = [
        "/share/wangmeihua/rag/data/test.docx",
        "/share/wangmeihua/rag/data/zongshu.pdf",
        "/share/wangmeihua/rag/data/qianru.pdf",
    ]
    try:
        results = fused_search(query, userid, db_type, file_paths, limit=10, offset=0)
        for i, result in enumerate(results):
            print(f"Result {i + 1}:")
            print(f"Text: {result['text'][:200]}...")
            print(f"Distance: {result['distance']:.3f}")
            # Format the score only when it is numeric; applying ':.3f' to the
            # 'N/A' placeholder string raises ValueError.
            rerank_score = result.get('rerank_score')
            rerank_text = f"{rerank_score:.3f}" if isinstance(rerank_score, (int, float)) else 'N/A'
            print(f"Rerank Score: {rerank_text}")
            print(f"Source: {result['source']}")
            print(f"Metadata: {result['metadata']}\n")
    except Exception as e:
        print(f"搜索失败: {str(e)}")
--- a/rag/combinedsearch.py
+++ b/rag/combinedsearch.py
@ -1,190 +0,0 @@
 import os
 import yaml
 import logging
 from typing import List, Dict
 from pymilvus import connections, Collection, utility
 from langchain_huggingface import HuggingFaceEmbeddings
 from query import search_query
 from searchquery import searchquery
 from rerank import rerank_results
 from vector import initialize_milvus_connection, cleanup_milvus_connection
 import torch
 from functools import lru_cache
 # Load the configuration file; the path can be overridden through the CONFIG_PATH environment variable.
 CONFIG_PATH = os.getenv('CONFIG_PATH', '/share/wangmeihua/rag/conf/milvusconfig.yaml')
 try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    MILVUS_DB_PATH = config['database']['milvus_db_path']
    TEXT_EMBEDDING_MODEL = config['models']['text_embedding_model']
 except Exception as e:
    # The logger is not configured yet, so the failure is reported with print before aborting the import.
    print(f"加载配置文件 {CONFIG_PATH} 失败: {str(e)}")
    raise RuntimeError(f"加载配置文件: {str(e)}")
 # Configure logging: logger name, level and log-file path come from the `logging` section of the config.
 logger = logging.getLogger(config['logging']['name'])
 # Falls back to DEBUG when the configured level is not an attribute of the logging module.
 logger.setLevel(getattr(logging, config['logging']['level'], logging.DEBUG))
 logger.handlers.clear()  # remove the handlers already attached to this logger
 logger.propagate = False  # disable propagation to ancestor loggers
 os.makedirs(os.path.dirname(config['logging']['file']), exist_ok=True)
 # The file handler and the stream handler share one format.
 formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 file_handler = logging.FileHandler(config['logging']['file'], encoding='utf-8')
 file_handler.setFormatter(formatter)
 stream_handler = logging.StreamHandler()
 stream_handler.setFormatter(formatter)
 logger.addHandler(file_handler)
 logger.addHandler(stream_handler)
 # Embedding model (built once) and per-text embedding cache
 @lru_cache(maxsize=1)
 def _get_embedding_model():
    """Build the HuggingFaceEmbeddings model on first use and reuse it afterwards."""
    return HuggingFaceEmbeddings(
        model_name=TEXT_EMBEDDING_MODEL,
        model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
 @lru_cache(maxsize=1000)
 def get_embedding(text: str) -> List[float]:
    """
    Return the embedding vector of ``text``, cached for up to 1000 distinct texts.

    The model is created once by ``_get_embedding_model`` instead of being
    rebuilt on every cache miss. The returned list is the cached object
    itself, so callers must not modify it in place.

    Raises:
        ValueError: if the model output is not 1024-dimensional.
    """
    vector = _get_embedding_model().embed_query(text)
    if len(vector) != 1024:
        raise ValueError(f"嵌入模型输出维度 {len(vector)} 不匹配预期 1024")
    return vector
 def combined_search(query: str, userid: str, db_type: str, file_paths: List[str], limit: int = 10, offset: int = 0) -> List[Dict]:
    """
    Run the RAG search and the triplet search, merge their hits and rerank them.

    Both searches are called with their own default limits and the given
    offset. The merged hits are reranked, sorted by rerank_score in descending
    order, de-duplicated by text and truncated to ``limit``.

    Args:
        query (str): Query text.
        userid (str): User ID (no underscore, at most 100 characters).
        db_type (str): Database type (no underscore, at most 100 characters).
        file_paths (List[str]): Document paths; each base name at most 255 characters.
        limit (int): Maximum number of results, 1..16384. Defaults to 10.
        offset (int): Non-negative offset; limit + offset must not exceed 16384. Defaults to 0.

    Returns:
        List[Dict]: Results carrying text, distance, source, metadata and
        rerank_score. An empty list is returned when nothing is found or when
        any error occurs (errors are logged, not raised).
    """
    try:
        # Argument validation
        if not (query and userid and db_type and file_paths):
            raise ValueError("query、userid、db_type 和 file_paths 不能为空")
        if any("_" in value for value in (userid, db_type)):
            raise ValueError("userid 和 db_type 不能包含下划线")
        if len(userid) > 100 or len(db_type) > 100:
            raise ValueError("userid 或 db_type 的长度超出限制")
        if limit <= 0 or limit > 16384:
            raise ValueError("limit 必须在 1 到 16384 之间")
        if offset < 0:
            raise ValueError("offset 不能为负数")
        if limit + offset > 16384:
            raise ValueError("limit + offset 不能超过 16384")
        for file_path in file_paths:
            if not isinstance(file_path, str):
                raise ValueError(f"file_path 必须是字符串: {file_path}")
            if len(os.path.basename(file_path)) > 255:
                raise ValueError(f"文件名长度超出 255 个字符: {file_path}")

        # Connect to Milvus and make sure the target collection exists
        initialize_milvus_connection()
        collection_name = f"ragdb_{db_type}"
        if not utility.has_collection(collection_name):
            logger.warning(f"集合 {collection_name} 不存在")
            return []

        # RAG retrieval (the callee's default limit applies), tagged with its source
        rag_results = search_query(query, userid, db_type, file_paths, offset=offset)
        for result in rag_results:
            result['source'] = 'rag'
        logger.info(f"RAG 检索返回 {len(rag_results)} 条结果")

        # Triplet retrieval (the callee's default limit applies), tagged with its source
        triplet_results = searchquery(query, userid, db_type, file_paths, offset=offset)
        for result in triplet_results:
            result['source'] = 'triplet'
        logger.info(f"三元组检索返回 {len(triplet_results)} 条结果")
        for idx, result in enumerate(triplet_results, 1):
            logger.debug(f"三元组结果 {idx}: text={result['text'][:200]}..., distance={result['distance']:.4f}, metadata={result['metadata']}")

        # Merge both result lists; nothing to rerank when both are empty
        all_results = rag_results + triplet_results
        if not all_results:
            logger.warning("RAG 和三元组检索均无结果")
            return []
        logger.debug("合并前结果：")
        for idx, result in enumerate(all_results, 1):
            logger.debug(f"结果 {idx} ({result['source']}): text={result['text'][:200]}..., distance={result['distance']:.4f}, metadata={result['metadata']}")

        # Rerank every merged hit, then order by rerank_score (highest first)
        sorted_results = list(rerank_results(query, all_results, top_k=len(all_results)))
        sorted_results.sort(key=lambda item: item['rerank_score'], reverse=True)
        logger.debug("重排序后结果：")
        for idx, result in enumerate(sorted_results, 1):
            logger.debug(f"排序结果 {idx} ({result['source']}): text={result['text'][:200]}..., distance={result['distance']:.4f}, rerank_score={result['rerank_score']:.6f}, metadata={result['metadata']}")

        # De-duplicate by text, keeping the entry with the highest rerank_score
        text_to_result = {}
        for result in sorted_results:
            incumbent = text_to_result.get(result['text'])
            if incumbent is None or result['rerank_score'] > incumbent['rerank_score']:
                text_to_result[result['text']] = result
        unique_results = list(text_to_result.values())
        logger.debug("去重后结果：")
        for idx, result in enumerate(unique_results, 1):
            logger.debug(f"去重结果 {idx} ({result['source']}): text={result['text'][:200]}..., distance={result['distance']:.4f}, rerank_score={result['rerank_score']:.6f}, metadata={result['metadata']}")

        # Truncate to the requested size
        final_results = unique_results[:limit]
        logger.info(f"合并后返回 {len(final_results)} 条唯一结果")
        # Strip weighted_score when present; rerank_score and source are kept
        for result in final_results:
            result.pop('weighted_score', None)
        return final_results
    except Exception as e:
        logger.error(f"合并搜索失败: {str(e)}")
        import traceback
        logger.debug(traceback.format_exc())
        return []
    finally:
        # Runs on every exit path, including validation failures
        cleanup_milvus_connection()
 if __name__ == "__main__":
    # Manual smoke test: run one combined search against sample documents and print the hits.
    query, userid, db_type = "知识图谱构建需要什么技术？", "testuser1", "textdb"
    file_paths = [
        "/share/wangmeihua/rag/data/test.docx",
        "/share/wangmeihua/rag/data/zongshu.pdf",
        "/share/wangmeihua/rag/data/qianru.pdf"
    ]
    limit, offset = 10, 0
    try:
        results = combined_search(query, userid, db_type, file_paths, limit, offset)
        print(f"搜索结果 ({len(results)} 条):")
        separator = "-" * 50
        for idx, result in enumerate(results, start=1):
            print(f"结果 {idx}:")
            print(f"内容: {result['text'][:200]}...")
            print(f"距离: {result['distance']}")
            print(f"来源: {result['source']}")
            print(f"重排序分数: {result['rerank_score']}")
            print(f"元数据: {result['metadata']}")
            print(separator)
    except Exception as e:
        print(f"搜索失败: {str(e)}")
--- a/rag/deletefile.py
+++ b/rag/deletefile.py
@ -1,138 +0,0 @@
 import logging
 import yaml
 import os
 from pymilvus import connections, Collection, utility
 from vector import initialize_milvus_connection
 # Load the configuration file; the path can be overridden through the CONFIG_PATH environment variable.
 CONFIG_PATH = os.getenv('CONFIG_PATH', '/share/wangmeihua/rag/conf/milvusconfig.yaml')
 try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    MILVUS_DB_PATH = config['database']['milvus_db_path']
    TEXT_EMBEDDING_MODEL = config['models']['text_embedding_model']
 except Exception as e:
    # The logger is not configured yet, so the failure is reported with print before aborting the import.
    print(f"加载配置文件 {CONFIG_PATH} 失败: {str(e)}")
    raise RuntimeError(f"无法加载配置文件: {str(e)}") from e
 # Configure logging: logger name, level and log-file path come from the `logging` section of the config.
 logger = logging.getLogger(config['logging']['name'])
 logger.setLevel(getattr(logging, config['logging']['level'], logging.DEBUG))
 os.makedirs(os.path.dirname(config['logging']['file']), exist_ok=True)
 formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 # logging.getLogger returns the same object for the same name, so any module
 # that configured this logger earlier has already attached handlers; adding
 # another pair would write every record twice.
 if not logger.handlers:
    for handler in (logging.FileHandler(config['logging']['file'], encoding='utf-8'), logging.StreamHandler()):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
 def delete_document(db_type: str, userid: str, filename: str) -> bool:
    """
    Delete the stored records of one user's file from the ``ragdb_{db_type}`` collection.

    The document_id values recorded for (userid, filename) are looked up
    first; then every row of each document_id that belongs to the user is
    deleted. All failures are logged and reported as ``False``; no exception
    escapes this function.

    Args:
        db_type (str): Database type (e.g. 'textdb', 'pptdb'); non-empty, no
            underscore, at most 100 characters.
        userid (str): User ID; non-empty, no underscore, single quote or
            backslash, at most 100 characters.
        filename (str): File name (e.g. 'test.docx'); non-empty, no single
            quote or backslash, at most 255 characters.

    Returns:
        bool: True when at least one record was deleted, otherwise False.
    """
    try:
        # Parameter validation
        if not db_type or "_" in db_type:
            raise ValueError("db_type 不能为空且不能包含下划线")
        if not userid or "_" in userid:
            raise ValueError("userid 不能为空且不能包含下划线")
        if not filename:
            raise ValueError("filename 不能为空")
        if len(db_type) > 100 or len(userid) > 100 or len(filename) > 255:
            raise ValueError("db_type、userid 或 filename 的长度超出限制")
        # userid and filename are spliced into single-quoted literals of the
        # filter expressions below; a quote or backslash would produce a
        # malformed or altered filter, so such values are rejected up front.
        for label, value in (("userid", userid), ("filename", filename)):
            if "'" in value or "\\" in value:
                raise ValueError(f"{label} 不能包含单引号或反斜杠")
        # Initialise the Milvus connection
        initialize_milvus_connection()
        logger.debug(f"已连接到 Milvus Lite，路径: {MILVUS_DB_PATH}")
        # Check that the collection exists
        collection_name = f"ragdb_{db_type}"
        if not utility.has_collection(collection_name):
            logger.warning(f"集合 {collection_name} 不存在")
            return False
        # Load the collection
        try:
            collection = Collection(collection_name)
            collection.load()
            logger.debug(f"加载集合: {collection_name}")
        except Exception as e:
            logger.error(f"加载集合 {collection_name} 失败: {str(e)}")
            raise RuntimeError(f"加载集合失败: {str(e)}") from e
        # Look up the matching document_id values
        expr = f"userid == '{userid}' and filename == '{filename}'"
        logger.debug(f"查询表达式: {expr}")
        try:
            # NOTE: at most 1000 rows are inspected, so only the document_id
            # values that appear in those rows are deleted below.
            results = collection.query(
                expr=expr,
                output_fields=["document_id"],
                limit=1000
            )
            if not results:
                logger.warning(f"没有找到 userid={userid}, filename={filename} 的记录")
                return False
            document_ids = list({result["document_id"] for result in results if "document_id" in result})
            logger.debug(f"找到 {len(document_ids)} 个 document_id: {document_ids}")
        except Exception as e:
            logger.error(f"查询 document_id 失败: {str(e)}")
            raise RuntimeError(f"查询失败: {str(e)}") from e
        # Delete per document_id; a failure for one id does not stop the others
        total_deleted = 0
        for doc_id in document_ids:
            try:
                delete_expr = f"userid == '{userid}' and document_id == '{doc_id}'"
                logger.debug(f"删除表达式: {delete_expr}")
                delete_result = collection.delete(delete_expr)
                deleted_count = delete_result.delete_count
                total_deleted += deleted_count
                logger.info(f"成功删除 document_id={doc_id} 的 {deleted_count} 条记录")
            except Exception as e:
                logger.error(f"删除 document_id={doc_id} 失败: {str(e)}")
                continue
        if total_deleted == 0:
            logger.warning(f"没有删除任何记录，userid={userid}, filename={filename}")
            return False
        logger.info(f"总计删除 {total_deleted} 条记录，userid={userid}, filename={filename}")
        return True
    except ValueError as ve:
        logger.error(f"参数验证失败: {str(ve)}")
        return False
    except RuntimeError as re:
        logger.error(f"数据库操作失败: {str(re)}")
        return False
    except Exception as e:
        logger.error(f"删除文件失败: {str(e)}")
        import traceback
        logger.debug(traceback.format_exc())
        return False
    finally:
        # Always release the default connection, even after a validation failure
        try:
            connections.disconnect("default")
            logger.debug("已断开 Milvus 连接")
        except Exception as e:
            logger.warning(f"断开 Milvus 连接失败: {str(e)}")
 if __name__ == "__main__":
    # Manual test case: delete one sample file of a test user and print the outcome.
    db_type, userid, filename = "textdb", "testuser2", "test.docx"
    logger.info(f"测试：删除 userid={userid}, filename={filename} 的文件")
    result = delete_document(db_type=db_type, userid=userid, filename=filename)
    print(f"删除结果: {result}")
--- a/rag/embed.py
+++ b/rag/embed.py
@ -1,183 +0,0 @@
 import os
 import uuid
 import yaml
 import logging
 from datetime import datetime
 from typing import List
 from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from pymilvus import connections
 from vector import get_vector_db
 from filetxt.loader import fileloader
 from extract import extract_and_save_triplets
 from kgc import KnowledgeGraph
 # Load the configuration file; the path can be overridden through the CONFIG_PATH environment variable.
 CONFIG_PATH = os.getenv('CONFIG_PATH', '/share/wangmeihua/rag/conf/milvusconfig.yaml')
 try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    MILVUS_DB_PATH = config['database']['milvus_db_path']
 except Exception as e:
    # The module logger is only created after the configuration has been
    # loaded, so it cannot be used here: calling it would raise NameError and
    # hide the real cause. Report with print, then abort the import.
    print(f"加载配置文件 {CONFIG_PATH} 失败: {str(e)}")
    raise RuntimeError(f"无法加载配置文件: {str(e)}") from e
 # Configure logging: logger name, level and log-file path come from the `logging` section of the config.
 logger = logging.getLogger(config['logging']['name'])
 # Falls back to DEBUG when the configured level is not an attribute of the logging module.
 logger.setLevel(getattr(logging, config['logging']['level'], logging.DEBUG))
 # Remove handlers already attached to this named logger and stop propagation to ancestor loggers.
 logger.handlers.clear()
 logger.propagate = False
 os.makedirs(os.path.dirname(config['logging']['file']), exist_ok=True)
 # The file handler and the stream handler share one format.
 formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 file_handler = logging.FileHandler(config['logging']['file'], encoding='utf-8')
 file_handler.setFormatter(formatter)
 stream_handler = logging.StreamHandler()
 stream_handler.setFormatter(formatter)
 logger.addHandler(file_handler)
 logger.addHandler(stream_handler)
 def generate_document_id() -> str:
    """Return a new random (version 4) UUID in string form, used as a file's document_id."""
    new_id = uuid.uuid4()
    return str(new_id)
 def load_and_split_data(file_path: str, userid: str, document_id: str) -> List[Document]:
    """
    Load a file, split its text into chunks and attach metadata to every chunk.

    The text returned by ``fileloader`` is wrapped in one Document and split
    with RecursiveCharacterTextSplitter (chunk_size=2000, chunk_overlap=200).
    Each chunk receives userid, document_id, filename, file_path, upload_time,
    file_type, chunk_index and source metadata.

    Args:
        file_path (str): Path of the file to load; it must exist, be non-empty
            and have a file extension.
        userid (str): User ID stored in the chunk metadata.
        document_id (str): Document ID stored in the chunk metadata.

    Returns:
        List[Document]: The chunks in their original order.

    Raises:
        ValueError: On any failure (missing or empty file, missing extension,
            empty text, incomplete metadata, loader errors); the underlying
            error is chained as the cause.
    """
    try:
        if not os.path.exists(file_path):
            raise ValueError(f"文件 {file_path} 不存在")
        if os.path.getsize(file_path) == 0:
            raise ValueError(f"文件 {file_path} 为空")
        logger.debug(f"检查文件: {file_path}, 大小: {os.path.getsize(file_path)} 字节")
        # splitext inspects only the last path component, so a dot in a
        # directory name cannot be mistaken for the extension and a file
        # without extension yields '' instead of an IndexError.
        ext = os.path.splitext(file_path)[1].lstrip('.').lower()
        if not ext:
            raise ValueError(f"文件 {file_path} 缺少扩展名")
        logger.debug(f"文件扩展名: {ext}")
        logger.debug("开始加载文件")
        text = fileloader(file_path)
        if not text or not text.strip():
            raise ValueError(f"文件 {file_path} 加载为空")
        document = Document(page_content=text)
        logger.debug(f"加载完成，生成 1 个文档")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200,
            length_function=len,
        )
        chunks = text_splitter.split_documents([document])
        logger.debug(f"分割完成，生成 {len(chunks)} 个文档块")
        filename = os.path.basename(file_path)
        upload_time = datetime.now().isoformat()
        # Fields that must be present and non-empty on every chunk
        required_fields = ['userid', 'document_id', 'filename', 'file_path', 'upload_time', 'file_type']
        documents = []
        for i, chunk in enumerate(chunks):
            chunk.metadata.update({
                'userid': userid,
                'document_id': document_id,
                'filename': filename,
                'file_path': file_path,
                'upload_time': upload_time,
                'file_type': ext,
                'chunk_index': i,
                'source': file_path,
            })
            if not all(field in chunk.metadata and chunk.metadata[field] for field in required_fields):
                raise ValueError(f"文档元数据缺少必需字段或值为空: {chunk.metadata}")
            documents.append(chunk)
            logger.debug(f"生成文档块 {i}: metadata={chunk.metadata}")
        logger.debug(f"文件 {file_path} 加载并分割为 {len(documents)} 个文档块，document_id: {document_id}")
        return documents
    except Exception as e:
        logger.error(f"加载或分割文件 {file_path} 失败: {str(e)}")
        import traceback
        logger.debug(traceback.format_exc())
        raise ValueError(f"加载或分割文件失败: {str(e)}") from e
 def embed(file_path: str, userid: str, db_type: str) -> bool:
    """
    Embed a file into the Milvus vector database and store its triplets.

    Steps: validate the arguments and the file format, split the file into
    chunks, insert them through ``get_vector_db``, then extract triplets from
    the full text and import them into Neo4j. Triplet extraction and the Neo4j
    import are best-effort: their failures are logged and do not change the
    return value.

    Args:
        file_path (str): Path of the file to embed; its extension must be one
            of the supported formats.
        userid (str): User ID; non-empty, must not contain an underscore.
        db_type (str): Database type; non-empty, must not contain an underscore.

    Returns:
        bool: True when the chunks were inserted into the vector database,
        False on any validation or database failure (errors are logged, not raised).
    """
    try:
        if not userid or not db_type:
            raise ValueError("userid 和 db_type 不能为空")
        if "_" in userid:
            raise ValueError("userid 不能包含下划线")
        if "_" in db_type:
            raise ValueError("db_type 不能包含下划线")
        if not os.path.exists(file_path):
            raise ValueError(f"文件 {file_path} 不存在")
        supported_formats = {'pdf', 'doc', 'docx', 'xlsx', 'xls', 'ppt', 'pptx', 'csv', 'txt'}
        # splitext inspects only the last path component; a file without an
        # extension yields '' and is reported as unsupported instead of
        # failing with an IndexError.
        ext = os.path.splitext(file_path)[1].lstrip('.').lower()
        if ext not in supported_formats:
            # Sort the set so the message is the same on every run
            supported_text = ', '.join(sorted(supported_formats))
            logger.error(f"文件 {file_path} 格式不支持，支持的格式: {supported_text}")
            raise ValueError(f"不支持的文件格式: {ext}, 支持的格式: {supported_text}")
        document_id = generate_document_id()
        logger.info(f"生成 document_id: {document_id} for file: {file_path}")
        logger.info(f"开始处理文件 {file_path}，userid: {userid}，db_type: {db_type}")
        chunks = load_and_split_data(file_path, userid, document_id)
        if not chunks:
            logger.error(f"文件 {file_path} 未生成任何文档块")
            raise ValueError("未生成任何文档块")
        logger.debug(f"处理文件 {file_path}，生成 {len(chunks)} 个文档块")
        logger.debug(f"第一个文档块: {chunks[0].page_content[:200]}")
        db = get_vector_db(userid, db_type, documents=chunks)
        if not db:
            logger.error(f"无法初始化或插入到向量数据库 ragdb_{db_type}")
            raise RuntimeError("数据库操作失败")
        # Best-effort: triplet extraction and the Neo4j import never fail the embedding
        try:
            full_text = fileloader(file_path)
            if full_text and full_text.strip():
                success = extract_and_save_triplets(full_text, document_id, userid)
                triplet_file_path = f"/share/wangmeihua/rag/triples/{document_id}_{userid}.txt"
                if success and os.path.exists(triplet_file_path):
                    logger.info(f"文件 {file_path} 三元组保存到: {triplet_file_path}")
                    try:
                        kg = KnowledgeGraph(data_path=triplet_file_path, document_id=document_id)
                        logger.info(f"Step 1: 导入图谱节点到 Neo4j，document_id: {document_id}")
                        kg.create_graphnodes()
                        logger.info(f"Step 2: 导入图谱边到 Neo4j，document_id: {document_id}")
                        kg.create_graphrels()
                        logger.info(f"Step 3: 导出 Neo4j 节点数据，document_id: {document_id}")
                        kg.export_data()
                        logger.info(f"文件 {file_path} 三元组成功插入 Neo4j")
                    except Exception as e:
                        logger.warning(f"将三元组插入 Neo4j 失败: {str(e)}，但不影响 Milvus 嵌入")
                else:
                    logger.warning(f"文件 {file_path} 的三元组抽取失败或文件不存在: {triplet_file_path}")
            else:
                logger.warning(f"文件 {file_path} 内容为空，无法抽取三元组")
        except Exception as e:
            logger.error(f"文件 {file_path} 三元组抽取失败: {str(e)}，但不影响向量化")
        logger.info(f"文件 {file_path} 成功嵌入到数据库 ragdb_{db_type}")
        return True
    except ValueError as ve:
        logger.error(f"嵌入文件 {file_path} 失败: {str(ve)}")
        return False
    except RuntimeError as re:
        logger.error(f"嵌入文件 {file_path} 失败: {str(re)}")
        return False
    except Exception as e:
        logger.error(f"嵌入文件 {file_path} 失败: {str(e)}")
        import traceback
        logger.debug(traceback.format_exc())
        return False
 if __name__ == "__main__":
    # Manual smoke test: embed one sample document for a test user and print the outcome.
    test_file, userid, db_type = (
        "/share/wangmeihua/rag/data/test.docx",
        "testuser1",
        "textdb",
    )
    result = embed(file_path=test_file, userid=userid, db_type=db_type)
    print(f"嵌入结果: {result}")
--- a/rag/extract.py
+++ b/rag/extract.py
@ -1,225 +0,0 @@
 import os
 import torch
 import re
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 import logging
 import yaml
 import time
 # Load the configuration file (the path can be overridden with the CONFIG_PATH environment variable).
 CONFIG_PATH = os.getenv('CONFIG_PATH', '/share/wangmeihua/rag/conf/milvusconfig.yaml')
 try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
 except Exception as e:
    print(f"加载配置文件 {CONFIG_PATH} 失败: {str(e)}")
    raise RuntimeError(f"无法加载配置文件: {str(e)}")
 # Configure logging: logger name, level and log file all come from the config;
 # an unrecognised level name falls back to DEBUG.
 logger = logging.getLogger(config['logging']['name'])
 logger.setLevel(getattr(logging, config['logging']['level'], logging.DEBUG))
 os.makedirs(os.path.dirname(config['logging']['file']), exist_ok=True)
 formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 # Attach one file handler and one console handler, both using the same format.
 for handler in (logging.FileHandler(config['logging']['file'], encoding='utf-8'), logging.StreamHandler()):
    handler.setFormatter(formatter)
    logger.addHandler(handler)
 # Directory where extracted triplets are saved (created at import time if missing).
 TRIPLES_OUTPUT_DIR = "/share/wangmeihua/rag/triples"
 os.makedirs(TRIPLES_OUTPUT_DIR, exist_ok=True)
 # Load the mREBEL model and tokenizer from a local directory.
 local_path = "/share/models/Babelscape/mrebel-large"
 try:
    tokenizer = AutoTokenizer.from_pretrained(local_path, src_lang="zh_CN", tgt_lang="tp_XX")
    model = AutoModelForSeq2SeqLM.from_pretrained(local_path)
    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # Id of the "<triplet>" token; used below as the decoder start token.
    triplet_id = tokenizer.convert_tokens_to_ids("<triplet>")
    logger.debug(f"成功加载 mREBEL 模型，分词器 triplet_id: {triplet_id}")
 except Exception as e:
    logger.error(f"加载 mREBEL 模型失败: {str(e)}")
    raise RuntimeError(f"加载 mREBEL 模型失败: {str(e)}")
 # Generation parameters, unpacked into model.generate() for every text chunk.
 gen_kwargs = {
    "max_length": 512,
    "min_length": 10,
    "length_penalty": 0.5,
    "num_beams": 3,
    "num_return_sequences": 1,
    "no_repeat_ngram_size": 2,
    "early_stopping": True,
    "decoder_start_token_id": triplet_id,
 }
 def split_document(text: str, max_chunk_size: int = 150) -> list:
    """Split ``text`` into chunks that end on sentence boundaries.

    Sentences are delimited by the characters 。！？； and newline (the
    delimiter stays attached to the sentence it ends). Consecutive sentences
    are packed greedily into a chunk while the chunk stays within
    ``max_chunk_size`` characters. A single sentence longer than the limit is
    kept whole as its own chunk. Empty chunks are never returned.
    """
    pieces = re.split(r'(?<=[。！？；\n])', text)
    result = []
    pending = ""
    for piece in pieces:
        fits = len(pending) + len(piece) <= max_chunk_size
        if fits:
            pending = pending + piece
            continue
        # The piece does not fit: close the chunk built so far and start a new one.
        if pending:
            result.append(pending)
        pending = piece
    if pending:
        result.append(pending)
    return result
 def extract_triplets_typed(text: str) -> list:
    """Parse mREBEL output into typed triplets.

    The expected layout of one triplet in the decoded text is
    ``<triplet> head <head_type> tail <tail_type> relation``.

    Args:
        text: Decoded model output, special tokens included.

    Returns:
        A list of dicts with the keys ``head``, ``head_type``, ``type`` (the
        relation), ``tail`` and ``tail_type``. Malformed triplets are dropped.
    """
    logger.debug(f"原始生成文本: {text}")
    # Tokenise: anything between '<' and '>' becomes one tag token, and the text
    # between tags becomes one plain token (surrounding whitespace stripped).
    tokens = []
    buffer = ""
    for char in text:
        if char == '<':
            if buffer:
                tokens.append(buffer.strip())
            buffer = char
        elif char == '>':
            tokens.append((buffer + char).strip())
            buffer = ""
        else:
            buffer += char
    if buffer:
        tokens.append(buffer.strip())
    # Drop special tokens and empty strings.
    special_tokens = ["<s>", "<pad>", "</s>", "tp_XX", "__en__", "__zh__", "zh_CN"]
    tokens = [t for t in tokens if t not in special_tokens and t]
    logger.debug(f"处理后标记: {tokens}")

    def is_tag(token: str) -> bool:
        return token.startswith("<") and token.endswith(">")

    triplets = []
    i = 0
    while i < len(tokens):
        window = tokens[i:i + 6]
        if len(window) == 6 and window[0] == "<triplet>":
            _, entity1, tag1, entity2, tag2, relation = window
            type1 = tag1[1:-1] if is_tag(tag1) else ""
            type2 = tag2[1:-1] if is_tag(tag2) else ""
            # Entities and the relation must be plain text, and no new
            # "<triplet>" marker may start inside the window; otherwise the
            # window straddles a truncated triplet and must not be consumed.
            well_formed = (
                "<triplet>" not in window[1:]
                and not is_tag(entity1)
                and not is_tag(entity2)
                and not is_tag(relation)
            )
            if well_formed and type1 and type2:
                triplets.append({
                    'head': entity1,
                    'head_type': type1,
                    'type': relation,
                    'tail': entity2,
                    'tail_type': type2
                })
                logger.debug(f"添加三元组: {entity1}({type1}) - {relation} - {entity2}({type2})")
                i += 6
                continue
        # Not a (valid) triplet start: advance one token so that a valid
        # "<triplet>" right after a malformed one is still picked up.
        i += 1
    return triplets
 def extract_and_save_triplets(text: str, document_id: str, userid: str) -> bool:
    """
    Extract triplets from ``text`` with the mREBEL model and save them to disk.

    The text is split into chunks, each chunk is run through the model, the
    decoded output is parsed into triplets, duplicates (case-insensitive on
    head / relation / tail) are dropped, and the rest is written as
    tab-separated lines to ``<TRIPLES_OUTPUT_DIR>/<document_id>_<userid>.txt``.

    Args:
        text (str): Input text; must be non-empty.
        document_id (str): Document ID; non-empty and without underscores.
        userid (str): User ID; non-empty and without underscores.

    Returns:
        bool: True when the output file was written, False on invalid
        arguments or on any error (errors are logged, not raised).
    """
    try:
        if not (text and document_id and userid):
            raise ValueError("text、document_id 和 userid 不能为空")
        if any("_" in value for value in (document_id, userid)):
            raise ValueError("document_id 和 userid 不能包含下划线")
        started_at = time.time()
        logger.info(f"开始抽取文档 {document_id} 的三元组，userid: {userid}")
        # Split the text into sentence-aligned chunks.
        text_chunks = split_document(text, max_chunk_size=150)
        chunk_total = len(text_chunks)
        logger.debug(f"分割为 {chunk_total} 个文本块")
        # Run every chunk through the model and collect the parsed triplets.
        collected = []
        for chunk_no, chunk in enumerate(text_chunks, start=1):
            logger.debug(f"处理块 {chunk_no}/{chunk_total}: {chunk[:50]}...")
            # Tokenisation happens outside the per-chunk try: a failure here
            # aborts the whole extraction via the outer handler.
            model_inputs = tokenizer(
                chunk,
                max_length=256,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(device)
            try:
                generated_tokens = model.generate(
                    model_inputs["input_ids"],
                    attention_mask=model_inputs["attention_mask"],
                    **gen_kwargs,
                )
                for sentence in tokenizer.batch_decode(generated_tokens, skip_special_tokens=False):
                    logger.debug(f"块 {chunk_no} 生成文本: {sentence}")
                    found = extract_triplets_typed(sentence)
                    if not found:
                        continue
                    logger.debug(f"块 {chunk_no} 提取到 {len(found)} 个三元组")
                    collected.extend(found)
            except Exception as e:
                # A failing chunk is skipped; the remaining chunks are still processed.
                logger.warning(f"处理块 {chunk_no} 时出错: {str(e)}")
                continue
        # De-duplicate, keeping the first occurrence of each (head, relation, tail).
        seen = set()
        unique_triplets = []
        for t in collected:
            key = tuple(t[field].lower() for field in ('head', 'type', 'tail'))
            if key in seen:
                continue
            seen.add(key)
            unique_triplets.append(t)
        # Write the result file.
        output_file = os.path.join(TRIPLES_OUTPUT_DIR, f"{document_id}_{userid}.txt")
        try:
            with open(output_file, "w", encoding="utf-8") as f:
                f.writelines(
                    f"{t['head']}\t{t['type']}\t{t['tail']}\t{t['head_type']}\t{t['tail_type']}\n"
                    for t in unique_triplets
                )
            logger.info(f"文档 {document_id} 的 {len(unique_triplets)} 个三元组已保存到: {output_file}")
        except Exception as e:
            logger.error(f"保存文档 {document_id} 的三元组失败: {str(e)}")
            return False
        finished_at = time.time()
        logger.info(f"文档 {document_id} 三元组抽取完成，耗时: {finished_at - started_at:.2f} 秒")
        return True
    except Exception as e:
        logger.error(f"抽取或保存三元组失败: {str(e)}")
        import traceback
        logger.debug(traceback.format_exc())
        return False
 # Manual smoke test: extract triplets from a two-sentence sample and print the success flag.
 if __name__ == "__main__":
    # Test case
    test_text = "知识图谱是一个结构化的语义知识库。深度学习是基于深层神经网络的机器学习子集。"
    # Neither ID may contain an underscore (enforced by extract_and_save_triplets).
    document_id = "testdoc123"
    userid = "testuser1"
    result = extract_and_save_triplets(test_text, document_id, userid)
    print(f"抽取结果: {result}")
--- a/rag/fusedsearch.py
+++ b/rag/fusedsearch.py
@ -1,290 +0,0 @@
 import os
 import logging
 import yaml
 import numpy as np
 from typing import List, Dict, Any
 from pymilvus import Collection, utility
 from langchain_huggingface import HuggingFaceEmbeddings
 from vector import initialize_milvus_connection
 from searchquery import extract_entities, match_triplets
 from rerank import rerank_results
 import torch
 # Load the configuration file (the path can be overridden with the CONFIG_PATH environment variable).
 CONFIG_PATH = os.getenv('CONFIG_PATH', '/share/wangmeihua/rag/conf/milvusconfig.yaml')
 try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    TEXT_EMBEDDING_MODEL = config['models']['text_embedding_model']
 except Exception as e:
    raise RuntimeError(f"无法加载配置文件: {str(e)}")
 # Configure logging: existing handlers are removed and propagation is disabled
 # before this module attaches its own file and console handlers.
 logger = logging.getLogger(config['logging']['name'])
 logger.setLevel(getattr(logging, config['logging']['level'], logging.DEBUG))
 logger.handlers.clear()
 logger.propagate = False
 os.makedirs(os.path.dirname(config['logging']['file']), exist_ok=True)
 # Fail fast at import time if the log file cannot be opened for appending.
 try:
    with open(config['logging']['file'], 'a', encoding='utf-8') as f:
        pass
 except Exception as e:
    raise RuntimeError(f"日志文件 {config['logging']['file']} 不可写: {str(e)}")
 formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 file_handler = logging.FileHandler(config['logging']['file'], encoding='utf-8')
 file_handler.setFormatter(formatter)
 stream_handler = logging.StreamHandler()
 stream_handler.setFormatter(formatter)
 logger.addHandler(file_handler)
 logger.addHandler(stream_handler)
 # Initialise the embedding model (GPU when available, normalised output vectors).
 embedding = HuggingFaceEmbeddings(
    model_name=TEXT_EMBEDDING_MODEL,
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
 )
 # Sanity check at import time: the model must produce 1024-dimensional vectors.
 try:
    test_vector = embedding.embed_query("test")
    if len(test_vector) != 1024:
        raise ValueError(f"嵌入模型输出维度 {len(test_vector)} 不匹配预期 1024")
    logger.debug("嵌入模型加载成功")
 except Exception as e:
    logger.error(f"嵌入模型加载失败: {str(e)}")
    raise RuntimeError(f"嵌入模型加载失败: {str(e)}")
 # Fields requested from Milvus for every hit; all but "text" are returned as metadata.
 _FUSED_OUTPUT_FIELDS = ["text", "userid", "document_id", "filename", "file_path", "upload_time", "file_type"]
 _FUSED_SEARCH_PARAMS = {"metric_type": "COSINE", "params": {"nprobe": 10}}
 # Per-file hit count used by the triplet-related searches.
 # NOTE(review): the "no matched triplets" path uses the caller's ``limit`` while
 # these paths use a fixed 5; kept as-is, confirm whether that is intended.
 _FUSED_TRIPLET_LIMIT = 5
 def _search_user_file(collection, vector, userid: str, filename: str,
                       limit: int, offset: int, source: str) -> List[Dict[str, Any]]:
    """Vector-search the chunks of one user's file and return them as result dicts."""
    # NOTE(review): userid/filename are interpolated into the filter expression
    # unescaped; a quote character in either would break the expression.
    expr = f"userid == '{userid}' and filename == '{filename}'"
    milvus_results = collection.search(
        data=[vector],
        anns_field="vector",
        param=_FUSED_SEARCH_PARAMS,
        limit=limit,
        expr=expr,
        output_fields=_FUSED_OUTPUT_FIELDS,
        offset=offset
    )
    found = []
    for hits in milvus_results:
        for hit in hits:
            result = {
                "text": hit.entity.get("text"),
                "distance": hit.distance,
                "source": source,
                "metadata": {field: hit.entity.get(field) for field in _FUSED_OUTPUT_FIELDS[1:]}
            }
            found.append(result)
            logger.debug(f"召回: text={result['text'][:100]}..., distance={result['distance']}")
    return found
 def _triplet_phrases(matched_triplets) -> List[str]:
    """Render each complete triplet as "head relation tail"; incomplete ones are skipped."""
    phrases = []
    for triplet in matched_triplets:
        head = triplet['head']
        relation = triplet['type']
        tail = triplet['tail']
        if not head or not relation or not tail:
            logger.debug(f"无效三元组: {triplet}")
            continue
        phrases.append(f"{head} {relation} {tail}")
    return phrases
 def fused_search(
    query: str,
    userid: str,
    db_type: str,
    file_paths: List[str],
    limit: int = 10,
    offset: int = 0,
    use_rerank: bool = True
 ) -> List[Dict[str, Any]]:
    """
    Retrieve text chunks by fusing the query with knowledge-graph triplets.

    For every file, the triplets matching the query (via ``extract_entities``
    and ``match_triplets``) are appended to the query text, the fused text is
    embedded and used for a Milvus vector search restricted to that user and
    file. Files without usable triplets are searched with the plain query.
    Results from all files are de-duplicated by text, optionally reranked,
    sorted in descending score order and truncated to ``limit``.

    Args:
        query (str): Query text.
        userid (str): User ID; must not contain an underscore.
        db_type (str): Database type (e.g. 'textdb'); must not contain an underscore.
        file_paths (List[str]): File paths; only the base name is used for filtering.
        limit (int): Maximum number of results returned.
        offset (int): Offset passed to every Milvus search.
        use_rerank (bool): Whether to rerank the de-duplicated results.

    Returns:
        List[Dict[str, Any]]: Results with ``text``, ``distance``, ``source``
        and ``metadata``. An empty list is returned when the collection does
        not exist or when any error occurs (errors are logged, not raised).
    """
    try:
        logger.info(f"开始融合搜索: query={query}, userid={userid}, db_type={db_type}")
        # Argument validation
        if not query or not userid or not db_type or not file_paths:
            raise ValueError("query、userid、db_type 和 file_paths 不能为空")
        if "_" in userid or "_" in db_type:
            raise ValueError("userid 和 db_type 不能包含下划线")
        # Initialise the Milvus connection. The return value is not used below;
        # the reference is kept as in the original in case its lifetime matters.
        connections = initialize_milvus_connection()
        collection_name = f"ragdb_{db_type}"
        if not utility.has_collection(collection_name):
            logger.warning(f"集合 {collection_name} 不存在")
            return []
        collection = Collection(collection_name)
        collection.load()
        logger.debug(f"加载 Milvus 集合: {collection_name}")
        # Extract entities from the query once; they are reused for every file.
        query_entities = extract_entities(query)
        logger.debug(f"提取实体: {query_entities}")
        # Collect the hits of all files.
        results = []
        for file_path in file_paths:
            filename = os.path.basename(file_path)
            logger.debug(f"处理文件: {filename}")
            # Look up the document_id of this user's file.
            results_query = collection.query(
                expr=f"userid == '{userid}' and filename == '{filename}'",
                output_fields=["document_id"],
                limit=1
            )
            if not results_query:
                logger.warning(f"未找到 userid {userid} 和 filename {filename} 对应的文档")
                continue
            document_id = results_query[0]["document_id"]
            logger.debug(f"找到 document_id: {document_id}")
            # Fetch the triplets that match the query.
            matched_triplets = match_triplets(query, query_entities, userid, document_id)
            logger.debug(f"匹配三元组: {matched_triplets}")
            # No triplets at all: search with the plain query.
            if not matched_triplets:
                logger.debug(f"无匹配三元组，使用原查询: {query}")
                query_vector = embedding.embed_query(query)
                results.extend(_search_user_file(
                    collection, query_vector, userid, filename, limit, offset, "fused_query"))
                continue
            # Triplets were matched but none is complete: plain query as well.
            triplet_texts = _triplet_phrases(matched_triplets)
            if not triplet_texts:
                logger.debug(f"无有效三元组，使用原查询: {query}")
                query_vector = embedding.embed_query(query)
                results.extend(_search_user_file(
                    collection, query_vector, userid, filename, _FUSED_TRIPLET_LIMIT, offset, "fused_query"))
                continue
            # Build the fused text and embed it.
            fused_text = f"{query} {' '.join(triplet_texts)}"
            logger.debug(f"融合文本: {fused_text}")
            fused_vector = embedding.embed_query(fused_text)
            fused_vector = np.array(fused_vector) / np.linalg.norm(fused_vector)
            logger.debug(f"生成融合向量，维度: {len(fused_vector)}")
            results.extend(_search_user_file(
                collection, fused_vector, userid, filename, _FUSED_TRIPLET_LIMIT, offset,
                f"fused_triplets_{len(triplet_texts)}"))
        # De-duplicate by chunk text, keeping the first occurrence.
        unique_results = []
        seen_texts = set()
        for result in results:
            text = result['text']
            if text not in seen_texts:
                seen_texts.add(text)
                unique_results.append(result)
        logger.debug(f"去重后结果数量: {len(unique_results)}")
        # Optional reranking, sorted by rerank_score in descending order.
        if use_rerank and unique_results:
            logger.debug("开始重排序")
            reranked_results = rerank_results(query, unique_results)
            reranked_results = sorted(reranked_results, key=lambda x: x['rerank_score'], reverse=True)
            for i, result in enumerate(reranked_results):
                logger.debug(f"排序结果 {i+1}: text={result['text'][:100]}..., distance={result['distance']}, rerank_score={result['rerank_score']}")
            return reranked_results[:limit]
        # Otherwise sort by distance in descending order.
        sorted_results = sorted(unique_results, key=lambda x: x['distance'], reverse=True)
        for i, result in enumerate(sorted_results):
            logger.debug(f"排序结果 {i+1}: text={result['text'][:100]}..., distance={result['distance']}")
        return sorted_results[:limit]
    except Exception as e:
        logger.error(f"融合搜索失败: {str(e)}")
        import traceback
        logger.debug(traceback.format_exc())
        return []
 # Manual smoke test: run a fused search over three sample files and print every hit.
 if __name__ == "__main__":
    query = "知识图谱构建需要什么技术？"
    userid = "testuser1"
    db_type = "textdb"
    file_paths = [
        "/share/wangmeihua/rag/data/test.docx",
        "/share/wangmeihua/rag/data/zongshu.pdf",
        "/share/wangmeihua/rag/data/qianru.pdf"
    ]
    results = fused_search(query, userid, db_type, file_paths, limit=10, offset=0)
    for i, result in enumerate(results):
        print(f"Result {i+1}:")
        # Only the first 200 characters of each chunk are printed.
        print(f"Text: {result['text'][:200]}...")
        print(f"Distance: {result['distance']}")
        print(f"Source: {result['source']}")
        print(f"Metadata: {result['metadata']}\n")
--- a/rag/kdb.py
+++ b/rag/kdb.py
@ -1,81 +0,0 @@
 from traceback import format_exc
 from appPublic.uniqueID import getID
 from appPublic.timeUtils import curDateString
 from appPublic.dictObject import DictObject
 from appPublic.log import exception
 from sqlor.dbpools import DBPools
 from ahserver.serverenv import get_serverenv
 from ahserver.filestorage import FileStorage
 async def add_kdb(kdb:dict) -> None:
 	"""
 	Add a knowledge base (a root entry: ``parentid`` is forced to None).

 	``id`` is generated when missing, ``entity_type`` is set to '0' and
 	``create_date`` to the current date string before the record is
 	inserted into the ``kdb`` table of the 'rag' module database.

 	Raises:
 		Exception: when ``orgid`` is missing.
 	"""
 	kdb = DictObject(**kdb)
 	kdb.parentid = None
 	if kdb.id is None:
 		kdb.id = getID()
 	kdb.entity_type = '0'
 	kdb.create_date = curDateString()
 	if kdb.orgid is None:
 		e = Exception('Can not add none orgid kdb')
 		exception(f'{e}\n{format_exc()}')
 		raise e
 	f = get_serverenv('get_module_dbname')
 	dbname = f('rag')
 	db = DBPools()
 	async with db.sqlorContext(dbname) as sor:
 		# The insert is a method of the sqlor context object (same object whose
 		# ``R`` is used by get_all_docs); a bare ``C`` is not defined anywhere.
 		await sor.C('kdb', kdb.copy())
 async def add_dir(kdb:dict) -> None:
 	"""
 	Add a sub-directory below an existing knowledge base or directory.

 	``id`` is generated when missing, ``entity_type`` is set to '1' and
 	``create_date`` to the current date string before the record is
 	inserted into the ``kdb`` table of the 'rag' module database.

 	Raises:
 		Exception: when ``parentid`` is missing (a directory cannot be a root).
 	"""
 	kdb = DictObject(**kdb)
 	if kdb.parentid is None:
 		e = Exception('Can not add root folder')
 		exception(f'{e}\n{format_exc()}')
 		raise e
 	if kdb.id is None:
 		kdb.id = getID()
 	kdb.entity_type = '1'
 	kdb.create_date = curDateString()
 	f = get_serverenv('get_module_dbname')
 	dbname = f('rag')
 	db = DBPools()
 	async with db.sqlorContext(dbname) as sor:
 		# The insert is a method of the sqlor context object; a bare ``C`` is undefined.
 		await sor.C('kdb', kdb.copy())
 async def add_doc(doc:dict) -> None:
 	"""
 	Add a document below an existing knowledge base or directory.

 	``id`` is generated when missing, ``realpath`` is resolved from
 	``webpath`` through ``FileStorage`` and ``create_date`` is set to the
 	current date string before the record is inserted into the ``doc``
 	table of the 'rag' module database.

 	Raises:
 		Exception: when ``parentid`` is missing (a document cannot be a root).
 	"""
 	doc = DictObject(**doc)
 	if doc.parentid is None:
 		e = Exception('Can not add root document')
 		exception(f'{e}\n{format_exc()}')
 		raise e
 	if doc.id is None:
 		doc.id = getID()
 	fs = FileStorage()
 	doc.realpath = fs.realPath(doc.webpath)
 	doc.create_date = curDateString()
 	f = get_serverenv('get_module_dbname')
 	dbname = f('rag')
 	db = DBPools()
 	async with db.sqlorContext(dbname) as sor:
 		# The insert is a method of the sqlor context object; a bare ``C`` is undefined.
 		await sor.C('doc', doc.copy())
 async def get_all_docs(sor, kdbid):
 	"""
 	Return all documents below ``kdbid``, including those in sub-directories.

 	Args:
 		sor: database context exposing an awaitable ``R(table, filter)`` read.
 		kdbid: id of the knowledge base or directory to start from.

 	Returns:
 		The documents directly below ``kdbid`` followed by the documents of
 		every sub-directory, collected depth-first.
 	"""
 	docs = await sor.R('doc', {'parentid':kdbid})
 	kdbs = await sor.R('kdb', {'parentid':kdbid})
 	for kdb in kdbs:
 		# The database context must be passed down explicitly on recursion.
 		docs += await get_all_docs(sor, kdb.id)
 	return docs
--- a/rag/kgc.py
+++ b/rag/kgc.py
@ -1,194 +0,0 @@
 import os
 import logging
 import re
 from py2neo import Graph, Node, Relationship
 from typing import Set, List, Dict, Tuple
 from ufw.common import share_dir
 # Configure logging: root logger at INFO with a timestamp/level/message format;
 # this module logs through a logger named after the module.
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 class KnowledgeGraph:
    """Import a tab-separated triplet file into Neo4j.

    Every line of the file is ``head<TAB>relation<TAB>tail<TAB>head_type<TAB>tail_type``.
    Nodes are labelled after their (normalised) entity type and carry the
    ``name`` and ``document_id`` properties; relationships are typed after the
    (cleaned) relation and carry a ``name`` property.
    """
    def __init__(self, data_path: str, document_id: str = None):
        """Connect to Neo4j and remember the triplet file to import.

        Args:
            data_path: Path of the triplet file.
            document_id: Document ID stored on every node; defaults to the
                part of the file name before the first underscore.

        Raises:
            ValueError: if ``data_path`` does not exist.
        """
        self.data_path = data_path
        self.document_id = document_id or os.path.basename(data_path).split('_')[0]
        # NOTE(review): server address and credentials are hard-coded in source;
        # they should be moved to configuration and the password rotated.
        self.g = Graph("bolt://10.18.34.18:7687", auth=('neo4j', '261229..wmh'))
        logger.info(f"开始构建知识图谱，data_path: {self.data_path}, document_id: {self.document_id}")
        # Validate that data_path exists.
        if not os.path.exists(self.data_path):
            logger.error(f"数据路径 {self.data_path} 不存在")
            raise ValueError(f"数据路径 {self.data_path} 不存在")
    @staticmethod
    def _quote_identifier(identifier: str) -> str:
        """Backtick-quote a label or relationship type for use inside Cypher text."""
        return "`" + identifier.replace("`", "``") + "`"
    def _normalize_label(self, entity_type: str) -> str:
        """Normalise an entity type into a Neo4j label (``Entity`` when nothing usable remains)."""
        if not entity_type or not entity_type.strip():
            return 'Entity'
        entity_type = re.sub(r'[^\w\s]', '', entity_type.strip())
        words = entity_type.split()
        label = '_'.join(word.capitalize() for word in words if word)
        return label or 'Entity'
    def _clean_relation(self, relation: str) -> Tuple[str, str]:
        """Clean a relation and return ``(rel_type, rel_name)``.

        ``rel_type`` is the upper-cased, underscore-joined relation without
        punctuation; ``rel_name`` is the display name. A relation wrapped in
        ``<...>`` is unwrapped. For unwrapped relations, 'instance of',
        'subclass of' and 'part of' map to fixed type/name pairs.
        """
        relation = relation.strip()
        if not relation:
            return 'RELATED_TO', '相关'
        bracketed = relation.startswith('<') and relation.endswith('>')
        rel_name = relation[1:-1] if bracketed else relation
        rel_type = re.sub(r'[^\w\s]', '', rel_name).replace(' ', '_').upper()
        if not bracketed:
            lowered = relation.lower()
            canonical = (
                ('instance of', 'INSTANCE_OF', '实例'),
                ('subclass of', 'SUBCLASS_OF', '子类'),
                ('part of', 'PART_OF', '部分'),
            )
            for phrase, canon_type, canon_name in canonical:
                if phrase in lowered:
                    rel_type, rel_name = canon_type, canon_name
                    break
        # A relation made only of punctuation would leave an empty type, which
        # Cypher cannot create; fall back to the generic type instead.
        rel_type = rel_type or 'RELATED_TO'
        logger.debug(f"处理关系: {relation} -> {rel_type} ({rel_name})")
        return rel_type, rel_name
    def read_nodes(self) -> Tuple[Dict[str, Set], Dict[str, List], List[Dict]]:
        """Read the triplet file.

        Returns:
            ``(nodes_by_label, relations_by_type, triples)``: node names grouped
            by label, relation dicts grouped by relationship type, and the raw
            triples. Blank lines, ``#`` comment lines and lines without exactly
            five tab-separated fields are skipped.

        Raises:
            RuntimeError: if the file cannot be read or parsed.
        """
        nodes_by_label = {}
        relations_by_type = {}
        triples = []
        try:
            logger.debug(f"尝试读取文件: {self.data_path}")
            with open(self.data_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue
                    parts = line.split('\t')
                    if len(parts) != 5:
                        logger.warning(f"无效行: {line}")
                        continue
                    head, relation, tail, head_type, tail_type = parts
                    head_label = self._normalize_label(head_type)
                    tail_label = self._normalize_label(tail_type)
                    logger.debug(f"实体类型: {head_type} -> {head_label}, {tail_type} -> {tail_label}")
                    nodes_by_label.setdefault(head_label, set()).add(head)
                    nodes_by_label.setdefault(tail_label, set()).add(tail)
                    rel_type, rel_name = self._clean_relation(relation)
                    relations_by_type.setdefault(rel_type, []).append({
                        'head': head,
                        'tail': tail,
                        'head_label': head_label,
                        'tail_label': tail_label,
                        'rel_name': rel_name
                    })
                    triples.append({
                        'head': head,
                        'relation': relation,
                        'tail': tail,
                        'head_type': head_type,
                        'tail_type': tail_type
                    })
            logger.info(f"读取节点: {sum(len(nodes) for nodes in nodes_by_label.values())} 个")
            logger.info(f"读取关系: {sum(len(rels) for rels in relations_by_type.values())} 条")
            return nodes_by_label, relations_by_type, triples
        except Exception as e:
            logger.error(f"读取数据失败: {str(e)}，data_path: {self.data_path}")
            raise RuntimeError(f"读取数据失败: {str(e)}") from e
    def create_node(self, label: str, nodes: Set[str]):
        """Create the nodes of one label that do not exist yet for this document.

        Returns:
            The number of nodes created. Failures are logged and skipped.
        """
        count = 0
        # Values travel as query parameters so that names containing quotes
        # cannot break or alter the Cypher statement; only the label is inlined.
        query = (
            f"MATCH (n:{self._quote_identifier(label)} "
            f"{{name: $name, document_id: $document_id}}) RETURN n"
        )
        for node_name in nodes:
            try:
                if self.g.run(query, {'name': node_name, 'document_id': self.document_id}).data():
                    continue
                node = Node(label, name=node_name, document_id=self.document_id)
                self.g.create(node)
                count += 1
                logger.debug(f"创建节点: {label} - {node_name} (document_id: {self.document_id})")
            except Exception as e:
                logger.error(f"创建节点失败: {label} - {node_name}, 错误: {str(e)}")
        logger.info(f"创建 {label} 节点: {count}/{len(nodes)} 个")
        return count
    def create_relationship(self, rel_type: str, relations: List[Dict]):
        """Create the relationships of one type, skipping duplicates within ``relations``.

        Returns:
            The number of CREATE statements that ran without error. Failures
            are logged and skipped.
        """
        count = 0
        total = len(relations)
        seen_edges = set()
        for rel in relations:
            head, tail, head_label, tail_label, rel_name = (
                rel['head'], rel['tail'], rel['head_label'], rel['tail_label'], rel['rel_name']
            )
            edge_key = f"{head_label}:{head}###{tail_label}:{tail}###{rel_type}"
            if edge_key in seen_edges:
                continue
            seen_edges.add(edge_key)
            # Labels and the relationship type cannot be parameterised in
            # Cypher, so they are backtick-quoted; all values are parameters.
            query = (
                f"MATCH (p:{self._quote_identifier(head_label)} {{name: $head, document_id: $document_id}}), "
                f"(q:{self._quote_identifier(tail_label)} {{name: $tail, document_id: $document_id}}) "
                f"CREATE (p)-[r:{self._quote_identifier(rel_type)} {{name: $rel_name}}]->(q)"
            )
            params = {
                'head': head,
                'tail': tail,
                'document_id': self.document_id,
                'rel_name': rel_name,
            }
            try:
                self.g.run(query, params)
                count += 1
                logger.debug(f"创建关系: {head} -[{rel_type}]-> {tail} (document_id: {self.document_id})")
            except Exception as e:
                logger.error(f"创建关系失败: {query}, params={params}, 错误: {str(e)}")
        logger.info(f"创建 {rel_type} 关系: {count}/{total} 条")
        return count
    def create_graphnodes(self):
        """Create all nodes found in the triplet file and return how many were created."""
        nodes_by_label, _, _ = self.read_nodes()
        total = 0
        for label, nodes in nodes_by_label.items():
            total += self.create_node(label, nodes)
        logger.info(f"总计创建节点: {total} 个")
        return total
    def create_graphrels(self):
        """Create all relationships found in the triplet file and return how many were created."""
        _, relations_by_type, _ = self.read_nodes()
        total = 0
        for rel_type, relations in relations_by_type.items():
            total += self.create_relationship(rel_type, relations)
        logger.info(f"总计创建关系: {total} 条")
        return total
    def export_data(self):
        """Write one ``dict/<label>.txt`` file per label with ``name<TAB>document_id`` lines.

        The ``dict`` directory is relative to the current working directory;
        an existing file for a label is overwritten.
        """
        nodes_by_label, _, _ = self.read_nodes()
        os.makedirs('dict', exist_ok=True)
        for label, nodes in nodes_by_label.items():
            with open(f'dict/{label.lower()}.txt', 'w', encoding='utf-8') as f:
                f.write('\n'.join(f"{name}\t{self.document_id}" for name in sorted(nodes)))
            logger.info(f"导出 {label} 节点到 dict/{label.lower()}.txt: {len(nodes)} 个")
        return
 # Manual run: import one triplet file into Neo4j, then export the node lists.
 if __name__ == '__main__':
    data_path = '/share/wangmeihua/rag/triples/26911c68-9107-4bb4-8f31-ff776991a119_testuser2.txt'
    # No document_id is passed, so it is derived from the file name
    # (the part before the first underscore).
    handler = KnowledgeGraph(data_path)
    logger.info("Step 1: 导入图谱节点中")
    handler.create_graphnodes()
    logger.info("Step 2: 导入图谱边中")
    handler.create_graphrels()
    logger.info("Step 3: 导出数据")
    handler.export_data()
--- a/rag/query.py
+++ b/rag/query.py
@ -1,201 +0,0 @@
 import os
 import yaml
 import logging
 from typing import List, Dict
 from pymilvus import connections, Collection, utility
 from langchain_huggingface import HuggingFaceEmbeddings
 from vector import initialize_milvus_connection, cleanup_milvus_connection
 import torch
 # Load the YAML configuration; the path can be overridden with the CONFIG_PATH environment variable
 CONFIG_PATH = os.getenv('CONFIG_PATH', '/share/wangmeihua/rag/conf/milvusconfig.yaml')
 try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    MILVUS_DB_PATH = config['database']['milvus_db_path']
    TEXT_EMBEDDING_MODEL = config['models']['text_embedding_model']
 except Exception as e:
    print(f"加载配置文件 {CONFIG_PATH} 失败: {str(e)}")
    raise RuntimeError(f"无法加载配置文件: {str(e)}") from e
 # Configure logging: one file handler plus one console handler sharing a formatter
 logger = logging.getLogger(config['logging']['name'])
 logger.setLevel(getattr(logging, config['logging']['level'], logging.DEBUG))
 logger.handlers.clear()  # drop handlers attached earlier so records are not emitted twice
 logger.propagate = False  # do not forward records to ancestor loggers
 # os.path.dirname() is '' for a bare file name and os.makedirs('') raises
 # FileNotFoundError, so only create the directory when the path has one.
 _log_dir = os.path.dirname(config['logging']['file'])
 if _log_dir:
    os.makedirs(_log_dir, exist_ok=True)
 formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 file_handler = logging.FileHandler(config['logging']['file'], encoding='utf-8')
 file_handler.setFormatter(formatter)
 stream_handler = logging.StreamHandler()
 stream_handler.setFormatter(formatter)
 logger.addHandler(file_handler)
 logger.addHandler(stream_handler)
 def search_query(query: str, userid: str, db_type: str, file_paths: List[str], limit: int = 5, offset: int = 0) -> List[Dict]:
    """
    Search the ``ragdb_<db_type>`` Milvus collection for the chunks most similar to ``query``,
    restricted to rows of ``userid`` whose ``filename`` matches one of ``file_paths``.

    Args:
        query (str): Query text entered by the user.
        userid (str): User ID used in the filter expression.
        db_type (str): Database type (e.g. 'textdb'); selects the collection ``ragdb_<db_type>``.
        file_paths (List[str]): One or more document paths; only their base names are used for filtering.
        limit (int): Maximum number of results to return, default 5.
        offset (int): Pagination offset, default 0.

    Returns:
        List[Dict]: One dict per hit with the keys ``text``, ``distance`` and ``metadata``.
        An empty list is returned when the collection does not exist.

    Raises:
        ValueError: An argument is invalid or the embedding model path does not exist.
        RuntimeError: Loading the embedding model or a Milvus operation failed.
    """
    try:
        # Argument validation
        if not query:
            raise ValueError("查询文本不能为空")
        if not userid or not db_type:
            raise ValueError("userid 和 db_type 不能为空")
        if "_" in userid or "_" in db_type:
            raise ValueError("userid 和 db_type 不能包含下划线")
        # userid and the file names are embedded in single-quoted literals of the
        # filter expression below; a quote or backslash would break out of the
        # literal and could rewrite the filter, so reject them up front.
        unsafe_chars = ("'", "\\")
        if any(ch in userid for ch in unsafe_chars):
            raise ValueError("userid 不能包含单引号或反斜杠")
        if len(userid) > 100 or len(db_type) > 100:
            raise ValueError("userid 或 db_type 的长度超出限制")
        if limit <= 0 or limit > 16384:
            raise ValueError("limit 必须在 1 到 16384 之间")
        if offset < 0:
            raise ValueError("offset 不能为负数")
        if limit + offset > 16384:
            raise ValueError("limit + offset 不能超过 16384")
        if not file_paths:
            raise ValueError("file_paths 不能为空")
        for file_path in file_paths:
            if not isinstance(file_path, str):
                raise ValueError(f"file_path 必须是字符串: {file_path}")
            basename = os.path.basename(file_path)
            if len(basename) > 255:
                raise ValueError(f"文件名长度超出 255 个字符: {file_path}")
            if "_" in basename:
                raise ValueError(f"文件名 {file_path} 不能包含下划线")
            if any(ch in basename for ch in unsafe_chars):
                raise ValueError(f"文件名 {file_path} 不能包含单引号或反斜杠")
        # Initialise the embedding model
        # NOTE(review): the model is re-created on every call; consider caching it.
        model_path = TEXT_EMBEDDING_MODEL
        if not os.path.exists(model_path):
            raise ValueError(f"模型路径 {model_path} 不存在")
        embedding = HuggingFaceEmbeddings(
            model_name=model_path,
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        try:
            # Probe the model once and check the expected vector size
            test_vector = embedding.embed_query("test")
            if len(test_vector) != 1024:
                raise ValueError(f"嵌入模型输出维度 {len(test_vector)} 不匹配预期 1024")
            logger.debug("嵌入模型加载成功")
        except Exception as e:
            logger.error(f"嵌入模型加载失败: {str(e)}")
            raise RuntimeError(f"嵌入模型加载失败: {str(e)}") from e
        # Convert the query into a vector
        query_vector = embedding.embed_query(query)
        logger.debug(f"查询向量维度: {len(query_vector)}")
        # Connect to Milvus
        initialize_milvus_connection()
        # Nothing to search when the collection does not exist
        collection_name = f"ragdb_{db_type}"
        if not utility.has_collection(collection_name):
            logger.warning(f"集合 {collection_name} 不存在")
            return []
        # Load the collection
        try:
            collection = Collection(collection_name)
            collection.load()
            logger.debug(f"加载集合: {collection_name}")
        except Exception as e:
            logger.error(f"加载集合 {collection_name} 失败: {str(e)}")
            raise RuntimeError(f"加载集合失败: {str(e)}") from e
        # Search parameters
        search_params = {
            "metric_type": "COSINE",
            "params": {"nprobe": 10}
        }
        # Filter expression: this user's rows, limited to the requested file names
        filenames = [os.path.basename(file_path) for file_path in file_paths]
        filename_expr = " or ".join([f"filename == '{filename}'" for filename in filenames])
        expr = f"userid == '{userid}' and ({filename_expr})"
        logger.debug(f"搜索参数: {search_params}, 表达式: {expr}, limit: {limit}, offset: {offset}")
        # Run the search
        try:
            results = collection.search(
                data=[query_vector],
                anns_field="vector",
                param=search_params,
                limit=limit,
                expr=expr,
                output_fields=["text", "userid", "document_id", "filename", "file_path", "upload_time", "file_type"],
                offset=offset
            )
        except Exception as e:
            logger.error(f"搜索失败: {str(e)}")
            raise RuntimeError(f"搜索失败: {str(e)}") from e
        # Flatten the hits into plain dicts
        search_results = []
        for hits in results:
            for hit in hits:
                metadata = {
                    "userid": hit.entity.get("userid"),
                    "document_id": hit.entity.get("document_id"),
                    "filename": hit.entity.get("filename"),
                    "file_path": hit.entity.get("file_path"),
                    "upload_time": hit.entity.get("upload_time"),
                    "file_type": hit.entity.get("file_type")
                }
                result = {
                    "text": hit.entity.get("text"),
                    "distance": hit.distance,
                    "metadata": metadata
                }
                search_results.append(result)
                # ``text`` may be None; a debug log line must not abort the search
                logger.debug(f"命中: text: {(result['text'] or '')[:200]}..., 距离: {hit.distance}, 元数据: {metadata}")
        logger.debug(f"搜索完成，返回 {len(search_results)} 条结果")
        return search_results
    except Exception as e:
        logger.error(f"搜索失败: {str(e)}")
        import traceback
        logger.debug(traceback.format_exc())
        raise
    finally:
        cleanup_milvus_connection()
 if __name__ == "__main__":
    # Manual smoke test: run one search over two sample documents and print the hits
    query = "知识图谱的知识融合是什么？"
    userid = "testuser2"
    db_type = "textdb"
    file_paths = [
        "/share/wangmeihua/rag/data/test.docx",
        "/share/wangmeihua/rag/data/test.txt",
    ]
    limit, offset = 5, 0
    separator = "-" * 50
    try:
        results = search_query(query, userid, db_type, file_paths, limit=limit, offset=offset)
        print(f"搜索结果 ({len(results)} 条):")
        for rank, hit in enumerate(results, start=1):
            print(f"结果 {rank}:")
            print(f"内容: {hit['text'][:200]}...")
            print(f"距离: {hit['distance']}")
            print(f"元数据: {hit['metadata']}")
            print(separator)
    except Exception as exc:
        print(f"搜索失败: {str(exc)}")
--- a/rag/rerank.py
+++ b/rag/rerank.py
@ -1,80 +0,0 @@
 import os
 import yaml
 import logging
 from typing import List, Dict
 from pymilvus.model.reranker import BGERerankFunction
 import torch
 # Load the YAML configuration; the path can be overridden with the CONFIG_PATH environment variable
 CONFIG_PATH = os.getenv('CONFIG_PATH', '/share/wangmeihua/rag/conf/milvusconfig.yaml')
 try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
 except Exception as e:
    print(f"加载配置文件 {CONFIG_PATH} 失败: {str(e)}")
    raise RuntimeError(f"加载配置文件: {str(e)}") from e
 # Configure logging: one file handler plus one console handler sharing a formatter
 logger = logging.getLogger(config['logging']['name'])
 logger.setLevel(getattr(logging, config['logging']['level'], logging.DEBUG))
 logger.handlers.clear()  # drop handlers attached earlier
 logger.propagate = False  # do not forward records to ancestor loggers
 # os.path.dirname() is '' for a bare file name and os.makedirs('') raises
 # FileNotFoundError, so only create the directory when the path has one.
 _log_dir = os.path.dirname(config['logging']['file'])
 if _log_dir:
    os.makedirs(_log_dir, exist_ok=True)
 formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 file_handler = logging.FileHandler(config['logging']['file'], encoding='utf-8')
 file_handler.setFormatter(formatter)
 stream_handler = logging.StreamHandler()
 stream_handler.setFormatter(formatter)
 logger.addHandler(file_handler)
 logger.addHandler(stream_handler)
 def rerank_results(query: str, results: List[Dict], top_k: int = 10) -> List[Dict]:
    """
    Re-rank retrieved text chunks against ``query`` with the BGE reranker model.

    Args:
        query (str): Query text.
        results (List[Dict]): Candidate results; each must have a ``text`` key and
            may carry ``distance``, ``source`` and ``metadata``.
        top_k (int): Maximum number of results to return, default 10.

    Returns:
        List[Dict]: Shallow copies of the selected inputs in reranker order, each with
        an added ``rerank_score``. The original ``results`` list is returned unchanged
        when it is empty or when reranking fails (the failure is logged).
    """
    model_path = "/share/models/BAAI/bge-reranker-v2-m3"
    try:
        # Collect the chunk texts first so the model is not loaded when there is nothing to rank
        documents = [result['text'] for result in results]
        if not documents:
            logger.warning("无文本块可重排序")
            return results
        # Initialise the BGE reranker
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        bge_rf = BGERerankFunction(model_name=model_path, device=device)
        logger.debug(f"BGE Reranker 初始化成功，模型路径: {model_path}, 设备: {device}")
        # Re-rank
        ranked = bge_rf(
            query=query,
            documents=documents,
            top_k=min(top_k, len(documents))
        )
        # Map the ranked items back onto the original result dicts
        reranked = []
        for item in ranked:
            entry = results[item.index].copy()
            entry['rerank_score'] = item.score
            reranked.append(entry)
            # ``source`` is optional: indexing it raised KeyError for results without
            # that key, which discarded the whole reranking via the handler below.
            logger.debug(f"重排序结果: text={item.text[:200]}..., rerank_score={item.score:.6f}, source={entry.get('source')}")
        logger.info(f"重排序返回 {len(reranked)} 条结果")
        return reranked
    except Exception as e:
        logger.error(f"重排序失败: {str(e)}")
        import traceback
        logger.debug(traceback.format_exc())
        # Fall back to the original results
        return results
--- a/rag/searchquery.py
+++ b/rag/searchquery.py
@ -1,363 +0,0 @@
 import os
 import yaml
 import logging
 from typing import List, Dict
 from pymilvus import connections, Collection, utility
 from langchain_huggingface import HuggingFaceEmbeddings
 import numpy as np
 from scipy.spatial.distance import cosine
 from ltp import LTP
 from vector import initialize_milvus_connection, cleanup_milvus_connection
 import torch
 # Load the YAML configuration; the path can be overridden with the CONFIG_PATH environment variable
 CONFIG_PATH = os.getenv('CONFIG_PATH', '/share/wangmeihua/rag/conf/milvusconfig.yaml')
 try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    MILVUS_DB_PATH = config['database']['milvus_db_path']
    TEXT_EMBEDDING_MODEL = config['models']['text_embedding_model']
 except Exception as e:
    print(f"加载配置文件 {CONFIG_PATH} 失败: {str(e)}")
    raise RuntimeError(f"加载配置文件失败: {str(e)}") from e
 # Configure logging: one file handler plus one console handler sharing a formatter
 logger = logging.getLogger(config['logging']['name'])
 logger.setLevel(getattr(logging, config['logging']['level'], logging.DEBUG))
 logger.handlers.clear()  # drop handlers attached earlier so records are not emitted twice
 logger.propagate = False  # do not forward records to ancestor loggers
 # os.path.dirname() is '' for a bare file name and os.makedirs('') raises
 # FileNotFoundError, so only create the directory when the path has one.
 _log_dir = os.path.dirname(config['logging']['file'])
 if _log_dir:
    os.makedirs(_log_dir, exist_ok=True)
 formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 file_handler = logging.FileHandler(config['logging']['file'], encoding='utf-8')
 file_handler.setFormatter(formatter)
 stream_handler = logging.StreamHandler()
 stream_handler.setFormatter(formatter)
 logger.addHandler(file_handler)
 logger.addHandler(stream_handler)
 # Directory where the per-document triplet files are stored
 TRIPLES_OUTPUT_DIR = '/share/wangmeihua/rag/triples'
 # Initialise the embedding model once at import time (GPU when available, normalised vectors)
 embedding = HuggingFaceEmbeddings(
    model_name=TEXT_EMBEDDING_MODEL,
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
 )
 # Probe the model with a dummy query; the module refuses to import unless it yields 1024-dim vectors
 try:
    test_vector = embedding.embed_query("test")
    if len(test_vector) != 1024:
        raise ValueError(f"嵌入模型输出维度 {len(test_vector)} 不匹配预期 1024")
    logger.debug("嵌入模型加载成功")
 except Exception as e:
    logger.error(f"嵌入模型加载失败: {str(e)}")
    raise RuntimeError(f"嵌入模型加载失败: {str(e)}")
 # Initialise the LTP model
 try:
    model_path = "/share/models/LTP/small"
    # Fall back to the Hugging Face model id when the local directory is missing
    if not os.path.isdir(model_path):
        logger.warning(f"本地模型路径 {model_path} 不存在，尝试使用 Hugging Face 模型 'hit-scir/ltp-small'")
        model_path = "hit-scir/ltp-small"
    ltp = LTP(pretrained_model_name_or_path=model_path)
    # Move the model to the GPU when one is available
    if torch.cuda.is_available():
        ltp.to("cuda")
    logger.debug("LTP 模型加载成功")
 except Exception as e:
    logger.error(f"加载 LTP 模型失败: {str(e)}")
    raise RuntimeError(f"加载 LTP 模型失败: {str(e)}")
 def extract_entities(query: str) -> List[str]:
    """
    Extract candidate entities from the query text with LTP.

    The result contains, in this order and without duplicates:
    - every entity reported by LTP NER (all types);
    - each maximal run of consecutive words tagged 'n', concatenated into one
      string (e.g. '苹果 公司' -> '苹果公司'); a lone noun is a run of length one;
    - every word tagged 'v'.

    Args:
        query (str): Query text.

    Returns:
        List[str]: The unique entities; an empty list when the query is empty or
        extraction fails (the failure is logged).
    """
    try:
        if not query:
            raise ValueError("查询文本不能为空")
        # One pipeline call yields word segmentation, POS tags and NER results
        result = ltp.pipeline([query], tasks=["cws", "pos", "ner"])
        words = result.cws[0]
        pos_list = result.pos[0]
        ner = result.ner[0]
        # Step 1: NER entities of every type
        logger.debug(f"NER 结果: {ner}")
        entities = [entity for _entity_type, entity, _start, _end in ner]
        # Step 2: merge runs of consecutive nouns. The trailing sentinel flushes a
        # run that reaches the end of the sentence.
        subword_set = set()  # words that were part of a noun run (kept for the debug log)
        noun_run = []
        for word, pos in [*zip(words, pos_list), ("", None)]:
            if pos == 'n':
                noun_run.append(word)
                continue
            if noun_run:
                combined = "".join(noun_run)
                if combined:
                    entities.append(combined)
                    subword_set.update(noun_run)
                    logger.debug(f"合并连续名词: {combined}, 子词: {noun_run}")
                noun_run = []
        logger.debug(f"连续名词子词集合: {subword_set}")
        # Every noun belongs to exactly one run above, so no separate pass over
        # single nouns is needed (the former pass could never add anything).
        # Step 3: verbs
        entities.extend(word for word, pos in zip(words, pos_list) if pos == 'v')
        # De-duplicate while keeping first-seen order
        unique_entities = list(dict.fromkeys(entities))
        logger.info(f"从查询中提取到 {len(unique_entities)} 个唯一实体: {unique_entities}")
        return unique_entities
    except Exception as e:
        logger.error(f"实体抽取失败: {str(e)}")
        return []
 def load_triplets_from_file(triplet_file: str) -> List[Dict]:
    """Read tab-separated triplets from ``triplet_file``.

    Each non-blank line with at least five tab-separated fields
    (head, relation, tail, head_type, tail_type) becomes one dict with the keys
    ``head``, ``head_type``, ``type``, ``tail`` and ``tail_type``; shorter lines
    are ignored. Returns an empty list when the file is missing or cannot be read.
    """
    loaded = []
    try:
        if not os.path.exists(triplet_file):
            logger.warning(f"三元组文件 {triplet_file} 不存在")
            return []
        with open(triplet_file, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                fields = stripped.split('\t')
                if len(fields) < 5:
                    continue
                head, relation, tail, head_type, tail_type = fields[:5]
                record = {
                    'head': head,
                    'head_type': head_type,
                    'type': relation,
                    'tail': tail,
                    'tail_type': tail_type,
                }
                loaded.append(record)
        logger.debug(f"从 {triplet_file} 加载 {len(loaded)} 个三元组")
        return loaded
    except Exception as e:
        logger.error(f"加载三元组文件 {triplet_file} 失败: {str(e)}")
        return []
 def match_triplets(query: str, query_entities: List[str], userid: str, document_id: str) -> List[Dict]:
    """
    Match query entities against a document's triplets by embedding similarity.

    A triplet matches when the cosine similarity between a query entity and the
    triplet's head or tail is at least 0.8.

    Args:
        query (str): Original query text (currently unused).
        query_entities (List[str]): Entities extracted from the query.
        userid (str): User ID; part of the triplet file name.
        document_id (str): Document ID; part of the triplet file name.

    Returns:
        List[Dict]: The matching triplets, de-duplicated case-insensitively on
        (head, type, tail) in first-match order; an empty list when the document
        has no triplets or on any error (the error is logged).
    """
    matched_triplets = []
    ENTITY_SIMILARITY_THRESHOLD = 0.8  # minimum entity-to-head/tail similarity
    try:
        # Load the triplets stored for this document and user
        triplet_file = os.path.join(TRIPLES_OUTPUT_DIR, f"{document_id}_{userid}.txt")
        doc_triplets = load_triplets_from_file(triplet_file)
        if not doc_triplets:
            logger.debug(f"文档 document_id={document_id} 无三元组")
            return []
        # Embed every distinct string once. Recomputing the head/tail embeddings
        # for each query entity cost entities x triplets x 2 model calls.
        vector_cache = {}

        def _vector_for(text):
            if text not in vector_cache:
                vector_cache[text] = embedding.embed_query(text)
            return vector_cache[text]

        # Compare every entity with every triplet's head and tail
        for entity in query_entities:
            entity_vec = _vector_for(entity)
            for d_triplet in doc_triplets:
                head_similarity = 1 - cosine(entity_vec, _vector_for(d_triplet['head']))
                tail_similarity = 1 - cosine(entity_vec, _vector_for(d_triplet['tail']))
                if head_similarity >= ENTITY_SIMILARITY_THRESHOLD or tail_similarity >= ENTITY_SIMILARITY_THRESHOLD:
                    matched_triplets.append(d_triplet)
                    logger.debug(f"匹配三元组: {d_triplet['head']} - {d_triplet['type']} - {d_triplet['tail']} "
                                 f"(entity={entity}, head_sim={head_similarity:.2f}, tail_sim={tail_similarity:.2f})")
        # De-duplicate, keeping the first occurrence
        unique_matched = []
        seen = set()
        for t in matched_triplets:
            identifier = (t['head'].lower(), t['type'].lower(), t['tail'].lower())
            if identifier not in seen:
                seen.add(identifier)
                unique_matched.append(t)
        logger.info(f"找到 {len(unique_matched)} 个匹配的三元组")
        return unique_matched
    except Exception as e:
        logger.error(f"匹配三元组失败: {str(e)}")
        return []
 def searchquery(query: str, userid: str, db_type: str, file_paths: List[str], limit: int = 5, offset: int = 0) -> List[Dict]:
    """
    Entity-driven retrieval: extract entities from ``query``, match them against the
    triplets stored for each requested document, then search the ``ragdb_<db_type>``
    Milvus collection once per matched triplet.

    Args:
        query (str): Query text.
        userid (str): User ID used in the filter expressions.
        db_type (str): Database type; selects the collection ``ragdb_<db_type>``.
        file_paths (List[str]): Document paths; only their base names are used.
        limit (int): Maximum number of results, default 5.
        offset (int): Offset passed to each per-triplet search, default 0.

    Returns:
        List[Dict]: At most ``limit`` hits with distinct ``text``, ordered by descending
        ``distance``; each has the keys ``text``, ``distance`` and ``metadata``.
        An empty list is returned when nothing is found and also on any error
        (errors are logged, not raised).
    """
    try:
        if not query or not userid or not db_type or not file_paths:
            raise ValueError("query、userid、db_type 和 file_paths 不能为空")
        if "_" in userid or "_" in db_type:
            raise ValueError("userid 和 db_type 不能包含下划线")
        # Values below are embedded in single-quoted literals of Milvus filter
        # expressions; a quote or backslash would break out of the literal and
        # could rewrite the filter, so such values are rejected or skipped.
        unsafe_chars = ("'", "\\")
        if any(ch in userid for ch in unsafe_chars):
            raise ValueError("userid 不能包含单引号或反斜杠")
        if len(userid) > 100 or len(db_type) > 100:
            raise ValueError("userid 或 db_type 的长度超出限制")
        if limit <= 0 or limit > 16384:
            raise ValueError("limit 必须在 1 到 16384 之间")
        if offset < 0:
            raise ValueError("offset 不能为负数")
        if limit + offset > 16384:
            raise ValueError("limit + offset 不能超过 16384")
        for file_path in file_paths:
            if any(ch in os.path.basename(file_path) for ch in unsafe_chars):
                raise ValueError(f"文件名 {file_path} 不能包含单引号或反斜杠")
        initialize_milvus_connection()
        collection_name = f"ragdb_{db_type}"
        if not utility.has_collection(collection_name):
            logger.warning(f"集合 {collection_name} 不存在")
            return []
        collection = Collection(collection_name)
        collection.load()
        # Resolve each requested file name to its stored document record
        documents = []
        for file_path in file_paths:
            filename = os.path.basename(file_path)
            results = collection.query(
                expr=f"userid == '{userid}' and filename == '{filename}'",
                output_fields=["document_id", "filename"],
                limit=1
            )
            if not results:
                logger.warning(f"未找到 userid {userid} 和 filename {filename} 对应的文档")
                continue
            documents.append(results[0])
        if not documents:
            logger.warning("没有找到任何有效文档")
            return []
        logger.info(f"找到 {len(documents)} 个文档: {[doc['filename'] for doc in documents]}")
        query_entities = extract_entities(query)
        if not query_entities:
            logger.warning("未从查询中提取到实体")
            return []
        # The search parameters do not depend on the triplet, so build them once
        search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}}
        search_results = []
        for doc in documents:
            document_id = doc["document_id"]
            filename = doc["filename"]
            logger.debug(f"处理文档: document_id={document_id}, filename={filename}")
            matched_triplets = match_triplets(query, query_entities, userid, document_id)
            if not matched_triplets:
                logger.debug(f"文档 document_id={document_id} 未找到匹配的三元组")
                continue
            for triplet in matched_triplets:
                head = triplet['head']
                relation = triplet['type']
                tail = triplet['tail']
                if not head or not relation or not tail:
                    logger.debug(f"无效三元组: head={head}, type={relation}, tail={tail}")
                    continue
                triplet_text = f"{head} {relation} {tail}"
                if any(ch in head or ch in tail for ch in unsafe_chars):
                    logger.warning(f"三元组 {triplet_text} 含有单引号或反斜杠，跳过搜索")
                    continue
                logger.debug(f"搜索三元组: {triplet_text} (文档: {filename})")
                try:
                    query_vector = embedding.embed_query(triplet_text)
                    # Only chunks of this document whose text contains head followed by tail
                    expr = f"userid == '{userid}' and filename == '{filename}' and text like '%{head}%{tail}%'"
                    logger.debug(f"搜索表达式: {expr}")
                    results = collection.search(
                        data=[query_vector],
                        anns_field="vector",
                        param=search_params,
                        limit=limit,
                        expr=expr,
                        output_fields=["text", "userid", "document_id", "filename", "file_path", "upload_time", "file_type"],
                        offset=offset
                    )
                    for hits in results:
                        for hit in hits:
                            metadata = {
                                "userid": hit.entity.get("userid"),
                                "document_id": hit.entity.get("document_id"),
                                "filename": hit.entity.get("filename"),
                                "file_path": hit.entity.get("file_path"),
                                "upload_time": hit.entity.get("upload_time"),
                                "file_type": hit.entity.get("file_type")
                            }
                            result = {
                                "text": hit.entity.get("text"),
                                "distance": hit.distance,
                                "metadata": metadata
                            }
                            search_results.append(result)
                            # ``text`` may be None; a debug log line must not abort this triplet's hits
                            logger.debug(f"命中: text: {(result['text'] or '')[:200]}..., 距离: {hit.distance}, 元数据: {metadata}")
                except Exception as e:
                    # Best effort: a failing triplet search does not stop the others
                    logger.warning(f"三元组 {triplet_text} 在文档 {filename} 搜索失败: {str(e)}")
                    continue
        # Keep the highest-scoring hit per distinct text, up to ``limit`` results
        unique_results = []
        seen_texts = set()
        for result in sorted(search_results, key=lambda x: x['distance'], reverse=True):
            if result['text'] not in seen_texts:
                unique_results.append(result)
                seen_texts.add(result['text'])
            if len(unique_results) >= limit:
                break
        logger.info(f"返回 {len(unique_results)} 条唯一结果")
        return unique_results
    except Exception as e:
        logger.error(f"搜索失败: {str(e)}")
        import traceback
        logger.debug(traceback.format_exc())
        return []
    finally:
        cleanup_milvus_connection()
 if __name__ == "__main__":
    # Manual smoke test: run one entity-driven search over three sample documents and print the hits
    query = "什么是知识图谱的知识抽取？"
    userid = "testuser1"
    db_type = "textdb"
    file_paths = [
        "/share/wangmeihua/rag/data/test.docx",
        "/share/wangmeihua/rag/data/zongshu.pdf",
        "/share/wangmeihua/rag/data/qianru.pdf",
    ]
    limit, offset = 5, 0
    separator = "-" * 50
    try:
        results = searchquery(query, userid, db_type, file_paths, limit=limit, offset=offset)
        print(f"搜索结果 ({len(results)} 条):")
        for rank, hit in enumerate(results, start=1):
            print(f"结果 {rank}:")
            print(f"内容: {hit['text'][:200]}...")
            print(f"距离: {hit['distance']}")
            print(f"元数据: {hit['metadata']}")
            print(separator)
    except Exception as exc:
        print(f"搜索失败: {str(exc)}")
--- a/rag/test.py
+++ b/rag/test.py
@ -1,9 +0,0 @@
 import os
 from py2neo import Graph,Node,Relationship,NodeMatcher
 # Connection settings can be overridden with the NEO4J_USERNAME, NEO4J_PASSWORD and
 # NEO4J_URI environment variables; the previous literals remain as fallbacks so the
 # script still runs unchanged.
 # NOTE(review): a real password is committed here; rotate it and drop the fallback.
 username = os.getenv('NEO4J_USERNAME', 'neo4j')
 password = os.getenv('NEO4J_PASSWORD', '261229..wmh')
 auth = (username, password)
 graph=Graph(os.getenv('NEO4J_URI', "bolt://10.18.34.18:7687"), auth = auth)
 # Smoke test: create a single node in the graph
 book_node=Node('经名',name='十三经')
 graph.create(book_node)
--- a/rag/vector.py
+++ b/rag/vector.py
@ -1,539 +0,0 @@
 import os
 import uuid
 import json
 import yaml
 from datetime import datetime
 from typing import List, Dict, Optional
 from pymilvus import connections, utility, Collection, CollectionSchema, FieldSchema, DataType
 from langchain_milvus import Milvus
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_core.documents import Document
 import torch
 import logging
 import time
 # Load the YAML configuration; the path can be overridden with CONFIG_PATH.
 CONFIG_PATH = os.environ.get('CONFIG_PATH', '/share/wangmeihua/rag/conf/milvusconfig.yaml')
 try:
    with open(CONFIG_PATH, mode='r', encoding='utf-8') as config_file:
        config = yaml.safe_load(config_file)
    MILVUS_DB_PATH = config['database']['milvus_db_path']
    TEXT_EMBEDDING_MODEL = config['models']['text_embedding_model']
 except Exception as exc:
    # Report on stdout as well, because the logger is not configured yet.
    print(f"加载配置文件 {CONFIG_PATH} 失败: {str(exc)}")
    raise RuntimeError(f"无法加载配置文件: {str(exc)}")
 # Configure the logger named in the config: one file handler and one
 # console handler sharing the same format.
 _log_conf = config['logging']
 logger = logging.getLogger(_log_conf['name'])
 logger.setLevel(getattr(logging, _log_conf['level'], logging.DEBUG))
 os.makedirs(os.path.dirname(_log_conf['file']), exist_ok=True)
 formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 file_handler = logging.FileHandler(_log_conf['file'], encoding='utf-8')
 stream_handler = logging.StreamHandler()
 for handler in (file_handler, stream_handler):
    handler.setFormatter(formatter)
    logger.addHandler(handler)
 def ensure_milvus_directory() -> None:
    """Ensure the parent directory of MILVUS_DB_PATH exists and is writable.

    The directory is created (including missing parents) when absent.

    Raises:
        RuntimeError: if the directory is not writable by the current process.
    """
    # A bare filename such as "milvus.db" has an empty dirname; fall back to
    # the current directory, because os.makedirs("") raises FileNotFoundError.
    db_dir = os.path.dirname(MILVUS_DB_PATH) or "."
    if not os.path.exists(db_dir):
        os.makedirs(db_dir, exist_ok=True)
        logger.debug(f"创建 Milvus 目录: {db_dir}")
    if not os.access(db_dir, os.W_OK):
        raise RuntimeError(f"Milvus 目录 {db_dir} 不可写")
 def initialize_milvus_connection() -> None:
    """Connect the "default" pymilvus alias to MILVUS_DB_PATH unless it already exists.

    Raises:
        RuntimeError: if checking for or opening the connection fails.
    """
    try:
        if connections.has_connection("default"):
            # Reuse the existing connection rather than opening another one.
            logger.debug("已存在 Milvus 连接，跳过重复连接")
            return
        connections.connect("default", uri=MILVUS_DB_PATH)
        logger.debug(f"已连接到 Milvus Lite，路径: {MILVUS_DB_PATH}")
    except Exception as exc:
        reason = str(exc)
        logger.error(f"连接 Milvus 失败: {reason}")
        raise RuntimeError(f"连接 Milvus 失败: {reason}")
 def cleanup_milvus_connection() -> None:
    """Disconnect the "default" pymilvus alias if it is connected.

    Best effort: any failure is logged as a warning and never raised.
    """
    try:
        if not connections.has_connection("default"):
            return
        connections.disconnect("default")
        logger.debug("已断开 Milvus 连接")
        # Fixed 3-second pause after disconnecting - presumably to let
        # Milvus Lite release its resources; TODO confirm it is still needed.
        time.sleep(3)
    except Exception as exc:
        logger.warning(f"断开 Milvus 连接失败: {str(exc)}")
 def get_vector_db(userid: str, db_type: str, documents: List[Document]) -> Milvus:
    """
    Create or open the Milvus Lite collection for ``db_type`` and insert ``documents`` into it.

    One collection named ``ragdb_{db_type}`` is shared by all users; rows are
    told apart by the ``userid`` and ``document_id`` fields.

    Args:
        userid: Owner of the documents. Must be non-empty, contain no underscore
            and be at most 100 characters long.
        db_type: Collection suffix, subject to the same constraints as ``userid``.
        documents: Non-empty list of ``Document`` objects. Each one needs non-empty
            ``userid``, ``document_id``, ``filename``, ``file_path``, ``upload_time``
            and ``file_type`` metadata, and its ``userid`` must equal the argument.

    Returns:
        The langchain ``Milvus`` vector store bound to the collection.

    Raises:
        ValueError: if argument validation fails or the embedding model path
            does not exist.
        RuntimeError: if loading the embedding model, loading or creating the
            collection, initialising the vector store, or inserting fails.

    Note:
        An existing collection whose schema does not match the expected one is
        dropped and recreated. The "default" connection is always disconnected
        before returning.
    """
    try:
        # Validate arguments
        if not userid or not db_type:
            raise ValueError("userid 和 db_type 不能为空")
        if "_" in userid or "_" in db_type:
            raise ValueError("userid 和 db_type 不能包含下划线")
        if len(userid) > 100 or len(db_type) > 100:
            raise ValueError("userid 和 db_type 的长度应小于 100")
        if not documents or not all(isinstance(doc, Document) for doc in documents):
            raise ValueError("documents 不能为空且必须是 Document 对象列表")
        required_fields = ["userid", "document_id", "filename", "file_path", "upload_time", "file_type"]
        for doc in documents:
            # Every required metadata field must be present and truthy.
            if not all(field in doc.metadata and doc.metadata[field] for field in required_fields):
                raise ValueError(f"文档元数据缺少必需字段或字段值为空: {doc.metadata}")
            if doc.metadata["userid"] != userid:
                raise ValueError(f"文档元数据的 userid {doc.metadata['userid']} 与输入 userid {userid} 不一致")
        ensure_milvus_directory()
        initialize_milvus_connection()
        # Initialise the embedding model
        model_path = TEXT_EMBEDDING_MODEL
        if not os.path.exists(model_path):
            raise ValueError(f"模型路径 {model_path} 不存在")
        embedding = HuggingFaceEmbeddings(
            model_name=model_path,
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        try:
            # Embed a probe string to confirm the model yields 1024-dimensional
            # vectors, matching the "vector" field defined below.
            test_vector = embedding.embed_query("test")
            if len(test_vector) != 1024:
                raise ValueError(f"嵌入模型输出维度 {len(test_vector)} 不匹配预期 1024")
            logger.debug(f"嵌入模型加载成功，输出维度: {len(test_vector)}")
        except Exception as e:
            # The dimension-mismatch ValueError above is also converted to RuntimeError here.
            logger.error(f"嵌入模型加载失败: {str(e)}")
            raise RuntimeError(f"加载模型失败: {str(e)}")
        # Collection name
        collection_name = f"ragdb_{db_type}"
        if len(collection_name) > 255:
            raise ValueError(f"集合名称 {collection_name} 超过 255 个字符")
        logger.debug(f"集合名称: {collection_name}")
        # Define the schema with all fixed fields
        fields = [
            FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, max_length=36, auto_id=True),
            FieldSchema(name="userid", dtype=DataType.VARCHAR, max_length=100),
            FieldSchema(name="document_id", dtype=DataType.VARCHAR, max_length=36),
            FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
            FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1024),
            FieldSchema(name="filename", dtype=DataType.VARCHAR, max_length=255),
            FieldSchema(name="file_path", dtype=DataType.VARCHAR, max_length=1024),
            FieldSchema(name="upload_time", dtype=DataType.VARCHAR, max_length=64),
            FieldSchema(name="file_type", dtype=DataType.VARCHAR, max_length=64),
        ]
        schema = CollectionSchema(
            fields=fields,
            description=f"{db_type} 数据集合，跨用户使用，包含 document_id 和元数据字段",
            auto_id=True,
            primary_field="pk",
        )
        # Check whether the collection already exists
        if utility.has_collection(collection_name):
            try:
                collection = Collection(collection_name)
                existing_schema = collection.schema
                expected_fields = {f.name for f in fields}
                actual_fields = {f.name for f in existing_schema.fields}
                vector_field = next((f for f in existing_schema.fields if f.name == "vector"), None)
                schema_compatible = False
                # Compatible means: same field names, a FLOAT_VECTOR "vector" field, and dim == 1024.
                if expected_fields == actual_fields and vector_field is not None and vector_field.dtype == DataType.FLOAT_VECTOR:
                    dim = vector_field.params.get('dim', None) if hasattr(vector_field, 'params') and vector_field.params else None
                    schema_compatible = dim == 1024
                    logger.debug(f"检查集合 {collection_name} 的 schema: 字段匹配={expected_fields == actual_fields}, "
                                f"vector_field存在={vector_field is not None}, dtype={vector_field.dtype if vector_field else '无'}, "
                                f"dim={dim if dim is not None else '未定义'}")
                if not schema_compatible:
                    logger.warning(f"集合 {collection_name} 的 schema 不兼容，原因: "
                                  f"字段不匹配: {expected_fields.symmetric_difference(actual_fields) or '无'}, "
                                  f"vector_field: {vector_field is not None}, "
                                  f"dtype: {vector_field.dtype if vector_field else '无'}, "
                                  f"dim: {vector_field.params.get('dim', '未定义') if vector_field and hasattr(vector_field, 'params') and vector_field.params else '未定义'}")
                    # Incompatible schema: drop the existing collection so the
                    # creation step below rebuilds it with the expected schema.
                    utility.drop_collection(collection_name)
                else:
                    collection.load()
                    logger.debug(f"集合 {collection_name} 已存在并加载成功")
            except Exception as e:
                logger.error(f"加载集合 {collection_name} 失败: {str(e)}")
                raise RuntimeError(f"加载集合失败: {str(e)}")
        # Create a new collection (also reached after an incompatible one was dropped)
        if not utility.has_collection(collection_name):
            try:
                collection = Collection(collection_name, schema)
                # Vector index using the cosine metric.
                collection.create_index(
                    field_name="vector",
                    index_params={"index_type": "AUTOINDEX", "metric_type": "COSINE"}
                )
                # Inverted indexes on the scalar metadata fields.
                collection.create_index(
                    field_name="userid",
                    index_params={"index_type": "INVERTED"}
                )
                collection.create_index(
                    field_name="document_id",
                    index_params={"index_type": "INVERTED"}
                )
                collection.create_index(
                    field_name="filename",
                    index_params={"index_type": "INVERTED"}
                )
                collection.create_index(
                    field_name="file_path",
                    index_params={"index_type": "INVERTED"}
                )
                collection.create_index(
                    field_name="upload_time",
                    index_params={"index_type": "INVERTED"}
                )
                collection.create_index(
                    field_name="file_type",
                    index_params={"index_type": "INVERTED"}
                )
                collection.load()
                logger.debug(f"成功创建并加载集合: {collection_name}")
            except Exception as e:
                logger.error(f"创建集合 {collection_name} 失败: {str(e)}")
                raise RuntimeError(f"创建集合失败: {str(e)}")
        # Initialise the Milvus vector store
        try:
            vector_store = Milvus(
                embedding_function=embedding,
                collection_name=collection_name,
                connection_args={"uri": MILVUS_DB_PATH},
                drop_old=False,
                auto_id=True,
                primary_field="pk",
            )
            logger.debug(f"成功初始化 Milvus 向量存储: {collection_name}")
        except Exception as e:
            logger.error(f"初始化 Milvus 向量存储失败: {str(e)}")
            raise RuntimeError(f"初始化向量存储失败: {str(e)}")
        # Insert the documents
        try:
            logger.debug(f"正在为 userid {userid} 插入 {len(documents)} 个文档到 {collection_name}")
            for doc in documents:
                logger.debug(f"插入文档元数据: {doc.metadata}")
            vector_store.add_documents(documents=documents)
            logger.debug(f"成功插入 {len(documents)} 个文档")
            # Query right away to verify the insert (up to 10 rows of this user are logged)
            collection = Collection(collection_name)
            collection.load()
            results = collection.query(
                expr=f"userid == '{userid}'",
                output_fields=["pk", "text", "document_id", "filename", "file_path", "upload_time", "file_type"],
                limit=10
            )
            for result in results:
                logger.debug(f"插入后查询结果: pk={result['pk']}, document_id={result['document_id']}, "
                            f"metadata={{'filename': '{result['filename']}', 'file_path': '{result['file_path']}', "
                            f"'upload_time': '{result['upload_time']}', 'file_type': '{result['file_type']}'}}")
        except Exception as e:
            logger.error(f"插入文档失败: {str(e)}")
            raise RuntimeError(f"插入文档失败: {str(e)}")
        return vector_store
    except Exception as e:
        # Catch-all: log and re-raise unchanged. Errors already logged by an
        # inner handler are logged a second time here.
        logger.error(f"初始化 Milvus 向量存储失败: {str(e)}")
        raise
    finally:
        # Always disconnect the "default" alias, on success as well as on failure.
        cleanup_milvus_connection()
 def get_document_mapping(userid: str, db_type: str) -> Dict[str, Dict]:
    """
    Map each document_id of one user in ``ragdb_{db_type}`` to its metadata.

    At most 100 rows are queried. Each value holds the ``userid``, ``filename``,
    ``file_path``, ``upload_time`` and ``file_type`` of a row; missing values
    default to an empty string.

    Returns:
        The mapping, or an empty dict when the collection does not exist.

    Raises:
        RuntimeError: on any failure, including invalid arguments.
    """
    try:
        # Both identifiers must be non-empty and free of underscores.
        for value, label in ((userid, "userid"), (db_type, "db_type")):
            if not value or "_" in value:
                raise ValueError(f"{label} 不能为空且不能包含下划线")
        initialize_milvus_connection()
        collection_name = f"ragdb_{db_type}"
        if not utility.has_collection(collection_name):
            logger.warning(f"集合 {collection_name} 不存在")
            return {}
        collection = Collection(collection_name)
        collection.load()
        metadata_keys = ("userid", "filename", "file_path", "upload_time", "file_type")
        rows = collection.query(
            expr=f"userid == '{userid}'",
            output_fields=["userid", "document_id", "filename", "file_path", "upload_time", "file_type"],
            limit=100
        )
        mapping = {}
        for row in rows:
            doc_id = row.get("document_id")
            if not doc_id:
                continue
            # Later rows of the same document overwrite earlier ones.
            mapping[doc_id] = {key: row.get(key, "") for key in metadata_keys}
            logger.debug(f"document_id: {doc_id}, metadata: {mapping[doc_id]}")
        logger.debug(f"找到 {len(mapping)} 个文档的映射")
        return mapping
    except Exception as exc:
        reason = str(exc)
        logger.error(f"获取文档映射失败: {reason}")
        raise RuntimeError(f"获取文档映射失败: {reason}")
 def list_user_collections() -> Dict[str, Dict]:
    """
    Summarise every ``ragdb_*`` collection as db_type -> users -> document ids.

    Returns:
        ``{db_type: {"userids": {userid: [document_id, ...]}}}``.

    Raises:
        Exception: any error outside the per-page query is logged and re-raised.
    """
    try:
        ensure_milvus_directory()
        initialize_milvus_connection()
        prefix = "ragdb_"
        page_size = 1000
        overview = {}
        for name in utility.list_collections():
            if not name.startswith(prefix):
                continue
            db_type = name[len(prefix):]
            logger.debug(f"处理集合: {name} (db_type: {db_type})")
            collection = Collection(name)
            collection.load()
            docs_by_user = {}
            start = 0
            # Page through the whole collection, page_size rows at a time.
            while True:
                try:
                    rows = collection.query(
                        expr="",
                        output_fields=["userid", "document_id"],
                        limit=page_size,
                        offset=start
                    )
                    if not rows:
                        break
                    for row in rows:
                        uid = row.get("userid")
                        doc_id = row.get("document_id")
                        if uid and doc_id:
                            docs_by_user.setdefault(uid, set()).add(doc_id)
                    start += page_size
                except Exception as exc:
                    # A failed page ends paging for this collection; what was
                    # gathered so far is kept.
                    logger.error(f"查询集合 {name} 的用户和文档失败: {str(exc)}")
                    break
            # Sets become lists so the result can be serialised.
            serializable = {uid: list(doc_ids) for uid, doc_ids in docs_by_user.items()}
            logger.debug(f"集合 {name} 中找到用户和文档映射: {serializable}")
            overview[db_type] = {"userids": serializable}
        logger.debug(f"可用 db_types 和数据: {overview}")
        return overview
    except Exception as exc:
        logger.error(f"列出集合和用户失败: {str(exc)}")
        raise
 def view_collection_details(userid: str) -> None:
    """
    Log, for every ``ragdb_*`` collection, how many rows belong to ``userid`` and a sample of them.

    The count is taken from a query capped at 10000 rows; up to 10 rows are
    logged with the first 200 characters of their text and their metadata.

    Raises:
        ValueError: if ``userid`` is empty or contains an underscore.
        Exception: other errors outside the per-collection query are re-raised.
    """
    try:
        if not userid or "_" in userid:
            raise ValueError("userid 不能为空且不能包含下划线")
        logger.debug(f"正在查看 userid {userid} 的集合")
        ensure_milvus_directory()
        initialize_milvus_connection()
        rag_collections = [name for name in utility.list_collections() if name.startswith("ragdb_")]
        if not rag_collections:
            logger.debug("未找到任何集合")
            return
        user_filter = f"userid == '{userid}'"
        metadata_keys = ("userid", "filename", "file_path", "upload_time", "file_type")
        for collection_name in rag_collections:
            if not utility.has_collection(collection_name):
                logger.warning(f"集合 {collection_name} 不存在")
                continue
            collection = Collection(collection_name)
            collection.load()
            try:
                pk_rows = collection.query(expr=user_filter, output_fields=["pk"], limit=10000)
                num_entities = len(pk_rows)
                sample_rows = collection.query(
                    expr=user_filter,
                    output_fields=["userid","text", "document_id", "filename", "file_path", "upload_time", "file_type"],
                    limit=10
                )
                logger.debug(f"集合 {collection_name} 中 userid {userid} 的文档数: {num_entities}")
                if num_entities == 0:
                    logger.debug(f"集合 {collection_name} 中 userid {userid} 无文档")
                    continue
                logger.debug(f"集合 {collection_name} 中 userid {userid} 的内容:")
                for idx, row in enumerate(sample_rows, start=1):
                    metadata = {key: row.get(key, "") for key in metadata_keys}
                    logger.debug(f"文档 {idx}: 内容: {row.get('text', '')[:200]}..., 元数据: {metadata}")
            except Exception as exc:
                # A failing collection is logged and skipped.
                logger.error(f"查询集合 {collection_name} 的文档失败: {str(exc)}")
                continue
    except Exception as exc:
        logger.error(f"无法查看 userid {userid} 的集合详情: {str(exc)}")
        raise
 def view_vector_data(db_type: str, userid: Optional[str] = None, document_id: Optional[str] = None, limit: int = 100) -> Dict[str, Dict]:
    """
    Return rows of ``ragdb_{db_type}``, optionally filtered by ``userid`` and ``document_id``.

    Returns:
        A dict keyed by primary key; each value has ``text``, ``vector``,
        ``document_id`` and ``metadata`` (filename, file_path, upload_time,
        file_type). Empty when the collection does not exist.

    Raises:
        ValueError: if ``db_type`` is empty or contains an underscore, ``limit``
            is outside 1..16384, or a filter value contains an underscore.
        Exception: any other error is logged and re-raised.
    """
    try:
        if not db_type or "_" in db_type:
            raise ValueError("db_type 不能为空且不能包含下划线")
        if limit <= 0 or limit > 16384:
            raise ValueError("limit 必须在 1 到 16384 之间")
        if userid and "_" in userid:
            raise ValueError("userid 不能包含下划线")
        if document_id and "_" in document_id:
            raise ValueError("document_id 不能包含下划线")
        initialize_milvus_connection()
        collection_name = f"ragdb_{db_type}"
        if not utility.has_collection(collection_name):
            logger.warning(f"集合 {collection_name} 不存在")
            return {}
        collection = Collection(collection_name)
        collection.load()
        logger.debug(f"加载集合: {collection_name}")
        # Build the filter from whichever optional criteria were supplied;
        # with none, the expression is the empty string.
        clauses = []
        if userid:
            clauses.append(f"userid == '{userid}'")
        if document_id:
            clauses.append(f"document_id == '{document_id}'")
        rows = collection.query(
            expr=" && ".join(clauses),
            output_fields=["pk", "text", "document_id", "vector", "filename", "file_path", "upload_time", "file_type"],
            limit=limit
        )
        vector_data = {}
        for row in rows:
            # Rows lacking a pk are stored under a freshly generated UUID.
            pk = row.get("pk", str(uuid.uuid4()))
            text = row.get("text", "")
            doc_id = row.get("document_id", "")
            vector = row.get("vector", [])
            metadata = {
                key: row.get(key, "")
                for key in ("filename", "file_path", "upload_time", "file_type")
            }
            vector_data[pk] = {
                "text": text,
                "vector": vector,
                "document_id": doc_id,
                "metadata": metadata
            }
            logger.debug(f"pk: {pk}, text: {text[:200]}..., document_id: {doc_id}, vector_length: {len(vector)}, metadata: {metadata}")
        logger.debug(f"共找到 {len(vector_data)} 条向量数据")
        return vector_data
    except Exception as exc:
        logger.error(f"查看向量数据失败: {str(exc)}")
        raise
 def main():
    """Run the manual checks for the test user and log their outcome.

    Each check logs its own failure, so later checks still run.
    """
    userid = "testuser1"
    db_type = "textdb"
    # Test 1 (insert sample documents through get_vector_db) and test 4
    # (dump vectors through view_vector_data) are not run here.
    logger.info("\n测试 2：列出所有 db_types 和文档映射")
    try:
        overview = list_user_collections()
        logger.info(f"可用 db_types 和文档: {overview}")
    except Exception as exc:
        logger.error(f"失败: {str(exc)}")
    logger.info(f"\n测试 3：查看 userid {userid} 的所有集合")
    try:
        view_collection_details(userid)
    except Exception as exc:
        logger.error(f"失败: {str(exc)}")
    logger.info(f"\n测试 5：获取 userid {userid} 在{db_type}数据库的文档映射")
    try:
        doc_mapping = get_document_mapping(userid, db_type)
        logger.info(f"文档映射: {doc_mapping}")
    except Exception as exc:
        logger.error(f"失败: {str(exc)}")
 # Run the checks only when the module is executed as a script.
 if __name__ == "__main__":
    main()
--- a/rag/version.py
+++ b/rag/version.py
@ -1 +0,0 @@
 # Package version; setup.py reads this file as text to obtain the value.
 __version__ = '0.0.1'
--- a/setup.py
+++ b/setup.py
@ -1,52 +0,0 @@
 # -*- coding: utf-8 -*-
 # Packaging script for the "rag" distribution.
 import re
 # NOTE(review): __version__ is not used below (the version is parsed from the
 # file text instead), yet this import executes the rag package at build time.
 # Kept for backward compatibility; consider removing it.
 from  rag.version import __version__
 try:
    from setuptools import setup
 except ImportError:
    from distutils.core import setup
 # All files are read as UTF-8 explicitly so the result does not depend on the
 # platform's default encoding.
 required = []
 with open('requirements.txt', 'r', encoding='utf-8') as f:
    for line in f:
        requirement = line.strip()
        # Skip blank lines (such as the one left by a trailing newline) and comments.
        if requirement and not requirement.startswith('#'):
            required.append(requirement)
 with open('rag/version.py', 'r', encoding='utf-8') as f:
    # Accept either quote style around the version string.
    version_match = re.search(r"""__version__\s*=\s*['"]([^'"]*)['"]""", f.read())
 if version_match is None:
    raise RuntimeError("Unable to find __version__ in rag/version.py")
 version = version_match.group(1)
 with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()
 name = "rag"
 description = "rag"
 author = "yumoqing"
 email = "yumoqing@gmail.com"
 package_data = {}
 setup(
    name=name,
    version=version,
    description=description,
    author=author,
    author_email=email,
    platforms='any',
    install_requires=required,
    packages=[
        "rag"
    ],
    package_data=package_data,
    keywords=[
    ],
    url="https://github.com/yumoqing/rag",
    long_description=long_description,
    long_description_content_type="text/markdown",
    classifiers=[
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
    ],
 )