rag

2025-11-12 15:52:03 +08:00 · 2025-11-12 15:52:03 +08:00 · 985c5a998a
commit 985c5a998a
parent 3a726ef958
2 changed files with 341 additions and 1 deletions
--- a/rag/fileprocess.py
+++ b/rag/fileprocess.py
@ -0,0 +1,341 @@
 import numpy as np
 import os
 import re
 from pdf2image import convert_from_path
 from appPublic.log import debug, error, info
 from pathlib import Path
 import zipfile
 import xml.etree.ElementTree as ET
 from PIL import Image
 from typing import List
 # ==================== 新增：路径安全化函数 ====================
 def safe_filename(name: str) -> str:
    """
    Turn an arbitrary name into one usable as a file or directory name.

    Steps, applied in this order:
    - strip leading and trailing whitespace
    - collapse every run of whitespace into a single space
    - replace each of the characters < > : " / | ? * and the backslash
      with an underscore
    - replace every remaining space with an underscore

    Args:
        name: Raw file or directory name.

    Returns:
        The sanitized name. It is empty when the input is empty or
        consists only of whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', name.strip())
    cleaned = re.sub(r'[<>:"/\\|?*]', '_', collapsed)
    # Splitting on single spaces and re-joining with '_' maps each space to '_'.
    return '_'.join(cleaned.split(' '))
 def render_pdf_to_images(pdf_path, base_output_dir, dpi=200, image_format="PNG") -> List[str]:
    """
    Render every page of a PDF file to an image file.

    Images are written to ``<base_output_dir>/<sanitized PDF stem>/`` and
    named ``page_001.<ext>``, ``page_002.<ext>``, ... where ``<ext>`` is
    ``image_format`` in lower case.

    Args:
        pdf_path (str): Path of the PDF file.
        base_output_dir (str): Directory under which a sub-directory named
            after the PDF is created.
        dpi (int): Rendering resolution, 200 by default.
        image_format (str): Format name handed to ``page.save``; "PNG" by default.

    Returns:
        List[str]: Paths of the pages that were actually saved, in page
        order. Pages whose save failed are logged and left out. The list is
        empty when the PDF does not exist or when rendering fails.
    """
    try:
        # Validate the input first so a missing PDF does not leave an
        # empty output directory behind.
        if not os.path.exists(pdf_path):
            error(f"PDF文件不存在: {pdf_path}")
            return []
        pdf_filename = safe_filename(Path(pdf_path).stem)
        page_output_dir = os.path.join(base_output_dir, pdf_filename)
        # Directory creation lives inside the try so that a failure here is
        # reported like any other rendering failure (logged, [] returned).
        if not os.path.exists(page_output_dir):
            os.makedirs(page_output_dir, exist_ok=True)
            debug(f"创建输出目录: {page_output_dir}")
        debug(f"开始渲染PDF: {pdf_path}")
        debug(f"输出目录: {page_output_dir}")
        debug(f"分辨率: {dpi} DPI, 格式: {image_format}")
        pages = convert_from_path(pdf_path, dpi=dpi)
        debug(f"PDF总页数: {len(pages)}")
        debug("📄 正在渲染 PDF 页面...")
        suffix = image_format.lower()
        img_paths = []
        for i, page in enumerate(pages, start=1):
            img_path = os.path.join(page_output_dir, f"page_{i:03d}.{suffix}")
            try:
                page.save(img_path, image_format)
            except Exception as e:
                # Best effort: one bad page must not abort the remaining pages.
                error(f"保存第 {i} 页失败: {e}")
                continue
            # Record the path only after the file has really been written.
            img_paths.append(img_path)
            debug(f"✅ 已保存 {img_path}")
        # Report the number of saved pages, not the total page count.
        debug(f"渲染完成: 成功保存{len(img_paths)} 页")
        return img_paths
    except Exception as e:
        error(f"渲染PDF失败: {e}")
        return []
 def extract_images_from_word(doc_path, base_output_dir) -> List[str]:
    """
    Extract all embedded media files from a Word document.

    A .docx file is a zip archive; every entry under ``word/media/`` is
    copied to ``<base_output_dir>/<sanitized document stem>/`` as
    ``image_NNN_<entry name>``, where NNN is the 1-based position of the
    entry among the media entries. ``.png`` is appended to entry names that
    have no extension.

    Args:
        doc_path (str): Path of the Word document (.docx only).
        base_output_dir (str): Directory under which a sub-directory named
            after the document is created.

    Returns:
        List[str]: Paths of the files that were actually written. Entries
        whose extraction failed are logged and left out. The list is empty
        when the path is not a .docx file, the file does not exist, or the
        archive cannot be processed.
    """
    # str() lets path-like objects through; the suffix test is case-insensitive.
    if not str(doc_path).lower().endswith('.docx'):
        error(f"仅支持.docx格式的Word文档: {doc_path}")
        return []
    try:
        # Validate the input first so a missing document does not leave an
        # empty output directory behind.
        if not os.path.exists(doc_path):
            error(f"Word文档不存在: {doc_path}")
            return []
        doc_filename = safe_filename(Path(doc_path).stem)
        image_output_dir = os.path.join(base_output_dir, doc_filename)
        # Directory creation lives inside the try so that a failure here is
        # logged and reported as [] instead of raising.
        if not os.path.exists(image_output_dir):
            os.makedirs(image_output_dir, exist_ok=True)
            debug(f"创建输出目录: {image_output_dir}")
        debug(f"开始从Word文档提取图像: {doc_path}")
        debug(f"输出目录: {image_output_dir}")
        img_paths = []
        with zipfile.ZipFile(doc_path, 'r') as docx:
            # Keep real files below word/media/; directory entries end with
            # '/' and therefore have an empty basename.
            image_files = [
                f for f in docx.namelist()
                if f.startswith('word/media/') and not f.endswith('/') and os.path.basename(f)
            ]
            debug(f"找到 {len(image_files)} 个图像文件")
            for i, image_path in enumerate(image_files, start=1):
                # The filter above guarantees a non-empty basename.
                image_name = os.path.basename(image_path)
                if not Path(image_name).suffix:
                    # No content sniffing is done; .png is just the default.
                    image_name += ".png"
                output_path = os.path.join(image_output_dir, f"image_{i:03d}_{image_name}")
                try:
                    # Read the entry completely before opening the target so
                    # a corrupt entry does not leave an empty file behind.
                    image_data = docx.read(image_path)
                    with open(output_path, 'wb') as f:
                        f.write(image_data)
                except Exception as e:
                    # Best effort: one bad entry must not abort the others.
                    error(f"提取图像 {image_path} 失败: {e}")
                    continue
                # Record the path only after the file has really been written.
                img_paths.append(output_path)
                debug(f"✅ 已提取图像: {output_path}")
        # Report the number of extracted files, not the number of candidates.
        debug(f"Word文档图像提取完成: 成功提取 {len(img_paths)} 个图像")
        return img_paths
    except Exception as e:
        error(f"提取Word文档图像失败: {e}")
        return []
 def extract_images_from_ppt(ppt_path, base_output_dir) -> List[str]:
    """
    Extract all embedded media files from a PowerPoint presentation.

    A .pptx file is a zip archive; every entry under ``ppt/media/`` is
    copied to ``<base_output_dir>/<sanitized presentation stem>/`` as
    ``image_NNN_<entry name>``, where NNN is the 1-based position of the
    entry among the media entries. ``.png`` is appended to entry names that
    have no extension.

    Args:
        ppt_path (str): Path of the PowerPoint file (.pptx only).
        base_output_dir (str): Directory under which a sub-directory named
            after the presentation is created.

    Returns:
        List[str]: Paths of the files that were actually written. Entries
        whose extraction failed are logged and left out. The list is empty
        when the path is not a .pptx file, the file does not exist, or the
        archive cannot be processed.
    """
    # str() lets path-like objects through; the suffix test is case-insensitive.
    if not str(ppt_path).lower().endswith('.pptx'):
        error(f"仅支持.pptx格式的PowerPoint文档: {ppt_path}")
        return []
    try:
        # Validate the input first so a missing presentation does not leave
        # an empty output directory behind.
        if not os.path.exists(ppt_path):
            error(f"PowerPoint文档不存在: {ppt_path}")
            return []
        ppt_filename = safe_filename(Path(ppt_path).stem)
        image_output_dir = os.path.join(base_output_dir, ppt_filename)
        # Directory creation lives inside the try so that a failure here is
        # logged and reported as [] instead of raising.
        if not os.path.exists(image_output_dir):
            os.makedirs(image_output_dir, exist_ok=True)
            debug(f"创建输出目录: {image_output_dir}")
        debug(f"开始从PowerPoint文档提取图像: {ppt_path}")
        debug(f"输出目录: {image_output_dir}")
        img_paths = []
        with zipfile.ZipFile(ppt_path, 'r') as pptx:
            # Keep real files below ppt/media/; directory entries end with
            # '/' and therefore have an empty basename.
            image_files = [
                f for f in pptx.namelist()
                if f.startswith('ppt/media/') and not f.endswith('/') and os.path.basename(f)
            ]
            debug(f"找到 {len(image_files)} 个图像文件")
            for i, image_path in enumerate(image_files, start=1):
                # The filter above guarantees a non-empty final component.
                image_name = Path(image_path).name
                if not Path(image_name).suffix:
                    # No content sniffing is done; .png is just the default.
                    image_name += ".png"
                output_path = os.path.join(image_output_dir, f"image_{i:03d}_{image_name}")
                try:
                    # Read the entry completely before opening the target so
                    # a corrupt entry does not leave an empty file behind.
                    image_data = pptx.read(image_path)
                    with open(output_path, 'wb') as f:
                        f.write(image_data)
                except Exception as e:
                    # Best effort: one bad entry must not abort the others.
                    error(f"提取图像 {image_path} 失败: {e}")
                    continue
                # Record the path only after the file has really been written.
                img_paths.append(output_path)
                debug(f"✅ 已提取图像: {output_path}")
        # Report the number of extracted files, not the number of candidates.
        debug(f"PowerPoint文档图像提取完成: 成功提取{len(img_paths)} 个图像")
        return img_paths
    except Exception as e:
        error(f"提取PowerPoint文档图像失败: {e}")
        return []
 def extract_images_from_file(file_path, base_output_dir="/home/wangmeihua/kyrag/data/extracted_images", file_type=None):
    """
    Dispatch a file to the extractor that matches its type.

    Args:
        file_path (str): Path of the file to process.
        base_output_dir (str): Base directory handed to the extractor.
        file_type (str): One of 'pdf', 'word' or 'ppt'. When None, the type
            is derived from the lower-cased file extension
            (.pdf / .docx / .pptx).

    Returns:
        The value returned by the selected extractor (a list of output file
        paths), or an empty list when the extension or the given
        ``file_type`` is not supported.
    """
    if file_type is None:
        # No explicit type: map the extension to the internal type name.
        ext = Path(file_path).suffix.lower()
        file_type = {'.pdf': 'pdf', '.docx': 'word', '.pptx': 'ppt'}.get(ext)
        if file_type is None:
            error(f"不支持的文件类型: {ext}")
            return []
    if file_type == 'pdf':
        return render_pdf_to_images(file_path, base_output_dir)
    if file_type == 'word':
        return extract_images_from_word(file_path, base_output_dir)
    if file_type == 'ppt':
        return extract_images_from_ppt(file_path, base_output_dir)
    # Only reachable with an explicitly passed, unknown file_type.
    error(f"不支持的文件类型: {file_type}")
    return []
 # 使用示例
 if __name__ == "__main__":
    # Demo run against hard-coded sample files; outcomes are only logged.
    base_output_dir = "/home/wangmeihua/kyrag/data/extracted_images"

    # PDF: render every page to an image (no existence pre-check here).
    pdf_path = "/home/wangmeihua/kyrag/22-zh-review.pdf"
    pdf_imgs = extract_images_from_file(pdf_path, base_output_dir, 'pdf')
    debug(f"pdf_imgs: {pdf_imgs}")
    if not pdf_imgs:
        error("PDF处理失败")
    else:
        debug(f"成功处理PDF: {len(pdf_imgs)} 页")

    # Word: extract embedded images, but only if the sample file is present.
    doc_path = "/home/wangmeihua/kyrag/test.docx"
    if not os.path.exists(doc_path):
        debug(f"Word文档不存在: {doc_path}")
    else:
        doc_imgs = extract_images_from_file(doc_path, base_output_dir, 'word')
        debug(f"doc_imgs: {doc_imgs}")
        if not doc_imgs:
            error("Word文档处理失败")
        else:
            debug(f"成功处理Word文档: {len(doc_imgs)} 个图像")

    # PowerPoint: extract embedded images, but only if the sample file is present.
    ppt_path = "/home/wangmeihua/kyrag/提示学习-王美华.pptx"
    if not os.path.exists(ppt_path):
        debug(f"PowerPoint文档不存在: {ppt_path}")
    else:
        ppt_imgs = extract_images_from_file(ppt_path, base_output_dir, 'ppt')
        if not ppt_imgs:
            error("PowerPoint处理失败")
        else:
            debug(f"成功处理PowerPoint: {len(ppt_imgs)} 个图像")
--- a/rag/transaction_manager.py
+++ b/rag/transaction_manager.py
@ -20,7 +20,6 @@ class OperationType(Enum):
    VECTOR_SEARCH = "vector_search"
    RERANK = "rerank"
@dataclass
 class RollbackOperation:
    """回滚操作记录"""