# rag-pipeline/core/chunker.py

# -*- coding:utf-8 -*-
"""Document chunking strategies."""
import re
from typing import List, Dict


# Preferred break points, in priority order: sentence-ending punctuation first,
# then paragraph/line breaks, then semicolons. The first separator *kind* that
# occurs in the look-back window wins, even if a lower-priority one sits later.
_BREAK_SEPARATORS = ('。', '！', '？', '. ', '! ', '? ', '\n\n', '\n', '；', ';')

# How many characters before the hard chunk limit are searched for a separator.
_BREAK_LOOKBACK = 100


def _find_break(text: str, start: int, end: int) -> int:
    """Return a cut position in (start, end], snapped to a separator if one is near ``end``."""
    window_start = max(end - _BREAK_LOOKBACK, start)
    window = text[window_start:end]
    for sep in _BREAK_SEPARATORS:
        pos = window.rfind(sep)
        if pos >= 0:
            # Cut just after the separator so it stays with the preceding chunk.
            return window_start + pos + len(sep)
    return end


def recursive_chunk(text: str, chunk_size: int = 512, overlap: int = 64) -> List[Dict]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    Each chunk spans at most ``chunk_size`` characters. When a chunk does not
    reach the end of the text, its end is pulled back to just after the
    highest-priority separator found in the last 100 characters of the window
    (sentence punctuation, then newlines, then semicolons). Consecutive chunks
    share ``overlap`` characters, except when applying the overlap would not
    move the window forward, in which case the next chunk starts where the
    previous one ended.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of dicts with keys ``"id"`` (``"chunk_<n>"``), ``"text"`` and
        ``"index"``, numbered consecutively from 0. Empty or whitespace-only
        input yields an empty list. Text that fits in a single chunk is
        returned unmodified as one chunk; chunks of longer text are stripped
        of surrounding whitespace and empty ones are dropped.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    # Never emit a chunk with no content (consistent with the loop below,
    # which drops chunks that are empty after stripping).
    if not text or not text.strip():
        return []
    if len(text) <= chunk_size:
        return [{"id": "chunk_0", "text": text, "index": 0}]

    chunks: List[Dict] = []
    text_len = len(text)
    start = 0

    while start < text_len:
        end = min(start + chunk_size, text_len)

        # Only look for a nicer break point when more text follows.
        if end < text_len:
            end = _find_break(text, start, end)

        piece = text[start:end].strip()
        if piece:
            idx = len(chunks)
            chunks.append({
                "id": f"chunk_{idx}",
                "text": piece,
                "index": idx,
            })

        # The chunk reached the end of the text: stop here instead of emitting
        # a trailing chunk that would only repeat the overlap region.
        if end >= text_len:
            break

        # Step back by the overlap, but always make forward progress: a break
        # found early in the window could otherwise move `start` backwards.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks


# Sentence boundary pattern.
# - After a CJK terminator (。！？) that is not followed by another terminator,
#   split immediately; CJK text has no space between sentences.
# - After an ASCII terminator (.!?), split only when whitespace follows, so
#   decimals ("3.14"), ellipses ("...") and dotted names ("file.py") stay intact.
_SENTENCE_BOUNDARY = re.compile(
    r'(?<=[。！？])(?![。！？.!?])\s*'
    r'|(?<=[.!?])\s+'
)


def sentence_chunk(text: str) -> List[Dict]:
    """Split text into one chunk per sentence.

    Sentences end at Chinese (。！？) or English (.!?) terminators. A run of
    terminators such as ``...`` or ``?!`` is kept with its sentence. English
    terminators only end a sentence when followed by whitespace. Abbreviations
    followed by a space (e.g. "e.g. ") are still treated as sentence ends.

    Args:
        text: The text to split.

    Returns:
        A list of dicts with keys ``"id"`` (``"chunk_<n>"``), ``"text"`` and
        ``"index"``, numbered consecutively from 0. Each text is stripped of
        surrounding whitespace; empty pieces are dropped. Empty input yields
        an empty list.
    """
    if not text:
        return []

    chunks: List[Dict] = []
    for piece in _SENTENCE_BOUNDARY.split(text):
        sentence = piece.strip()
        if not sentence:
            continue
        # Number by emitted chunks so ids stay gap-free when pieces are skipped.
        idx = len(chunks)
        chunks.append({
            "id": f"chunk_{idx}",
            "text": sentence,
            "index": idx,
        })

    return chunks


def chunk_document(text: str, strategy: str = "recursive", **kwargs) -> List[Dict]:
    """Dispatch ``text`` to one of the chunking strategies in this module.

    Args:
        text: The document text to chunk.
        strategy: ``"sentence"`` selects ``sentence_chunk``; any other value
            selects ``recursive_chunk``.
        **kwargs: ``chunk_size`` (default 512) and ``overlap`` (default 64)
            are forwarded to ``recursive_chunk``. They are ignored for the
            sentence strategy, and any other keyword is ignored.

    Returns:
        The list of chunk dicts produced by the selected strategy.
    """
    if strategy != "sentence":
        size = kwargs.get("chunk_size", 512)
        shared = kwargs.get("overlap", 64)
        return recursive_chunk(text, chunk_size=size, overlap=shared)
    return sentence_chunk(text)