# -*- coding:utf-8 -*-
"""Document chunking strategies."""
import re
from typing import Dict, List

# Separators tried, in priority order, when snapping a chunk end to a natural
# boundary. The first separator that occurs anywhere in the search region
# wins, and its *last* occurrence in that region is used.
_BOUNDARY_SEPARATORS = (
    "。",
    "!",
    "?",
    "\uff01",  # fullwidth exclamation mark
    "\uff1f",  # fullwidth question mark
    ". ",
    "\n\n",
    "\n",
    ";",
    "\uff1b",  # fullwidth semicolon
)

# How many trailing characters of a window are searched for a separator.
_BOUNDARY_LOOKBACK = 100

# Characters treated as sentence terminators by ``sentence_chunk``.
_SENTENCE_END_CHARS = "。!?.\uff01\uff1f"

# Split right after a run of terminators (never inside the run, so "..." and
# "?!" stay attached to their sentence) and swallow any following whitespace.
_SENTENCE_SPLIT_RE = re.compile(
    rf"(?<=[{_SENTENCE_END_CHARS}])(?![{_SENTENCE_END_CHARS}])\s*"
)


def _make_chunk(index: int, text: str) -> Dict:
    """Build the chunk record shared by every chunking strategy."""
    return {"id": f"chunk_{index}", "text": text, "index": index}


def _snap_to_boundary(text: str, start: int, end: int) -> int:
    """Return ``end`` moved back to just after the best separator, if any.

    Only the last ``_BOUNDARY_LOOKBACK`` characters of ``text[start:end]`` are
    searched. When no separator is found, ``end`` is returned unchanged.
    """
    search_start = max(end - _BOUNDARY_LOOKBACK, start)
    region = text[search_start:end]
    for sep in _BOUNDARY_SEPARATORS:
        pos = region.rfind(sep)
        if pos >= 0:
            return search_start + pos + len(sep)
    return end


def recursive_chunk(text: str, chunk_size: int = 512, overlap: int = 64) -> List[Dict]:
    """Split text into overlapping chunks, preferring natural boundaries.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters taken per window.
        overlap: Number of characters at the end of one window that are
            repeated at the start of the next. Must be smaller than
            ``chunk_size``. Negative values are not rejected; they leave a
            gap between consecutive windows.

    Returns:
        A list of dicts with keys ``"id"`` (``"chunk_<n>"``), ``"text"`` and
        ``"index"``. Text that is empty or no longer than ``chunk_size`` is
        returned unmodified as a single chunk. Otherwise each chunk is
        whitespace-stripped and windows that strip to nothing are skipped.

    Raises:
        ValueError: If the text has to be split and ``chunk_size`` is not
            positive or ``overlap`` is not smaller than ``chunk_size``.
    """
    if not text or len(text) <= chunk_size:
        return [_make_chunk(0, text)]

    # These configurations can never make forward progress through the text.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    text_len = len(text)
    chunks: List[Dict] = []
    start = 0
    while start < text_len:
        end = start + chunk_size
        if end < text_len:
            end = _snap_to_boundary(text, start, end)

        piece = text[start:end].strip()
        if piece:
            chunks.append(_make_chunk(len(chunks), piece))

        # The window reached the end of the text: anything emitted after this
        # would only repeat the tail of the chunk just produced.
        if end >= text_len:
            break

        # Step back by the overlap, but never to or before the current start.
        # A boundary snap can shorten the window to fewer characters than the
        # overlap, which would otherwise stall or move backwards.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks


def sentence_chunk(text: str) -> List[Dict]:
    """Split text into one chunk per sentence.

    A sentence ends at a run of one or more of ``。 ! ? .`` or the fullwidth
    exclamation/question marks. Whitespace after the run is discarded.

    Args:
        text: The text to split.

    Returns:
        A list of dicts with keys ``"id"`` (``"chunk_<n>"``), ``"text"`` and
        ``"index"``, numbered consecutively from 0. Pieces that are empty
        after stripping are omitted, so empty text yields an empty list.
    """
    chunks: List[Dict] = []
    for piece in _SENTENCE_SPLIT_RE.split(text):
        sentence = piece.strip()
        if sentence:
            chunks.append(_make_chunk(len(chunks), sentence))
    return chunks


def chunk_document(text: str, strategy: str = "recursive", **kwargs) -> List[Dict]:
    """Chunk a document using the specified strategy.

    Args:
        text: The document text.
        strategy: ``"sentence"`` selects ``sentence_chunk``; any other value
            selects ``recursive_chunk``.
        **kwargs: ``chunk_size`` (default 512) and ``overlap`` (default 64)
            are forwarded to ``recursive_chunk``. They are ignored by the
            sentence strategy, and all other keys are ignored.

    Returns:
        The list of chunk dicts produced by the selected strategy.
    """
    if strategy == "sentence":
        return sentence_chunk(text)
    return recursive_chunk(
        text,
        chunk_size=kwargs.get("chunk_size", 512),
        overlap=kwargs.get("overlap", 64),
    )