rag-pipeline/core/chunker.py
yumoqing 4aaeb42035 feat: initial rag-pipeline service - pluggable RAG with KG support
- CLIP embedding (9086) + Milvus VDB (8886) + NetworkX graph (9092)
- BGE-Reranker (9090) for result reranking
- Hybrid retrieval: vector search + graph expansion + RRF fusion
- API: /api/ingest, /api/search, /api/pipelines, /api/plugins, /api/status
- Two pipelines: kg-rag-standard (full) and kg-rag-lite (vector only)
- Tested E2E: ingest + search with rerank_score=0.99
2026-06-15 20:42:33 +08:00

74 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding:utf-8 -*-
"""Document chunking strategies."""
import re
from typing import List, Dict
# Boundary markers tried in priority order when looking for a clean cut point.
# The CJK punctuation is written with escapes so the literals survive tools
# that strip or normalise "confusable" Unicode characters.
_CHUNK_SEPARATORS = (
    "\u3002",  # ideographic full stop
    "\uff01",  # fullwidth exclamation mark
    "\uff1f",  # fullwidth question mark
    ". ",
    "! ",
    "? ",
    "\n\n",
    "\n",
    "\uff1b",  # fullwidth semicolon
    ";",
)

# How many characters before the hard cut are scanned for a boundary marker.
_BOUNDARY_LOOKBACK = 100


def recursive_chunk(text: str, chunk_size: int = 512, overlap: int = 64) -> List[Dict]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    Each chunk spans at most ``chunk_size`` characters. When a chunk does not
    reach the end of the text, the last ``_BOUNDARY_LOOKBACK`` characters of
    the window are scanned for the first separator in ``_CHUNK_SEPARATORS``
    that occurs there, and the chunk is cut right after its last occurrence.
    The next chunk starts ``overlap`` characters before the previous cut.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk (before stripping).
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of dicts with keys ``"id"`` (``"chunk_<n>"``), ``"text"`` (the
        whitespace-stripped chunk) and ``"index"`` (``n``). Falsy text, or text
        no longer than ``chunk_size``, is returned unchanged as a single chunk.

    Raises:
        ValueError: If the text has to be split and ``chunk_size`` is not
            positive or ``overlap`` is not smaller than ``chunk_size``; either
            would prevent the scan from ever advancing.
    """
    if not text or len(text) <= chunk_size:
        return [{"id": "chunk_0", "text": text, "index": 0}]

    # Validated only once splitting is actually required, so calls that take
    # the single-chunk path above keep working regardless of these values.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    text_len = len(text)
    chunks: List[Dict] = []
    start = 0
    while start < text_len:
        end = start + chunk_size
        if end < text_len:
            # Never accept a cut at or before start + overlap: the next start
            # is end - overlap and it must move strictly forward.
            floor = max(end - _BOUNDARY_LOOKBACK, start, start + overlap)
            window = text[floor:end]
            for sep in _CHUNK_SEPARATORS:
                pos = window.rfind(sep)
                if pos >= 0:
                    end = floor + pos + len(sep)
                    break

        piece = text[start:end].strip()
        if piece:
            idx = len(chunks)
            chunks.append({"id": f"chunk_{idx}", "text": piece, "index": idx})

        # The text is fully consumed; stepping back by `overlap` again would
        # only emit a tail that is already contained in the chunk just built.
        if end >= text_len:
            break
        start = end - overlap
    return chunks
# Splits right after a sentence terminator and swallows any whitespace that
# follows it. The CJK terminators (ideographic full stop, fullwidth "!" and
# "?") are written as escapes so they cannot be silently normalised to ASCII.
_SENTENCE_END_RE = re.compile(r"(?<=[\u3002\uff01\uff1f.!?])\s*")


def sentence_chunk(text: str) -> List[Dict]:
    """Split text into one chunk per sentence.

    A sentence ends at any of the Chinese terminators (ideographic full stop,
    fullwidth exclamation mark, fullwidth question mark) or the ASCII
    characters ``.``, ``!``, ``?``. The terminator stays attached to its
    sentence; surrounding whitespace is stripped.

    Args:
        text: The text to split.

    Returns:
        A list of dicts with keys ``"id"`` (``"chunk_<n>"``), ``"text"`` and
        ``"index"`` (``n``), numbered consecutively from 0. Falsy or
        whitespace-only text yields an empty list.
    """
    if not text:
        return []

    chunks: List[Dict] = []
    for piece in _SENTENCE_END_RE.split(text):
        sentence = piece.strip()
        if not sentence:
            # Splitting after a final terminator leaves an empty tail piece.
            continue
        idx = len(chunks)
        chunks.append({"id": f"chunk_{idx}", "text": sentence, "index": idx})
    return chunks
def chunk_document(text: str, strategy: str = "recursive", **kwargs) -> List[Dict]:
    """Dispatch ``text`` to the chunker selected by ``strategy``.

    ``"sentence"`` selects ``sentence_chunk``; every other value selects
    ``recursive_chunk``, which receives the ``chunk_size`` and ``overlap``
    keyword arguments (defaulting to 512 and 64 when not supplied).
    """
    if strategy != "sentence":
        size = kwargs.get("chunk_size", 512)
        shared = kwargs.get("overlap", 64)
        return recursive_chunk(text, chunk_size=size, overlap=shared)
    return sentence_chunk(text)