- CLIP embedding (9086) + Milvus VDB (8886) + NetworkX graph (9092) - BGE-Reranker (9090) for result reranking - Hybrid retrieval: vector search + graph expansion + RRF fusion - API: /api/ingest, /api/search, /api/pipelines, /api/plugins, /api/status - Two pipelines: kg-rag-standard (full) and kg-rag-lite (vector only) - Tested E2E: ingest + search with rerank_score=0.99
74 lines
2.2 KiB
Python
74 lines
2.2 KiB
Python
# -*- coding:utf-8 -*-
|
||
"""Document chunking strategies."""
|
||
import re
|
||
from typing import List, Dict
|
||
|
||
|
||
def recursive_chunk(text: str, chunk_size: int = 512, overlap: int = 64) -> List[Dict]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    The text is walked with a window of ``chunk_size`` characters. When a
    window does not reach the end of the text, its last 100 characters are
    searched for a separator and the window is cut right after it. The
    separators are tried in priority order and the first kind that occurs
    wins (its last occurrence is used), so a sentence terminator is preferred
    over a newline even if the newline is closer to the window end.

    Args:
        text: The document text.
        chunk_size: Maximum number of characters per chunk (before stripping).
        overlap: Number of characters shared between consecutive chunks.
            If the overlap would prevent the window from advancing, that
            step is taken without overlap instead.

    Returns:
        A list of dicts with keys ``"id"`` (``"chunk_<n>"``), ``"text"`` and
        ``"index"``; ``n`` counts emitted chunks from 0. Text that is falsy
        or fits within ``chunk_size`` is returned unmodified (not stripped)
        as a single chunk, so an empty input still yields one chunk.

    Raises:
        ValueError: If ``chunk_size`` is not positive and the text is
            non-empty (no window could ever make progress).
    """
    # Kept exactly as before for backward compatibility: short/empty input is
    # passed through as one chunk without stripping.
    if not text or len(text) <= chunk_size:
        return [{"id": "chunk_0", "text": text, "index": 0}]

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Priority-ordered break points. Full-width forms are written as escapes
    # so they survive tools that normalize them to their ASCII look-alikes.
    separators = (
        "\u3002",  # ideographic full stop
        "\uff01",  # full-width exclamation mark
        "\uff1f",  # full-width question mark
        "!",
        "?",
        ". ",
        "\n\n",
        "\n",
        "\uff1b",  # full-width semicolon
        ";",
    )
    boundary_window = 100  # how far back from the window end to look

    text_len = len(text)
    chunks: List[Dict] = []
    start = 0

    while start < text_len:
        end = start + chunk_size

        # Only intermediate windows are snapped to a sentence boundary; the
        # final window simply runs to the end of the text.
        if end < text_len:
            search_start = max(end - boundary_window, start)
            search_region = text[search_start:end]
            for sep in separators:
                last_sep = search_region.rfind(sep)
                if last_sep >= 0:
                    end = search_start + last_sep + len(sep)
                    break

        chunk_text = text[start:end].strip()
        if chunk_text:
            idx = len(chunks)
            chunks.append({
                "id": f"chunk_{idx}",
                "text": chunk_text,
                "index": idx,
            })

        # The window reached the end of the text: stop here. Stepping back by
        # the overlap would only re-emit a tail that is already covered.
        if end >= text_len:
            break

        # Step back by the overlap, but always move forward: if the overlap
        # swallows the whole step (overlap >= chunk_size, or an early
        # boundary), continue from the end of this chunk instead.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks
|
||
|
||
|
||
def sentence_chunk(text: str) -> List[Dict]:
    """Split text into one chunk per sentence.

    Splitting rules:

    * CJK terminators (ideographic full stop, full-width ``!`` and ``?``)
      end a sentence wherever they occur, because CJK text has no spaces
      between sentences. A run of terminators is kept together.
    * ASCII ``.``, ``!`` and ``?`` end a sentence only when followed by
      whitespace or the end of the text, so decimals ("2.5"), dotted names
      and ellipses ("...") are not broken apart.

    Whitespace between sentences is dropped and empty pieces are skipped.

    Args:
        text: The document text.

    Returns:
        A list of dicts with keys ``"id"`` (``"chunk_<n>"``), ``"text"`` and
        ``"index"``, numbered consecutively from 0. Empty or whitespace-only
        input yields an empty list.
    """
    # Full-width forms are written as escapes so they survive tools that
    # normalize them to their ASCII look-alikes.
    cjk_enders = "\u3002\uff01\uff1f"
    ascii_enders = ".!?"
    boundary = (
        # After a CJK terminator, unless more terminators follow ...
        f"(?:(?<=[{cjk_enders}])(?![{cjk_enders}{ascii_enders}])"
        # ... or after an ASCII terminator that is followed by space / end.
        f"|(?<=[{ascii_enders}])(?=\\s|$))"
        # Swallow the whitespace separating the two sentences.
        "\\s*"
    )

    chunks: List[Dict] = []
    for piece in re.split(boundary, text):
        sentence = piece.strip()
        if not sentence:
            continue
        idx = len(chunks)
        chunks.append({
            "id": f"chunk_{idx}",
            "text": sentence,
            "index": idx,
        })

    return chunks
|
||
|
||
|
||
def chunk_document(text: str, strategy: str = "recursive", **kwargs) -> List[Dict]:
    """Dispatch a document to the chunker selected by ``strategy``.

    ``"sentence"`` routes to ``sentence_chunk``; every other value (including
    the default ``"recursive"``) routes to ``recursive_chunk``, which receives
    ``chunk_size`` and ``overlap`` from ``kwargs`` (defaults 512 and 64).
    Any other keyword arguments are ignored.
    """
    if strategy != "sentence":
        size = kwargs.get("chunk_size", 512)
        shared = kwargs.get("overlap", 64)
        return recursive_chunk(text, chunk_size=size, overlap=shared)

    return sentence_chunk(text)
|