rag-pipeline/core/extractor.py
yumoqing 4aaeb42035 feat: initial rag-pipeline service - pluggable RAG with KG support
- CLIP embedding (9086) + Milvus VDB (8886) + NetworkX graph (9092)
- BGE-Reranker (9090) for result reranking
- Hybrid retrieval: vector search + graph expansion + RRF fusion
- API: /api/ingest, /api/search, /api/pipelines, /api/plugins, /api/status
- Two pipelines: kg-rag-standard (full) and kg-rag-lite (vector only)
- Tested E2E: ingest + search with rerank_score=0.99
2026-06-15 20:42:33 +08:00

80 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding:utf-8 -*-
"""Entity and relation extraction via LLM."""
import json
import re
from typing import List, Dict
# Prompt template (written in Chinese) sent to the LLM. It tells the model to
# act as a knowledge-graph extraction expert, extract entities and relations
# from the given text, and reply with JSON only, shaped as an object holding an
# "entities" list and a "relations" list.
# `{text}` is the only str.format() field; the doubled braces `{{` / `}}` are
# escapes that render as the literal braces of the JSON example.
EXTRACTION_PROMPT = """你是一个知识图谱抽取专家。请从以下文本中抽取实体和关系。
文本:
{text}
请按以下JSON格式返回
{{
"entities": [
{{"name": "实体名", "type": "实体类型(person/company/product/concept/event/location)", "description": "简要描述"}}
],
"relations": [
{{"source": "源实体名", "target": "目标实体名", "relation": "关系类型", "description": "关系描述"}}
]
}}
只返回JSON不要其他内容。"""
def extract_entities_relations(text: str, llm_func=None) -> Dict:
    """Extract entities and relations from text using an LLM.

    Args:
        text: Input text. Only the first 2000 characters are sent to the LLM.
        llm_func: Function that takes a prompt and returns LLM response text.
            If None (or otherwise falsy), an empty result is returned without
            building a prompt.

    Returns:
        A dict with two lists:
        "entities" - dicts with "name", "type" (default "unknown") and
        "description" (default "");
        "relations" - dicts with "source", "target", "relation" (default
        "related_to") and "description" (default "").
        Items that are not dicts or lack the required keys ("name" for
        entities, "source" and "target" for relations) are dropped.
        If the LLM call or the parsing fails, both lists are empty and an
        additional "error" key holds the exception message.
    """
    if not llm_func:
        return {"entities": [], "relations": []}

    prompt = EXTRACTION_PROMPT.format(text=text[:2000])  # Limit text length
    try:
        result = _load_llm_json(llm_func(prompt))
        if not isinstance(result, dict):
            raise ValueError("LLM response is not a JSON object")

        valid_entities = [
            {
                "name": item["name"],
                "type": item.get("type", "unknown"),
                "description": item.get("description", ""),
            }
            for item in _as_list(result.get("entities"))
            if isinstance(item, dict) and "name" in item
        ]
        valid_relations = [
            {
                "source": item["source"],
                "target": item["target"],
                "relation": item.get("relation", "related_to"),
                "description": item.get("description", ""),
            }
            for item in _as_list(result.get("relations"))
            if isinstance(item, dict) and "source" in item and "target" in item
        ]
        return {"entities": valid_entities, "relations": valid_relations}
    except Exception as e:
        # Deliberately broad: extraction is best-effort, and both the injected
        # llm_func and the parsing of its free-form reply can fail in arbitrary
        # ways. The failure is reported through the "error" key, not raised.
        return {"entities": [], "relations": [], "error": str(e)}


def _load_llm_json(response: str):
    """Parse the JSON value contained in an LLM reply.

    The reply is first parsed as-is after stripping a leading/trailing
    markdown code fence. If that fails, the span from the first "{" to the
    last "}" is tried, which copes with prose before or after the JSON.
    """
    cleaned = response.strip()
    if cleaned.startswith('```'):
        cleaned = re.sub(r'^```(?:json)?\s*', '', cleaned)
        cleaned = re.sub(r'\s*```$', '', cleaned)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as first_error:
        start = cleaned.find("{")
        end = cleaned.rfind("}")
        if start == -1 or end <= start:
            raise
        try:
            return json.loads(cleaned[start:end + 1])
        except json.JSONDecodeError:
            # Report the error for the full reply, not for the guessed span.
            raise first_error from None


def _as_list(value) -> List:
    """Return value if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []