# rag-pipeline/core/extractor.py

# -*- coding:utf-8 -*-
"""Entity and relation extraction via LLM."""
import json
import re
from typing import List, Dict


# Prompt template sent to the LLM. It is rendered with ``str.format``, so the
# literal JSON braces are doubled (``{{`` / ``}}``); ``{text}`` is the only
# placeholder.
EXTRACTION_PROMPT = """你是一个知识图谱抽取专家。请从以下文本中抽取实体和关系。

文本：
{text}

请按以下JSON格式返回：
{{
    "entities": [
        {{"name": "实体名", "type": "实体类型(person/company/product/concept/event/location)", "description": "简要描述"}}
    ],
    "relations": [
        {{"source": "源实体名", "target": "目标实体名", "relation": "关系类型", "description": "关系描述"}}
    ]
}}

只返回JSON，不要其他内容。"""

# Only this many leading characters of the input text are put into the prompt.
_MAX_TEXT_CHARS = 2000


def _parse_llm_json(response):
    """Return the JSON value embedded in an LLM reply.

    A leading/trailing Markdown code fence is stripped first. If the remaining
    text is still not valid JSON (e.g. the model added an introduction or a
    note after the closing fence), the span from the first ``{`` to the last
    ``}`` is parsed instead.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be found.
        AttributeError: If ``response`` is not a string.
    """
    cleaned = response.strip()
    if cleaned.startswith('```'):
        cleaned = re.sub(r'^```(?:json)?\s*', '', cleaned)
        cleaned = re.sub(r'\s*```$', '', cleaned)

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        start = cleaned.find('{')
        end = cleaned.rfind('}')
        if start == -1 or end <= start:
            # Nothing that looks like an object: surface the original error.
            raise
        return json.loads(cleaned[start:end + 1])


def _as_list(value) -> List:
    """Return ``value`` if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def extract_entities_relations(text: str, llm_func=None) -> Dict:
    """Extract entities and relations from text using an LLM.

    Args:
        text: Input text. Only the first ``_MAX_TEXT_CHARS`` characters are
            included in the prompt.
        llm_func: Function that takes a prompt and returns LLM response text.
            If None, returns empty result.

    Returns:
        A dict with two keys:

        * ``"entities"``: list of ``{"name", "type", "description"}`` dicts;
          ``type`` defaults to ``"unknown"`` and ``description`` to ``""``.
        * ``"relations"``: list of ``{"source", "target", "relation",
          "description"}`` dicts; ``relation`` defaults to ``"related_to"``
          and ``description`` to ``""``.

        Items that are not dicts or lack their required keys (``name`` for
        entities, ``source``/``target`` for relations) are dropped. If the
        LLM call fails or its reply cannot be parsed into a JSON object, both
        lists are empty and an ``"error"`` key carries the message.
    """
    if not llm_func:
        return {"entities": [], "relations": []}

    prompt = EXTRACTION_PROMPT.format(text=text[:_MAX_TEXT_CHARS])

    # Best-effort boundary: whatever the LLM client raises, or whatever the
    # model returns, is reported through the "error" key instead of raising.
    try:
        result = _parse_llm_json(llm_func(prompt))
        if not isinstance(result, dict):
            raise ValueError(
                "expected a JSON object, got " + type(result).__name__
            )
    except Exception as e:
        return {"entities": [], "relations": [], "error": str(e)}

    # A null / non-list section is treated as empty so that it cannot discard
    # the other, valid section.
    valid_entities = [
        {
            "name": item["name"],
            "type": item.get("type", "unknown"),
            "description": item.get("description", ""),
        }
        for item in _as_list(result.get("entities"))
        if isinstance(item, dict) and "name" in item
    ]

    valid_relations = [
        {
            "source": item["source"],
            "target": item["target"],
            "relation": item.get("relation", "related_to"),
            "description": item.get("description", ""),
        }
        for item in _as_list(result.get("relations"))
        if isinstance(item, dict) and "source" in item and "target" in item
    ]

    return {"entities": valid_entities, "relations": valid_relations}