#!/usr/bin/python
# -*- coding: utf-8 -*-
# yulongyan_citu / rag_project
import json
import os
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain.prompts import (
        PromptTemplate,
)
from typing import List
from langchain.output_parsers import ResponseSchema,StructuredOutputParser
from langchain_community.graphs import Neo4jGraph
from langchain.schema import AIMessage


def connect():
    """Create a ``Neo4jGraph`` client configured through environment variables.

    The connection settings are published as ``NEO4J_URI``, ``NEO4J_USERNAME``
    and ``NEO4J_PASSWORD``. Values that are already present in the process
    environment are left untouched, so a deployment can point the code at a
    different database without editing this file; the literals below are
    only fallbacks.

    Returns:
        The ``Neo4jGraph`` instance constructed without explicit arguments.
    """
    # NOTE(review): a password is embedded in source here; it is kept only as
    # a fallback for backward compatibility and should come from deployment
    # configuration instead.
    fallback_settings = {
        "NEO4J_URI": "bolt://192.168.3.91:27687",
        "NEO4J_USERNAME": "neo4j",
        "NEO4J_PASSWORD": "123456",
    }
    for key, value in fallback_settings.items():
        # setdefault keeps an externally supplied value instead of clobbering it.
        os.environ.setdefault(key, value)

    # Neo4jGraph() is called with no arguments; it presumably picks the
    # settings up from the variables above (see the LangChain reference).
    return Neo4jGraph()

def _strip_json_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence.

    Removes an opening ``` (optionally followed by a ``json`` language tag,
    in any letter case) and a closing ```, then trims whitespace. Unlike
    ``str.strip('```json')``, which removes any run of the characters
    `` ` j s o n`` from both ends, this only removes the exact markers.
    """
    body = text.strip()
    if body.startswith("```"):
        body = body[3:]
        # Drop the optional language tag that follows the opening fence.
        if body[:4].lower() == "json":
            body = body[4:]
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def extract_question_info(question: str, llm) -> List[str]:
    """Ask the model to list the entities mentioned in ``question``.

    A prompt is built from a ``StructuredOutputParser``'s format instructions
    plus the question, sent through ``llm.invoke``, and the reply is decoded
    as JSON. The value stored under the ``"entity"`` key is returned.

    Args:
        question: The user question to analyse.
        llm: Object exposing ``invoke(prompt)``; the result must be an
            ``AIMessage``.

    Returns:
        The entities as a list. A single non-blank string value is wrapped
        in a one-element list; a missing key, a ``null`` value or a blank
        string yields an empty list.

    Raises:
        TypeError: If ``llm.invoke`` does not return an ``AIMessage``.
        json.JSONDecodeError: If the reply is not valid JSON once the code
            fence has been removed.
    """
    # Describe the single field we expect back from the model.
    response_schemas = [
        ResponseSchema(
            name="entity",
            # The original adjacent literals concatenated to "thatappear".
            description="All the person, organization, or business entities that "
                        "appear in the text",
        )
    ]
    # The parser is only used to produce the format instructions.
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = output_parser.get_format_instructions()

    # Prompt = fixed instruction + parser format instructions + user question.
    prompt_template = PromptTemplate(
        template="Answer the user query.\n{format_instructions}\n{query}\n",
        input_variables=["query"],
        partial_variables={"format_instructions": format_instructions},
    )
    prompt_text = prompt_template.format(query=question)

    output = llm.invoke(prompt_text)
    if not isinstance(output, AIMessage):
        raise TypeError("Expected an AIMessage object")

    # Remove the ```json ... ``` wrapper before decoding.
    json_content = _strip_json_fence(output.content)
    data = json.loads(json_content)

    names = data.get('entity', [])
    if names is None:
        # {"entity": null} must not leak None to callers that iterate the result.
        return []
    if isinstance(names, str):
        # A single entity may come back as a bare string; a blank one is dropped.
        return [names] if names.strip() else []
    return names

def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    The input is passed through ``remove_lucene_chars`` and split on
    whitespace. Each resulting word gets a similarity suffix (``~2``, i.e.
    about two changed characters) and the words are combined with the AND
    operator. Useful for mapping entities from user questions to database
    values while allowing for some misspellings.

    Returns an empty string when no word is left after cleaning (blank
    input, or input made only of removed characters).
    """
    words = remove_lucene_chars(input).split()
    # join() handles zero, one or many words uniformly; the previous
    # words[-1] lookup raised IndexError when the list was empty.
    return " AND ".join(f"{word}~2" for word in words)

# Fulltext index query
def structured_retriever(llm,graph,question: str) -> str:
    """
    Collect the neighborhood of entities mentioned in the question.

    Entities are extracted from ``question`` with ``extract_question_info``.
    For each one, the ``dataops`` full-text index is queried (``limit:2``,
    ``score >= 0.5``) and the incoming and outgoing relationships of the
    matched nodes are rendered as ``source - TYPE -> target`` lines, capped
    at 50 rows per entity by the query's ``LIMIT``.

    Args:
        llm: Model handle forwarded to ``extract_question_info``.
        graph: Object exposing ``query(cypher, params)`` that returns an
            iterable of mappings with an ``output`` key.
        question: The user question.

    Returns:
        All relationship lines joined with newlines, or an empty string
        when nothing matched.
    """
    lines = []
    # Entities extracted from the question.
    for entity in extract_question_info(question, llm):
        if isinstance(entity, str) and not entity.strip():
            # A blank search string is not a usable full-text query; skip it.
            continue
        # Nodes matched in the graph must score at least 0.5.
        # NOTE(review): the raw entity is sent as the Lucene query text;
        # generate_full_text_query is not applied here.
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('dataops', $query, {limit:2})
            YIELD node, score
            WHERE score >= 0.5
            // score 判断
            CALL {
              WITH node
              MATCH (node)-[r]->(neighbor)
              RETURN node.name + ' - ' + type(r) + ' -> ' + neighbor.name AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r]-(neighbor)
              RETURN neighbor.name + ' - ' + type(r) + ' -> ' +  node.name AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": entity},
        )
        # A row's output can be None (e.g. a node without a name), which
        # str.join cannot handle, so those rows are dropped.
        lines.extend(el["output"] for el in response if el["output"] is not None)
    # Joining once keeps a newline between the rows of consecutive entities;
    # previously the last row of one entity ran into the first of the next.
    return "\n".join(lines)

# Full-text index query (unstructured data)
def text_structured_retriever(llm,graph,question: str) -> str:
    """
    Collect the neighborhood of entities mentioned in the question,
    using the ``unstructure`` full-text index.

    Entities are extracted from ``question`` with ``extract_question_info``.
    For each one, the ``unstructure`` index is queried (``limit:4``,
    ``score >= 0.2``) and the incoming and outgoing relationships of the
    matched nodes are rendered as ``source - TYPE -> target`` lines, capped
    at 50 rows per entity by the query's ``LIMIT``.

    Args:
        llm: Model handle forwarded to ``extract_question_info``.
        graph: Object exposing ``query(cypher, params)`` that returns an
            iterable of mappings with an ``output`` key.
        question: The user question.

    Returns:
        All relationship lines joined with newlines, or an empty string
        when nothing matched.
    """
    lines = []
    # Entities extracted from the question.
    for entity in extract_question_info(question, llm):
        if isinstance(entity, str) and not entity.strip():
            # A blank search string is not a usable full-text query; skip it.
            continue
        # Nodes matched in the graph must score at least 0.2.
        # NOTE(review): the raw entity is sent as the Lucene query text;
        # generate_full_text_query is not applied here.
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('unstructure', $query, {limit:4})
            YIELD node, score
            WHERE score >= 0.2
            // score 判断
            CALL {
              WITH node
              MATCH (node)-[r]->(neighbor)
              RETURN node.name + ' - ' + type(r) + ' -> ' + neighbor.name AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r]-(neighbor)
              RETURN neighbor.name + ' - ' + type(r) + ' -> ' +  node.name AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": entity},
        )
        # A row's output can be None (e.g. a node without a name), which
        # str.join cannot handle, so those rows are dropped.
        lines.extend(el["output"] for el in response if el["output"] is not None)
    # Joining once keeps a newline between the rows of consecutive entities;
    # previously the last row of one entity ran into the first of the next.
    return "\n".join(lines)