
Add support for pgvector; add a data-directory configuration option for the training data.

wangxq, 4 weeks ago
Commit a7a36b0e9a

+ 31 - 5
app_config.py

@@ -5,7 +5,9 @@ import os
 load_dotenv()
 
 # LLM model type ("qwen" or "deepseek")
-MODEL_TYPE = "qwen"
+LLM_MODEL_NAME = "qwen"
+# Vector database type: chromadb or pgvector
+VECTOR_DB_NAME = "pgvector"
 
 # DeepSeek model configuration
 DEEPSEEK_CONFIG = {
@@ -54,7 +56,31 @@ APP_DB_CONFIG = {
 # ChromaDB configuration
 # CHROMADB_PATH = "."  
 
-# Batch-processing configuration
-BATCH_PROCESSING_ENABLED = True
-BATCH_SIZE = 10
-MAX_WORKERS = 4
+# PgVector connection settings (the vector database, independent of the business database)
+PGVECTOR_CONFIG = {
+    "host": "192.168.67.1",
+    "port": 5432,
+    "dbname": "pgvector_db",
+    "user": os.getenv("PGVECTOR_DB_USER"),
+    "password": os.getenv("PGVECTOR_DB_PASSWORD")
+}
+
+# Batch-processing settings for the training script
+# These settings are only used by training/run_training.py to batch-optimize training
+TRAINING_BATCH_PROCESSING_ENABLED = True    # Whether to batch-process training data
+TRAINING_BATCH_SIZE = 10                    # Number of training items per batch
+TRAINING_MAX_WORKERS = 4                    # Maximum number of worker threads for training batches
+
+# Training data path
+# The following forms are supported:
+# 1. Relative paths (starting with .):
+#    "./training/data"     - training/data under the project root
+#    "../data"             - the data directory one level above the project root
+# 2. Absolute paths:
+#    "/home/user/data"     - Linux absolute path
+#    "C:/data"             - Windows absolute path
+#    "D:\\training\\data"  - Windows absolute path (escaped backslashes)
+# 3. Relative paths (not starting with .):
+#    "training/data"       - relative to the project root
+#    "my_data"             - the my_data folder under the project root
+TRAINING_DATA_PATH = "./training/data"
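For reference, both vanna_llm_factory.py and training/run_training.py (later in this commit) assemble a plain PostgreSQL URL from PGVECTOR_CONFIG. A minimal sketch of that assembly, reading the credentials from .env via load_dotenv() exactly as the config above does (host, port, and dbname are the placeholders from this commit):

```python
import os
from dotenv import load_dotenv

load_dotenv()  # PGVECTOR_DB_USER / PGVECTOR_DB_PASSWORD come from .env

# Same shape as PGVECTOR_CONFIG above.
pg_config = {
    "host": "192.168.67.1",
    "port": 5432,
    "dbname": "pgvector_db",
    "user": os.getenv("PGVECTOR_DB_USER"),
    "password": os.getenv("PGVECTOR_DB_PASSWORD"),
}

# The same URL format is used by both consumers of this config.
connection_string = (
    f"postgresql://{pg_config['user']}:{pg_config['password']}"
    f"@{pg_config['host']}:{pg_config['port']}/{pg_config['dbname']}"
)
print(connection_string)
```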

+ 1 - 0
custompgvector/__init__.py

@@ -0,0 +1 @@
+from .pgvector import PG_VectorStore

+ 254 - 0
custompgvector/pgvector.py

@@ -0,0 +1,254 @@
+import ast
+import json
+import logging
+import uuid
+
+import pandas as pd
+from langchain_core.documents import Document
+from langchain_postgres.vectorstores import PGVector
+from sqlalchemy import create_engine, text
+
+from vanna.exceptions import ValidationError
+from vanna.base import VannaBase
+from vanna.types import TrainingPlan, TrainingPlanItem
+
+
+class PG_VectorStore(VannaBase):
+    def __init__(self, config=None):
+        if not config or "connection_string" not in config:
+            raise ValueError(
+                "A valid 'config' dictionary with a 'connection_string' is required.")
+
+        VannaBase.__init__(self, config=config)
+
+        if config and "connection_string" in config:
+            self.connection_string = config.get("connection_string")
+            self.n_results = config.get("n_results", 10)
+
+        if config and "embedding_function" in config:
+            self.embedding_function = config.get("embedding_function")
+        else:
+            raise ValueError("No embedding_function was found.")
+            # from langchain_huggingface import HuggingFaceEmbeddings
+            # self.embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+        self.sql_collection = PGVector(
+            embeddings=self.embedding_function,
+            collection_name="sql",
+            connection=self.connection_string,
+        )
+        self.ddl_collection = PGVector(
+            embeddings=self.embedding_function,
+            collection_name="ddl",
+            connection=self.connection_string,
+        )
+        self.documentation_collection = PGVector(
+            embeddings=self.embedding_function,
+            collection_name="documentation",
+            connection=self.connection_string,
+        )
+
+    def add_question_sql(self, question: str, sql: str, **kwargs) -> str:
+        question_sql_json = json.dumps(
+            {
+                "question": question,
+                "sql": sql,
+            },
+            ensure_ascii=False,
+        )
+        id = str(uuid.uuid4()) + "-sql"
+        createdat = kwargs.get("createdat")
+        doc = Document(
+            page_content=question_sql_json,
+            metadata={"id": id, "createdat": createdat},
+        )
+        self.sql_collection.add_documents([doc], ids=[doc.metadata["id"]])
+
+        return id
+
+    def add_ddl(self, ddl: str, **kwargs) -> str:
+        _id = str(uuid.uuid4()) + "-ddl"
+        doc = Document(
+            page_content=ddl,
+            metadata={"id": _id},
+        )
+        self.ddl_collection.add_documents([doc], ids=[doc.metadata["id"]])
+        return _id
+
+    def add_documentation(self, documentation: str, **kwargs) -> str:
+        _id = str(uuid.uuid4()) + "-doc"
+        doc = Document(
+            page_content=documentation,
+            metadata={"id": _id},
+        )
+        self.documentation_collection.add_documents([doc], ids=[doc.metadata["id"]])
+        return _id
+
+    def get_collection(self, collection_name):
+        match collection_name:
+            case "sql":
+                return self.sql_collection
+            case "ddl":
+                return self.ddl_collection
+            case "documentation":
+                return self.documentation_collection
+            case _:
+                raise ValueError("Specified collection does not exist.")
+
+    def get_similar_question_sql(self, question: str) -> list:
+        documents = self.sql_collection.similarity_search(query=question, k=self.n_results)
+        return [ast.literal_eval(document.page_content) for document in documents]
+
+    def get_related_ddl(self, question: str, **kwargs) -> list:
+        documents = self.ddl_collection.similarity_search(query=question, k=self.n_results)
+        return [document.page_content for document in documents]
+
+    def get_related_documentation(self, question: str, **kwargs) -> list:
+        documents = self.documentation_collection.similarity_search(query=question, k=self.n_results)
+        return [document.page_content for document in documents]
+
+    def train(
+        self,
+        question: str | None = None,
+        sql: str | None = None,
+        ddl: str | None = None,
+        documentation: str | None = None,
+        plan: TrainingPlan | None = None,
+        createdat: str | None = None,
+    ):
+        if question and not sql:
+            raise ValidationError("Please provide a SQL query.")
+
+        if documentation:
+            logging.info(f"Adding documentation: {documentation}")
+            return self.add_documentation(documentation)
+
+        if sql and question:
+            return self.add_question_sql(question=question, sql=sql, createdat=createdat)
+
+        if ddl:
+            logging.info(f"Adding ddl: {ddl}")
+            return self.add_ddl(ddl)
+
+        if plan:
+            for item in plan._plan:
+                if item.item_type == TrainingPlanItem.ITEM_TYPE_DDL:
+                    self.add_ddl(item.item_value)
+                elif item.item_type == TrainingPlanItem.ITEM_TYPE_IS:
+                    self.add_documentation(item.item_value)
+                elif item.item_type == TrainingPlanItem.ITEM_TYPE_SQL and item.item_name:
+                    self.add_question_sql(question=item.item_name, sql=item.item_value)
+
+    def get_training_data(self, **kwargs) -> pd.DataFrame:
+        # Establishing the connection
+        engine = create_engine(self.connection_string)
+
+        # Querying the 'langchain_pg_embedding' table
+        query_embedding = "SELECT cmetadata, document FROM langchain_pg_embedding"
+        df_embedding = pd.read_sql(query_embedding, engine)
+
+        # List to accumulate the processed rows
+        processed_rows = []
+
+        # Process each row in the DataFrame
+        for _, row in df_embedding.iterrows():
+            custom_id = row["cmetadata"]["id"]
+            document = row["document"]
+            training_data_type = "documentation" if custom_id[-3:] == "doc" else custom_id[-3:]
+
+            if training_data_type == "sql":
+                # Convert the document string to a dictionary
+                try:
+                    doc_dict = ast.literal_eval(document)
+                    question = doc_dict.get("question")
+                    content = doc_dict.get("sql")
+                except (ValueError, SyntaxError):
+                    logging.info(f"Skipping row with custom_id {custom_id} due to parsing error.")
+                    continue
+            elif training_data_type in ["documentation", "ddl"]:
+                question = None  # Default value for question
+                content = document
+            else:
+                # If the suffix is not recognized, skip this row
+                logging.info(f"Skipping row with custom_id {custom_id} due to unrecognized training data type.")
+                continue
+
+            # Append the processed data to the list
+            processed_rows.append(
+                {"id": custom_id, "question": question, "content": content, "training_data_type": training_data_type}
+            )
+
+        # Create a DataFrame from the list of processed rows
+        df_processed = pd.DataFrame(processed_rows)
+
+        return df_processed
+
+    def remove_training_data(self, id: str, **kwargs) -> bool:
+        # Create the database engine
+        engine = create_engine(self.connection_string)
+
+        # SQL DELETE statement
+        delete_statement = text(
+            """
+            DELETE FROM langchain_pg_embedding
+            WHERE cmetadata ->> 'id' = :id
+        """
+        )
+
+        # Connect to the database and execute the delete statement
+        with engine.connect() as connection:
+            # Start a transaction
+            with connection.begin() as transaction:
+                try:
+                    result = connection.execute(delete_statement, {"id": id})
+                    # Commit the transaction if the delete was successful
+                    transaction.commit()
+                    # Check if any row was deleted and return True or False accordingly
+                    return result.rowcount > 0
+                except Exception as e:
+                    # Rollback the transaction in case of error
+                    logging.error(f"An error occurred: {e}")
+                    transaction.rollback()
+                    return False
+
+    def remove_collection(self, collection_name: str) -> bool:
+        engine = create_engine(self.connection_string)
+
+        # Determine the suffix to look for based on the collection name
+        suffix_map = {"ddl": "ddl", "sql": "sql", "documentation": "doc"}
+        suffix = suffix_map.get(collection_name)
+
+        if not suffix:
+            logging.info("Invalid collection name. Choose from 'ddl', 'sql', or 'documentation'.")
+            return False
+
+        # SQL query to delete rows based on the condition
+        query = text(
+            f"""
+            DELETE FROM langchain_pg_embedding
+            WHERE cmetadata->>'id' LIKE '%{suffix}'
+        """
+        )
+
+        # Execute the deletion within a transaction block
+        with engine.connect() as connection:
+            with connection.begin() as transaction:
+                try:
+                    result = connection.execute(query)
+                    transaction.commit()  # Explicitly commit the transaction
+                    if result.rowcount > 0:
+                        logging.info(
+                            f"Deleted {result.rowcount} rows from "
+                            f"langchain_pg_embedding where collection is {collection_name}."
+                        )
+                        return True
+                    else:
+                        logging.info(f"No rows deleted for collection {collection_name}.")
+                        return False
+                except Exception as e:
+                    logging.error(f"An error occurred: {e}")
+                    transaction.rollback()  # Rollback in case of error
+                    return False
+
+    def generate_embedding(self, *args, **kwargs):
+        pass
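A minimal sketch of the id convention the class relies on: every stored document gets a UUID plus a `-sql`, `-ddl`, or `-doc` suffix, and `get_training_data()` / `remove_collection()` later map that suffix back to a training data type.

```python
import uuid

def classify(custom_id: str) -> str:
    # Mirrors the suffix check in get_training_data(): "doc" maps to
    # "documentation", everything else is used as-is ("sql", "ddl").
    suffix = custom_id[-3:]
    return "documentation" if suffix == "doc" else suffix

for kind in ("sql", "ddl", "doc"):
    doc_id = f"{uuid.uuid4()}-{kind}"
    print(doc_id, "->", classify(doc_id))
```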

+ 53 - 0
docs/pgvector.md

@@ -0,0 +1,53 @@
+## Using PgVector as the vector store
+
+### 1. The tables below are created automatically by langchain; you can use them as a reference when designing your own schema.
+
+```sql
+-- Create the embedding table
+create table public.langchain_pg_embedding
+(
+    id            varchar not null
+        primary key,
+    collection_id uuid
+        references public.langchain_pg_collection
+            on delete cascade,
+    embedding     vector,
+    document      varchar,
+    cmetadata     jsonb
+);
+
+alter table public.langchain_pg_embedding
+    owner to postgres;
+
+create index ix_cmetadata_gin
+    on public.langchain_pg_embedding using gin (cmetadata jsonb_path_ops);
+
+-- Create the collection table
+create table public.langchain_pg_collection
+(
+    uuid      uuid    not null
+        primary key,
+    name      varchar not null
+        unique,
+    cmetadata json
+);
+
+alter table public.langchain_pg_collection
+    owner to postgres;
+
+
+```
+
+### 2. To make testing easier, I drop the foreign key on the embedding table.
+
+```sql  
+alter table public.langchain_pg_embedding
+    drop constraint langchain_pg_embedding_collection_id_fkey;
+```
+
+
+
+
+
+
+

+ 147 - 0
docs/run_training说明.md

@@ -0,0 +1,147 @@
+## File extensions and their handler functions
+
+### File-type detection order
+The code checks the file type in the following order:
+
+1. **`.ddl`** → DDL file
+2. **`.md` or `.markdown`** → documentation file  
+3. **`_pair.json` or `_pairs.json`** → JSON question-SQL pair file
+4. **`_pair.sql` or `_pairs.sql`** → formatted question-SQL pair file
+5. **`.sql` (not ending in `_pair.sql` or `_pairs.sql`)** → SQL example file
+6. **anything else** → skipped
+
+### 1. **DDL files** (`.ddl`)
+- **Handler**: `train_ddl_statements()`
+- **Training function called**: `train_ddl()`
+- **File format**: 
+  - Semicolons (`;`) are used as separators
+  - Individual DDL statements are separated by semicolons
+  - Example:
+    ```sql
+    CREATE TABLE users (
+        id INT PRIMARY KEY,
+        name VARCHAR(100)
+    );
+    CREATE TABLE orders (
+        id INT PRIMARY KEY,
+        user_id INT REFERENCES users(id)
+    );
+    ```
+
+### 2. **Documentation files** (`.md`, `.markdown`)
+- **Handler**: `train_documentation_blocks()`
+- **Training function called**: `train_documentation()`
+- **File format**:
+  - **Markdown files**: split automatically by heading level (`#`, `##`, `###`)
+  - **Non-Markdown files**: `---` is used as the separator
+  - Example:
+    ```markdown
+    # Users table
+    The users table stores basic information about every user in the system...
+    
+    ## Fields
+    - id: unique user identifier
+    - name: user name
+    
+    ### Notes
+    User names must be unique...
+    ```
+
+### 3. **SQL example files** (`.sql`, excluding `_pair.sql` and `_pairs.sql`)
+- **Handler**: `train_sql_examples()`
+- **Training function called**: `train_sql_example()`
+- **File format**:
+  - Semicolons (`;`) are used as separators
+  - Individual SQL examples are separated by semicolons
+  - Example:
+    ```sql
+    SELECT * FROM users WHERE age > 18;
+    SELECT COUNT(*) FROM orders WHERE status = 'completed';
+    SELECT u.name, COUNT(o.id) FROM users u LEFT JOIN orders o ON u.id = o.user_id GROUP BY u.id;
+    ```
+
+### 4. **Formatted question-SQL pair files** (`_pair.sql`, `_pairs.sql`)
+- **Handler**: `train_formatted_question_sql_pairs()`
+- **Training function called**: `train_question_sql_pair()`
+- **File format**:
+  - Uses `Question:` and `SQL:` markers
+  - Pairs are separated by two blank lines
+  - Both single-line and multi-line SQL are supported
+  - Example:
+    ```
+    Question: Find all adult users
+    SQL: SELECT * FROM users WHERE age >= 18;
+
+    Question: Count the number of orders per user
+    SQL: 
+    SELECT u.name, COUNT(o.id) as order_count
+    FROM users u 
+    LEFT JOIN orders o ON u.id = o.user_id 
+    GROUP BY u.id, u.name;
+    ```
+
+### 5. **JSON question-SQL pair files** (`_pair.json`, `_pairs.json`)
+- **Handler**: `train_json_question_sql_pairs()`
+- **Training function called**: `train_question_sql_pair()`
+- **File format**:
+  - A standard JSON array
+  - Each object contains a `question` and a `sql` field
+  - Example:
+    ```json
+    [
+        {
+            "question": "Find all adult users",
+            "sql": "SELECT * FROM users WHERE age >= 18"
+        },
+        {
+            "question": "Count the number of orders per user",
+            "sql": "SELECT u.name, COUNT(o.id) as order_count FROM users u LEFT JOIN orders o ON u.id = o.user_id GROUP BY u.id, u.name"
+        }
+    ]
+    ```
+
+### 6. **Legacy question-SQL pair files** (other formats, handled by `train_question_sql_pairs()`)
+- **Handler**: `train_question_sql_pairs()`
+- **Training function called**: `train_question_sql_pair()`
+- **File format**:
+  - One question-SQL pair per line
+  - `::` separates the question from the SQL
+  - Example:
+    ```
+    Find all adult users::SELECT * FROM users WHERE age >= 18
+    Count all orders::SELECT COUNT(*) FROM orders
+    ```
+
+
+
+## Statistics
+
+After training finishes, the following counts are reported:
+- Number of DDL files
+- Number of documentation files  
+- Number of SQL example files
+- Number of formatted question-SQL pair files
+- Number of JSON question-SQL pair files
+
+This design lets the training system flexibly handle training data in several different formats, covering the data-preparation needs of different scenarios.
+
+
+# Batch-processing settings for the training script
+# These settings are only used by training/run_training.py to batch-optimize training
+# Batching improves training throughput but increases memory use and complexity
+# 
+# TRAINING_BATCH_PROCESSING_ENABLED: 
+#   - True: enable batching; several training items are packed and processed together
+#   - False: process items one at a time (more stable, but slower)
+# 
+# TRAINING_BATCH_SIZE: number of training items per batch
+#   - larger values: faster, but use more memory
+#   - smaller values: less memory, but slower
+#   - recommended range: 5-20
+# 
+# TRAINING_MAX_WORKERS: maximum number of worker threads for training batches
+#   - recommended: 1-2x the number of CPU cores
+#   - too many threads can cause resource contention
+TRAINING_BATCH_PROCESSING_ENABLED = True    # Whether to batch-process training data
+TRAINING_BATCH_SIZE = 10                    # Number of training items per batch
+TRAINING_MAX_WORKERS = 4                    # Maximum number of worker threads for training batches
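The detection order listed at the top of this document can be condensed into a small dispatcher sketch. This is an illustration only, assuming simple filename tests; the real routing lives in training/run_training.py:

```python
def classify_training_file(filename: str) -> str:
    """Return the handler a file would be routed to, following the
    priority order described above (a sketch, not the actual code)."""
    name = filename.lower()
    if name.endswith(".ddl"):
        return "train_ddl_statements"
    if name.endswith((".md", ".markdown")):
        return "train_documentation_blocks"
    if name.endswith(("_pair.json", "_pairs.json")):
        return "train_json_question_sql_pairs"
    if name.endswith(("_pair.sql", "_pairs.sql")):
        return "train_formatted_question_sql_pairs"
    if name.endswith(".sql"):
        return "train_sql_examples"
    return "skipped"

# Hypothetical filenames, used only to show the routing.
for f in ["tables.ddl", "schema.md", "qa_pairs.json", "qa_pair.sql", "examples.sql", "notes.txt"]:
    print(f, "->", classify_training_file(f))
```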

+ 99 - 0
docs/training_path_examples.md

@@ -0,0 +1,99 @@
+# Training data path configuration examples
+
+In `app_config.py`, you can point the training script at your training data by changing `TRAINING_DATA_PATH`.
+
+## Supported forms
+
+### 1. Relative paths (starting with .)
+```python
+# The training/data folder under the project root
+TRAINING_DATA_PATH = "./training/data"
+
+# The my_training_data folder under the project root
+TRAINING_DATA_PATH = "./my_training_data"
+
+# The data folder one level above the project root
+TRAINING_DATA_PATH = "../data"
+
+# The training_files folder one level above the project root
+TRAINING_DATA_PATH = "../training_files"
+```
+
+### 2. Absolute paths
+
+#### Linux/Mac
+```python
+# Linux absolute path
+TRAINING_DATA_PATH = "/home/username/training_data"
+
+# Mac absolute path
+TRAINING_DATA_PATH = "/Users/username/Documents/training_data"
+```
+
+#### Windows
+```python
+# Windows absolute path (forward slashes)
+TRAINING_DATA_PATH = "C:/training_data"
+TRAINING_DATA_PATH = "D:/Projects/my_training_data"
+
+# Windows absolute path (backslashes must be escaped)
+TRAINING_DATA_PATH = "C:\\training_data"
+TRAINING_DATA_PATH = "D:\\Projects\\my_training_data"
+```
+
+### 3. Relative paths (not starting with .)
+```python
+# Relative to the project root
+TRAINING_DATA_PATH = "training/data"      # same as "./training/data"
+TRAINING_DATA_PATH = "my_data"            # same as "./my_data"
+TRAINING_DATA_PATH = "data/training"      # same as "./data/training"
+```
+
+## Usage examples
+
+### Default
+```python
+# Use the project's default training data directory
+TRAINING_DATA_PATH = "./training/data"
+```
+
+### Custom local directory
+```python
+# Use a custom folder under the project root
+TRAINING_DATA_PATH = "./my_training_files"
+```
+
+### External directory
+```python
+# Linux/Mac
+TRAINING_DATA_PATH = "/home/user/Documents/sql_training_data"
+
+# Windows
+TRAINING_DATA_PATH = "D:/SQL_Training_Data"
+```
+
+## Command-line override
+
+Even when a path is set in the config file, you can still override it temporarily on the command line:
+
+```bash
+# Use the path from the config file
+python training/run_training.py
+
+# Temporarily use a different path
+python training/run_training.py --data_path "./custom_data"
+python training/run_training.py --data_path "/absolute/path/to/data"
+python training/run_training.py --data_path "C:/Windows/Path/To/Data"
+```
+
+## Path verification
+
+When the training script runs, it prints how the path was resolved:
+```
+===== Training data path configuration =====
+Path in config file: ./training/data
+Resolved absolute path: /full/path/to/project/training/data
+==============================
+```
+
+This lets you confirm that the path was resolved correctly.
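The resolution rules above mirror the resolve_training_data_path() helper that this commit adds to training/run_training.py; condensed into a sketch:

```python
import os

def resolve_training_data_path(config_path: str, project_root: str) -> str:
    """Condensed sketch of the logic in training/run_training.py."""
    if os.path.isabs(config_path):
        return config_path  # absolute paths are used as-is
    # "./data"-style and bare relative paths both resolve against the project root
    return os.path.join(project_root, config_path)

print(resolve_training_data_path("./training/data", "/opt/project"))
print(resolve_training_data_path("/home/user/data", "/opt/project"))
```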

+ 23 - 0
embedding_function.py

@@ -81,6 +81,29 @@ class EmbeddingFunction:
                 
         return embeddings
     
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate embedding vectors for a list of documents (LangChain interface)
+        
+        Args:
+            texts: the documents to embed
+            
+        Returns:
+            List[List[float]]: the list of embedding vectors
+        """
+        return self.__call__(texts)
+    
+    def embed_query(self, text: str) -> List[float]:
+        """
+        Generate an embedding vector for a query string (LangChain interface)
+        
+        Args:
+            text: the query text to embed
+            
+        Returns:
+            List[float]: the embedding vector
+        """
+        return self.generate_embedding(text)
     
     def generate_embedding(self, text: str) -> List[float]:
         """

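The two methods added above are what langchain_postgres.PGVector calls on the embeddings object; since the project's EmbeddingFunction does not subclass LangChain's Embeddings, the commit appears to rely on duck typing, so any object exposing embed_documents() and embed_query() would work. A self-contained illustration of that shape (FakeEmbeddings is hypothetical, used only to show the interface):

```python
from typing import List

class FakeEmbeddings:
    """Hypothetical stand-in; the project passes its EmbeddingFunction instead."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # One vector per input document.
        return [[float(len(t)), 0.0] for t in texts]

    def embed_query(self, text: str) -> List[float]:
        # A single vector for the query.
        return [float(len(text)), 0.0]

emb = FakeEmbeddings()
print(emb.embed_documents(["hello", "world!"]))
print(emb.embed_query("hello"))
```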
+ 2 - 0
requirements.txt

@@ -1,3 +1,5 @@
 vanna[chromadb,openai,postgres]==0.7.9
 flask==3.1.1
 plotly==5.22.0
+langchain-core==0.3.64
+langchain-postgres==0.0.14

+ 153 - 33
training/run_training.py

@@ -393,6 +393,67 @@ def process_training_files(data_path):
         
     return True
 
+def check_pgvector_connection():
+    """Check whether the PgVector database is reachable
+    
+    Returns:
+        bool: True if the connection succeeds, otherwise False
+    """
+    import app_config
+    from sqlalchemy import create_engine, text
+    
+    try:
+        # Build the connection string
+        pg_config = app_config.PGVECTOR_CONFIG
+        connection_string = f"postgresql://{pg_config['user']}:{pg_config['password']}@{pg_config['host']}:{pg_config['port']}/{pg_config['dbname']}"
+        
+        print(f"Testing the PgVector database connection...")
+        print(f"Connection target: {pg_config['host']}:{pg_config['port']}/{pg_config['dbname']}")
+        
+        # Create the engine and test the connection
+        engine = create_engine(connection_string)
+        
+        with engine.connect() as connection:
+            # Basic connectivity test
+            result = connection.execute(text("SELECT 1"))
+            result.fetchone()
+            
+            # Check whether the pgvector extension is installed
+            try:
+                result = connection.execute(text("SELECT extname FROM pg_extension WHERE extname = 'vector'"))
+                extension_exists = result.fetchone() is not None
+                
+                if extension_exists:
+                    print("✓ pgvector extension is installed")
+                else:
+                    print("⚠ Warning: the pgvector extension is not installed; please install it first")
+                    
+            except Exception as ext_e:
+                print(f"⚠ Could not check the pgvector extension status: {ext_e}")
+            
+            # Check whether the training data table exists
+            try:
+                result = connection.execute(text("SELECT tablename FROM pg_tables WHERE tablename = 'langchain_pg_embedding'"))
+                table_exists = result.fetchone() is not None
+                
+                if table_exists:
+                    # Count the rows in the table
+                    result = connection.execute(text("SELECT COUNT(*) FROM langchain_pg_embedding"))
+                    count = result.fetchone()[0]
+                    print(f"✓ Training data table exists and currently holds {count} rows")
+                else:
+                    print("ℹ Training data table has not been created yet (it is created automatically on the first training run)")
+                    
+            except Exception as table_e:
+                print(f"⚠ Could not check the training data table status: {table_e}")
+        
+        print("✓ PgVector database connection test succeeded")
+        return True
+        
+    except Exception as e:
+        print(f"✗ PgVector database connection failed: {e}")
+        return False
+
 def main():
     """Main entry point: configure and run the training pipeline"""
     
@@ -402,43 +463,90 @@ def main():
     
     # Parse command-line arguments
 parser = argparse.ArgumentParser(description='Train the Vanna NL2SQL model')
-    parser.add_argument('--data_path', type=str, default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data'),
-                        help='Training data directory (default: training/data)')
+    
+    # Get the default path and resolve it
+    def resolve_training_data_path():
+        """Resolve the training data path from configuration"""
+        config_path = getattr(app_config, 'TRAINING_DATA_PATH', './training/data')
+        
+        # Absolute paths are returned unchanged
+        if os.path.isabs(config_path):
+            return config_path
+        
+        # Paths starting with . are resolved against the project root
+        if config_path.startswith('./') or config_path.startswith('../'):
+            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+            return os.path.join(project_root, config_path)
+        
+        # Anything else is also resolved against the project root
+        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        return os.path.join(project_root, config_path)
+    
+    default_path = resolve_training_data_path()
+    
+    parser.add_argument('--data_path', type=str, default=default_path,
+                        help='Training data directory (default: from app_config.TRAINING_DATA_PATH)')
     args = parser.parse_args()
     
     # Use a Path object for cross-platform path handling
     data_path = Path(args.data_path)
     
+    # Show how the path was resolved
+    print(f"\n===== Training data path configuration =====")
+    print(f"Path in config file: {getattr(app_config, 'TRAINING_DATA_PATH', 'not configured')}")
+    print(f"Resolved absolute path: {os.path.abspath(data_path)}")
+    print("==============================")
+    
     # Set the project root directory
     project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
     # Check the embedding model connection
     check_embedding_model_connection()
     
-    # Print ChromaDB information
-    try:
+    # Show information for the configured vector database type
+    vector_db_type = app_config.VECTOR_DB_NAME.lower()
+    
+    if vector_db_type == "chromadb":
+        # Print ChromaDB information
         try:
-            import chromadb
-            chroma_version = chromadb.__version__
-        except ImportError:
-            chroma_version = "unknown"
-        
-        # Try to locate the ChromaDB file currently in use
-        chroma_file = "chroma.sqlite3"  # default file name
-        
-        # Use the project root as the ChromaDB file path
-        db_file_path = os.path.join(project_root, chroma_file)
+            try:
+                import chromadb
+                chroma_version = chromadb.__version__
+            except ImportError:
+                chroma_version = "unknown"
+            
+            # Try to locate the ChromaDB file currently in use
+            chroma_file = "chroma.sqlite3"  # default file name
+            
+            # Use the project root as the ChromaDB file path
+            db_file_path = os.path.join(project_root, chroma_file)
 
-        if os.path.exists(db_file_path):
-            file_size = os.path.getsize(db_file_path) / 1024  # KB
-            print(f"\n===== ChromaDB database: {os.path.abspath(db_file_path)} (size: {file_size:.2f} KB) =====")
-        else:
-            print(f"\n===== ChromaDB database file not found at: {os.path.abspath(db_file_path)} =====")
+            if os.path.exists(db_file_path):
+                file_size = os.path.getsize(db_file_path) / 1024  # KB
+                print(f"\n===== ChromaDB database: {os.path.abspath(db_file_path)} (size: {file_size:.2f} KB) =====")
+            else:
+                print(f"\n===== ChromaDB database file not found at: {os.path.abspath(db_file_path)} =====")
+                
+            # Print the ChromaDB version
+            print(f"===== ChromaDB client library version: {chroma_version} =====\n")
+        except Exception as e:
+            print(f"\n===== Could not obtain ChromaDB information: {e} =====\n")
             
-        # Print the ChromaDB version
-        print(f"===== ChromaDB client library version: {chroma_version} =====\n")
-    except Exception as e:
-        print(f"\n===== Could not obtain ChromaDB information: {e} =====\n")
+    elif vector_db_type == "pgvector":
+        # Print PgVector information and test the connection
+        print(f"\n===== PgVector database configuration =====")
+        pg_config = app_config.PGVECTOR_CONFIG
+        print(f"Database address: {pg_config['host']}:{pg_config['port']}")
+        print(f"Database name: {pg_config['dbname']}")
+        print(f"User: {pg_config['user']}")
+        print("==============================\n")
+        
+        # Test the PgVector connection
+        if not check_pgvector_connection():
+            print("PgVector database connection failed; training aborted.")
+            sys.exit(1)
+    else:
+        print(f"\n===== Unknown vector database type: {vector_db_type} =====\n")
     
     # Process the training files
     process_successful = process_training_files(data_path)
@@ -455,20 +563,26 @@ def main():
         vn = create_vanna_instance()
         
         # Run different verification logic depending on the vector database type
-        # Simplified since only ChromaDB is in use
         try:
             training_data = vn.get_training_data()
             if training_data is not None and not training_data.empty:
-                # get_training_data usually prints the count itself; add a summary here
-                print(f"Retrieved {len(training_data)} training records from ChromaDB for verification.")
+                print(f"✓ Retrieved {len(training_data)} training records from {vector_db_type.upper()} for verification.")
+                
+                # Show counts per training data type
+                if 'training_data_type' in training_data.columns:
+                    type_counts = training_data['training_data_type'].value_counts()
+                    print("Training data type counts:")
+                    for data_type, count in type_counts.items():
+                        print(f"  {data_type}: {count} records")
+                        
             elif training_data is not None and training_data.empty:
-                 print("No training data was found in ChromaDB.")
+                print(f"⚠ No training data was found in {vector_db_type.upper()}.")
             else: # training_data is None
-                print("Could not get training data from Vanna (it may have returned None). Check the connection and the Vanna implementation.")
+                print(f"Could not get training data from Vanna (it may have returned None). Check the {vector_db_type.upper()} connection and the Vanna implementation.")
 
         except Exception as e:
-            print(f"Training data verification failed: {e}")
-            print("Check the ChromaDB connection and table structure.")
+            print(f"Training data verification failed: {e}")
+            print(f"Check the {vector_db_type.upper()} connection and table structure.")
     else:
         print("\n===== No training files were found or processed; training aborted =====")
     
@@ -477,9 +591,15 @@ def main():
     print(f"Model name: {app_config.EMBEDDING_CONFIG.get('model_name')}")
     print(f"Embedding dimension: {app_config.EMBEDDING_CONFIG.get('embedding_dimension')}")
     print(f"API service: {app_config.EMBEDDING_CONFIG.get('base_url')}")
-    # Print the ChromaDB path
-    chroma_display_path = os.path.abspath(project_root)
-    print(f"Vector database: ChromaDB ({chroma_display_path})")
+    
+    # Show vector database info based on the configuration
+    if vector_db_type == "chromadb":
+        chroma_display_path = os.path.abspath(project_root)
+        print(f"Vector database: ChromaDB ({chroma_display_path})")
+    elif vector_db_type == "pgvector":
+        pg_config = app_config.PGVECTOR_CONFIG
+        print(f"Vector database: PgVector ({pg_config['host']}:{pg_config['port']}/{pg_config['dbname']})")
+    
     print("===== Training pipeline complete =====\n")
 
 if __name__ == "__main__":

+ 11 - 9
training/vanna_trainer.py

@@ -31,13 +31,14 @@ if hasattr(app_config, 'EMBEDDING_CONFIG'):
         print(f"API service: {app_config.EMBEDDING_CONFIG['base_url']}")
 print("==============================")
 
-# Get other settings from app_config
-BATCH_PROCESSING_ENABLED = app_config.BATCH_PROCESSING_ENABLED
-BATCH_SIZE = app_config.BATCH_SIZE
-MAX_WORKERS = app_config.MAX_WORKERS
+# Get the training batch-processing settings from app_config
+BATCH_PROCESSING_ENABLED = app_config.TRAINING_BATCH_PROCESSING_ENABLED
+BATCH_SIZE = app_config.TRAINING_BATCH_SIZE
+MAX_WORKERS = app_config.TRAINING_MAX_WORKERS
 
 
-# Data batch processor
+# Training data batch processor
+# A batch processor dedicated to the training flow; it groups training items together to improve throughput
 class BatchProcessor:
     def __init__(self, batch_size=BATCH_SIZE, max_workers=MAX_WORKERS):
         self.batch_size = batch_size
@@ -51,7 +52,7 @@ class BatchProcessor:
         # Whether batching is enabled
         self.batch_enabled = BATCH_PROCESSING_ENABLED       
 
-        print(f"[DEBUG] Batch processor initialized: enabled={self.batch_enabled}, batch size={self.batch_size}, max workers={self.max_workers}")
+        print(f"[DEBUG] Training batch processor initialized: enabled={self.batch_enabled}, batch size={self.batch_size}, max workers={self.max_workers}")
     
     def add_item(self, batch_type: str, item: Dict[str, Any]):
         """Add an item to the batch queue"""
@@ -152,15 +153,16 @@ class BatchProcessor:
             # Clear the queues
             self.batches = defaultdict(list)
         
-        print("[INFO] All batch items completed")
+        print("[INFO] All training batch items completed")
     
     def shutdown(self):
         """Shut down the processor and the thread pool"""
         self.flush_all()
         self.executor.shutdown(wait=True)
-        print("[INFO] Batch processor shut down")
+        print("[INFO] Training batch processor shut down")
 
-# Create the global batch processor instance
+# Create the global training batch processor instance
+# Used by all training functions for batch optimization
 batch_processor = BatchProcessor()
 
 # Batch-enhanced versions of the original training functions

+ 48 - 37
vanna_llm_factory.py

@@ -1,12 +1,12 @@
 """
 Vanna LLM factory, focused on ChromaDB with a simplified configuration.
 """
+import app_config, os
 from vanna.chromadb import ChromaDB_VectorStore  # 从 Vanna 系统获取
 from customqianwen.Custom_QianwenAI_chat import QianWenAI_Chat
 from customdeepseek.custom_deepseek_chat import DeepSeekChat
-import app_config 
 from embedding_function import get_embedding_function
-import os
+from custompgvector import PG_VectorStore
 
 class Vanna_Qwen_ChromaDB(ChromaDB_VectorStore, QianWenAI_Chat):
     def __init__(self, config=None):
@@ -18,6 +18,24 @@ class Vanna_DeepSeek_ChromaDB(ChromaDB_VectorStore, DeepSeekChat):
         ChromaDB_VectorStore.__init__(self, config=config)
         DeepSeekChat.__init__(self, config=config)
 
+class Vanna_Qwen_PGVector(PG_VectorStore, QianWenAI_Chat):
+    def __init__(self, config=None):
+        PG_VectorStore.__init__(self, config=config)
+        QianWenAI_Chat.__init__(self, config=config)
+
+class Vanna_DeepSeek_PGVector(PG_VectorStore, DeepSeekChat):
+    def __init__(self, config=None):
+        PG_VectorStore.__init__(self, config=config)
+        DeepSeekChat.__init__(self, config=config)
+
+# LLM / vector-store combination map
+LLM_VECTOR_DB_MAP = {
+    ('deepseek', 'chromadb'): Vanna_DeepSeek_ChromaDB,
+    ('deepseek', 'pgvector'): Vanna_DeepSeek_PGVector,
+    ('qwen', 'chromadb'): Vanna_Qwen_ChromaDB,
+    ('qwen', 'pgvector'): Vanna_Qwen_PGVector,
+}
+
 def create_vanna_instance(config_module=None):
     """
     Factory function: create and initialize a Vanna instance (LLM- and ChromaDB-specific version)
@@ -31,55 +49,48 @@ def create_vanna_instance(config_module=None):
     if config_module is None:
         config_module = app_config
 
-    model_type = config_module.MODEL_TYPE.lower()
+    llm_model_name  = config_module.LLM_MODEL_NAME.lower()
+    vector_db_name = config_module.VECTOR_DB_NAME.lower()   
+
+    if (llm_model_name, vector_db_name) not in LLM_VECTOR_DB_MAP:
+        raise ValueError(f"Unsupported model type: {llm_model_name} or vector database type: {vector_db_name}")
     
     config = {}
-    if model_type == "deepseek":
+    if llm_model_name == "deepseek":
         config = config_module.DEEPSEEK_CONFIG.copy()
-        print(f"Creating DeepSeek model instance, model: {config['model']}")
-        # Check the API key
-        if not config.get("api_key"):
-            print(f"\nError: the DeepSeek API key is not set or is empty")
-            print(f"Please set the DEEPSEEK_API_KEY environment variable in the .env file")
-            print(f"Cannot continue, exiting\n")
-            import sys
-            sys.exit(1)
-    elif model_type == "qwen":
+        print(f"Creating DeepSeek model instance, model: {config.get('model', 'deepseek-chat')}")
+    elif llm_model_name == "qwen":
         config = config_module.QWEN_CONFIG.copy()
-        print(f"Creating Qwen model instance, model: {config['model']}")
-        # Check the API key
-        if not config.get("api_key"):
-            print(f"\nError: the Qwen API key is not set or is empty")
-            print(f"Please set the QWEN_API_KEY environment variable in the .env file")
-            print(f"Cannot continue, exiting\n")
-            import sys
-            sys.exit(1)
+        print(f"Creating Qwen model instance, model: {config.get('model', 'qwen-plus-latest')}")
     else:
-        raise ValueError(f"Unsupported model type: {model_type}") 
+        raise ValueError(f"Unsupported model type: {llm_model_name}") 
+    
+    if vector_db_name == "chromadb":
+        config["path"] = os.path.dirname(os.path.abspath(__file__))
+        print(f"Configured ChromaDB as the vector database, path: {config['path']}")
+    elif vector_db_name == "pgvector":
+        # Build the PostgreSQL connection string
+        pg_config = config_module.PGVECTOR_CONFIG
+        connection_string = f"postgresql://{pg_config['user']}:{pg_config['password']}@{pg_config['host']}:{pg_config['port']}/{pg_config['dbname']}"
+        config["connection_string"] = connection_string
+        print(f"Configured PgVector as the vector database, connection string: {connection_string}")
+    else:
+        raise ValueError(f"Unsupported vector database type: {vector_db_name}")    
     
     embedding_function = get_embedding_function()
 
     config["embedding_function"] = embedding_function
     print(f"Configured the embedding model from EMBEDDING_CONFIG: {config_module.EMBEDDING_CONFIG['model_name']}, dimension: {config_module.EMBEDDING_CONFIG['embedding_dimension']}")
     
-    # Set the ChromaDB path to the project root
-    project_root = os.path.dirname(os.path.abspath(__file__))
-    config["path"] = project_root
-    print(f"Configured ChromaDB as the vector database, path: {project_root}")
-    
-    vn = None
-    if model_type == "deepseek":
-        vn = Vanna_DeepSeek_ChromaDB(config=config)
-        print("Created DeepSeek+ChromaDB instance")
-    elif model_type == "qwen":
-        vn = Vanna_Qwen_ChromaDB(config=config)
-        print("Created Qwen+ChromaDB instance")
+    key = (llm_model_name, vector_db_name)
+    cls = LLM_VECTOR_DB_MAP.get(key)
+    if cls is None:
+        raise ValueError(f"Unsupported combination: model type={llm_model_name}, vector database type={vector_db_name}")
     
-    if vn is None:
-        raise ValueError(f"Could not create a Vanna instance; unsupported model type: {model_type}")
+    vn = cls(config=config)
 
     vn.connect_to_postgres(**config_module.APP_DB_CONFIG)           
-    print(f"Connected to the business database: "
+    print(f"Connected to the PostgreSQL business database: "
           f"{config_module.APP_DB_CONFIG['host']}:"
           f"{config_module.APP_DB_CONFIG['port']}/"
           f"{config_module.APP_DB_CONFIG['dbname']}")
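A usage sketch of the updated factory, assuming app_config.py sets LLM_MODEL_NAME to "qwen" and VECTOR_DB_NAME to "pgvector", and that both the PgVector database and the business database are reachable:

```python
from vanna_llm_factory import create_vanna_instance

# Returns a Vanna_Qwen_PGVector instance for the qwen + pgvector combination.
vn = create_vanna_instance()

# Training data is written to the PgVector collections managed by PG_VectorStore.
vn.train(ddl="CREATE TABLE users (id INT PRIMARY KEY, name VARCHAR(100));")

# Questions are answered against the business database configured in APP_DB_CONFIG
# (ask() is the standard Vanna entry point inherited from VannaBase).
print(vn.ask("How many users are there?"))
```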