19 commits: 8a2d5e325f ... c793722717

Author SHA1 Message Date
  wangxq c793722717 Fixed the return-status issue and optimized the two query APIs. 1 month ago
  wangxq 1759e64281 Second release deployed to 143. 1 month ago
  wangxq 738f40f21e Finished initial testing of single-step execution and fixed several bugs. 1 month ago
  wangxq bb0b2a4687 Fixed the database connection issue in the data_pipeline singleton and added an API for uploading data files to the task directory. 1 month ago
  wangxq d6ffe2fac0 Found a database connection bug; preparing to rework the singleton pattern in the data_pipeline module. 1 month ago
  wangxq bde144ffb0 Added a conn_str parameter to data_pipeline; the app_db name can now be derived from conn_str. 1 month ago
  wangxq 9af004e1e3 Added a conn_str parameter to data_pipeline so the app_db name no longer has to be specified. 1 month ago
  wangxq 31f1504378 Tested the data_pipeline logging module; now enhancing the data_pipeline API. 1 month ago
  wangxq e606828120 Fixed the data_pipeline logging module so it is now standalone; next, the job-execution monitoring will be revised. 1 month ago
  wangxq 6313111c3c Modified the data_pipeline invocation API; ran into log-management problems and plan to refactor this module's logging. 1 month ago
  wangxq 54502af5e8 Preparing to deploy the current version to the development environment. 1 month ago
  wangxq 800b28075b Overrode the VannaBase base-class log output and fixed the issue of English "AS" aliases in generated SQL. 1 month ago
  wangxq 98b52e1065 Started testing the log management feature. 1 month ago
  wangxq 04e48d41f1 Completed the project-wide unified logging refactor. 1 month ago
  wangxq eb0f5f1fd4 Preparing to overhaul the log service across the whole project. 1 month ago
  wangxq 5642f785ff Extracted the database-decision prompt into a txt file that is generated together with the training data and supplied to the code dynamically; also fixed the issue where the LLM response could not be passed on when SQL generation failed. 1 month ago
  wangxq 0a8b2fd63b Added db_query_decision_prompt.txt to training-data generation, improving the generated results. 1 month ago
  wangxq 434784d3ec Preparing to change the generation requirements for metadata.txt: changed the table's fields, added this table's md file, and added a prompt file that lets the LLM decide whether to query the database. 1 month ago
  wangxq 847e45252b Completed the refactor of the training-data generation and loading module. 1 month ago
100 changed files with 9,426 additions and 951 deletions
  1. +30 -0  .claude/settings.local.json
  2. +161 -0  CLAUDE.md
  3. +108 -91  agent/citu_agent.py
  4. +98 -51  agent/classifier.py
  5. +20 -0  agent/tools/db_query_decision_prompt.txt
  6. +20 -0  agent/tools/db_query_decision_prompt.txt.bak
  7. +7 -3  agent/tools/general_chat.py
  8. +10 -6  agent/tools/sql_execution.py
  9. +7 -19  agent/tools/sql_generation.py
  10. +8 -4  agent/tools/summary_generation.py
  11. +13 -9  agent/tools/utils.py
  12. +297 -0  api_usage_examples.md
  13. +1 -14  app_config.py
  14. +1827 -99  citu_app.py
  15. +15 -13  common/embedding_cache_manager.py
  16. +22 -18  common/qa_feedback_manager.py
  17. +37 -35  common/redis_conversation_manager.py
  18. +12 -8  common/utils.py
  19. +11 -7  common/vanna_combinations.py
  20. +8 -4  common/vanna_instance.py
  21. +93 -0  config/logging_config.yaml
  22. +17 -12  core/embedding_function.py
  23. +41 -0  core/logging/__init__.py
  24. +214 -0  core/logging/log_manager.py
  25. +11 -7  core/vanna_llm_factory.py
  26. +17 -13  customembedding/ollama_embedding.py
  27. +120 -151  customllm/base_llm_chat.py
  28. +15 -15  customllm/deepseek_chat.py
  29. +116 -0  customllm/llm_prompts.yaml
  30. +112 -0  customllm/llm_prompts_bak.yaml
  31. +169 -0  customllm/load_prompts.py
  32. +31 -31  customllm/ollama_chat.py
  33. +8 -8  customllm/qianwen_chat.py
  34. +58 -22  custompgvector/pgvector.py
  35. +5 -5  data_pipeline/README.md
  36. +4 -4  data_pipeline/__init__.py
  37. +0 -0  data_pipeline/analyzers/__init__.py
  38. +2 -2  data_pipeline/analyzers/md_analyzer.py
  39. +62 -33  data_pipeline/analyzers/theme_extractor.py
  40. +9 -0  data_pipeline/api/__init__.py
  41. +895 -0  data_pipeline/api/simple_db_manager.py
  42. +901 -0  data_pipeline/api/simple_file_manager.py
  43. +628 -0  data_pipeline/api/simple_workflow.py
  44. +370 -0  data_pipeline/api/table_inspector_api.py
  45. +18 -3  data_pipeline/config.py
  46. +5 -0  data_pipeline/ddl_generation/__init__.py
  47. +16 -12  data_pipeline/ddl_generation/ddl_md_generator.py
  48. +27 -12  data_pipeline/ddl_generation/training_data_agent.py
  49. +29 -0  data_pipeline/dp_logging/__init__.py
  50. +156 -0  data_pipeline/dp_logging/manager.py
  51. +544 -0  data_pipeline/metadata_only_generator.py
  52. +0 -0  data_pipeline/prompts/__init__.py
  53. +0 -0  data_pipeline/prompts/business_dictionary.txt
  54. +1 -0  data_pipeline/qa_generation/__init__.py
  55. +250 -31  data_pipeline/qa_generation/qs_agent.py
  56. +5 -5  data_pipeline/qa_generation/qs_generator.py
  57. +227 -59  data_pipeline/schema_workflow.py
  58. +235 -0  data_pipeline/sql/init_tables.sql
  59. +5 -5  data_pipeline/tables.txt
  60. +77 -0  data_pipeline/task_executor.py
  61. +0 -0  data_pipeline/tools/__init__.py
  62. +29 -28  data_pipeline/tools/base.py
  63. +24 -7  data_pipeline/tools/comment_generator.py
  64. +14 -11  data_pipeline/tools/data_sampler.py
  65. +2 -2  data_pipeline/tools/database_inspector.py
  66. +46 -30  data_pipeline/tools/ddl_generator.py
  67. +3 -3  data_pipeline/tools/doc_generator.py
  68. +1 -0  data_pipeline/trainer/__init__.py
  69. +57 -26  data_pipeline/trainer/run_training.py
  70. +37 -33  data_pipeline/trainer/vanna_trainer.py
  71. +31 -0  data_pipeline/training_data/task_20250701_131627/bss_business_day_data.ddl
  72. +32 -0  data_pipeline/training_data/task_20250701_131627/bss_business_day_data_detail.md
  73. +17 -0  data_pipeline/training_data/task_20250701_131627/bss_car_day_count.ddl
  74. +18 -0  data_pipeline/training_data/task_20250701_131627/bss_car_day_count_detail.md
  75. +15 -0  data_pipeline/training_data/task_20250701_131627/bss_company.ddl
  76. +15 -0  data_pipeline/training_data/task_20250701_131627/bss_company_detail.md
  77. +16 -0  data_pipeline/training_data/task_20250701_131627/bss_section_route.ddl
  78. +7 -0  data_pipeline/training_data/task_20250701_131627/bss_section_route_area_link.ddl
  79. +7 -0  data_pipeline/training_data/task_20250701_131627/bss_section_route_area_link_detail.md
  80. +16 -0  data_pipeline/training_data/task_20250701_131627/bss_section_route_detail.md
  81. +19 -0  data_pipeline/training_data/task_20250701_131627/bss_service_area.ddl
  82. +21 -0  data_pipeline/training_data/task_20250701_131627/bss_service_area_detail.md
  83. +18 -0  data_pipeline/training_data/task_20250701_131627/bss_service_area_mapper.ddl
  84. +19 -0  data_pipeline/training_data/task_20250701_131627/bss_service_area_mapper_detail.md
  85. +10 -0  data_pipeline/training_data/task_20250701_131627/db_query_decision_prompt.txt
  86. +10 -0  data_pipeline/training_data/task_20250701_131627/filename_mapping.txt
  87. +62 -0  data_pipeline/training_data/task_20250701_131627/metadata.txt
  88. +20 -0  data_pipeline/training_data/task_20250701_131627/metadata_detail.md
  89. +190 -0  data_pipeline/training_data/task_20250701_131627/qs_highway_db_20250701_134736_pair.json
  90. +202 -0  data_pipeline/training_data/task_20250701_131627/qs_highway_db_20250701_134736_pair.json.backup
  91. +14 -0  data_pipeline/training_data/task_20250701_131627/task_config.json
  92. +88 -0  data_pipeline/training_data/task_20250701_131627/task_result.json
  93. +17 -0  data_pipeline/training_data/task_20250701_175640/bss_car_day_count.ddl
  94. +18 -0  data_pipeline/training_data/task_20250701_175640/bss_car_day_count_detail.md
  95. +14 -0  data_pipeline/training_data/task_20250701_175640/task_config.json
  96. +14 -0  data_pipeline/training_data/task_20250701_180014/task_config.json
  97. +31 -0  data_pipeline/training_data/task_20250701_184430/bss_business_day_data.ddl
  98. +32 -0  data_pipeline/training_data/task_20250701_184430/bss_business_day_data_detail.md
  99. +17 -0  data_pipeline/training_data/task_20250701_184430/bss_car_day_count.ddl
  100. +18 -0  data_pipeline/training_data/task_20250701_184430/bss_car_day_count_detail.md

+ 30 - 0
.claude/settings.local.json

@@ -0,0 +1,30 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(rg:*)",
+      "Bash(rg:*)",
+      "Bash(find:*)",
+      "Bash(mkdir:*)",
+      "Bash(cp:*)",
+      "Bash(grep:*)",
+      "Bash(python:*)",
+      "Bash(source:*)",
+      "Bash(ls:*)",
+      "Bash(.venv/Scripts/activate)",
+      "Bash(.venv/Scripts/python.exe -m data_pipeline:*)",
+      "Bash(.venv/Scripts/python.exe data_pipeline/training_data/run_training.py:*)",
+      "Bash(.venv/Scripts/python.exe:*)",
+      "Bash(mv:*)",
+      "Bash(rm:*)",
+      "Bash(.venv/bin/python:*)",
+      "Bash(./.venv/Scripts/python.exe:*)",
+      "Bash(sed:*)",
+      "Bash(\".venv/Scripts/python.exe\" -c \"import sys; sys.path.append('.'); from data_pipeline.logging import get_logger; print('独立日志系统导入成功')\")",
+      "Bash(\".venv/Scripts/python.exe\" -c \"import sys; sys.path.append('.'); from data_pipeline.schema_workflow import SchemaWorkflowOrchestrator; print('SchemaWorkflowOrchestrator导入成功')\")",
+      "Bash(\".venv/Scripts/python.exe\" -c \"import sys; sys.path.append('.'); from data_pipeline.api.simple_workflow import SimpleWorkflowExecutor; print('SimpleWorkflowExecutor导入成功')\")",
+      "Bash(\".venv/Scripts/python.exe\":*)",
+      "Bash(curl:*)"
+    ],
+    "deny": []
+  }
+}

+ 161 - 0
CLAUDE.md

@@ -0,0 +1,161 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Development Commands
+
+### Start Applications
+
+```bash
+# Start Chainlit conversational interface (primary UI)
+chainlit run chainlit_app.py
+
+# Start Flask web interface (simple API)
+python flask_app.py
+
+# Start advanced Flask application with full agent APIs
+python citu_app.py
+```
+
+### Training and Data Management
+
+```bash
+# Run training pipeline with data from data_pipeline/training_data directory
+python -m data_pipeline.trainer.run_training --data_path ./data_pipeline/training_data/
+
+# Complete automated schema workflow (DDL generation → Q&A generation → SQL validation → Training data loading)
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@host:port/database_name" \
+  --table-list tables.txt \
+  --business-context "业务系统描述" \
+  --output-dir ./data_pipeline/training_data/
+
+# Generate only schema documentation without validation
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@host:port/db_name" \
+  --table-list tables.txt \
+  --business-context "系统描述" \
+  --skip-validation
+```
+
+### Testing
+
+```bash
+# Test QA feedback and conversation management APIs
+python test_qa_apis.py
+
+# Test training data management APIs  
+python test_training_data_apis.py
+```
+
+## Core Architecture
+
+### Application Entry Points
+
+- **`chainlit_app.py`** - Modern conversational UI with streaming responses, fallback mechanisms, and comprehensive error handling
+- **`citu_app.py`** - Production Flask application with full REST APIs for agent queries, conversation management, QA feedback, and health monitoring
+- **`flask_app.py`** - Simple REST API for basic database queries
+
+### Central Configuration
+
+**`app_config.py`** is the main configuration hub controlling:
+
+```python
+# Multi-provider LLM selection
+LLM_MODEL_TYPE = "api"  # api or ollama
+API_LLM_MODEL = "qianwen"  # qianwen or deepseek
+
+# Vector database selection  
+VECTOR_DB_TYPE = "pgvector"  # chromadb or pgvector
+
+# Agent routing behavior
+QUESTION_ROUTING_MODE = "hybrid"  # hybrid, database_direct, chat_direct, llm_only
+
+# Feature toggles
+ENABLE_RESULT_SUMMARY = True
+ENABLE_CONVERSATION_CONTEXT = True
+DISPLAY_RESULT_THINKING = False
+```
+
+### LLM and Vector Database Combinations
+
+The system supports 6 LLM + vector database combinations via **`common/vanna_combinations.py`**:
+- QianWen + ChromaDB/PgVector
+- DeepSeek + ChromaDB/PgVector  
+- Ollama + ChromaDB/PgVector
+
+All combinations are created through **`core/vanna_llm_factory.py`** using the factory pattern.
+
+### Agent System Architecture
+
+**`agent/citu_agent.py`** implements a sophisticated LangGraph-based workflow:
+
+```
+Question → Classify → [DATABASE Path] → SQL Generation → SQL Validation → SQL Execution → Summary
+                   → [CHAT Path] → General Chat
+```
+
+**Routing Modes:**
+- `hybrid` (default) - Intelligent classification between database and chat
+- `database_direct` - Skip classification, direct SQL generation
+- `chat_direct` - Skip classification, direct chat response
+- `llm_only` - LLM-based classification only
+
+### Database Integration
+
+**Three-Database Architecture:**
+1. **Business Database** (`APP_DB_CONFIG`) - Source data for queries
+2. **Vector Database** (`PGVECTOR_CONFIG`) - Training data and embeddings
+3. **Redis Cache** (`REDIS_*`) - Conversations, QA results, embedding cache
+
+### Training Data Pipeline
+
+**Training data is managed in `data_pipeline/training_data/` directory.**
+
+**File Format Mapping:**
+- `.ddl` files → `train_ddl_statements()`
+- `.md/.markdown` → `train_documentation_blocks()`
+- `_pair.json/_pairs.json` → `train_json_question_sql_pairs()`
+- `_pair.sql/_pairs.sql` → `train_formatted_question_sql_pairs()`
+- `.sql` (other) → `train_sql_examples()`
+
+### Data Pipeline System
+
+**`data_pipeline/`** provides automated database reverse engineering:
+
+1. **Database Inspector** - Automatic schema discovery
+2. **DDL Generator** - PostgreSQL DDL with intelligent comments
+3. **Documentation Generator** - Detailed markdown documentation  
+4. **Q&A Generator** (`qa_generation/`) - LLM-generated question-SQL pairs
+5. **SQL Validator** (`validators/`) - EXPLAIN-based validation with auto-repair
+6. **Training Pipeline** (`trainer/`) - Vanna.ai training data ingestion
+
+## Key Patterns
+
+### Singleton Pattern
+**`common/vanna_instance.py`** implements thread-safe singleton for global Vanna instance management.
+
+### Caching Strategy
+Multi-layer caching via **`common/`**:
+- **`session_aware_cache.py`** - Web session-aware caching
+- **`embedding_cache_manager.py`** - High-performance embedding caching
+- **`redis_conversation_manager.py`** - Conversation lifecycle management
+
+### Error Handling
+Comprehensive fallback mechanisms throughout the stack:
+- SQL generation failures → General chat responses
+- LLM timeouts → Cached responses
+- Database connection issues → Health check endpoints
+
+### Configuration Precedence
+1. Environment variables (`.env` file)
+2. **`app_config.py`** defaults
+3. Module-specific configs (e.g., **`data_pipeline/config.py`**)
+
+## Important Notes
+
+- The system requires PostgreSQL for business data and optionally PgVector for vector storage
+- Redis is essential for conversation management and caching
+- Training data generation is resource-intensive and should be run with appropriate database permissions
+- The agent system supports both streaming and non-streaming responses based on LLM provider capabilities
+- Always test configuration changes with health check endpoints before production deployment

+ 108 - 91
agent/citu_agent.py

@@ -4,50 +4,54 @@ from langgraph.graph import StateGraph, END
 from langchain.agents import AgentExecutor, create_openai_tools_agent
 from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_core.messages import SystemMessage, HumanMessage
+from core.logging import get_agent_logger
 
 from agent.state import AgentState
 from agent.classifier import QuestionClassifier
 from agent.tools import TOOLS, generate_sql, execute_sql, generate_summary, general_chat
-from agent.utils import get_compatible_llm
+from agent.tools.utils import get_compatible_llm
 from app_config import ENABLE_RESULT_SUMMARY
 
 class CituLangGraphAgent:
     """Citu LangGraph智能助手主类 - 使用@tool装饰器 + Agent工具调用"""
     
     def __init__(self):
+        # 初始化日志
+        self.logger = get_agent_logger("CituAgent")
+        
         # 加载配置
         try:
             from agent.config import get_current_config, get_nested_config
             self.config = get_current_config()
-            print("[CITU_AGENT] 加载Agent配置完成")
+            self.logger.info("加载Agent配置完成")
         except ImportError:
             self.config = {}
-            print("[CITU_AGENT] 配置文件不可用,使用默认配置")
+            self.logger.warning("配置文件不可用,使用默认配置")
         
         self.classifier = QuestionClassifier()
         self.tools = TOOLS
         self.llm = get_compatible_llm()
         
         # 注意:现在使用直接工具调用模式,不再需要预创建Agent执行器
-        print("[CITU_AGENT] 使用直接工具调用模式")
+        self.logger.info("使用直接工具调用模式")
         
         # 不在构造时创建workflow,改为动态创建以支持路由模式参数
         # self.workflow = self._create_workflow()
-        print("[CITU_AGENT] LangGraph Agent with Direct Tools初始化完成")
+        self.logger.info("LangGraph Agent with Direct Tools初始化完成")
     
     def _create_workflow(self, routing_mode: str = None) -> StateGraph:
         """根据路由模式创建不同的工作流"""
         # 确定使用的路由模式
         if routing_mode:
             QUESTION_ROUTING_MODE = routing_mode
-            print(f"[CITU_AGENT] 创建工作流,使用传入的路由模式: {QUESTION_ROUTING_MODE}")
+            self.logger.info(f"创建工作流,使用传入的路由模式: {QUESTION_ROUTING_MODE}")
         else:
             try:
                 from app_config import QUESTION_ROUTING_MODE
-                print(f"[CITU_AGENT] 创建工作流,使用配置文件路由模式: {QUESTION_ROUTING_MODE}")
+                self.logger.info(f"创建工作流,使用配置文件路由模式: {QUESTION_ROUTING_MODE}")
             except ImportError:
                 QUESTION_ROUTING_MODE = "hybrid"
-                print(f"[CITU_AGENT] 配置导入失败,使用默认路由模式: {QUESTION_ROUTING_MODE}")
+                self.logger.warning(f"配置导入失败,使用默认路由模式: {QUESTION_ROUTING_MODE}")
         
         workflow = StateGraph(AgentState)
         
@@ -137,12 +141,12 @@ class CituLangGraphAgent:
             state["current_step"] = "direct_database_init"
             state["execution_path"].append("init_direct_database")
             
-            print(f"[DIRECT_DATABASE] 直接数据库模式初始化完成")
+            self.logger.info("直接数据库模式初始化完成")
             
             return state
             
         except Exception as e:
-            print(f"[ERROR] 直接数据库模式初始化异常: {str(e)}")
+            self.logger.error(f"直接数据库模式初始化异常: {str(e)}")
             state["error"] = f"直接数据库模式初始化失败: {str(e)}"
             state["error_code"] = 500
             state["execution_path"].append("init_direct_database_error")
@@ -163,12 +167,12 @@ class CituLangGraphAgent:
             state["current_step"] = "direct_chat_init"
             state["execution_path"].append("init_direct_chat")
             
-            print(f"[DIRECT_CHAT] 直接聊天模式初始化完成")
+            self.logger.info("直接聊天模式初始化完成")
             
             return state
             
         except Exception as e:
-            print(f"[ERROR] 直接聊天模式初始化异常: {str(e)}")
+            self.logger.error(f"直接聊天模式初始化异常: {str(e)}")
             state["error"] = f"直接聊天模式初始化失败: {str(e)}"
             state["error_code"] = 500
             state["execution_path"].append("init_direct_chat_error")
@@ -180,12 +184,12 @@ class CituLangGraphAgent:
             # 从state中获取路由模式,而不是从配置文件读取
             routing_mode = state.get("routing_mode", "hybrid")
             
-            print(f"[CLASSIFY_NODE] 开始分类问题: {state['question']}")
+            self.logger.info(f"开始分类问题: {state['question']}")
             
             # 获取上下文类型(如果有的话)
             context_type = state.get("context_type")
             if context_type:
-                print(f"[CLASSIFY_NODE] 检测到上下文类型: {context_type}")
+                self.logger.info(f"检测到上下文类型: {context_type}")
             
             # 使用渐进式分类策略,传递路由模式
             classification_result = self.classifier.classify(state["question"], context_type, routing_mode)
@@ -199,13 +203,13 @@ class CituLangGraphAgent:
             state["current_step"] = "classified"
             state["execution_path"].append("classify")
             
-            print(f"[CLASSIFY_NODE] 分类结果: {classification_result.question_type}, 置信度: {classification_result.confidence}")
-            print(f"[CLASSIFY_NODE] 路由模式: {routing_mode}, 分类方法: {classification_result.method}")
+            self.logger.info(f"分类结果: {classification_result.question_type}, 置信度: {classification_result.confidence}")
+            self.logger.info(f"路由模式: {routing_mode}, 分类方法: {classification_result.method}")
             
             return state
             
         except Exception as e:
-            print(f"[ERROR] 问题分类异常: {str(e)}")
+            self.logger.error(f"问题分类异常: {str(e)}")
             state["error"] = f"问题分类失败: {str(e)}"
             state["error_code"] = 500
             state["execution_path"].append("classify_error")
@@ -214,12 +218,12 @@ class CituLangGraphAgent:
     async def _agent_sql_generation_node(self, state: AgentState) -> AgentState:
         """SQL生成验证节点 - 负责生成SQL、验证SQL和决定路由"""
         try:
-            print(f"[SQL_GENERATION] 开始处理SQL生成和验证: {state['question']}")
+            self.logger.info(f"开始处理SQL生成和验证: {state['question']}")
             
             question = state["question"]
             
             # 步骤1:生成SQL
-            print(f"[SQL_GENERATION] 步骤1:生成SQL")
+            self.logger.info("步骤1:生成SQL")
             sql_result = generate_sql.invoke({"question": question, "allow_llm_to_see_data": True})
             
             if not sql_result.get("success"):
@@ -227,7 +231,8 @@ class CituLangGraphAgent:
                 error_message = sql_result.get("error", "")
                 error_type = sql_result.get("error_type", "")
                 
-                print(f"[SQL_GENERATION] SQL生成失败: {error_message}")
+                #print(f"[SQL_GENERATION] SQL生成失败: {error_message}")
+                self.logger.debug(f"error_type = '{error_type}'")
                 
                 # 根据错误类型生成用户提示
                 if "no relevant tables" in error_message.lower() or "table not found" in error_message.lower():
@@ -236,9 +241,15 @@ class CituLangGraphAgent:
                 elif "ambiguous" in error_message.lower() or "more information" in error_message.lower():
                     user_prompt = "您的问题需要更多信息才能准确查询,请提供更详细的描述。"
                     failure_reason = "ambiguous_question"
-                elif error_type == "llm_explanation":
-                    user_prompt = error_message + " 请尝试重新描述您的问题或询问其他内容。"
-                    failure_reason = "llm_explanation"
+                elif error_type == "llm_explanation" or error_type == "generation_failed_with_explanation":
+                    # 对于解释性文本,直接设置为聊天响应
+                    state["chat_response"] = error_message + " 请尝试提问其它问题。"
+                    state["sql_generation_success"] = False
+                    state["validation_error_type"] = "llm_explanation"
+                    state["current_step"] = "sql_generation_completed"
+                    state["execution_path"].append("agent_sql_generation")
+                    self.logger.info(f"返回LLM解释性答案: {error_message}")
+                    return state
                 else:
                     user_prompt = "无法生成有效的SQL查询,请尝试重新描述您的问题。"
                     failure_reason = "unknown_generation_failure"
@@ -250,16 +261,15 @@ class CituLangGraphAgent:
                 state["current_step"] = "sql_generation_failed"
                 state["execution_path"].append("agent_sql_generation_failed")
                 
-                print(f"[SQL_GENERATION] 生成失败: {failure_reason} - {user_prompt}")
+                self.logger.warning(f"生成失败: {failure_reason} - {user_prompt}")
                 return state
             
             sql = sql_result.get("sql")
             state["sql"] = sql
-            print(f"[SQL_GENERATION] SQL生成成功: {sql}")
             
             # 步骤1.5:检查是否为解释性响应而非SQL
             error_type = sql_result.get("error_type")
-            if error_type == "llm_explanation":
+            if error_type == "llm_explanation" or error_type == "generation_failed_with_explanation":
                 # LLM返回了解释性文本,直接作为最终答案
                 explanation = sql_result.get("error", "")
                 state["chat_response"] = explanation + " 请尝试提问其它问题。"
@@ -267,11 +277,18 @@ class CituLangGraphAgent:
                 state["validation_error_type"] = "llm_explanation"
                 state["current_step"] = "sql_generation_completed"
                 state["execution_path"].append("agent_sql_generation")
-                print(f"[SQL_GENERATION] 返回LLM解释性答案: {explanation}")
+                self.logger.info(f"返回LLM解释性答案: {explanation}")
+                return state
+            
+            if sql:
+                self.logger.info(f"SQL生成成功: {sql}")
+            else:
+                self.logger.warning("SQL为空,但不是解释性响应")
+                # 这种情况应该很少见,但为了安全起见保留原有的错误处理
                 return state
             
             # 额外验证:检查SQL格式(防止工具误判)
-            from agent.utils import _is_valid_sql_format
+            from agent.tools.utils import _is_valid_sql_format
             if not _is_valid_sql_format(sql):
                 # 内容看起来不是SQL,当作解释性响应处理
                 state["chat_response"] = sql + " 请尝试提问其它问题。"
@@ -279,12 +296,12 @@ class CituLangGraphAgent:
                 state["validation_error_type"] = "invalid_sql_format"
                 state["current_step"] = "sql_generation_completed"  
                 state["execution_path"].append("agent_sql_generation")
-                print(f"[SQL_GENERATION] 内容不是有效SQL,当作解释返回: {sql}")
+                self.logger.info(f"内容不是有效SQL,当作解释返回: {sql}")
                 return state
             
             # 步骤2:SQL验证(如果启用)
             if self._is_sql_validation_enabled():
-                print(f"[SQL_GENERATION] 步骤2:验证SQL")
+                self.logger.info("步骤2:验证SQL")
                 validation_result = await self._validate_sql_with_custom_priority(sql)
                 
                 if not validation_result.get("valid"):
@@ -293,7 +310,7 @@ class CituLangGraphAgent:
                     error_message = validation_result.get("error_message")
                     can_repair = validation_result.get("can_repair", False)
                     
-                    print(f"[SQL_GENERATION] SQL验证失败: {error_type} - {error_message}")
+                    self.logger.warning(f"SQL验证失败: {error_type} - {error_message}")
                     
                     if error_type == "forbidden_keywords":
                         # 禁止词错误,直接失败,不尝试修复
@@ -303,12 +320,12 @@ class CituLangGraphAgent:
                         state["validation_error_type"] = "forbidden_keywords"
                         state["current_step"] = "sql_validation_failed"
                         state["execution_path"].append("forbidden_keywords_failed")
-                        print(f"[SQL_GENERATION] 禁止词验证失败,直接结束")
+                        self.logger.warning("禁止词验证失败,直接结束")
                         return state
                     
                     elif error_type == "syntax_error" and can_repair and self._is_auto_repair_enabled():
                         # 语法错误,尝试修复(仅一次)
-                        print(f"[SQL_GENERATION] 尝试修复SQL语法错误(仅一次): {error_message}")
+                        self.logger.info(f"尝试修复SQL语法错误(仅一次): {error_message}")
                         state["sql_repair_attempted"] = True
                         
                         repair_result = await self._attempt_sql_repair_once(sql, error_message)
@@ -322,12 +339,12 @@ class CituLangGraphAgent:
                             state["sql_repair_success"] = True
                             state["current_step"] = "sql_generation_completed"
                             state["execution_path"].append("sql_repair_success")
-                            print(f"[SQL_GENERATION] SQL修复成功: {repaired_sql}")
+                            self.logger.info(f"SQL修复成功: {repaired_sql}")
                             return state
                         else:
                             # 修复失败,直接结束
                             repair_error = repair_result.get("error", "修复失败")
-                            print(f"[SQL_GENERATION] SQL修复失败: {repair_error}")
+                            self.logger.warning(f"SQL修复失败: {repair_error}")
                             state["sql_generation_success"] = False
                             state["sql_validation_success"] = False
                             state["sql_repair_success"] = False
@@ -344,13 +361,13 @@ class CituLangGraphAgent:
                         state["validation_error_type"] = error_type
                         state["current_step"] = "sql_validation_failed"
                         state["execution_path"].append("sql_validation_failed")
-                        print(f"[SQL_GENERATION] SQL验证失败,不尝试修复")
+                        self.logger.warning("SQL验证失败,不尝试修复")
                         return state
                 else:
-                    print(f"[SQL_GENERATION] SQL验证通过")
+                    self.logger.info("SQL验证通过")
                     state["sql_validation_success"] = True
             else:
-                print(f"[SQL_GENERATION] 跳过SQL验证(未启用)")
+                self.logger.info("跳过SQL验证(未启用)")
                 state["sql_validation_success"] = True
             
             # 生成和验证都成功
@@ -358,13 +375,13 @@ class CituLangGraphAgent:
             state["current_step"] = "sql_generation_completed"
             state["execution_path"].append("agent_sql_generation")
             
-            print(f"[SQL_GENERATION] SQL生成验证完成,准备执行")
+            self.logger.info("SQL生成验证完成,准备执行")
             return state
             
         except Exception as e:
-            print(f"[ERROR] SQL生成验证节点异常: {str(e)}")
+            self.logger.error(f"SQL生成验证节点异常: {str(e)}")
             import traceback
-            print(f"[ERROR] 详细错误信息: {traceback.format_exc()}")
+            self.logger.error(f"详细错误信息: {traceback.format_exc()}")
             state["sql_generation_success"] = False
             state["sql_validation_success"] = False
             state["user_prompt"] = f"SQL生成验证异常: {str(e)}"
@@ -376,13 +393,13 @@ class CituLangGraphAgent:
     def _agent_sql_execution_node(self, state: AgentState) -> AgentState:
         """SQL执行节点 - 负责执行已验证的SQL和生成摘要"""
         try:
-            print(f"[SQL_EXECUTION] 开始执行SQL: {state.get('sql', 'N/A')}")
+            self.logger.info(f"开始执行SQL: {state.get('sql', 'N/A')}")
             
             sql = state.get("sql")
             question = state["question"]
             
             if not sql:
-                print(f"[SQL_EXECUTION] 没有可执行的SQL")
+                self.logger.warning("没有可执行的SQL")
                 state["error"] = "没有可执行的SQL语句"
                 state["error_code"] = 500
                 state["current_step"] = "sql_execution_error"
@@ -390,11 +407,11 @@ class CituLangGraphAgent:
                 return state
             
             # 步骤1:执行SQL
-            print(f"[SQL_EXECUTION] 步骤1:执行SQL")
+            self.logger.info("步骤1:执行SQL")
             execute_result = execute_sql.invoke({"sql": sql})
             
             if not execute_result.get("success"):
-                print(f"[SQL_EXECUTION] SQL执行失败: {execute_result.get('error')}")
+                self.logger.error(f"SQL执行失败: {execute_result.get('error')}")
                 state["error"] = execute_result.get("error", "SQL执行失败")
                 state["error_code"] = 500
                 state["current_step"] = "sql_execution_error"
@@ -403,15 +420,15 @@ class CituLangGraphAgent:
             
             query_result = execute_result.get("data_result")
             state["query_result"] = query_result
-            print(f"[SQL_EXECUTION] SQL执行成功,返回 {query_result.get('row_count', 0)} 行数据")
+            self.logger.info(f"SQL执行成功,返回 {query_result.get('row_count', 0)} 行数据")
             
             # 步骤2:生成摘要(根据配置和数据情况)
             if ENABLE_RESULT_SUMMARY and query_result.get('row_count', 0) > 0:
-                print(f"[SQL_EXECUTION] 步骤2:生成摘要")
+                self.logger.info("步骤2:生成摘要")
                 
                 # 重要:提取原始问题用于摘要生成,避免历史记录循环嵌套
                 original_question = self._extract_original_question(question)
-                print(f"[SQL_EXECUTION] 原始问题: {original_question}")
+                self.logger.debug(f"原始问题: {original_question}")
                 
                 summary_result = generate_summary.invoke({
                     "question": original_question,  # 使用原始问题而不是enhanced_question
@@ -420,26 +437,26 @@ class CituLangGraphAgent:
                 })
                 
                 if not summary_result.get("success"):
-                    print(f"[SQL_EXECUTION] 摘要生成失败: {summary_result.get('message')}")
+                    self.logger.warning(f"摘要生成失败: {summary_result.get('message')}")
                     # 摘要生成失败不是致命错误,使用默认摘要
                     state["summary"] = f"查询执行完成,共返回 {query_result.get('row_count', 0)} 条记录。"
                 else:
                     state["summary"] = summary_result.get("summary")
-                    print(f"[SQL_EXECUTION] 摘要生成成功")
+                    self.logger.info("摘要生成成功")
             else:
-                print(f"[SQL_EXECUTION] 跳过摘要生成(ENABLE_RESULT_SUMMARY={ENABLE_RESULT_SUMMARY},数据行数={query_result.get('row_count', 0)})")
+                self.logger.info(f"跳过摘要生成(ENABLE_RESULT_SUMMARY={ENABLE_RESULT_SUMMARY},数据行数={query_result.get('row_count', 0)})")
                 # 不生成摘要时,不设置summary字段,让格式化响应节点决定如何处理
             
             state["current_step"] = "sql_execution_completed"
             state["execution_path"].append("agent_sql_execution")
             
-            print(f"[SQL_EXECUTION] SQL执行完成")
+            self.logger.info("SQL执行完成")
             return state
             
         except Exception as e:
-            print(f"[ERROR] SQL执行节点异常: {str(e)}")
+            self.logger.error(f"SQL执行节点异常: {str(e)}")
             import traceback
-            print(f"[ERROR] 详细错误信息: {traceback.format_exc()}")
+            self.logger.error(f"详细错误信息: {traceback.format_exc()}")
             state["error"] = f"SQL执行失败: {str(e)}"
             state["error_code"] = 500
             state["current_step"] = "sql_execution_error"
@@ -454,17 +471,17 @@ class CituLangGraphAgent:
         保留此方法仅为向后兼容,新的工作流使用拆分后的节点
         """
         try:
-            print(f"[DATABASE_AGENT] ⚠️  使用已废弃的database节点,建议使用新的拆分节点")
-            print(f"[DATABASE_AGENT] 开始处理数据库查询: {state['question']}")
+            self.logger.warning("使用已废弃的database节点,建议使用新的拆分节点")
+            self.logger.info(f"开始处理数据库查询: {state['question']}")
             
             question = state["question"]
             
             # 步骤1:生成SQL
-            print(f"[DATABASE_AGENT] 步骤1:生成SQL")
+            self.logger.info("步骤1:生成SQL")
             sql_result = generate_sql.invoke({"question": question, "allow_llm_to_see_data": True})
             
             if not sql_result.get("success"):
-                print(f"[DATABASE_AGENT] SQL生成失败: {sql_result.get('error')}")
+                self.logger.error(f"SQL生成失败: {sql_result.get('error')}")
                 state["error"] = sql_result.get("error", "SQL生成失败")
                 state["error_code"] = 500
                 state["current_step"] = "database_error"
@@ -473,7 +490,7 @@ class CituLangGraphAgent:
             
             sql = sql_result.get("sql")
             state["sql"] = sql
-            print(f"[DATABASE_AGENT] SQL生成成功: {sql}")
+            self.logger.info(f"SQL生成成功: {sql}")
             
             # 步骤1.5:检查是否为解释性响应而非SQL
             error_type = sql_result.get("error_type")
@@ -483,25 +500,25 @@ class CituLangGraphAgent:
                 state["chat_response"] = explanation + " 请尝试提问其它问题。"
                 state["current_step"] = "database_completed"
                 state["execution_path"].append("agent_database")
-                print(f"[DATABASE_AGENT] 返回LLM解释性答案: {explanation}")
+                self.logger.info(f"返回LLM解释性答案: {explanation}")
                 return state
             
             # 额外验证:检查SQL格式(防止工具误判)
-            from agent.utils import _is_valid_sql_format
+            from agent.tools.utils import _is_valid_sql_format
             if not _is_valid_sql_format(sql):
                 # 内容看起来不是SQL,当作解释性响应处理
                 state["chat_response"] = sql + " 请尝试提问其它问题。"
                 state["current_step"] = "database_completed"  
                 state["execution_path"].append("agent_database")
-                print(f"[DATABASE_AGENT] 内容不是有效SQL,当作解释返回: {sql}")
+                self.logger.info(f"内容不是有效SQL,当作解释返回: {sql}")
                 return state
             
             # 步骤2:执行SQL
-            print(f"[DATABASE_AGENT] 步骤2:执行SQL")
+            self.logger.info("步骤2:执行SQL")
             execute_result = execute_sql.invoke({"sql": sql})
             
             if not execute_result.get("success"):
-                print(f"[DATABASE_AGENT] SQL执行失败: {execute_result.get('error')}")
+                self.logger.error(f"SQL执行失败: {execute_result.get('error')}")
                 state["error"] = execute_result.get("error", "SQL执行失败")
                 state["error_code"] = 500
                 state["current_step"] = "database_error"
@@ -510,15 +527,15 @@ class CituLangGraphAgent:
             
             query_result = execute_result.get("data_result")
             state["query_result"] = query_result
-            print(f"[DATABASE_AGENT] SQL执行成功,返回 {query_result.get('row_count', 0)} 行数据")
+            self.logger.info(f"SQL执行成功,返回 {query_result.get('row_count', 0)} 行数据")
             
             # 步骤3:生成摘要(可通过配置控制,仅在有数据时生成)
             if ENABLE_RESULT_SUMMARY and query_result.get('row_count', 0) > 0:
-                print(f"[DATABASE_AGENT] 步骤3:生成摘要")
+                self.logger.info("步骤3:生成摘要")
                 
                 # 重要:提取原始问题用于摘要生成,避免历史记录循环嵌套
                 original_question = self._extract_original_question(question)
-                print(f"[DATABASE_AGENT] 原始问题: {original_question}")
+                self.logger.debug(f"原始问题: {original_question}")
                 
                 summary_result = generate_summary.invoke({
                     "question": original_question,  # 使用原始问题而不是enhanced_question
@@ -527,26 +544,26 @@ class CituLangGraphAgent:
                 })
                 
                 if not summary_result.get("success"):
-                    print(f"[DATABASE_AGENT] 摘要生成失败: {summary_result.get('message')}")
+                    self.logger.warning(f"摘要生成失败: {summary_result.get('message')}")
                     # 摘要生成失败不是致命错误,使用默认摘要
                     state["summary"] = f"查询执行完成,共返回 {query_result.get('row_count', 0)} 条记录。"
                 else:
                     state["summary"] = summary_result.get("summary")
-                    print(f"[DATABASE_AGENT] 摘要生成成功")
+                    self.logger.info("摘要生成成功")
             else:
-                print(f"[DATABASE_AGENT] 跳过摘要生成(ENABLE_RESULT_SUMMARY={ENABLE_RESULT_SUMMARY},数据行数={query_result.get('row_count', 0)})")
+                self.logger.info(f"跳过摘要生成(ENABLE_RESULT_SUMMARY={ENABLE_RESULT_SUMMARY},数据行数={query_result.get('row_count', 0)})")
                 # 不生成摘要时,不设置summary字段,让格式化响应节点决定如何处理
             
             state["current_step"] = "database_completed"
             state["execution_path"].append("agent_database")
             
-            print(f"[DATABASE_AGENT] 数据库查询完成")
+            self.logger.info("数据库查询完成")
             return state
             
         except Exception as e:
-            print(f"[ERROR] 数据库Agent异常: {str(e)}")
+            self.logger.error(f"数据库Agent异常: {str(e)}")
             import traceback
-            print(f"[ERROR] 详细错误信息: {traceback.format_exc()}")
+            self.logger.error(f"详细错误信息: {traceback.format_exc()}")
             state["error"] = f"数据库查询失败: {str(e)}"
             state["error_code"] = 500
             state["current_step"] = "database_error"
@@ -556,7 +573,7 @@ class CituLangGraphAgent:
     def _agent_chat_node(self, state: AgentState) -> AgentState:
         """聊天Agent节点 - 直接工具调用模式"""
         try:
-            print(f"[CHAT_AGENT] 开始处理聊天: {state['question']}")
+            self.logger.info(f"开始处理聊天: {state['question']}")
             
             question = state["question"]
             
@@ -571,7 +588,7 @@ class CituLangGraphAgent:
                 pass
             
             # 直接调用general_chat工具
-            print(f"[CHAT_AGENT] 调用general_chat工具")
+            self.logger.info("调用general_chat工具")
             chat_result = general_chat.invoke({
                 "question": question,
                 "context": context
@@ -579,22 +596,22 @@ class CituLangGraphAgent:
             
             if chat_result.get("success"):
                 state["chat_response"] = chat_result.get("response", "")
-                print(f"[CHAT_AGENT] 聊天处理成功")
+                self.logger.info("聊天处理成功")
             else:
                 # 处理失败,使用备用响应
                 state["chat_response"] = chat_result.get("response", "抱歉,我暂时无法处理您的问题。请稍后再试。")
-                print(f"[CHAT_AGENT] 聊天处理失败,使用备用响应: {chat_result.get('error')}")
+                self.logger.warning(f"聊天处理失败,使用备用响应: {chat_result.get('error')}")
             
             state["current_step"] = "chat_completed"
             state["execution_path"].append("agent_chat")
             
-            print(f"[CHAT_AGENT] 聊天处理完成")
+            self.logger.info("聊天处理完成")
             return state
             
         except Exception as e:
-            print(f"[ERROR] 聊天Agent异常: {str(e)}")
+            self.logger.error(f"聊天Agent异常: {str(e)}")
             import traceback
-            print(f"[ERROR] 详细错误信息: {traceback.format_exc()}")
+            self.logger.error(f"详细错误信息: {traceback.format_exc()}")
             state["chat_response"] = "抱歉,我暂时无法处理您的问题。请稍后再试,或者尝试询问数据相关的问题。"
             state["current_step"] = "chat_error"
             state["execution_path"].append("agent_chat_error")
@@ -603,7 +620,7 @@ class CituLangGraphAgent:
     def _format_response_node(self, state: AgentState) -> AgentState:
         """格式化最终响应节点"""
         try:
-            print(f"[FORMAT_NODE] 开始格式化响应,问题类型: {state['question_type']}")
+            self.logger.info(f"开始格式化响应,问题类型: {state['question_type']}")
             
             state["current_step"] = "completed"
             state["execution_path"].append("format_response")
@@ -724,11 +741,11 @@ class CituLangGraphAgent:
                     }
                 }
             
-            print(f"[FORMAT_NODE] 响应格式化完成")
+            self.logger.info("响应格式化完成")
             return state
             
         except Exception as e:
-            print(f"[ERROR] 响应格式化异常: {str(e)}")
+            self.logger.error(f"响应格式化异常: {str(e)}")
             state["final_response"] = {
                 "success": False,
                 "error": f"响应格式化异常: {str(e)}",
@@ -747,7 +764,7 @@ class CituLangGraphAgent:
         """
         sql_generation_success = state.get("sql_generation_success", False)
         
-        print(f"[ROUTE] SQL生成路由: success={sql_generation_success}")
+        self.logger.debug(f"SQL生成路由: success={sql_generation_success}")
         
         if sql_generation_success:
             return "continue_execution"  # 路由到SQL执行节点
@@ -767,7 +784,7 @@ class CituLangGraphAgent:
         question_type = state["question_type"]
         confidence = state["classification_confidence"]
         
-        print(f"[ROUTE] 分类路由: {question_type}, 置信度: {confidence} (完全信任分类器决策)")
+        self.logger.debug(f"分类路由: {question_type}, 置信度: {confidence} (完全信任分类器决策)")
         
         if question_type == "DATABASE":
             return "DATABASE"
@@ -790,11 +807,11 @@ class CituLangGraphAgent:
             Dict包含完整的处理结果
         """
         try:
-            print(f"[CITU_AGENT] 开始处理问题: {question}")
+            self.logger.info(f"开始处理问题: {question}")
             if context_type:
-                print(f"[CITU_AGENT] 上下文类型: {context_type}")
+                self.logger.info(f"上下文类型: {context_type}")
             if routing_mode:
-                print(f"[CITU_AGENT] 使用指定路由模式: {routing_mode}")
+                self.logger.info(f"使用指定路由模式: {routing_mode}")
             
             # 动态创建workflow(基于路由模式)
             workflow = self._create_workflow(routing_mode)
@@ -813,12 +830,12 @@ class CituLangGraphAgent:
             # 提取最终结果
             result = final_state["final_response"]
             
-            print(f"[CITU_AGENT] 问题处理完成: {result.get('success', False)}")
+            self.logger.info(f"问题处理完成: {result.get('success', False)}")
             
             return result
             
         except Exception as e:
-            print(f"[ERROR] Agent执行异常: {str(e)}")
+            self.logger.error(f"Agent执行异常: {str(e)}")
             return {
                 "success": False,
                 "error": f"Agent系统异常: {str(e)}",
@@ -1114,7 +1131,7 @@ class CituLangGraphAgent:
             return question.strip()
             
         except Exception as e:
-            print(f"[WARNING] 提取原始问题失败: {str(e)}")
+            self.logger.warning(f"提取原始问题失败: {str(e)}")
             return question.strip()
 
     async def health_check(self) -> Dict[str, Any]:
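
The hunks above replace ad-hoc print() calls with a logger obtained from core.logging.get_agent_logger. The actual helper lives in core/logging/log_manager.py (added in this commit set but not shown in this excerpt); the following is only a hypothetical sketch of what such a wrapper could look like, assuming plain stdlib logging.

```python
# Hypothetical sketch of a get_agent_logger helper; the real implementation in
# core/logging/log_manager.py (configured by config/logging_config.yaml) is not shown here.
import logging

def get_agent_logger(name: str) -> logging.Logger:
    """Return a namespaced logger such as 'agent.CituAgent'."""
    logger = logging.getLogger(f"agent.{name}")
    if not logger.handlers:  # avoid attaching duplicate handlers on repeated calls
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger
```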

+ 98 - 51
agent/classifier.py

@@ -2,6 +2,7 @@
 import re
 from typing import Dict, Any, List, Optional
 from dataclasses import dataclass
+from core.logging import get_agent_logger
 
 @dataclass
 class ClassificationResult:
@@ -16,6 +17,9 @@ class QuestionClassifier:
     """
     
     def __init__(self):
+        # 初始化日志
+        self.logger = get_agent_logger("Classifier")
+        
         # 从配置文件加载阈值参数
         try:
             from agent.config import get_current_config, get_nested_config
@@ -27,7 +31,8 @@ class QuestionClassifier:
             self.confidence_increment = get_nested_config(config, "classification.confidence_increment", 0.08)
             self.llm_fallback_confidence = get_nested_config(config, "classification.llm_fallback_confidence", 0.5)
             self.uncertain_confidence = get_nested_config(config, "classification.uncertain_confidence", 0.2)
-            print("[CLASSIFIER] 从配置文件加载分类器参数完成")
+            self.medium_confidence_threshold = get_nested_config(config, "classification.medium_confidence_threshold", 0.6)
+            self.logger.info("从配置文件加载分类器参数完成")
         except ImportError:
             self.high_confidence_threshold = 0.7
             self.low_confidence_threshold = 0.4
@@ -36,7 +41,8 @@ class QuestionClassifier:
             self.confidence_increment = 0.08
             self.llm_fallback_confidence = 0.5
             self.uncertain_confidence = 0.2
-            print("[CLASSIFIER] 配置文件不可用,使用默认分类器参数")
+            self.medium_confidence_threshold = 0.6
+            self.logger.warning("配置文件不可用,使用默认分类器参数")
         
         # 基于高速公路服务区业务的精准关键词
         self.strong_business_keywords = {
@@ -66,8 +72,8 @@ class QuestionClassifier:
                 "高速线路", "公路线路"
             ],
             "系统查询指示词": [
-                "当前系统", "当前数据库", "当前数据",
-                "本系统", "系统", "数据库中", "数据中",
+                "当前系统", "当前数据库", "当前数据", "数据库",
+                "本系统", "系统", "数据库中", "数据中",
                 "现有数据", "已有数据", "存储的数据",
                 "平台数据", "我们的数据库", "这个系统"
             ]
@@ -80,7 +86,7 @@ class QuestionClassifier:
             "趋势", "占比", "百分比", "比例",
             "最大", "最小", "最高", "最低", "平均",
             "总计", "合计", "累计", "求和", "求平均",
-            "生成", "导出", "显示", "列出"
+            "生成", "导出", "显示", "列出", "共有"
         ]
         
         # 非业务实体词(包含则倾向CHAT)
@@ -94,7 +100,7 @@ class QuestionClassifier:
             "AI", "神经网络", "模型训练", "数据挖掘",
             
             # 身份询问
-            "你是谁", "你是什么", "你叫什么", "你的名字", 
+            "你是谁", "你是什么", "你叫什么", "你的名字", "你是什么AI",
             "什么模型", "大模型", "AI助手", "助手", "机器人",
             
             # 天气相关
@@ -104,20 +110,34 @@ class QuestionClassifier:
             # 其他生活常识
             "怎么做饭", "如何减肥", "健康", "医疗", "病症",
             "历史", "地理", "文学", "电影", "音乐", "体育",
-            "娱乐", "游戏", "小说", "新闻", "政治"
+            "娱乐", "游戏", "小说", "新闻", "政治", "战争",
+            "足球", "NBA", "篮球", "乒乓球", "冠军", "夺冠",
+            "高考",
+
+            # 旅游出行
+            "旅游","景点","门票","酒店","机票","航班","高铁","的士",
+            #情绪
+            "伤心","开心","无聊","生气","孤独","累了","烦恼","心情","难过","抑郁",
+            #商业
+            "股票","基金","理财","投资","经济","通货膨胀","上市",
+            #哲学
+            "人生意义","价值观","道德","信仰","宗教","爱情",
+            #地理
+            "全球","全国","亚洲","发展中","欧洲","美洲","东亚","东南亚","南美","非洲","大洋"
         ]
         
         # SQL关键词(技术层面的数据库操作)
+        # business_score +3
         self.sql_patterns = [
-            r"\b(select|from|where|group by|order by|having|join)\b",
-            r"\b(数据库|表名|字段名|SQL|sql)\b"
+            r"\b(select|from|where|group by|order by|having|join|update)\b",
+            r"\b(数据库|表名|表|字段名|SQL|sql|database|table)\b"
         ]
         
         # 聊天关键词(平台功能和帮助)
         self.chat_keywords = [
-            "你好", "谢谢", "再见", "怎么样", "如何", "为什么", "什么是",
+            "你好", "谢谢", "再见", "怎么样", "如何", "为什么", "什么是",
             "介绍", "解释", "说明", "帮助", "操作", "使用方法", "功能",
-            "教程", "指南", "手册"
+            "教程", "指南", "手册","讲解"
         ]
         
         # 追问关键词(用于检测追问型问题)
@@ -145,14 +165,14 @@ class QuestionClassifier:
         # 确定使用的路由模式
         if routing_mode:
             QUESTION_ROUTING_MODE = routing_mode
-            print(f"[CLASSIFIER] 使用传入的路由模式: {QUESTION_ROUTING_MODE}")
+            self.logger.info(f"使用传入的路由模式: {QUESTION_ROUTING_MODE}")
         else:
             try:
                 from app_config import QUESTION_ROUTING_MODE
-                print(f"[CLASSIFIER] 使用配置文件路由模式: {QUESTION_ROUTING_MODE}")
+                self.logger.info(f"使用配置文件路由模式: {QUESTION_ROUTING_MODE}")
             except ImportError:
                 QUESTION_ROUTING_MODE = "hybrid"
-                print(f"[CLASSIFIER] 配置导入失败,使用默认路由模式: {QUESTION_ROUTING_MODE}")
+                self.logger.info(f"配置导入失败,使用默认路由模式: {QUESTION_ROUTING_MODE}")
         
         # 根据路由模式选择分类策略
         if QUESTION_ROUTING_MODE == "database_direct":
@@ -182,36 +202,36 @@ class QuestionClassifier:
         2. 如果置信度不够且有上下文,考虑上下文辅助
         3. 检测话题切换,避免错误继承
         """
-        print(f"[CLASSIFIER] 渐进式分类 - 问题: {question}")
+        self.logger.info(f"渐进式分类 - 问题: {question}")
         if context_type:
-            print(f"[CLASSIFIER] 上下文类型: {context_type}")
+            self.logger.info(f"上下文类型: {context_type}")
         
         # 第一步:只基于问题本身分类
         primary_result = self._hybrid_classify(question)
-        print(f"[CLASSIFIER] 主分类结果: {primary_result.question_type}, 置信度: {primary_result.confidence}")
+        self.logger.info(f"主分类结果: {primary_result.question_type}, 置信度: {primary_result.confidence}")
         
         # 如果没有上下文,直接返回主分类结果
         if not context_type:
-            print(f"[CLASSIFIER] 无上下文,使用主分类结果")
+            self.logger.debug("无上下文,使用主分类结果")
             return primary_result
         
         # 如果置信度足够高,直接使用主分类结果
         if primary_result.confidence >= self.high_confidence_threshold:
-            print(f"[CLASSIFIER] 高置信度({primary_result.confidence}≥{self.high_confidence_threshold}),使用主分类结果")
+            self.logger.info(f"高置信度({primary_result.confidence}≥{self.high_confidence_threshold}),使用主分类结果")
             return primary_result
         
         # 检测明显的话题切换
         if self._is_topic_switch(question):
-            print(f"[CLASSIFIER] 检测到话题切换,忽略上下文")
+            self.logger.info("检测到话题切换,忽略上下文")
             return primary_result
         
         # 如果置信度较低,考虑上下文辅助
         if primary_result.confidence < self.medium_confidence_threshold:
-            print(f"[CLASSIFIER] 低置信度({primary_result.confidence}<{self.medium_confidence_threshold}),考虑上下文辅助")
+            self.logger.info(f"低置信度({primary_result.confidence}<{self.medium_confidence_threshold}),考虑上下文辅助")
             
             # 检测是否为追问型问题
             if self._is_follow_up_question(question):
-                print(f"[CLASSIFIER] 检测到追问型问题,继承上下文类型: {context_type}")
+                self.logger.info(f"检测到追问型问题,继承上下文类型: {context_type}")
                 return ClassificationResult(
                     question_type=context_type,
                     confidence=0.75,  # 给予中等置信度
@@ -220,7 +240,7 @@ class QuestionClassifier:
                 )
         
         # 中等置信度或其他情况,保持主分类结果
-        print(f"[CLASSIFIER] 保持主分类结果")
+        self.logger.debug("保持主分类结果")
         return primary_result
 
     def _is_follow_up_question(self, question: str) -> bool:
@@ -395,56 +415,74 @@ class QuestionClassifier:
                 method="rule_based_uncertain"
             )
     
+    def _load_business_context(self) -> str:
+        """从文件中加载数据库业务范围描述"""
+        try:
+            import os
+            current_dir = os.path.dirname(os.path.abspath(__file__))
+            prompt_file = os.path.join(current_dir, "tools", "db_query_decision_prompt.txt")
+            
+            with open(prompt_file, 'r', encoding='utf-8') as f:
+                content = f.read().strip()
+                
+            if not content:
+                raise ValueError("业务上下文文件为空")
+                
+            return content
+            
+        except FileNotFoundError:
+            error_msg = f"无法找到业务上下文文件: {prompt_file}"
+            self.logger.error(error_msg)
+            raise FileNotFoundError(error_msg)
+        except Exception as e:
+            error_msg = f"读取业务上下文文件失败: {str(e)}"
+            self.logger.error(error_msg)
+            raise RuntimeError(error_msg)
+
     def _enhanced_llm_classify(self, question: str) -> ClassificationResult:
         """增强的LLM分类:包含详细的业务上下文"""
         try:
             from common.vanna_instance import get_vanna_instance
             vn = get_vanna_instance()
             
+            # 动态加载业务上下文(如果失败会抛出异常)
+            business_context = self._load_business_context()
+            
             # 构建包含业务上下文的分类提示词
             classification_prompt = f"""
-请判断以下用户问题是否需要查询我们的高速公路服务区管理数据库。
+请判断以下用户问题是否需要查询我们的数据库。
 
 用户问题:{question}
 
-=== 数据库业务范围 ===
-本系统是高速公路服务区商业管理系统,包含以下业务数据:
-
-核心业务实体:
-- 服务区(bss_service_area):服务区基础信息、位置、状态,如"鄱阳湖服务区"、"信丰西服务区"
-- 档口/商铺(bss_branch):档口信息、品类(餐饮/小吃/便利店)、品牌,如"驿美餐饮"、"加水机"
-- 营业数据(bss_business_day_data):每日支付金额、订单数量,包含微信、支付宝、现金等支付方式
-- 车流量(bss_car_day_count):按车型统计的日流量数据,包含客车、货车、过境、危化品等
-- 公司信息(bss_company):服务区管理公司,如"驿美运营公司"
-
-关键业务指标:
-- 支付方式:微信支付(wx)、支付宝支付(zfb)、现金支付(rmb)、行吧支付(xs)、金豆支付(jd)
-- 营业数据:支付金额、订单数量、营业额、收入统计
-- 车流统计:按车型(客车/货车/过境/危化品/城际)的流量分析
-- 经营分析:餐饮、小吃、便利店、整体租赁等品类收入
-- 地理分区:北区、南区、西区、东区、两区
-
-高速线路:
-- 线路信息:大广、昌金、昌栗等高速线路
-- 路段管理:按线路统计服务区分布
+{business_context}
 
 === 判断标准 ===
 1. **DATABASE类型** - 需要查询数据库:
    - 涉及上述业务实体和指标的查询、统计、分析、报表
-   - 包含业务相关的时间查询,如"本月服务区营业额"、"上月档口收入"
-   - 例如:"本月营业额统计"、"档口收入排行"、"车流量分析"、"支付方式占比"
+   - 包含业务相关的时间查询
+   - 例如:业务数据统计、收入排行、流量分析、占比分析等
 
 2. **CHAT类型** - 不需要查询数据库:
    - 生活常识:水果蔬菜上市时间、动植物知识、天气等
    - 身份询问:你是谁、什么模型、AI助手等
    - 技术概念:人工智能、编程、算法等
    - 平台使用:功能介绍、操作帮助、使用教程等
+   - 旅游出行:旅游景点、酒店、机票、高铁、的士等
+   - 情绪:开心、伤心、无聊、生气、孤独、累了、烦恼、心情、难过、抑郁
+   - 商业:股票、基金、理财、投资、经济、通货膨胀、上市
+   - 哲学:人生意义、价值观、道德、信仰、宗教、爱情
+   - 政策:政策、法规、法律、条例、指南、手册、规章制度、实施细则
+   - 地理:全球、中国、亚洲、发展中、欧洲、美洲、东亚、东南亚、南美、非洲、大洋
+   - 体育:足球、NBA、篮球、乒乓球、冠军、夺冠
+   - 文学:小说、新闻、政治、战争、足球、NBA、篮球、乒乓球、冠军、夺冠
+   - 娱乐:游戏、小说、新闻、政治、战争、足球、NBA、篮球、乒乓球、冠军、夺冠、电影、电视剧、音乐、舞蹈、绘画、书法、摄影、雕塑、建筑、设计、
+   - 健康:健康、医疗、病症、健康、饮食、睡眠、心理、养生、减肥、美容、护肤
+   - 其他:高考、人生意义、价值观、道德、信仰、宗教、爱情、全球、全国、亚洲、发展中、欧洲、美洲、东亚、东南亚、南美、非洲、大洋
    - 例如:"荔枝几月份上市"、"今天天气如何"、"你是什么AI"、"怎么使用平台"
 
 **重要提示:**
 - 只有涉及高速公路服务区业务数据的问题才分类为DATABASE
-- 即使包含时间词汇(如"月份"、"时间"),也要看是否与我们的业务数据相关
-- 农产品上市时间、生活常识等都应分类为CHAT
+- 只要不是涉及高速公路服务区业务数据的问题都应分类为CHAT
 
 请基于问题与我们高速公路服务区业务数据的相关性来分类。
 
@@ -455,8 +493,8 @@ class QuestionClassifier:
 """
             
             # 专业的系统提示词
-            system_prompt = """你是一个专业的业务问题分类助手,专门负责高速公路服务区管理系统的问题分类。你具有以下特长:
-1. 深度理解高速公路服务区业务领域和数据范围
+            system_prompt = """你是一个专业的业务问题分类助手。你具有以下特长:
+1. 深度理解业务领域和数据范围
 2. 准确区分业务数据查询需求和一般性问题  
 3. 基于具体业务上下文进行精准分类,而不仅仅依赖关键词匹配
 4. 对边界情况能够给出合理的置信度评估
@@ -472,8 +510,17 @@ class QuestionClassifier:
             # 解析响应
             return self._parse_llm_response(response)
             
+        except (FileNotFoundError, RuntimeError) as e:
+            # 业务上下文加载失败,返回错误状态
+            self.logger.error(f"LLM分类失败,业务上下文不可用: {str(e)}")
+            return ClassificationResult(
+                question_type="CHAT",  # 失败时默认为CHAT,更安全
+                confidence=0.1,  # 很低的置信度表示分类不可靠
+                reason=f"业务上下文加载失败,无法进行准确分类: {str(e)}",
+                method="llm_context_error"
+            )
         except Exception as e:
-            print(f"[WARNING] 增强LLM分类失败: {str(e)}")
+            self.logger.warning(f"增强LLM分类失败: {str(e)}")
             return ClassificationResult(
                 question_type="CHAT",  # 失败时默认为CHAT,更安全
                 confidence=self.llm_fallback_confidence,

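The classifier hunks above also wire medium_confidence_threshold into the config loading and use it together with high_confidence_threshold in the progressive strategy. Condensed into a sketch (helper methods as shown in the hunks; not the full implementation):

```python
# Sketch of the threshold-driven flow in QuestionClassifier._progressive_classify,
# condensed from the hunks above; logging and edge cases are omitted.
def progressive_classify(self, question, context_type=None):
    primary = self._hybrid_classify(question)            # classify the question on its own first
    if not context_type:
        return primary                                   # no conversation context available
    if primary.confidence >= self.high_confidence_threshold:
        return primary                                   # confident enough without context
    if self._is_topic_switch(question):
        return primary                                   # obvious topic switch: ignore context
    if (primary.confidence < self.medium_confidence_threshold
            and self._is_follow_up_question(question)):
        return ClassificationResult(                     # low confidence + follow-up: inherit context type
            question_type=context_type,
            confidence=0.75,
            reason="继承上下文类型(追问型问题)",
            method="progressive_context",
        )
    return primary                                       # otherwise keep the primary result
```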
+ 20 - 0
agent/tools/db_query_decision_prompt.txt

@@ -0,0 +1,20 @@
+=== 数据库业务范围 ===
+本系统是高速公路服务区商业管理系统,包含以下业务数据:
+
+核心业务实体:
+- 服务区(bss_service_area):服务区基础信息、位置、状态,如"鄱阳湖服务区"、"信丰西服务区"
+- 档口/商铺(bss_branch):档口信息、品类(餐饮/小吃/便利店)、品牌,如"驿美餐饮"、"加水机"
+- 营业数据(bss_business_day_data):每日支付金额、订单数量,包含微信、支付宝、现金等支付方式
+- 车流量(bss_car_day_count):按车型统计的日流量数据,包含客车、货车、过境、危化品等
+- 公司信息(bss_company):服务区管理公司,如"驿美运营公司"
+
+关键业务指标:
+- 支付方式:微信支付(wx)、支付宝支付(zfb)、现金支付(rmb)、行吧支付(xs)、金豆支付(jd)
+- 营业数据:支付金额、订单数量、营业额、收入统计
+- 车流统计:按车型(客车/货车/过境/危化品/城际)的流量分析
+- 经营分析:餐饮、小吃、便利店、整体租赁等品类收入
+- 地理分区:北区、南区、西区、东区、两区
+
+高速线路:
+- 线路信息:大广、昌金、昌栗等高速线路
+- 路段管理:按线路统计服务区分布

+ 20 - 0
agent/tools/db_query_decision_prompt.txt.bak

@@ -0,0 +1,20 @@
+=== 数据库业务范围 ===
+本系统是高速公路服务区商业管理系统,包含以下业务数据:
+
+核心业务实体:
+- 服务区(bss_service_area):服务区基础信息、位置、状态,如"鄱阳湖服务区"、"信丰西服务区"
+- 档口/商铺(bss_branch):档口信息、品类(餐饮/小吃/便利店)、品牌,如"驿美餐饮"、"加水机"
+- 营业数据(bss_business_day_data):每日支付金额、订单数量,包含微信、支付宝、现金等支付方式
+- 车流量(bss_car_day_count):按车型统计的日流量数据,包含客车、货车、过境、危化品等
+- 公司信息(bss_company):服务区管理公司,如"驿美运营公司"
+
+关键业务指标:
+- 支付方式:微信支付(wx)、支付宝支付(zfb)、现金支付(rmb)、行吧支付(xs)、金豆支付(jd)
+- 营业数据:支付金额、订单数量、营业额、收入统计
+- 车流统计:按车型(客车/货车/过境/危化品/城际)的流量分析
+- 经营分析:餐饮、小吃、便利店、整体租赁等品类收入
+- 地理分区:北区、南区、西区、东区、两区
+
+高速线路:
+- 线路信息:大广、昌金、昌栗等高速线路
+- 路段管理:按线路统计服务区分布

+ 7 - 3
agent/tools/general_chat.py

@@ -2,6 +2,10 @@
 from langchain.tools import tool
 from typing import Dict, Any, Optional
 from common.vanna_instance import get_vanna_instance
+from core.logging import get_agent_logger
+
+# Initialize logger
+logger = get_agent_logger("GeneralChat")
 
 @tool
 def general_chat(question: str, context: Optional[str] = None) -> Dict[str, Any]:
@@ -21,7 +25,7 @@ def general_chat(question: str, context: Optional[str] = None) -> Dict[str, Any]
         }
     """
     try:
-        print(f"[TOOL:general_chat] 处理聊天问题: {question}")
+        logger.info(f"处理聊天问题: {question}")
         
         system_prompt = """
 你是Citu智能数据问答平台的AI助手,为用户提供全面的帮助和支持。
@@ -58,7 +62,7 @@ def general_chat(question: str, context: Optional[str] = None) -> Dict[str, Any]
         )
         
         if response:
-            print(f"[TOOL:general_chat] 聊天响应生成成功: {response[:100]}...")
+            logger.info(f"聊天响应生成成功: {response[:100]}...")
             return {
                 "success": True,
                 "response": response.strip(),
@@ -72,7 +76,7 @@ def general_chat(question: str, context: Optional[str] = None) -> Dict[str, Any]
             }
             
     except Exception as e:
-        print(f"[ERROR] 通用聊天异常: {str(e)}")
+        logger.error(f"通用聊天异常: {str(e)}")
         return {
             "success": False,
             "response": _get_fallback_response(question),

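补充说明:各工具模块在本次提交中统一改用 `core.logging.get_agent_logger` 替代原来的 `print`。该日志门面的实现不在本节摘录中,下面给出一个与这些调用方式兼容的最简示意(`initialize_logging` 的行为、日志格式等均为假设,实际以提交中的 `core/logging` 模块为准):

```python
# 示意代码:core.logging 门面的最简兼容实现(假设,非提交中的实际代码)
import logging

def initialize_logging(config_path: str = "config/logging_config.yaml") -> None:
    # 假设:实际实现会加载 YAML 配置;此处仅做基础配置,
    # 输出格式与后文日志解析正则「时间 [级别] 名称: 消息」保持一致
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

def get_agent_logger(name: str) -> logging.Logger:
    # 假设:agent 相关日志统一挂在 "agent." 命名空间下
    return logging.getLogger(f"agent.{name}")

# 调用方式与本次提交中的改动一致
logger = get_agent_logger("GeneralChat")
logger.info("处理聊天问题: ...")
```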
+ 10 - 6
agent/tools/sql_execution.py

@@ -6,6 +6,10 @@ import time
 import functools
 from common.vanna_instance import get_vanna_instance
 from app_config import API_MAX_RETURN_ROWS
+from core.logging import get_agent_logger
+
+# Initialize logger
+logger = get_agent_logger("SQLExecution")
 
 def retry_on_failure(max_retries: int = 2, delay: float = 1.0, backoff_factor: float = 2.0):
     """
@@ -29,7 +33,7 @@ def retry_on_failure(max_retries: int = 2, delay: float = 1.0, backoff_factor: f
                         if retries < max_retries:
                             retries += 1
                             wait_time = delay * (backoff_factor ** (retries - 1))
-                            print(f"[RETRY] {func.__name__} 执行失败,等待 {wait_time:.1f} 秒后重试 ({retries}/{max_retries})")
+                            logger.warning(f"{func.__name__} 执行失败,等待 {wait_time:.1f} 秒后重试 ({retries}/{max_retries})")
                             time.sleep(wait_time)
                             continue
                     
@@ -39,10 +43,10 @@ def retry_on_failure(max_retries: int = 2, delay: float = 1.0, backoff_factor: f
                     retries += 1
                     if retries <= max_retries:
                         wait_time = delay * (backoff_factor ** (retries - 1))
-                        print(f"[RETRY] {func.__name__} 异常: {str(e)}, 等待 {wait_time:.1f} 秒后重试 ({retries}/{max_retries})")
+                        logger.warning(f"{func.__name__} 异常: {str(e)}, 等待 {wait_time:.1f} 秒后重试 ({retries}/{max_retries})")
                         time.sleep(wait_time)
                     else:
-                        print(f"[RETRY] {func.__name__} 达到最大重试次数 ({max_retries}),抛出异常")
+                        logger.error(f"{func.__name__} 达到最大重试次数 ({max_retries}),抛出异常")
                         raise
             
             # 不应该到达这里,但为了安全性
@@ -75,7 +79,7 @@ def execute_sql(sql: str, max_rows: int = None) -> Dict[str, Any]:
     if max_rows is None:
         max_rows = API_MAX_RETURN_ROWS if API_MAX_RETURN_ROWS is not None else DEFAULT_MAX_RETURN_ROWS
     try:
-        print(f"[TOOL:execute_sql] 开始执行SQL: {sql[:100]}...")
+        logger.info(f"开始执行SQL: {sql[:100]}...")
         
         vn = get_vanna_instance()
         df = vn.run_sql(sql)
@@ -118,7 +122,7 @@ def execute_sql(sql: str, max_rows: int = None) -> Dict[str, Any]:
         rows = _process_dataframe_rows(limited_df.to_dict(orient="records"))
         columns = list(df.columns)
         
-        print(f"[TOOL:execute_sql] 查询成功,返回 {len(rows)} 行数据")
+        logger.info(f"查询成功,返回 {len(rows)} 行数据")
         
         result = {
             "success": True,
@@ -139,7 +143,7 @@ def execute_sql(sql: str, max_rows: int = None) -> Dict[str, Any]:
         
     except Exception as e:
         error_msg = str(e)
-        print(f"[ERROR] SQL执行异常: {error_msg}")
+        logger.error(f"SQL执行异常: {error_msg}")
         
         return {
             "success": False,

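补充说明:`retry_on_failure` 按 `delay * backoff_factor ** (retries - 1)` 做指数退避,默认参数下两次重试分别等待 1.0 秒和 2.0 秒。下面是一个假设的使用示例(函数名为示意,非本次提交内容,放在 sql_execution.py 同模块内即可运行):

```python
# 示意代码:retry_on_failure 的典型用法(函数名为示例假设)
@retry_on_failure(max_retries=2, delay=1.0, backoff_factor=2.0)
def run_sql_with_retry(sql: str):
    # 失败后第一次重试前等待 1.0 秒,第二次重试前等待 2.0 秒
    vn = get_vanna_instance()
    return vn.run_sql(sql)
```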
+ 7 - 19
agent/tools/sql_generation.py

@@ -2,6 +2,10 @@
 from langchain.tools import tool
 from typing import Dict, Any
 from common.vanna_instance import get_vanna_instance
+from core.logging import get_agent_logger
+
+# Initialize logger
+logger = get_agent_logger("SQLGeneration")
 
 @tool
 def generate_sql(question: str, allow_llm_to_see_data: bool = True) -> Dict[str, Any]:
@@ -22,7 +26,7 @@ def generate_sql(question: str, allow_llm_to_see_data: bool = True) -> Dict[str,
         }
     """
     try:
-        print(f"[TOOL:generate_sql] 开始生成SQL: {question}")
+        logger.info(f"开始生成SQL: {question}")
         
         vn = get_vanna_instance()
         sql = vn.generate_sql(question=question, allow_llm_to_see_data=allow_llm_to_see_data)
@@ -58,23 +62,7 @@ def generate_sql(question: str, allow_llm_to_see_data: bool = True) -> Dict[str,
                 "can_retry": True
             }
         
-        # 检查是否返回了错误信息而非SQL
-        error_indicators = [
-            "insufficient context", "无法生成", "sorry", "cannot generate",
-            "not enough information", "unclear", "unable to"
-        ]
-        
-        if any(indicator in sql_clean.lower() for indicator in error_indicators):
-            # 这是解释性文本(已在base_llm_chat.py中处理thinking内容)
-            return {
-                "success": False,
-                "sql": None,
-                "error": sql_clean,
-                "error_type": "llm_explanation",
-                "can_retry": False
-            }
-        
-        print(f"[TOOL:generate_sql] 成功生成SQL: {sql}")
+        logger.info(f"成功生成SQL: {sql}")
         return {
             "success": True,
             "sql": sql,
@@ -83,7 +71,7 @@ def generate_sql(question: str, allow_llm_to_see_data: bool = True) -> Dict[str,
         }
         
     except Exception as e:
-        print(f"[ERROR] SQL生成异常: {str(e)}")
+        logger.error(f"SQL生成异常: {str(e)}")
         return {
             "success": False,
             "sql": None,

+ 8 - 4
agent/tools/summary_generation.py

@@ -3,6 +3,10 @@ from langchain.tools import tool
 from typing import Dict, Any
 import pandas as pd
 from common.vanna_instance import get_vanna_instance
+from core.logging import get_agent_logger
+
+# Initialize logger
+logger = get_agent_logger("SummaryGeneration")
 
 @tool
 def generate_summary(question: str, query_result: Dict[str, Any], sql: str) -> Dict[str, Any]:
@@ -23,7 +27,7 @@ def generate_summary(question: str, query_result: Dict[str, Any], sql: str) -> D
         }
     """
     try:
-        print(f"[TOOL:generate_summary] 开始生成摘要,问题: {question}")
+        logger.info(f"开始生成摘要,问题: {question}")
         
         if not query_result or not query_result.get("rows"):
             return {
@@ -50,7 +54,7 @@ def generate_summary(question: str, query_result: Dict[str, Any], sql: str) -> D
             # 生成默认摘要
             summary = _generate_default_summary(question, query_result, sql)
         
-        print(f"[TOOL:generate_summary] 摘要生成成功: {summary[:100]}...")
+        logger.info(f"摘要生成成功: {summary[:100]}...")
         
         return {
             "success": True,
@@ -59,7 +63,7 @@ def generate_summary(question: str, query_result: Dict[str, Any], sql: str) -> D
         }
         
     except Exception as e:
-        print(f"[ERROR] 摘要生成异常: {str(e)}")
+        logger.error(f"摘要生成异常: {str(e)}")
         
         # 生成备用摘要
         fallback_summary = _generate_fallback_summary(question, query_result, sql)
@@ -82,7 +86,7 @@ def _reconstruct_dataframe(query_result: Dict[str, Any]) -> pd.DataFrame:
         return pd.DataFrame(rows, columns=columns)
         
     except Exception as e:
-        print(f"[WARNING] DataFrame重构失败: {str(e)}")
+        logger.warning(f"DataFrame重构失败: {str(e)}")
         return pd.DataFrame()
 
 def _generate_default_summary(question: str, query_result: Dict[str, Any], sql: str) -> str:

+ 13 - 9
agent/utils.py → agent/tools/utils.py

@@ -1,4 +1,4 @@
-# agent/utils.py
+# agent/tools/utils.py
 """
 Agent相关的工具函数
 """
@@ -7,6 +7,10 @@ import json
 from typing import Dict, Any, Callable, List, Optional
 from langchain_core.messages import BaseMessage, AIMessage, HumanMessage, SystemMessage, ToolMessage
 from langchain_core.tools import BaseTool
+from core.logging import get_agent_logger
+
+# Initialize logger
+logger = get_agent_logger("AgentUtils")
 
 def handle_tool_errors(func: Callable) -> Callable:
     """
@@ -17,7 +21,7 @@ def handle_tool_errors(func: Callable) -> Callable:
         try:
             return func(*args, **kwargs)
         except Exception as e:
-            print(f"[ERROR] 工具 {func.__name__} 执行失败: {str(e)}")
+            logger.error(f"工具 {func.__name__} 执行失败: {str(e)}")
             return {
                 "success": False,
                 "error": f"工具执行异常: {str(e)}",
@@ -50,7 +54,7 @@ class LLMWrapper:
                 return self._invoke_without_tools(messages, **kwargs)
                 
         except Exception as e:
-            print(f"[ERROR] LLM包装器调用失败: {str(e)}")
+            logger.error(f"LLM包装器调用失败: {str(e)}")
             return AIMessage(content=f"LLM调用失败: {str(e)}")
     
     def _should_use_tools(self, messages: List[BaseMessage]) -> bool:
@@ -88,7 +92,7 @@ class LLMWrapper:
                 return AIMessage(content=response)
                 
         except Exception as e:
-            print(f"[ERROR] 工具调用失败: {str(e)}")
+            logger.error(f"工具调用失败: {str(e)}")
             return self._invoke_without_tools(messages, **kwargs)
     
     def _invoke_without_tools(self, messages: List[BaseMessage], **kwargs):
@@ -206,26 +210,26 @@ def get_compatible_llm():
                     model=llm_config.get("model"),
                     temperature=llm_config.get("temperature", 0.7)
                 )
-                print("[INFO] 使用标准OpenAI兼容API")
+                logger.info("使用标准OpenAI兼容API")
                 return llm
             except ImportError:
-                print("[WARNING] langchain_openai 未安装,使用 Vanna 实例包装器")
+                logger.warning("langchain_openai 未安装,使用 Vanna 实例包装器")
         
         # 优先使用统一的 Vanna 实例
         from common.vanna_instance import get_vanna_instance
         vn = get_vanna_instance()
-        print("[INFO] 使用Vanna实例包装器")
+        logger.info("使用Vanna实例包装器")
         return LLMWrapper(vn)
         
     except Exception as e:
-        print(f"[ERROR] 获取 Vanna 实例失败: {str(e)}")
+        logger.error(f"获取 Vanna 实例失败: {str(e)}")
         # 回退到原有逻辑
         from common.utils import get_current_llm_config
         from customllm.qianwen_chat import QianWenChat
         
         llm_config = get_current_llm_config()
         custom_llm = QianWenChat(config=llm_config)
-        print("[INFO] 使用QianWen包装器")
+        logger.info("使用QianWen包装器")
         return LLMWrapper(custom_llm)
 
 def _is_valid_sql_format(sql_text: str) -> bool:

+ 297 - 0
api_usage_examples.md

@@ -0,0 +1,297 @@
+# 表检查API使用指南
+
+本文档介绍新开发的数据库表检查API的使用方法。
+
+## 📋 API概览
+
+### 1. 获取表列表
+- **路径**: `POST /api/v0/database/tables`
+- **功能**: 获取数据库中的表列表,支持表名模糊搜索
+
+### 2. 获取表DDL/文档
+- **路径**: `POST /api/v0/database/table/ddl`
+- **功能**: 获取表的DDL语句或MD文档
+
+## 🔧 API 1: 获取表列表
+
+### 请求示例
+
+#### 基础查询
+```bash
+curl -X POST http://localhost:8084/api/v0/database/tables \
+  -H "Content-Type: application/json" \
+  -d '{
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:5432/highway_db",
+    "schema": "public,ods"
+  }'
+```
+
+#### 表名模糊搜索
+```bash
+curl -X POST http://localhost:8084/api/v0/database/tables \
+  -H "Content-Type: application/json" \
+  -d '{
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:5432/highway_db",
+    "schema": "public,ods",
+    "table_name_pattern": "ods_*"
+  }'
+```
+
+### 参数说明
+
+| 参数 | 类型 | 必需 | 说明 |
+|------|------|------|------|
+| db_connection | string | ✅ | 完整的PostgreSQL连接字符串 |
+| schema | string | ❌ | 查询的schema,支持多个用逗号分隔,默认为public |
+| table_name_pattern | string | ❌ | 表名模糊搜索模式,支持通配符:`ods_*`、`*_dim`、`*fact*` |
+
+### 响应示例
+
+#### 基础查询响应
+```json
+{
+  "success": true,
+  "code": 200,
+  "message": "获取表列表成功",
+  "data": {
+    "tables": [
+      "public.bss_company",
+      "public.bss_branch_copy",
+      "ods.raw_data"
+    ],
+    "total": 3,
+    "schemas": ["public", "ods"],
+    "db_connection_info": {
+      "database": "highway_db"
+    }
+  }
+}
+```
+
+#### 模糊搜索响应
+```json
+{
+  "success": true,
+  "code": 200,
+  "message": "获取表列表成功",
+  "data": {
+    "tables": [
+      "ods.ods_user",
+      "ods.ods_order",
+      "ods.ods_product"
+    ],
+    "total": 3,
+    "schemas": ["ods"],
+    "table_name_pattern": "ods_*",
+    "db_connection_info": {
+      "database": "highway_db"
+    }
+  }
+}
+```
+
+## 📄 API 2: 获取表DDL/文档
+
+### 请求示例
+
+#### DDL格式
+```bash
+curl -X POST http://localhost:8084/api/v0/database/table/ddl \
+  -H "Content-Type: application/json" \
+  -d '{
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:5432/highway_db",
+    "table": "public.bss_company",
+    "business_context": "高速公路服务区管理系统",
+    "type": "ddl"
+  }'
+```
+
+#### MD文档格式
+```bash
+curl -X POST http://localhost:8084/api/v0/database/table/ddl \
+  -H "Content-Type: application/json" \
+  -d '{
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:5432/highway_db",
+    "table": "public.bss_company",
+    "business_context": "高速公路服务区管理系统",
+    "type": "md"
+  }'
+```
+
+#### 同时获取DDL和MD
+```bash
+curl -X POST http://localhost:8084/api/v0/database/table/ddl \
+  -H "Content-Type: application/json" \
+  -d '{
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:5432/highway_db",
+    "table": "public.bss_company",
+    "business_context": "高速公路服务区管理系统",
+    "type": "both"
+  }'
+```
+
+### 参数说明
+
+| 参数 | 类型 | 必需 | 说明 |
+|------|------|------|------|
+| db_connection | string | ✅ | 完整的PostgreSQL连接字符串 |
+| table | string | ✅ | 表名,格式为 schema.tablename |
+| business_context | string | ❌ | 业务上下文描述,用于LLM生成更准确的注释 |
+| type | string | ❌ | 输出类型:ddl/md/both,默认ddl |
+
+### 响应示例
+
+```json
+{
+  "success": true,
+  "code": 200,
+  "message": "获取表DDL成功",
+  "data": {
+    "ddl": "-- 中文名: 服务区档口基础信息表\n-- 描述: 服务区档口基础信息表...\ncreate table public.bss_company (\n  id varchar(32) not null     -- 主键ID,\n  ...\n);",
+    "md": "## bss_company(服务区档口基础信息表)\n...",
+    "table_info": {
+      "table_name": "bss_company",
+      "schema_name": "public",
+      "full_name": "public.bss_company",
+      "comment": "服务区档口基础信息表",
+      "field_count": 15,
+      "row_count": 1000,
+      "table_size": "256 kB"
+    },
+    "fields": [
+      {
+        "name": "id",
+        "type": "varchar",
+        "nullable": false,
+        "comment": "主键ID",
+        "is_primary_key": true,
+        "is_foreign_key": false,
+        "default_value": null,
+        "is_enum": false,
+        "enum_values": []
+      }
+    ],
+    "generation_info": {
+      "business_context": "高速公路服务区管理系统",
+      "output_type": "both",
+      "has_llm_comments": true,
+      "database": "highway_db"
+    }
+  }
+}
+```
+
+## 🚀 特性说明
+
+### 表名模糊搜索(新增功能)
+- 支持通配符模式:`ods_*`、`*_dim`、`*fact*`
+- 支持SQL LIKE语法:`ods_%`、`%_dim`
+- 数据库层面高效过滤,适用于大量表的场景
+- 自动转换通配符为SQL LIKE语法
+
+### 智能注释生成
+- 当提供`business_context`时,系统会调用LLM生成智能注释
+- LLM会结合表结构、样例数据和业务上下文生成准确的中文注释
+- 自动识别枚举字段并提供可能的取值
+
+### 多格式输出
+- **DDL**: 标准的CREATE TABLE语句,包含注释
+- **MD**: Markdown格式的表文档,适合文档系统
+- **Both**: 同时提供DDL和MD格式
+
+### 高性能设计
+- 复用现有的`data_pipeline`模块,90%+代码复用率
+- 异步处理,支持并发请求
+- 智能缓存,避免重复计算
+
+## 🧪 测试方法
+
+运行测试脚本:
+```bash
+python test_table_inspector_api.py
+```
+
+测试脚本包含:
+- 表列表API的各种参数组合测试
+- 表名模糊搜索功能测试
+- DDL/MD生成API的功能测试
+- 错误处理测试
+- 性能基准测试
+
+## ⚠️ 注意事项
+
+1. **连接字符串**: 必须包含完整的数据库信息
+2. **LLM调用**: 当提供`business_context`时会调用LLM,响应时间较长
+3. **权限要求**: 需要数据库的读取权限
+4. **超时设置**: DDL生成包含LLM调用,建议设置60秒以上超时
+5. **表名模糊搜索**: 支持 `*` 通配符和 `%` SQL语法,区分大小写
+
+## 🔗 集成示例
+
+### JavaScript/前端集成
+```javascript
+// 获取表列表
+const tables = await fetch('/api/v0/database/tables', {
+  method: 'POST',
+  headers: { 'Content-Type': 'application/json' },
+  body: JSON.stringify({
+    db_connection: 'postgresql://user:pass@host:5432/db',
+    schema: 'public'
+  })
+}).then(r => r.json());
+
+// 获取表列表(使用模糊搜索)
+const filteredTables = await fetch('/api/v0/database/tables', {
+  method: 'POST',
+  headers: { 'Content-Type': 'application/json' },
+  body: JSON.stringify({
+    db_connection: 'postgresql://user:pass@host:5432/db',
+    schema: 'public,ods',
+    table_name_pattern: 'ods_*'
+  })
+}).then(r => r.json());
+
+// 获取表DDL
+const ddl = await fetch('/api/v0/database/table/ddl', {
+  method: 'POST',
+  headers: { 'Content-Type': 'application/json' },
+  body: JSON.stringify({
+    db_connection: 'postgresql://user:pass@host:5432/db',
+    table: 'public.users',
+    business_context: '用户管理系统',
+    type: 'both'
+  })
+}).then(r => r.json());
+```
+
+### Python集成
+```python
+import requests
+
+# 获取表列表
+response = requests.post('http://localhost:8084/api/v0/database/tables', 
+  json={
+    'db_connection': 'postgresql://user:pass@host:5432/db',
+    'schema': 'public'
+  })
+tables = response.json()
+
+# 获取表列表(使用模糊搜索)
+response = requests.post('http://localhost:8084/api/v0/database/tables', 
+  json={
+    'db_connection': 'postgresql://user:pass@host:5432/db',
+    'schema': 'public,ods',
+    'table_name_pattern': 'ods_*'
+  })
+ods_tables = response.json()
+
+# 获取表DDL  
+response = requests.post('http://localhost:8084/api/v0/database/table/ddl',
+  json={
+    'db_connection': 'postgresql://user:pass@host:5432/db', 
+    'table': 'public.users',
+    'business_context': '用户管理系统',
+    'type': 'ddl'
+  })
+ddl = response.json()
+``` 
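补充说明:上文《表名模糊搜索》提到 `*` 通配符会在数据库层转换为 SQL LIKE 模式。一个最简的转换示意如下(假设实现,实际逻辑以提交中的 `data_pipeline/api/table_inspector_api.py` 为准):

```python
# 示意代码:表名通配符到 SQL LIKE 模式的转换(假设实现)
def to_like_pattern(table_name_pattern: str) -> str:
    # "ods_*" -> "ods_%","*_dim" -> "%_dim";已经是 "%" 语法的模式原样保留
    return table_name_pattern.replace("*", "%")

# 之后可在 information_schema 查询中作为参数化的 LIKE 条件使用,例如:
# SELECT table_schema, table_name FROM information_schema.tables
#  WHERE table_schema = ANY(%(schemas)s) AND table_name LIKE %(pattern)s
```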

+ 1 - 14
app_config.py

@@ -117,19 +117,6 @@ TRAINING_BATCH_PROCESSING_ENABLED = False   # 是否启用训练数据批处理
 TRAINING_BATCH_SIZE = 10                    # 每批处理的训练项目数量
 TRAINING_MAX_WORKERS = 1                    # 训练批处理的最大工作线程数(设置为1确保单线程)
 
-# 训练数据路径配置
-# 支持以下格式:
-# 1. 相对路径(以 . 开头):
-#    "./training/data"     - 项目根目录下的training/data
-#    "../data"             - 项目根目录上级的data目录
-# 2. 绝对路径:
-#    "/home/user/data"     - Linux绝对路径
-#    "C:/data"             - Windows绝对路径
-#    "D:\\training\\data"  - Windows绝对路径(转义反斜杠)
-# 3. 相对路径(不以.开头):
-#    "training/data"       - 相对于项目根目录
-#    "my_data"             - 项目根目录下的my_data文件夹
-TRAINING_DATA_PATH = "./training/data"
 
 # 是否启用问题重写功能,也就是上下文问题合并。
 REWRITE_QUESTION_ENABLED = False
@@ -182,7 +169,7 @@ REDIS_PASSWORD = None
 
 # 缓存开关配置
 ENABLE_CONVERSATION_CONTEXT = True      # 是否启用对话上下文
-ENABLE_QUESTION_ANSWER_CACHE = True     # 是否启用问答结果缓存
+ENABLE_QUESTION_ANSWER_CACHE = False     # 是否启用问答结果缓存
 ENABLE_EMBEDDING_CACHE = True           # 是否启用embedding向量缓存
 
 # TTL配置(单位:秒)

+ 1827 - 99
citu_app.py

@@ -1,4 +1,8 @@
 # 给dataops 对话助手返回结果
+# 初始化日志系统 - 必须在最前面
+from core.logging import initialize_logging, get_app_logger, set_log_context, clear_log_context
+initialize_logging()
+
 from vanna.flask import VannaFlaskApp
 from core.vanna_llm_factory import create_vanna_instance
 from flask import request, jsonify
@@ -15,13 +19,10 @@ import sqlparse  # 用于SQL语法检查
 from common.redis_conversation_manager import RedisConversationManager  # 添加Redis对话管理器导入
 
 from common.qa_feedback_manager import QAFeedbackManager
-from common.result import success_response, bad_request_response, not_found_response, internal_error_response
-
-
 from common.result import (  # 统一导入所有需要的响应函数
-    bad_request_response, service_unavailable_response, 
+    success_response, bad_request_response, not_found_response, internal_error_response,
+    error_response, service_unavailable_response, 
     agent_success_response, agent_error_response,
-    internal_error_response, success_response,
     validation_failed_response
 )
 from app_config import (  # 添加Redis相关配置导入
@@ -31,6 +32,9 @@ from app_config import (  # 添加Redis相关配置导入
     ENABLE_QUESTION_ANSWER_CACHE
 )
 
+# 创建app logger
+logger = get_app_logger("CituApp")
+
 # 设置默认的最大返回行数
 DEFAULT_MAX_RETURN_ROWS = 200
 MAX_RETURN_ROWS = API_MAX_RETURN_ROWS if API_MAX_RETURN_ROWS is not None else DEFAULT_MAX_RETURN_ROWS
@@ -131,9 +135,9 @@ def ask_full():
                 if ENABLE_RESULT_SUMMARY:
                     try:
                         summary = vn.generate_summary(question=question, df=df)
-                        print(f"[INFO] 成功生成摘要: {summary}")
+                        logger.info(f"成功生成摘要: {summary}")
                     except Exception as e:
-                        print(f"[WARNING] 生成摘要失败: {str(e)}")
+                        logger.warning(f"生成摘要失败: {str(e)}")
                         summary = None
 
         # 构建返回数据
@@ -156,7 +160,7 @@ def ask_full():
         ))
         
     except Exception as e:
-        print(f"[ERROR] ask_full执行失败: {str(e)}")
+        logger.error(f"ask_full执行失败: {str(e)}")
         
         # 即使发生异常,也检查是否有业务层面的解释
         if hasattr(vn, 'last_llm_explanation') and vn.last_llm_explanation:
@@ -219,7 +223,7 @@ def citu_run_sql():
         ))
         
     except Exception as e:
-        print(f"[ERROR] citu_run_sql执行失败: {str(e)}")
+        logger.error(f"citu_run_sql执行失败: {str(e)}")
         from common.result import internal_error_response
         return jsonify(internal_error_response(
             response_text=f"SQL执行失败,请检查SQL语句是否正确"
@@ -245,27 +249,27 @@ def ask_cached():
     try:
         # 生成conversation_id
         # 调试:查看generate_id的实际行为
-        print(f"[DEBUG] 输入问题: '{question}'")
+        logger.debug(f"输入问题: '{question}'")
         conversation_id = app.cache.generate_id(question=question)
-        print(f"[DEBUG] 生成的conversation_id: {conversation_id}")
+        logger.debug(f"生成的conversation_id: {conversation_id}")
         
         # 再次用相同问题测试
         conversation_id2 = app.cache.generate_id(question=question)
-        print(f"[DEBUG] 再次生成的conversation_id: {conversation_id2}")
-        print(f"[DEBUG] 两次ID是否相同: {conversation_id == conversation_id2}")
+        logger.debug(f"再次生成的conversation_id: {conversation_id2}")
+        logger.debug(f"两次ID是否相同: {conversation_id == conversation_id2}")
         
         # 检查缓存
         cached_sql = app.cache.get(id=conversation_id, field="sql")
         
         if cached_sql is not None:
             # 缓存命中
-            print(f"[CACHE HIT] 使用缓存结果: {conversation_id}")
+            logger.info(f"[CACHE HIT] 使用缓存结果: {conversation_id}")
             sql = cached_sql
             df = app.cache.get(id=conversation_id, field="df")
             summary = app.cache.get(id=conversation_id, field="summary")
         else:
             # 缓存未命中,执行新查询
-            print(f"[CACHE MISS] 执行新查询: {conversation_id}")
+            logger.info(f"[CACHE MISS] 执行新查询: {conversation_id}")
             
             sql, df, _ = vn.ask(
                 question=question,
@@ -301,9 +305,9 @@ def ask_cached():
             if ENABLE_RESULT_SUMMARY and isinstance(df, pd.DataFrame) and not df.empty:
                 try:
                     summary = vn.generate_summary(question=question, df=df)
-                    print(f"[INFO] 成功生成摘要: {summary}")
+                    logger.info(f"成功生成摘要: {summary}")
                 except Exception as e:
-                    print(f"[WARNING] 生成摘要失败: {str(e)}")
+                    logger.warning(f"生成摘要失败: {str(e)}")
                     summary = None
             
             app.cache.set(id=conversation_id, field="summary", value=summary)
@@ -348,7 +352,7 @@ def ask_cached():
         ))
         
     except Exception as e:
-        print(f"[ERROR] ask_cached执行失败: {str(e)}")
+        logger.error(f"ask_cached执行失败: {str(e)}")
         from common.result import internal_error_response
         return jsonify(internal_error_response(
             response_text="查询处理失败,请稍后重试"
@@ -386,10 +390,10 @@ def citu_train_question_sql():
         # 正确的调用方式:同时传递question和sql
         if question:
             training_id = vn.train(question=question, sql=sql)
-            print(f"训练成功,训练ID为:{training_id},问题:{question},SQL:{sql}")
+            logger.info(f"训练成功,训练ID为:{training_id},问题:{question},SQL:{sql}")
         else:
             training_id = vn.train(sql=sql)
-            print(f"训练成功,训练ID为:{training_id},SQL:{sql}")
+            logger.info(f"训练成功,训练ID为:{training_id},SQL:{sql}")
 
         from common.result import success_response
         return jsonify(success_response(
@@ -418,23 +422,23 @@ def get_citu_langraph_agent():
     if citu_langraph_agent is None:
         try:
             from agent.citu_agent import CituLangGraphAgent
-            print("[CITU_APP] 开始创建LangGraph Agent实例...")
+            logger.info("开始创建LangGraph Agent实例...")
             citu_langraph_agent = CituLangGraphAgent()
-            print("[CITU_APP] LangGraph Agent实例创建成功")
+            logger.info("LangGraph Agent实例创建成功")
         except ImportError as e:
-            print(f"[CRITICAL] Agent模块导入失败: {str(e)}")
-            print("[CRITICAL] 请检查agent模块是否存在以及依赖是否正确安装")
+            logger.critical(f"Agent模块导入失败: {str(e)}")
+            logger.critical("请检查agent模块是否存在以及依赖是否正确安装")
             raise Exception(f"Agent模块导入失败: {str(e)}")
         except Exception as e:
-            print(f"[CRITICAL] LangGraph Agent实例创建失败: {str(e)}")
-            print(f"[CRITICAL] 错误类型: {type(e).__name__}")
+            logger.critical(f"LangGraph Agent实例创建失败: {str(e)}")
+            logger.critical(f"错误类型: {type(e).__name__}")
             # 提供更有用的错误信息
             if "config" in str(e).lower():
-                print("[CRITICAL] 可能是配置文件问题,请检查配置")
+                logger.critical("可能是配置文件问题,请检查配置")
             elif "llm" in str(e).lower():
-                print("[CRITICAL] 可能是LLM连接问题,请检查LLM配置")
+                logger.critical("可能是LLM连接问题,请检查LLM配置")
             elif "tool" in str(e).lower():
-                print("[CRITICAL] 可能是工具加载问题,请检查工具模块")
+                logger.critical("可能是工具加载问题,请检查工具模块")
             raise Exception(f"Agent初始化失败: {str(e)}")
     return citu_langraph_agent
 
@@ -495,15 +499,15 @@ def ask_agent():
                         metadata = message.get("metadata", {})
                         context_type = metadata.get("type")
                         if context_type:
-                            print(f"[AGENT_API] 检测到上下文类型: {context_type}")
+                            logger.info(f"[AGENT_API] 检测到上下文类型: {context_type}")
                             break
             except Exception as e:
-                print(f"[WARNING] 获取上下文类型失败: {str(e)}")
+                logger.warning(f"获取上下文类型失败: {str(e)}")
         
         # 4. 检查缓存(新逻辑:放宽使用条件,严控存储条件)
         cached_answer = redis_conversation_manager.get_cached_answer(question, context)
         if cached_answer:
-            print(f"[AGENT_API] 使用缓存答案")
+            logger.info(f"[AGENT_API] 使用缓存答案")
             
             # 确定缓存答案的助手回复内容(使用与非缓存相同的优先级逻辑)
             cached_response_type = cached_answer.get("type", "UNKNOWN")
@@ -567,31 +571,31 @@ def ask_agent():
         # 6. 构建带上下文的问题
         if context:
             enhanced_question = f"\n[CONTEXT]\n{context}\n\n[CURRENT]\n{question}"
-            print(f"[AGENT_API] 使用上下文,长度: {len(context)}字符")
+            logger.info(f"[AGENT_API] 使用上下文,长度: {len(context)}字符")
         else:
             enhanced_question = question
-            print(f"[AGENT_API] 新对话,无上下文")
+            logger.info(f"[AGENT_API] 新对话,无上下文")
         
         # 7. 确定最终使用的路由模式(优先级逻辑)
         if api_routing_mode:
             # API传了参数,优先使用
             effective_routing_mode = api_routing_mode
-            print(f"[AGENT_API] 使用API指定的路由模式: {effective_routing_mode}")
+            logger.info(f"[AGENT_API] 使用API指定的路由模式: {effective_routing_mode}")
         else:
             # API没传参数,使用配置文件
             try:
                 from app_config import QUESTION_ROUTING_MODE
                 effective_routing_mode = QUESTION_ROUTING_MODE
-                print(f"[AGENT_API] 使用配置文件路由模式: {effective_routing_mode}")
+                logger.info(f"[AGENT_API] 使用配置文件路由模式: {effective_routing_mode}")
             except ImportError:
                 effective_routing_mode = "hybrid"
-                print(f"[AGENT_API] 配置文件读取失败,使用默认路由模式: {effective_routing_mode}")
+                logger.info(f"[AGENT_API] 配置文件读取失败,使用默认路由模式: {effective_routing_mode}")
         
         # 8. 现有Agent处理逻辑(修改为传递路由模式)
         try:
             agent = get_citu_langraph_agent()
         except Exception as e:
-            print(f"[CRITICAL] Agent初始化失败: {str(e)}")
+            logger.critical(f"Agent初始化失败: {str(e)}")
             return jsonify(service_unavailable_response(
                 response_text="AI服务暂时不可用,请稍后重试",
                 can_retry=True
@@ -687,7 +691,7 @@ def ask_agent():
             )), error_code
         
     except Exception as e:
-        print(f"[ERROR] ask_agent执行失败: {str(e)}")
+        logger.error(f"ask_agent执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="查询处理失败,请稍后重试"
         )), 500
@@ -757,7 +761,7 @@ def agent_health():
         
         # 检查3: LLM连接(简单测试)
         try:
-            from agent.utils import get_compatible_llm
+            from agent.tools.utils import get_compatible_llm
             llm = get_compatible_llm()
             health_data["checks"]["llm_connection"] = llm is not None
         except Exception as e:
@@ -784,9 +788,9 @@ def agent_health():
                 health_data["status"] = "degraded"
                 health_data["message"] = "部分组件异常"
         except Exception as e:
-            print(f"[ERROR] 健康检查异常: {str(e)}")
+            logger.error(f"健康检查异常: {str(e)}")
             import traceback
-            print(f"[ERROR] 详细健康检查错误: {traceback.format_exc()}")
+            logger.error(f"详细健康检查错误: {traceback.format_exc()}")
             health_data["status"] = "degraded"
             health_data["message"] = f"完整测试失败: {str(e)}"
         
@@ -803,9 +807,9 @@ def agent_health():
             return jsonify(health_error_response(**health_data)), 503
             
     except Exception as e:
-        print(f"[ERROR] 顶层健康检查异常: {str(e)}")
+        logger.error(f"顶层健康检查异常: {str(e)}")
         import traceback
-        print(f"[ERROR] 详细错误信息: {traceback.format_exc()}")
+        logger.error(f"详细错误信息: {traceback.format_exc()}")
         from common.result import internal_error_response
         return jsonify(internal_error_response(
             response_text="健康检查失败,请稍后重试"
@@ -1517,7 +1521,7 @@ def training_error_question_sql():
         question = data.get('question')
         sql = data.get('sql')
         
-        print(f"[DEBUG] 接收到错误SQL训练请求: question={question}, sql={sql}")
+        logger.debug(f"接收到错误SQL训练请求: question={question}, sql={sql}")
         
         if not question or not sql:
             from common.result import bad_request_response
@@ -1535,7 +1539,7 @@ def training_error_question_sql():
         # 使用vn实例的train_error_sql方法存储错误SQL
         id = vn.train_error_sql(question=question, sql=sql)
         
-        print(f"[INFO] 成功存储错误SQL,ID: {id}")
+        logger.info(f"成功存储错误SQL,ID: {id}")
         
         from common.result import success_response
         return jsonify(success_response(
@@ -1547,7 +1551,7 @@ def training_error_question_sql():
         ))
         
     except Exception as e:
-        print(f"[ERROR] 存储错误SQL失败: {str(e)}")
+        logger.error(f"存储错误SQL失败: {str(e)}")
         from common.result import internal_error_response
         return jsonify(internal_error_response(
             response_text="存储错误SQL失败,请稍后重试"
@@ -1593,7 +1597,7 @@ def get_user_conversations(user_id: str):
                     conversation['conversation_title'] = "空对话"
                     
             except Exception as e:
-                print(f"[WARNING] 获取对话标题失败 {conversation_id}: {str(e)}")
+                logger.warning(f"获取对话标题失败 {conversation_id}: {str(e)}")
                 conversation['conversation_title'] = "对话"
         
         return jsonify(success_response(
@@ -1747,7 +1751,7 @@ def get_user_conversations_with_messages(user_id: str):
         ))
         
     except Exception as e:
-        print(f"[ERROR] 获取用户完整对话数据失败: {str(e)}")
+        logger.error(f"获取用户完整对话数据失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="获取用户对话数据失败,请稍后重试"
         )), 500
@@ -1770,7 +1774,7 @@ def embedding_cache_stats():
         ))
         
     except Exception as e:
-        print(f"[ERROR] 获取embedding缓存统计失败: {str(e)}")
+        logger.error(f"获取embedding缓存统计失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="获取embedding缓存统计失败,请稍后重试"
         )), 500
@@ -1801,7 +1805,7 @@ def embedding_cache_cleanup():
             )), 500
         
     except Exception as e:
-        print(f"[ERROR] 清空embedding缓存失败: {str(e)}")
+        logger.error(f"清空embedding缓存失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="清空embedding缓存失败,请稍后重试"
         )), 500
@@ -1827,15 +1831,15 @@ def get_qa_feedback_manager():
                 elif 'vn' in globals():
                     vanna_instance = vn
                 else:
-                    print("[INFO] 未找到可用的vanna实例,将创建新的数据库连接")
+                    logger.info("未找到可用的vanna实例,将创建新的数据库连接")
             except Exception as e:
-                print(f"[INFO] 获取vanna实例失败: {e},将创建新的数据库连接")
+                logger.info(f"获取vanna实例失败: {e},将创建新的数据库连接")
                 vanna_instance = None
             
             qa_feedback_manager = QAFeedbackManager(vanna_instance=vanna_instance)
-            print("[CITU_APP] QA反馈管理器实例创建成功")
+            logger.info("QA反馈管理器实例创建成功")
         except Exception as e:
-            print(f"[CRITICAL] QA反馈管理器创建失败: {str(e)}")
+            logger.critical(f"QA反馈管理器创建失败: {str(e)}")
             raise Exception(f"QA反馈管理器初始化失败: {str(e)}")
     return qa_feedback_manager
 
@@ -1904,7 +1908,7 @@ def qa_feedback_query():
         ))
         
     except Exception as e:
-        print(f"[ERROR] qa_feedback_query执行失败: {str(e)}")
+        logger.error(f"qa_feedback_query执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="查询反馈记录失败,请稍后重试"
         )), 500
@@ -1929,7 +1933,7 @@ def qa_feedback_delete(feedback_id):
             )), 404
             
     except Exception as e:
-        print(f"[ERROR] qa_feedback_delete执行失败: {str(e)}")
+        logger.error(f"qa_feedback_delete执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="删除反馈记录失败,请稍后重试"
         )), 500
@@ -1973,7 +1977,7 @@ def qa_feedback_update(feedback_id):
             )), 404
             
     except Exception as e:
-        print(f"[ERROR] qa_feedback_update执行失败: {str(e)}")
+        logger.error(f"qa_feedback_update执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="更新反馈记录失败,请稍后重试"
         )), 500
@@ -2026,7 +2030,7 @@ def qa_feedback_add_to_training():
                         sql=record['sql']
                     )
                     positive_count += 1
-                    print(f"[TRAINING] 正向训练成功 - ID: {record['id']}, TrainingID: {training_id}")
+                    logger.info(f"正向训练成功 - ID: {record['id']}, TrainingID: {training_id}")
                 else:
                     # 负向反馈 - 加入错误SQL训练集
                     training_id = vn.train_error_sql(
@@ -2034,18 +2038,18 @@ def qa_feedback_add_to_training():
                         sql=record['sql']
                     )
                     negative_count += 1
-                    print(f"[TRAINING] 负向训练成功 - ID: {record['id']}, TrainingID: {training_id}")
+                    logger.info(f"负向训练成功 - ID: {record['id']}, TrainingID: {training_id}")
                 
                 successfully_trained_ids.append(record['id'])
                 
             except Exception as e:
-                print(f"[ERROR] 训练失败 - 反馈ID: {record['id']}, 错误: {e}")
+                logger.error(f"训练失败 - 反馈ID: {record['id']}, 错误: {e}")
                 error_count += 1
         
         # 更新训练状态
         if successfully_trained_ids:
             updated_count = manager.mark_training_status(successfully_trained_ids, True)
-            print(f"[TRAINING] 批量更新训练状态完成,影响 {updated_count} 条记录")
+            logger.info(f"批量更新训练状态完成,影响 {updated_count} 条记录")
         
         # 构建响应
         total_processed = positive_count + negative_count + already_trained_count + error_count
@@ -2070,7 +2074,7 @@ def qa_feedback_add_to_training():
         ))
         
     except Exception as e:
-        print(f"[ERROR] qa_feedback_add_to_training执行失败: {str(e)}")
+        logger.error(f"qa_feedback_add_to_training执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="添加训练数据失败,请稍后重试"
         )), 500
@@ -2123,7 +2127,7 @@ def qa_feedback_add():
         ))
         
     except Exception as e:
-        print(f"[ERROR] qa_feedback_add执行失败: {str(e)}")
+        logger.error(f"qa_feedback_add执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="创建反馈记录失败,请稍后重试"
         )), 500
@@ -2158,7 +2162,7 @@ def qa_feedback_stats():
         ))
         
     except Exception as e:
-        print(f"[ERROR] qa_feedback_stats执行失败: {str(e)}")
+        logger.error(f"qa_feedback_stats执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="获取统计信息失败,请稍后重试"
         )), 500
@@ -2178,7 +2182,7 @@ def qa_cache_stats():
         ))
         
     except Exception as e:
-        print(f"[ERROR] 获取问答缓存统计失败: {str(e)}")
+        logger.error(f"获取问答缓存统计失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="获取问答缓存统计失败,请稍后重试"
         )), 500
@@ -2209,7 +2213,7 @@ def qa_cache_list():
         ))
         
     except Exception as e:
-        print(f"[ERROR] 获取问答缓存列表失败: {str(e)}")
+        logger.error(f"获取问答缓存列表失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="获取问答缓存列表失败,请稍后重试"
         )), 500
@@ -2235,7 +2239,7 @@ def qa_cache_cleanup():
         ))
         
     except Exception as e:
-        print(f"[ERROR] 清空问答缓存失败: {str(e)}")
+        logger.error(f"清空问答缓存失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="清空问答缓存失败,请稍后重试"
         )), 500
@@ -2367,7 +2371,7 @@ def get_total_training_count():
             return len(training_data)
         return 0
     except Exception as e:
-        print(f"[WARNING] 获取训练数据总数失败: {e}")
+        logger.warning(f"获取训练数据总数失败: {e}")
         return 0
 
 @app.flask_app.route('/api/v0/training_data/query', methods=['POST'])
@@ -2460,7 +2464,7 @@ def training_data_query():
         ))
         
     except Exception as e:
-        print(f"[ERROR] training_data_query执行失败: {str(e)}")
+        logger.error(f"training_data_query执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="查询训练数据失败,请稍后重试"
         )), 500
@@ -2520,20 +2524,51 @@ def training_data_create():
         # 获取创建后的总记录数
         current_total = get_total_training_count()
         
-        return jsonify(success_response(
-            response_text="训练数据创建完成",
-            data={
-                "total_requested": len(data_list),
-                "successfully_created": successful_count,
-                "failed_count": len(data_list) - successful_count,
-                "results": results,
-                "summary": type_summary,
-                "current_total_count": current_total
-            }
-        ))
+        # 根据实际执行结果决定响应状态
+        failed_count = len(data_list) - successful_count
+        
+        if failed_count == 0:
+            # 全部成功
+            return jsonify(success_response(
+                response_text="训练数据创建完成",
+                data={
+                    "total_requested": len(data_list),
+                    "successfully_created": successful_count,
+                    "failed_count": failed_count,
+                    "results": results,
+                    "summary": type_summary,
+                    "current_total_count": current_total
+                }
+            ))
+        elif successful_count == 0:
+            # 全部失败
+            return jsonify(error_response(
+                response_text="训练数据创建失败",
+                data={
+                    "total_requested": len(data_list),
+                    "successfully_created": successful_count,
+                    "failed_count": failed_count,
+                    "results": results,
+                    "summary": type_summary,
+                    "current_total_count": current_total
+                }
+            )), 400
+        else:
+            # 部分成功,部分失败
+            return jsonify(error_response(
+                response_text=f"训练数据创建部分成功,成功{successful_count}条,失败{failed_count}条",
+                data={
+                    "total_requested": len(data_list),
+                    "successfully_created": successful_count,
+                    "failed_count": failed_count,
+                    "results": results,
+                    "summary": type_summary,
+                    "current_total_count": current_total
+                }
+            )), 207
         
     except Exception as e:
-        print(f"[ERROR] training_data_create执行失败: {str(e)}")
+        logger.error(f"training_data_create执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="创建训练数据失败,请稍后重试"
         )), 500
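补充说明:改造后该接口用 200/207/400 区分全部成功、部分成功与全部失败,调用方需要分别处理这三种状态码。一个假设性的调用示例如下(请求体结构未在本节摘录中给出,此处以 payload 占位):

```python
# 示意代码:按状态码区分训练数据创建结果(payload 结构为占位假设)
import requests

payload = {}  # 实际字段请参考该接口的请求说明
resp = requests.post("http://localhost:8084/api/v0/training_data/create", json=payload)

if resp.status_code == 200:
    print("全部创建成功")
elif resp.status_code == 207:
    print("部分成功,失败条数:", resp.json()["data"]["failed_count"])
else:  # 400 全部失败,500 服务异常
    print("创建失败:", resp.json().get("message"))
```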
@@ -2591,21 +2626,54 @@ def training_data_delete():
         # 获取删除后的总记录数
         current_total = get_total_training_count()
         
-        return jsonify(success_response(
-            response_text="训练数据删除完成",
-            data={
-                "total_requested": len(ids),
-                "successfully_deleted": len(deleted_ids),
-                "failed_count": len(failed_ids),
-                "deleted_ids": deleted_ids,
-                "failed_ids": failed_ids,
-                "failed_details": failed_details,
-                "current_total_count": current_total
-            }
-        ))
+        # 根据实际执行结果决定响应状态
+        failed_count = len(failed_ids)
+        
+        if failed_count == 0:
+            # 全部成功
+            return jsonify(success_response(
+                response_text="训练数据删除完成",
+                data={
+                    "total_requested": len(ids),
+                    "successfully_deleted": len(deleted_ids),
+                    "failed_count": failed_count,
+                    "deleted_ids": deleted_ids,
+                    "failed_ids": failed_ids,
+                    "failed_details": failed_details,
+                    "current_total_count": current_total
+                }
+            ))
+        elif len(deleted_ids) == 0:
+            # 全部失败
+            return jsonify(error_response(
+                response_text="训练数据删除失败",
+                data={
+                    "total_requested": len(ids),
+                    "successfully_deleted": len(deleted_ids),
+                    "failed_count": failed_count,
+                    "deleted_ids": deleted_ids,
+                    "failed_ids": failed_ids,
+                    "failed_details": failed_details,
+                    "current_total_count": current_total
+                }
+            )), 400
+        else:
+            # 部分成功,部分失败
+            return jsonify(error_response(
+                response_text=f"训练数据删除部分成功,成功{len(deleted_ids)}条,失败{failed_count}条",
+                data={
+                    "total_requested": len(ids),
+                    "successfully_deleted": len(deleted_ids),
+                    "failed_count": failed_count,
+                    "deleted_ids": deleted_ids,
+                    "failed_ids": failed_ids,
+                    "failed_details": failed_details,
+                    "current_total_count": current_total
+                }
+            )), 207
         
     except Exception as e:
-        print(f"[ERROR] training_data_delete执行失败: {str(e)}")
+        logger.error(f"training_data_delete执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="删除训练数据失败,请稍后重试"
         )), 500
@@ -2666,7 +2734,7 @@ def training_data_stats():
         ))
         
     except Exception as e:
-        print(f"[ERROR] training_data_stats执行失败: {str(e)}")
+        logger.error(f"training_data_stats执行失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="获取统计信息失败,请稍后重试"
         )), 500
@@ -2702,7 +2770,7 @@ def cache_overview_full():
         ))
         
     except Exception as e:
-        print(f"[ERROR] 获取综合缓存概览失败: {str(e)}")
+        logger.error(f"获取综合缓存概览失败: {str(e)}")
         return jsonify(internal_error_response(
             response_text="获取缓存概览失败,请稍后重试"
         )), 500
@@ -2748,5 +2816,1665 @@ const chatSession = new ChatSession();
 chatSession.askQuestion("各年龄段客户的流失率如何?");
 """
 
-print("正在启动Flask应用: http://localhost:8084")
-app.run(host="0.0.0.0", port=8084, debug=True)
+# ==================== Data Pipeline API ====================
+
+# 导入简化的Data Pipeline模块
+import asyncio
+import os
+from threading import Thread
+from flask import send_file
+
+from data_pipeline.api.simple_workflow import SimpleWorkflowManager
+from data_pipeline.api.simple_file_manager import SimpleFileManager
+
+# 创建简化的管理器
+data_pipeline_manager = None
+data_pipeline_file_manager = None
+
+def get_data_pipeline_manager():
+    """获取Data Pipeline管理器单例"""
+    global data_pipeline_manager
+    if data_pipeline_manager is None:
+        data_pipeline_manager = SimpleWorkflowManager()
+    return data_pipeline_manager
+
+def get_data_pipeline_file_manager():
+    """获取Data Pipeline文件管理器单例"""
+    global data_pipeline_file_manager
+    if data_pipeline_file_manager is None:
+        data_pipeline_file_manager = SimpleFileManager()
+    return data_pipeline_file_manager
+
+# ==================== 简化的Data Pipeline API端点 ====================
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks', methods=['POST'])
+def create_data_pipeline_task():
+    """创建数据管道任务"""
+    try:
+        req = request.get_json(force=True)
+        
+        # table_list_file和business_context现在都是可选参数
+        # 如果未提供table_list_file,将使用文件上传模式
+        
+        # 创建任务(支持可选的db_connection参数)
+        manager = get_data_pipeline_manager()
+        task_id = manager.create_task(
+            table_list_file=req.get('table_list_file'),
+            business_context=req.get('business_context'),
+            db_name=req.get('db_name'),  # 可选参数,用于指定特定数据库名称
+            db_connection=req.get('db_connection'),  # 可选参数,用于指定数据库连接字符串
+            task_name=req.get('task_name'),  # 可选参数,用于指定任务名称
+            enable_sql_validation=req.get('enable_sql_validation', True),
+            enable_llm_repair=req.get('enable_llm_repair', True),
+            modify_original_file=req.get('modify_original_file', True),
+            enable_training_data_load=req.get('enable_training_data_load', True)
+        )
+        
+        # 获取任务信息
+        task_info = manager.get_task_status(task_id)
+        
+        response_data = {
+            "task_id": task_id,
+            "task_name": task_info.get('task_name'),
+            "status": task_info.get('status'),
+            "created_at": task_info.get('created_at').isoformat() if task_info.get('created_at') else None
+        }
+        
+        # 检查是否为文件上传模式
+        file_upload_mode = not req.get('table_list_file')
+        response_message = "任务创建成功"
+        
+        if file_upload_mode:
+            response_data["file_upload_mode"] = True
+            response_data["next_step"] = f"POST /api/v0/data_pipeline/tasks/{task_id}/upload-table-list"
+            response_message += ",请上传表清单文件后再执行任务"
+        
+        return jsonify(success_response(
+            response_text=response_message,
+            data=response_data
+        )), 201
+        
+    except Exception as e:
+        logger.error(f"创建数据管道任务失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="创建任务失败,请稍后重试"
+        )), 500
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/execute', methods=['POST'])
+def execute_data_pipeline_task(task_id):
+    """执行数据管道任务"""
+    try:
+        req = request.get_json(force=True) if request.is_json else {}
+        execution_mode = req.get('execution_mode', 'complete')
+        step_name = req.get('step_name')
+        
+        # 验证执行模式
+        if execution_mode not in ['complete', 'step']:
+            return jsonify(bad_request_response(
+                response_text="无效的执行模式,必须是 'complete' 或 'step'",
+                invalid_params=['execution_mode']
+            )), 400
+        
+        # 如果是步骤执行模式,验证步骤名称
+        if execution_mode == 'step':
+            if not step_name:
+                return jsonify(bad_request_response(
+                    response_text="步骤执行模式需要指定step_name",
+                    missing_params=['step_name']
+                )), 400
+            
+            valid_steps = ['ddl_generation', 'qa_generation', 'sql_validation', 'training_load']
+            if step_name not in valid_steps:
+                return jsonify(bad_request_response(
+                    response_text=f"无效的步骤名称,支持的步骤: {', '.join(valid_steps)}",
+                    invalid_params=['step_name']
+                )), 400
+        
+        # 检查任务是否存在
+        manager = get_data_pipeline_manager()
+        task_info = manager.get_task_status(task_id)
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 使用subprocess启动独立进程执行任务
+        def run_task_subprocess():
+            try:
+                import subprocess
+                import sys
+                from pathlib import Path
+                
+                # 构建执行命令
+                python_executable = sys.executable
+                script_path = Path(__file__).parent / "data_pipeline" / "task_executor.py"
+                
+                cmd = [
+                    python_executable,
+                    str(script_path),
+                    "--task-id", task_id,
+                    "--execution-mode", execution_mode
+                ]
+                
+                if step_name:
+                    cmd.extend(["--step-name", step_name])
+                
+                logger.info(f"启动任务进程: {' '.join(cmd)}")
+                
+                # 启动后台进程(不等待完成)
+                process = subprocess.Popen(
+                    cmd,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    text=True,
+                    cwd=Path(__file__).parent
+                )
+                
+                logger.info(f"任务进程已启动: PID={process.pid}, task_id={task_id}")
+                
+            except Exception as e:
+                logger.error(f"启动任务进程失败: {task_id}, 错误: {str(e)}")
+        
+        # 在新线程中启动subprocess(避免阻塞API响应)
+        thread = Thread(target=run_task_subprocess, daemon=True)
+        thread.start()
+        
+        response_data = {
+            "task_id": task_id,
+            "execution_mode": execution_mode,
+            "step_name": step_name if execution_mode == 'step' else None,
+            "message": "任务正在后台执行,请通过状态接口查询进度"
+        }
+        
+        return jsonify(success_response(
+            response_text="任务执行已启动",
+            data=response_data
+        )), 202
+        
+    except Exception as e:
+        logger.error(f"启动数据管道任务执行失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="启动任务执行失败,请稍后重试"
+        )), 500
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>', methods=['GET'])
+def get_data_pipeline_task_status(task_id):
+    """
+    获取数据管道任务状态
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "获取任务状态成功",
+        "data": {
+            "task_id": "task_20250627_143052",
+            "status": "in_progress",
+            "step_status": {
+                "ddl_generation": "completed",
+                "qa_generation": "running",
+                "sql_validation": "pending",
+                "training_load": "pending"
+            },
+            "created_at": "2025-06-27T14:30:52",
+            "started_at": "2025-06-27T14:31:00",
+            "parameters": {...},
+            "current_execution": {...},
+            "total_executions": 2
+        }
+    }
+    """
+    try:
+        manager = get_data_pipeline_manager()
+        task_info = manager.get_task_status(task_id)
+        
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 获取步骤状态
+        steps = manager.get_task_steps(task_id)
+        current_step = None
+        for step in steps:
+            if step['step_status'] == 'running':
+                current_step = step
+                break
+        
+        # 构建步骤状态摘要
+        step_status_summary = {}
+        for step in steps:
+            step_status_summary[step['step_name']] = step['step_status']
+        
+        response_data = {
+            "task_id": task_info['task_id'],
+            "task_name": task_info.get('task_name'),
+            "status": task_info['status'],
+            "step_status": step_status_summary,
+            "created_at": task_info['created_at'].isoformat() if task_info.get('created_at') else None,
+            "started_at": task_info['started_at'].isoformat() if task_info.get('started_at') else None,
+            "completed_at": task_info['completed_at'].isoformat() if task_info.get('completed_at') else None,
+            "parameters": task_info.get('parameters', {}),
+            "result": task_info.get('result'),
+            "error_message": task_info.get('error_message'),
+            "current_step": {
+                "execution_id": current_step['execution_id'],
+                "step": current_step['step_name'],
+                "status": current_step['step_status'],
+                "started_at": current_step['started_at'].isoformat() if current_step and current_step.get('started_at') else None
+            } if current_step else None,
+            "total_steps": len(steps),
+            "steps": [{
+                "step_name": step['step_name'],
+                "step_status": step['step_status'],
+                "started_at": step['started_at'].isoformat() if step.get('started_at') else None,
+                "completed_at": step['completed_at'].isoformat() if step.get('completed_at') else None,
+                "error_message": step.get('error_message')
+            } for step in steps]
+        }
+        
+        return jsonify(success_response(
+            response_text="获取任务状态成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"获取数据管道任务状态失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="获取任务状态失败,请稍后重试"
+        )), 500
+
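补充说明:结合上面的创建、执行与状态查询三个端点,一个假设的最简客户端流程如下(地址沿用使用指南中的 8084 端口;终止状态名称为推测,非本节摘录所确认):

```python
# 示意代码:创建任务 -> 执行 -> 轮询状态(参数与终止状态均为示例假设)
import time
import requests

BASE = "http://localhost:8084/api/v0/data_pipeline"

# 1. 创建任务(不传 table_list_file 时进入文件上传模式)
task = requests.post(f"{BASE}/tasks", json={
    "business_context": "高速公路服务区管理系统",
    "db_name": "highway_db",
}).json()["data"]
task_id = task["task_id"]

# 2. 启动完整执行,接口返回 202,任务在独立子进程中运行
requests.post(f"{BASE}/tasks/{task_id}/execute", json={"execution_mode": "complete"})

# 3. 轮询任务状态;"completed" / "failed" 为假设的终止状态
while True:
    status = requests.get(f"{BASE}/tasks/{task_id}").json()["data"]
    print(status["status"], status["step_status"])
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(10)
```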
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/logs', methods=['GET'])
+def get_data_pipeline_task_logs(task_id):
+    """
+    获取数据管道任务日志(从任务目录文件读取)
+    
+    查询参数:
+    - limit: 日志行数限制,默认100
+    - level: 日志级别过滤,可选
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "获取任务日志成功",
+        "data": {
+            "task_id": "task_20250627_143052",
+            "logs": [
+                {
+                    "timestamp": "2025-06-27 14:30:52",
+                    "level": "INFO",
+                    "message": "任务开始执行"
+                }
+            ],
+            "total": 15,
+            "source": "file"
+        }
+    }
+    """
+    try:
+        limit = request.args.get('limit', 100, type=int)
+        level = request.args.get('level')
+        
+        # 限制最大查询数量
+        limit = min(limit, 1000)
+        
+        manager = get_data_pipeline_manager()
+        
+        # 验证任务是否存在
+        task_info = manager.get_task_status(task_id)
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 获取任务目录下的日志文件
+        from pathlib import Path
+        
+        # 获取项目根目录的绝对路径
+        project_root = Path(__file__).parent.absolute()
+        task_dir = project_root / "data_pipeline" / "training_data" / task_id
+        log_file = task_dir / "data_pipeline.log"
+        
+        logs = []
+        if log_file.exists():
+            try:
+                # 读取日志文件的最后N行
+                with open(log_file, 'r', encoding='utf-8') as f:
+                    lines = f.readlines()
+                    
+                # 取最后limit行
+                recent_lines = lines[-limit:] if len(lines) > limit else lines
+                
+                # 解析日志行
+                import re
+                log_pattern = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+?): (.+)$'
+                
+                for line in recent_lines:
+                    line = line.strip()
+                    if not line:
+                        continue
+                        
+                    match = re.match(log_pattern, line)
+                    if match:
+                        timestamp, log_level, logger_name, message = match.groups()
+                        
+                        # 级别过滤
+                        if level and log_level != level.upper():
+                            continue
+                            
+                        logs.append({
+                            "timestamp": timestamp,
+                            "level": log_level,
+                            "logger": logger_name,
+                            "message": message
+                        })
+                    else:
+                        # 处理多行日志(如异常堆栈)
+                        if logs:
+                            logs[-1]["message"] += f"\n{line}"
+                        
+            except Exception as e:
+                logger.error(f"读取日志文件失败: {e}")
+        
+        response_data = {
+            "task_id": task_id,
+            "logs": logs,
+            "total": len(logs),
+            "source": "file",
+            "log_file": str(log_file) if log_file.exists() else None
+        }
+        
+        return jsonify(success_response(
+            response_text="获取任务日志成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"获取数据管道任务日志失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="获取任务日志失败,请稍后重试"
+        )), 500
+
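A hedged usage sketch for the file-based log endpoint above, assuming the service runs locally on port 8084 as in the __main__ block and that requests is installed; limit and level map to the query parameters documented in the docstring (limit is capped server-side at 1000).

import requests

resp = requests.get(
    "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/logs",
    params={"limit": 200, "level": "ERROR"},
)
for entry in resp.json()["data"]["logs"]:
    print(entry["timestamp"], entry["level"], entry["message"])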
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks', methods=['GET'])
+def list_data_pipeline_tasks():
+    """获取数据管道任务列表"""
+    try:
+        limit = request.args.get('limit', 50, type=int)
+        offset = request.args.get('offset', 0, type=int)
+        status_filter = request.args.get('status')
+        
+        # 限制查询数量
+        limit = min(limit, 100)
+        
+        manager = get_data_pipeline_manager()
+        tasks = manager.get_tasks_list(
+            limit=limit,
+            offset=offset,
+            status_filter=status_filter
+        )
+        
+        # 格式化任务列表
+        formatted_tasks = []
+        for task in tasks:
+            formatted_tasks.append({
+                "task_id": task.get('task_id'),
+                "task_name": task.get('task_name'),
+                "status": task.get('status'),
+                "step_status": task.get('step_status'),
+                "created_at": task['created_at'].isoformat() if task.get('created_at') else None,
+                "started_at": task['started_at'].isoformat() if task.get('started_at') else None,
+                "completed_at": task['completed_at'].isoformat() if task.get('completed_at') else None,
+                "created_by": task.get('by_user'),
+                "db_name": task.get('db_name'),
+                "business_context": task.get('parameters', {}).get('business_context') if task.get('parameters') else None,
+                # 新增字段
+                "directory_exists": task.get('directory_exists', True),  # 默认为True,兼容旧数据
+                "updated_at": task['updated_at'].isoformat() if task.get('updated_at') else None
+            })
+        
+        response_data = {
+            "tasks": formatted_tasks,
+            "total": len(formatted_tasks),
+            "limit": limit,
+            "offset": offset
+        }
+        
+        return jsonify(success_response(
+            response_text="获取任务列表成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"获取数据管道任务列表失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="获取任务列表失败,请稍后重试"
+        )), 500
+
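A small sketch of the simple task-list endpoint above, under the same local-service assumptions; limit is capped server-side at 100.

import requests

resp = requests.get(
    "http://localhost:8084/api/v0/data_pipeline/tasks",
    params={"limit": 20, "offset": 0, "status": "completed"},
)
print(resp.json()["data"]["total"])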
+@app.flask_app.route('/api/v0/data_pipeline/tasks/query', methods=['POST'])
+def query_data_pipeline_tasks():
+    """
+    高级查询数据管道任务列表
+    
+    支持复杂筛选、排序、分页功能
+    
+    请求体:
+    {
+        "page": 1,                          // 页码,必须大于0,默认1
+        "page_size": 20,                    // 每页大小,1-100之间,默认20
+        "status": "completed",              // 可选,任务状态筛选:"pending"|"running"|"completed"|"failed"|"cancelled"
+        "task_name": "highway",             // 可选,任务名称模糊搜索,最大100字符
+        "created_by": "user123",            // 可选,创建者精确匹配
+        "db_name": "highway_db",            // 可选,数据库名称精确匹配
+        "created_time_start": "2025-01-01T00:00:00",  // 可选,创建时间范围开始
+        "created_time_end": "2025-12-31T23:59:59",    // 可选,创建时间范围结束
+        "started_time_start": "2025-01-01T00:00:00",  // 可选,开始时间范围开始
+        "started_time_end": "2025-12-31T23:59:59",    // 可选,开始时间范围结束
+        "completed_time_start": "2025-01-01T00:00:00", // 可选,完成时间范围开始
+        "completed_time_end": "2025-12-31T23:59:59",   // 可选,完成时间范围结束
+        "sort_by": "created_at",            // 可选,排序字段:"created_at"|"started_at"|"completed_at"|"task_name"|"status",默认"created_at"
+        "sort_order": "desc"                // 可选,排序方向:"asc"|"desc",默认"desc"
+    }
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "查询任务列表成功",
+        "data": {
+            "tasks": [...],
+            "pagination": {
+                "page": 1,
+                "page_size": 20,
+                "total": 150,
+                "total_pages": 8,
+                "has_next": true,
+                "has_prev": false
+            },
+            "filters_applied": {...},
+            "sort_applied": {...},
+            "query_time": "0.045s"
+        }
+    }
+    """
+    try:
+        # 获取请求数据
+        req = request.get_json(force=True) if request.is_json else {}
+        
+        # 解析参数,设置默认值
+        page = req.get('page', 1)
+        page_size = req.get('page_size', 20)
+        status = req.get('status')
+        task_name = req.get('task_name')
+        created_by = req.get('created_by')
+        db_name = req.get('db_name')
+        created_time_start = req.get('created_time_start')
+        created_time_end = req.get('created_time_end')
+        started_time_start = req.get('started_time_start')
+        started_time_end = req.get('started_time_end')
+        completed_time_start = req.get('completed_time_start')
+        completed_time_end = req.get('completed_time_end')
+        sort_by = req.get('sort_by', 'created_at')
+        sort_order = req.get('sort_order', 'desc')
+        
+        # 参数验证
+        # 验证分页参数
+        if page < 1:
+            return jsonify(bad_request_response(
+                response_text="页码必须大于0",
+                invalid_params=['page']
+            )), 400
+        
+        if page_size < 1 or page_size > 100:
+            return jsonify(bad_request_response(
+                response_text="每页大小必须在1-100之间",
+                invalid_params=['page_size']
+            )), 400
+        
+        # 验证任务名称长度
+        if task_name and len(task_name) > 100:
+            return jsonify(bad_request_response(
+                response_text="任务名称搜索关键词最大长度为100字符",
+                invalid_params=['task_name']
+            )), 400
+        
+        # 验证排序参数
+        allowed_sort_fields = ['created_at', 'started_at', 'completed_at', 'task_name', 'status']
+        if sort_by not in allowed_sort_fields:
+            return jsonify(bad_request_response(
+                response_text=f"不支持的排序字段: {sort_by},支持的字段: {', '.join(allowed_sort_fields)}",
+                invalid_params=['sort_by']
+            )), 400
+        
+        if sort_order.lower() not in ['asc', 'desc']:
+            return jsonify(bad_request_response(
+                response_text="排序方向必须是 'asc' 或 'desc'",
+                invalid_params=['sort_order']
+            )), 400
+        
+        # 验证状态筛选
+        if status:
+            allowed_statuses = ['pending', 'running', 'completed', 'failed', 'cancelled']
+            if status not in allowed_statuses:
+                return jsonify(bad_request_response(
+                    response_text=f"不支持的状态值: {status},支持的状态: {', '.join(allowed_statuses)}",
+                    invalid_params=['status']
+                )), 400
+        
+        # 调用管理器执行查询
+        manager = get_data_pipeline_manager()
+        result = manager.query_tasks_advanced(
+            page=page,
+            page_size=page_size,
+            status=status,
+            task_name=task_name,
+            created_by=created_by,
+            db_name=db_name,
+            created_time_start=created_time_start,
+            created_time_end=created_time_end,
+            started_time_start=started_time_start,
+            started_time_end=started_time_end,
+            completed_time_start=completed_time_start,
+            completed_time_end=completed_time_end,
+            sort_by=sort_by,
+            sort_order=sort_order
+        )
+        
+        # 格式化任务列表
+        formatted_tasks = []
+        for task in result['tasks']:
+            formatted_tasks.append({
+                "task_id": task.get('task_id'),
+                "task_name": task.get('task_name'),
+                "status": task.get('status'),
+                "step_status": task.get('step_status'),
+                "created_at": task['created_at'].isoformat() if task.get('created_at') else None,
+                "started_at": task['started_at'].isoformat() if task.get('started_at') else None,
+                "completed_at": task['completed_at'].isoformat() if task.get('completed_at') else None,
+                "created_by": task.get('by_user'),
+                "db_name": task.get('db_name'),
+                "business_context": task.get('parameters', {}).get('business_context') if task.get('parameters') else None,
+                "directory_exists": task.get('directory_exists', True),
+                "updated_at": task['updated_at'].isoformat() if task.get('updated_at') else None
+            })
+        
+        # 构建响应数据
+        response_data = {
+            "tasks": formatted_tasks,
+            "pagination": result['pagination'],
+            "filters_applied": {
+                k: v for k, v in {
+                    "status": status,
+                    "task_name": task_name,
+                    "created_by": created_by,
+                    "db_name": db_name,
+                    "created_time_start": created_time_start,
+                    "created_time_end": created_time_end,
+                    "started_time_start": started_time_start,
+                    "started_time_end": started_time_end,
+                    "completed_time_start": completed_time_start,
+                    "completed_time_end": completed_time_end
+                }.items() if v
+            },
+            "sort_applied": {
+                "sort_by": sort_by,
+                "sort_order": sort_order
+            },
+            "query_time": result.get('query_time', '0.000s')
+        }
+        
+        return jsonify(success_response(
+            response_text="查询任务列表成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"查询数据管道任务列表失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="查询任务列表失败,请稍后重试"
+        )), 500
+
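A hedged client sketch for the advanced query endpoint above; the payload fields mirror the request-body documentation in the docstring, and the base URL is assumed as before.

import requests

payload = {
    "page": 1,
    "page_size": 20,
    "status": "completed",
    "task_name": "highway",      # fuzzy match, max 100 characters
    "sort_by": "created_at",
    "sort_order": "desc",
}
resp = requests.post("http://localhost:8084/api/v0/data_pipeline/tasks/query", json=payload)
print(resp.json()["data"]["pagination"])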
+# ==================== 表检查API端点 ====================
+
+import asyncio
+from data_pipeline.api.table_inspector_api import TableInspectorAPI
+
+@app.flask_app.route('/api/v0/database/tables', methods=['POST'])
+def get_database_tables():
+    """
+    获取数据库表列表
+    
+    请求体:
+    {
+        "db_connection": "postgresql://postgres:postgres@192.168.67.1:5432/highway_db",  // 可选,不传则使用默认配置
+        "schema": "public,ods",  // 可选,支持多个schema用逗号分隔,默认为public
+        "table_name_pattern": "ods_*"  // 可选,表名模式匹配,支持通配符:ods_*、*_dim、*fact*、ods_%
+    }
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "获取表列表成功",
+        "data": {
+            "tables": ["public.table1", "public.table2", "ods.table3"],
+            "total": 3,
+            "schemas": ["public", "ods"],
+            "table_name_pattern": "ods_*"
+        }
+    }
+    """
+    try:
+        req = request.get_json(force=True)
+        
+        # 处理数据库连接参数(可选)
+        db_connection = req.get('db_connection')
+        if not db_connection:
+            # 使用app_config的默认数据库配置
+            import app_config
+            db_params = app_config.APP_DB_CONFIG
+            db_connection = f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}"
+            logger.info("使用默认数据库配置获取表列表")
+        else:
+            logger.info("使用用户指定的数据库配置获取表列表")
+        
+        # 可选参数
+        schema = req.get('schema', '')
+        table_name_pattern = req.get('table_name_pattern')
+        
+        # 创建表检查API实例
+        table_inspector = TableInspectorAPI()
+        
+        # 使用asyncio运行异步方法
+        async def get_tables():
+            return await table_inspector.get_tables_list(db_connection, schema, table_name_pattern)
+        
+        # 在新的事件循环中运行异步方法
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            tables = loop.run_until_complete(get_tables())
+        finally:
+            loop.close()
+        
+        # 解析schema信息
+        parsed_schemas = table_inspector._parse_schemas(schema)
+        
+        response_data = {
+            "tables": tables,
+            "total": len(tables),
+            "schemas": parsed_schemas,
+            "db_connection_info": {
+                "database": db_connection.split('/')[-1].split('?')[0] if '/' in db_connection else "unknown"
+            }
+        }
+        
+        # 如果使用了表名模式,添加到响应中
+        if table_name_pattern:
+            response_data["table_name_pattern"] = table_name_pattern
+        
+        return jsonify(success_response(
+            response_text="获取表列表成功",
+            data=response_data
+        )), 200
+        
+    except Exception as e:
+        logger.error(f"获取数据库表列表失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text=f"获取表列表失败: {str(e)}"
+        )), 500
+
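A minimal sketch of the table-list endpoint above. Omitting db_connection falls back to the default APP_DB_CONFIG, as the handler shows; the local base URL and requests library are assumptions.

import requests

resp = requests.post(
    "http://localhost:8084/api/v0/database/tables",
    json={"schema": "public,ods", "table_name_pattern": "ods_*"},
)
print(resp.json()["data"]["tables"])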
+@app.flask_app.route('/api/v0/database/table/ddl', methods=['POST'])
+def get_table_ddl():
+    """
+    获取表的DDL语句或MD文档
+    
+    请求体:
+    {
+        "db_connection": "postgresql://postgres:postgres@192.168.67.1:5432/highway_db",  // 可选,不传则使用默认配置
+        "table": "public.test",
+        "business_context": "这是高速公路服务区的相关数据",  // 可选
+        "type": "ddl"  // 可选,支持ddl/md/both,默认为ddl
+    }
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "获取表DDL成功",
+        "data": {
+            "ddl": "create table public.test (...);",
+            "md": "## test表...",  // 仅当type为md或both时返回
+            "table_info": {
+                "table_name": "test",
+                "schema_name": "public",
+                "full_name": "public.test",
+                "comment": "测试表",
+                "field_count": 10,
+                "row_count": 1000
+            },
+            "fields": [...]
+        }
+    }
+    """
+    try:
+        req = request.get_json(force=True)
+        
+        # 处理参数(table仍为必需,db_connection可选)
+        table = req.get('table')
+        db_connection = req.get('db_connection')
+        
+        if not table:
+            return jsonify(bad_request_response(
+                response_text="缺少必需参数:table",
+                missing_params=['table']
+            )), 400
+        
+        if not db_connection:
+            # 使用app_config的默认数据库配置
+            import app_config
+            db_params = app_config.APP_DB_CONFIG
+            db_connection = f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}"
+            logger.info("使用默认数据库配置获取表DDL")
+        else:
+            logger.info("使用用户指定的数据库配置获取表DDL")
+        
+        # 可选参数
+        business_context = req.get('business_context', '')
+        output_type = req.get('type', 'ddl')
+        
+        # 验证type参数
+        valid_types = ['ddl', 'md', 'both']
+        if output_type not in valid_types:
+            return jsonify(bad_request_response(
+                response_text=f"无效的type参数: {output_type},支持的值: {valid_types}",
+                invalid_params=['type']
+            )), 400
+        
+        # 创建表检查API实例
+        table_inspector = TableInspectorAPI()
+        
+        # 使用asyncio运行异步方法
+        async def get_ddl():
+            return await table_inspector.get_table_ddl(
+                db_connection=db_connection,
+                table=table,
+                business_context=business_context,
+                output_type=output_type
+            )
+        
+        # 在新的事件循环中运行异步方法
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            result = loop.run_until_complete(get_ddl())
+        finally:
+            loop.close()
+        
+        response_data = {
+            **result,
+            "generation_info": {
+                "business_context": business_context,
+                "output_type": output_type,
+                "has_llm_comments": bool(business_context),
+                "database": db_connection.split('/')[-1].split('?')[0] if '/' in db_connection else "unknown"
+            }
+        }
+        
+        return jsonify(success_response(
+            response_text=f"获取表{output_type.upper()}成功",
+            data=response_data
+        )), 200
+        
+    except Exception as e:
+        logger.error(f"获取表DDL失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text=f"获取表{output_type.upper() if 'output_type' in locals() else 'DDL'}失败: {str(e)}"
+        )), 500
+
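A hedged sketch for the DDL/MD endpoint above, using the example values from its docstring; with type "both", both the ddl and md fields are returned.

import requests

resp = requests.post(
    "http://localhost:8084/api/v0/database/table/ddl",
    json={"table": "public.test", "type": "both", "business_context": "这是高速公路服务区的相关数据"},
)
data = resp.json()["data"]
print(data["ddl"])
print(data.get("md", ""))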
+# ==================== Data Pipeline 文件管理 API ====================
+
+from flask import send_file
+
+# 创建文件管理器
+data_pipeline_file_manager = None
+
+def get_data_pipeline_file_manager():
+    """获取Data Pipeline文件管理器单例"""
+    global data_pipeline_file_manager
+    if data_pipeline_file_manager is None:
+        data_pipeline_file_manager = SimpleFileManager()
+    return data_pipeline_file_manager
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/files', methods=['GET'])
+def get_data_pipeline_task_files(task_id):
+    """获取任务文件列表"""
+    try:
+        file_manager = get_data_pipeline_file_manager()
+        
+        # 获取任务文件
+        files = file_manager.get_task_files(task_id)
+        directory_info = file_manager.get_directory_info(task_id)
+        
+        # 格式化文件信息
+        formatted_files = []
+        for file_info in files:
+            formatted_files.append({
+                "file_name": file_info['file_name'],
+                "file_type": file_info['file_type'],
+                "file_size": file_info['file_size'],
+                "file_size_formatted": file_info['file_size_formatted'],
+                "created_at": file_info['created_at'].isoformat() if file_info.get('created_at') else None,
+                "modified_at": file_info['modified_at'].isoformat() if file_info.get('modified_at') else None,
+                "is_readable": file_info['is_readable']
+            })
+        
+        response_data = {
+            "task_id": task_id,
+            "files": formatted_files,
+            "directory_info": directory_info
+        }
+        
+        return jsonify(success_response(
+            response_text="获取任务文件列表成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"获取任务文件列表失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="获取任务文件列表失败,请稍后重试"
+        )), 500
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/files/<file_name>', methods=['GET'])
+def download_data_pipeline_task_file(task_id, file_name):
+    """下载任务文件"""
+    try:
+        logger.info(f"开始下载文件: task_id={task_id}, file_name={file_name}")
+        
+        # 直接构建文件路径,避免依赖数据库
+        from pathlib import Path
+        import os
+        
+        # 获取项目根目录的绝对路径
+        project_root = Path(__file__).parent.absolute()
+        task_dir = project_root / "data_pipeline" / "training_data" / task_id
+        file_path = task_dir / file_name
+        
+        logger.info(f"文件路径: {file_path}")
+        
+        # 检查文件是否存在
+        if not file_path.exists():
+            logger.warning(f"文件不存在: {file_path}")
+            return jsonify(not_found_response(
+                response_text=f"文件不存在: {file_name}"
+            )), 404
+        
+        # 检查是否为文件(而不是目录)
+        if not file_path.is_file():
+            logger.warning(f"路径不是文件: {file_path}")
+            return jsonify(bad_request_response(
+                response_text=f"路径不是有效文件: {file_name}"
+            )), 400
+        
+        # 安全检查:确保文件在允许的目录内
+        try:
+            file_path.resolve().relative_to(task_dir.resolve())
+        except ValueError:
+            logger.warning(f"文件路径不安全: {file_path}")
+            return jsonify(bad_request_response(
+                response_text="非法的文件路径"
+            )), 400
+        
+        # 检查文件是否可读
+        if not os.access(file_path, os.R_OK):
+            logger.warning(f"文件不可读: {file_path}")
+            return jsonify(bad_request_response(
+                response_text="文件不可读"
+            )), 400
+        
+        logger.info(f"开始发送文件: {file_path}")
+        return send_file(
+            file_path,
+            as_attachment=True,
+            download_name=file_name
+        )
+        
+    except Exception as e:
+        logger.error(f"下载任务文件失败: task_id={task_id}, file_name={file_name}, 错误: {str(e)}", exc_info=True)
+        return jsonify(internal_error_response(
+            response_text="下载文件失败,请稍后重试"
+        )), 500
+
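A minimal download sketch for the file endpoint above; send_file returns the raw bytes as an attachment, so the client just writes resp.content to disk. Base URL and task ID are assumed example values.

import requests

resp = requests.get(
    "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250701_123456/files/table_list.txt"
)
if resp.status_code == 200:
    with open("table_list.txt", "wb") as f:
        f.write(resp.content)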
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/upload-table-list', methods=['POST'])
+def upload_table_list_file(task_id):
+    """
+    上传表清单文件
+    
+    表单参数:
+    - file: 要上传的表清单文件(multipart/form-data)
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "表清单文件上传成功",
+        "data": {
+            "task_id": "task_20250701_123456",
+            "filename": "table_list.txt",
+            "file_size": 1024,
+            "file_size_formatted": "1.0 KB"
+        }
+    }
+    """
+    try:
+        # 验证任务是否存在
+        manager = get_data_pipeline_manager()
+        task_info = manager.get_task_status(task_id)
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 检查是否有文件上传
+        if 'file' not in request.files:
+            return jsonify(bad_request_response(
+                response_text="请选择要上传的表清单文件",
+                missing_params=['file']
+            )), 400
+        
+        file = request.files['file']
+        
+        # 验证文件名
+        if file.filename == '':
+            return jsonify(bad_request_response(
+                response_text="请选择有效的文件"
+            )), 400
+        
+        try:
+            # 使用文件管理器上传文件
+            file_manager = get_data_pipeline_file_manager()
+            result = file_manager.upload_table_list_file(task_id, file)
+            
+            response_data = {
+                "task_id": task_id,
+                "filename": result["filename"],
+                "file_size": result["file_size"],
+                "file_size_formatted": result["file_size_formatted"],
+                "upload_time": result["upload_time"].isoformat() if result.get("upload_time") else None
+            }
+            
+            return jsonify(success_response(
+                response_text="表清单文件上传成功",
+                data=response_data
+            )), 200
+            
+        except ValueError as e:
+            # 文件验证错误(如文件太大、空文件等)
+            return jsonify(bad_request_response(
+                response_text=str(e)
+            )), 400
+        except Exception as e:
+            logger.error(f"上传表清单文件失败: {str(e)}")
+            return jsonify(internal_error_response(
+                response_text="文件上传失败,请稍后重试"
+            )), 500
+        
+    except Exception as e:
+        logger.error(f"处理表清单文件上传请求失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="处理上传请求失败,请稍后重试"
+        )), 500
+
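A hedged multipart-upload sketch for the table-list upload endpoint above; the file goes in the "file" form field, as the handler expects.

import requests

with open("table_list.txt", "rb") as f:
    resp = requests.post(
        "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250701_123456/upload-table-list",
        files={"file": f},
    )
print(resp.json()["data"]["file_size_formatted"])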
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/table-list-info', methods=['GET'])
+def get_table_list_info(task_id):
+    """
+    获取任务的表清单文件信息
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "获取表清单文件信息成功",
+        "data": {
+            "task_id": "task_20250701_123456",
+            "has_file": true,
+            "filename": "table_list.txt",
+            "file_path": "./data_pipeline/training_data/task_20250701_123456/table_list.txt",
+            "file_size": 1024,
+            "file_size_formatted": "1.0 KB",
+            "uploaded_at": "2025-07-01T12:34:56",
+            "table_count": 5,
+            "is_readable": true
+        }
+    }
+    """
+    try:
+        file_manager = get_data_pipeline_file_manager()
+        
+        # 获取表清单文件信息
+        table_list_info = file_manager.get_table_list_file_info(task_id)
+        
+        response_data = {
+            "task_id": task_id,
+            "has_file": table_list_info.get("exists", False),
+            **table_list_info
+        }
+        
+        return jsonify(success_response(
+            response_text="获取表清单文件信息成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"获取表清单文件信息失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="获取表清单文件信息失败,请稍后重试"
+        )), 500
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/table-list', methods=['POST'])
+def create_table_list_from_names(task_id):
+    """
+    通过POST方式提交表名列表并创建table_list.txt文件
+    
+    请求体:
+    {
+        "tables": ["table1", "schema.table2", "table3"]
+    }
+    或者:
+    {
+        "tables": "table1,schema.table2,table3"
+    }
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "表清单已成功创建",
+        "data": {
+            "task_id": "task_20250701_123456",
+            "filename": "table_list.txt",
+            "table_count": 3,
+            "file_size": 45,
+            "file_size_formatted": "45 B",
+            "created_time": "2025-07-01T12:34:56"
+        }
+    }
+    """
+    try:
+        # 验证任务是否存在
+        manager = get_data_pipeline_manager()
+        task_info = manager.get_task_status(task_id)
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 获取请求数据
+        req = request.get_json(force=True)
+        tables_param = req.get('tables')
+        
+        if not tables_param:
+            return jsonify(bad_request_response(
+                response_text="缺少必需参数:tables",
+                missing_params=['tables']
+            )), 400
+        
+        # 处理不同格式的表名参数
+        try:
+            if isinstance(tables_param, str):
+                # 逗号分隔的字符串格式
+                table_names = [name.strip() for name in tables_param.split(',') if name.strip()]
+            elif isinstance(tables_param, list):
+                # 数组格式
+                table_names = [str(name).strip() for name in tables_param if str(name).strip()]
+            else:
+                return jsonify(bad_request_response(
+                    response_text="tables参数格式错误,应为字符串(逗号分隔)或数组"
+                )), 400
+            
+            if not table_names:
+                return jsonify(bad_request_response(
+                    response_text="表名列表不能为空"
+                )), 400
+                
+        except Exception as e:
+            return jsonify(bad_request_response(
+                response_text=f"解析tables参数失败: {str(e)}"
+            )), 400
+        
+        try:
+            # 使用文件管理器创建表清单文件
+            file_manager = get_data_pipeline_file_manager()
+            result = file_manager.create_table_list_from_names(task_id, table_names)
+            
+            response_data = {
+                "task_id": task_id,
+                "filename": result["filename"],
+                "table_count": result["table_count"],
+                "unique_table_count": result["unique_table_count"],
+                "file_size": result["file_size"],
+                "file_size_formatted": result["file_size_formatted"],
+                "created_time": result["created_time"].isoformat() if result.get("created_time") else None,
+                "original_count": len(table_names)
+            }
+            
+            return jsonify(success_response(
+                response_text=f"表清单已成功创建,包含 {result['table_count']} 个表",
+                data=response_data
+            )), 200
+            
+        except ValueError as e:
+            # 表名验证错误(如格式错误、数量限制等)
+            return jsonify(bad_request_response(
+                response_text=str(e)
+            )), 400
+        except Exception as e:
+            logger.error(f"创建表清单文件失败: {str(e)}")
+            return jsonify(internal_error_response(
+                response_text="创建表清单文件失败,请稍后重试"
+            )), 500
+        
+    except Exception as e:
+        logger.error(f"处理表清单创建请求失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="处理请求失败,请稍后重试"
+        )), 500
+
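A small sketch of the JSON variant above, which creates table_list.txt directly from a list of names; the table names are the placeholder examples from the docstring.

import requests

resp = requests.post(
    "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250701_123456/table-list",
    json={"tables": ["table1", "schema.table2", "table3"]},
)
print(resp.json()["data"]["table_count"])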
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/files', methods=['POST'])
+def upload_file_to_task(task_id):
+    """
+    上传文件到指定任务目录
+    
+    表单参数:
+    - file: 要上传的文件(multipart/form-data)
+    - overwrite_mode: 重名处理模式 (backup, replace, skip),默认为backup
+    
+    支持的文件类型:
+    - .ddl: DDL文件
+    - .md: Markdown文档
+    - .txt: 文本文件
+    - .json: JSON文件
+    - .sql: SQL文件
+    - .csv: CSV文件
+    
+    重名处理模式:
+    - backup: 备份原文件(默认)
+    - replace: 直接覆盖
+    - skip: 跳过上传
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "文件上传成功",
+        "data": {
+            "task_id": "task_20250701_123456",
+            "uploaded_file": {
+                "filename": "test.ddl",
+                "size": 1024,
+                "size_formatted": "1.0 KB",
+                "uploaded_at": "2025-07-01T12:34:56",
+                "overwrite_mode": "backup"
+            },
+            "backup_info": {  // 仅当overwrite_mode为backup且文件已存在时返回
+                "had_existing_file": true,
+                "backup_filename": "test.ddl_bak1",
+                "backup_version": 1,
+                "backup_created_at": "2025-07-01T12:34:56"
+            }
+        }
+    }
+    """
+    try:
+        # 验证任务是否存在
+        manager = get_data_pipeline_manager()
+        task_info = manager.get_task_status(task_id)
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 检查是否有文件上传
+        if 'file' not in request.files:
+            return jsonify(bad_request_response(
+                response_text="请选择要上传的文件",
+                missing_params=['file']
+            )), 400
+        
+        file = request.files['file']
+        
+        # 验证文件名
+        if file.filename == '':
+            return jsonify(bad_request_response(
+                response_text="请选择有效的文件"
+            )), 400
+        
+        # 获取重名处理模式
+        overwrite_mode = request.form.get('overwrite_mode', 'backup')
+        
+        # 验证重名处理模式
+        valid_modes = ['backup', 'replace', 'skip']
+        if overwrite_mode not in valid_modes:
+            return jsonify(bad_request_response(
+                response_text=f"无效的overwrite_mode参数: {overwrite_mode},支持的值: {valid_modes}",
+                invalid_params=['overwrite_mode']
+            )), 400
+        
+        try:
+            # 使用文件管理器上传文件
+            file_manager = get_data_pipeline_file_manager()
+            result = file_manager.upload_file_to_task(task_id, file, file.filename, overwrite_mode)
+            
+            # 检查是否跳过上传
+            if result.get('skipped'):
+                return jsonify(success_response(
+                    response_text=result.get('message', '文件已存在,跳过上传'),
+                    data=result
+                )), 200
+            
+            return jsonify(success_response(
+                response_text="文件上传成功",
+                data=result
+            )), 200
+            
+        except ValueError as e:
+            # 文件验证错误(如文件太大、空文件、不支持的类型等)
+            return jsonify(bad_request_response(
+                response_text=str(e)
+            )), 400
+        except Exception as e:
+            logger.error(f"上传文件失败: {str(e)}")
+            return jsonify(internal_error_response(
+                response_text="文件上传失败,请稍后重试"
+            )), 500
+        
+    except Exception as e:
+        logger.error(f"处理文件上传请求失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="处理上传请求失败,请稍后重试"
+        )), 500
+
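A hedged sketch of the generic file-upload endpoint above; overwrite_mode is an ordinary form field alongside the multipart file, with backup, replace, or skip as the documented values.

import requests

with open("test.ddl", "rb") as f:
    resp = requests.post(
        "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250701_123456/files",
        files={"file": f},
        data={"overwrite_mode": "backup"},  # backup | replace | skip
    )
print(resp.json()["data"])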
+# ==================== 任务目录删除API ====================
+
+import shutil
+from pathlib import Path
+from datetime import datetime
+import psycopg2
+from app_config import PGVECTOR_CONFIG
+
+def delete_task_directory_simple(task_id, delete_database_records=False):
+    """
+    简单的任务目录删除功能
+    - 删除 data_pipeline/training_data/{task_id} 目录
+    - 更新数据库中的 directory_exists 字段
+    - 可选:删除数据库记录
+    """
+    try:
+        # 1. 删除目录
+        project_root = Path(__file__).parent.absolute()
+        task_dir = project_root / "data_pipeline" / "training_data" / task_id
+        
+        deleted_files_count = 0
+        deleted_size = 0
+        
+        if task_dir.exists():
+            # 计算删除前的统计信息
+            for file_path in task_dir.rglob('*'):
+                if file_path.is_file():
+                    deleted_files_count += 1
+                    deleted_size += file_path.stat().st_size
+            
+            # 删除目录
+            shutil.rmtree(task_dir)
+            directory_deleted = True
+        else:
+            directory_deleted = False
+        
+        # 2. 更新数据库
+        database_records_deleted = False
+        
+        try:
+            conn = psycopg2.connect(**PGVECTOR_CONFIG)
+            cur = conn.cursor()
+            
+            if delete_database_records:
+                # 删除任务步骤记录
+                cur.execute("DELETE FROM data_pipeline_task_steps WHERE task_id = %s", (task_id,))
+                # 删除任务主记录
+                cur.execute("DELETE FROM data_pipeline_tasks WHERE task_id = %s", (task_id,))
+                database_records_deleted = True
+            else:
+                # 只更新目录状态
+                cur.execute("""
+                    UPDATE data_pipeline_tasks 
+                    SET directory_exists = FALSE, updated_at = CURRENT_TIMESTAMP 
+                    WHERE task_id = %s
+                """, (task_id,))
+            
+            conn.commit()
+            cur.close()
+            conn.close()
+            
+        except Exception as db_error:
+            logger.error(f"数据库操作失败: {db_error}")
+            # 数据库失败不影响文件删除的结果
+        
+        # 3. 格式化文件大小
+        def format_size(size_bytes):
+            if size_bytes < 1024:
+                return f"{size_bytes} B"
+            elif size_bytes < 1024**2:
+                return f"{size_bytes/1024:.1f} KB"
+            elif size_bytes < 1024**3:
+                return f"{size_bytes/(1024**2):.1f} MB"
+            else:
+                return f"{size_bytes/(1024**3):.1f} GB"
+        
+        return {
+            "success": True,
+            "task_id": task_id,
+            "directory_deleted": directory_deleted,
+            "database_records_deleted": database_records_deleted,
+            "deleted_files_count": deleted_files_count,
+            "deleted_size": format_size(deleted_size),
+            "deleted_at": datetime.now().isoformat()
+        }
+        
+    except Exception as e:
+        logger.error(f"删除任务目录失败: {task_id}, 错误: {str(e)}")
+        return {
+            "success": False,
+            "task_id": task_id,
+            "error": str(e),
+            "error_code": "DELETE_FAILED"
+        }
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks', methods=['DELETE'])
+def delete_tasks():
+    """删除任务目录(支持单个和批量)"""
+    try:
+        # 获取请求参数
+        req = request.get_json(force=True)
+        
+        # 验证必需参数
+        task_ids = req.get('task_ids')
+        confirm = req.get('confirm')
+        
+        if not task_ids:
+            return jsonify(bad_request_response(
+                response_text="缺少必需参数: task_ids",
+                missing_params=['task_ids']
+            )), 400
+        
+        if not confirm:
+            return jsonify(bad_request_response(
+                response_text="缺少必需参数: confirm",
+                missing_params=['confirm']
+            )), 400
+        
+        if confirm != True:
+            return jsonify(bad_request_response(
+                response_text="confirm参数必须为true以确认删除操作"
+            )), 400
+        
+        if not isinstance(task_ids, list) or len(task_ids) == 0:
+            return jsonify(bad_request_response(
+                response_text="task_ids必须是非空的任务ID列表"
+            )), 400
+        
+        # 获取可选参数
+        delete_database_records = req.get('delete_database_records', False)
+        continue_on_error = req.get('continue_on_error', True)
+        
+        # 执行批量删除操作
+        deleted_tasks = []
+        failed_tasks = []
+        total_size_freed = 0
+        
+        for task_id in task_ids:
+            result = delete_task_directory_simple(task_id, delete_database_records)
+            
+            if result["success"]:
+                deleted_tasks.append(result)
+                # 累计释放的空间大小(这里简化处理,实际应该解析size字符串)
+            else:
+                failed_tasks.append({
+                    "task_id": task_id,
+                    "error": result["error"],
+                    "error_code": result.get("error_code", "UNKNOWN")
+                })
+                
+                if not continue_on_error:
+                    break
+        
+        # 构建响应
+        summary = {
+            "total_requested": len(task_ids),
+            "successfully_deleted": len(deleted_tasks),
+            "failed": len(failed_tasks)
+        }
+        
+        batch_result = {
+            "deleted_tasks": deleted_tasks,
+            "failed_tasks": failed_tasks,
+            "summary": summary,
+            "deleted_at": datetime.now().isoformat()
+        }
+        
+        if len(task_ids) == 1:
+            # 单个删除
+            if summary["failed"] == 0:
+                message = "任务目录删除成功"
+            else:
+                message = "任务目录删除失败"
+        else:
+            # 批量删除
+            if summary["failed"] == 0:
+                message = "批量删除完成"
+            elif summary["successfully_deleted"] == 0:
+                message = "批量删除失败"
+            else:
+                message = "批量删除部分完成"
+        
+        return jsonify(success_response(
+            response_text=message,
+            data=batch_result
+        )), 200
+        
+    except Exception as e:
+        logger.error(f"删除任务失败: 错误: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="删除任务失败,请稍后重试"
+        )), 500
+
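A minimal client sketch for the delete endpoint above; it is a DELETE with a JSON body, and confirm must be literally true or the request is rejected. Base URL and task ID are assumed example values.

import requests

resp = requests.delete(
    "http://localhost:8084/api/v0/data_pipeline/tasks",
    json={
        "task_ids": ["task_20250701_123456"],
        "confirm": True,
        "delete_database_records": False,
        "continue_on_error": True,
    },
)
print(resp.json()["data"]["summary"])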
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/logs/query', methods=['POST'])
+def query_data_pipeline_task_logs(task_id):
+    """
+    高级查询数据管道任务日志
+    
+    支持复杂筛选、排序、分页功能
+    
+    请求体:
+    {
+        "page": 1,                          // 页码,必须大于0,默认1
+        "page_size": 50,                    // 每页大小,1-500之间,默认50
+        "level": "ERROR",                   // 可选,日志级别筛选:"DEBUG"|"INFO"|"WARNING"|"ERROR"|"CRITICAL"
+        "start_time": "2025-01-01 00:00:00", // 可选,开始时间范围 (YYYY-MM-DD HH:MM:SS)
+        "end_time": "2025-01-02 23:59:59",   // 可选,结束时间范围 (YYYY-MM-DD HH:MM:SS)
+        "keyword": "failed",                 // 可选,关键字搜索(消息内容模糊匹配)
+        "logger_name": "DDLGenerator",       // 可选,日志记录器名称精确匹配
+        "step_name": "ddl_generation",       // 可选,执行步骤名称精确匹配
+        "sort_by": "timestamp",              // 可选,排序字段:"timestamp"|"level"|"logger"|"step"|"line_number",默认"timestamp"
+        "sort_order": "desc"                 // 可选,排序方向:"asc"|"desc",默认"desc"
+    }
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "查询任务日志成功",
+        "data": {
+            "logs": [
+                {
+                    "timestamp": "2025-07-01 14:30:52",
+                    "level": "INFO",
+                    "logger": "SimpleWorkflowExecutor",
+                    "step": "ddl_generation",
+                    "message": "开始DDL生成",
+                    "line_number": 15
+                }
+            ],
+            "pagination": {
+                "page": 1,
+                "page_size": 50,
+                "total": 1000,
+                "total_pages": 20,
+                "has_next": true,
+                "has_prev": false
+            },
+            "log_file_info": {
+                "exists": true,
+                "file_path": "/path/to/log/file",
+                "file_size": 1024000,
+                "file_size_formatted": "1.0 MB",
+                "last_modified": "2025-07-01T14:30:52",
+                "total_lines": 5000
+            },
+            "query_time": "0.123s"
+        }
+    }
+    """
+    try:
+        # 验证任务是否存在
+        manager = get_data_pipeline_manager()
+        task_info = manager.get_task_status(task_id)
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 解析请求数据
+        request_data = request.get_json() or {}
+        
+        # 参数验证
+        def _is_valid_time_format(time_str):
+            """验证时间格式是否有效"""
+            if not time_str:
+                return True
+            
+            # 支持的时间格式
+            time_formats = [
+                '%Y-%m-%d %H:%M:%S',     # 2025-01-01 00:00:00
+                '%Y-%m-%d',              # 2025-01-01
+                '%Y-%m-%dT%H:%M:%S',     # 2025-01-01T00:00:00
+                '%Y-%m-%dT%H:%M:%S.%f',  # 2025-01-01T00:00:00.123456
+            ]
+            
+            for fmt in time_formats:
+                try:
+                    datetime.strptime(time_str, fmt)
+                    return True
+                except ValueError:
+                    continue
+            return False
+        
+        # 提取和验证参数
+        page = request_data.get('page', 1)
+        page_size = request_data.get('page_size', 50)
+        level = request_data.get('level')
+        start_time = request_data.get('start_time')
+        end_time = request_data.get('end_time')
+        keyword = request_data.get('keyword')
+        logger_name = request_data.get('logger_name')
+        step_name = request_data.get('step_name')
+        sort_by = request_data.get('sort_by', 'timestamp')
+        sort_order = request_data.get('sort_order', 'desc')
+        
+        # 参数验证
+        if not isinstance(page, int) or page < 1:
+            return jsonify(bad_request_response(
+                response_text="页码必须是大于0的整数"
+            )), 400
+        
+        if not isinstance(page_size, int) or page_size < 1 or page_size > 500:
+            return jsonify(bad_request_response(
+                response_text="每页大小必须是1-500之间的整数"
+            )), 400
+        
+        # 验证日志级别
+        if level and level.upper() not in ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']:
+            return jsonify(bad_request_response(
+                response_text="日志级别必须是DEBUG、INFO、WARNING、ERROR、CRITICAL之一"
+            )), 400
+        
+        # 验证时间格式
+        if not _is_valid_time_format(start_time):
+            return jsonify(bad_request_response(
+                response_text="开始时间格式无效,支持格式:YYYY-MM-DD HH:MM:SS 或 YYYY-MM-DD"
+            )), 400
+        
+        if not _is_valid_time_format(end_time):
+            return jsonify(bad_request_response(
+                response_text="结束时间格式无效,支持格式:YYYY-MM-DD HH:MM:SS 或 YYYY-MM-DD"
+            )), 400
+        
+        # 验证关键字长度
+        if keyword and len(keyword) > 200:
+            return jsonify(bad_request_response(
+                response_text="关键字长度不能超过200个字符"
+            )), 400
+        
+        # 验证排序字段
+        allowed_sort_fields = ['timestamp', 'level', 'logger', 'step', 'line_number']
+        if sort_by not in allowed_sort_fields:
+            return jsonify(bad_request_response(
+                response_text=f"排序字段必须是以下之一: {', '.join(allowed_sort_fields)}"
+            )), 400
+        
+        # 验证排序方向
+        if sort_order.lower() not in ['asc', 'desc']:
+            return jsonify(bad_request_response(
+                response_text="排序方向必须是asc或desc"
+            )), 400
+        
+        # 创建工作流执行器并查询日志
+        from data_pipeline.api.simple_workflow import SimpleWorkflowExecutor
+        executor = SimpleWorkflowExecutor(task_id)
+        
+        try:
+            result = executor.query_logs_advanced(
+                page=page,
+                page_size=page_size,
+                level=level,
+                start_time=start_time,
+                end_time=end_time,
+                keyword=keyword,
+                logger_name=logger_name,
+                step_name=step_name,
+                sort_by=sort_by,
+                sort_order=sort_order
+            )
+            
+            return jsonify(success_response(
+                response_text="查询任务日志成功",
+                data=result
+            ))
+            
+        finally:
+            executor.cleanup()
+        
+    except Exception as e:
+        logger.error(f"查询数据管道任务日志失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="查询任务日志失败,请稍后重试"
+        )), 500
+
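A hedged sketch for the advanced log-query endpoint above, using a subset of the filters documented in its docstring; the same local-service assumptions apply.

import requests

resp = requests.post(
    "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250701_123456/logs/query",
    json={"page": 1, "page_size": 50, "level": "ERROR", "keyword": "failed", "sort_order": "desc"},
)
print(resp.json()["data"]["pagination"])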
+
+if __name__ == '__main__':
+    logger.info("启动Flask应用: http://localhost:8084")
+    app.run(host="0.0.0.0", port=8084, debug=True)

+ 15 - 13
common/embedding_cache_manager.py

@@ -5,6 +5,7 @@ import time
 from typing import List, Optional, Dict, Any
 from datetime import datetime
 import app_config
+from core.logging import get_app_logger
 
 
 class EmbeddingCacheManager:
@@ -12,6 +13,7 @@ class EmbeddingCacheManager:
     
     def __init__(self):
         """初始化缓存管理器"""
+        self.logger = get_app_logger("EmbeddingCacheManager")
         self.redis_client = None
         self.cache_enabled = app_config.ENABLE_EMBEDDING_CACHE
         
@@ -28,9 +30,9 @@ class EmbeddingCacheManager:
                 )
                 # 测试连接
                 self.redis_client.ping()
-                print(f"[DEBUG] Embedding缓存管理器初始化成功")
+                self.logger.debug("Embedding缓存管理器初始化成功")
             except Exception as e:
-                print(f"[WARNING] Redis连接失败,embedding缓存将被禁用: {e}")
+                self.logger.warning(f"Redis连接失败,embedding缓存将被禁用: {e}")
                 self.cache_enabled = False
                 self.redis_client = None
     
@@ -72,7 +74,7 @@ class EmbeddingCacheManager:
                 'embedding_dimension': str(embedding_config.get('embedding_dimension', 'unknown'))
             }
         except Exception as e:
-            print(f"[WARNING] 获取模型信息失败: {e}")
+            self.logger.warning(f"获取模型信息失败: {e}")
             return {'model_name': 'unknown', 'embedding_dimension': 'unknown'}
     
     def get_cached_embedding(self, question: str) -> Optional[List[float]]:
@@ -97,13 +99,13 @@ class EmbeddingCacheManager:
                 data = json.loads(cached_data)
                 vector = data.get('vector')
                 if vector:
-                    print(f"[DEBUG] ✓ Embedding缓存命中: {question[:50]}...")
+                    self.logger.debug(f"✓ Embedding缓存命中: {question[:50]}...")
                     return vector
             
             return None
             
         except Exception as e:
-            print(f"[WARNING] 获取embedding缓存失败: {e}")
+            self.logger.warning(f"获取embedding缓存失败: {e}")
             return None
     
     def cache_embedding(self, question: str, vector: List[float]) -> bool:
@@ -141,7 +143,7 @@ class EmbeddingCacheManager:
                 json.dumps(cache_data, ensure_ascii=False)
             )
             
-            print(f"[DEBUG] ✓ Embedding向量已缓存: {question[:50]}... (维度: {len(vector)})")
+            self.logger.debug(f"✓ Embedding向量已缓存: {question[:50]}... (维度: {len(vector)})")
             
             # 检查缓存大小并清理
             self._cleanup_if_needed()
@@ -149,7 +151,7 @@ class EmbeddingCacheManager:
             return True
             
         except Exception as e:
-            print(f"[WARNING] 缓存embedding失败: {e}")
+            self.logger.warning(f"缓存embedding失败: {e}")
             return False
     
     def _cleanup_if_needed(self):
@@ -180,10 +182,10 @@ class EmbeddingCacheManager:
                 
                 if keys_to_delete:
                     self.redis_client.delete(*keys_to_delete)
-                    print(f"[DEBUG] 清理了 {len(keys_to_delete)} 个旧的embedding缓存")
+                    self.logger.debug(f"清理了 {len(keys_to_delete)} 个旧的embedding缓存")
                     
         except Exception as e:
-            print(f"[WARNING] 清理embedding缓存失败: {e}")
+            self.logger.warning(f"清理embedding缓存失败: {e}")
     
     def get_cache_stats(self) -> Dict[str, Any]:
         """
@@ -217,7 +219,7 @@ class EmbeddingCacheManager:
                     stats["memory_usage_mb"] = round(total_size_bytes / (1024 * 1024), 2)
             
         except Exception as e:
-            print(f"[WARNING] 获取缓存统计失败: {e}")
+            self.logger.warning(f"获取缓存统计失败: {e}")
         
         return stats
     
@@ -237,14 +239,14 @@ class EmbeddingCacheManager:
             
             if keys:
                 self.redis_client.delete(*keys)
-                print(f"[DEBUG] 已清空所有embedding缓存 ({len(keys)} 条)")
+                self.logger.debug(f"已清空所有embedding缓存 ({len(keys)} 条)")
                 return True
             else:
-                print(f"[DEBUG] 没有embedding缓存需要清空")
+                self.logger.debug("没有embedding缓存需要清空")
                 return True
                 
         except Exception as e:
-            print(f"[WARNING] 清空embedding缓存失败: {e}")
+            self.logger.warning(f"清空embedding缓存失败: {e}")
             return False
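This hunk replaces print-based debugging with a named logger from core.logging. A minimal sketch of the pattern as applied across this commit, with a hypothetical ExampleManager class used purely for illustration:

from core.logging import get_app_logger

class ExampleManager:
    """Hypothetical class illustrating the logging pattern introduced in this commit."""

    def __init__(self, redis_client):
        self.logger = get_app_logger("ExampleManager")
        self.redis_client = redis_client

    def check_connection(self) -> bool:
        try:
            self.redis_client.ping()
            self.logger.debug("Redis connection ok")
            return True
        except Exception as e:
            self.logger.warning(f"Redis connection failed, cache disabled: {e}")
            return False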
 
 

+ 22 - 18
common/qa_feedback_manager.py

@@ -8,6 +8,7 @@ from sqlalchemy.exc import OperationalError, ProgrammingError
 from datetime import datetime
 from typing import List, Dict, Any, Optional, Tuple
 import logging
+from core.logging import get_app_logger
 
 class QAFeedbackManager:
     """QA反馈数据管理器 - 复用Vanna连接版本"""
@@ -18,6 +19,9 @@ class QAFeedbackManager:
         Args:
             vanna_instance: 可选的vanna实例,用于复用其数据库连接
         """
+        # 初始化日志
+        self.logger = get_app_logger("QAFeedbackManager")
+        
         self.engine = None
         self.vanna_instance = vanna_instance
         self._init_database_connection()
@@ -29,7 +33,7 @@ class QAFeedbackManager:
             # 方案1: 优先尝试复用vanna连接
             if self.vanna_instance and hasattr(self.vanna_instance, 'engine'):
                 self.engine = self.vanna_instance.engine
-                print(f"[QAFeedbackManager] 复用Vanna数据库连接")
+                self.logger.info("复用Vanna数据库连接")
                 return
             
             # 方案2: 创建新的连接(原有方式)
@@ -52,10 +56,10 @@ class QAFeedbackManager:
             with self.engine.connect() as conn:
                 conn.execute(text("SELECT 1"))
             
-            print(f"[QAFeedbackManager] 数据库连接成功: {db_config['host']}:{db_config['port']}/{db_config['dbname']}")
+            self.logger.info(f"数据库连接成功: {db_config['host']}:{db_config['port']}/{db_config['dbname']}")
             
         except Exception as e:
-            print(f"[ERROR] QAFeedbackManager数据库连接失败: {e}")
+            self.logger.error(f"QAFeedbackManager数据库连接失败: {e}")
             raise
     
     def _ensure_table_exists(self):
@@ -91,10 +95,10 @@ class QAFeedbackManager:
                     for index_sql in create_indexes_sql:
                         conn.execute(text(index_sql))
                     
-            print("[QAFeedbackManager] qa_feedback表检查/创建成功")
+            self.logger.info("qa_feedback表检查/创建成功")
             
         except Exception as e:
-            print(f"[ERROR] qa_feedback表创建失败: {e}")
+            self.logger.error(f"qa_feedback表创建失败: {e}")
             raise
     
     def add_feedback(self, question: str, sql: str, is_thumb_up: bool, user_id: str = "guest") -> int:
@@ -127,11 +131,11 @@ class QAFeedbackManager:
                     })
                     feedback_id = result.fetchone()[0]
                 
-            print(f"[QAFeedbackManager] 反馈记录创建成功, ID: {feedback_id}")
+            self.logger.info(f"反馈记录创建成功, ID: {feedback_id}")
             return feedback_id
             
         except Exception as e:
-            print(f"[ERROR] 添加反馈记录失败: {e}")
+            self.logger.error(f"添加反馈记录失败: {e}")
             raise
     
     def query_feedback(self, page: int = 1, page_size: int = 20, 
@@ -232,7 +236,7 @@ class QAFeedbackManager:
             return records, total
             
         except Exception as e:
-            print(f"[ERROR] 查询反馈记录失败: {e}")
+            self.logger.error(f"查询反馈记录失败: {e}")
             raise
     
     def delete_feedback(self, feedback_id: int) -> bool:
@@ -252,14 +256,14 @@ class QAFeedbackManager:
                     result = conn.execute(text(delete_sql), {'id': feedback_id})
                 
                 if result.rowcount > 0:
-                    print(f"[QAFeedbackManager] 反馈记录删除成功, ID: {feedback_id}")
+                    self.logger.info(f"反馈记录删除成功, ID: {feedback_id}")
                     return True
                 else:
-                    print(f"[WARNING] 反馈记录不存在, ID: {feedback_id}")
+                    self.logger.warning(f"反馈记录不存在, ID: {feedback_id}")
                     return False
                     
         except Exception as e:
-            print(f"[ERROR] 删除反馈记录失败: {e}")
+            self.logger.error(f"删除反馈记录失败: {e}")
             raise
     
     def update_feedback(self, feedback_id: int, **kwargs) -> bool:
@@ -284,7 +288,7 @@ class QAFeedbackManager:
                 params[field] = value
         
         if not update_fields:
-            print("[WARNING] 没有有效的更新字段")
+            self.logger.warning("没有有效的更新字段")
             return False
         
         update_fields.append("update_time = :update_time")
@@ -301,14 +305,14 @@ class QAFeedbackManager:
                     result = conn.execute(text(update_sql), params)
                 
                 if result.rowcount > 0:
-                    print(f"[QAFeedbackManager] 反馈记录更新成功, ID: {feedback_id}")
+                    self.logger.info(f"反馈记录更新成功, ID: {feedback_id}")
                     return True
                 else:
-                    print(f"[WARNING] 反馈记录不存在或无变化, ID: {feedback_id}")
+                    self.logger.warning(f"反馈记录不存在或无变化, ID: {feedback_id}")
                     return False
                     
         except Exception as e:
-            print(f"[ERROR] 更新反馈记录失败: {e}")
+            self.logger.error(f"更新反馈记录失败: {e}")
             raise
     
     def get_feedback_by_ids(self, feedback_ids: List[int]) -> List[Dict]:
@@ -354,7 +358,7 @@ class QAFeedbackManager:
                 return records
                 
         except Exception as e:
-            print(f"[ERROR] 根据ID查询反馈记录失败: {e}")
+            self.logger.error(f"根据ID查询反馈记录失败: {e}")
             raise
     
     def mark_training_status(self, feedback_ids: List[int], status: bool = True) -> int:
@@ -386,9 +390,9 @@ class QAFeedbackManager:
                 with conn.begin():
                     result = conn.execute(text(update_sql), params)
                 
-                print(f"[QAFeedbackManager] 批量更新训练状态成功, 影响行数: {result.rowcount}")
+                self.logger.info(f"批量更新训练状态成功, 影响行数: {result.rowcount}")
                 return result.rowcount
                 
         except Exception as e:
-            print(f"[ERROR] 批量更新训练状态失败: {e}")
+            self.logger.error(f"批量更新训练状态失败: {e}")
             raise

+ 37 - 35
common/redis_conversation_manager.py

@@ -12,12 +12,14 @@ from app_config import (
     ENABLE_CONVERSATION_CONTEXT, ENABLE_QUESTION_ANSWER_CACHE,
     DEFAULT_ANONYMOUS_USER
 )
+from core.logging import get_app_logger
 
 class RedisConversationManager:
     """Redis对话管理器 - 修正版"""
     
     def __init__(self):
         """初始化Redis连接"""
+        self.logger = get_app_logger("RedisConversationManager")
         try:
             self.redis_client = redis.Redis(
                 host=REDIS_HOST,
@@ -30,9 +32,9 @@ class RedisConversationManager:
             )
             # 测试连接
             self.redis_client.ping()
-            print(f"[REDIS_CONV] Redis连接成功: {REDIS_HOST}:{REDIS_PORT}")
+            self.logger.info(f"Redis连接成功: {REDIS_HOST}:{REDIS_PORT}")
         except Exception as e:
-            print(f"[ERROR] Redis连接失败: {str(e)}")
+            self.logger.error(f"Redis连接失败: {str(e)}")
             self.redis_client = None
     
     def is_available(self) -> bool:
@@ -59,16 +61,16 @@ class RedisConversationManager:
         
         # 1. 优先使用登录用户ID
         if login_user_id:
-            print(f"[REDIS_CONV] 使用登录用户ID: {login_user_id}")
+            self.logger.debug(f"使用登录用户ID: {login_user_id}")
             return login_user_id
         
         # 2. 如果没有登录,尝试从请求参数获取user_id
         if user_id_from_request:
-            print(f"[REDIS_CONV] 使用请求参数user_id: {user_id_from_request}")
+            self.logger.debug(f"使用请求参数user_id: {user_id_from_request}")
             return user_id_from_request
         
         # 3. 都没有则为匿名用户(统一为guest)
-        print(f"[REDIS_CONV] 使用匿名用户: {DEFAULT_ANONYMOUS_USER}")
+        self.logger.debug(f"使用匿名用户: {DEFAULT_ANONYMOUS_USER}")
         return DEFAULT_ANONYMOUS_USER
     
     def resolve_conversation_id(self, user_id: str, conversation_id_input: Optional[str], 
@@ -87,13 +89,13 @@ class RedisConversationManager:
         # 1. 如果指定了conversation_id,验证后使用
         if conversation_id_input:
             if self._is_valid_conversation(conversation_id_input, user_id):
-                print(f"[REDIS_CONV] 使用指定对话: {conversation_id_input}")
+                self.logger.debug(f"使用指定对话: {conversation_id_input}")
                 return conversation_id_input, {
                     "status": "existing",
                     "message": "继续已有对话"
                 }
             else:
-                print(f"[WARN] 无效的conversation_id: {conversation_id_input},创建新对话")
+                self.logger.warning(f"无效的conversation_id: {conversation_id_input},创建新对话")
                 new_conversation_id = self.create_conversation(user_id)
                 return new_conversation_id, {
                     "status": "invalid_id_new",
@@ -105,7 +107,7 @@ class RedisConversationManager:
         if continue_conversation:
             recent_conversation = self._get_recent_conversation(user_id)
             if recent_conversation:
-                print(f"[REDIS_CONV] 继续最近对话: {recent_conversation}")
+                self.logger.debug(f"继续最近对话: {recent_conversation}")
                 return recent_conversation, {
                     "status": "existing",
                     "message": "继续最近对话"
@@ -113,7 +115,7 @@ class RedisConversationManager:
         
         # 3. 创建新对话
         new_conversation_id = self.create_conversation(user_id)
-        print(f"[REDIS_CONV] 创建新对话: {new_conversation_id}")
+        self.logger.debug(f"创建新对话: {new_conversation_id}")
         return new_conversation_id, {
             "status": "new",
             "message": "创建新对话"
@@ -180,11 +182,11 @@ class RedisConversationManager:
             # 添加到用户的对话列表
             self._add_conversation_to_user(user_id, conversation_id)
             
-            print(f"[REDIS_CONV] 创建对话成功: {conversation_id}")
+            self.logger.info(f"创建对话成功: {conversation_id}")
             return conversation_id
             
         except Exception as e:
-            print(f"[ERROR] 创建对话失败: {str(e)}")
+            self.logger.error(f"创建对话失败: {str(e)}")
             return conversation_id  # 返回ID但可能未存储
     
     def save_message(self, conversation_id: str, role: str, content: str, 
@@ -223,7 +225,7 @@ class RedisConversationManager:
             return True
             
         except Exception as e:
-            print(f"[ERROR] 保存消息失败: {str(e)}")
+            self.logger.error(f"保存消息失败: {str(e)}")
             return False
     
     def get_context(self, conversation_id: str, count: Optional[int] = None) -> str:
@@ -262,11 +264,11 @@ class RedisConversationManager:
                     continue
             
             context = "\n".join(context_parts)
-            print(f"[REDIS_CONV] 获取上下文成功: {len(context_parts)}条消息")
+            self.logger.debug(f"获取上下文成功: {len(context_parts)}条消息")
             return context
             
         except Exception as e:
-            print(f"[ERROR] 获取上下文失败: {str(e)}")
+            self.logger.error(f"获取上下文失败: {str(e)}")
             return ""
         
     def get_context_for_display(self, conversation_id: str, count: Optional[int] = None) -> str:
@@ -307,11 +309,11 @@ class RedisConversationManager:
                     continue
             
             context = "\n".join(context_parts)
-            print(f"[REDIS_CONV] 获取显示上下文成功: {len(context_parts)}条消息")
+            self.logger.debug(f"获取显示上下文成功: {len(context_parts)}条消息")
             return context
             
         except Exception as e:
-            print(f"[ERROR] 获取显示上下文失败: {str(e)}")
+            self.logger.error(f"获取显示上下文失败: {str(e)}")
             return ""
     
     
@@ -341,7 +343,7 @@ class RedisConversationManager:
             return parsed_messages
             
         except Exception as e:
-            print(f"[ERROR] 获取对话消息失败: {str(e)}")
+            self.logger.error(f"获取对话消息失败: {str(e)}")
             return []
     
     def get_conversation_meta(self, conversation_id: str) -> Dict:
@@ -353,7 +355,7 @@ class RedisConversationManager:
             meta_data = self.redis_client.hgetall(f"conversation:{conversation_id}:meta")
             return meta_data if meta_data else {}
         except Exception as e:
-            print(f"[ERROR] 获取对话元信息失败: {str(e)}")
+            self.logger.error(f"获取对话元信息失败: {str(e)}")
             return {}
     
     def get_conversations(self, user_id: str, limit: int = None) -> List[Dict]:
@@ -379,7 +381,7 @@ class RedisConversationManager:
             return conversations
             
         except Exception as e:
-            print(f"[ERROR] 获取用户对话列表失败: {str(e)}")
+            self.logger.error(f"获取用户对话列表失败: {str(e)}")
             return []
     
     # ==================== 智能缓存(修正版)====================
@@ -396,13 +398,13 @@ class RedisConversationManager:
             
             if cached_answer:
                 context_info = "有上下文" if context else "无上下文"
-                print(f"[REDIS_CONV] 缓存命中: {cache_key} ({context_info})")
+                self.logger.debug(f"缓存命中: {cache_key} ({context_info})")
                 return json.loads(cached_answer)
             
             return None
             
         except Exception as e:
-            print(f"[ERROR] 获取缓存答案失败: {str(e)}")
+            self.logger.error(f"获取缓存答案失败: {str(e)}")
             return None
     
     def cache_answer(self, question: str, answer: Dict, context: str = ""):
@@ -412,7 +414,7 @@ class RedisConversationManager:
         
         # 新增:如果有上下文,不缓存
         if context:
-            print(f"[REDIS_CONV] 跳过缓存存储:存在上下文")
+            self.logger.debug("跳过缓存存储:存在上下文")
             return
         
         try:
@@ -432,10 +434,10 @@ class RedisConversationManager:
                 json.dumps(answer_with_meta)
             )
             
-            print(f"[REDIS_CONV] 缓存答案成功: {cache_key}")
+            self.logger.debug(f"缓存答案成功: {cache_key}")
             
         except Exception as e:
-            print(f"[ERROR] 缓存答案失败: {str(e)}")
+            self.logger.error(f"缓存答案失败: {str(e)}")
     
     def _get_cache_key(self, question: str) -> str:
         """生成缓存键 - 简化版,只基于问题本身"""
@@ -464,7 +466,7 @@ class RedisConversationManager:
             )
             
         except Exception as e:
-            print(f"[ERROR] 添加对话到用户列表失败: {str(e)}")
+            self.logger.error(f"添加对话到用户列表失败: {str(e)}")
     
     def _update_conversation_meta(self, conversation_id: str):
         """更新对话元信息"""
@@ -482,7 +484,7 @@ class RedisConversationManager:
             )
             
         except Exception as e:
-            print(f"[ERROR] 更新对话元信息失败: {str(e)}")
+            self.logger.error(f"更新对话元信息失败: {str(e)}")
     
     # ==================== 管理方法 ====================
     
@@ -510,7 +512,7 @@ class RedisConversationManager:
             return stats
             
         except Exception as e:
-            print(f"[ERROR] 获取统计信息失败: {str(e)}")
+            self.logger.error(f"获取统计信息失败: {str(e)}")
             return {"available": False, "error": str(e)}
     
     def cleanup_expired_conversations(self):
@@ -542,10 +544,10 @@ class RedisConversationManager:
                         # 重新设置TTL
                         self.redis_client.expire(user_key, USER_CONVERSATIONS_TTL)
             
-            print(f"[REDIS_CONV] 清理完成,移除了 {cleaned_count} 个无效对话引用")
+            self.logger.info(f"清理完成,移除了 {cleaned_count} 个无效对话引用")
             
         except Exception as e:
-            print(f"[ERROR] 清理失败: {str(e)}")
+            self.logger.error(f"清理失败: {str(e)}")
     
     # ==================== 问答缓存管理方法 ====================
     
@@ -579,7 +581,7 @@ class RedisConversationManager:
             return stats
             
         except Exception as e:
-            print(f"[ERROR] 获取问答缓存统计失败: {str(e)}")
+            self.logger.error(f"获取问答缓存统计失败: {str(e)}")
             return {"available": False, "error": str(e)}
     
     def get_qa_cache_list(self, limit: int = 50) -> List[Dict]:
@@ -621,7 +623,7 @@ class RedisConversationManager:
                     # 跳过无效的JSON数据
                     continue
                 except Exception as e:
-                    print(f"[WARNING] 处理缓存项 {key} 失败: {e}")
+                    self.logger.warning(f"处理缓存项 {key} 失败: {e}")
                     continue
             
             # 按缓存时间倒序排列
@@ -630,7 +632,7 @@ class RedisConversationManager:
             return cache_list
             
         except Exception as e:
-            print(f"[ERROR] 获取问答缓存列表失败: {str(e)}")
+            self.logger.error(f"获取问答缓存列表失败: {str(e)}")
             return []
     
     def clear_all_qa_cache(self) -> int:
@@ -644,12 +646,12 @@ class RedisConversationManager:
             
             if keys:
                 deleted_count = self.redis_client.delete(*keys)
-                print(f"[REDIS_CONV] 清空问答缓存成功,删除了 {deleted_count} 个缓存项")
+                self.logger.info(f"清空问答缓存成功,删除了 {deleted_count} 个缓存项")
                 return deleted_count
             else:
-                print(f"[REDIS_CONV] 没有找到问答缓存项")
+                self.logger.info("没有找到问答缓存项")
                 return 0
                 
         except Exception as e:
-            print(f"[ERROR] 清空问答缓存失败: {str(e)}")
+            self.logger.error(f"清空问答缓存失败: {str(e)}")
             return 0 
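Note: the hunks above apply the commit's core refactoring pattern: bracketed print() tags such as [REDIS_CONV], [WARN] and [ERROR] become a named logger obtained from core.logging, with the tag folded into the logger name and the severity expressed as a log level. A minimal sketch of the same pattern, assuming only the get_app_logger facade introduced later in this commit (the class and messages below are illustrative, not taken from the diff):

    from core.logging import get_app_logger

    class ExampleManager:
        def __init__(self):
            # one named logger per component; "[REDIS_CONV]" becomes the logger name,
            # "[ERROR]"/"[WARN]" prefixes become .error()/.warning() calls
            self.logger = get_app_logger("ExampleManager")

        def ping(self) -> bool:
            try:
                self.logger.debug("ping started")
                return True
            except Exception as e:
                self.logger.error(f"ping failed: {e}")
                return False

Module-level helpers (common/utils.py and common/vanna_combinations.py below) use the same facade but bind a module-level _logger once at import time instead of per instance.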

+ 12 - 8
common/utils.py

@@ -2,6 +2,10 @@
 配置相关的工具函数
 用于处理不同模型类型的配置选择逻辑
 """
+from core.logging import get_app_logger
+
+# 初始化logger
+_logger = get_app_logger("ConfigUtils")
 
 def get_current_embedding_config():
     """
@@ -180,12 +184,12 @@ def print_current_config():
     """
     try:
         model_info = get_current_model_info()
-        print("=== 当前模型配置 ===")
-        print(f"LLM提供商: {model_info['llm_type']}")
-        print(f"LLM模型: {model_info['llm_model']}")
-        print(f"Embedding提供商: {model_info['embedding_type']}")
-        print(f"Embedding模型: {model_info['embedding_model']}")
-        print(f"向量数据库: {model_info['vector_db']}")
-        print("==================")
+        _logger.info("=== 当前模型配置 ===")
+        _logger.info(f"LLM提供商: {model_info['llm_type']}")
+        _logger.info(f"LLM模型: {model_info['llm_model']}")
+        _logger.info(f"Embedding提供商: {model_info['embedding_type']}")
+        _logger.info(f"Embedding模型: {model_info['embedding_model']}")
+        _logger.info(f"向量数据库: {model_info['vector_db']}")
+        _logger.info("==================")
     except Exception as e:
-        print(f"无法获取配置信息: {e}") 
+        _logger.error(f"无法获取配置信息: {e}") 

+ 11 - 7
common/vanna_combinations.py

@@ -2,13 +2,17 @@
 Vanna LLM与向量数据库的组合类
 统一管理所有LLM提供商与向量数据库的组合
 """
+from core.logging import get_app_logger
+
+# 初始化logger
+_logger = get_app_logger("VannaCombinations")
 
 # 向量数据库导入
 from vanna.chromadb import ChromaDB_VectorStore
 try:
     from custompgvector import PG_VectorStore
 except ImportError:
-    print("警告: 无法导入 PG_VectorStore,PGVector相关组合类将不可用")
+    _logger.warning("无法导入 PG_VectorStore,PGVector相关组合类将不可用")
     PG_VectorStore = None
 
 # LLM提供商导入 - 使用新的重构后的实现
@@ -17,7 +21,7 @@ from customllm.deepseek_chat import DeepSeekChat
 try:
     from customllm.ollama_chat import OllamaChat
 except ImportError:
-    print("警告: 无法导入 OllamaChat,Ollama相关组合类将不可用")
+    _logger.warning("无法导入 OllamaChat,Ollama相关组合类将不可用")
     OllamaChat = None
 
 
@@ -168,19 +172,19 @@ def list_available_combinations():
 
 def print_available_combinations():
     """打印所有可用的组合"""
-    print("可用的LLM与向量数据库组合:")
-    print("=" * 40)
+    _logger.info("可用的LLM与向量数据库组合:")
+    _logger.info("=" * 40)
     
     combinations = list_available_combinations()
     
     for llm_type, vector_dbs in combinations.items():
-        print(f"\n{llm_type.upper()} LLM:")
+        _logger.info(f"\n{llm_type.upper()} LLM:")
         for vector_db in vector_dbs:
             class_name = LLM_CLASS_MAP[llm_type][vector_db].__name__
-            print(f"  + {vector_db} -> {class_name}")
+            _logger.info(f"  + {vector_db} -> {class_name}")
     
     if not any(combinations.values()):
-        print("没有可用的组合,请检查依赖是否正确安装")
+        _logger.warning("没有可用的组合,请检查依赖是否正确安装")
 
 
 # ===== 向后兼容性支持 =====

+ 8 - 4
common/vanna_instance.py

@@ -4,6 +4,10 @@ Vanna实例单例管理器
 """
 import threading
 from typing import Optional
+from core.logging import get_app_logger
+
+# 初始化logger
+_logger = get_app_logger("VannaSingleton")
 
 # 全局变量
 _vanna_instance: Optional[object] = None
@@ -22,14 +26,14 @@ def get_vanna_instance():
     if _vanna_instance is None:
         with _instance_lock:
             if _vanna_instance is None:
-                print("[VANNA_SINGLETON] 创建 Vanna 实例...")
+                _logger.info("创建 Vanna 实例...")
                 try:
                     # 延迟导入,避免循环导入
                     from core.vanna_llm_factory import create_vanna_instance
                     _vanna_instance = create_vanna_instance()
-                    print("[VANNA_SINGLETON] Vanna 实例创建成功")
+                    _logger.info("Vanna 实例创建成功")
                 except Exception as e:
-                    print(f"[ERROR] Vanna 实例创建失败: {str(e)}")
+                    _logger.error(f"Vanna 实例创建失败: {str(e)}")
                     raise
     
     return _vanna_instance
@@ -41,7 +45,7 @@ def reset_vanna_instance():
     global _vanna_instance
     with _instance_lock:
         if _vanna_instance is not None:
-            print("[VANNA_SINGLETON] 重置 Vanna 实例")
+            _logger.info("重置 Vanna 实例")
             _vanna_instance = None
 
 def get_instance_status() -> dict:
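Note: get_vanna_instance keeps its original double-checked locking; only the print() calls changed. The None check appears twice on purpose: the outer check avoids taking the lock on every call once the instance exists, and the inner check stops two threads that both passed the outer check from creating the instance twice. A condensed sketch of just that control flow (object() stands in for create_vanna_instance()):

    import threading

    _instance = None
    _lock = threading.Lock()

    def get_instance():
        global _instance
        if _instance is None:             # fast path: no locking after first creation
            with _lock:
                if _instance is None:     # re-check inside the lock
                    _instance = object()  # placeholder for create_vanna_instance()
        return _instance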

+ 93 - 0
config/logging_config.yaml

@@ -0,0 +1,93 @@
+version: 1
+
+# 全局配置
+global:
+  base_level: INFO
+  
+# 默认配置(用于app.log)
+default:
+  level: INFO
+  console:
+    enabled: true
+    level: INFO
+    format: "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
+  file:
+    enabled: true
+    level: DEBUG
+    filename: "app.log"
+    format: "%(asctime)s [%(levelname)s] [%(name)s] [user:%(user_id)s] [session:%(session_id)s] %(filename)s:%(lineno)d - %(message)s"
+    rotation:
+      enabled: true
+      max_size: "50MB"
+      backup_count: 10
+
+# 模块特定配置
+modules:
+  app:
+    level: INFO
+    console:
+      enabled: true
+      level: INFO
+      format: "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
+    file:
+      enabled: true
+      level: DEBUG
+      filename: "app.log"
+      format: "%(asctime)s [%(levelname)s] [%(name)s] [user:%(user_id)s] [session:%(session_id)s] %(filename)s:%(lineno)d - %(message)s"
+      rotation:
+        enabled: true
+        max_size: "50MB"
+        backup_count: 10
+  
+  data_pipeline:
+    # 注意:data_pipeline的日志文件路径会在运行时动态设置到任务目录
+    # 这里的file配置主要用于格式和级别设置
+    level: DEBUG
+    console:
+      enabled: true
+      level: INFO
+      format: "%(asctime)s [%(levelname)s] Pipeline: %(message)s"
+    file:
+      enabled: true
+      level: DEBUG
+      # filename 将在运行时动态设置,不在这里指定
+      # filename: "data_pipeline.log"  # 移除固定路径
+      format: "%(asctime)s [%(levelname)s] [%(name)s] %(filename)s:%(lineno)d - %(message)s"
+      rotation:
+        # 对于任务特定的日志,通常不需要rotation
+        # 但保留配置以防单个任务产生大量日志
+        enabled: false  # 禁用rotation,因为每个任务的日志是独立的
+        max_size: "10MB"    # 如果启用,限制为10MB
+        backup_count: 2     # 如果启用,只保留2个备份
+  
+  agent:
+    level: DEBUG
+    console:
+      enabled: true
+      level: INFO
+      format: "%(asctime)s [%(levelname)s] Agent: %(message)s"
+    file:
+      enabled: true
+      level: DEBUG
+      filename: "agent.log"
+      format: "%(asctime)s [%(levelname)s] [%(name)s] [user:%(user_id)s] [session:%(session_id)s] %(filename)s:%(lineno)d - %(message)s"
+      rotation:
+        enabled: true
+        max_size: "30MB"
+        backup_count: 8
+  
+  vanna:
+    level: DEBUG
+    console:
+      enabled: true
+      level: INFO
+      format: "%(asctime)s [%(levelname)s] Vanna: %(message)s"
+    file:
+      enabled: true
+      level: DEBUG
+      filename: "vanna.log"
+      format: "%(asctime)s [%(levelname)s] [%(name)s] %(filename)s:%(lineno)d - %(message)s"
+      rotation:
+        enabled: true
+        max_size: "20MB"
+        backup_count: 5 
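Note: loggers request a module section by name ("app", "data_pipeline", "agent", "vanna"); any other module name falls back to the default block, which is how LogManager._configure_logger (shown further below) resolves this file. A small sketch of that lookup, assuming the file above is on disk at config/logging_config.yaml (the "api" module name is hypothetical, used only to show the fallback):

    import yaml

    with open("config/logging_config.yaml", "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    def module_config(module: str) -> dict:
        # mirrors LogManager._configure_logger: unknown modules use the "default" block
        return config.get("modules", {}).get(module, config["default"])

    assert module_config("agent")["file"]["filename"] == "agent.log"
    assert module_config("api")["file"]["filename"] == "app.log"   # fallback to default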

+ 17 - 12
core/embedding_function.py

@@ -2,6 +2,7 @@ import requests
 import time
 import numpy as np
 from typing import List, Callable
+from core.logging import get_vanna_logger
 
 class EmbeddingFunction:
     def __init__(self, model_name: str, api_key: str, base_url: str, embedding_dimension: int):
@@ -16,6 +17,9 @@ class EmbeddingFunction:
         self.max_retries = 3  # 设置默认的最大重试次数
         self.retry_interval = 2  # 设置默认的重试间隔秒数
         self.normalize_embeddings = True # 设置默认是否归一化
+        
+        # 初始化日志
+        self.logger = get_vanna_logger("EmbeddingFunction")
 
     def _normalize_vector(self, vector: List[float]) -> List[float]:
         """
@@ -54,7 +58,7 @@ class EmbeddingFunction:
                 vector = self.generate_embedding(text)
                 embeddings.append(vector)
             except Exception as e:
-                print(f"为文本 '{text}' 生成embedding失败: {e}")
+                self.logger.error(f"为文本 '{text}' 生成embedding失败: {e}")
                 # 重新抛出异常,不返回零向量
                 raise e
                 
@@ -135,7 +139,7 @@ class EmbeddingFunction:
                         retries += 1
                         if retries <= self.max_retries:
                             wait_time = self.retry_interval * (2 ** (retries - 1))  # 指数退避
-                            print(f"API请求失败,等待 {wait_time} 秒后重试 ({retries}/{self.max_retries})")
+                            self.logger.warning(f"API请求失败,等待 {wait_time} 秒后重试 ({retries}/{self.max_retries})")
                             time.sleep(wait_time)
                             continue
                     
@@ -155,14 +159,14 @@ class EmbeddingFunction:
                         # 验证向量维度
                         actual_dim = len(vector)
                         if actual_dim != self.embedding_dimension:
-                            print(f"警告: 向量维度不匹配: 期望 {self.embedding_dimension}, 实际 {actual_dim}")
+                            self.logger.warning(f"向量维度不匹配: 期望 {self.embedding_dimension}, 实际 {actual_dim}")
                     
                     # 如果需要归一化
                     if self.normalize_embeddings:
                         vector = self._normalize_vector(vector)
                     
                     # 添加成功生成embedding的debug日志
-                    print(f"[DEBUG] ✓ 成功生成embedding向量,维度: {len(vector)}")
+                    self.logger.debug(f"成功生成embedding向量,维度: {len(vector)}")
                     
                     return vector
                 else:
@@ -174,7 +178,7 @@ class EmbeddingFunction:
                 
                 if retries <= self.max_retries:
                     wait_time = self.retry_interval * (2 ** (retries - 1))  # 指数退避
-                    print(f"生成embedding时出错: {str(e)}, 等待 {wait_time} 秒后重试 ({retries}/{self.max_retries})")
+                    self.logger.warning(f"生成embedding时出错: {str(e)}, 等待 {wait_time} 秒后重试 ({retries}/{self.max_retries})")
                     time.sleep(wait_time)
                 else:
                     # 抛出异常而不是返回零向量,确保问题不被掩盖
@@ -203,8 +207,8 @@ class EmbeddingFunction:
         }
         
         try:
-            print(f"测试嵌入模型连接 - 模型: {self.model_name}")
-            print(f"API服务地址: {self.base_url}")
+            self.logger.info(f"测试嵌入模型连接 - 模型: {self.model_name}")
+            self.logger.info(f"API服务地址: {self.base_url}")
             
             # 验证配置
             if not self.api_key:
@@ -241,6 +245,7 @@ def test_embedding_connection() -> dict:
     Returns:
         dict: 测试结果,包括成功/失败状态、错误消息等
     """
+    logger = get_vanna_logger("EmbeddingTest")
     try:
         # 获取嵌入函数实例
         embedding_function = get_embedding_function()
@@ -249,18 +254,18 @@ def test_embedding_connection() -> dict:
         test_result = embedding_function.test_connection()
         
         if test_result["success"]:
-            print(f"嵌入模型连接测试成功!")
+            logger.info(f"嵌入模型连接测试成功!")
             if "警告" in test_result["message"]:
-                print(test_result["message"])
-                print(f"建议将app_config.py中的EMBEDDING_CONFIG['embedding_dimension']修改为{test_result['actual_dimension']}")
+                logger.warning(test_result["message"])
+                logger.warning(f"建议将app_config.py中的EMBEDDING_CONFIG['embedding_dimension']修改为{test_result['actual_dimension']}")
         else:
-            print(f"嵌入模型连接测试失败: {test_result['message']}")
+            logger.error(f"嵌入模型连接测试失败: {test_result['message']}")
             
         return test_result
         
     except Exception as e:
         error_message = f"无法测试嵌入模型连接: {str(e)}"
-        print(error_message)
+        logger.error(error_message)
         return {
             "success": False,
             "message": error_message

+ 41 - 0
core/logging/__init__.py

@@ -0,0 +1,41 @@
+from .log_manager import LogManager
+import logging
+
+# 全局日志管理器实例
+_log_manager = LogManager()
+
+def initialize_logging(config_path: str = "config/logging_config.yaml"):
+    """初始化项目日志系统"""
+    _log_manager.initialize(config_path)
+
+def get_logger(name: str, module: str = "default") -> logging.Logger:
+    """获取logger实例 - 主要API"""
+    return _log_manager.get_logger(name, module)
+
+# 便捷方法
+def get_data_pipeline_logger(name: str) -> logging.Logger:
+    """获取data_pipeline模块logger"""
+    return get_logger(name, "data_pipeline")
+
+def get_agent_logger(name: str) -> logging.Logger:
+    """获取agent模块logger"""
+    return get_logger(name, "agent")
+
+def get_vanna_logger(name: str) -> logging.Logger:
+    """获取vanna模块logger"""
+    return get_logger(name, "vanna")
+
+def get_app_logger(name: str) -> logging.Logger:
+    """获取app模块logger"""
+    return get_logger(name, "app")
+
+# 上下文管理便捷方法
+def set_log_context(**kwargs):
+    """设置日志上下文(可选)
+    示例: set_log_context(user_id='user123', session_id='sess456')
+    """
+    _log_manager.set_context(**kwargs)
+
+def clear_log_context():
+    """清除日志上下文"""
+    _log_manager.clear_context() 
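Note: this facade is the only import application code needs; the diffs above and below all go through get_app_logger / get_vanna_logger and friends. A minimal end-to-end sketch, assuming the default config path (initialization is presumably done once at application startup, which is not shown in this section):

    from core.logging import (
        initialize_logging,
        get_app_logger,
        set_log_context,
        clear_log_context,
    )

    initialize_logging("config/logging_config.yaml")   # load YAML, create logs/ handlers

    logger = get_app_logger("StartupCheck")
    set_log_context(user_id="user123", session_id="sess456")  # fills [user:...] [session:...]
    logger.info("service ready")
    clear_log_context()   # later records fall back to user:anonymous / session:N/A

The user_id / session_id fields map to the file formats in logging_config.yaml; when no context is set, the ContextFilter in log_manager.py (next file) substitutes the defaults.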

+ 214 - 0
core/logging/log_manager.py

@@ -0,0 +1,214 @@
+import logging
+import logging.handlers
+import os
+from typing import Dict, Optional
+from pathlib import Path
+import yaml
+import contextvars
+
+# 上下文变量,存储可选的上下文信息
+log_context = contextvars.ContextVar('log_context', default={})
+
+class ContextFilter(logging.Filter):
+    """添加上下文信息到日志记录"""
+    def filter(self, record):
+        ctx = log_context.get()
+        # 设置默认值,避免格式化错误
+        record.session_id = ctx.get('session_id', 'N/A')
+        record.user_id = ctx.get('user_id', 'anonymous')
+        record.request_id = ctx.get('request_id', 'N/A')
+        return True
+
+class LogManager:
+    """统一日志管理器 - 类似Log4j的功能"""
+    
+    _instance = None
+    _loggers: Dict[str, logging.Logger] = {}
+    _initialized = False
+    _fallback_to_console = False  # 标记是否降级到控制台
+    
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+    
+    def __init__(self):
+        if not self._initialized:
+            self.config = None
+            self.base_log_dir = Path("logs")
+            self._setup_base_directory()
+            LogManager._initialized = True
+    
+    def initialize(self, config_path: str = "config/logging_config.yaml"):
+        """初始化日志系统"""
+        self.config = self._load_config(config_path)
+        self._setup_base_directory()
+        self._configure_root_logger()
+    
+    def get_logger(self, name: str, module: str = "default") -> logging.Logger:
+        """获取指定模块的logger"""
+        logger_key = f"{module}.{name}"
+        
+        if logger_key not in self._loggers:
+            logger = logging.getLogger(logger_key)
+            self._configure_logger(logger, module)
+            self._loggers[logger_key] = logger
+        
+        return self._loggers[logger_key]
+    
+    def set_context(self, **kwargs):
+        """设置日志上下文(可选)"""
+        ctx = log_context.get()
+        ctx.update(kwargs)
+        log_context.set(ctx)
+    
+    def clear_context(self):
+        """清除日志上下文"""
+        log_context.set({})
+    
+    def _load_config(self, config_path: str) -> dict:
+        """加载配置文件"""
+        try:
+            with open(config_path, 'r', encoding='utf-8') as f:
+                return yaml.safe_load(f)
+        except FileNotFoundError:
+            import sys
+            sys.stderr.write(f"[WARNING] 配置文件 {config_path} 未找到,使用默认配置\n")
+            return self._get_default_config()
+        except Exception as e:
+            import sys
+            sys.stderr.write(f"[ERROR] 加载配置文件失败: {e},使用默认配置\n")
+            return self._get_default_config()
+    
+    def _get_default_config(self) -> dict:
+        """获取默认配置"""
+        return {
+            'global': {'base_level': 'INFO'},
+            'default': {
+                'level': 'INFO',
+                'console': {
+                    'enabled': True,
+                    'level': 'INFO',
+                    'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
+                },
+                'file': {
+                    'enabled': True,
+                    'level': 'DEBUG',
+                    'filename': 'app.log',
+                    'format': '%(asctime)s [%(levelname)s] [%(name)s] [user:%(user_id)s] [session:%(session_id)s] %(filename)s:%(lineno)d - %(message)s',
+                    'rotation': {
+                        'enabled': True,
+                        'max_size': '50MB',
+                        'backup_count': 10
+                    }
+                }
+            },
+            'modules': {}
+        }
+    
+    def _setup_base_directory(self):
+        """创建日志目录(带降级策略)"""
+        try:
+            os.makedirs(self.base_log_dir, exist_ok=True)
+            self._fallback_to_console = False
+        except Exception as e:
+            import sys
+            sys.stderr.write(f"[WARNING] 无法创建日志目录 {self.base_log_dir},将只使用控制台输出: {e}\n")
+            self._fallback_to_console = True
+    
+    def _configure_root_logger(self):
+        """配置根日志器"""
+        root_logger = logging.getLogger()
+        root_logger.setLevel(getattr(logging, self.config['global']['base_level'].upper()))
+    
+    def _configure_logger(self, logger: logging.Logger, module: str):
+        """配置具体的logger"""
+        # 如果配置未初始化,使用默认的控制台日志配置
+        if self.config is None:
+            logger.setLevel(logging.INFO)
+            if not logger.handlers:
+                console_handler = logging.StreamHandler()
+                formatter = logging.Formatter(
+                    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+                )
+                console_handler.setFormatter(formatter)
+                logger.addHandler(console_handler)
+                logger.propagate = False
+            return
+            
+        module_config = self.config.get('modules', {}).get(module, self.config['default'])
+        
+        # 设置日志级别
+        level = getattr(logging, module_config['level'].upper())
+        logger.setLevel(level)
+        
+        # 清除已有处理器
+        logger.handlers.clear()
+        logger.propagate = False
+        
+        # 添加控制台处理器
+        if module_config.get('console', {}).get('enabled', True):
+            console_handler = self._create_console_handler(module_config['console'])
+            console_handler.addFilter(ContextFilter())
+            logger.addHandler(console_handler)
+        
+        # 添加文件处理器(如果没有降级到控制台)
+        if not self._fallback_to_console and module_config.get('file', {}).get('enabled', True):
+            try:
+                file_handler = self._create_file_handler(module_config['file'], module)
+                file_handler.addFilter(ContextFilter())
+                logger.addHandler(file_handler)
+            except Exception as e:
+                import sys
+                sys.stderr.write(f"[WARNING] 无法创建文件处理器: {e}\n")
+                # 如果文件处理器创建失败,标记降级
+                self._fallback_to_console = True
+    
+    def _create_console_handler(self, console_config: dict) -> logging.StreamHandler:
+        """创建控制台处理器"""
+        handler = logging.StreamHandler()
+        handler.setLevel(getattr(logging, console_config.get('level', 'INFO').upper()))
+        
+        formatter = logging.Formatter(
+            console_config.get('format', '%(asctime)s [%(levelname)s] %(name)s: %(message)s'),
+            datefmt='%Y-%m-%d %H:%M:%S'
+        )
+        handler.setFormatter(formatter)
+        return handler
+    
+    def _create_file_handler(self, file_config: dict, module: str) -> logging.Handler:
+        """创建文件处理器(支持自动轮转)"""
+        log_file = self.base_log_dir / file_config.get('filename', f'{module}.log')
+        
+        # 使用RotatingFileHandler实现自动轮转和清理
+        rotation_config = file_config.get('rotation', {})
+        if rotation_config.get('enabled', False):
+            handler = logging.handlers.RotatingFileHandler(
+                log_file,
+                maxBytes=self._parse_size(rotation_config.get('max_size', '50MB')),
+                backupCount=rotation_config.get('backup_count', 10),
+                encoding='utf-8'
+            )
+        else:
+            handler = logging.FileHandler(log_file, encoding='utf-8')
+        
+        handler.setLevel(getattr(logging, file_config.get('level', 'DEBUG').upper()))
+        
+        formatter = logging.Formatter(
+            file_config.get('format', '%(asctime)s [%(levelname)s] [%(name)s] %(filename)s:%(lineno)d - %(message)s'),
+            datefmt='%Y-%m-%d %H:%M:%S'
+        )
+        handler.setFormatter(formatter)
+        return handler
+    
+    def _parse_size(self, size_str: str) -> int:
+        """解析大小字符串,如 '50MB' -> 字节数"""
+        size_str = size_str.upper()
+        if size_str.endswith('KB'):
+            return int(size_str[:-2]) * 1024
+        elif size_str.endswith('MB'):
+            return int(size_str[:-2]) * 1024 * 1024
+        elif size_str.endswith('GB'):
+            return int(size_str[:-2]) * 1024 * 1024 * 1024
+        else:
+            return int(size_str) 
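Note: rotation thresholds in the YAML are human-readable strings; _parse_size turns them into the byte counts handed to RotatingFileHandler. A worked check of the same arithmetic (restated standalone for illustration; the authoritative logic is _parse_size above):

    def parse_size(size_str: str) -> int:
        size_str = size_str.upper()
        if size_str.endswith("KB"):
            return int(size_str[:-2]) * 1024
        if size_str.endswith("MB"):
            return int(size_str[:-2]) * 1024 * 1024
        if size_str.endswith("GB"):
            return int(size_str[:-2]) * 1024 * 1024 * 1024
        return int(size_str)

    assert parse_size("50MB") == 52_428_800   # app.log rotates at ~50 MiB, keeping 10 backups
    assert parse_size("20MB") == 20_971_520   # vanna.log rotates at ~20 MiB, keeping 5 backups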

+ 11 - 7
core/vanna_llm_factory.py

@@ -4,6 +4,10 @@ Vanna LLM 工厂文件,支持多种LLM提供商和向量数据库
 import app_config, os
 from core.embedding_function import get_embedding_function
 from common.vanna_combinations import get_vanna_class, print_available_combinations
+from core.logging import get_vanna_logger
+
+# 初始化日志
+logger = get_vanna_logger("VannaFactory")
 
 def create_vanna_instance(config_module=None):
     """
@@ -48,11 +52,11 @@ def create_vanna_instance(config_module=None):
         vector_db_type = model_info["vector_db"].lower()
         
         cls = get_vanna_class(llm_type, vector_db_type)
-        print(f"创建{llm_type.upper()}+{vector_db_type.upper()}实例")
+        logger.info(f"创建{llm_type.upper()}+{vector_db_type.upper()}实例")
         
     except ValueError as e:
-        print(f"错误: {e}")
-        print("\n可用的组合:")
+        logger.error(f"{e}")
+        logger.info("可用的组合:")
         print_available_combinations()
         raise
     
@@ -62,24 +66,24 @@ def create_vanna_instance(config_module=None):
     # 配置向量数据库
     if model_info["vector_db"] == "chromadb":
         config["path"] = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # 返回项目根目录
-        print(f"已配置使用ChromaDB,路径:{config['path']}")
+        logger.info(f"已配置使用ChromaDB,路径:{config['path']}")
     elif model_info["vector_db"] == "pgvector":
         # 构建PostgreSQL连接字符串
         connection_string = f"postgresql://{vector_db_config['user']}:{vector_db_config['password']}@{vector_db_config['host']}:{vector_db_config['port']}/{vector_db_config['dbname']}"
         config["connection_string"] = connection_string
-        print(f"已配置使用PgVector,连接字符串: {connection_string}")
+        logger.info(f"已配置使用PgVector,连接字符串: {connection_string}")
     
     # 配置embedding函数
     embedding_function = get_embedding_function()
     config["embedding_function"] = embedding_function
-    print(f"已配置使用{model_info['embedding_type'].upper()}嵌入模型: {model_info['embedding_model']}")
+    logger.info(f"已配置使用{model_info['embedding_type'].upper()}嵌入模型: {model_info['embedding_model']}")
     
     # 创建实例
     vn = cls(config=config)
 
     # 连接到业务数据库
     vn.connect_to_postgres(**config_module.APP_DB_CONFIG)           
-    print(f"已连接到业务数据库: "
+    logger.info(f"已连接到业务数据库: "
           f"{config_module.APP_DB_CONFIG['host']}:"
           f"{config_module.APP_DB_CONFIG['port']}/"
           f"{config_module.APP_DB_CONFIG['dbname']}")

+ 17 - 13
customembedding/ollama_embedding.py

@@ -2,6 +2,7 @@ import requests
 import time
 import numpy as np
 from typing import List, Callable
+from core.logging import get_vanna_logger
 
 class OllamaEmbeddingFunction:
     def __init__(self, model_name: str, base_url: str, embedding_dimension: int):
@@ -10,6 +11,9 @@ class OllamaEmbeddingFunction:
         self.embedding_dimension = embedding_dimension
         self.max_retries = 3
         self.retry_interval = 2
+        
+        # 初始化日志
+        self.logger = get_vanna_logger("OllamaEmbedding")
 
     def __call__(self, input) -> List[List[float]]:
         """为文本列表生成嵌入向量"""
@@ -22,7 +26,7 @@ class OllamaEmbeddingFunction:
                 embedding = self.generate_embedding(text)
                 embeddings.append(embedding)
             except Exception as e:
-                print(f"获取embedding时出错: {e}")
+                self.logger.error(f"获取embedding时出错: {e}")
                 embeddings.append([0.0] * self.embedding_dimension)
                 
         return embeddings
@@ -37,10 +41,10 @@ class OllamaEmbeddingFunction:
     
     def generate_embedding(self, text: str) -> List[float]:
         """为单个文本生成嵌入向量"""
-        print(f"生成Ollama嵌入向量,文本长度: {len(text)} 字符")
+        self.logger.debug(f"生成Ollama嵌入向量,文本长度: {len(text)} 字符")
         
         if not text or len(text.strip()) == 0:
-            print("输入文本为空,返回零向量")
+            self.logger.debug("输入文本为空,返回零向量")
             return [0.0] * self.embedding_dimension
 
         url = f"{self.base_url}/api/embeddings"
@@ -60,13 +64,13 @@ class OllamaEmbeddingFunction:
                 
                 if response.status_code != 200:
                     error_msg = f"Ollama API请求错误: {response.status_code}, {response.text}"
-                    print(error_msg)
+                    self.logger.error(error_msg)
                     
                     if response.status_code in (429, 500, 502, 503, 504):
                         retries += 1
                         if retries <= self.max_retries:
                             wait_time = self.retry_interval * (2 ** (retries - 1))
-                            print(f"等待 {wait_time} 秒后重试 ({retries}/{self.max_retries})")
+                            self.logger.info(f"等待 {wait_time} 秒后重试 ({retries}/{self.max_retries})")
                             time.sleep(wait_time)
                             continue
                     
@@ -80,7 +84,7 @@ class OllamaEmbeddingFunction:
                     # 验证向量维度
                     actual_dim = len(vector)
                     if actual_dim != self.embedding_dimension:
-                        print(f"向量维度不匹配: 期望 {self.embedding_dimension}, 实际 {actual_dim}")
+                        self.logger.debug(f"向量维度不匹配: 期望 {self.embedding_dimension}, 实际 {actual_dim}")
                         # 如果维度不匹配,可以选择截断或填充
                         if actual_dim > self.embedding_dimension:
                             vector = vector[:self.embedding_dimension]
@@ -88,23 +92,23 @@ class OllamaEmbeddingFunction:
                             vector.extend([0.0] * (self.embedding_dimension - actual_dim))
                     
                     # 添加成功生成embedding的debug日志
-                    print(f"[DEBUG] ✓ 成功生成Ollama embedding向量,维度: {len(vector)}")
+                    self.logger.debug(f"✓ 成功生成Ollama embedding向量,维度: {len(vector)}")
                     return vector
                 else:
                     error_msg = f"Ollama API返回格式异常: {result}"
-                    print(error_msg)
+                    self.logger.error(error_msg)
                     raise ValueError(error_msg)
                 
             except Exception as e:
-                print(f"生成Ollama embedding时出错: {str(e)}")
+                self.logger.error(f"生成Ollama embedding时出错: {str(e)}")
                 retries += 1
                 
                 if retries <= self.max_retries:
                     wait_time = self.retry_interval * (2 ** (retries - 1))
-                    print(f"等待 {wait_time} 秒后重试 ({retries}/{self.max_retries})")
+                    self.logger.info(f"等待 {wait_time} 秒后重试 ({retries}/{self.max_retries})")
                     time.sleep(wait_time)
                 else:
-                    print(f"已达到最大重试次数 ({self.max_retries}),生成embedding失败")
+                    self.logger.error(f"已达到最大重试次数 ({self.max_retries}),生成embedding失败")
                     return [0.0] * self.embedding_dimension
         
         raise RuntimeError("生成Ollama embedding失败")
@@ -121,8 +125,8 @@ class OllamaEmbeddingFunction:
         }
         
         try:
-            print(f"测试Ollama嵌入模型连接 - 模型: {self.model_name}")
-            print(f"Ollama服务地址: {self.base_url}")
+            self.logger.info(f"测试Ollama嵌入模型连接 - 模型: {self.model_name}")
+            self.logger.info(f"Ollama服务地址: {self.base_url}")
             
             vector = self.generate_embedding(test_text)
             actual_dimension = len(vector)
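Note: both embedding clients (core/embedding_function.py earlier and this Ollama variant) retry transient failures with the same exponential backoff, wait = retry_interval * 2 ** (retries - 1); they differ in that the core client re-raises after exhausting retries while the Ollama client falls back to a zero vector. A tiny check of the schedule implied by the defaults in the diff (retry_interval=2, max_retries=3):

    retry_interval, max_retries = 2, 3
    schedule = [retry_interval * 2 ** (r - 1) for r in range(1, max_retries + 1)]
    assert schedule == [2, 4, 8]   # seconds waited before retries 1, 2 and 3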

+ 120 - 151
customllm/base_llm_chat.py

@@ -4,8 +4,11 @@ from typing import List, Dict, Any, Optional, Union, Tuple
 import pandas as pd
 import plotly.graph_objs
 from vanna.base import VannaBase
+from core.logging import get_vanna_logger
 # 导入配置参数
 from app_config import REWRITE_QUESTION_ENABLED, DISPLAY_RESULT_THINKING
+# 导入提示词加载器
+from .load_prompts import get_prompt_loader
 
 
 class BaseLLMChat(VannaBase, ABC):
@@ -14,18 +17,24 @@ class BaseLLMChat(VannaBase, ABC):
     def __init__(self, config=None):
         VannaBase.__init__(self, config=config)
 
+        # 初始化日志
+        self.logger = get_vanna_logger("BaseLLMChat")
+
         # 存储LLM解释性文本
         self.last_llm_explanation = None
         
-        print("传入的 config 参数如下:")
+        # 初始化提示词加载器
+        self.prompt_loader = get_prompt_loader()
+        
+        self.logger.info("传入的 config 参数如下:")
         for key, value in self.config.items():
-            print(f"  {key}: {value}")
+            self.logger.info(f"  {key}: {value}")
         
         # 默认参数
         self.temperature = 0.7
         
         if "temperature" in config:
-            print(f"temperature is changed to: {config['temperature']}")
+            self.logger.info(f"temperature is changed to: {config['temperature']}")
             self.temperature = config["temperature"]
         
         # 加载错误SQL提示配置
@@ -36,36 +45,66 @@ class BaseLLMChat(VannaBase, ABC):
         try:
             import app_config
             enable_error_sql = getattr(app_config, 'ENABLE_ERROR_SQL_PROMPT', False)
-            print(f"[DEBUG] 错误SQL提示配置: ENABLE_ERROR_SQL_PROMPT = {enable_error_sql}")
+            self.logger.debug(f"错误SQL提示配置: ENABLE_ERROR_SQL_PROMPT = {enable_error_sql}")
             return enable_error_sql
         except (ImportError, AttributeError) as e:
-            print(f"[WARNING] 无法加载错误SQL提示配置: {e},使用默认值 False")
+            self.logger.warning(f"无法加载错误SQL提示配置: {e},使用默认值 False")
             return False
 
+    def log(self, message: str, title: str = "Info"):
+        """
+        重写父类的log方法,使用项目的日志系统替代print输出
+        
+        Args:
+            message: 日志消息
+            title: 日志标题
+        """
+        # 将Vanna的log输出转换为项目的日志格式
+        if title == "SQL Prompt":
+            # 对于SQL Prompt,使用debug级别,避免输出过长的内容
+            # 将列表格式转换为字符串,只显示前200个字符
+            if isinstance(message, list):
+                message_str = str(message)[:200] + "..." if len(str(message)) > 200 else str(message)
+            else:
+                message_str = str(message)[:200] + "..." if len(str(message)) > 200 else str(message)
+            self.logger.debug(f"[Vanna] {title}: {message_str}")
+        elif title == "LLM Response":
+            # 对于LLM响应,记录但不显示全部内容
+            if isinstance(message, str):
+                message_str = message[:200] + "..." if len(message) > 200 else message
+            else:
+                message_str = str(message)[:200] + "..." if len(str(message)) > 200 else str(message)
+            self.logger.debug(f"[Vanna] {title}: {message_str}")
+        elif title == "Extracted SQL":
+            # 对于提取的SQL,使用info级别
+            self.logger.info(f"[Vanna] {title}: {message}")
+        else:
+            # 其他日志使用info级别
+            self.logger.info(f"[Vanna] {title}: {message}")
+
     def system_message(self, message: str) -> dict:
         """创建系统消息格式"""
-        print(f"system_content: {message}")
+        self.logger.debug(f"system_content: {message}")
         return {"role": "system", "content": message}
 
     def user_message(self, message: str) -> dict:
         """创建用户消息格式"""
-        print(f"\nuser_content: {message}")
+        self.logger.debug(f"\nuser_content: {message}")
         return {"role": "user", "content": message}
 
     def assistant_message(self, message: str) -> dict:
         """创建助手消息格式"""
-        print(f"assistant_content: {message}")
+        self.logger.debug(f"assistant_content: {message}")
         return {"role": "assistant", "content": message}
 
     def get_sql_prompt(self, initial_prompt: str, question: str, question_sql_list: list, ddl_list: list, doc_list: list, **kwargs):
         """
         基于VannaBase源码实现,在第7点添加中文别名指令
         """
-        print(f"[DEBUG] 开始生成SQL提示词,问题: {question}")
+        self.logger.debug(f"开始生成SQL提示词,问题: {question}")
         
         if initial_prompt is None:
-            initial_prompt = f"You are a {self.dialect} expert. " + \
-            "Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions."
+            initial_prompt = self.prompt_loader.get_sql_initial_prompt(self.dialect)
 
         # 提取DDL内容(适配新的字典格式)
         ddl_content_list = []
@@ -101,7 +140,7 @@ class BaseLLMChat(VannaBase, ABC):
             try:
                 error_sql_list = self.get_related_error_sql(question, **kwargs)
                 if error_sql_list:
-                    print(f"[DEBUG] 找到 {len(error_sql_list)} 个相关的错误SQL示例")
+                    self.logger.debug(f"找到 {len(error_sql_list)} 个相关的错误SQL示例")
                     
                     # 构建格式化的负面提示内容
                     negative_prompt_content = "===Negative Examples\n"
@@ -110,44 +149,24 @@ class BaseLLMChat(VannaBase, ABC):
                     for i, error_example in enumerate(error_sql_list, 1):
                         if "question" in error_example and "sql" in error_example:
                             similarity = error_example.get('similarity', 'N/A')
-                            print(f"[DEBUG] 错误SQL示例 {i}: 相似度={similarity}")
+                            self.logger.debug(f"错误SQL示例 {i}: 相似度={similarity}")
                             negative_prompt_content += f"问题: {error_example['question']}\n"
                             negative_prompt_content += f"错误的SQL: {error_example['sql']}\n\n"
                     
                     # 将负面提示添加到初始提示中
                     initial_prompt += negative_prompt_content
                 else:
-                    print("[DEBUG] 未找到相关的错误SQL示例")
+                    self.logger.debug("未找到相关的错误SQL示例")
             except Exception as e:
-                print(f"[WARNING] 获取错误SQL示例失败: {e}")
-
-        initial_prompt += (
-            "===Response Guidelines \n"
-            "1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. \n"
-            "2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql \n"
-            "3. If the provided context is insufficient, please explain why it can't be generated. \n"
-            "4. Please use the most relevant table(s). \n"
-            "5. If the question has been asked and answered before, please repeat the answer exactly as it was given before. \n"
-            f"6. Ensure that the output SQL is {self.dialect}-compliant and executable, and free of syntax errors. \n"
-            "7. 在生成 SQL 查询时,如果出现 ORDER BY 子句,请遵循以下规则:\n"
-            "   - 对所有的排序字段(如聚合字段 SUM()、普通列等),请在 ORDER BY 中显式添加 NULLS LAST。\n"
-            "   - 不论是否使用 LIMIT,只要排序字段存在,都必须添加 NULLS LAST,以防止 NULL 排在结果顶部。\n"
-            "   - 示例参考:\n"
-            "     - ORDER BY total DESC NULLS LAST\n"
-            "     - ORDER BY zf_order DESC NULLS LAST\n"
-            "     - ORDER BY SUM(c.customer_count) DESC NULLS LAST \n"
-            "8. 【重要】请在SQL查询中为所有SELECT的列都使用中文别名:\n"
-            "   - 每个列都必须使用 AS 中文别名 的格式,没有例外\n"
-            "   - 包括原始字段名也要添加中文别名,例如:SELECT gender AS 性别, card_category AS 卡片类型\n"
-            "   - 计算字段也要有中文别名,例如:SELECT COUNT(*) AS 持卡人数\n"
-            "   - 中文别名要准确反映字段的业务含义"
-        )
+                self.logger.warning(f"获取错误SQL示例失败: {e}")
+
+        initial_prompt += self.prompt_loader.get_sql_response_guidelines(self.dialect)
 
         message_log = [self.system_message(initial_prompt)]
 
         for example in question_sql_list:
             if example is None:
-                print("example is None")
+                self.logger.warning("example is None")
             else:
                 if example is not None and "question" in example and "sql" in example:
                     message_log.append(self.user_message(example["question"]))
@@ -161,57 +180,15 @@ class BaseLLMChat(VannaBase, ABC):
         """
         重写父类方法,添加明确的中文图表指令
         """
-        # 构建更智能的中文图表指令,根据问题和数据内容生成有意义的标签
-        chinese_chart_instructions = (
-            "使用中文创建图表,要求:\n"
-            "1. 根据用户问题和数据内容,为图表生成有意义的中文标题\n"
-            "2. 根据数据列的实际含义,为X轴和Y轴生成准确的中文标签\n"
-            "3. 如果有图例,确保图例标签使用中文\n"
-            "4. 所有文本(包括标题、轴标签、图例、数据标签等)都必须使用中文\n"
-            "5. 标题应该简洁明了地概括图表要展示的内容\n"
-            "6. 轴标签应该准确反映对应数据列的业务含义\n"
-            "7. 选择最适合数据特点的图表类型(柱状图、折线图、饼图等)"
+        # 构建系统消息
+        system_msg = self.prompt_loader.get_chart_system_message(
+            question=question,
+            sql=sql,
+            df_metadata=df_metadata
         )
 
-        # 构建父类方法要求的message_log
-        system_msg_parts = []
-
-        if question:
-            system_msg_parts.append(
-                f"用户问题:'{question}'"
-            )
-            system_msg_parts.append(
-                f"以下是回答用户问题的pandas DataFrame数据:"
-            )
-        else:
-            system_msg_parts.append("以下是一个pandas DataFrame数据:")
-
-        if sql:
-            system_msg_parts.append(f"数据来源SQL查询:\n{sql}")
-
-        system_msg_parts.append(f"DataFrame结构信息:\n{df_metadata}")
-
-        system_msg = "\n\n".join(system_msg_parts)
-
-        # 构建更详细的用户消息,强调中文标签的重要性
-        user_msg = (
-            "请为这个DataFrame生成Python Plotly可视化代码。要求:\n\n"
-            "1. 假设数据存储在名为'df'的pandas DataFrame中\n"
-            "2. 如果DataFrame只有一个值,使用Indicator图表\n"
-            "3. 只返回Python代码,不要任何解释\n"
-            "4. 代码必须可以直接运行\n\n"
-            f"{chinese_chart_instructions}\n\n"
-            "特别注意:\n"
-            "- 不要使用'图表标题'、'X轴标签'、'Y轴标签'这样的通用标签\n"
-            "- 要根据实际数据内容和用户问题生成具体、有意义的中文标签\n"
-            "- 例如:如果是性别统计,X轴可能是'性别',Y轴可能是'人数'或'占比'\n"
-            "- 标题应该概括图表的主要内容,如'男女持卡比例分布'\n\n"
-            "数据标签和悬停信息要求:\n"
-            "- 不要使用%{text}这样的占位符变量\n"
-            "- 使用具体的数据值和中文单位,例如:text=df['列名'].astype(str) + '人'\n"
-            "- 悬停信息要清晰易懂,使用中文描述\n"
-            "- 确保所有显示的文本都是实际的数据值,不是变量占位符"
-        )
+        # 构建用户消息
+        user_msg = self.prompt_loader.get_chart_user_message()
 
         message_log = [
             self.system_message(system_msg),
@@ -225,7 +202,7 @@ class BaseLLMChat(VannaBase, ABC):
         if not DISPLAY_RESULT_THINKING:
             original_code = plotly_code
             plotly_code = self._remove_thinking_content(plotly_code)
-            print(f"[DEBUG] generate_plotly_code隐藏thinking内容 - 原始长度: {len(original_code)}, 处理后长度: {len(plotly_code)}")
+            self.logger.debug(f"generate_plotly_code隐藏thinking内容 - 原始长度: {len(original_code)}, 处理后长度: {len(plotly_code)}")
 
         return self._sanitize_plotly_code(self._extract_python_code(plotly_code))
 
@@ -270,12 +247,12 @@ class BaseLLMChat(VannaBase, ABC):
         对于Flask应用,这个方法决定了前端是否显示图表生成按钮
         """
         if df is None or df.empty:
-            print(f"[DEBUG] should_generate_chart: df为空,返回False")
+            self.logger.debug("should_generate_chart: df为空,返回False")
             return False
         
         # 如果数据有多行或多列,通常适合生成图表
         result = len(df) > 1 or len(df.columns) > 1
-        print(f"[DEBUG] should_generate_chart: df.shape={df.shape}, 返回{result}")
+        self.logger.debug(f"should_generate_chart: df.shape={df.shape}, 返回{result}")
         
         if result:
             return True
@@ -290,12 +267,12 @@ class BaseLLMChat(VannaBase, ABC):
             # 清空上次的解释性文本
             self.last_llm_explanation = None
             
-            print(f"[DEBUG] 尝试为问题生成SQL: {question}")
+            self.logger.debug(f"尝试为问题生成SQL: {question}")
             # 调用父类的 generate_sql
             sql = super().generate_sql(question, **kwargs)
             
             if not sql or sql.strip() == "":
-                print(f"[WARNING] 生成的SQL为空")
+                self.logger.warning("生成的SQL为空")
                 explanation = "无法生成SQL查询,可能是问题描述不够清晰或缺少必要的数据表信息。"
                 # 根据 DISPLAY_RESULT_THINKING 参数处理thinking内容
                 if not DISPLAY_RESULT_THINKING:
@@ -311,45 +288,46 @@ class BaseLLMChat(VannaBase, ABC):
             
             # 检查是否包含错误提示信息
             error_indicators = [
-                "insufficient context", "无法生成", "sorry", "cannot", "不能",
+                "insufficient context", "无法生成", "sorry", "cannot generate", "cannot", "不能",
                 "no relevant", "no suitable", "unable to", "无法", "抱歉",
-                "i don't have", "i cannot", "没有相关", "找不到", "不存在"
+                "i don't have", "i cannot", "没有相关", "找不到", "不存在", "上下文不足",
+                "没有直接存储", "无法直接查询", "没有存储", "not enough information", "unclear"
             ]
             
             for indicator in error_indicators:
                 if indicator in sql_lower:
-                    print(f"[WARNING] LLM返回错误信息而非SQL: {sql}")
+                    self.logger.warning(f"LLM返回错误信息而非SQL: {sql}")
                     # 保存LLM的解释性文本,并根据配置处理thinking内容
                     explanation = sql
                     if not DISPLAY_RESULT_THINKING:
                         explanation = self._remove_thinking_content(explanation)
-                        print(f"[DEBUG] 隐藏thinking内容 - SQL生成解释性文本")
+                        self.logger.debug("隐藏thinking内容 - SQL生成解释性文本")
                     self.last_llm_explanation = explanation
                     return None
             
             # 简单检查是否像SQL语句(至少包含一些SQL关键词)
             sql_keywords = ["select", "insert", "update", "delete", "with", "from", "where"]
             if not any(keyword in sql_lower for keyword in sql_keywords):
-                print(f"[WARNING] 返回内容不像有效SQL: {sql}")
+                self.logger.warning(f"返回内容不像有效SQL: {sql}")
                 # 保存LLM的解释性文本,并根据配置处理thinking内容
                 explanation = sql
                 if not DISPLAY_RESULT_THINKING:
                     explanation = self._remove_thinking_content(explanation)
-                    print(f"[DEBUG] 隐藏thinking内容 - SQL生成非有效SQL内容")
+                    self.logger.debug("隐藏thinking内容 - SQL生成非有效SQL内容")
                 self.last_llm_explanation = explanation
                 return None
                 
-            print(f"[SUCCESS] 成功生成SQL:\n {sql}")
+            self.logger.info(f"成功生成SQL:\n {sql}")
             # 清空解释性文本
             self.last_llm_explanation = None
             return sql
             
         except Exception as e:
-            print(f"[ERROR] SQL生成过程中出现异常: {str(e)}")
-            print(f"[ERROR] 异常类型: {type(e).__name__}")
+            self.logger.error(f"SQL生成过程中出现异常: {str(e)}")
+            self.logger.error(f"异常类型: {type(e).__name__}")
             # 导入traceback以获取详细错误信息
             import traceback
-            print(f"[ERROR] 详细错误信息: {traceback.format_exc()}")
+            self.logger.error(f"详细错误信息: {traceback.format_exc()}")
             explanation = f"SQL生成过程中出现异常: {str(e)}"
             # 根据 DISPLAY_RESULT_THINKING 参数处理thinking内容
             if not DISPLAY_RESULT_THINKING:
@@ -361,7 +339,7 @@ class BaseLLMChat(VannaBase, ABC):
         """根据SQL生成中文问题"""
         prompt = [
             self.system_message(
-                "请你根据下方SQL语句推测用户的业务提问,只返回清晰的自然语言问题,不要包含任何解释或SQL内容,也不要出现表名,问题要使用中文,并以问号结尾。"
+                self.prompt_loader.get_question_generation_prompt()
             ),
             self.user_message(sql)
         ]
@@ -371,7 +349,7 @@ class BaseLLMChat(VannaBase, ABC):
         if not DISPLAY_RESULT_THINKING:
             original_response = response
             response = self._remove_thinking_content(response)
-            print(f"[DEBUG] generate_question隐藏thinking内容 - 原始长度: {len(original_response)}, 处理后长度: {len(response)}")
+            self.logger.debug(f"generate_question隐藏thinking内容 - 原始长度: {len(original_response)}, 处理后长度: {len(response)}")
         
         return response
 
@@ -389,7 +367,7 @@ class BaseLLMChat(VannaBase, ABC):
     #         response = self.submit_prompt(prompt, **kwargs)
     #         return response
     #     except Exception as e:
-    #         print(f"[ERROR] LLM对话失败: {str(e)}")
+    #         self.logger.error(f"LLM对话失败: {str(e)}")
     #         return f"抱歉,我暂时无法回答您的问题。请稍后再试。"
 
     def chat_with_llm(self, question: str, system_prompt: str = None, **kwargs) -> str:
@@ -405,9 +383,7 @@ class BaseLLMChat(VannaBase, ABC):
         try:
             # 如果没有提供自定义系统提示词,使用默认的
             if system_prompt is None:
-                system_prompt = (
-                    "你是一个友好的AI助手,请用中文回答用户的问题。"
-                )
+                system_prompt = self.prompt_loader.get_chat_default_prompt()
             
             prompt = [
                 self.system_message(system_prompt),
@@ -420,12 +396,12 @@ class BaseLLMChat(VannaBase, ABC):
             if not DISPLAY_RESULT_THINKING:
                 original_response = response
                 response = self._remove_thinking_content(response)
-                print(f"[DEBUG] chat_with_llm隐藏thinking内容 - 原始长度: {len(original_response)}, 处理后长度: {len(response)}")
+                self.logger.debug(f"chat_with_llm隐藏thinking内容 - 原始长度: {len(original_response)}, 处理后长度: {len(response)}")
             
             return response
             
         except Exception as e:
-            print(f"[ERROR] LLM对话失败: {str(e)}")
+            self.logger.error(f"LLM对话失败: {str(e)}")
             return f"抱歉,我暂时无法回答您的问题。请稍后再试。"
 
     def generate_rewritten_question(self, last_question: str, new_question: str, **kwargs) -> str:
@@ -442,19 +418,17 @@ class BaseLLMChat(VannaBase, ABC):
         """
         # 如果未启用合并功能或没有上一个问题,直接返回新问题
         if not REWRITE_QUESTION_ENABLED or last_question is None:
-            print(f"[DEBUG] 问题合并功能{'未启用' if not REWRITE_QUESTION_ENABLED else '上一个问题为空'},直接返回新问题")
+            self.logger.debug(f"问题合并功能{'未启用' if not REWRITE_QUESTION_ENABLED else '上一个问题为空'},直接返回新问题")
             return new_question
         
-        print(f"[DEBUG] 启用问题合并功能,尝试合并问题")
-        print(f"[DEBUG] 上一个问题: {last_question}")
-        print(f"[DEBUG] 新问题: {new_question}")
+        self.logger.debug("启用问题合并功能,尝试合并问题")
+        self.logger.debug(f"上一个问题: {last_question}")
+        self.logger.debug(f"新问题: {new_question}")
         
         try:
             prompt = [
                 self.system_message(
-                    "你的目标是将一系列相关的问题合并成一个单一的问题。如果第二个问题与第一个问题无关且完全独立,则返回第二个问题。"
-                    "只返回新的合并问题,不要添加任何额外的解释。该问题理论上应该能够用一个SQL语句来回答。"
-                    "请用中文回答。"
+                    self.prompt_loader.get_question_merge_prompt()
                 ),
                 self.user_message(f"第一个问题: {last_question}\n第二个问题: {new_question}")
             ]
@@ -465,13 +439,13 @@ class BaseLLMChat(VannaBase, ABC):
             if not DISPLAY_RESULT_THINKING:
                 original_question = rewritten_question
                 rewritten_question = self._remove_thinking_content(rewritten_question)
-                print(f"[DEBUG] generate_rewritten_question隐藏thinking内容 - 原始长度: {len(original_question)}, 处理后长度: {len(rewritten_question)}")
+                self.logger.debug(f"generate_rewritten_question隐藏thinking内容 - 原始长度: {len(original_question)}, 处理后长度: {len(rewritten_question)}")
             
-            print(f"[DEBUG] 合并后的问题: {rewritten_question}")
+            self.logger.debug(f"合并后的问题: {rewritten_question}")
             return rewritten_question
             
         except Exception as e:
-            print(f"[ERROR] 问题合并失败: {str(e)}")
+            self.logger.error(f"问题合并失败: {str(e)}")
             # 如果合并失败,返回新问题
             return new_question
 
@@ -493,28 +467,23 @@ class BaseLLMChat(VannaBase, ABC):
             
             # 确保 df 是 pandas DataFrame
             if not isinstance(df, pd.DataFrame):
-                print(f"[WARNING] df 不是 pandas DataFrame,类型: {type(df)}")
+                self.logger.warning(f"df 不是 pandas DataFrame,类型: {type(df)}")
                 return "无法生成摘要:数据格式不正确"
             
             if df.empty:
                 return "查询结果为空,无数据可供摘要。"
             
-            print(f"[DEBUG] 生成摘要 - 问题: {question}")
-            print(f"[DEBUG] DataFrame 形状: {df.shape}")
+            self.logger.debug(f"生成摘要 - 问题: {question}")
+            self.logger.debug(f"DataFrame 形状: {df.shape}")
             
             # 构建包含中文指令的系统消息
-            system_content = (
-                f"你是一个专业的数据分析助手。用户提出了问题:'{question}'\n\n"
-                f"以下是查询结果的 pandas DataFrame 数据:\n{df.to_markdown()}\n\n"
-                "请用中文进行思考和分析,并用中文回答。"
+            system_content = self.prompt_loader.get_summary_system_message(
+                question=question,
+                df_markdown=df.to_markdown()
             )
             
             # 构建用户消息,强调中文思考和回答
-            user_content = (
-                "请基于用户提出的问题,简要总结这些数据。要求:\n"             
-                "1. 只进行简要总结,不要添加额外的解释\n"
-                "2. 如果数据中有数字,请保留适当的精度\n"            
-            )
+            user_content = self.prompt_loader.get_summary_user_instructions()
             
             message_log = [
                 self.system_message(system_content),
@@ -530,15 +499,15 @@ class BaseLLMChat(VannaBase, ABC):
                 # 移除 <think></think> 标签及其内容
                 original_summary = summary
                 summary = self._remove_thinking_content(summary)
-                print(f"[DEBUG] 隐藏thinking内容 - 原始长度: {len(original_summary)}, 处理后长度: {len(summary)}")
+                self.logger.debug(f"隐藏thinking内容 - 原始长度: {len(original_summary)}, 处理后长度: {len(summary)}")
             
-            print(f"[DEBUG] 生成的摘要: {summary[:100]}...")
+            self.logger.debug(f"生成的摘要: {summary[:100]}...")
             return summary
             
         except Exception as e:
-            print(f"[ERROR] 生成摘要失败: {str(e)}")
+            self.logger.error(f"生成摘要失败: {str(e)}")
             import traceback
-            print(f"[ERROR] 详细错误信息: {traceback.format_exc()}")
+            self.logger.error(f"详细错误信息: {traceback.format_exc()}")
             return f"生成摘要时出现错误:{str(e)}"
 
     def _remove_thinking_content(self, text: str) -> str:
@@ -597,7 +566,7 @@ class BaseLLMChat(VannaBase, ABC):
         try:
             sql = self.generate_sql(question=question, allow_llm_to_see_data=allow_llm_to_see_data)
         except Exception as e:
-            print(e)
+            self.logger.error(f"SQL generation error: {e}")
             self.last_llm_explanation = str(e)
             if print_results:
                 return None
@@ -607,7 +576,7 @@ class BaseLLMChat(VannaBase, ABC):
         # 如果SQL为空,说明有解释性文本,按照正常流程返回None
         # API层会检查 last_llm_explanation 来获取解释
         if sql is None:
-            print(f"[INFO] 无法生成SQL,解释: {self.last_llm_explanation}")
+            self.logger.info(f"无法生成SQL,解释: {self.last_llm_explanation}")
             if print_results:
                 return None
             else:
@@ -615,10 +584,10 @@ class BaseLLMChat(VannaBase, ABC):
 
         # 以下是正常的SQL执行流程(保持VannaBase原有逻辑)
         if print_results:
-            print(sql)
+            self.logger.info(f"Generated SQL: {sql}")
 
         if self.run_sql_is_set is False:
-            print("If you want to run the SQL query, connect to a database first.")
+            self.logger.info("If you want to run the SQL query, connect to a database first.")
             if print_results:
                 return None
             else:
@@ -628,7 +597,7 @@ class BaseLLMChat(VannaBase, ABC):
             df = self.run_sql(sql)
             
             if df is None:
-                print("The SQL query returned no results.")
+                self.logger.info("The SQL query returned no results.")
                 if print_results:
                     return None
                 else:
@@ -637,17 +606,17 @@ class BaseLLMChat(VannaBase, ABC):
             if print_results:
                 # 显示结果表格
                 if len(df) > 10:
-                    print(df.head(10).to_string())
-                    print(f"... ({len(df)} rows)")
+                    self.logger.info(f"Query results (first 10 rows):\n{df.head(10).to_string()}")
+                    self.logger.info(f"... ({len(df)} rows)")
                 else:
-                    print(df.to_string())
+                    self.logger.info(f"Query results:\n{df.to_string()}")
 
             # 如果启用了自动训练,添加问题-SQL对到训练集
             if auto_train:
                 try:
                     self.add_question_sql(question=question, sql=sql)
                 except Exception as e:
-                    print(f"Could not add question and sql to training data: {e}")
+                    self.logger.warning(f"Could not add question and sql to training data: {e}")
 
             if visualize:
                 try:
@@ -667,25 +636,25 @@ class BaseLLMChat(VannaBase, ABC):
                             )
                             if fig is not None:
                                 if print_results:
-                                    print("Chart generated (use fig.show() to display)")
+                                    self.logger.info("Chart generated (use fig.show() to display)")
                                 return sql, df, fig
                             else:
-                                print("Could not generate chart")
+                                self.logger.warning("Could not generate chart")
                                 return sql, df, None
                         else:
-                            print("No chart generated")
+                            self.logger.info("No chart generated")
                             return sql, df, None
                     else:
-                        print("Not generating chart for this data")
+                        self.logger.info("Not generating chart for this data")
                         return sql, df, None
                 except Exception as e:
-                    print(f"Couldn't generate chart: {e}")
+                    self.logger.error(f"Couldn't generate chart: {e}")
                     return sql, df, None
             else:
                 return sql, df, None
 
         except Exception as e:
-            print("Couldn't run sql: ", e)
+            self.logger.error(f"Couldn't run sql: {e}")
             if print_results:
                 return None
             else:

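The bulk of the base_llm_chat.py changes above swap ad-hoc `print()` calls for a per-instance `self.logger`. The diff does not show where that logger is created inside `BaseLLMChat`, but the pattern used elsewhere in this commit (PGVector, PromptLoader) suggests wiring like the sketch below; the logger name passed for `BaseLLMChat` is an assumption.

```python
# Sketch only: mirrors the logger wiring shown for PGVector and PromptLoader in this
# commit; the exact name passed for BaseLLMChat is an assumption.
from abc import ABC
from vanna.base import VannaBase
from core.logging import get_vanna_logger

class BaseLLMChat(VannaBase, ABC):
    def __init__(self, config=None):
        super().__init__(config=config)
        # one named logger per component, replacing the old print(f"[DEBUG] ...") calls
        self.logger = get_vanna_logger("BaseLLMChat")
```
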
+ 15 - 15
customllm/deepseek_chat.py

@@ -7,8 +7,8 @@ class DeepSeekChat(BaseLLMChat):
     """DeepSeek AI聊天实现"""
     
     def __init__(self, config=None):
-        print("...DeepSeekChat init...")
         super().__init__(config=config)
+        self.logger.info("DeepSeekChat init")
 
         if config is None:
             self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
@@ -43,7 +43,7 @@ class DeepSeekChat(BaseLLMChat):
         # DeepSeek API约束:enable_thinking=True时建议使用stream=True
         # 如果stream=False但enable_thinking=True,则忽略enable_thinking
         if enable_thinking and not stream_mode:
-            print("WARNING: enable_thinking=True 不生效,因为它需要 stream=True")
+            self.logger.warning("enable_thinking=True 不生效,因为它需要 stream=True")
             enable_thinking = False
 
         # 确定使用的模型
@@ -68,18 +68,18 @@ class DeepSeekChat(BaseLLMChat):
 
         # 模型兼容性提示(但不强制切换)
         if enable_thinking and model not in ["deepseek-reasoner"]:
-            print(f"提示:模型 {model} 可能不支持推理功能,推理相关参数将被忽略")
+            self.logger.warning(f"提示:模型 {model} 可能不支持推理功能,推理相关参数将被忽略")
 
-        print(f"\nUsing model {model} for {num_tokens} tokens (approx)")
-        print(f"Enable thinking: {enable_thinking}, Stream mode: {stream_mode}")
+        self.logger.info(f"\nUsing model {model} for {num_tokens} tokens (approx)")
+        self.logger.info(f"Enable thinking: {enable_thinking}, Stream mode: {stream_mode}")
 
         # 方案1:通过 system prompt 控制中文输出(DeepSeek 不支持 language 参数)
         # 检查配置中的语言设置,并在 system prompt 中添加中文指令
         # language_setting = self.config.get("language", "").lower() if self.config else ""
-        # print(f"DEBUG: language_setting='{language_setting}', model='{model}', enable_thinking={enable_thinking}")
+        # self.logger.debug(f"language_setting='{language_setting}', model='{model}', enable_thinking={enable_thinking}")
         
         # if language_setting == "chinese" and enable_thinking:
-        #     print("DEBUG: ✅ 触发中文指令添加")
+        #     self.logger.debug("触发中文指令添加")
         #     # 为推理模型添加中文思考指令
         #     chinese_instruction = {"role": "system", "content": "请用中文进行思考和回答。在推理过程中,请使用中文进行分析和思考。<think></think>之间也请使用中文"}
         #     # 如果第一条消息不是 system 消息,则添加中文指令
@@ -90,7 +90,7 @@ class DeepSeekChat(BaseLLMChat):
         #         existing_content = prompt[0]["content"]
         #         prompt[0]["content"] = f"{existing_content}\n\n请用中文进行思考和回答。在推理过程中,请使用中文进行分析和思考。<think></think>之间也请使用中文"
         # else:
-        #     print(f"DEBUG: ❌ 未触发中文指令 - language_setting==chinese: {language_setting == 'chinese'}, model==deepseek-reasoner: {model == 'deepseek-reasoner'}, enable_thinking: {enable_thinking}")
+        #     self.logger.debug(f"未触发中文指令 - language_setting==chinese: {language_setting == 'chinese'}, model==deepseek-reasoner: {model == 'deepseek-reasoner'}, enable_thinking: {enable_thinking}")
 
         # 构建 API 调用参数
         api_params = {
@@ -112,7 +112,7 @@ class DeepSeekChat(BaseLLMChat):
             unsupported_params = ['top_p', 'presence_penalty', 'frequency_penalty', 'logprobs', 'top_logprobs']
             for param in unsupported_params:
                 if param in filtered_kwargs:
-                    print(f"警告:deepseek-reasoner 不支持参数 {param},已忽略")
+                    self.logger.warning(f"deepseek-reasoner 不支持参数 {param},已忽略")
                     filtered_kwargs.pop(param, None)
         else:
             # deepseek-chat 等其他模型,只过滤明确会导致错误的参数
@@ -125,9 +125,9 @@ class DeepSeekChat(BaseLLMChat):
         if stream_mode:
             # 流式处理模式
             if model == "deepseek-reasoner" and enable_thinking:
-                print("使用流式处理模式,启用推理功能")
+                self.logger.info("使用流式处理模式,启用推理功能")
             else:
-                print("使用流式处理模式,常规聊天")
+                self.logger.info("使用流式处理模式,常规聊天")
             
             response_stream = self.client.chat.completions.create(**api_params)
             
@@ -151,7 +151,7 @@ class DeepSeekChat(BaseLLMChat):
                 # 可选:打印推理过程
                 if collected_reasoning:
                     reasoning_text = "".join(collected_reasoning)
-                    print("Model reasoning process:\n", reasoning_text)
+                    self.logger.debug("Model reasoning process:\n" + reasoning_text)
                 
                 # 方案2:返回包含 <think></think> 标签的完整内容,与 QianWen 保持一致
                 final_content = "".join(collected_content)
@@ -173,9 +173,9 @@ class DeepSeekChat(BaseLLMChat):
         else:
             # 非流式处理模式
             if model == "deepseek-reasoner" and enable_thinking:
-                print("使用非流式处理模式,启用推理功能")
+                self.logger.info("使用非流式处理模式,启用推理功能")
             else:
-                print("使用非流式处理模式,常规聊天")
+                self.logger.info("使用非流式处理模式,常规聊天")
             
             response = self.client.chat.completions.create(**api_params)
             
@@ -187,7 +187,7 @@ class DeepSeekChat(BaseLLMChat):
                 reasoning_content = ""
                 if hasattr(message, 'reasoning_content') and message.reasoning_content:
                     reasoning_content = message.reasoning_content
-                    print("Model reasoning process:\n", reasoning_content)
+                    self.logger.debug("Model reasoning process:\n" + reasoning_content)
                 
                 # 方案2:返回包含 <think></think> 标签的完整内容,与 QianWen 保持一致
                 final_content = message.content

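A quick illustration of the thinking/stream constraint enforced above: `enable_thinking` only takes effect together with `stream=True`, otherwise it is downgraded and a warning is logged. The config keys, model name, and prompt text below are assumptions made for the sketch; `system_message`, `user_message`, and `submit_prompt` are the methods used throughout this commit.

```python
# Illustrative call only; config keys and prompt contents are placeholders.
chat = DeepSeekChat(config={"api_key": "sk-...", "model": "deepseek-reasoner"})
prompt = [
    chat.system_message("You are a PostgreSQL expert."),
    chat.user_message("统计各服务区的总收入"),
]

# thinking honoured: stream=True with a reasoning-capable model
answer = chat.submit_prompt(prompt, enable_thinking=True, stream=True)

# thinking dropped with a warning: "enable_thinking=True 不生效,因为它需要 stream=True"
answer = chat.submit_prompt(prompt, enable_thinking=True, stream=False)
```
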
+ 116 - 0
customllm/llm_prompts.yaml

@@ -0,0 +1,116 @@
+# 提示词配置文件
+# 包含所有LLM交互使用的提示词模板
+# 用于customllm/base_llm_chat.py
+
+sql_generation:
+  # SQL生成的初始提示词
+  initial_prompt: |
+    You are a {dialect} expert. 
+    Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions.
+
+  # SQL生成的响应指南
+  response_guidelines: |
+    ===Response Guidelines 
+    **IMPORTANT**: All SQL queries MUST use Chinese aliases for ALL columns in SELECT clause.
+    
+    1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. 
+    2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql 
+    3. If the provided context is insufficient, please explain why it can't be generated. 
+    4. **Context Understanding**: If the question follows [CONTEXT]...[CURRENT] format, replace pronouns in [CURRENT] with specific entities from [CONTEXT].
+       - Example: If context mentions 'Nancheng Service Area has the most stalls', and current question is 'How many dining stalls does this service area have?', 
+         interpret it as 'How many dining stalls does Nancheng Service Area have?'
+    5. Please use the most relevant table(s). 
+    6. If the question has been asked and answered before, please repeat the answer exactly as it was given before. 
+    7. Ensure that the output SQL is {dialect}-compliant and executable, and free of syntax errors. 
+    8. Always add NULLS LAST to ORDER BY clauses to handle NULL values properly (e.g., ORDER BY total DESC NULLS LAST).
+    9. **MANDATORY**: ALL columns in SELECT must have Chinese aliases. This is non-negotiable:
+       - Every column MUST use AS with a Chinese alias
+       - Raw column names without aliases are NOT acceptable
+       - Examples: 
+         * CORRECT: SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总收入
+         * WRONG: SELECT service_name, SUM(pay_sum) AS total_revenue
+         * WRONG: SELECT service_name AS service_area, SUM(pay_sum) AS 总收入
+       - Common aliases: COUNT(*) AS 数量, SUM(...) AS 总计, AVG(...) AS 平均值, MAX(...) AS 最大值, MIN(...) AS 最小值
+
+chart_generation:
+  # Chart generation instructions
+  chinese_chart_instructions: |
+    Create charts with the following requirements:
+    1. Generate meaningful titles based on user questions and data content
+    2. Generate accurate labels for X-axis and Y-axis based on the actual meaning of data columns
+    3. If there are legends, ensure legend labels are descriptive
+    4. All text (including titles, axis labels, legends, data labels, etc.) must be clear and meaningful
+    5. Titles should concisely summarize what the chart is showing
+    6. Axis labels should accurately reflect the business meaning of corresponding data columns
+    7. Choose the most suitable chart type for the data characteristics (bar chart, line chart, pie chart, etc.)
+    8. All chart text must be in Chinese.
+
+  # System message template
+  system_message_template: |
+    User question: '{question}'
+    
+    Here is the pandas DataFrame data to answer the user's question:
+    
+    {sql_part}
+    
+    DataFrame structure information:
+    {df_metadata}
+
+  # User message template
+  user_message_template: |
+    Please generate Python Plotly visualization code for this DataFrame. Requirements:
+    
+    1. Assume the data is stored in a pandas DataFrame named 'df'
+    2. If the DataFrame has only one value, use an Indicator chart
+    3. Return only Python code without any explanations
+    4. The code must be directly executable
+    
+    {chinese_chart_instructions}
+    
+    Special notes:
+    - Do not use generic labels like 'Chart Title', 'X-axis Label', 'Y-axis Label'
+    - Generate specific, meaningful labels based on actual data content and user questions
+    - For example: if it's gender statistics, X-axis might be 'Gender', Y-axis might be 'Count' or 'Percentage'
+    - The title should summarize the main content of the chart, such as 'Gender Distribution of Cardholders'
+    
+    Data labels and hover information requirements:
+    - Do not use placeholder variables like %{text}
+    - Use specific data values and units, e.g.: text=df['column_name'].astype(str) + ' people'
+    - Hover information should be clear and easy to understand
+    - Ensure all displayed text is actual data values, not variable placeholders
+    
+    Please generate all text content in Chinese.
+
+question_generation:
+  # Generate question from SQL prompt
+  system_prompt: |
+    Based on the SQL statement below, infer the user's business question. Return only a clear natural language question without any explanations or SQL content. Do not include table names. The question should end with a question mark.
+    Please respond in Chinese.
+
+chat_with_llm:
+  # Default system prompt for chat conversations
+  default_system_prompt: |
+    You are a friendly AI assistant. Please respond in Chinese.
+
+question_merge:
+  # Question merging system prompt
+  system_prompt: |
+    Your goal is to merge a series of related questions into a single question. If the second question is unrelated and completely independent from the first question, return the second question.
+    Return only the new merged question without any additional explanations. The question should theoretically be answerable with a single SQL statement.
+    Please respond in Chinese.
+
+summary_generation:
+  # Summary generation system message
+  system_message_template: |
+    You are a professional data analysis assistant. The user asked: '{question}'
+    
+    Here is the pandas DataFrame data from the query results:{df_markdown}
+    
+    Please think and analyze in the context provided and respond accordingly.
+
+  # Summary generation user instructions
+  user_instructions: |
+    Based on the user's question, please briefly summarize this data. Requirements:
+    1. Provide only a brief summary without adding extra explanations
+    2. If there are numbers in the data, maintain appropriate precision
+    Please respond in Chinese. 

+ 112 - 0
customllm/llm_prompts_bak.yaml

@@ -0,0 +1,112 @@
+# 提示词配置文件
+# 包含所有LLM交互使用的提示词模板
+# 用于customllm/base_llm_chat.py
+
+sql_generation:
+  # SQL生成的初始提示词
+  initial_prompt: |
+    You are a {dialect} expert. 
+    Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions.
+
+  # SQL生成的响应指南
+  response_guidelines: |
+    ===Response Guidelines 
+    1. If the provided context is sufficient, please generate a valid SQL query without any explanations for the question. 
+    2. If the provided context is almost sufficient but requires knowledge of a specific string in a particular column, please generate an intermediate SQL query to find the distinct strings in that column. Prepend the query with a comment saying intermediate_sql 
+    3. If the provided context is insufficient, please explain why it can't be generated. 
+    4. **Context Understanding**: If the question follows [CONTEXT]...[CURRENT] format, replace pronouns in [CURRENT] with specific entities from [CONTEXT].
+       - Example: If context mentions 'Nancheng Service Area has the most stalls', and current question is 'How many dining stalls does this service area have?', 
+         interpret it as 'How many dining stalls does Nancheng Service Area have?'
+    5. Please use the most relevant table(s). 
+    6. If the question has been asked and answered before, please repeat the answer exactly as it was given before. 
+    7. Ensure that the output SQL is {dialect}-compliant and executable, and free of syntax errors. 
+    8. 在生成 SQL 查询时,如果出现 ORDER BY 子句,请遵循以下规则:
+       - 对所有的排序字段(如聚合字段 SUM()、普通列等),请在 ORDER BY 中显式添加 NULLS LAST。
+       - 不论是否使用 LIMIT,只要排序字段存在,都必须添加 NULLS LAST,以防止 NULL 排在结果顶部。
+       - 示例参考:
+         - ORDER BY total DESC NULLS LAST
+         - ORDER BY zf_order DESC NULLS LAST
+         - ORDER BY SUM(c.customer_count) DESC NULLS LAST 
+    9. 【重要】请在SQL查询中为所有SELECT的列都使用中文别名:
+       - 每个列都必须使用 AS 中文别名 的格式,没有例外
+       - 包括原始字段名也要添加中文别名,例如:SELECT gender AS 性别, card_category AS 卡片类型
+       - 计算字段也要有中文别名,例如:SELECT COUNT(*) AS 持卡人数
+       - 中文别名要准确反映字段的业务含义
+
+chart_generation:
+  # 中文图表指令
+  chinese_chart_instructions: |
+    使用中文创建图表,要求:
+    1. 根据用户问题和数据内容,为图表生成有意义的中文标题
+    2. 根据数据列的实际含义,为X轴和Y轴生成准确的中文标签
+    3. 如果有图例,确保图例标签使用中文
+    4. 所有文本(包括标题、轴标签、图例、数据标签等)都必须使用中文
+    5. 标题应该简洁明了地概括图表要展示的内容
+    6. 轴标签应该准确反映对应数据列的业务含义
+    7. 选择最适合数据特点的图表类型(柱状图、折线图、饼图等)
+
+  # 系统消息模板
+  system_message_template: |
+    用户问题:'{question}'
+    
+    以下是回答用户问题的pandas DataFrame数据:
+    
+    {sql_part}
+    
+    DataFrame结构信息:
+    {df_metadata}
+
+  # 用户消息模板
+  user_message_template: |
+    请为这个DataFrame生成Python Plotly可视化代码。要求:
+    
+    1. 假设数据存储在名为'df'的pandas DataFrame中
+    2. 如果DataFrame只有一个值,使用Indicator图表
+    3. 只返回Python代码,不要任何解释
+    4. 代码必须可以直接运行
+    
+    {chinese_chart_instructions}
+    
+    特别注意:
+    - 不要使用'图表标题'、'X轴标签'、'Y轴标签'这样的通用标签
+    - 要根据实际数据内容和用户问题生成具体、有意义的中文标签
+    - 例如:如果是性别统计,X轴可能是'性别',Y轴可能是'人数'或'占比'
+    - 标题应该概括图表的主要内容,如'男女持卡比例分布'
+    
+    数据标签和悬停信息要求:
+    - 不要使用%{text}这样的占位符变量
+    - 使用具体的数据值和中文单位,例如:text=df['列名'].astype(str) + '人'
+    - 悬停信息要清晰易懂,使用中文描述
+    - 确保所有显示的文本都是实际的数据值,不是变量占位符
+
+question_generation:
+  # 根据SQL生成问题的提示词
+  system_prompt: |
+    请你根据下方SQL语句推测用户的业务提问,只返回清晰的自然语言问题,不要包含任何解释或SQL内容,也不要出现表名,问题要使用中文,并以问号结尾。
+
+chat_with_llm:
+  # 聊天对话的默认系统提示词
+  default_system_prompt: |
+    你是一个友好的AI助手,请用中文回答用户的问题。
+
+question_merge:
+  # 问题合并的系统提示词
+  system_prompt: |
+    你的目标是将一系列相关的问题合并成一个单一的问题。如果第二个问题与第一个问题无关且完全独立,则返回第二个问题。
+    只返回新的合并问题,不要添加任何额外的解释。该问题理论上应该能够用一个SQL语句来回答。
+    请用中文回答。
+
+summary_generation:
+  # 摘要生成的系统消息
+  system_message_template: |
+    你是一个专业的数据分析助手。用户提出了问题:'{question}'
+    
+    以下是查询结果的 pandas DataFrame 数据:{df_markdown}
+    
+    请用中文进行思考和分析,并用中文回答。
+
+  # 摘要生成的用户提示词
+  user_instructions: |
+    请基于用户提出的问题,简要总结这些数据。要求:
+    1. 只进行简要总结,不要添加额外的解释
+    2. 如果数据中有数字,请保留适当的精度 

+ 169 - 0
customllm/load_prompts.py

@@ -0,0 +1,169 @@
+"""
+提示词加载器
+用于从yaml文件中加载LLM提示词配置
+"""
+import os
+import yaml
+from typing import Dict, Any
+from core.logging import get_vanna_logger
+
+
+class PromptLoader:
+    """提示词加载器类"""
+    
+    def __init__(self, config_path: str = None):
+        """
+        初始化提示词加载器
+        
+        Args:
+            config_path: yaml配置文件路径,默认为当前目录下的llm_prompts.yaml
+        """
+        self.logger = get_vanna_logger("PromptLoader")
+        
+        if config_path is None:
+            # 默认配置文件路径
+            current_dir = os.path.dirname(os.path.abspath(__file__))
+            config_path = os.path.join(current_dir, "llm_prompts.yaml")
+        
+        self.config_path = config_path
+        self._prompts = None
+        self._load_prompts()
+    
+    def _load_prompts(self):
+        """从yaml文件加载提示词配置"""
+        try:
+            with open(self.config_path, 'r', encoding='utf-8') as file:
+                self._prompts = yaml.safe_load(file)
+            self.logger.debug(f"成功加载提示词配置: {self.config_path}")
+        except FileNotFoundError:
+            self.logger.error(f"提示词配置文件未找到: {self.config_path}")
+            self._prompts = {}
+        except yaml.YAMLError as e:
+            self.logger.error(f"解析yaml配置文件失败: {e}")
+            self._prompts = {}
+        except Exception as e:
+            self.logger.error(f"加载提示词配置时出现未知错误: {e}")
+            self._prompts = {}
+    
+    def get_prompt(self, category: str, key: str, **kwargs) -> str:
+        """
+        获取指定的提示词
+        
+        Args:
+            category: 提示词类别 (如 'sql_generation', 'chart_generation' 等)
+            key: 提示词键名 (如 'initial_prompt', 'response_guidelines' 等)
+            **kwargs: 用于格式化提示词的变量
+            
+        Returns:
+            str: 格式化后的提示词,如果找不到则返回空字符串
+        """
+        try:
+            if category not in self._prompts:
+                self.logger.warning(f"未找到提示词类别: {category}")
+                return ""
+            
+            if key not in self._prompts[category]:
+                self.logger.warning(f"未找到提示词键: {category}.{key}")
+                return ""
+            
+            prompt_template = self._prompts[category][key]
+            
+            # 如果有格式化参数,进行格式化
+            if kwargs:
+                try:
+                    return prompt_template.format(**kwargs)
+                except KeyError as e:
+                    self.logger.warning(f"提示词格式化失败,缺少参数: {e}")
+                    return prompt_template
+            
+            return prompt_template
+            
+        except Exception as e:
+            self.logger.error(f"获取提示词时出现错误: {e}")
+            return ""
+    
+    def get_sql_initial_prompt(self, dialect: str) -> str:
+        """获取SQL生成的初始提示词"""
+        return self.get_prompt("sql_generation", "initial_prompt", dialect=dialect)
+    
+    def get_sql_response_guidelines(self, dialect: str) -> str:
+        """获取SQL生成的响应指南"""
+        return self.get_prompt("sql_generation", "response_guidelines", dialect=dialect)
+    
+    def get_chart_instructions(self) -> str:
+        """获取图表生成的中文指令"""
+        return self.get_prompt("chart_generation", "chinese_chart_instructions")
+    
+    def get_chart_system_message(self, question: str = None, sql: str = None, df_metadata: str = None) -> str:
+        """获取图表生成的系统消息"""
+        # 构建SQL部分
+        sql_part = f"数据来源SQL查询:\n{sql}" if sql else ""
+        
+        # 构建问题部分
+        if question:
+            question_text = f"用户问题:'{question}'\n\n以下是回答用户问题的pandas DataFrame数据:"
+        else:
+            question_text = "以下是一个pandas DataFrame数据:"
+        
+        return self.get_prompt(
+            "chart_generation", 
+            "system_message_template",
+            question=question_text,
+            sql_part=sql_part,
+            df_metadata=df_metadata or ""
+        )
+    
+    def get_chart_user_message(self) -> str:
+        """获取图表生成的用户消息"""
+        chinese_instructions = self.get_chart_instructions()
+        return self.get_prompt(
+            "chart_generation",
+            "user_message_template",
+            chinese_chart_instructions=chinese_instructions
+        )
+    
+    def get_question_generation_prompt(self) -> str:
+        """获取根据SQL生成问题的提示词"""
+        return self.get_prompt("question_generation", "system_prompt")
+    
+    def get_chat_default_prompt(self) -> str:
+        """获取聊天对话的默认系统提示词"""
+        return self.get_prompt("chat_with_llm", "default_system_prompt")
+    
+    def get_question_merge_prompt(self) -> str:
+        """获取问题合并的系统提示词"""
+        return self.get_prompt("question_merge", "system_prompt")
+    
+    def get_summary_system_message(self, question: str, df_markdown: str) -> str:
+        """获取摘要生成的系统消息"""
+        return self.get_prompt(
+            "summary_generation",
+            "system_message_template",
+            question=question,
+            df_markdown=df_markdown
+        )
+    
+    def get_summary_user_instructions(self) -> str:
+        """获取摘要生成的用户指令"""
+        return self.get_prompt("summary_generation", "user_instructions")
+    
+    def reload_prompts(self):
+        """重新加载提示词配置"""
+        self.logger.info("重新加载提示词配置")
+        self._load_prompts()
+
+
+# 全局提示词加载器实例
+_prompt_loader = None
+
+def get_prompt_loader() -> PromptLoader:
+    """
+    获取全局提示词加载器实例(单例模式)
+    
+    Returns:
+        PromptLoader: 提示词加载器实例
+    """
+    global _prompt_loader
+    if _prompt_loader is None:
+        _prompt_loader = PromptLoader()
+    return _prompt_loader

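Putting the two new files together, a typical call path looks like the sketch below. The method names come from load_prompts.py above; the import path and the sample arguments are assumptions.

```python
# Usage sketch; method names are from load_prompts.py, arguments are examples.
from customllm.load_prompts import get_prompt_loader

loader = get_prompt_loader()  # singleton; parses customllm/llm_prompts.yaml once

initial = loader.get_sql_initial_prompt("postgresql")          # fills the {dialect} placeholder
guidelines = loader.get_sql_response_guidelines("postgresql")
summary_msg = loader.get_summary_system_message(
    question="各服务区的总收入是多少?",
    df_markdown="| 服务区名称 | 总收入 |\n|---|---|\n| 南城服务区 | 1024 |",
)
chat_prompt = loader.get_chat_default_prompt()
```

Missing categories or keys return an empty string with a warning rather than raising, so a malformed llm_prompts.yaml degrades the prompts instead of crashing the chat classes.
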
+ 31 - 31
customllm/ollama_chat.py

@@ -9,8 +9,8 @@ class OllamaChat(BaseLLMChat):
     """Ollama AI聊天实现"""
     
     def __init__(self, config=None):
-        print("...OllamaChat init...")
         super().__init__(config=config)
+        self.logger.info("OllamaChat init")
 
         # Ollama特定的配置参数
         self.base_url = config.get("base_url", "http://localhost:11434") if config else "http://localhost:11434"
@@ -31,13 +31,13 @@ class OllamaChat(BaseLLMChat):
         try:
             response = requests.get(f"{self.base_url}/api/tags", timeout=5)
             if response.status_code == 200:
-                print(f"✅ Ollama 服务连接正常: {self.base_url}")
+                self.logger.info(f"Ollama 服务连接正常: {self.base_url}")
                 return True
             else:
-                print(f"⚠️ Ollama 服务响应异常: {response.status_code}")
+                self.logger.warning(f"Ollama 服务响应异常: {response.status_code}")
                 return False
         except requests.exceptions.RequestException as e:
-            print(f"❌ Ollama 服务连接失败: {e}")
+            self.logger.error(f"Ollama 服务连接失败: {e}")
             return False
 
     def submit_prompt(self, prompt, **kwargs) -> str:
@@ -61,7 +61,7 @@ class OllamaChat(BaseLLMChat):
         # Ollama 约束:enable_thinking=True时建议使用stream=True
         # 如果stream=False但enable_thinking=True,则忽略enable_thinking
         if enable_thinking and not stream_mode:
-            print("WARNING: enable_thinking=True 不生效,因为它需要 stream=True")
+            self.logger.warning("enable_thinking=True 不生效,因为它需要 stream=True")
             enable_thinking = False
 
         # 智能模型选择
@@ -72,10 +72,10 @@ class OllamaChat(BaseLLMChat):
         
         # 模型兼容性提示(但不强制切换)
         if enable_thinking and not is_reasoning_model:
-            print(f"提示:模型 {model} 不是专门的推理模型,但仍会尝试启用推理功能")
+            self.logger.warning(f"提示:模型 {model} 不是专门的推理模型,但仍会尝试启用推理功能")
 
-        print(f"\nUsing Ollama model {model} for {num_tokens} tokens (approx)")
-        print(f"Enable thinking: {enable_thinking}, Stream mode: {stream_mode}")
+        self.logger.info(f"\nUsing Ollama model {model} for {num_tokens} tokens (approx)")
+        self.logger.info(f"Enable thinking: {enable_thinking}, Stream mode: {stream_mode}")
 
         # 准备Ollama API请求
         url = f"{self.base_url}/api/chat"
@@ -91,22 +91,22 @@ class OllamaChat(BaseLLMChat):
             if stream_mode:
                 # 流式处理模式
                 if enable_thinking:
-                    print("使用流式处理模式,启用推理功能")
+                    self.logger.info("使用流式处理模式,启用推理功能")
                 else:
-                    print("使用流式处理模式,常规聊天")
+                    self.logger.info("使用流式处理模式,常规聊天")
                 
                 return self._handle_stream_response(url, payload, enable_thinking)
             else:
                 # 非流式处理模式
                 if enable_thinking:
-                    print("使用非流式处理模式,启用推理功能")
+                    self.logger.info("使用非流式处理模式,启用推理功能")
                 else:
-                    print("使用非流式处理模式,常规聊天")
+                    self.logger.info("使用非流式处理模式,常规聊天")
                 
                 return self._handle_non_stream_response(url, payload, enable_thinking)
                 
         except requests.exceptions.RequestException as e:
-            print(f"Ollama API请求失败: {e}")
+            self.logger.error(f"Ollama API请求失败: {e}")
             raise Exception(f"Ollama API调用失败: {str(e)}")
 
     def _handle_stream_response(self, url: str, payload: dict, enable_reasoning: bool) -> str:
@@ -146,7 +146,7 @@ class OllamaChat(BaseLLMChat):
             reasoning_content, final_content = self._extract_reasoning(full_content)
             
             if reasoning_content:
-                print("Model reasoning process:\n", reasoning_content)
+                self.logger.debug("Model reasoning process:\n" + reasoning_content)
                 return final_content
         
         return full_content
@@ -169,7 +169,7 @@ class OllamaChat(BaseLLMChat):
             reasoning_content, final_content = self._extract_reasoning(content)
             
             if reasoning_content:
-                print("Model reasoning process:\n", reasoning_content)
+                self.logger.debug("Model reasoning process:\n" + reasoning_content)
                 return final_content
         
         return content
@@ -197,17 +197,17 @@ class OllamaChat(BaseLLMChat):
                 
                 # 检查目标模型是否存在
                 if self.model not in result["available_models"]:
-                    print(f"警告:模型 {self.model} 不存在,尝试拉取...")
+                    self.logger.warning(f"模型 {self.model} 不存在,尝试拉取...")
                     if not self.pull_model(self.model):
                         result["message"] = f"模型 {self.model} 不存在且拉取失败"
                         return result
             except Exception as e:
-                print(f"获取模型列表失败: {e}")
+                self.logger.error(f"获取模型列表失败: {e}")
                 result["available_models"] = [self.model]
             
-            print(f"测试Ollama连接 - 模型: {self.model}")
-            print(f"Ollama服务地址: {self.base_url}")
-            print(f"可用模型: {', '.join(result['available_models'])}")
+            self.logger.info(f"测试Ollama连接 - 模型: {self.model}")
+            self.logger.info(f"Ollama服务地址: {self.base_url}")
+            self.logger.info(f"可用模型: {', '.join(result['available_models'])}")
             
             # 测试简单对话
             prompt = [self.user_message(test_prompt)]
@@ -243,10 +243,10 @@ class OllamaChat(BaseLLMChat):
                     if reasoning_models:
                         return reasoning_models[0]  # 选择第一个推理模型
                     else:
-                        print("警告:未找到推理模型,使用默认模型")
+                        self.logger.warning("未找到推理模型,使用默认模型")
                         return self.model
                 except Exception as e:
-                    print(f"获取模型列表时出错: {e},使用默认模型")
+                    self.logger.error(f"获取模型列表时出错: {e},使用默认模型")
                     return self.model
             else:
                 # 根据 token 数量选择模型
@@ -258,7 +258,7 @@ class OllamaChat(BaseLLMChat):
                         if long_context_models:
                             return long_context_models[0]
                     except Exception as e:
-                        print(f"获取模型列表时出错: {e},使用默认模型")
+                        self.logger.error(f"获取模型列表时出错: {e},使用默认模型")
                 
                 return self.model
 
@@ -357,26 +357,26 @@ class OllamaChat(BaseLLMChat):
             models = [model["name"] for model in data.get("models", [])]
             return models if models else [self.model]  # 如果没有模型,返回默认模型
         except requests.exceptions.RequestException as e:
-            print(f"获取模型列表失败: {e}")
+            self.logger.error(f"获取模型列表失败: {e}")
             return [self.model]  # 返回默认模型
         except Exception as e:
-            print(f"解析模型列表失败: {e}")
+            self.logger.error(f"解析模型列表失败: {e}")
             return [self.model]  # 返回默认模型
 
     def pull_model(self, model_name: str) -> bool:
         """拉取模型"""
         try:
-            print(f"正在拉取模型: {model_name}")
+            self.logger.info(f"正在拉取模型: {model_name}")
             response = requests.post(
                 f"{self.base_url}/api/pull",
                 json={"name": model_name},
                 timeout=300  # 拉取模型可能需要较长时间
             )
             response.raise_for_status()
-            print(f"✅ 模型 {model_name} 拉取成功")
+            self.logger.info(f"模型 {model_name} 拉取成功")
             return True
         except requests.exceptions.RequestException as e:
-            print(f"❌ 模型 {model_name} 拉取失败: {e}")
+            self.logger.error(f"模型 {model_name} 拉取失败: {e}")
             return False
 
     def delete_model(self, model_name: str) -> bool:
@@ -388,10 +388,10 @@ class OllamaChat(BaseLLMChat):
                 timeout=self.timeout
             )
             response.raise_for_status()
-            print(f"✅ 模型 {model_name} 删除成功")
+            self.logger.info(f"模型 {model_name} 删除成功")
             return True
         except requests.exceptions.RequestException as e:
-            print(f"❌ 模型 {model_name} 删除失败: {e}")
+            self.logger.error(f"模型 {model_name} 删除失败: {e}")
             return False
 
     def get_model_info(self, model_name: str) -> Optional[Dict]:
@@ -405,7 +405,7 @@ class OllamaChat(BaseLLMChat):
             response.raise_for_status()
             return response.json()
         except requests.exceptions.RequestException as e:
-            print(f"获取模型信息失败: {e}")
+            self.logger.error(f"获取模型信息失败: {e}")
             return None
 
     def get_system_info(self) -> Dict:

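The model-management helpers touched above can be combined roughly as follows. The `base_url`/`model` values are placeholders, and treating a `None` return from `get_model_info` as "model not present locally" is an inference from its error handling.

```python
# Sketch only; config values and the model name are placeholders.
chat = OllamaChat(config={"base_url": "http://localhost:11434", "model": "qwen2.5:7b"})

if chat.get_model_info(chat.model) is None:   # /api/show failed, model likely absent
    chat.pull_model(chat.model)               # now logs 拉取成功/拉取失败 instead of printing
```
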
+ 8 - 8
customllm/qianwen_chat.py

@@ -7,8 +7,8 @@ class QianWenChat(BaseLLMChat):
     """千问AI聊天实现"""
     
     def __init__(self, client=None, config=None):
-        print("...QianWenChat init...")
         super().__init__(config=config)
+        self.logger.info("QianWenChat init")
 
         if "api_type" in config:
             raise Exception(
@@ -65,7 +65,7 @@ class QianWenChat(BaseLLMChat):
         # 千问API约束:enable_thinking=True时必须stream=True
         # 如果stream=False但enable_thinking=True,则忽略enable_thinking
         if enable_thinking and not stream_mode:
-            print("WARNING: enable_thinking=True 不生效,因为它需要 stream=True")
+            self.logger.warning("enable_thinking=True 不生效,因为它需要 stream=True")
             enable_thinking = False
         
         # 创建一个干净的kwargs副本,移除可能导致API错误的自定义参数
@@ -112,15 +112,15 @@ class QianWenChat(BaseLLMChat):
                 model = "qwen-plus"
             common_params["model"] = model
         
-        print(f"\nUsing model {model} for {num_tokens} tokens (approx)")
-        print(f"Enable thinking: {enable_thinking}, Stream mode: {stream_mode}")
+        self.logger.info(f"\nUsing model {model} for {num_tokens} tokens (approx)")
+        self.logger.info(f"Enable thinking: {enable_thinking}, Stream mode: {stream_mode}")
         
         if stream_mode:
             # 流式处理模式
             if enable_thinking:
-                print("使用流式处理模式,启用thinking功能")
+                self.logger.info("使用流式处理模式,启用thinking功能")
             else:
-                print("使用流式处理模式,不启用thinking功能")
+                self.logger.info("使用流式处理模式,不启用thinking功能")
             
             response_stream = self.client.chat.completions.create(**common_params)
             
@@ -144,7 +144,7 @@ class QianWenChat(BaseLLMChat):
             # 可以在这里处理thinking的展示逻辑,如保存到日志等
             if enable_thinking and collected_thinking:
                 thinking_text = "".join(collected_thinking)
-                print("Model thinking process:\n", thinking_text)
+                self.logger.debug("Model thinking process:\n" + thinking_text)
             
             # 返回包含 <think></think> 标签的完整内容,与界面显示需求保持一致
             final_content = "".join(collected_content)
@@ -155,7 +155,7 @@ class QianWenChat(BaseLLMChat):
                 return final_content
         else:
             # 非流式处理模式
-            print("使用非流式处理模式")
+            self.logger.info("使用非流式处理模式")
             response = self.client.chat.completions.create(**common_params)
             
             # Find the first response from the chatbot that has text in it (some responses may not have text)

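Both DeepSeekChat and QianWenChat return the full content including `<think></think>` tags, and `_remove_thinking_content` (not shown in this diff) strips them when `DISPLAY_RESULT_THINKING` is disabled. A minimal stand-in for that behaviour, offered as an assumption about its implementation:

```python
import re

def strip_thinking(text: str) -> str:
    """Drop any <think>...</think> block, keeping only the final answer."""
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

print(strip_thinking("<think>先查 bss_business_day_data ...</think>最终答案"))  # -> 最终答案
```
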
+ 58 - 22
custompgvector/pgvector.py

@@ -7,6 +7,7 @@ import pandas as pd
 from langchain_core.documents import Document
 from langchain_postgres.vectorstores import PGVector
 from sqlalchemy import create_engine, text
+from core.logging import get_vanna_logger
 
 from vanna.exceptions import ValidationError
 from vanna.base import VannaBase
@@ -23,6 +24,9 @@ class PG_VectorStore(VannaBase):
                 "A valid 'config' dictionary with a 'connection_string' is required.")
 
         VannaBase.__init__(self, config=config)
+        
+        # 初始化日志
+        self.logger = get_vanna_logger("PGVector")
 
         if config and "connection_string" in config:
             self.connection_string = config.get("connection_string")
@@ -135,7 +139,7 @@ class PG_VectorStore(VannaBase):
                 if generated_embedding:
                     embedding_cache.cache_embedding(question, generated_embedding)
             except Exception as e:
-                print(f"[WARNING] 缓存embedding失败: {e}")
+                self.logger.warning(f"缓存embedding失败: {e}")
 
         results = []
         for doc, score in docs_with_scores:
@@ -146,12 +150,16 @@ class PG_VectorStore(VannaBase):
             similarity = round(1 - score, 4)
 
             # 每条记录单独打印
-            print(f"[DEBUG] SQL Match: {base.get('question', '')} | similarity: {similarity}")
+            self.logger.debug(f"SQL Match: {base.get('question', '')} | similarity: {similarity}")
 
             # 添加 similarity 字段
             base["similarity"] = similarity
             results.append(base)
 
+        # 检查原始查询结果是否为空
+        if not results:
+            self.logger.warning(f"向量查询未找到任何相似的SQL问答对,问题: {question}")
+
         # 应用阈值过滤
         filtered_results = self._apply_score_threshold_filter(
             results, 
@@ -159,6 +167,10 @@ class PG_VectorStore(VannaBase):
             "SQL"
         )
 
+        # 检查过滤后结果是否为空
+        if results and not filtered_results:
+            self.logger.warning(f"向量查询找到了 {len(results)} 条SQL问答对,但全部被阈值过滤掉,问题: {question}")
+
         return filtered_results
 
     def get_related_ddl(self, question: str, **kwargs) -> list:
@@ -186,7 +198,7 @@ class PG_VectorStore(VannaBase):
                 if generated_embedding:
                     embedding_cache.cache_embedding(question, generated_embedding)
             except Exception as e:
-                print(f"[WARNING] 缓存embedding失败: {e}")
+                self.logger.warning(f"缓存embedding失败: {e}")
 
         results = []
         for doc, score in docs_with_scores:
@@ -194,7 +206,7 @@ class PG_VectorStore(VannaBase):
             similarity = round(1 - score, 4)
 
             # 每条记录单独打印
-            print(f"[DEBUG] DDL Match: {doc.page_content[:50]}... | similarity: {similarity}")
+            self.logger.debug(f"DDL Match: {doc.page_content[:50]}... | similarity: {similarity}")
 
             # 添加 similarity 字段
             result = {
@@ -203,6 +215,10 @@ class PG_VectorStore(VannaBase):
             }
             results.append(result)
 
+        # 检查原始查询结果是否为空
+        if not results:
+            self.logger.warning(f"向量查询未找到任何相关的DDL表结构,问题: {question}")
+
         # 应用阈值过滤
         filtered_results = self._apply_score_threshold_filter(
             results, 
@@ -210,6 +226,10 @@ class PG_VectorStore(VannaBase):
             "DDL"
         )
 
+        # 检查过滤后结果是否为空
+        if results and not filtered_results:
+            self.logger.warning(f"向量查询找到了 {len(results)} 条DDL表结构,但全部被阈值过滤掉,问题: {question}")
+
         return filtered_results
 
     def get_related_documentation(self, question: str, **kwargs) -> list:
@@ -237,7 +257,7 @@ class PG_VectorStore(VannaBase):
                 if generated_embedding:
                     embedding_cache.cache_embedding(question, generated_embedding)
             except Exception as e:
-                print(f"[WARNING] 缓存embedding失败: {e}")
+                self.logger.warning(f"缓存embedding失败: {e}")
 
         results = []
         for doc, score in docs_with_scores:
@@ -245,7 +265,7 @@ class PG_VectorStore(VannaBase):
             similarity = round(1 - score, 4)
 
             # 每条记录单独打印
-            print(f"[DEBUG] Doc Match: {doc.page_content[:50]}... | similarity: {similarity}")
+            self.logger.debug(f"Doc Match: {doc.page_content[:50]}... | similarity: {similarity}")
 
             # 添加 similarity 字段
             result = {
@@ -254,6 +274,10 @@ class PG_VectorStore(VannaBase):
             }
             results.append(result)
 
+        # 检查原始查询结果是否为空
+        if not results:
+            self.logger.warning(f"向量查询未找到任何相关的文档,问题: {question}")
+
         # 应用阈值过滤
         filtered_results = self._apply_score_threshold_filter(
             results, 
@@ -261,6 +285,10 @@ class PG_VectorStore(VannaBase):
             "DOC"
         )
 
+        # 检查过滤后结果是否为空
+        if results and not filtered_results:
+            self.logger.warning(f"向量查询找到了 {len(results)} 条文档,但全部被阈值过滤掉,问题: {question}")
+
         return filtered_results
 
     def _apply_score_threshold_filter(self, results: list, threshold_config_key: str, result_type: str) -> list:
@@ -284,19 +312,19 @@ class PG_VectorStore(VannaBase):
             enable_threshold = getattr(app_config, 'ENABLE_RESULT_VECTOR_SCORE_THRESHOLD', False)
             threshold = getattr(app_config, threshold_config_key, 0.65)
         except (ImportError, AttributeError) as e:
-            print(f"[WARNING] 无法加载阈值配置: {e},使用默认值")
+            self.logger.warning(f"无法加载阈值配置: {e},使用默认值")
             enable_threshold = False
             threshold = 0.65
         
         # 如果未启用阈值过滤,直接返回原结果
         if not enable_threshold:
-            print(f"[DEBUG] {result_type} 阈值过滤未启用,返回全部 {len(results)} 条结果")
+            self.logger.debug(f"{result_type} 阈值过滤未启用,返回全部 {len(results)} 条结果")
             return results
         
         total_count = len(results)
         min_required = max((total_count + 1) // 2, 1)
         
-        print(f"[DEBUG] {result_type} 阈值过滤: 总数={total_count}, 阈值={threshold}, 最少保留={min_required}")
+        self.logger.debug(f"{result_type} 阈值过滤: 总数={total_count}, 阈值={threshold}, 最少保留={min_required}")
         
         # 按相似度降序排序(确保最相似的在前面)
         sorted_results = sorted(results, key=lambda x: x.get('similarity', 0), reverse=True)
@@ -309,20 +337,20 @@ class PG_VectorStore(VannaBase):
             # 情况1: 满足阈值的结果数量 >= 最少保留数量,返回满足阈值的结果
             filtered_results = above_threshold
             filtered_count = len(above_threshold)
-            print(f"[DEBUG] {result_type} 过滤结果: 保留 {filtered_count} 条, 过滤掉 {total_count - filtered_count} 条 (全部满足阈值)")
+            self.logger.debug(f"{result_type} 过滤结果: 保留 {filtered_count} 条, 过滤掉 {total_count - filtered_count} 条 (全部满足阈值)")
         else:
             # 情况2: 满足阈值的结果数量 < 最少保留数量,强制保留前 min_required 条
             filtered_results = sorted_results[:min_required]
             above_count = len(above_threshold)
             below_count = min_required - above_count
             filtered_count = min_required
-            print(f"[DEBUG] {result_type} 过滤结果: 保留 {filtered_count} 条, 过滤掉 {total_count - filtered_count} 条 (满足阈值: {above_count}, 强制保留: {below_count})")
+            self.logger.debug(f"{result_type} 过滤结果: 保留 {filtered_count} 条, 过滤掉 {total_count - filtered_count} 条 (满足阈值: {above_count}, 强制保留: {below_count})")
         
         # 打印过滤详情
         for i, result in enumerate(filtered_results):
             similarity = result.get('similarity', 0)
             status = "✓" if similarity >= threshold else "✗"
-            print(f"[DEBUG] {result_type} 保留 {i+1}: similarity={similarity} {status}")
+            self.logger.debug(f"{result_type} 保留 {i+1}: similarity={similarity} {status}")
         
         return filtered_results
 
@@ -350,17 +378,17 @@ class PG_VectorStore(VannaBase):
             enable_threshold = getattr(app_config, 'ENABLE_RESULT_VECTOR_SCORE_THRESHOLD', False)
             threshold = getattr(app_config, 'RESULT_VECTOR_ERROR_SQL_SCORE_THRESHOLD', 0.5)
         except (ImportError, AttributeError) as e:
-            print(f"[WARNING] 无法加载错误SQL阈值配置: {e},使用默认值")
+            self.logger.warning(f"无法加载错误SQL阈值配置: {e},使用默认值")
             enable_threshold = False
             threshold = 0.5
         
         # 如果未启用阈值过滤,直接返回原结果
         if not enable_threshold:
-            print(f"[DEBUG] Error SQL 阈值过滤未启用,返回全部 {len(results)} 条结果")
+            self.logger.debug(f"Error SQL 阈值过滤未启用,返回全部 {len(results)} 条结果")
             return results
         
         total_count = len(results)
-        print(f"[DEBUG] Error SQL 阈值过滤: 总数={total_count}, 阈值={threshold}")
+        self.logger.debug(f"Error SQL 阈值过滤: 总数={total_count}, 阈值={threshold}")
         
         # 按相似度降序排序(确保最相似的在前面)
         sorted_results = sorted(results, key=lambda x: x.get('similarity', 0), reverse=True)
@@ -372,13 +400,13 @@ class PG_VectorStore(VannaBase):
         filtered_out_count = total_count - filtered_count
         
         if filtered_count > 0:
-            print(f"[DEBUG] Error SQL 过滤结果: 保留 {filtered_count} 条, 过滤掉 {filtered_out_count} 条")
+            self.logger.debug(f"Error SQL 过滤结果: 保留 {filtered_count} 条, 过滤掉 {filtered_out_count} 条")
             # 打印保留的结果详情
             for i, result in enumerate(filtered_results):
                 similarity = result.get('similarity', 0)
-                print(f"[DEBUG] Error SQL 保留 {i+1}: similarity={similarity} ✓")
+                self.logger.debug(f"Error SQL 保留 {i+1}: similarity={similarity} ✓")
         else:
-            print(f"[DEBUG] Error SQL 过滤结果: 所有 {total_count} 条结果都低于阈值 {threshold},返回空列表")
+            self.logger.debug(f"Error SQL 过滤结果: 所有 {total_count} 条结果都低于阈值 {threshold},返回空列表")
         
         return filtered_results
 
@@ -610,7 +638,7 @@ class PG_VectorStore(VannaBase):
                     if generated_embedding:
                         embedding_cache.cache_embedding(question, generated_embedding)
                 except Exception as e:
-                    print(f"[WARNING] 缓存embedding失败: {e}")
+                    self.logger.warning(f"缓存embedding失败: {e}")
             
             results = []
             for doc, score in docs_with_scores:
@@ -622,21 +650,29 @@ class PG_VectorStore(VannaBase):
                     similarity = round(1 - score, 4)
                     
                     # 每条记录单独打印
-                    print(f"[DEBUG] Error SQL Match: {base.get('question', '')} | similarity: {similarity}")
+                    self.logger.debug(f"Error SQL Match: {base.get('question', '')} | similarity: {similarity}")
                     
                     # 添加 similarity 字段
                     base["similarity"] = similarity
                     results.append(base)
                     
                 except (ValueError, SyntaxError) as e:
-                    print(f"Error parsing error SQL document: {e}")
+                    self.logger.error(f"Error parsing error SQL document: {e}")
                     continue
             
+            # 检查原始查询结果是否为空
+            if not results:
+                self.logger.warning(f"向量查询未找到任何相关的错误SQL示例,问题: {question}")
+
             # 应用错误SQL特有的阈值过滤逻辑
             filtered_results = self._apply_error_sql_threshold_filter(results)
             
+            # 检查过滤后结果是否为空
+            if results and not filtered_results:
+                self.logger.warning(f"向量查询找到了 {len(results)} 条错误SQL示例,但全部被阈值过滤掉,问题: {question}")
+
             return filtered_results
             
         except Exception as e:
-            print(f"Error retrieving error SQL examples: {e}")
+            self.logger.error(f"Error retrieving error SQL examples: {e}")
             return []

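For reference, the threshold policy that the new empty-result warnings wrap around keeps every hit at or above the similarity threshold, but never fewer than half of the original hits (rounded up); the error-SQL variant applies the threshold without that floor. A standalone restatement of the general rule:

```python
# Standalone restatement of _apply_score_threshold_filter's core rule (not the method itself).
def apply_threshold(results: list, threshold: float) -> list:
    results = sorted(results, key=lambda r: r.get("similarity", 0), reverse=True)
    min_required = max((len(results) + 1) // 2, 1)      # keep at least half, rounded up
    above = [r for r in results if r.get("similarity", 0) >= threshold]
    return above if len(above) >= min_required else results[:min_required]

hits = [{"similarity": 0.82}, {"similarity": 0.71}, {"similarity": 0.40}]
print(apply_threshold(hits, threshold=0.65))   # the 0.40 hit is filtered out
```
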
+ 5 - 5
schema_tools/README.md → data_pipeline/README.md

@@ -29,7 +29,7 @@ pip install asyncpg asyncio
 
 #### 命令行方式
 ```bash
-python -m schema_tools.schema_workflow_orchestrator \
+python -m schema_tools.schema_workflow \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --table-list tables.txt \
   --business-context "高速公路服务区管理系统" \
@@ -40,7 +40,7 @@ python -m schema_tools.schema_workflow_orchestrator \
 #### 编程方式
 ```python
 import asyncio
-from schema_tools.schema_workflow_orchestrator import SchemaWorkflowOrchestrator
+from schema_tools.schema_workflow import SchemaWorkflowOrchestrator
 
 async def run_complete_workflow():
     orchestrator = SchemaWorkflowOrchestrator(
@@ -73,17 +73,17 @@ asyncio.run(run_complete_workflow())
 #### 工作流编排器命令行选项
 ```bash
 # 跳过SQL验证
-python -m schema_tools.schema_workflow_orchestrator \
+python -m schema_tools.schema_workflow \
   --db-connection "postgresql://..." --table-list tables.txt \
   --business-context "系统" --db-name test_db --skip-validation
 
 # 禁用LLM修复
-python -m schema_tools.schema_workflow_orchestrator \
+python -m schema_tools.schema_workflow \
   --db-connection "postgresql://..." --table-list tables.txt \
   --business-context "系统" --db-name test_db --disable-llm-repair
 
 # 详细日志
-python -m schema_tools.schema_workflow_orchestrator \
+python -m schema_tools.schema_workflow \
   --db-connection "postgresql://..." --table-list tables.txt \
   --business-context "系统" --db-name test_db --verbose
 ```

+ 4 - 4
schema_tools/__init__.py → data_pipeline/__init__.py

@@ -3,10 +3,10 @@ Schema Tools - 自动化数据库逆向工程工具
 用于从PostgreSQL数据库生成vanna.ai格式的训练数据(DDL和MD文档)
 """
 
-from .training_data_agent import SchemaTrainingDataAgent
-from .qs_agent import QuestionSQLGenerationAgent
-from .sql_validation_agent import SQLValidationAgent
-from .schema_workflow_orchestrator import SchemaWorkflowOrchestrator
+from .ddl_generation.training_data_agent import SchemaTrainingDataAgent
+from .qa_generation.qs_agent import QuestionSQLGenerationAgent
+from .validators.sql_validation_agent import SQLValidationAgent
+from .schema_workflow import SchemaWorkflowOrchestrator
 from .config import SCHEMA_TOOLS_CONFIG, get_config, update_config
 
 __version__ = "1.0.0"

+ 0 - 0
schema_tools/analyzers/__init__.py → data_pipeline/analyzers/__init__.py


+ 2 - 2
schema_tools/analyzers/md_analyzer.py → data_pipeline/analyzers/md_analyzer.py

@@ -1,6 +1,6 @@
-import logging
 from pathlib import Path
 from typing import List, Dict, Any
+import logging
 
 
 class MDFileAnalyzer:
@@ -8,7 +8,7 @@ class MDFileAnalyzer:
     
     def __init__(self, output_dir: str):
         self.output_dir = Path(output_dir)
-        self.logger = logging.getLogger("schema_tools.MDFileAnalyzer")
+        self.logger = logging.getLogger("MDFileAnalyzer")
         
     async def read_all_md_files(self) -> str:
         """

+ 62 - 33
schema_tools/analyzers/theme_extractor.py → data_pipeline/analyzers/theme_extractor.py

@@ -1,9 +1,9 @@
 import asyncio
 import json
-import logging
 from typing import List, Dict, Any
 
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+import logging
 
 
 class ThemeExtractor:
@@ -19,7 +19,7 @@ class ThemeExtractor:
         """
         self.vn = vn
         self.business_context = business_context
-        self.logger = logging.getLogger("schema_tools.ThemeExtractor")
+        self.logger = logging.getLogger("ThemeExtractor")
         self.config = SCHEMA_TOOLS_CONFIG
         
     async def extract_themes(self, md_contents: str) -> List[Dict[str, Any]]:
@@ -64,10 +64,17 @@ class ThemeExtractor:
 要求:
 1. 每个主题应该有明确的业务价值和分析目标
 2. 主题之间应该有所区别,覆盖不同的业务领域  
-3. 你需要自行决定每个主题应该涉及哪些表
+3. 你需要自行决定每个主题应该涉及哪些表(使用实际存在的表名)
 4. 主题应该体现实际业务场景的数据分析需求
 5. 考虑时间维度、对比分析、排名统计等多种分析角度
-6. 为每个主题提供3-5个关键词,用于快速了解主题内容
+6. 在选择业务实体时,请忽略以下技术性字段:
+   - id、主键ID等标识字段
+   - create_time、created_at、create_ts等创建时间字段
+   - update_time、updated_at、update_ts等更新时间字段
+   - delete_time、deleted_at、delete_ts等删除时间字段
+   - version、版本号等版本控制字段
+   - created_by、updated_by、deleted_by等操作人字段
+7. 重点关注具有业务含义的实体字段和指标
 
 请以JSON格式输出:
 ```json
@@ -77,8 +84,8 @@ class ThemeExtractor:
       "topic_name": "日营业数据分析",
       "description": "基于 bss_business_day_data 表,分析每个服务区和档口每天的营业收入、订单数量、支付方式等",
       "related_tables": ["bss_business_day_data", "bss_branch", "bss_service_area"],
-      "keywords": ["收入", "订单", "支付方式", "日报表"],
-      "focus_areas": ["收入趋势", "服务区对比", "支付方式分布"]
+      "biz_entities": ["服务区", "档口", "支付方式", "营收"],
+      "biz_metrics": ["收入趋势", "服务区对比", "支付方式分布"]
     }}
   ]
 }}
@@ -88,8 +95,8 @@ class ThemeExtractor:
 - topic_name 简洁明了(10字以内)
 - description 详细说明分析目标和价值(50字左右)
 - related_tables 列出该主题需要用到的表名(数组格式)
-- keywords 提供3-5个核心关键词(数组格式
-- focus_areas 列出3-5个具体的分析角度(保留用于生成问题)"""
+- biz_entities 列出3-5个主要业务实体(表的维度字段或非数值型字段,如服务区、公司、车辆等)
+- biz_metrics 列出3-5个主要业务指标名称(统计指标,如收入趋势、对比分析等)"""
         
         return prompt
     
@@ -142,16 +149,19 @@ class ThemeExtractor:
                     if isinstance(theme['related_tables'], str):
                         theme['related_tables'] = [theme['related_tables']]
                     
-                    # 确保keywords存在且是数组
-                    if 'keywords' not in theme:
-                        # 从description中提取关键词
-                        theme['keywords'] = self._extract_keywords_from_description(theme['description'])
-                    elif isinstance(theme['keywords'], str):
-                        theme['keywords'] = [theme['keywords']]
+                    # 确保biz_entities存在且是数组
+                    if 'biz_entities' not in theme:
+                        # 从description中提取业务实体
+                        theme['biz_entities'] = self._extract_biz_entities_from_description(theme['description'])
+                    elif isinstance(theme['biz_entities'], str):
+                        theme['biz_entities'] = [theme['biz_entities']]
                     
-                    # 保留focus_areas用于问题生成(如果没有则使用keywords)
-                    if 'focus_areas' not in theme:
-                        theme['focus_areas'] = theme['keywords'][:3]
+                    # 确保biz_metrics存在且是数组
+                    if 'biz_metrics' not in theme:
+                        # 从description中提取业务指标
+                        theme['biz_metrics'] = self._extract_biz_metrics_from_description(theme['description'])
+                    elif isinstance(theme['biz_metrics'], str):
+                        theme['biz_metrics'] = [theme['biz_metrics']]
                     
                     validated_themes.append(theme)
                 else:
@@ -167,23 +177,42 @@ class ThemeExtractor:
             self.logger.error(f"解析主题响应失败: {e}")
             raise
     
-    def _extract_keywords_from_description(self, description: str) -> List[str]:
-        """从描述中提取关键词(简单实现)"""
-        # 定义常见的业务关键词
-        business_keywords = [
-            "收入", "营业额", "订单", "支付", "统计", "分析", "趋势", "对比",
-            "排名", "汇总", "明细", "报表", "月度", "日度", "年度", "服务区",
-            "档口", "商品", "客流", "车流", "效率", "占比", "增长"
+    def _extract_biz_entities_from_description(self, description: str) -> List[str]:
+        """从描述中提取业务实体(简单实现)"""
+        # 定义常见的业务实体关键词
+        entity_keywords = [
+            "服务区", "档口", "商品", "公司", "分公司", "车辆", "支付方式",
+            "订单", "客户", "营收", "路段", "区域", "品牌", "品类"
+        ]
+        
+        # 从描述中查找出现的实体关键词
+        found_entities = []
+        for entity in entity_keywords:
+            if entity in description:
+                found_entities.append(entity)
+        
+        # 如果找到的太少,返回默认值
+        if len(found_entities) < 3:
+            found_entities = ["业务实体", "数据对象", "分析主体"]
+        
+        return found_entities[:5]  # 最多返回5个
+    
+    def _extract_biz_metrics_from_description(self, description: str) -> List[str]:
+        """从描述中提取业务指标(简单实现)"""
+        # 定义常见的业务指标关键词
+        metrics_keywords = [
+            "收入趋势", "营业额对比", "支付方式分布", "服务区对比", "增长率",
+            "占比分析", "排名统计", "效率评估", "流量分析", "转化率"
         ]
         
-        # 从描述中查找出现的关键词
-        found_keywords = []
-        for keyword in business_keywords:
-            if keyword in description:
-                found_keywords.append(keyword)
+        # 从描述中查找出现的指标关键词
+        found_metrics = []
+        for metric in metrics_keywords:
+            if any(word in description for word in metric.split()):
+                found_metrics.append(metric)
         
         # 如果找到的太少,返回默认值
-        if len(found_keywords) < 3:
-            found_keywords = ["数据分析", "统计报表", "业务查询"]
+        if len(found_metrics) < 3:
+            found_metrics = ["数据统计", "趋势分析", "对比分析"]
         
-        return found_keywords[:5]  # 最多返回5个 
+        return found_metrics[:5]  # 最多返回5个 
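The new fallback helpers are plain substring matches over fixed keyword lists, with generic defaults when fewer than three hits are found. A standalone sketch of the same idea:

```python
def extract_biz_entities(description: str) -> list:
    """Substring-match a fixed entity vocabulary; fall back to generic labels when too few hits."""
    entity_keywords = ["服务区", "档口", "商品", "公司", "车辆", "支付方式", "订单"]
    found = [e for e in entity_keywords if e in description]
    if len(found) < 3:
        found = ["业务实体", "数据对象", "分析主体"]
    return found[:5]

print(extract_biz_entities("基于 bss_business_day_data 表,分析每个服务区和档口的营收与支付方式"))
# ['服务区', '档口', '支付方式']
```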

+ 9 - 0
data_pipeline/api/__init__.py

@@ -0,0 +1,9 @@
+"""
+Data Pipeline API模块
+
+提供数据管道任务的API支持,包括:
+- 任务管理
+- 执行跟踪
+- 日志记录
+- 文件管理
+"""

+ 895 - 0
data_pipeline/api/simple_db_manager.py

@@ -0,0 +1,895 @@
+"""
+Data Pipeline API 简化数据库管理器
+
+复用现有的pgvector数据库连接机制,提供Data Pipeline任务的数据库操作功能
+"""
+
+import json
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Any, List, Optional, Tuple
+
+import psycopg2
+from psycopg2.extras import RealDictCursor, Json
+
+from app_config import PGVECTOR_CONFIG
+import logging
+
+
+class SimpleTaskManager:
+    """简化的任务管理器,复用现有pgvector连接"""
+    
+    def __init__(self):
+        """初始化任务管理器"""
+        # 使用简单的控制台日志,不使用文件日志
+        self.logger = logging.getLogger("SimpleTaskManager")
+        self.logger.setLevel(logging.INFO)
+        self._connection = None
+    
+    def _get_connection(self):
+        """获取pgvector数据库连接"""
+        if self._connection is None or self._connection.closed:
+            try:
+                self._connection = psycopg2.connect(
+                    host=PGVECTOR_CONFIG.get('host'),
+                    port=PGVECTOR_CONFIG.get('port'),
+                    database=PGVECTOR_CONFIG.get('dbname'),
+                    user=PGVECTOR_CONFIG.get('user'),
+                    password=PGVECTOR_CONFIG.get('password')
+                )
+                self._connection.autocommit = True
+            except Exception as e:
+                self.logger.error(f"pgvector数据库连接失败: {e}")
+                raise
+        return self._connection
+    
+    def close_connection(self):
+        """关闭数据库连接"""
+        if self._connection and not self._connection.closed:
+            self._connection.close()
+            self._connection = None
+    
+    def generate_task_id(self) -> str:
+        """生成任务ID,格式: task_YYYYMMDD_HHMMSS"""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        return f"task_{timestamp}"
+    
+    def create_task(self, 
+                   table_list_file: str = None,
+                   business_context: str = None,
+                   db_name: str = None,
+                   db_connection: str = None,
+                   task_name: str = None,
+                   **kwargs) -> str:
+        """创建新任务"""
+        task_id = self.generate_task_id()
+        
+        # 处理数据库连接和名称
+        if db_connection:
+            # 使用传入的 db_connection 参数
+            business_db_connection = db_connection
+            # 如果没有提供 db_name,从连接字符串中提取
+            if not db_name:
+                db_name = self._extract_db_name(db_connection)
+        else:
+            # 从 app_config 获取业务数据库连接信息
+            from app_config import APP_DB_CONFIG
+            business_db_connection = self._build_db_connection_string(APP_DB_CONFIG)
+            # 使用传入的db_name或从APP_DB_CONFIG提取
+            if not db_name:
+                db_name = APP_DB_CONFIG.get('dbname', 'business_db')
+        
+        # 处理table_list_file参数
+        # 如果未提供,将在执行时检查任务目录中的table_list.txt文件
+        task_table_list_file = table_list_file
+        if not task_table_list_file:
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+            upload_config = SCHEMA_TOOLS_CONFIG.get("file_upload", {})
+            target_filename = upload_config.get("target_filename", "table_list.txt")
+            # 使用相对于任务目录的路径
+            task_table_list_file = f"{{task_directory}}/{target_filename}"
+        
+        # 构建参数
+        parameters = {
+            "db_connection": business_db_connection,  # 业务数据库连接(用于schema_workflow执行)
+            "table_list_file": task_table_list_file,
+            "business_context": business_context or "数据库管理系统",
+            "file_upload_mode": table_list_file is None,  # 标记是否使用文件上传模式
+            **kwargs
+        }
+        
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                # 创建任务记录
+                cursor.execute("""
+                    INSERT INTO data_pipeline_tasks (
+                        task_id, task_name, task_type, status, parameters, created_type, 
+                        by_user, db_name, output_directory
+                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
+                """, (
+                    task_id, 
+                    task_name,
+                    'data_workflow', 
+                    'pending', 
+                    Json(parameters),
+                    'api',
+                    'guest',
+                    db_name,
+                    f"data_pipeline/training_data/{task_id}"
+                ))
+                
+                # 预创建所有步骤记录(策略A)
+                step_names = ['ddl_generation', 'qa_generation', 'sql_validation', 'training_load']
+                for step_name in step_names:
+                    cursor.execute("""
+                        INSERT INTO data_pipeline_task_steps (
+                            task_id, step_name, step_status
+                        ) VALUES (%s, %s, %s)
+                    """, (task_id, step_name, 'pending'))
+            
+            # 创建任务目录
+            try:
+                from data_pipeline.api.simple_file_manager import SimpleFileManager
+                file_manager = SimpleFileManager()
+                success = file_manager.create_task_directory(task_id)
+                if success:
+                    self.logger.info(f"任务目录创建成功: {task_id}")
+                else:
+                    self.logger.warning(f"任务目录创建失败,但任务记录已保存: {task_id}")
+            except Exception as dir_error:
+                self.logger.warning(f"创建任务目录时出错: {dir_error},但任务记录已保存: {task_id}")
+                
+            self.logger.info(f"任务创建成功: {task_id}")
+            return task_id
+            
+        except Exception as e:
+            self.logger.error(f"任务创建失败: {e}")
+            raise
+    
+    def get_task(self, task_id: str) -> Optional[Dict[str, Any]]:
+        """获取任务信息"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                cursor.execute("SELECT * FROM data_pipeline_tasks WHERE task_id = %s", (task_id,))
+                result = cursor.fetchone()
+                return dict(result) if result else None
+        except Exception as e:
+            self.logger.error(f"获取任务信息失败: {e}")
+            raise
+    
+    def update_task_status(self, task_id: str, status: str, error_message: Optional[str] = None):
+        """更新任务状态"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                update_fields = ["status = %s"]
+                values = [status]
+                
+                if status == 'in_progress' and not self._get_task_started_at(task_id):
+                    update_fields.append("started_at = CURRENT_TIMESTAMP")
+                
+                if status in ['completed', 'failed']:
+                    update_fields.append("completed_at = CURRENT_TIMESTAMP")
+                
+                if error_message:
+                    update_fields.append("error_message = %s")
+                    values.append(error_message)
+                
+                values.append(task_id)
+                
+                cursor.execute(f"""
+                    UPDATE data_pipeline_tasks 
+                    SET {', '.join(update_fields)}
+                    WHERE task_id = %s
+                """, values)
+                
+                self.logger.info(f"任务状态更新: {task_id} -> {status}")
+        except Exception as e:
+            self.logger.error(f"任务状态更新失败: {e}")
+            raise
+    
+    def update_step_status(self, task_id: str, step_name: str, step_status: str, error_message: Optional[str] = None):
+        """更新步骤状态"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                update_fields = ["step_status = %s"]
+                values = [step_status]
+                
+                # 如果状态是running,记录开始时间
+                if step_status == 'running':
+                    update_fields.append("started_at = CURRENT_TIMESTAMP")
+                
+                # 如果状态是completed或failed,记录完成时间
+                if step_status in ['completed', 'failed']:
+                    update_fields.append("completed_at = CURRENT_TIMESTAMP")
+                
+                # 如果有错误信息,记录错误信息
+                if error_message:
+                    update_fields.append("error_message = %s")
+                    values.append(error_message)
+                
+                values.extend([task_id, step_name])
+                
+                cursor.execute(f"""
+                    UPDATE data_pipeline_task_steps 
+                    SET {', '.join(update_fields)}
+                    WHERE task_id = %s AND step_name = %s
+                """, values)
+                
+                self.logger.debug(f"步骤状态更新: {task_id}.{step_name} -> {step_status}")
+        except Exception as e:
+            self.logger.error(f"步骤状态更新失败: {e}")
+            raise
+    
+    def update_step_execution_id(self, task_id: str, step_name: str, execution_id: str):
+        """更新步骤的execution_id"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    UPDATE data_pipeline_task_steps 
+                    SET execution_id = %s
+                    WHERE task_id = %s AND step_name = %s
+                """, (execution_id, task_id, step_name))
+                
+                self.logger.debug(f"步骤execution_id更新: {task_id}.{step_name} -> {execution_id}")
+        except Exception as e:
+            self.logger.error(f"步骤execution_id更新失败: {e}")
+            raise
+    
+    def start_step(self, task_id: str, step_name: str) -> str:
+        """开始执行步骤"""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        execution_id = f"{task_id}_step_{step_name}_exec_{timestamp}"
+        
+        try:
+            # 更新步骤状态为running并设置execution_id
+            self.update_step_status(task_id, step_name, 'running')
+            self.update_step_execution_id(task_id, step_name, execution_id)
+                
+            self.logger.info(f"步骤开始执行: {task_id}.{step_name} -> {execution_id}")
+            return execution_id
+        except Exception as e:
+            self.logger.error(f"步骤开始执行失败: {e}")
+            raise
+    
+    def complete_step(self, task_id: str, step_name: str, status: str, error_message: Optional[str] = None):
+        """完成步骤执行"""
+        try:
+            self.update_step_status(task_id, step_name, status, error_message)
+            self.logger.info(f"步骤执行完成: {task_id}.{step_name} -> {status}")
+        except Exception as e:
+            self.logger.error(f"步骤执行完成失败: {e}")
+            raise
+    
+    def get_task_steps(self, task_id: str) -> List[Dict[str, Any]]:
+        """获取任务的所有步骤状态"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                cursor.execute("""
+                    SELECT * FROM data_pipeline_task_steps 
+                    WHERE task_id = %s 
+                    ORDER BY 
+                        CASE step_name 
+                          WHEN 'ddl_generation' THEN 1
+                          WHEN 'qa_generation' THEN 2
+                          WHEN 'sql_validation' THEN 3
+                          WHEN 'training_load' THEN 4
+                          ELSE 5 
+                        END
+                """, (task_id,))
+                
+                return [dict(row) for row in cursor.fetchall()]
+        except Exception as e:
+            self.logger.error(f"获取任务步骤状态失败: {e}")
+            raise
+    
+    def get_step_status(self, task_id: str, step_name: str) -> Optional[Dict[str, Any]]:
+        """获取特定步骤的状态"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                cursor.execute("""
+                    SELECT * FROM data_pipeline_task_steps 
+                    WHERE task_id = %s AND step_name = %s
+                """, (task_id, step_name))
+                
+                result = cursor.fetchone()
+                return dict(result) if result else None
+        except Exception as e:
+            self.logger.error(f"获取步骤状态失败: {e}")
+            raise
+    
+    def get_tasks_list(self, limit: int = 50, offset: int = 0, status_filter: Optional[str] = None) -> List[Dict[str, Any]]:
+        """获取任务列表"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                where_clause = ""
+                params = []
+                
+                if status_filter:
+                    where_clause = "WHERE t.status = %s"
+                    params.append(status_filter)
+                
+                params.extend([limit, offset])
+                
+                # 联表查询获取步骤状态汇总(包含新增字段)
+                cursor.execute(f"""
+                    SELECT 
+                        t.task_id,
+                        t.task_name,
+                        t.task_type,
+                        t.status,
+                        t.parameters,
+                        t.error_message,
+                        t.created_at,
+                        t.started_at,
+                        t.completed_at,
+                        t.created_type,
+                        t.by_user,
+                        t.output_directory,
+                        t.db_name,
+                        COALESCE(t.directory_exists, TRUE) as directory_exists,
+                        t.updated_at,
+                        CASE 
+                            WHEN COUNT(s.step_name) = 0 THEN NULL
+                            WHEN COUNT(s.step_name) FILTER (WHERE s.step_status = 'failed') > 0 THEN 'failed'
+                            WHEN COUNT(s.step_name) FILTER (WHERE s.step_status = 'running') > 0 THEN 'running'
+                            WHEN COUNT(s.step_name) FILTER (WHERE s.step_status = 'completed') = COUNT(s.step_name) THEN 'all_completed'
+                            WHEN COUNT(s.step_name) FILTER (WHERE s.step_status = 'completed') > 0 THEN 'partial_completed'
+                            ELSE 'pending'
+                        END as step_status
+                    FROM data_pipeline_tasks t
+                    LEFT JOIN data_pipeline_task_steps s ON t.task_id = s.task_id
+                    {where_clause}
+                    GROUP BY t.task_id, t.task_name, t.task_type, t.status, t.parameters, t.error_message, 
+                             t.created_at, t.started_at, t.completed_at, t.created_type, t.by_user, 
+                             t.output_directory, t.db_name, t.directory_exists, t.updated_at
+                    ORDER BY t.created_at DESC 
+                    LIMIT %s OFFSET %s
+                """, params)
+                
+                return [dict(row) for row in cursor.fetchall()]
+        except Exception as e:
+            self.logger.error(f"获取任务列表失败: {e}")
+            raise
+    
+    def _get_task_started_at(self, task_id: str) -> Optional[datetime]:
+        """获取任务开始时间"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                cursor.execute("SELECT started_at FROM data_pipeline_tasks WHERE task_id = %s", (task_id,))
+                result = cursor.fetchone()
+                return result[0] if result and result[0] else None
+        except Exception:
+            return None
+    
+    def _build_db_connection_string(self, db_config: dict) -> str:
+        """构建数据库连接字符串"""
+        try:
+            host = db_config.get('host', 'localhost')
+            port = db_config.get('port', 5432)
+            dbname = db_config.get('dbname', 'database')
+            user = db_config.get('user', 'postgres')
+            password = db_config.get('password', '')
+            
+            return f"postgresql://{user}:{password}@{host}:{port}/{dbname}"
+        except Exception:
+            return "postgresql://localhost:5432/database"
+    
+    def _extract_db_name(self, connection_string: str) -> str:
+        """从连接字符串提取数据库名称"""
+        try:
+            if '/' in connection_string:
+                db_name = connection_string.split('/')[-1]
+                if '?' in db_name:
+                    db_name = db_name.split('?')[0]
+                return db_name if db_name else "database"
+            else:
+                return "database"
+        except Exception:
+            return "database"
+
+    def query_tasks_advanced(self, 
+                            page: int = 1,
+                            page_size: int = 20,
+                            status: str = None,
+                            task_name: str = None,
+                            created_by: str = None,
+                            db_name: str = None,
+                            created_time_start: str = None,
+                            created_time_end: str = None,
+                            started_time_start: str = None,
+                            started_time_end: str = None,
+                            completed_time_start: str = None,
+                            completed_time_end: str = None,
+                            sort_by: str = "created_at",
+                            sort_order: str = "desc") -> dict:
+        """
+        高级任务查询,支持复杂筛选、排序、分页
+        
+        Args:
+            page: 页码,必须大于0,默认1
+            page_size: 每页大小,1-100之间,默认20
+            status: 可选,任务状态筛选
+            task_name: 可选,任务名称模糊搜索
+            created_by: 可选,创建者精确匹配
+            db_name: 可选,数据库名称精确匹配
+            created_time_start: 可选,创建时间范围开始
+            created_time_end: 可选,创建时间范围结束
+            started_time_start: 可选,开始时间范围开始
+            started_time_end: 可选,开始时间范围结束
+            completed_time_start: 可选,完成时间范围开始
+            completed_time_end: 可选,完成时间范围结束
+            sort_by: 可选,排序字段,默认"created_at"
+            sort_order: 可选,排序方向,默认"desc"
+        
+        Returns:
+            {
+                "tasks": [...],
+                "pagination": {
+                    "page": 1,
+                    "page_size": 20,
+                    "total": 150,
+                    "total_pages": 8,
+                    "has_next": True,
+                    "has_prev": False
+                }
+            }
+        """
+        try:
+            import time
+            start_time = time.time()
+            
+            # 参数验证和处理
+            page = max(page, 1)
+            page_size = min(max(page_size, 1), 100)  # 限制在1-100之间
+            offset = (page - 1) * page_size
+            
+            # 构建WHERE条件
+            where_conditions = []
+            params = []
+            
+            # 状态筛选
+            if status:
+                where_conditions.append("t.status = %s")
+                params.append(status)
+            
+            # 任务名称模糊搜索
+            if task_name:
+                where_conditions.append("t.task_name ILIKE %s")
+                params.append(f"%{task_name}%")
+            
+            # 创建者精确匹配
+            if created_by:
+                where_conditions.append("t.by_user = %s")
+                params.append(created_by)
+            
+            # 数据库名称精确匹配
+            if db_name:
+                where_conditions.append("t.db_name = %s")
+                params.append(db_name)
+            
+            # 时间范围筛选
+            # 创建时间范围
+            if created_time_start:
+                where_conditions.append("t.created_at >= %s")
+                params.append(created_time_start)
+            if created_time_end:
+                where_conditions.append("t.created_at <= %s")
+                params.append(created_time_end)
+            
+            # 开始时间范围
+            if started_time_start:
+                where_conditions.append("t.started_at >= %s")
+                params.append(started_time_start)
+            if started_time_end:
+                where_conditions.append("t.started_at <= %s")
+                params.append(started_time_end)
+            
+            # 完成时间范围
+            if completed_time_start:
+                where_conditions.append("t.completed_at >= %s")
+                params.append(completed_time_start)
+            if completed_time_end:
+                where_conditions.append("t.completed_at <= %s")
+                params.append(completed_time_end)
+            
+            # 构建WHERE子句
+            where_clause = ""
+            if where_conditions:
+                where_clause = "WHERE " + " AND ".join(where_conditions)
+            
+            # 构建ORDER BY子句
+            # 验证排序字段白名单
+            allowed_sort_fields = ['created_at', 'started_at', 'completed_at', 'task_name', 'status']
+            if sort_by not in allowed_sort_fields:
+                sort_by = 'created_at'
+            
+            # 验证排序方向
+            sort_order_upper = sort_order.upper()
+            if sort_order_upper not in ['ASC', 'DESC']:
+                sort_order_upper = 'DESC'
+            
+            order_clause = f"ORDER BY t.{sort_by} {sort_order_upper}"
+            
+            conn = self._get_connection()
+            with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                # 首先获取总数
+                count_query = f"""
+                    SELECT COUNT(*) as total
+                    FROM data_pipeline_tasks t
+                    {where_clause}
+                """
+                cursor.execute(count_query, params)
+                total_count = cursor.fetchone()['total']
+                
+                # 然后获取分页数据
+                data_params = params + [page_size, offset]
+                data_query = f"""
+                    SELECT 
+                        t.task_id,
+                        t.task_name,
+                        t.task_type,
+                        t.status,
+                        t.parameters,
+                        t.error_message,
+                        t.created_at,
+                        t.started_at,
+                        t.completed_at,
+                        t.created_type,
+                        t.by_user,
+                        t.output_directory,
+                        t.db_name,
+                        COALESCE(t.directory_exists, TRUE) as directory_exists,
+                        t.updated_at,
+                        CASE 
+                            WHEN COUNT(s.step_name) = 0 THEN NULL
+                            WHEN COUNT(s.step_name) FILTER (WHERE s.step_status = 'failed') > 0 THEN 'failed'
+                            WHEN COUNT(s.step_name) FILTER (WHERE s.step_status = 'running') > 0 THEN 'running'
+                            WHEN COUNT(s.step_name) FILTER (WHERE s.step_status = 'completed') = COUNT(s.step_name) THEN 'all_completed'
+                            WHEN COUNT(s.step_name) FILTER (WHERE s.step_status = 'completed') > 0 THEN 'partial_completed'
+                            ELSE 'pending'
+                        END as step_status
+                    FROM data_pipeline_tasks t
+                    LEFT JOIN data_pipeline_task_steps s ON t.task_id = s.task_id
+                    {where_clause}
+                    GROUP BY t.task_id, t.task_name, t.task_type, t.status, t.parameters, t.error_message, 
+                             t.created_at, t.started_at, t.completed_at, t.created_type, t.by_user, 
+                             t.output_directory, t.db_name, t.directory_exists, t.updated_at
+                    {order_clause}
+                    LIMIT %s OFFSET %s
+                """
+                
+                cursor.execute(data_query, data_params)
+                tasks = [dict(row) for row in cursor.fetchall()]
+                
+                # 计算分页信息
+                total_pages = (total_count + page_size - 1) // page_size if page_size > 0 else 1
+                has_next = page < total_pages
+                has_prev = page > 1
+                
+                query_time = time.time() - start_time
+                
+                return {
+                    "tasks": tasks,
+                    "pagination": {
+                        "page": page,
+                        "page_size": page_size,
+                        "total": total_count,
+                        "total_pages": total_pages,
+                        "has_next": has_next,
+                        "has_prev": has_prev
+                    },
+                    "query_time": f"{query_time:.3f}s"
+                }
+                
+        except Exception as e:
+            self.logger.error(f"高级任务查询失败: {e}")
+            raise
+
+    def query_logs_advanced(self,
+                           task_id: str,
+                           page: int = 1,
+                           page_size: int = 50,
+                           level: str = None,
+                           start_time: str = None,
+                           end_time: str = None,
+                           keyword: str = None,
+                           logger_name: str = None,
+                           step_name: str = None,
+                           sort_by: str = "timestamp",
+                           sort_order: str = "desc") -> dict:
+        """
+        高级日志查询,支持复杂筛选、排序、分页
+        
+        Args:
+            task_id: 任务ID
+            page: 页码,必须大于0,默认1
+            page_size: 每页大小,1-500之间,默认50
+            level: 可选,日志级别筛选 (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+            start_time: 可选,开始时间范围 (YYYY-MM-DD HH:MM:SS)
+            end_time: 可选,结束时间范围 (YYYY-MM-DD HH:MM:SS)
+            keyword: 可选,关键字搜索(消息内容模糊匹配)
+            logger_name: 可选,日志记录器名称精确匹配
+            step_name: 可选,执行步骤名称精确匹配
+            sort_by: 可选,排序字段,默认"timestamp"
+            sort_order: 可选,排序方向,默认"desc"
+        
+        Returns:
+            {
+                "logs": [...],
+                "pagination": {
+                    "page": 1,
+                    "page_size": 50,
+                    "total": 1000,
+                    "total_pages": 20,
+                    "has_next": True,
+                    "has_prev": False
+                },
+                "log_file_info": {...}
+            }
+        """
+        try:
+            import time
+            
+            start_query_time = time.time()
+            
+            # 参数验证和处理
+            page = max(page, 1)
+            page_size = min(max(page_size, 1), 500)  # 限制在1-500之间
+            
+            # 获取日志文件路径
+            project_root = Path(__file__).parent.parent.parent
+            task_dir = project_root / "data_pipeline" / "training_data" / task_id
+            log_file = task_dir / "data_pipeline.log"
+            
+            # 检查日志文件是否存在
+            if not log_file.exists():
+                return {
+                    "logs": [],
+                    "pagination": {
+                        "page": page,
+                        "page_size": page_size,
+                        "total": 0,
+                        "total_pages": 0,
+                        "has_next": False,
+                        "has_prev": False
+                    },
+                    "log_file_info": {
+                        "exists": False,
+                        "file_path": str(log_file),
+                        "error": "日志文件不存在"
+                    },
+                    "query_time": f"{time.time() - start_query_time:.3f}s"
+                }
+            
+            # 读取并解析日志文件
+            parsed_logs = self._parse_log_file(log_file)
+            
+            # 应用过滤器
+            filtered_logs = self._filter_logs(
+                parsed_logs,
+                level=level,
+                start_time=start_time,
+                end_time=end_time,
+                keyword=keyword,
+                logger_name=logger_name,
+                step_name=step_name
+            )
+            
+            # 排序
+            sorted_logs = self._sort_logs(filtered_logs, sort_by, sort_order)
+            
+            # 分页
+            total_count = len(sorted_logs)
+            start_index = (page - 1) * page_size
+            end_index = start_index + page_size
+            paginated_logs = sorted_logs[start_index:end_index]
+            
+            # 计算分页信息
+            total_pages = (total_count + page_size - 1) // page_size if page_size > 0 else 1
+            has_next = page < total_pages
+            has_prev = page > 1
+            
+            # 获取文件信息
+            file_stat = log_file.stat()
+            log_file_info = {
+                "exists": True,
+                "file_path": str(log_file),
+                "file_size": file_stat.st_size,
+                "file_size_formatted": self._format_file_size(file_stat.st_size),
+                "last_modified": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                "total_lines": len(parsed_logs)
+            }
+            
+            query_time = time.time() - start_query_time
+            
+            return {
+                "logs": paginated_logs,
+                "pagination": {
+                    "page": page,
+                    "page_size": page_size,
+                    "total": total_count,
+                    "total_pages": total_pages,
+                    "has_next": has_next,
+                    "has_prev": has_prev
+                },
+                "log_file_info": log_file_info,
+                "query_time": f"{query_time:.3f}s"
+            }
+            
+        except Exception as e:
+            self.logger.error(f"日志查询失败: {e}")
+            return {
+                "logs": [],
+                "pagination": {
+                    "page": page,
+                    "page_size": page_size,
+                    "total": 0,
+                    "total_pages": 0,
+                    "has_next": False,
+                    "has_prev": False
+                },
+                "log_file_info": {
+                    "exists": False,
+                    "error": str(e)
+                },
+                "query_time": "0.000s"
+            }
+    
+    def _parse_log_file(self, log_file_path: Path) -> List[Dict[str, Any]]:
+        """
+        解析日志文件,提取结构化信息
+        """
+        try:
+            logs = []
+            with open(log_file_path, 'r', encoding='utf-8') as f:
+                lines = f.readlines()
+            
+            # 日志行格式: 2025-07-01 14:30:52 [INFO] SimpleWorkflowExecutor: 任务开始执行
+            log_pattern = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+?): (.+)$'
+            current_log = None
+            line_number = 0
+            
+            for line in lines:
+                line_number += 1
+                line = line.rstrip('\n\r')
+                
+                if not line.strip():
+                    continue
+                
+                match = re.match(log_pattern, line)
+                if match:
+                    # 如果有之前的日志,先保存
+                    if current_log:
+                        logs.append(current_log)
+                    
+                    # 解析新的日志条目
+                    timestamp, level, logger_name, message = match.groups()
+                    
+                    # 尝试从日志记录器名称中提取步骤信息
+                    step_name = self._extract_step_from_logger(logger_name)
+                    
+                    current_log = {
+                        "timestamp": timestamp,
+                        "level": level,
+                        "logger": logger_name,
+                        "step": step_name,
+                        "message": message,
+                        "line_number": line_number
+                    }
+                else:
+                    # 多行日志(如异常堆栈),追加到当前日志的消息中
+                    if current_log:
+                        current_log["message"] += f"\n{line}"
+            
+            # 保存最后一个日志条目
+            if current_log:
+                logs.append(current_log)
+            
+            return logs
+            
+        except Exception as e:
+            self.logger.error(f"解析日志文件失败: {e}")
+            return []
+    
+    def _extract_step_from_logger(self, logger_name: str) -> Optional[str]:
+        """
+        从日志记录器名称中提取步骤信息
+        """
+        # 映射日志记录器名称到步骤名称
+        logger_to_step = {
+            "DDLGenerator": "ddl_generation",
+            "QAGenerator": "qa_generation", 
+            "QSGenerator": "qa_generation",
+            "SQLValidator": "sql_validation",
+            "TrainingDataLoader": "training_load",
+            "VannaTrainer": "training_load",
+            "SchemaWorkflowOrchestrator": None,  # 总体协调器
+            "SimpleWorkflowExecutor": None,      # 工作流执行器
+        }
+        
+        return logger_to_step.get(logger_name)
+    
+    def _filter_logs(self, logs: List[Dict[str, Any]], **filters) -> List[Dict[str, Any]]:
+        """
+        根据条件过滤日志
+        """
+        filtered = logs
+        
+        # 日志级别过滤
+        if filters.get('level'):
+            level = filters['level'].upper()
+            filtered = [log for log in filtered if log.get('level') == level]
+        
+        # 时间范围过滤
+        if filters.get('start_time'):
+            start_time = filters['start_time']
+            filtered = [log for log in filtered if log.get('timestamp', '') >= start_time]
+        
+        if filters.get('end_time'):
+            end_time = filters['end_time']
+            filtered = [log for log in filtered if log.get('timestamp', '') <= end_time]
+        
+        # 关键字搜索(消息内容模糊匹配)
+        if filters.get('keyword'):
+            keyword = filters['keyword'].lower()
+            filtered = [log for log in filtered 
+                       if keyword in log.get('message', '').lower()]
+        
+        # 日志记录器名称精确匹配
+        if filters.get('logger_name'):
+            logger_name = filters['logger_name']
+            filtered = [log for log in filtered if log.get('logger') == logger_name]
+        
+        # 步骤名称精确匹配
+        if filters.get('step_name'):
+            step_name = filters['step_name']
+            filtered = [log for log in filtered if log.get('step') == step_name]
+        
+        return filtered
+    
+    def _sort_logs(self, logs: List[Dict[str, Any]], sort_by: str, sort_order: str) -> List[Dict[str, Any]]:
+        """
+        对日志进行排序
+        """
+        # 验证排序字段
+        allowed_sort_fields = ['timestamp', 'level', 'logger', 'step', 'line_number']
+        if sort_by not in allowed_sort_fields:
+            sort_by = 'timestamp'
+        
+        # 验证排序方向
+        reverse = sort_order.lower() == 'desc'
+        
+        try:
+            # 特殊处理时间戳排序
+            if sort_by == 'timestamp':
+                return sorted(logs, key=lambda x: x.get('timestamp', ''), reverse=reverse)
+            else:
+                return sorted(logs, key=lambda x: x.get(sort_by, ''), reverse=reverse)
+        except Exception as e:
+            self.logger.error(f"日志排序失败: {e}")
+            return logs
+    
+    def _format_file_size(self, size_bytes: int) -> str:
+        """格式化文件大小显示"""
+        if size_bytes == 0:
+            return "0 B"
+        
+        size_names = ["B", "KB", "MB", "GB"]
+        i = 0
+        size = float(size_bytes)
+        
+        while size >= 1024.0 and i < len(size_names) - 1:
+            size /= 1024.0
+            i += 1
+        
+        return f"{size:.1f} {size_names[i]}"

+ 901 - 0
data_pipeline/api/simple_file_manager.py

@@ -0,0 +1,901 @@
+"""
+Data Pipeline API 简化文件管理器
+
+提供简单的文件列表、下载和上传功能,无压缩等复杂功能
+"""
+
+import os
+from pathlib import Path
+from typing import Dict, Any, List, BinaryIO, Union
+from datetime import datetime
+import tempfile
+import shutil
+
+import logging
+
+
+class SimpleFileManager:
+    """简化的文件管理器"""
+    
+    def __init__(self, base_output_dir: str = None):
+        """
+        初始化文件管理器
+        
+        Args:
+            base_output_dir: 基础输出目录,默认为项目根目录下的 data_pipeline/training_data
+        """
+        if base_output_dir is None:
+            # 获取项目根目录的绝对路径(Path 已在模块顶部导入)
+            project_root = Path(__file__).parent.parent.parent
+            base_output_dir = str(project_root / "data_pipeline" / "training_data")
+        self.base_output_dir = Path(base_output_dir)
+        # 使用简单的控制台日志,不使用文件日志
+        self.logger = logging.getLogger("SimpleFileManager")
+        self.logger.setLevel(logging.INFO)
+        
+        # 确保基础目录存在
+        self.base_output_dir.mkdir(parents=True, exist_ok=True)
+    
+    def get_task_directory(self, task_id: str) -> Path:
+        """获取任务目录路径"""
+        return self.base_output_dir / task_id
+    
+    def create_task_directory(self, task_id: str) -> bool:
+        """创建任务目录"""
+        try:
+            task_dir = self.get_task_directory(task_id)
+            task_dir.mkdir(parents=True, exist_ok=True)
+            self.logger.info(f"任务目录已创建: {task_dir}")
+            return True
+        except Exception as e:
+            self.logger.error(f"创建任务目录失败: {e}")
+            return False
+    
+    def get_task_files(self, task_id: str) -> List[Dict[str, Any]]:
+        """获取任务目录下的所有文件信息"""
+        try:
+            task_dir = self.get_task_directory(task_id)
+            if not task_dir.exists():
+                return []
+            
+            files_info = []
+            for file_path in task_dir.iterdir():
+                if file_path.is_file():
+                    file_info = self._get_file_info(file_path)
+                    files_info.append(file_info)
+            
+            # 按修改时间排序(最新的在前)
+            files_info.sort(key=lambda x: x['modified_at'], reverse=True)
+            return files_info
+            
+        except Exception as e:
+            self.logger.error(f"获取任务文件失败: {e}")
+            return []
+    
+    def _get_file_info(self, file_path: Path) -> Dict[str, Any]:
+        """获取单个文件的基本信息"""
+        try:
+            stat = file_path.stat()
+            
+            return {
+                "file_name": file_path.name,
+                "file_path": str(file_path),
+                "file_type": self._determine_file_type(file_path),
+                "file_size": stat.st_size,
+                "file_size_formatted": self._format_file_size(stat.st_size),
+                "created_at": datetime.fromtimestamp(stat.st_ctime),
+                "modified_at": datetime.fromtimestamp(stat.st_mtime),
+                "is_readable": os.access(file_path, os.R_OK)
+            }
+            
+        except Exception as e:
+            self.logger.error(f"获取文件信息失败: {e}")
+            return {
+                "file_name": file_path.name,
+                "file_path": str(file_path),
+                "file_type": "unknown",
+                "file_size": 0,
+                "file_size_formatted": "0 B",
+                "created_at": datetime.now(),
+                "modified_at": datetime.now(),
+                "is_readable": False
+            }
+    
+    def _determine_file_type(self, file_path: Path) -> str:
+        """根据文件扩展名确定文件类型"""
+        suffix = file_path.suffix.lower()
+        
+        type_mapping = {
+            '.ddl': 'ddl',
+            '.sql': 'sql',
+            '.md': 'markdown',
+            '.markdown': 'markdown',
+            '.json': 'json',
+            '.txt': 'text',
+            '.log': 'log'
+        }
+        
+        return type_mapping.get(suffix, 'other')
+    
+    def _format_file_size(self, size_bytes: int) -> str:
+        """格式化文件大小显示"""
+        if size_bytes == 0:
+            return "0 B"
+        
+        size_names = ["B", "KB", "MB", "GB"]
+        i = 0
+        size = float(size_bytes)
+        
+        while size >= 1024.0 and i < len(size_names) - 1:
+            size /= 1024.0
+            i += 1
+        
+        return f"{size:.1f} {size_names[i]}"
+    
+    def get_file_path(self, task_id: str, file_name: str) -> Path:
+        """获取文件的完整路径"""
+        task_dir = self.get_task_directory(task_id)
+        return task_dir / file_name
+    
+    def file_exists(self, task_id: str, file_name: str) -> bool:
+        """检查文件是否存在"""
+        file_path = self.get_file_path(task_id, file_name)
+        return file_path.exists() and file_path.is_file()
+    
+    def is_file_safe(self, task_id: str, file_name: str) -> bool:
+        """检查文件路径是否安全(防止路径遍历攻击)"""
+        try:
+            task_dir = self.get_task_directory(task_id)
+            file_path = task_dir / file_name
+            
+            # 确保文件在任务目录内
+            file_path.resolve().relative_to(task_dir.resolve())
+            return True
+        except ValueError:
+            return False
+    
+    def get_directory_info(self, task_id: str) -> Dict[str, Any]:
+        """获取任务目录信息"""
+        try:
+            task_dir = self.get_task_directory(task_id)
+            
+            if not task_dir.exists():
+                return {
+                    "exists": False,
+                    "directory_path": str(task_dir),
+                    "total_files": 0,
+                    "total_size": 0,
+                    "total_size_formatted": "0 B"
+                }
+            
+            files = self.get_task_files(task_id)
+            total_size = sum(file_info['file_size'] for file_info in files)
+            
+            return {
+                "exists": True,
+                "directory_path": str(task_dir),
+                "total_files": len(files),
+                "total_size": total_size,
+                "total_size_formatted": self._format_file_size(total_size)
+            }
+            
+        except Exception as e:
+            self.logger.error(f"获取目录信息失败: {e}")
+            return {
+                "exists": False,
+                "directory_path": str(self.get_task_directory(task_id)),
+                "total_files": 0,
+                "total_size": 0,
+                "total_size_formatted": "0 B"
+            }
+    
+    def upload_table_list_file(self, task_id: str, file_obj: Union[BinaryIO, bytes], filename: str = None) -> Dict[str, Any]:
+        """
+        上传表清单文件到指定任务目录
+        
+        Args:
+            task_id: 任务ID
+            file_obj: 文件对象(Flask的FileStorage)或文件内容(字节流)
+            filename: 原始文件名(可选,仅用于日志记录)
+        
+        Returns:
+            Dict: 上传结果,包含filename、file_size、file_size_formatted、upload_time等
+        
+        Raises:
+            ValueError: 文件验证失败(文件太大、空文件、格式错误等)
+            FileNotFoundError: 任务目录不存在且无法创建
+            IOError: 文件操作失败
+        """
+        try:
+            # 获取配置
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+            upload_config = SCHEMA_TOOLS_CONFIG.get("file_upload", {})
+            max_file_size_mb = upload_config.get("max_file_size_mb", 2)
+            max_size = max_file_size_mb * 1024 * 1024  # 转换为字节
+            target_filename = upload_config.get("target_filename", "table_list.txt")
+            allowed_extensions = upload_config.get("allowed_extensions", ["txt"])
+            
+            # 处理文件对象或字节流
+            if isinstance(file_obj, bytes):
+                file_content = file_obj
+                original_filename = filename or "uploaded_file.txt"
+            else:
+                # Flask FileStorage对象
+                if hasattr(file_obj, 'filename') and file_obj.filename:
+                    original_filename = file_obj.filename
+                else:
+                    original_filename = filename or "uploaded_file.txt"
+                
+                # 验证文件扩展名 - 修复:统一格式进行比较
+                file_ext = Path(original_filename).suffix.lower().lstrip('.')
+                if file_ext not in allowed_extensions:
+                    raise ValueError(f"不支持的文件类型,仅支持: {', '.join(['.' + ext for ext in allowed_extensions])}")
+                
+                # 读取文件内容并验证大小
+                file_content = b''
+                chunk_size = 8192
+                total_size = 0
+                
+                while True:
+                    chunk = file_obj.read(chunk_size)
+                    if not chunk:
+                        break
+                    
+                    total_size += len(chunk)
+                    if total_size > max_size:
+                        raise ValueError(f"文件大小超过限制: {max_file_size_mb}MB")
+                    
+                    file_content += chunk
+            
+            # 检查文件内容是否为空
+            if len(file_content) == 0:
+                raise ValueError("文件为空,请选择有效的表清单文件")
+            
+            # 验证文件内容(简单检查是否为文本文件)
+            self._validate_table_list_content_simple(file_content)
+            
+            # 确保任务目录存在
+            task_dir = self.get_task_directory(task_id)
+            if not task_dir.exists():
+                task_dir.mkdir(parents=True, exist_ok=True)
+                self.logger.info(f"创建任务目录: {task_dir}")
+            
+            # 确定目标文件路径
+            target_file_path = task_dir / target_filename
+            
+            # 保存文件
+            with open(target_file_path, 'wb') as f:
+                f.write(file_content)
+            
+            # 验证文件是否成功写入
+            if not target_file_path.exists():
+                raise IOError("文件保存失败")
+            
+            # 获取文件信息
+            file_stat = target_file_path.stat()
+            upload_time = datetime.fromtimestamp(file_stat.st_mtime)
+            
+            self.logger.info(f"成功上传表清单文件到任务 {task_id}: {target_file_path}")
+            
+            return {
+                "filename": target_filename,
+                "original_filename": original_filename,
+                "file_size": file_stat.st_size,
+                "file_size_formatted": self._format_file_size(file_stat.st_size),
+                "upload_time": upload_time,
+                "target_path": str(target_file_path)
+            }
+            
+        except Exception as e:
+            self.logger.error(f"上传表清单文件失败: {e}")
+            raise
+    
+    def _validate_table_list_content_simple(self, file_content: bytes) -> None:
+        """
+        简单验证表清单文件内容
+        
+        Args:
+            file_content: 文件内容(字节流)
+            
+        Raises:
+            ValueError: 文件内容验证失败
+        """
+        try:
+            # 尝试解码文件内容
+            try:
+                content = file_content.decode('utf-8')
+            except UnicodeDecodeError:
+                try:
+                    content = file_content.decode('gbk')
+                except UnicodeDecodeError:
+                    raise ValueError("文件编码错误,请确保文件为UTF-8或GBK格式")
+            
+            # 检查文件是否为空
+            if not content.strip():
+                raise ValueError("表清单文件为空")
+            
+            # 简单验证:检查是否包含至少一个非空行
+            lines = [line.strip() for line in content.split('\n') if line.strip()]
+            if not lines:
+                raise ValueError("表清单文件不包含有效的表名")
+            
+            # 可选:验证表名格式(避免SQL注入等安全问题)
+            import re
+            table_name_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*)?$')
+            invalid_tables = []
+            
+            for line in lines[:10]:  # 只检查前10行以避免过度验证
+                # 忽略注释行
+                if line.startswith('#') or line.startswith('--'):
+                    continue
+                
+                # 检查表名格式
+                if not table_name_pattern.match(line):
+                    invalid_tables.append(line)
+            
+            if invalid_tables:
+                raise ValueError(f"表清单文件包含无效的表名格式: {', '.join(invalid_tables[:3])}")
+                
+        except ValueError:
+            raise
+        except Exception as e:
+            raise ValueError(f"文件内容验证失败: {str(e)}")
+    
+    def _validate_table_list_content(self, file_content: bytes, config: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        验证表清单文件内容
+        
+        Args:
+            file_content: 文件内容(字节流)
+            config: 文件上传配置
+        
+        Returns:
+            Dict: 验证结果
+        """
+        try:
+            # 解码文件内容
+            encoding = config.get("encoding", "utf-8")
+            try:
+                content = file_content.decode(encoding)
+            except UnicodeDecodeError:
+                # 尝试其他编码
+                for fallback_encoding in ["gbk", "latin1"]:
+                    try:
+                        content = file_content.decode(fallback_encoding)
+                        self.logger.warning(f"文件编码检测为 {fallback_encoding},建议使用 UTF-8")
+                        break
+                    except UnicodeDecodeError:
+                        continue
+                else:
+                    return {
+                        "valid": False,
+                        "error": f"无法解码文件内容,请确保文件编码为 {encoding}"
+                    }
+            
+            # 分析文件内容
+            lines = content.splitlines()
+            total_lines = len(lines)
+            
+            # 过滤空行和注释行
+            valid_lines = []
+            comment_lines = 0
+            empty_lines = 0
+            
+            for line_num, line in enumerate(lines, 1):
+                stripped = line.strip()
+                if not stripped:
+                    empty_lines += 1
+                elif stripped.startswith('#'):
+                    comment_lines += 1
+                else:
+                    # 简单验证表名格式
+                    if self._is_valid_table_name(stripped):
+                        valid_lines.append(stripped)
+                    else:
+                        return {
+                            "valid": False,
+                            "error": f"第 {line_num} 行包含无效的表名: {stripped}",
+                            "details": {
+                                "line_number": line_num,
+                                "invalid_content": stripped
+                            }
+                        }
+            
+            # 检查有效行数
+            min_lines = config.get("min_lines", 1)
+            max_lines = config.get("max_lines", 1000)
+            
+            if len(valid_lines) < min_lines:
+                return {
+                    "valid": False,
+                    "error": f"文件至少需要包含 {min_lines} 个有效表名,当前只有 {len(valid_lines)} 个",
+                    "details": {
+                        "valid_tables": len(valid_lines),
+                        "min_required": min_lines
+                    }
+                }
+            
+            if len(valid_lines) > max_lines:
+                return {
+                    "valid": False,
+                    "error": f"文件包含的表名数量超过限制,最多允许 {max_lines} 个,当前有 {len(valid_lines)} 个",
+                    "details": {
+                        "valid_tables": len(valid_lines),
+                        "max_allowed": max_lines
+                    }
+                }
+            
+            return {
+                "valid": True,
+                "details": {
+                    "total_lines": total_lines,
+                    "empty_lines": empty_lines,
+                    "comment_lines": comment_lines,
+                    "valid_tables": len(valid_lines),
+                    "table_names": valid_lines[:10]  # 只返回前10个作为预览
+                }
+            }
+            
+        except Exception as e:
+            return {
+                "valid": False,
+                "error": f"文件内容验证失败: {str(e)}"
+            }
+    
+    def _is_valid_table_name(self, table_name: str) -> bool:
+        """
+        验证表名格式是否有效
+        
+        Args:
+            table_name: 表名
+        
+        Returns:
+            bool: 是否有效
+        """
+        import re
+        
+        # 基本的表名格式检查
+        # 支持: table_name, schema.table_name
+        pattern = r'^[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*)?$'
+        return bool(re.match(pattern, table_name))
+    
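# Illustrative sketch (hypothetical table names): what the pattern used by
# _is_valid_table_name() above accepts and rejects. Nothing is assumed beyond the regex itself.
import re

table_name_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*)?$')
assert table_name_pattern.match("ods_orders")              # bare table name
assert table_name_pattern.match("public.dim_region")       # schema-qualified name
assert not table_name_pattern.match("1_orders")            # must not start with a digit
assert not table_name_pattern.match("public.ods.orders")   # at most one schema qualifier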
+    def get_table_list_file_info(self, task_id: str) -> Dict[str, Any]:
+        """
+        获取任务的表清单文件信息
+        
+        Args:
+            task_id: 任务ID
+        
+        Returns:
+            Dict: 文件信息(文件不存在或查询失败时 exists 为 False)
+        """
+        try:
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+            upload_config = SCHEMA_TOOLS_CONFIG.get("file_upload", {})
+            target_filename = upload_config.get("target_filename", "table_list.txt")
+            
+            file_path = self.get_file_path(task_id, target_filename)
+            
+            if not file_path.exists():
+                return {
+                    "exists": False,
+                    "file_name": target_filename,
+                    "expected_path": str(file_path)
+                }
+            
+            file_stat = file_path.stat()
+            
+            # 尝试读取文件内容进行分析
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                    lines = content.splitlines()
+                    valid_tables = [line.strip() for line in lines 
+                                   if line.strip() and not line.strip().startswith('#')]
+            except Exception:
+                valid_tables = []
+            
+            return {
+                "exists": True,
+                "file_name": target_filename,
+                "file_path": str(file_path),
+                "file_size": file_stat.st_size,
+                "file_size_formatted": self._format_file_size(file_stat.st_size),
+                "uploaded_at": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                "created_at": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                "table_count": len(valid_tables),
+                "is_readable": os.access(file_path, os.R_OK)
+            }
+            
+        except Exception as e:
+            self.logger.error(f"获取表清单文件信息失败: {e}")
+            return {
+                "exists": False,
+                "error": str(e)
+            }
+    
+    def create_table_list_from_names(self, task_id: str, table_names: List[str]) -> Dict[str, Any]:
+        """
+        从表名列表创建table_list.txt文件
+        
+        Args:
+            task_id: 任务ID
+            table_names: 表名列表
+        
+        Returns:
+            Dict: 创建结果,包含filename、table_count、file_size等信息
+        
+        Raises:
+            ValueError: 表名验证失败(表名格式错误、空列表等)
+            IOError: 文件操作失败
+        """
+        try:
+            # 获取配置
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+            upload_config = SCHEMA_TOOLS_CONFIG.get("file_upload", {})
+            target_filename = upload_config.get("target_filename", "table_list.txt")
+            max_lines = upload_config.get("max_lines", 1000)
+            min_lines = upload_config.get("min_lines", 1)
+            
+            # 验证输入:先检查类型,再检查是否为空
+            if not isinstance(table_names, list):
+                raise ValueError("表名必须是列表格式")
+            
+            if not table_names:
+                raise ValueError("表名列表不能为空")
+            
+            # 处理和验证表名
+            processed_tables = self._process_table_names(table_names)
+            
+            # 验证表名数量
+            if len(processed_tables) < min_lines:
+                raise ValueError(f"表名数量不能少于 {min_lines} 个")
+            
+            if len(processed_tables) > max_lines:
+                raise ValueError(f"表名数量不能超过 {max_lines} 个")
+            
+            # 确保任务目录存在
+            task_dir = self.get_task_directory(task_id)
+            if not task_dir.exists():
+                task_dir.mkdir(parents=True, exist_ok=True)
+                self.logger.info(f"创建任务目录: {task_dir}")
+            
+            # 确定目标文件路径
+            target_file_path = task_dir / target_filename
+            
+            # 生成文件内容
+            file_content = self._generate_table_list_content(processed_tables)
+            
+            # 写入文件(覆盖模式)
+            with open(target_file_path, 'w', encoding='utf-8') as f:
+                f.write(file_content)
+            
+            # 验证文件是否成功写入
+            if not target_file_path.exists():
+                raise IOError("文件创建失败")
+            
+            # 获取文件信息
+            file_stat = target_file_path.stat()
+            created_time = datetime.fromtimestamp(file_stat.st_mtime)
+            
+            self.logger.info(f"成功创建表清单文件到任务 {task_id}: {target_file_path} ({len(processed_tables)} 个表)")
+            
+            return {
+                "filename": target_filename,
+                "table_count": len(processed_tables),
+                "unique_table_count": len(set(processed_tables)),
+                "file_size": file_stat.st_size,
+                "file_size_formatted": self._format_file_size(file_stat.st_size),
+                "created_time": created_time,
+                "target_path": str(target_file_path)
+            }
+            
+        except Exception as e:
+            self.logger.error(f"创建表清单文件失败: {e}")
+            raise
+    
+    def _process_table_names(self, table_names: List[str]) -> List[str]:
+        """
+        处理表名列表:验证格式、去重、排序
+        
+        Args:
+            table_names: 原始表名列表
+            
+        Returns:
+            List[str]: 处理后的表名列表
+            
+        Raises:
+            ValueError: 表名格式验证失败
+        """
+        processed_tables = []
+        invalid_tables = []
+        
+        for table_name in table_names:
+            # 去除空白
+            table_name = table_name.strip()
+            
+            # 跳过空字符串
+            if not table_name:
+                continue
+            
+            # 跳过注释行
+            if table_name.startswith('#') or table_name.startswith('--'):
+                continue
+            
+            # 验证表名格式
+            if self._is_valid_table_name(table_name):
+                processed_tables.append(table_name)
+            else:
+                invalid_tables.append(table_name)
+        
+        # 如果有无效表名,抛出异常
+        if invalid_tables:
+            raise ValueError(f"包含无效的表名格式: {', '.join(invalid_tables[:5])}")
+        
+        # 去重并保持顺序
+        seen = set()
+        unique_tables = []
+        for table in processed_tables:
+            if table not in seen:
+                seen.add(table)
+                unique_tables.append(table)
+        
+        return unique_tables
+    
+    def _generate_table_list_content(self, table_names: List[str]) -> str:
+        """
+        生成table_list.txt文件内容
+        
+        Args:
+            table_names: 表名列表
+            
+        Returns:
+            str: 文件内容
+        """
+        lines = []
+        
+        # 添加文件头注释
+        lines.append("# 表清单文件")
+        lines.append(f"# 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        lines.append(f"# 表数量: {len(table_names)}")
+        lines.append("")
+        
+        # 添加表名
+        for table_name in table_names:
+            lines.append(table_name)
+        
+        # 确保文件以换行符结束
+        if lines and lines[-1] != "":
+            lines.append("")
+        
+        return "\n".join(lines)
+    
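# Illustrative output of _generate_table_list_content() above for two hypothetical
# table names ["ods_orders", "public.dim_region"] (the timestamp is made up):
#
#     # 表清单文件
#     # 生成时间: 2025-07-01 10:00:00
#     # 表数量: 2
#
#     ods_orders
#     public.dim_region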
+    # ==================== 文件上传功能 ====================
+    
+    # 支持的文件类型
+    ALLOWED_EXTENSIONS = {'.ddl', '.md', '.txt', '.json', '.sql', '.csv'}
+    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
+    
+    def upload_file_to_task(self, task_id: str, file_stream, filename: str, overwrite_mode: str = "backup") -> Dict[str, Any]:
+        """
+        上传文件到指定任务目录
+        
+        Args:
+            task_id: 任务ID
+            file_stream: 文件流对象
+            filename: 文件名
+            overwrite_mode: 重名处理模式 ("backup", "replace", "skip")
+        
+        Returns:
+            Dict: 上传结果
+        """
+        try:
+            # 1. 验证任务存在
+            task_dir = self.get_task_directory(task_id)
+            if not task_dir.exists():
+                # 创建任务目录
+                task_dir.mkdir(parents=True, exist_ok=True)
+                self.logger.info(f"创建任务目录: {task_dir}")
+            
+            # 2. 验证文件
+            validation_result = self.validate_file_upload(filename, file_stream)
+            if not validation_result["valid"]:
+                raise ValueError(validation_result["error"])
+            
+            # 3. 检查目标文件路径
+            target_file_path = task_dir / filename
+            
+            # 4. 处理重名文件
+            backup_info = None
+            if target_file_path.exists():
+                if overwrite_mode == "skip":
+                    return {
+                        "success": True,
+                        "skipped": True,
+                        "message": f"文件已存在,跳过上传: {filename}",
+                        "task_id": task_id,
+                        "uploaded_file": {
+                            "filename": filename,
+                            "existed": True,
+                            "action": "skipped"
+                        }
+                    }
+                elif overwrite_mode == "backup":
+                    backup_info = self.create_backup_file(target_file_path)
+                # replace 模式不需要特殊处理,直接覆盖
+            
+            # 5. 保存新文件
+            file_content = file_stream.read()
+            with open(target_file_path, 'wb') as f:
+                f.write(file_content)
+            
+            # 6. 获取文件信息
+            file_stat = target_file_path.stat()
+            upload_time = datetime.fromtimestamp(file_stat.st_mtime)
+            
+            self.logger.info(f"文件上传成功: {task_id}/{filename}")
+            
+            # 7. 构建响应
+            result = {
+                "success": True,
+                "task_id": task_id,
+                "uploaded_file": {
+                    "filename": filename,
+                    "size": file_stat.st_size,
+                    "size_formatted": self._format_file_size(file_stat.st_size),
+                    "uploaded_at": upload_time.isoformat(),
+                    "overwrite_mode": overwrite_mode
+                }
+            }
+            
+            if backup_info:
+                result["backup_info"] = backup_info
+            
+            return result
+            
+        except Exception as e:
+            self.logger.error(f"文件上传失败: {e}")
+            raise
+    
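# A minimal usage sketch (hypothetical task id and file content) of upload_file_to_task()
# above; SimpleFileManager is assumed to be constructible without arguments, as it is
# elsewhere in this commit.
import io
from data_pipeline.api.simple_file_manager import SimpleFileManager

manager = SimpleFileManager()
stream = io.BytesIO(b"ods_orders\npublic.dim_region\n")
result = manager.upload_file_to_task(
    task_id="task_20250701_100000",   # hypothetical task id
    file_stream=stream,
    filename="table_list.txt",
    overwrite_mode="backup",          # keep an existing file as table_list.txt_bakN
)
print(result["uploaded_file"]["size_formatted"])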
+    def validate_file_upload(self, filename: str, file_stream) -> Dict[str, Any]:
+        """
+        验证上传文件的合法性
+        
+        Args:
+            filename: 文件名
+            file_stream: 文件流
+        
+        Returns:
+            Dict: 验证结果 {"valid": bool, "error": str}
+        """
+        try:
+            # 1. 检查文件名安全性
+            if not self._is_safe_filename(filename):
+                return {
+                    "valid": False,
+                    "error": f"文件名包含不安全字符: {filename}"
+                }
+            
+            # 2. 检查文件扩展名
+            file_ext = Path(filename).suffix.lower()
+            if file_ext not in self.ALLOWED_EXTENSIONS:
+                return {
+                    "valid": False,
+                    "error": f"不支持的文件类型: {file_ext},允许的类型: {', '.join(self.ALLOWED_EXTENSIONS)}"
+                }
+            
+            # 3. 检查文件大小
+            if hasattr(file_stream, 'seek') and hasattr(file_stream, 'tell'):
+                # 获取文件大小
+                current_pos = file_stream.tell()
+                file_stream.seek(0, 2)  # 移动到文件末尾
+                file_size = file_stream.tell()
+                file_stream.seek(current_pos)  # 恢复原位置
+                
+                if file_size > self.MAX_FILE_SIZE:
+                    return {
+                        "valid": False,
+                        "error": f"文件大小超出限制: {self._format_file_size(file_size)},最大允许: {self._format_file_size(self.MAX_FILE_SIZE)}"
+                    }
+                
+                if file_size == 0:
+                    return {
+                        "valid": False,
+                        "error": "文件为空"
+                    }
+            
+            return {"valid": True}
+            
+        except Exception as e:
+            return {
+                "valid": False,
+                "error": f"文件验证失败: {str(e)}"
+            }
+    
+    def _is_safe_filename(self, filename: str) -> bool:
+        """检查文件名是否安全"""
+        import re
+        
+        # 禁止的字符和模式
+        dangerous_patterns = [
+            r'\.\.',  # 路径遍历
+            r'[<>:"|?*]',  # Windows 禁止字符
+            r'[\x00-\x1f]',  # 控制字符
+        ]
+        
+        # 禁止的文件名
+        dangerous_names = [
+            'CON', 'PRN', 'AUX', 'NUL',
+            'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
+            'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9'
+        ]
+        
+        # 检查危险模式
+        for pattern in dangerous_patterns:
+            if re.search(pattern, filename):
+                return False
+        
+        # 检查危险文件名
+        name_without_ext = Path(filename).stem.upper()
+        if name_without_ext in dangerous_names:
+            return False
+        
+        # 检查长度
+        if len(filename) > 255:
+            return False
+        
+        return True
+    
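# A minimal sketch (hypothetical file names) of the filename checks above.
from data_pipeline.api.simple_file_manager import SimpleFileManager

fm = SimpleFileManager()
assert fm._is_safe_filename("table_list.txt")
assert not fm._is_safe_filename("../etc/passwd")   # rejected: '..' path traversal
assert not fm._is_safe_filename("CON.txt")         # rejected: reserved Windows device name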
+    def find_next_backup_version(self, file_path: Path) -> int:
+        """
+        查找下一个可用的备份版本号
+        
+        Args:
+            file_path: 原文件路径
+        
+        Returns:
+            int: 下一个可用的版本号
+        """
+        version = 1
+        while True:
+            backup_path = Path(str(file_path) + f"_bak{version}")
+            if not backup_path.exists():
+                return version
+            version += 1
+            # 防止无限循环
+            if version > 1000:
+                raise ValueError("备份版本号超出限制")
+    
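# Illustration with hypothetical paths: if table_list.txt and table_list.txt_bak1 already
# exist in the task directory, find_next_backup_version() returns 2, and create_backup_file()
# below copies the current file to table_list.txt_bak2 before it is overwritten.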
+    def create_backup_file(self, original_path: Path) -> Dict[str, Any]:
+        """
+        创建备份文件
+        
+        Args:
+            original_path: 原文件路径
+        
+        Returns:
+            Dict: 备份信息
+        """
+        try:
+            # 找到下一个可用的版本号
+            version = self.find_next_backup_version(original_path)
+            backup_path = Path(str(original_path) + f"_bak{version}")
+            
+            # 创建备份
+            shutil.copy2(original_path, backup_path)
+            
+            backup_time = datetime.now()
+            
+            self.logger.info(f"创建备份文件: {backup_path}")
+            
+            return {
+                "had_existing_file": True,
+                "backup_filename": backup_path.name,
+                "backup_version": version,
+                "backup_created_at": backup_time.isoformat()
+            }
+            
+        except Exception as e:
+            self.logger.error(f"创建备份文件失败: {e}")
+            raise

+ 628 - 0
data_pipeline/api/simple_workflow.py

@@ -0,0 +1,628 @@
+"""
+Data Pipeline API 简化任务工作流
+
+集成简化后的数据库管理器和文件管理器,提供任务执行功能
+"""
+
+import asyncio
+import json
+import os
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Any, Optional, List
+from contextlib import contextmanager
+
+from data_pipeline.schema_workflow import SchemaWorkflowOrchestrator
+from data_pipeline.api.simple_db_manager import SimpleTaskManager
+from data_pipeline.api.simple_file_manager import SimpleFileManager
+from data_pipeline.dp_logging import get_logger
+
+
+class SimpleWorkflowExecutor:
+    """简化的任务工作流执行器"""
+    
+    def __init__(self, task_id: str):
+        """
+        初始化工作流执行器
+        
+        Args:
+            task_id: 任务ID
+        """
+        self.task_id = task_id
+        self.logger = get_logger("SimpleWorkflowExecutor", task_id)
+        
+        # 初始化管理器
+        self.task_manager = SimpleTaskManager()
+        self.file_manager = SimpleFileManager()
+        
+        # 任务目录日志记录器
+        self.task_dir_logger = None
+        
+        # 加载任务信息
+        self.task_info = None
+        self.task_params = None
+        self._load_task_info()
+    
+    def _load_task_info(self):
+        """加载任务信息"""
+        try:
+            self.task_info = self.task_manager.get_task(self.task_id)
+            if self.task_info:
+                self.task_params = self.task_info.get('parameters', {})
+            else:
+                raise ValueError(f"任务不存在: {self.task_id}")
+        except Exception as e:
+            self.logger.error(f"加载任务信息失败: {e}")
+            raise
+    
+    def _ensure_task_directory(self) -> bool:
+        """确保任务目录存在"""
+        try:
+            success = self.file_manager.create_task_directory(self.task_id)
+            if success:
+                # 写入任务配置文件
+                self._write_task_config()
+                # 初始化任务目录日志记录器
+                self._setup_task_directory_logger()
+            return success
+        except Exception as e:
+            self.logger.error(f"创建任务目录失败: {e}")
+            return False
+    
+    def _write_task_config(self):
+        """写入任务配置文件"""
+        try:
+            task_dir = self.file_manager.get_task_directory(self.task_id)
+            config_file = task_dir / "task_config.json"
+            
+            config_data = {
+                "task_id": self.task_id,
+                "created_at": self.task_info.get('created_at').isoformat() if self.task_info.get('created_at') else None,
+                "parameters": self.task_params,
+                "output_directory": str(task_dir)
+            }
+            
+            with open(config_file, 'w', encoding='utf-8') as f:
+                json.dump(config_data, f, ensure_ascii=False, indent=2, default=str)
+                
+        except Exception as e:
+            self.logger.error(f"写入任务配置失败: {e}")
+    
+    def _setup_task_directory_logger(self):
+        """设置任务目录日志记录器"""
+        try:
+            task_dir = self.file_manager.get_task_directory(self.task_id)
+            log_file = task_dir / "data_pipeline.log"
+            
+            # 创建专门的任务目录日志记录器
+            self.task_dir_logger = logging.getLogger(f"TaskDir_{self.task_id}")
+            self.task_dir_logger.setLevel(logging.DEBUG)
+            
+            # 清除已有处理器
+            self.task_dir_logger.handlers.clear()
+            self.task_dir_logger.propagate = False
+            
+            # 创建文件处理器
+            file_handler = logging.FileHandler(log_file, encoding='utf-8')
+            file_handler.setLevel(logging.DEBUG)
+            
+            # 设置详细的日志格式
+            formatter = logging.Formatter(
+                '%(asctime)s [%(levelname)s] %(name)s: %(message)s',
+                datefmt='%Y-%m-%d %H:%M:%S'
+            )
+            file_handler.setFormatter(formatter)
+            
+            self.task_dir_logger.addHandler(file_handler)
+            
+            # 记录初始化信息
+            self.task_dir_logger.info(f"任务目录日志初始化完成 - 任务ID: {self.task_id}")
+            self.task_dir_logger.info(f"任务参数: {json.dumps(self.task_params, ensure_ascii=False, default=str)}")
+            
+        except Exception as e:
+            self.logger.error(f"设置任务目录日志记录器失败: {e}")
+    
+    def _log_to_task_directory(self, level: str, message: str, step_name: str = None):
+        """记录日志到任务目录"""
+        if self.task_dir_logger:
+            try:
+                if step_name:
+                    message = f"[{step_name}] {message}"
+                
+                log_level = getattr(logging, level.upper(), logging.INFO)
+                self.task_dir_logger.log(log_level, message)
+            except Exception as e:
+                self.logger.error(f"记录任务目录日志失败: {e}")
+    
+    def _resolve_table_list_file_path(self) -> str:
+        """解析表清单文件路径"""
+        table_list_file = self.task_params['table_list_file']
+        
+        # 检查是否使用文件上传模式
+        if self.task_params.get('file_upload_mode', False) or '{task_directory}' in table_list_file:
+            # 文件上传模式:检查任务目录中的文件
+            task_dir = self.file_manager.get_task_directory(self.task_id)
+            
+            # 替换占位符
+            if '{task_directory}' in table_list_file:
+                resolved_path = table_list_file.replace('{task_directory}', str(task_dir))
+            else:
+                from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+                upload_config = SCHEMA_TOOLS_CONFIG.get("file_upload", {})
+                target_filename = upload_config.get("target_filename", "table_list.txt")
+                resolved_path = str(task_dir / target_filename)
+            
+            # 检查文件是否存在
+            if not Path(resolved_path).exists():
+                raise FileNotFoundError(
+                    f"表清单文件不存在: {resolved_path}。"
+                    f"请先上传表清单文件到任务 {self.task_id},然后再执行工作流。"
+                )
+            
+            return resolved_path
+        else:
+            # 传统模式:使用指定的文件路径
+            if not Path(table_list_file).exists():
+                raise FileNotFoundError(f"表清单文件不存在: {table_list_file}")
+            return table_list_file
+    
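# Illustration of the placeholder resolution above with hypothetical values; only
# str.replace() is involved.
table_list_file = "{task_directory}/table_list.txt"
task_dir = "./data_pipeline/training_data/task_20250701_100000"   # hypothetical task directory
resolved = table_list_file.replace("{task_directory}", task_dir)
# resolved == "./data_pipeline/training_data/task_20250701_100000/table_list.txt"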
+    def _create_orchestrator(self) -> SchemaWorkflowOrchestrator:
+        """创建工作流编排器"""
+        task_dir = self.file_manager.get_task_directory(self.task_id)
+        
+        # 解析表清单文件路径
+        table_list_file = self._resolve_table_list_file_path()
+        
+        return SchemaWorkflowOrchestrator(
+            db_connection=self.task_params['db_connection'],
+            table_list_file=table_list_file,
+            business_context=self.task_params['business_context'],
+            output_dir=str(task_dir),
+            task_id=self.task_id,  # 传递task_id给编排器
+            enable_sql_validation=self.task_params.get('enable_sql_validation', True),
+            enable_llm_repair=self.task_params.get('enable_llm_repair', True),
+            modify_original_file=self.task_params.get('modify_original_file', True),
+            enable_training_data_load=self.task_params.get('enable_training_data_load', True)
+        )
+    
+    @contextmanager
+    def _step_execution(self, step_name: str):
+        """步骤执行上下文管理器"""
+        execution_id = None
+        
+        try:
+            # 开始执行
+            execution_id = self.task_manager.start_step(self.task_id, step_name)
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("INFO", f"开始执行步骤: {step_name}", step_name)
+            
+            yield execution_id
+            
+            # 成功完成
+            self.task_manager.complete_step(self.task_id, step_name, 'completed')
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("INFO", f"步骤执行完成: {step_name}", step_name)
+            
+        except Exception as e:
+            # 执行失败
+            error_msg = str(e)
+            
+            self.task_manager.complete_step(self.task_id, step_name, 'failed', error_msg)
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("ERROR", f"步骤执行失败: {step_name} - {error_msg}", step_name)
+            raise
+    
+    async def execute_complete_workflow(self) -> Dict[str, Any]:
+        """执行完整工作流"""
+        try:
+            # 确保任务目录存在
+            if not self._ensure_task_directory():
+                raise Exception("无法创建任务目录")
+            
+            # 开始任务
+            self.task_manager.update_task_status(self.task_id, 'in_progress')
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("INFO", "完整工作流任务开始执行")
+            
+            # 创建工作流编排器
+            orchestrator = self._create_orchestrator()
+            
+            # 重定向SchemaWorkflowOrchestrator的日志到任务目录
+            self._redirect_orchestrator_logs(orchestrator)
+            
+            # 分别执行各个步骤,每个步骤都用_step_execution包装
+            try:
+                # 步骤1: DDL/MD生成
+                with self._step_execution("ddl_generation") as execution_id:
+                    self._log_to_task_directory("INFO", "开始执行DDL/MD生成步骤", "ddl_generation")
+                    await orchestrator._execute_step_1_ddl_md_generation()
+                    self._log_to_task_directory("INFO", "DDL/MD生成步骤完成", "ddl_generation")
+                
+                # 步骤2: Question-SQL生成  
+                with self._step_execution("qa_generation") as execution_id:
+                    self._log_to_task_directory("INFO", "开始执行Question-SQL生成步骤", "qa_generation")
+                    await orchestrator._execute_step_2_question_sql_generation()
+                    self._log_to_task_directory("INFO", "Question-SQL生成步骤完成", "qa_generation")
+                
+                # 步骤3: SQL验证(如果启用)
+                if orchestrator.enable_sql_validation:
+                    with self._step_execution("sql_validation") as execution_id:
+                        self._log_to_task_directory("INFO", "开始执行SQL验证步骤", "sql_validation")
+                        await orchestrator._execute_step_3_sql_validation()
+                        self._log_to_task_directory("INFO", "SQL验证步骤完成", "sql_validation")
+                else:
+                    self._log_to_task_directory("INFO", "跳过SQL验证步骤(未启用)", "sql_validation")
+                
+                # 步骤4: 训练数据加载(如果启用)
+                if orchestrator.enable_training_data_load:
+                    with self._step_execution("training_load") as execution_id:
+                        self._log_to_task_directory("INFO", "开始执行训练数据加载步骤", "training_load")
+                        await orchestrator._execute_step_4_training_data_load()
+                        self._log_to_task_directory("INFO", "训练数据加载步骤完成", "training_load")
+                else:
+                    self._log_to_task_directory("INFO", "跳过训练数据加载步骤(未启用)", "training_load")
+                
+                # 获取工作流结果
+                result = {
+                    "success": True,
+                    "workflow_state": orchestrator.workflow_state,
+                    "artifacts": orchestrator.workflow_state.get("artifacts", {})
+                }
+                
+                # 写入结果文件
+                self._write_result_file(result)
+                
+            except Exception as step_error:
+                self.logger.error(f"工作流步骤执行失败: {step_error}")
+                # 记录到任务目录日志
+                self._log_to_task_directory("ERROR", f"工作流步骤执行失败: {step_error}")
+                raise
+            
+            # 完成任务
+            self.task_manager.update_task_status(self.task_id, 'completed')
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("INFO", "完整工作流任务执行完成")
+            
+            return {
+                "success": True,
+                "task_id": self.task_id,
+                "execution_mode": "complete",
+                "result": result
+            }
+            
+        except Exception as e:
+            # 记录错误
+            error_msg = str(e)
+            self.task_manager.update_task_status(self.task_id, 'failed', error_msg)
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("ERROR", f"完整工作流任务执行失败: {error_msg}")
+            
+            return {
+                "success": False,
+                "task_id": self.task_id,
+                "execution_mode": "complete",
+                "error": error_msg
+            }
+    
+    async def execute_single_step(self, step_name: str) -> Dict[str, Any]:
+        """执行单个步骤"""
+        try:
+            # 确保任务目录存在
+            if not self._ensure_task_directory():
+                raise Exception("无法创建任务目录")
+            
+            # 更新任务状态
+            self.task_manager.update_task_status(self.task_id, 'in_progress')
+            
+            # 创建工作流编排器
+            orchestrator = self._create_orchestrator()
+            
+            # 重定向SchemaWorkflowOrchestrator的日志到任务目录
+            self._redirect_orchestrator_logs(orchestrator)
+            
+            # 执行指定步骤
+            result = None
+            with self._step_execution(step_name) as execution_id:
+                if step_name == "ddl_generation":
+                    await orchestrator._execute_step_1_ddl_md_generation()
+                    result = orchestrator.workflow_state["artifacts"].get("ddl_md_generation", {})
+                    
+                elif step_name == "qa_generation":
+                    await orchestrator._execute_step_2_question_sql_generation()
+                    result = orchestrator.workflow_state["artifacts"].get("question_sql_generation", {})
+                    
+                elif step_name == "sql_validation":
+                    await orchestrator._execute_step_3_sql_validation()
+                    result = orchestrator.workflow_state["artifacts"].get("sql_validation", {})
+                    
+                elif step_name == "training_load":
+                    await orchestrator._execute_step_4_training_data_load()
+                    result = orchestrator.workflow_state["artifacts"].get("training_data_load", {})
+                    
+                else:
+                    raise ValueError(f"不支持的步骤: {step_name}")
+                
+                # 写入步骤结果文件
+                self._write_step_result_file(step_name, result)
+            
+            # 检查是否所有步骤都已完成
+            self._update_overall_task_status()
+            
+            return {
+                "success": True,
+                "task_id": self.task_id,
+                "execution_mode": "step",
+                "step_name": step_name,
+                "result": result
+            }
+            
+        except Exception as e:
+            # 记录错误
+            error_msg = str(e)
+            self.task_manager.update_task_status(self.task_id, 'failed', error_msg)
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("ERROR", f"步骤执行失败: {step_name} - {error_msg}", step_name)
+            
+            return {
+                "success": False,
+                "task_id": self.task_id,
+                "execution_mode": "step",
+                "step_name": step_name,
+                "error": error_msg
+            }
+    
+    def _write_result_file(self, result: Dict[str, Any]):
+        """写入完整结果文件"""
+        try:
+            task_dir = self.file_manager.get_task_directory(self.task_id)
+            result_file = task_dir / "task_result.json"
+            
+            with open(result_file, 'w', encoding='utf-8') as f:
+                json.dump(result, f, ensure_ascii=False, indent=2, default=str)
+                
+        except Exception as e:
+            self.logger.error(f"写入结果文件失败: {e}")
+    
+    def _write_step_result_file(self, step_name: str, result: Dict[str, Any]):
+        """写入步骤结果文件"""
+        try:
+            task_dir = self.file_manager.get_task_directory(self.task_id)
+            result_file = task_dir / f"{step_name}_result.json"
+            
+            with open(result_file, 'w', encoding='utf-8') as f:
+                json.dump(result, f, ensure_ascii=False, indent=2, default=str)
+                
+        except Exception as e:
+            self.logger.error(f"写入步骤结果文件失败: {e}")
+    
+    def _update_overall_task_status(self):
+        """更新整体任务状态"""
+        try:
+            # 检查所有步骤的完成情况
+            steps = self.task_manager.get_task_steps(self.task_id)
+            
+            completed_steps = set()
+            failed_steps = set()
+            
+            for step in steps:
+                if step['step_status'] == 'completed':
+                    completed_steps.add(step['step_name'])
+                elif step['step_status'] == 'failed':
+                    failed_steps.add(step['step_name'])
+            
+            # 检查是否有失败的步骤
+            if failed_steps:
+                self.task_manager.update_task_status(self.task_id, 'failed')
+                return
+            
+            # 检查是否完成了必要步骤
+            required_steps = {"ddl_generation", "qa_generation"}
+            if required_steps.issubset(completed_steps):
+                # 检查是否有可选步骤完成
+                optional_steps = {"sql_validation", "training_load"}
+                if completed_steps.intersection(optional_steps):
+                    if len(completed_steps) >= 3:
+                        self.task_manager.update_task_status(self.task_id, 'completed')
+                    else:
+                        self.task_manager.update_task_status(self.task_id, 'partial_completed')
+                else:
+                    self.task_manager.update_task_status(self.task_id, 'partial_completed')
+            
+        except Exception as e:
+            self.logger.error(f"更新任务状态失败: {e}")
+    
+    def _redirect_orchestrator_logs(self, orchestrator):
+        """重定向SchemaWorkflowOrchestrator的日志到任务目录"""
+        if self.task_dir_logger and hasattr(orchestrator, 'logger'):
+            try:
+                # 为orchestrator的logger添加任务目录文件处理器
+                for handler in self.task_dir_logger.handlers:
+                    if isinstance(handler, logging.FileHandler):
+                        orchestrator.logger.addHandler(handler)
+                        break
+            except Exception as e:
+                self.logger.error(f"重定向orchestrator日志失败: {e}")
+    
+    def query_logs_advanced(self,
+                           page: int = 1,
+                           page_size: int = 50,
+                           level: str = None,
+                           start_time: str = None,
+                           end_time: str = None,
+                           keyword: str = None,
+                           logger_name: str = None,
+                           step_name: str = None,
+                           sort_by: str = "timestamp",
+                           sort_order: str = "desc") -> dict:
+        """
+        高级日志查询(工作流层)
+        
+        Args:
+            page: 页码,必须大于0,默认1
+            page_size: 每页大小,1-500之间,默认50
+            level: 可选,日志级别筛选
+            start_time: 可选,开始时间范围
+            end_time: 可选,结束时间范围
+            keyword: 可选,关键字搜索
+            logger_name: 可选,日志记录器名称
+            step_name: 可选,执行步骤名称
+            sort_by: 可选,排序字段
+            sort_order: 可选,排序方向
+            
+        Returns:
+            日志查询结果
+        """
+        try:
+            # 调用数据库层方法
+            result = self.task_manager.query_logs_advanced(
+                task_id=self.task_id,
+                page=page,
+                page_size=page_size,
+                level=level,
+                start_time=start_time,
+                end_time=end_time,
+                keyword=keyword,
+                logger_name=logger_name,
+                step_name=step_name,
+                sort_by=sort_by,
+                sort_order=sort_order
+            )
+            
+            # 记录查询操作
+            self.logger.info(f"日志查询完成: {self.task_id}, 页码: {page}, 结果数: {len(result.get('logs', []))}")
+            
+            return result
+            
+        except Exception as e:
+            self.logger.error(f"日志查询失败: {e}")
+            return {
+                "logs": [],
+                "pagination": {
+                    "page": page,
+                    "page_size": page_size,
+                    "total": 0,
+                    "total_pages": 0,
+                    "has_next": False,
+                    "has_prev": False
+                },
+                "log_file_info": {
+                    "exists": False,
+                    "error": str(e)
+                },
+                "query_time": "0.000s"
+            }
+    
+    def cleanup(self):
+        """清理资源"""
+        try:
+            # 清理任务目录日志记录器
+            if self.task_dir_logger:
+                for handler in self.task_dir_logger.handlers:
+                    handler.close()
+                self.task_dir_logger.handlers.clear()
+                
+            self.task_manager.close_connection()
+        except Exception as e:
+            self.logger.error(f"清理资源失败: {e}")
+
+
+class SimpleWorkflowManager:
+    """简化的任务工作流管理器"""
+    
+    def __init__(self):
+        """初始化工作流管理器"""
+        self.task_manager = SimpleTaskManager()
+        self.file_manager = SimpleFileManager()
+        # 使用简单的控制台日志,不使用文件日志
+        self.logger = logging.getLogger("SimpleWorkflowManager")
+        self.logger.setLevel(logging.INFO)
+    
+    def create_task(self, 
+                   table_list_file: str = None,
+                   business_context: str = None,
+                   db_name: str = None,
+                   **kwargs) -> str:
+        """创建新任务"""
+        try:
+            # 如果提供了table_list_file,验证文件存在
+            if table_list_file and not os.path.exists(table_list_file):
+                raise FileNotFoundError(f"表清单文件不存在: {table_list_file}")
+            
+            # 创建任务(使用app_config中的数据库配置)
+            task_id = self.task_manager.create_task(
+                table_list_file=table_list_file,
+                business_context=business_context,
+                db_name=db_name,
+                **kwargs
+            )
+            
+            return task_id
+            
+        except Exception as e:
+            self.logger.error(f"创建任务失败: {e}")
+            raise
+    
+    async def execute_task(self, 
+                          task_id: str,
+                          execution_mode: str = "complete",
+                          step_name: Optional[str] = None) -> Dict[str, Any]:
+        """执行任务"""
+        executor = None
+        try:
+            executor = SimpleWorkflowExecutor(task_id)
+            
+            if execution_mode == "complete":
+                return await executor.execute_complete_workflow()
+            elif execution_mode == "step":
+                if not step_name:
+                    raise ValueError("步骤执行模式需要指定step_name")
+                return await executor.execute_single_step(step_name)
+            else:
+                raise ValueError(f"不支持的执行模式: {execution_mode}")
+                
+        finally:
+            if executor:
+                executor.cleanup()
+    
+    def get_task_status(self, task_id: str) -> Optional[Dict[str, Any]]:
+        """获取任务状态"""
+        return self.task_manager.get_task(task_id)
+    
+    def get_task_files(self, task_id: str) -> List[Dict[str, Any]]:
+        """获取任务文件列表"""
+        return self.file_manager.get_task_files(task_id)
+    
+    def get_task_steps(self, task_id: str) -> List[Dict[str, Any]]:
+        """获取任务步骤状态"""
+        return self.task_manager.get_task_steps(task_id)
+    
+    def get_tasks_list(self, **kwargs) -> List[Dict[str, Any]]:
+        """获取任务列表"""
+        return self.task_manager.get_tasks_list(**kwargs)
+    
+    def query_tasks_advanced(self, **kwargs) -> dict:
+        """
+        高级任务查询,支持复杂筛选、排序、分页
+        
+        Args:
+            **kwargs: 传递给数据库层的查询参数
+        
+        Returns:
+            包含任务列表和分页信息的字典
+        """
+        return self.task_manager.query_tasks_advanced(**kwargs)
+    
+    def cleanup(self):
+        """清理资源"""
+        try:
+            self.task_manager.close_connection()
+        except Exception as e:
+            self.logger.error(f"清理资源失败: {e}")
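A minimal usage sketch for the two classes above, assuming the task manager fills in the database connection from app_config (as the comment in create_task indicates) and that the table list file exists at the given hypothetical path:

import asyncio
from data_pipeline.api.simple_workflow import SimpleWorkflowManager

manager = SimpleWorkflowManager()
task_id = manager.create_task(
    table_list_file="./data_pipeline/training_data/tables.txt",  # hypothetical path
    business_context="电商系统",
)
try:
    result = asyncio.run(manager.execute_task(task_id, execution_mode="complete"))
    print(result["success"], result.get("error"))
finally:
    manager.cleanup()

For a single step, execution_mode="step" together with step_name (e.g. "ddl_generation") can be passed instead.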

+ 370 - 0
data_pipeline/api/table_inspector_api.py

@@ -0,0 +1,370 @@
+"""
+表检查API模块
+
+复用data_pipeline中的数据库连接和查询功能,提供独立的表信息查询API
+"""
+
+import asyncio
+import asyncpg
+import logging
+from typing import List, Optional, Dict, Any
+from data_pipeline.tools.database_inspector import DatabaseInspectorTool
+
+
+class TableInspectorAPI:
+    """表检查API类,复用现有的数据库功能"""
+    
+    def __init__(self):
+        self.logger = logging.getLogger("TableInspectorAPI")
+        self.db_inspector = None
+    
+    async def get_tables_list(self, db_connection: str, schema: Optional[str] = None, table_name_pattern: Optional[str] = None) -> List[str]:
+        """
+        获取数据库表列表
+        
+        Args:
+            db_connection: 完整的PostgreSQL连接字符串
+            schema: 可选的schema参数,支持多个schema用逗号分隔
+                   如果为None或空字符串,则只返回public schema的表
+            table_name_pattern: 可选的表名模式匹配,支持通配符
+                               - ods_* : 以"ods_"开头的表
+                               - *_dim : 以"_dim"结尾的表
+                               - *fact* : 包含"fact"的表
+                               - ods_% : 直接使用SQL LIKE语法
+        
+        Returns:
+            表名列表,格式为 schema.tablename
+        """
+        try:
+            # 创建数据库检查器实例
+            self.db_inspector = DatabaseInspectorTool(db_connection=db_connection)
+            
+            # 创建连接池
+            await self.db_inspector._create_connection_pool()
+            
+            # 解析schema参数
+            target_schemas = self._parse_schemas(schema)
+            
+            # 查询表列表
+            tables = await self._query_tables(target_schemas, table_name_pattern)
+            
+            return tables
+            
+        except Exception as e:
+            self.logger.error(f"获取表列表失败: {e}")
+            raise
+        finally:
+            # 清理连接池
+            if self.db_inspector and self.db_inspector.connection_pool:
+                await self.db_inspector.connection_pool.close()
+    
+    def _parse_schemas(self, schema: Optional[str]) -> List[str]:
+        """
+        解析schema参数
+        
+        Args:
+            schema: schema参数,可以是单个schema或逗号分隔的多个schema
+        
+        Returns:
+            schema列表
+        """
+        if not schema or schema.strip() == "":
+            # 如果没有指定schema,默认只查询public schema
+            return ["public"]
+        
+        # 解析逗号分隔的schema
+        schemas = [s.strip() for s in schema.split(",") if s.strip()]
+        
+        # 如果解析后为空,回退到public
+        if not schemas:
+            return ["public"]
+        
+        return schemas
+    
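# Illustrative results of the parsing above (hypothetical inputs):
#   _parse_schemas(None)         -> ["public"]
#   _parse_schemas("")           -> ["public"]
#   _parse_schemas("public,ods") -> ["public", "ods"]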
+    async def _query_tables(self, schemas: List[str], table_name_pattern: Optional[str] = None) -> List[str]:
+        """
+        查询指定schema中的表
+        
+        Args:
+            schemas: schema列表
+            table_name_pattern: 可选的表名模式匹配,支持通配符
+                               - ods_* : 以"ods_"开头的表
+                               - *_dim : 以"_dim"结尾的表
+                               - *fact* : 包含"fact"的表
+                               - ods_% : 直接使用SQL LIKE语法
+        
+        Returns:
+            表名列表,格式为 schema.tablename
+        """
+        tables = []
+        
+        async with self.db_inspector.connection_pool.acquire() as conn:
+            for schema in schemas:
+                # 构建查询语句
+                if table_name_pattern:
+                    # 转换通配符模式为SQL LIKE语法
+                    sql_pattern = self._convert_wildcard_to_sql_like(table_name_pattern)
+                    
+                    query = """
+                    SELECT schemaname, tablename 
+                    FROM pg_tables 
+                    WHERE schemaname = $1 AND tablename LIKE $2
+                    ORDER BY tablename
+                    """
+                    
+                    rows = await conn.fetch(query, schema, sql_pattern)
+                else:
+                    # 没有表名模式,查询所有表
+                    query = """
+                    SELECT schemaname, tablename 
+                    FROM pg_tables 
+                    WHERE schemaname = $1
+                    ORDER BY tablename
+                    """
+                    
+                    rows = await conn.fetch(query, schema)
+                
+                # 格式化表名为 schema.tablename
+                for row in rows:
+                    schema_name = row['schemaname']
+                    table_name = row['tablename']
+                    full_table_name = f"{schema_name}.{table_name}"
+                    tables.append(full_table_name)
+        
+        # 按名称排序
+        tables.sort()
+        
+        pattern_info = f",表名模式: {table_name_pattern}" if table_name_pattern else ""
+        self.logger.info(f"查询到 {len(tables)} 个表,schemas: {schemas}{pattern_info}")
+        
+        return tables
+    
+    async def get_table_ddl(self, db_connection: str, table: str, business_context: str = None, output_type: str = "ddl") -> Dict[str, Any]:
+        """
+        获取表的DDL语句或MD文档
+        
+        Args:
+            db_connection: 数据库连接字符串
+            table: 表名,格式为 schema.tablename
+            business_context: 业务上下文描述
+            output_type: 输出类型,支持 "ddl", "md", "both"
+        
+        Returns:
+            包含DDL/MD内容的字典
+        """
+        try:
+            # 解析表名
+            schema_name, table_name = self._parse_table_name(table)
+            
+            # 导入必要的模块
+            from data_pipeline.tools.database_inspector import DatabaseInspectorTool
+            from data_pipeline.tools.comment_generator import CommentGeneratorTool
+            from data_pipeline.tools.ddl_generator import DDLGeneratorTool
+            from data_pipeline.tools.doc_generator import DocGeneratorTool
+            from data_pipeline.tools.data_sampler import DataSamplerTool
+            from data_pipeline.utils.data_structures import TableMetadata, TableProcessingContext
+            from core.vanna_llm_factory import create_vanna_instance
+            
+            # 创建数据库检查器实例
+            db_inspector = DatabaseInspectorTool(db_connection=db_connection)
+            await db_inspector._create_connection_pool()
+            
+            # 创建表元数据对象
+            table_metadata = TableMetadata(
+                table_name=table_name,
+                schema_name=schema_name,
+                full_name=f"{schema_name}.{table_name}",
+                fields=[],
+                comment=None,
+                sample_data=[]
+            )
+            
+            # 获取全局Vanna实例(仅用于LLM调用,不修改其数据库连接)
+            from common.vanna_instance import get_vanna_instance
+            vn = get_vanna_instance()
+            self.logger.info("使用全局Vanna单例实例进行LLM调用(不修改其数据库连接)")
+            
+            # 创建处理上下文
+            context = TableProcessingContext(
+                table_metadata=table_metadata,
+                business_context=business_context or "数据库管理系统",
+                output_dir="/tmp",  # 临时目录,API不会真正写文件
+                pipeline="api_direct",  # API直接调用标识
+                vn=vn,
+                file_manager=None,  # 不需要文件管理器
+                step_results={}
+            )
+            
+            # 第1步:获取表结构信息
+            self.logger.info(f"开始获取表结构: {table}")
+            inspect_result = await db_inspector.execute(context)
+            if not inspect_result.success:
+                raise Exception(f"获取表结构失败: {inspect_result.error_message}")
+            
+            # 第2步:获取样例数据(用于生成更好的注释)
+            self.logger.info("开始获取样例数据")
+            try:
+                data_sampler = DataSamplerTool(vn=vn, db_connection=db_connection)
+                sample_result = await data_sampler.execute(context)
+                if sample_result.success:
+                    self.logger.info("样例数据获取成功")
+                else:
+                    self.logger.warning(f"样例数据获取失败: {sample_result.error_message}")
+            except Exception as e:
+                self.logger.warning(f"样例数据获取异常: {e}")
+            
+            # 第3步:生成注释(调用LLM)
+            if business_context:
+                self.logger.info("开始生成LLM注释")
+                try:
+                    comment_generator = CommentGeneratorTool(
+                        vn=vn,
+                        business_context=business_context,
+                        db_connection=db_connection
+                    )
+                    comment_result = await comment_generator.execute(context)
+                    if comment_result.success:
+                        self.logger.info("LLM注释生成成功")
+                    else:
+                        self.logger.warning(f"LLM注释生成失败: {comment_result.error_message}")
+                except Exception as e:
+                    self.logger.warning(f"LLM注释生成异常: {e}")
+            
+            # 第4步:根据类型生成输出
+            result = {}
+            
+            if output_type in ["ddl", "both"]:
+                self.logger.info("开始生成DDL")
+                ddl_generator = DDLGeneratorTool()
+                ddl_result = await ddl_generator.execute(context)
+                if ddl_result.success:
+                    result["ddl"] = ddl_result.data.get("ddl_content", "")
+                    # 保存DDL结果供MD生成器使用
+                    context.step_results["ddl_generator"] = ddl_result
+                else:
+                    raise Exception(f"DDL生成失败: {ddl_result.error_message}")
+            
+            if output_type in ["md", "both"]:
+                self.logger.info("开始生成MD文档")
+                doc_generator = DocGeneratorTool()
+                
+                # 直接调用MD生成方法,不依赖文件系统
+                md_content = doc_generator._generate_md_content(
+                    table_metadata, 
+                    result.get("ddl", "")
+                )
+                result["md"] = md_content
+            
+            # 添加表信息摘要
+            result["table_info"] = {
+                "table_name": table_metadata.table_name,
+                "schema_name": table_metadata.schema_name,
+                "full_name": table_metadata.full_name,
+                "comment": table_metadata.comment,
+                "field_count": len(table_metadata.fields),
+                "row_count": table_metadata.row_count,
+                "table_size": table_metadata.table_size
+            }
+            
+            # 添加字段信息
+            result["fields"] = [
+                {
+                    "name": field.name,
+                    "type": field.type,
+                    "nullable": field.nullable,
+                    "comment": field.comment,
+                    "is_primary_key": field.is_primary_key,
+                    "is_foreign_key": field.is_foreign_key,
+                    "default_value": field.default_value,
+                    "is_enum": getattr(field, 'is_enum', False),
+                    "enum_values": getattr(field, 'enum_values', [])
+                }
+                for field in table_metadata.fields
+            ]
+            
+            self.logger.info(f"表DDL生成完成: {table}, 输出类型: {output_type}")
+            return result
+            
+        except Exception as e:
+            self.logger.error(f"获取表DDL失败: {e}")
+            raise
+        finally:
+            # 清理连接池
+            if 'db_inspector' in locals() and db_inspector.connection_pool:
+                await db_inspector.connection_pool.close()
+    
+    def _parse_table_name(self, table: str) -> tuple[str, str]:
+        """
+        解析表名
+        
+        Args:
+            table: 表名,格式为 schema.tablename 或 tablename
+        
+        Returns:
+            (schema_name, table_name) 元组
+        """
+        if "." in table:
+            parts = table.split(".", 1)
+            return parts[0], parts[1]
+        else:
+            # 如果没有指定schema,默认为public
+            return "public", table
+    
+    def _parse_db_connection(self, db_connection: str) -> Dict[str, Any]:
+        """
+        解析PostgreSQL连接字符串
+        
+        Args:
+            db_connection: PostgreSQL连接字符串,格式为 postgresql://user:password@host:port/dbname
+        
+        Returns:
+            包含数据库连接参数的字典
+        """
+        import re
+        
+        # 解析连接字符串的正则表达式
+        pattern = r'postgresql://([^:]+):([^@]+)@([^:]+):(\d+)/(.+)'
+        match = re.match(pattern, db_connection)
+        
+        if not match:
+            raise ValueError(f"无效的PostgreSQL连接字符串格式: {db_connection}")
+        
+        user, password, host, port, dbname = match.groups()
+        
+        return {
+            'user': user,
+            'password': password,
+            'host': host,
+            'port': int(port),
+            'dbname': dbname
+        }
+
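# Illustration of the connection-string parsing above with a hypothetical string:
#   _parse_db_connection("postgresql://user:pass@localhost:5432/ecommerce")
#   -> {'user': 'user', 'password': 'pass', 'host': 'localhost', 'port': 5432, 'dbname': 'ecommerce'}
# Note: the regex assumes the password contains no '@' and that host and port are both present.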
+    def _convert_wildcard_to_sql_like(self, pattern: str) -> str:
+        """
+        将通配符模式转换为SQL LIKE语法
+        
+        Args:
+            pattern: 通配符模式
+                    - ods_* : 以"ods_"开头的表
+                    - *_dim : 以"_dim"结尾的表
+                    - *fact* : 包含"fact"的表
+                    - ods_% : 直接使用SQL LIKE语法(不转换)
+        
+        Returns:
+            SQL LIKE语法的模式字符串
+        """
+        if not pattern:
+            return "%"
+            
+        # 如果已经是SQL LIKE语法(包含%),直接返回
+        if "%" in pattern:
+            return pattern
+            
+        # 转换通配符*为%
+        sql_pattern = pattern.replace("*", "%")
+        
+        # 记录转换日志
+        if pattern != sql_pattern:
+            self.logger.debug(f"通配符模式转换: {pattern} -> {sql_pattern}")
+        
+        return sql_pattern 
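A brief usage sketch for TableInspectorAPI (connection string, schema list and table pattern are hypothetical):

import asyncio
from data_pipeline.api.table_inspector_api import TableInspectorAPI

async def list_ods_tables():
    api = TableInspectorAPI()
    # "ods_*" is converted to the SQL LIKE pattern "ods_%" by _convert_wildcard_to_sql_like
    return await api.get_tables_list(
        db_connection="postgresql://user:pass@localhost:5432/ecommerce",
        schema="public,ods",
        table_name_pattern="ods_*",
    )

print(asyncio.run(list_ods_tables()))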

+ 18 - 3
schema_tools/config.py → data_pipeline/config.py

@@ -13,7 +13,7 @@ SCHEMA_TOOLS_CONFIG = {
     # 核心配置
     "default_db_connection": None,  # 从命令行指定
     "default_business_context": "数据库管理系统", 
-    "output_directory": "training/generated_data",
+    "output_directory": "./data_pipeline/training_data/",
     
     # 处理链配置
     "default_pipeline": "full",
@@ -54,7 +54,7 @@ SCHEMA_TOOLS_CONFIG = {
     
     # LLM配置
     "use_app_config_llm": True,                # 是否使用app_config中的LLM配置
-    "comment_generation_timeout": 30,          # LLM调用超时时间(秒)
+    "comment_generation_timeout": 120,          # LLM调用超时时间(秒)
     "max_llm_retries": 3,                      # LLM调用最大重试次数
     
     # 系统表过滤配置
@@ -120,6 +120,19 @@ SCHEMA_TOOLS_CONFIG = {
         
         # 文件修改配置
         "modify_original_file": False,       # 是否修改原始JSON文件(默认禁用)
+    },
+    
+    # 文件上传配置
+    "file_upload": {
+        "enabled": True,                     # 是否启用文件上传功能
+        "max_file_size_mb": 2,               # 最大文件大小(MB)
+        "allowed_extensions": ["txt"],       # 允许的文件扩展名(不带点)
+        "target_filename": "table_list.txt", # 上传后的标准文件名
+        "validate_content": True,            # 是否验证文件内容
+        "min_lines": 1,                      # 最少行数(排除空行和注释)
+        "max_lines": 1000,                   # 最大行数限制
+        "encoding": "utf-8",                 # 文件编码
+        "allow_overwrite": True,             # 是否允许覆盖已存在的文件
     }
 }
 
@@ -169,4 +182,6 @@ def validate_config():
 try:
     validate_config()
 except ValueError as e:
-    print(f"警告: {e}")
+    # 在配置文件中使用stderr输出警告,避免依赖logging
+    import sys
+    print(f"警告: {e}", file=sys.stderr)

+ 5 - 0
data_pipeline/ddl_generation/__init__.py

@@ -0,0 +1,5 @@
+# DDL Generation module for database schema reverse engineering
+
+from .training_data_agent import SchemaTrainingDataAgent
+
+__all__ = ["SchemaTrainingDataAgent"]

+ 16 - 12
schema_tools/__main__.py → data_pipeline/ddl_generation/ddl_md_generator.py

@@ -1,3 +1,7 @@
+"""
+DDL和MD文档生成器命令行入口
+用于从PostgreSQL数据库生成DDL和MD训练数据
+"""
 import argparse
 import asyncio
 import sys
@@ -8,21 +12,21 @@ from pathlib import Path
 def setup_argument_parser():
     """设置命令行参数解析器"""
     parser = argparse.ArgumentParser(
-        description='Schema Tools - 自动生成数据库训练数据',
+        description='DDL/MD文档生成器 - 从PostgreSQL数据库生成训练数据',
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 示例用法:
   # 基本使用
-  python -m schema_tools --db-connection "postgresql://user:pass@host:5432/db" --table-list tables.txt
+  python -m data_pipeline.ddl_md_generator --db-connection "postgresql://user:pass@host:5432/db" --table-list tables.txt --business-context "电商系统"
   
-  # 指定业务上下文和输出目录
-  python -m schema_tools --db-connection "..." --table-list tables.txt --business-context "电商系统" --output-dir output
+  # 指定输出目录
+  python -m data_pipeline.ddl_md_generator --db-connection "..." --table-list tables.txt --business-context "电商系统" --output-dir ./data_pipeline/training_data/
   
   # 仅生成DDL文件
-  python -m schema_tools --db-connection "..." --table-list tables.txt --pipeline ddl_only
+  python -m data_pipeline.ddl_md_generator --db-connection "..." --table-list tables.txt --business-context "电商系统" --pipeline ddl_only
   
   # 权限检查模式
-  python -m schema_tools --db-connection "..." --check-permissions-only
+  python -m data_pipeline.ddl_md_generator --db-connection "..." --check-permissions-only
         """
     )
     
@@ -94,7 +98,7 @@ def setup_argument_parser():
 
 def load_config_with_overrides(args):
     """加载配置并应用命令行覆盖"""
-    from schema_tools.config import SCHEMA_TOOLS_CONFIG
+    from data_pipeline.config import SCHEMA_TOOLS_CONFIG
     
     config = SCHEMA_TOOLS_CONFIG.copy()
     
@@ -128,12 +132,12 @@ def load_business_context(args):
     if args.business_context:
         return args.business_context
     
-    from schema_tools.config import SCHEMA_TOOLS_CONFIG
+    from data_pipeline.config import SCHEMA_TOOLS_CONFIG
     return SCHEMA_TOOLS_CONFIG.get("default_business_context", "数据库管理系统")
 
 async def check_permissions_only(db_connection: str):
     """仅检查数据库权限"""
-    from schema_tools.training_data_agent import SchemaTrainingDataAgent
+    from .training_data_agent import SchemaTrainingDataAgent
     
     print("🔍 检查数据库权限...")
     
@@ -177,7 +181,7 @@ async def main():
     args = parser.parse_args()
     
     # 设置日志
-    from schema_tools.utils.logger import setup_logging
+    from data_pipeline.utils.logger import setup_logging
     setup_logging(
         verbose=args.verbose,
         log_file=args.log_file
@@ -204,7 +208,7 @@ async def main():
         business_context = load_business_context(args)
         
         # 创建Agent
-        from schema_tools.training_data_agent import SchemaTrainingDataAgent
+        from .training_data_agent import SchemaTrainingDataAgent
         
         agent = SchemaTrainingDataAgent(
             db_connection=args.db_connection,
@@ -215,7 +219,7 @@ async def main():
         )
         
         # 执行生成
-        print("🚀 开始生成Schema训练数据...")
+        print("🚀 开始生成DDL和MD文档...")
         report = await agent.generate_training_data()
         
         # 输出结果
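
Besides the CLI shown in the epilog, the permission check can be driven from Python; a minimal sketch assuming the module path in the header above (the connection string is a placeholder):

    import asyncio
    from data_pipeline.ddl_generation.ddl_md_generator import check_permissions_only

    asyncio.run(check_permissions_only("postgresql://user:pass@host:5432/db"))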

+ 27 - 12
schema_tools/training_data_agent.py → data_pipeline/ddl_generation/training_data_agent.py

@@ -1,18 +1,17 @@
 import asyncio
 import time
-import logging
 import os
 from typing import List, Dict, Any, Optional
 from pathlib import Path
 
-from schema_tools.tools.base import ToolRegistry, PipelineExecutor
-from schema_tools.utils.data_structures import TableMetadata, TableProcessingContext, ProcessingResult
-from schema_tools.utils.file_manager import FileNameManager
-from schema_tools.utils.system_filter import SystemTableFilter
-from schema_tools.utils.permission_checker import DatabasePermissionChecker
-from schema_tools.utils.table_parser import TableListParser
-from schema_tools.utils.logger import setup_logging
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.tools.base import ToolRegistry, PipelineExecutor
+from data_pipeline.utils.data_structures import TableMetadata, TableProcessingContext, ProcessingResult
+from data_pipeline.utils.file_manager import FileNameManager
+from data_pipeline.utils.system_filter import SystemTableFilter
+from data_pipeline.utils.permission_checker import DatabasePermissionChecker
+from data_pipeline.utils.table_parser import TableListParser
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.dp_logging import get_logger
 
 class SchemaTrainingDataAgent:
     """Schema训练数据生成AI Agent"""
@@ -22,6 +21,7 @@ class SchemaTrainingDataAgent:
                  table_list_file: str,
                  business_context: str = None,
                  output_dir: str = None,
+                 task_id: str = None,
                  pipeline: str = "full"):
         
         self.db_connection = db_connection
@@ -50,7 +50,16 @@ class SchemaTrainingDataAgent:
         }
         
         self.failed_tables = []
-        self.logger = logging.getLogger("schema_tools.Agent")
+        self.task_id = task_id
+        
+        # 初始化独立日志系统
+        if task_id:
+            self.logger = get_logger("SchemaTrainingDataAgent", task_id)
+        else:
+            # 脚本模式下,如果没有传递task_id,生成一个
+            from datetime import datetime
+            self.task_id = f"manual_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+            self.logger = get_logger("SchemaTrainingDataAgent", self.task_id)
     
     async def generate_training_data(self) -> Dict[str, Any]:
         """主入口:生成训练数据"""
@@ -97,7 +106,7 @@ class SchemaTrainingDataAgent:
             os.makedirs(os.path.join(self.output_dir, "docs"), exist_ok=True)
         
         # logs目录始终创建
-        os.makedirs(os.path.join(self.output_dir, "logs"), exist_ok=True)
+        # os.makedirs(os.path.join(self.output_dir, "logs"), exist_ok=True)
         
         # 初始化数据库工具
         database_tool = ToolRegistry.get_tool("database_inspector", db_connection=self.db_connection)
@@ -110,7 +119,12 @@ class SchemaTrainingDataAgent:
         if not self.config["check_permissions"]:
             return
         
-        inspector = ToolRegistry.get_tool("database_inspector")
+        inspector = ToolRegistry.get_tool("database_inspector", db_connection=self.db_connection)
+        
+        # 确保连接池已创建
+        if not inspector.connection_pool:
+            await inspector._create_connection_pool()
+        
         checker = DatabasePermissionChecker(inspector)
         
         permissions = await checker.check_permissions()
@@ -208,6 +222,7 @@ class SchemaTrainingDataAgent:
                 pipeline=self.pipeline,
                 vn=None,  # 将在工具中注入
                 file_manager=self.file_manager,
+                db_connection=self.db_connection,  # 添加数据库连接参数
                 start_time=start_time
             )
             

+ 29 - 0
data_pipeline/dp_logging/__init__.py

@@ -0,0 +1,29 @@
+"""
+Data Pipeline 独立日志管理系统
+
+完全脱离主项目的日志管理,专门为data_pipeline模块设计
+支持任务级别的日志文件管理,同时支持API调用和脚本调用
+"""
+
+from .manager import DataPipelineLogManager
+
+# 对外接口
+def get_logger(name: str, task_id: str):
+    """
+    获取data_pipeline专用logger
+    
+    Args:
+        name: logger名称 (如: "SchemaWorkflowOrchestrator", "DDLGenerator")
+        task_id: 任务ID,必须提供
+                API模式: task_YYYYMMDD_HHMMSS
+                脚本模式: manual_YYYYMMDD_HHMMSS
+    
+    Returns:
+        配置好的logger,输出到 ./data_pipeline/training_data/{task_id}/data_pipeline.log
+    """
+    return DataPipelineLogManager.get_logger(name, task_id)
+
+# 便捷方法(保持接口一致性)
+def get_data_pipeline_logger(name: str, task_id: str):
+    """便捷方法,与get_logger功能相同"""
+    return get_logger(name, task_id)
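
Minimal usage of the new interface; the task_id below is a placeholder following the API-mode naming convention documented above:

    from data_pipeline.dp_logging import get_logger

    logger = get_logger("SchemaWorkflowOrchestrator", "task_20250701_123000")
    logger.info("Written to the console and to ./data_pipeline/training_data/task_20250701_123000/data_pipeline.log")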

+ 156 - 0
data_pipeline/dp_logging/manager.py

@@ -0,0 +1,156 @@
+"""
+Data Pipeline 独立日志管理器
+
+专门为data_pipeline模块设计的日志管理器,完全独立于主项目的日志系统
+"""
+
+import os
+from pathlib import Path
+from typing import Dict
+
+# 明确导入Python内置logging模块
+import logging as std_logging
+
+
+class DataPipelineLogManager:
+    """Data Pipeline 专用日志管理器"""
+    
+    _loggers: Dict[str, std_logging.Logger] = {}
+    _file_handlers: Dict[str, std_logging.FileHandler] = {}
+    
+    @classmethod
+    def get_logger(cls, name: str, task_id: str) -> std_logging.Logger:
+        """
+        获取或创建logger
+        
+        Args:
+            name: logger名称
+            task_id: 任务ID,用于确定日志文件位置
+        
+        Returns:
+            配置好的logger实例
+        """
+        logger_key = f"data_pipeline.{name}.{task_id}"
+        
+        if logger_key not in cls._loggers:
+            logger = cls._create_logger(name, task_id)
+            cls._loggers[logger_key] = logger
+        
+        return cls._loggers[logger_key]
+    
+    @classmethod
+    def _create_logger(cls, name: str, task_id: str) -> std_logging.Logger:
+        """创建新的logger实例"""
+        # 创建logger
+        logger_name = f"data_pipeline.{name}"
+        logger = std_logging.getLogger(logger_name)
+        
+        # 设置日志级别
+        logger.setLevel(std_logging.DEBUG)
+        
+        # 防止日志重复(清除已有处理器)
+        logger.handlers.clear()
+        logger.propagate = False
+        
+        # 添加控制台处理器
+        console_handler = cls._create_console_handler()
+        logger.addHandler(console_handler)
+        
+        # 添加文件处理器
+        file_handler = cls._create_file_handler(task_id)
+        if file_handler:
+            logger.addHandler(file_handler)
+        
+        return logger
+    
+    @classmethod
+    def _create_console_handler(cls) -> std_logging.StreamHandler:
+        """创建控制台处理器"""
+        handler = std_logging.StreamHandler()
+        handler.setLevel(std_logging.INFO)
+        
+        formatter = std_logging.Formatter(
+            '%(asctime)s [%(levelname)s] Pipeline: %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S'
+        )
+        handler.setFormatter(formatter)
+        
+        return handler
+    
+    @classmethod
+    def _create_file_handler(cls, task_id: str) -> std_logging.FileHandler:
+        """创建文件处理器"""
+        try:
+            # 获取项目根目录的绝对路径
+            project_root = Path(__file__).parent.parent.parent
+            task_dir = project_root / "data_pipeline" / "training_data" / task_id
+            
+            task_dir.mkdir(parents=True, exist_ok=True)
+            
+            log_file = task_dir / "data_pipeline.log"
+            
+            # 为每个任务创建独立的文件处理器
+            handler_key = f"file_handler_{task_id}"
+            
+            if handler_key not in cls._file_handlers:
+                handler = std_logging.FileHandler(log_file, encoding='utf-8')
+                handler.setLevel(std_logging.DEBUG)
+                
+                formatter = std_logging.Formatter(
+                    '%(asctime)s [%(levelname)s] [%(name)s] %(filename)s:%(lineno)d - %(message)s',
+                    datefmt='%Y-%m-%d %H:%M:%S'
+                )
+                handler.setFormatter(formatter)
+                
+                cls._file_handlers[handler_key] = handler
+            
+            return cls._file_handlers[handler_key]
+            
+        except Exception as e:
+            # 如果文件处理器创建失败,记录到stderr但不影响程序运行
+            import sys
+            sys.stderr.write(f"[WARNING] 无法创建data_pipeline日志文件处理器: {e}\n")
+            return None
+    
+    @classmethod
+    def cleanup_logger(cls, task_id: str):
+        """清理指定任务的logger和文件处理器"""
+        try:
+            # 关闭文件处理器
+            handler_key = f"file_handler_{task_id}"
+            if handler_key in cls._file_handlers:
+                cls._file_handlers[handler_key].close()
+                del cls._file_handlers[handler_key]
+            
+            # 清理相关的logger
+            keys_to_remove = [key for key in cls._loggers.keys() if task_id in key]
+            for key in keys_to_remove:
+                logger = cls._loggers[key]
+                for handler in logger.handlers:
+                    handler.close()
+                logger.handlers.clear()
+                del cls._loggers[key]
+                
+        except Exception as e:
+            import sys
+            sys.stderr.write(f"[WARNING] 清理data_pipeline日志资源失败: {e}\n")
+    
+    @classmethod
+    def cleanup_all(cls):
+        """清理所有logger和文件处理器"""
+        try:
+            # 关闭所有文件处理器
+            for handler in cls._file_handlers.values():
+                handler.close()
+            cls._file_handlers.clear()
+            
+            # 清理所有logger
+            for logger in cls._loggers.values():
+                for handler in logger.handlers:
+                    handler.close()
+                logger.handlers.clear()
+            cls._loggers.clear()
+            
+        except Exception as e:
+            import sys
+            sys.stderr.write(f"[WARNING] 清理所有data_pipeline日志资源失败: {e}\n")

+ 544 - 0
data_pipeline/metadata_only_generator.py

@@ -0,0 +1,544 @@
+"""
+元数据生成器 - 仅生成metadata.txt和db_query_decision_prompt.txt
+不生成Question-SQL对,只提取主题并生成元数据文件
+"""
+
+import argparse
+import asyncio
+import sys
+import os
+from pathlib import Path
+from typing import List, Dict, Any
+from datetime import datetime
+
+from data_pipeline.analyzers import MDFileAnalyzer, ThemeExtractor
+from data_pipeline.validators import FileCountValidator
+from data_pipeline.utils.logger import setup_logging
+from core.vanna_llm_factory import create_vanna_instance
+import logging
+
+
+class MetadataOnlyGenerator:
+    """仅生成元数据文件的生成器"""
+    
+    def __init__(self, 
+                 output_dir: str,
+                 table_list_file: str,
+                 business_context: str,
+                 db_name: str = None):
+        """
+        初始化元数据生成器
+        
+        Args:
+            output_dir: 输出目录(包含DDL和MD文件)
+            table_list_file: 表清单文件路径
+            business_context: 业务上下文
+            db_name: 数据库名称
+        """
+        self.output_dir = Path(output_dir)
+        self.table_list_file = table_list_file
+        self.business_context = business_context
+        self.db_name = db_name or "db"
+        
+        # 初始化组件
+        self.validator = FileCountValidator()
+        self.md_analyzer = MDFileAnalyzer(output_dir)
+        self.vn = None
+        self.theme_extractor = None
+        
+        # 初始化logger
+        self.logger = logging.getLogger("MetadataOnlyGenerator")
+        
+        self.logger.info(f"🎯 元数据生成器初始化完成")
+        self.logger.info(f"📁 输出目录: {output_dir}")
+        self.logger.info(f"🏢 业务背景: {business_context}")
+        self.logger.info(f"💾 数据库: {self.db_name}")
+    
+    async def generate_metadata_only(self) -> Dict[str, Any]:
+        """
+        仅生成元数据文件
+        
+        Returns:
+            生成结果报告
+        """
+        try:
+            self.logger.info("🚀 开始生成元数据文件...")
+            
+            # 1. 验证文件数量
+            self.logger.info("📋 验证文件数量...")
+            validation_result = self.validator.validate(self.table_list_file, str(self.output_dir))
+            
+            if not validation_result.is_valid:
+                self.logger.error(f"❌ 文件验证失败: {validation_result.error}")
+                if validation_result.missing_ddl:
+                    self.logger.error(f"缺失DDL文件: {validation_result.missing_ddl}")
+                if validation_result.missing_md:
+                    self.logger.error(f"缺失MD文件: {validation_result.missing_md}")
+                raise ValueError(f"文件验证失败: {validation_result.error}")
+            
+            self.logger.info(f"✅ 文件验证通过: {validation_result.table_count}个表")
+            
+            # 2. 读取所有MD文件内容
+            self.logger.info("📖 读取MD文件...")
+            md_contents = await self.md_analyzer.read_all_md_files()
+            
+            # 3. 初始化LLM相关组件
+            self._initialize_llm_components()
+            
+            # 4. 提取分析主题
+            self.logger.info("🎯 提取分析主题...")
+            themes = await self.theme_extractor.extract_themes(md_contents)
+            self.logger.info(f"✅ 成功提取 {len(themes)} 个分析主题")
+            
+            for i, theme in enumerate(themes):
+                topic_name = theme.get('topic_name', theme.get('name', ''))
+                description = theme.get('description', '')
+                self.logger.info(f"  {i+1}. {topic_name}: {description}")
+            
+            # 5. 生成metadata.txt文件
+            self.logger.info("📝 生成metadata.txt...")
+            metadata_file = await self._generate_metadata_file(themes)
+            
+            # 6. 生成metadata_detail.md文件
+            self.logger.info("📝 生成metadata_detail.md...")
+            metadata_md_file = await self._generate_metadata_md_file(themes)
+            
+            # 7. 生成db_query_decision_prompt.txt文件
+            self.logger.info("📝 生成db_query_decision_prompt.txt...")
+            decision_prompt_file = await self._generate_decision_prompt_file(themes, md_contents)
+            
+            # 8. 生成报告
+            report = {
+                'success': True,
+                'total_themes': len(themes),
+                'metadata_file': str(metadata_file) if metadata_file else None,
+                'metadata_md_file': str(metadata_md_file) if metadata_md_file else None,
+                'decision_prompt_file': str(decision_prompt_file) if decision_prompt_file else None,
+                'themes': themes
+            }
+            
+            self._print_summary(report)
+            
+            return report
+            
+        except Exception as e:
+            self.logger.error(f"❌ 元数据生成失败: {e}")
+            raise
+    
+    def _initialize_llm_components(self):
+        """初始化LLM相关组件"""
+        if not self.vn:
+            self.logger.info("🤖 初始化LLM组件...")
+            self.vn = create_vanna_instance()
+            self.theme_extractor = ThemeExtractor(self.vn, self.business_context)
+    
+    async def _generate_metadata_file(self, themes: List[Dict]):
+        """生成metadata.txt文件,包含INSERT语句"""
+        metadata_file = self.output_dir / "metadata.txt"
+        
+        try:
+            with open(metadata_file, 'w', encoding='utf-8') as f:
+                f.write("-- Schema Tools生成的主题元数据\n")
+                f.write(f"-- 业务背景: {self.business_context}\n")
+                f.write(f"-- 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+                f.write(f"-- 数据库: {self.db_name}\n\n")
+                
+                f.write("-- 创建表(如果不存在)\n")
+                f.write("CREATE TABLE IF NOT EXISTS metadata (\n")
+                f.write("    id SERIAL PRIMARY KEY,    -- 主键\n")
+                f.write("    topic_name VARCHAR(100) NOT NULL,  -- 业务主题名称\n")
+                f.write("    description TEXT,                  -- 业务主体说明\n")
+                f.write("    related_tables TEXT[],\t\t\t  -- 相关表名\n")
+                f.write("    biz_entities TEXT[],               -- 主要业务实体名称\n")
+                f.write("    biz_metrics TEXT[],                -- 主要业务指标名称\n")
+                f.write("    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP    -- 插入时间\n")
+                f.write(");\n\n")
+                
+                f.write("-- 插入主题数据\n")
+                for theme in themes:
+                    # 获取字段值,使用新格式
+                    topic_name = theme.get('topic_name', theme.get('name', ''))
+                    description = theme.get('description', '')
+                    
+                    # 处理related_tables
+                    related_tables = theme.get('related_tables', [])
+                    if isinstance(related_tables, list):
+                        tables_str = ','.join(related_tables)
+                    else:
+                        tables_str = ''
+                    
+                    # 处理biz_entities
+                    biz_entities = theme.get('biz_entities', [])
+                    if isinstance(biz_entities, list):
+                        entities_str = ','.join(biz_entities)
+                    else:
+                        entities_str = ''
+                    
+                    # 处理biz_metrics
+                    biz_metrics = theme.get('biz_metrics', [])
+                    if isinstance(biz_metrics, list):
+                        metrics_str = ','.join(biz_metrics)
+                    else:
+                        metrics_str = ''
+                    
+                    # 生成INSERT语句
+                    f.write("INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES\n")
+                    f.write("(\n")
+                    f.write(f"  '{self._escape_sql_string(topic_name)}',\n")
+                    f.write(f"  '{self._escape_sql_string(description)}',\n")
+                    f.write(f"  '{tables_str}',\n")
+                    f.write(f"  '{entities_str}',\n")
+                    f.write(f"  '{metrics_str}'\n")
+                    f.write(");\n\n")
+            
+            self.logger.info(f"✅ metadata.txt文件已生成: {metadata_file}")
+            return metadata_file
+            
+        except Exception as e:
+            self.logger.error(f"❌ 生成metadata.txt文件失败: {e}")
+            return None
+    
+    async def _generate_metadata_md_file(self, themes: List[Dict]):
+        """生成metadata_detail.md文件"""
+        metadata_md_file = self.output_dir / "metadata_detail.md"
+        
+        try:
+            # 从themes中收集示例数据
+            sample_tables = set()
+            sample_entities = set()
+            sample_metrics = set()
+            
+            for theme in themes:
+                related_tables = theme.get('related_tables', [])
+                if isinstance(related_tables, list):
+                    sample_tables.update(related_tables[:2])  # 取前2个表作为示例
+                
+                biz_entities = theme.get('biz_entities', [])
+                if isinstance(biz_entities, list):
+                    sample_entities.update(biz_entities[:3])  # 取前3个实体作为示例
+                
+                biz_metrics = theme.get('biz_metrics', [])
+                if isinstance(biz_metrics, list):
+                    sample_metrics.update(biz_metrics[:3])  # 取前3个指标作为示例
+            
+            # 转换为字符串格式,避免硬编码特定行业内容
+            tables_example = ', '.join(list(sample_tables)[:2]) if sample_tables else '数据表1, 数据表2'
+            entities_example = ', '.join(list(sample_entities)[:3]) if sample_entities else '业务实体1, 业务实体2, 业务实体3'
+            metrics_example = ', '.join(list(sample_metrics)[:3]) if sample_metrics else '业务指标1, 业务指标2, 业务指标3'
+            
+            with open(metadata_md_file, 'w', encoding='utf-8') as f:
+                f.write("## metadata(存储分析主题元数据)\n\n")
+                f.write("`metadata` 主要描述了当前数据库包含了哪些数据内容,哪些分析主题,哪些指标等等。\n\n")
+                f.write("字段列表:\n\n")
+                f.write("- `id` (serial) - 主键ID [主键, 非空]\n")
+                f.write("- `topic_name` (varchar(100)) - 业务主题名称 [非空]\n")
+                f.write("- `description` (text) - 业务主题说明\n")
+                f.write(f"- `related_tables` (text[]) - 涉及的数据表 [示例: {tables_example}]\n")
+                f.write(f"- `biz_entities` (text[]) - 主要业务实体名称 [示例: {entities_example}]\n")
+                f.write(f"- `biz_metrics` (text[]) - 主要业务指标名称 [示例: {metrics_example}]\n")
+                f.write("- `created_at` (timestamp) - 插入时间 [默认值: `CURRENT_TIMESTAMP`]\n\n")
+                f.write("字段补充说明:\n\n")
+                f.write("- `id` 为主键,自增;\n")
+                f.write("- `related_tables` 用于建立主题与具体明细表的依赖关系;\n")
+                f.write("- `biz_entities` 表示主题关注的核心对象,例如服务区、车辆、公司;\n")
+                f.write("- `biz_metrics` 表示该主题关注的业务分析指标,例如营收对比、趋势变化、占比结构等。\n")
+            
+            self.logger.info(f"✅ metadata_detail.md文件已生成: {metadata_md_file}")
+            return metadata_md_file
+            
+        except Exception as e:
+            self.logger.error(f"❌ 生成metadata_detail.md文件失败: {e}")
+            return None
+    
+    async def _generate_decision_prompt_file(self, themes: List[Dict], md_contents: str):
+        """生成db_query_decision_prompt.txt文件"""
+        decision_prompt_file = self.output_dir / "db_query_decision_prompt.txt"
+        
+        try:
+            # 使用LLM动态生成决策提示内容
+            decision_content = await self._generate_decision_prompt_with_llm(themes, md_contents)
+            
+            # 写入文件
+            with open(decision_prompt_file, 'w', encoding='utf-8') as f:
+                f.write(decision_content)
+            
+            self.logger.info(f"✅ db_query_decision_prompt.txt文件已生成: {decision_prompt_file}")
+            return decision_prompt_file
+            
+        except Exception as e:
+            self.logger.error(f"❌ 生成db_query_decision_prompt.txt文件失败: {e}")
+            # 如果LLM调用失败,使用回退方案
+            try:
+                fallback_content = await self._generate_fallback_decision_content(themes)
+                with open(decision_prompt_file, 'w', encoding='utf-8') as f:
+                    f.write(fallback_content)
+                self.logger.warning(f"⚠️ 使用回退方案生成了 {decision_prompt_file}")
+                return decision_prompt_file
+            except Exception as fallback_error:
+                self.logger.error(f"❌ 回退方案也失败: {fallback_error}")
+                return None
+    
+    async def _generate_decision_prompt_with_llm(self, themes: List[Dict], md_contents: str) -> str:
+        """使用LLM动态生成db_query_decision_prompt.txt的完整内容(基于纯表结构分析)"""
+        try:
+            # 构建LLM提示词 - 让LLM完全独立分析表结构
+            prompt = f"""你是一位资深的数据分析师,请直接分析以下数据库的表结构,独立判断业务范围和数据范围。
+
+业务背景:{self.business_context}
+
+数据库表结构详细信息:
+{md_contents}
+
+分析任务:
+请你直接从表结构、字段名称、字段类型、示例数据中推断业务逻辑,不要参考任何预设的业务主题。
+
+分析要求:
+1. **业务范围**:根据表名和核心业务字段,用一句话概括这个数据库支撑的业务领域
+2. **数据范围**:根据具体的数据字段(如金额、数量、类型等),用一句话概括涉及的数据类型和范围  
+3. **核心业务实体**:从非技术字段中识别主要的业务对象(如表中的维度字段)
+4. **关键业务指标**:从数值型字段和枚举字段中识别可以进行分析的指标
+
+技术字段过滤规则(请忽略以下字段):
+- 主键字段:id、主键ID等
+- 时间戳字段:create_ts、update_ts、delete_ts、created_at、updated_at等  
+- 版本字段:version、版本号等
+- 操作人字段:created_by、updated_by、deleted_by等
+
+请直接生成以下格式的业务分析报告(请严格按照格式,不要添加额外内容):
+
+=== 数据库业务范围 ===
+当前数据库存储的是[业务描述]的相关数据,主要涉及[数据范围],包含以下业务数据:
+核心业务实体:
+- 实体类型1:详细描述(基于实际字段和业务场景),主要字段:字段1、字段2、字段3
+- 实体类型2:详细描述,主要字段:字段1、字段2、字段3
+关键业务指标:
+- 指标类型1:详细描述(基于实际数值字段和分析需求)
+- 指标类型2:详细描述
+
+要求:
+1. 请完全基于表结构进行独立分析,从字段的业务含义出发,准确反映数据库的实际业务范围
+2. 严格按照上述格式输出,不要添加分析依据、总结或其他额外内容
+3. 输出内容到"指标类型2:详细描述"结束即可"""
+            
+            # 调用LLM生成内容
+            response = await asyncio.to_thread(
+                self.vn.chat_with_llm,
+                question=prompt,
+                system_prompt="你是一个专业的数据分析师,擅长从业务角度总结数据库的业务范围和核心实体。请基于实际的表结构和字段信息生成准确的业务描述。"
+            )
+            return response.strip()
+            
+        except Exception as e:
+            self.logger.error(f"❌ LLM生成决策提示内容失败: {e}")
+            # 回退方案:生成基础内容
+            return await self._generate_fallback_decision_content(themes)
+    
+    async def _generate_fallback_decision_content(self, themes: List[Dict]) -> str:
+        """生成回退的决策提示内容(尝试用简化LLM调用)"""
+        content = f"=== 数据库业务范围 ===\n"
+        
+        # 尝试用简化的LLM调用获取数据范围
+        try:
+            # 构建简化的提示词
+            entities_sample = []
+            metrics_sample = []
+            
+            for theme in themes[:3]:  # 只取前3个主题作为示例
+                biz_entities = theme.get('biz_entities', [])
+                if isinstance(biz_entities, list):
+                    entities_sample.extend(biz_entities[:2])
+                    
+                biz_metrics = theme.get('biz_metrics', [])  
+                if isinstance(biz_metrics, list):
+                    metrics_sample.extend(biz_metrics[:2])
+            
+            # 简化的提示词
+            simple_prompt = f"""基于以下信息,用一句话概括{self.business_context}涉及的数据范围:
+业务实体示例:{', '.join(entities_sample[:5])}
+业务指标示例:{', '.join(metrics_sample[:5])}
+
+请只回答数据范围,格式如:某某数据、某某信息、某某统计等"""
+
+            data_range = await asyncio.to_thread(
+                self.vn.chat_with_llm,
+                question=simple_prompt,
+                system_prompt="请用简洁的语言概括数据范围。"
+            )
+            data_range = data_range.strip()
+            
+            # 如果LLM返回内容合理,则使用
+            if data_range and len(data_range) < 100:
+                content += f"当前数据库存储的是{self.business_context}的相关数据,主要涉及{data_range},包含以下业务数据:\n"
+            else:
+                raise Exception("LLM返回内容不合理")
+                
+        except Exception as e:
+            self.logger.warning(f"⚠️ 简化LLM调用也失败,使用完全兜底方案: {e}")
+            # 真正的最后兜底
+            content += f"当前数据库存储的是{self.business_context}的相关数据,主要涉及相关业务数据,包含以下业务数据:\n"
+        
+        content += "核心业务实体:\n"
+        
+        # 收集所有实体
+        all_entities = set()
+        for theme in themes:
+            biz_entities = theme.get('biz_entities', [])
+            if isinstance(biz_entities, list):
+                all_entities.update(biz_entities)
+        
+        for entity in list(all_entities)[:8]:
+            content += f"- {entity}:{entity}相关的业务信息\n"
+        
+        content += "关键业务指标:\n"
+        
+        # 收集所有指标
+        all_metrics = set()
+        for theme in themes:
+            biz_metrics = theme.get('biz_metrics', [])
+            if isinstance(biz_metrics, list):
+                all_metrics.update(biz_metrics)
+        
+        for metric in list(all_metrics)[:8]:
+            content += f"- {metric}:{metric}相关的分析指标\n"
+        
+        return content
+    
+    def _escape_sql_string(self, value: str) -> str:
+        """转义SQL字符串中的特殊字符"""
+        if not value:
+            return ""
+        # 转义单引号
+        return value.replace("'", "''")
+    
+    def _print_summary(self, report: Dict):
+        """打印总结信息"""
+        self.logger.info("=" * 60)
+        self.logger.info("📊 元数据生成总结")
+        self.logger.info(f"  ✅ 分析主题数: {report['total_themes']}")
+        self.logger.info(f"  📄 metadata.txt: {'✅ 已生成' if report['metadata_file'] else '❌ 生成失败'}")
+        self.logger.info(f"  📄 metadata_detail.md: {'✅ 已生成' if report['metadata_md_file'] else '❌ 生成失败'}")
+        self.logger.info(f"  📄 db_query_decision_prompt.txt: {'✅ 已生成' if report['decision_prompt_file'] else '❌ 生成失败'}")
+        self.logger.info("=" * 60)
+
+
+def setup_argument_parser():
+    """设置命令行参数解析器"""
+    parser = argparse.ArgumentParser(
+        description='元数据生成器 - 仅生成metadata.txt和db_query_decision_prompt.txt',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例用法:
+  # 基本使用
+  python -m data_pipeline.metadata_only_generator --output-dir ./data_pipeline/training_data --table-list ./data_pipeline/tables.txt --business-context "高速公路服务区管理系统"
+  
+  # 指定数据库名称
+  python -m data_pipeline.metadata_only_generator --output-dir ./data_pipeline/training_data --table-list ./data_pipeline/tables.txt --business-context "电商系统" --db-name ecommerce_db
+  
+  # 启用详细日志
+  python -m data_pipeline.metadata_only_generator --output-dir ./data_pipeline/training_data --table-list ./data_pipeline/tables.txt --business-context "管理系统" --verbose
+        """
+    )
+    
+    # 必需参数
+    parser.add_argument(
+        '--output-dir',
+        required=True,
+        help='包含DDL和MD文件的输出目录'
+    )
+    
+    parser.add_argument(
+        '--table-list',
+        required=True,
+        help='表清单文件路径(用于验证文件数量)'
+    )
+    
+    parser.add_argument(
+        '--business-context',
+        required=True,
+        help='业务上下文描述'
+    )
+    
+    # 可选参数
+    parser.add_argument(
+        '--db-name',
+        help='数据库名称(用于输出文件命名)'
+    )
+    
+    parser.add_argument(
+        '--verbose', '-v',
+        action='store_true',
+        help='启用详细日志输出'
+    )
+    
+    parser.add_argument(
+        '--log-file',
+        help='日志文件路径'
+    )
+    
+    return parser
+
+
+async def main():
+    """主入口函数"""
+    parser = setup_argument_parser()
+    args = parser.parse_args()
+    
+    # 设置日志
+    setup_logging(
+        verbose=args.verbose,
+        log_file=args.log_file
+    )
+    
+    # 验证参数
+    output_path = Path(args.output_dir)
+    # 为脚本模式生成task_id
+    from datetime import datetime
+    script_task_id = f"manual_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+    # 初始化logger用于参数验证
+    from data_pipeline.dp_logging import get_logger
+    logger = get_logger("MetadataGeneratorMain", script_task_id)
+    
+    if not output_path.exists():
+        logger.error(f"错误: 输出目录不存在: {args.output_dir}")
+        sys.exit(1)
+    
+    if not os.path.exists(args.table_list):
+        logger.error(f"错误: 表清单文件不存在: {args.table_list}")
+        sys.exit(1)
+    
+    try:
+        # 创建生成器
+        generator = MetadataOnlyGenerator(
+            output_dir=args.output_dir,
+            table_list_file=args.table_list,
+            business_context=args.business_context,
+            db_name=args.db_name
+        )
+        
+        # 执行生成
+        report = await generator.generate_metadata_only()
+        
+        # 输出结果
+        if report['success']:
+            logger.info("\n🎉 元数据文件生成成功!")
+            exit_code = 0
+        else:
+            logger.error("\n❌ 元数据文件生成失败")
+            exit_code = 1
+        
+        sys.exit(exit_code)
+        
+    except KeyboardInterrupt:
+        logger.info("\n\n⏹️  用户中断,程序退出")
+        sys.exit(130)
+    except Exception as e:
+        logger.error(f"\n❌ 程序执行失败: {e}")
+        if args.verbose:
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    asyncio.run(main()) 
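
A sketch of calling the generator programmatically, mirroring main() above; paths, business context and database name are placeholders, and the output directory must already contain the DDL/MD files:

    import asyncio
    from data_pipeline.metadata_only_generator import MetadataOnlyGenerator

    generator = MetadataOnlyGenerator(
        output_dir="./data_pipeline/training_data/manual_20250701_123000",
        table_list_file="./data_pipeline/tables.txt",
        business_context="高速公路服务区管理系统",
        db_name="highway_db",
    )
    report = asyncio.run(generator.generate_metadata_only())
    print(report["total_themes"], report["metadata_file"])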

+ 0 - 0
schema_tools/prompts/__init__.py → data_pipeline/prompts/__init__.py


+ 0 - 0
schema_tools/prompts/business_dictionary.txt → data_pipeline/prompts/business_dictionary.txt


+ 1 - 0
data_pipeline/qa_generation/__init__.py

@@ -0,0 +1 @@
+# QA Generation module for Vanna Q&A generation

+ 250 - 31
schema_tools/qs_agent.py → data_pipeline/qa_generation/qs_agent.py

@@ -6,10 +6,10 @@ from datetime import datetime
 from pathlib import Path
 from typing import List, Dict, Any, Optional
 
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
-from schema_tools.validators import FileCountValidator
-from schema_tools.analyzers import MDFileAnalyzer, ThemeExtractor
-from schema_tools.utils.logger import setup_logging
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.validators import FileCountValidator
+from data_pipeline.analyzers import MDFileAnalyzer, ThemeExtractor
+from data_pipeline.dp_logging import get_logger
 from core.vanna_llm_factory import create_vanna_instance
 
 
@@ -20,7 +20,8 @@ class QuestionSQLGenerationAgent:
                  output_dir: str,
                  table_list_file: str,
                  business_context: str,
-                 db_name: str = None):
+                 db_name: str = None,
+                 task_id: str = None):
         """
         初始化Agent
         
@@ -29,6 +30,7 @@ class QuestionSQLGenerationAgent:
             table_list_file: 表清单文件路径
             business_context: 业务上下文
             db_name: 数据库名称(用于输出文件命名)
+            task_id: 任务ID
         """
         self.output_dir = Path(output_dir)
         self.table_list_file = table_list_file
@@ -36,7 +38,16 @@ class QuestionSQLGenerationAgent:
         self.db_name = db_name or "db"
         
         self.config = SCHEMA_TOOLS_CONFIG
-        self.logger = logging.getLogger("schema_tools.QSAgent")
+        self.task_id = task_id
+        
+        # 初始化独立日志系统
+        if task_id:
+            self.logger = get_logger("QuestionSQLGenerationAgent", task_id)
+        else:
+            # 脚本模式下,如果没有传递task_id,生成一个
+            from datetime import datetime
+            self.task_id = f"manual_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+            self.logger = get_logger("QuestionSQLGenerationAgent", self.task_id)
         
         # 初始化组件
         self.validator = FileCountValidator()
@@ -120,6 +131,12 @@ class QuestionSQLGenerationAgent:
             # 8.5 生成metadata.txt文件
             await self._generate_metadata_file(themes)
             
+            # 8.6 生成metadata_detail.md文件
+            await self._generate_metadata_md_file(themes)
+            
+            # 8.7 生成db_query_decision_prompt.txt文件
+            await self._generate_decision_prompt_file(themes)
+            
             # 9. 清理中间文件
             if not failed_themes:  # 只有全部成功才清理
                 self._cleanup_intermediate_file()
@@ -243,17 +260,19 @@ class QuestionSQLGenerationAgent:
         """构建Question-SQL生成的prompt"""
         questions_count = self.config['qs_generation']['questions_per_theme']
         
-        # 兼容新旧格式
+        # 获取主题信息
         topic_name = theme.get('topic_name', theme.get('name', ''))
         description = theme.get('description', '')
-        focus_areas = theme.get('focus_areas', theme.get('keywords', []))
+        biz_entities = theme.get('biz_entities', [])
+        biz_metrics = theme.get('biz_metrics', [])
         related_tables = theme.get('related_tables', [])
         
         prompt = f"""你是一位业务数据分析师,正在为{self.business_context}设计数据查询。
 
 当前分析主题:{topic_name}
 主题描述:{description}
-关注领域:{', '.join(focus_areas)}
+业务实体:{', '.join(biz_entities)}
+业务指标:{', '.join(biz_metrics)}
 相关表:{', '.join(related_tables)}
 
 数据库表结构信息:
@@ -471,13 +490,13 @@ class QuestionSQLGenerationAgent:
                 
                 f.write("-- 创建表(如果不存在)\n")
                 f.write("CREATE TABLE IF NOT EXISTS metadata (\n")
-                f.write("    id SERIAL PRIMARY KEY,\n")
-                f.write("    topic_name VARCHAR(100) NOT NULL,\n")
-                f.write("    description TEXT,\n")
-                f.write("    related_tables TEXT[],\n")
-                f.write("    keywords TEXT[],\n")
-                f.write("    focus_areas TEXT[],\n")
-                f.write("    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n")
+                f.write("    id SERIAL PRIMARY KEY,    -- 主键\n")
+                f.write("    topic_name VARCHAR(100) NOT NULL,  -- 业务主题名称\n")
+                f.write("    description TEXT,                  -- 业务主体说明\n")
+                f.write("    related_tables TEXT[],\t\t\t  -- 相关表名\n")
+                f.write("    biz_entities TEXT[],               -- 主要业务实体名称\n")
+                f.write("    biz_metrics TEXT[],                -- 主要业务指标名称\n")
+                f.write("    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP    -- 插入时间\n")
                 f.write(");\n\n")
                 
                 f.write("-- 插入主题数据\n")
@@ -489,32 +508,32 @@ class QuestionSQLGenerationAgent:
                     # 处理related_tables
                     related_tables = theme.get('related_tables', [])
                     if isinstance(related_tables, list):
-                        tables_str = '{' + ','.join(related_tables) + '}'
+                        tables_str = ','.join(related_tables)
                     else:
-                        tables_str = '{}'
+                        tables_str = ''
                     
-                    # 处理keywords
-                    keywords = theme.get('keywords', [])
-                    if isinstance(keywords, list):
-                        keywords_str = '{' + ','.join(keywords) + '}'
+                    # 处理biz_entities
+                    biz_entities = theme.get('biz_entities', [])
+                    if isinstance(biz_entities, list):
+                        entities_str = ','.join(biz_entities)
                     else:
-                        keywords_str = '{}'
+                        entities_str = ''
                     
-                    # 处理focus_areas
-                    focus_areas = theme.get('focus_areas', [])
-                    if isinstance(focus_areas, list):
-                        focus_areas_str = '{' + ','.join(focus_areas) + '}'
+                    # 处理biz_metrics
+                    biz_metrics = theme.get('biz_metrics', [])
+                    if isinstance(biz_metrics, list):
+                        metrics_str = ','.join(biz_metrics)
                     else:
-                        focus_areas_str = '{}'
+                        metrics_str = ''
                     
                     # 生成INSERT语句
-                    f.write("INSERT INTO metadata(topic_name, description, related_tables, keywords, focus_areas) VALUES\n")
+                    f.write("INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES\n")
                     f.write("(\n")
                     f.write(f"  '{self._escape_sql_string(topic_name)}',\n")
                     f.write(f"  '{self._escape_sql_string(description)}',\n")
                     f.write(f"  '{tables_str}',\n")
-                    f.write(f"  '{keywords_str}',\n")
-                    f.write(f"  '{focus_areas_str}'\n")
+                    f.write(f"  '{entities_str}',\n")
+                    f.write(f"  '{metrics_str}'\n")
                     f.write(");\n\n")
             
             self.logger.info(f"✅ metadata.txt文件已生成: {metadata_file}")
@@ -524,6 +543,206 @@ class QuestionSQLGenerationAgent:
             self.logger.error(f"生成metadata.txt文件失败: {e}")
             return None
     
+    async def _generate_metadata_md_file(self, themes: List[Dict]):
+        """生成metadata_detail.md文件"""
+        metadata_md_file = self.output_dir / "metadata_detail.md"
+        
+        try:
+            # 从themes中收集示例数据
+            sample_tables = set()
+            sample_entities = set()
+            sample_metrics = set()
+            
+            for theme in themes:
+                related_tables = theme.get('related_tables', [])
+                if isinstance(related_tables, list):
+                    sample_tables.update(related_tables[:2])  # 取前2个表作为示例
+                
+                biz_entities = theme.get('biz_entities', [])
+                if isinstance(biz_entities, list):
+                    sample_entities.update(biz_entities[:3])  # 取前3个实体作为示例
+                
+                biz_metrics = theme.get('biz_metrics', [])
+                if isinstance(biz_metrics, list):
+                    sample_metrics.update(biz_metrics[:3])  # 取前3个指标作为示例
+            
+            # 转换为字符串格式,避免硬编码特定行业内容
+            tables_example = ', '.join(list(sample_tables)[:2]) if sample_tables else '数据表1, 数据表2'
+            entities_example = ', '.join(list(sample_entities)[:3]) if sample_entities else '业务实体1, 业务实体2, 业务实体3'
+            metrics_example = ', '.join(list(sample_metrics)[:3]) if sample_metrics else '业务指标1, 业务指标2, 业务指标3'
+            
+            with open(metadata_md_file, 'w', encoding='utf-8') as f:
+                f.write("## metadata(存储分析主题元数据)\n\n")
+                f.write("`metadata` 主要描述了当前数据库包含了哪些数据内容,哪些分析主题,哪些指标等等。\n\n")
+                f.write("字段列表:\n\n")
+                f.write("- `id` (serial) - 主键ID [主键, 非空]\n")
+                f.write("- `topic_name` (varchar(100)) - 业务主题名称 [非空]\n")
+                f.write("- `description` (text) - 业务主题说明\n")
+                f.write(f"- `related_tables` (text[]) - 涉及的数据表 [示例: {tables_example}]\n")
+                f.write(f"- `biz_entities` (text[]) - 主要业务实体名称 [示例: {entities_example}]\n")
+                f.write(f"- `biz_metrics` (text[]) - 主要业务指标名称 [示例: {metrics_example}]\n")
+                f.write("- `created_at` (timestamp) - 插入时间 [默认值: `CURRENT_TIMESTAMP`]\n\n")
+                f.write("字段补充说明:\n\n")
+                f.write("- `id` 为主键,自增;\n")
+                f.write("- `related_tables` 用于建立主题与具体明细表的依赖关系;\n")
+                f.write("- `biz_entities` 表示主题关注的核心对象,例如服务区、车辆、公司;\n")
+                f.write("- `biz_metrics` 表示该主题关注的业务分析指标,例如营收对比、趋势变化、占比结构等。\n")
+            
+            self.logger.info(f"✅ metadata_detail.md文件已生成: {metadata_md_file}")
+            return metadata_md_file
+            
+        except Exception as e:
+            self.logger.error(f"生成metadata_detail.md文件失败: {e}")
+            return None
+    
+    async def _generate_decision_prompt_with_llm(self, themes: List[Dict], md_contents: str) -> str:
+        """使用LLM动态生成db_query_decision_prompt.txt的完整内容(基于纯表结构分析)"""
+        try:
+            # 构建LLM提示词 - 让LLM完全独立分析表结构
+            prompt = f"""你是一位资深的数据分析师,请直接分析以下数据库的表结构,独立判断业务范围和数据范围。
+
+业务背景:{self.business_context}
+
+数据库表结构详细信息:
+{md_contents}
+
+分析任务:
+请你直接从表结构、字段名称、字段类型、示例数据中推断业务逻辑,不要参考任何预设的业务主题。
+
+分析要求:
+1. **业务范围**:根据表名和核心业务字段,用一句话概括这个数据库支撑的业务领域
+2. **数据范围**:根据具体的数据字段(如金额、数量、类型等),用一句话概括涉及的数据类型和范围  
+3. **核心业务实体**:从非技术字段中识别主要的业务对象(如表中的维度字段)
+4. **关键业务指标**:从数值型字段和枚举字段中识别可以进行分析的指标
+
+技术字段过滤规则(请忽略以下字段):
+- 主键字段:id、主键ID等
+- 时间戳字段:create_ts、update_ts、delete_ts、created_at、updated_at等  
+- 版本字段:version、版本号等
+- 操作人字段:created_by、updated_by、deleted_by等
+
+请直接生成以下格式的业务分析报告(请严格按照格式,不要添加额外内容):
+
+=== 数据库业务范围 ===
+当前数据库存储的是[业务描述]的相关数据,主要涉及[数据范围],包含以下业务数据:
+核心业务实体:
+- 实体类型1:详细描述(基于实际字段和业务场景),主要字段:字段1、字段2、字段3
+- 实体类型2:详细描述,主要字段:字段1、字段2、字段3
+关键业务指标:
+- 指标类型1:详细描述(基于实际数值字段和分析需求)
+- 指标类型2:详细描述
+
+要求:
+1. 请完全基于表结构进行独立分析,从字段的业务含义出发,准确反映数据库的实际业务范围
+2. 严格按照上述格式输出,不要添加分析依据、总结或其他额外内容
+3. 输出内容到"指标类型2:详细描述"结束即可"""
+            
+            # 调用LLM生成内容
+            response = await self._call_llm(prompt)
+            return response.strip()
+            
+        except Exception as e:
+            self.logger.error(f"LLM生成决策提示内容失败: {e}")
+            # 回退方案:生成基础内容
+            return await self._generate_fallback_decision_content(themes)
+    
+    async def _generate_fallback_decision_content(self, themes: List[Dict]) -> str:
+        """生成回退的决策提示内容(尝试用简化LLM调用)"""
+        content = f"=== 数据库业务范围 ===\n"
+        
+        # 尝试用简化的LLM调用获取数据范围
+        try:
+            # 构建简化的提示词
+            entities_sample = []
+            metrics_sample = []
+            
+            for theme in themes[:3]:  # 只取前3个主题作为示例
+                biz_entities = theme.get('biz_entities', [])
+                if isinstance(biz_entities, list):
+                    entities_sample.extend(biz_entities[:2])
+                    
+                biz_metrics = theme.get('biz_metrics', [])  
+                if isinstance(biz_metrics, list):
+                    metrics_sample.extend(biz_metrics[:2])
+            
+            # 简化的提示词
+            simple_prompt = f"""基于以下信息,用一句话概括{self.business_context}涉及的数据范围:
+业务实体示例:{', '.join(entities_sample[:5])}
+业务指标示例:{', '.join(metrics_sample[:5])}
+
+请只回答数据范围,格式如:某某数据、某某信息、某某统计等"""
+
+            data_range = await self._call_llm(simple_prompt)
+            data_range = data_range.strip()
+            
+            # 如果LLM返回内容合理,则使用
+            if data_range and len(data_range) < 100:
+                content += f"当前数据库存储的是{self.business_context}的相关数据,主要涉及{data_range},包含以下业务数据:\n"
+            else:
+                raise Exception("LLM返回内容不合理")
+                
+        except Exception as e:
+            self.logger.warning(f"简化LLM调用也失败,使用完全兜底方案: {e}")
+            # 真正的最后兜底
+            content += f"当前数据库存储的是{self.business_context}的相关数据,主要涉及相关业务数据,包含以下业务数据:\n"
+        
+        content += "核心业务实体:\n"
+        
+        # 收集所有实体
+        all_entities = set()
+        for theme in themes:
+            biz_entities = theme.get('biz_entities', [])
+            if isinstance(biz_entities, list):
+                all_entities.update(biz_entities)
+        
+        for entity in list(all_entities)[:8]:
+            content += f"- {entity}:{entity}相关的业务信息\n"
+        
+        content += "关键业务指标:\n"
+        
+        # 收集所有指标
+        all_metrics = set()
+        for theme in themes:
+            biz_metrics = theme.get('biz_metrics', [])
+            if isinstance(biz_metrics, list):
+                all_metrics.update(biz_metrics)
+        
+        for metric in list(all_metrics)[:8]:
+            content += f"- {metric}:{metric}相关的分析指标\n"
+        
+        return content
+
+    async def _generate_decision_prompt_file(self, themes: List[Dict]):
+        """生成db_query_decision_prompt.txt文件"""
+        decision_prompt_file = self.output_dir / "db_query_decision_prompt.txt"
+        
+        try:
+            # 读取MD内容作为LLM输入
+            md_contents = await self.md_analyzer.read_all_md_files()
+            
+            # 使用LLM动态生成决策提示内容
+            decision_content = await self._generate_decision_prompt_with_llm(themes, md_contents)
+            
+            # 写入文件
+            with open(decision_prompt_file, 'w', encoding='utf-8') as f:
+                f.write(decision_content)
+            
+            self.logger.info(f"✅ db_query_decision_prompt.txt文件已生成: {decision_prompt_file}")
+            return decision_prompt_file
+            
+        except Exception as e:
+            self.logger.error(f"生成db_query_decision_prompt.txt文件失败: {e}")
+            # 如果LLM调用失败,使用回退方案
+            try:
+                fallback_content = await self._generate_fallback_decision_content(themes)
+                with open(decision_prompt_file, 'w', encoding='utf-8') as f:
+                    f.write(fallback_content)
+                self.logger.warning(f"⚠️ 使用回退方案生成了 {decision_prompt_file}")
+                return decision_prompt_file
+            except Exception as fallback_error:
+                self.logger.error(f"回退方案也失败: {fallback_error}")
+                return None
+    
     def _escape_sql_string(self, value: str) -> str:
         """转义SQL字符串中的特殊字符"""
         if not value:
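
For reference, the shape of a single theme dict consumed by _build_question_prompt and the metadata writers above; the keys come from the code, while the values are invented examples:

    theme = {
        "topic_name": "营收分析",
        "description": "按服务区和公司维度分析营业收入结构与趋势",
        "related_tables": ["business_day_data", "company_info"],
        "biz_entities": ["服务区", "公司", "支付方式"],
        "biz_metrics": ["营收趋势", "支付方式占比"],
    }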

+ 5 - 5
schema_tools/qs_generator.py → data_pipeline/qa_generation/qs_generator.py

@@ -9,8 +9,8 @@ import sys
 import os
 from pathlib import Path
 
-from schema_tools.qs_agent import QuestionSQLGenerationAgent
-from schema_tools.utils.logger import setup_logging
+from .qs_agent import QuestionSQLGenerationAgent
+from data_pipeline.utils.logger import setup_logging
 
 
 def setup_argument_parser():
@@ -21,13 +21,13 @@ def setup_argument_parser():
         epilog="""
 示例用法:
   # 基本使用
-  python -m schema_tools.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "高速公路服务区管理系统"
+  python -m data_pipeline.qa_generation.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "高速公路服务区管理系统"
   
   # 指定数据库名称
-  python -m schema_tools.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "电商系统" --db-name ecommerce_db
+  python -m data_pipeline.qa_generation.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "电商系统" --db-name ecommerce_db
   
   # 启用详细日志
-  python -m schema_tools.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "管理系统" --verbose
+  python -m data_pipeline.qa_generation.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "管理系统" --verbose
         """
     )
     

+ 227 - 59
schema_tools/schema_workflow_orchestrator.py → data_pipeline/schema_workflow.py

@@ -10,11 +10,11 @@ from typing import Dict, Any, List, Optional
 from pathlib import Path
 from datetime import datetime
 
-from schema_tools.training_data_agent import SchemaTrainingDataAgent
-from schema_tools.qs_agent import QuestionSQLGenerationAgent
-from schema_tools.sql_validation_agent import SQLValidationAgent
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
-from schema_tools.utils.logger import setup_logging
+from data_pipeline.ddl_generation.training_data_agent import SchemaTrainingDataAgent
+from data_pipeline.qa_generation.qs_agent import QuestionSQLGenerationAgent
+from data_pipeline.validators.sql_validation_agent import SQLValidationAgent
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.dp_logging import get_logger
 
 
 class SchemaWorkflowOrchestrator:
@@ -24,38 +24,58 @@ class SchemaWorkflowOrchestrator:
                  db_connection: str,
                  table_list_file: str,
                  business_context: str,
-                 db_name: str,
                  output_dir: str = None,
+                 task_id: str = None,
                  enable_sql_validation: bool = True,
                  enable_llm_repair: bool = True,
-                 modify_original_file: bool = True):
+                 modify_original_file: bool = True,
+                 enable_training_data_load: bool = True):
         """
         初始化Schema工作流编排器
         
         Args:
-            db_connection: 数据库连接字符串
+            db_connection: 数据库连接字符串 (postgresql://user:pass@host:port/dbname)
             table_list_file: 表清单文件路径
             business_context: 业务上下文描述
-            db_name: 数据库名称(用于生成文件名)
             output_dir: 输出目录
+            task_id: 任务ID (API模式传递,脚本模式自动生成)
             enable_sql_validation: 是否启用SQL验证
             enable_llm_repair: 是否启用LLM修复功能
             modify_original_file: 是否修改原始JSON文件
+            enable_training_data_load: 是否启用训练数据加载
         """
         self.db_connection = db_connection
         self.table_list_file = table_list_file
         self.business_context = business_context
-        self.db_name = db_name
-        self.output_dir = Path(output_dir) if output_dir else Path("./output")
+        self.db_name = self._extract_db_name_from_connection(db_connection)
         self.enable_sql_validation = enable_sql_validation
         self.enable_llm_repair = enable_llm_repair
         self.modify_original_file = modify_original_file
+        self.enable_training_data_load = enable_training_data_load
         
+        # 处理task_id
+        if task_id is None:
+            # 脚本模式:自动生成manual开头的task_id
+            self.task_id = f"manual_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+        else:
+            # API模式:使用传递的task_id
+            self.task_id = task_id
+        
+        # 设置输出目录
+        if output_dir is None:
+            # 脚本模式或未指定输出目录时,使用任务目录
+            # 获取项目根目录的绝对路径
+            project_root = Path(__file__).parent.parent
+            self.output_dir = project_root / "data_pipeline" / "training_data" / self.task_id
+        else:
+            # API模式或明确指定输出目录时,使用指定的目录
+            self.output_dir = Path(output_dir)
+            
         # 确保输出目录存在
         self.output_dir.mkdir(parents=True, exist_ok=True)
-        
-        # 初始化日志
-        self.logger = logging.getLogger("schema_tools.SchemaWorkflowOrchestrator")
+            
+        # 初始化独立日志系统
+        self.logger = get_logger("SchemaWorkflowOrchestrator", self.task_id)
         
         # 工作流程状态
         self.workflow_state = {
@@ -68,6 +88,30 @@ class SchemaWorkflowOrchestrator:
             "statistics": {}
         }
     
+    def _extract_db_name_from_connection(self, connection_string: str) -> str:
+        """
+        从数据库连接字符串中提取数据库名称
+        
+        Args:
+            connection_string: PostgreSQL连接字符串
+            
+        Returns:
+            str: 数据库名称
+        """
+        try:
+            # 处理标准的PostgreSQL连接字符串: postgresql://user:pass@host:port/dbname
+            if '/' in connection_string:
+                # 取最后一个 '/' 后面的部分作为数据库名
+                db_name = connection_string.split('/')[-1]
+                # 移除可能的查询参数
+                if '?' in db_name:
+                    db_name = db_name.split('?')[0]
+                return db_name if db_name else "database"
+            else:
+                return "database"
+        except Exception:
+            return "database"
+    
     async def execute_complete_workflow(self) -> Dict[str, Any]:
         """
         执行完整的Schema处理工作流程
@@ -94,6 +138,12 @@ class SchemaWorkflowOrchestrator:
             else:
                 self.logger.info("⏭️ 跳过SQL验证步骤")
             
+            # 步骤4: 训练数据加载(可选)
+            if self.enable_training_data_load:
+                await self._execute_step_4_training_data_load()
+            else:
+                self.logger.info("⏭️ 跳过训练数据加载步骤")
+            
             # 设置结束时间
             self.workflow_state["end_time"] = time.time()
             
@@ -127,6 +177,7 @@ class SchemaWorkflowOrchestrator:
                 table_list_file=self.table_list_file,
                 business_context=self.business_context,
                 output_dir=str(self.output_dir),
+                task_id=self.task_id,  # 传递task_id
                 pipeline="full"
             )
             
@@ -169,7 +220,8 @@ class SchemaWorkflowOrchestrator:
                 output_dir=str(self.output_dir),
                 table_list_file=self.table_list_file,
                 business_context=self.business_context,
-                db_name=self.db_name
+                db_name=self.db_name,
+                task_id=self.task_id  # 传递task_id
             )
             
             # 执行Question-SQL生成
@@ -207,24 +259,37 @@ class SchemaWorkflowOrchestrator:
         step_start_time = time.time()
         
         try:
-            # 获取步骤2生成的文件
+            # 首先尝试从workflow_state获取文件(完整工作流模式)
             qs_artifacts = self.workflow_state["artifacts"].get("question_sql_generation", {})
             qs_file = qs_artifacts.get("output_file")
             
+            # 如果workflow_state中没有文件信息,则在任务目录中查找(分步执行模式)
             if not qs_file or not Path(qs_file).exists():
-                raise FileNotFoundError(f"找不到Question-SQL文件: {qs_file}")
+                self.logger.info("🔍 从workflow_state未找到文件,在任务目录中查找Question-SQL文件...")
+                
+                # 在输出目录中查找匹配的文件
+                possible_files = list(self.output_dir.glob("*_pair.json"))
+                
+                if not possible_files:
+                    raise FileNotFoundError(
+                        f"在任务目录 {self.output_dir} 中找不到Question-SQL文件(*_pair.json)。"
+                        f"请确保已执行qa_generation步骤并生成了Question-SQL对文件。"
+                    )
+                
+                # 选择最新的文件(按修改时间排序)
+                qs_file = str(max(possible_files, key=lambda f: f.stat().st_mtime))
+                self.logger.info(f"🎯 找到Question-SQL文件: {qs_file}")
             
             self.logger.info(f"📄 验证文件: {qs_file}")
             
-            # 动态设置验证配置
-            SCHEMA_TOOLS_CONFIG['sql_validation']['enable_sql_repair'] = self.enable_llm_repair
-            SCHEMA_TOOLS_CONFIG['sql_validation']['modify_original_file'] = self.modify_original_file
-            
-            # 创建SQL验证Agent
+            # 创建SQL验证Agent,通过参数传递配置而非修改全局配置
             sql_validator = SQLValidationAgent(
                 db_connection=self.db_connection,
                 input_file=str(qs_file),
-                output_dir=str(self.output_dir)
+                output_dir=str(self.output_dir),
+                task_id=self.task_id,  # 传递task_id
+                enable_sql_repair=self.enable_llm_repair,
+                modify_original_file=self.modify_original_file
             )
             
             # 执行SQL验证和修正
@@ -270,6 +335,88 @@ class SchemaWorkflowOrchestrator:
             self.logger.error(f"❌ 步骤3失败: {str(e)}")
             raise
     
+    async def _execute_step_4_training_data_load(self):
+        """步骤4: 训练数据加载"""
+        self.workflow_state["current_step"] = "training_data_load"
+        self.logger.info("=" * 60)
+        self.logger.info("🎯 步骤4: 开始加载训练数据")
+        self.logger.info("=" * 60)
+        
+        step_start_time = time.time()
+        
+        try:
+            # 确保输出目录存在所需的训练数据
+            training_data_dir = str(self.output_dir)
+            self.logger.info(f"📁 训练数据目录: {training_data_dir}")
+            
+            # 导入训练器模块
+            import sys
+            import os
+            sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+            
+            from data_pipeline.trainer.run_training import process_training_files
+            
+            # 执行训练数据加载
+            self.logger.info("🔄 开始处理训练文件...")
+            load_successful = process_training_files(training_data_dir, self.task_id)
+            
+            step_duration = time.time() - step_start_time
+            
+            if load_successful:
+                # 获取统计信息
+                from data_pipeline.trainer.vanna_trainer import flush_training, shutdown_trainer
+                
+                # 刷新批处理器
+                self.logger.info("🔄 刷新批处理器...")
+                flush_training()
+                shutdown_trainer()
+                
+                # 验证加载结果
+                try:
+                    from core.vanna_llm_factory import create_vanna_instance
+                    vn = create_vanna_instance()
+                    training_data = vn.get_training_data()
+                    
+                    if training_data is not None and not training_data.empty:
+                        total_records = len(training_data)
+                        self.logger.info(f"✅ 成功加载 {total_records} 条训练数据")
+                        
+                        # 统计数据类型
+                        if 'training_data_type' in training_data.columns:
+                            type_counts = training_data['training_data_type'].value_counts().to_dict()
+                        else:
+                            type_counts = {}
+                    else:
+                        total_records = 0
+                        type_counts = {}
+                        self.logger.warning("⚠️ 未能验证训练数据加载结果")
+                        
+                except Exception as e:
+                    self.logger.warning(f"⚠️ 验证训练数据时出错: {e}")
+                    total_records = 0
+                    type_counts = {}
+                
+                # 记录结果
+                self.workflow_state["completed_steps"].append("training_data_load")
+                self.workflow_state["artifacts"]["training_data_load"] = {
+                    "training_data_dir": training_data_dir,
+                    "load_successful": True,
+                    "total_records": total_records,
+                    "data_type_counts": type_counts,
+                    "duration": step_duration
+                }
+                self.workflow_state["statistics"]["step4_duration"] = step_duration
+                
+                self.logger.info(f"✅ 步骤4完成: 成功加载训练数据,耗时 {step_duration:.2f}秒")
+                
+            else:
+                raise Exception("训练数据加载失败:未找到可处理的训练文件")
+                
+        except Exception as e:
+            self.workflow_state["failed_steps"].append("training_data_load")
+            self.logger.error(f"❌ 步骤4失败: {str(e)}")
+            raise
+    
     async def _generate_final_report(self) -> Dict[str, Any]:
         """生成最终工作流程报告"""
         total_duration = self.workflow_state["end_time"] - self.workflow_state["start_time"]
@@ -305,12 +452,14 @@ class SchemaWorkflowOrchestrator:
                 "output_directory": str(self.output_dir),
                 "enable_sql_validation": self.enable_sql_validation,
                 "enable_llm_repair": self.enable_llm_repair,
-                "modify_original_file": self.modify_original_file
+                "modify_original_file": self.modify_original_file,
+                "enable_training_data_load": self.enable_training_data_load
             },
             "processing_results": {
                 "ddl_md_generation": self.workflow_state["artifacts"].get("ddl_md_generation", {}),
                 "question_sql_generation": self.workflow_state["artifacts"].get("question_sql_generation", {}),
-                "sql_validation": self.workflow_state["artifacts"].get("sql_validation", {})
+                "sql_validation": self.workflow_state["artifacts"].get("sql_validation", {}),
+                "training_data_load": self.workflow_state["artifacts"].get("training_data_load", {})
             },
             "final_outputs": {
                 "primary_output_file": final_output_file,
@@ -322,6 +471,7 @@ class SchemaWorkflowOrchestrator:
                 "step1_duration": round(self.workflow_state["statistics"].get("step1_duration", 0), 2),
                 "step2_duration": round(self.workflow_state["statistics"].get("step2_duration", 0), 2),
                 "step3_duration": round(self.workflow_state["statistics"].get("step3_duration", 0), 2),
+                "step4_duration": round(self.workflow_state["statistics"].get("step4_duration", 0), 2),
                 "total_duration": round(total_duration, 2)
             }
         }
@@ -421,28 +571,32 @@ def setup_argument_parser():
         epilog="""
 示例用法:
   # 完整工作流程
-  python -m schema_tools.schema_workflow_orchestrator \\
-    --db-connection "postgresql://user:pass@localhost:5432/dbname" \\
+  python -m data_pipeline.schema_workflow \\
+    --db-connection "postgresql://user:pass@localhost:5432/highway_db" \\
     --table-list tables.txt \\
     --business-context "高速公路服务区管理系统" \\
-    --db-name highway_db \\
-    --output-dir ./output
+    --output-dir ./data_pipeline/training_data/
   
   # 跳过SQL验证
-  python -m schema_tools.schema_workflow_orchestrator \\
-    --db-connection "postgresql://user:pass@localhost:5432/dbname" \\
+  python -m data_pipeline.schema_workflow \\
+    --db-connection "postgresql://user:pass@localhost:5432/ecommerce_db" \\
     --table-list tables.txt \\
     --business-context "电商系统" \\
-    --db-name ecommerce_db \\
     --skip-validation
   
   # 禁用LLM修复
-  python -m schema_tools.schema_workflow_orchestrator \\
-    --db-connection "postgresql://user:pass@localhost:5432/dbname" \\
+  python -m data_pipeline.schema_workflow \\
+    --db-connection "postgresql://user:pass@localhost:5432/management_db" \\
     --table-list tables.txt \\
     --business-context "管理系统" \\
-    --db-name management_db \\
     --disable-llm-repair
+  
+  # 跳过训练数据加载
+  python -m data_pipeline.schema_workflow \\
+    --db-connection "postgresql://user:pass@localhost:5432/management_db" \\
+    --table-list tables.txt \\
+    --business-context "管理系统" \\
+    --skip-training-load
         """
     )
     
@@ -465,17 +619,12 @@ def setup_argument_parser():
         help="业务上下文描述"
     )
     
-    parser.add_argument(
-        "--db-name",
-        required=True,
-        help="数据库名称(用于生成文件名)"
-    )
     
     # 可选参数
     parser.add_argument(
         "--output-dir",
-        default="./output",
-        help="输出目录(默认:./output)"
+        default="./data_pipeline/training_data/",
+        help="输出目录(默认:./data_pipeline/training_data/)"
     )
     
     parser.add_argument(
@@ -496,6 +645,12 @@ def setup_argument_parser():
         help="不修改原始JSON文件(仅生成报告)"
     )
     
+    parser.add_argument(
+        "--skip-training-load",
+        action="store_true",
+        help="跳过训练数据加载步骤"
+    )
+    
     parser.add_argument(
         "--verbose", "-v",
         action="store_true",
@@ -526,7 +681,13 @@ async def main():
     
+    # 为脚本模式生成task_id,并提前初始化独立日志系统,
+    # 确保后续try/except分支中logger始终已定义
+    from datetime import datetime
+    from data_pipeline.dp_logging import get_logger
+    script_task_id = f"manual_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+    logger = get_logger("SchemaWorkflow", script_task_id)
+    
     # 验证输入文件
     if not os.path.exists(args.table_list):
-        print(f"错误: 表清单文件不存在: {args.table_list}")
+        logger.error(f"错误: 表清单文件不存在: {args.table_list}")
         sys.exit(1)
     
     try:
@@ -535,21 +696,28 @@ async def main():
             db_connection=args.db_connection,
             table_list_file=args.table_list,
             business_context=args.business_context,
-            db_name=args.db_name,
             output_dir=args.output_dir,
             enable_sql_validation=not args.skip_validation,
             enable_llm_repair=not args.disable_llm_repair,
-            modify_original_file=not args.no_modify_file
+            modify_original_file=not args.no_modify_file,
+            enable_training_data_load=not args.skip_training_load
         )
         
-        # 显示启动信息
-        print(f"🚀 开始执行Schema工作流编排...")
-        print(f"📁 输出目录: {args.output_dir}")
-        print(f"📋 表清单: {args.table_list}")
-        print(f"🏢 业务背景: {args.business_context}")
-        print(f"💾 数据库: {args.db_name}")
-        print(f"🔍 SQL验证: {'启用' if not args.skip_validation else '禁用'}")
-        print(f"🔧 LLM修复: {'启用' if not args.disable_llm_repair else '禁用'}")
+        # 显示启动信息(logger已在main()开头初始化,避免重复生成task_id)
+        logger.info(f"🚀 开始执行Schema工作流编排...")
+        logger.info(f"📁 输出目录: {args.output_dir}")
+        logger.info(f"📋 表清单: {args.table_list}")
+        logger.info(f"🏢 业务背景: {args.business_context}")
+        logger.info(f"💾 数据库: {orchestrator.db_name}")
+        logger.info(f"🔍 SQL验证: {'启用' if not args.skip_validation else '禁用'}")
+        logger.info(f"🔧 LLM修复: {'启用' if not args.disable_llm_repair else '禁用'}")
+        logger.info(f"🎯 训练数据加载: {'启用' if not args.skip_training_load else '禁用'}")
         
         # 执行完整工作流程
         report = await orchestrator.execute_complete_workflow()
@@ -560,23 +728,23 @@ async def main():
         # 输出结果并设置退出码
         if report["success"]:
             if report["processing_results"].get("sql_validation", {}).get("success_rate", 1.0) >= 0.8:
-                print(f"\n🎉 工作流程执行成功!")
+                logger.info(f"🎉 工作流程执行成功!")
                 exit_code = 0  # 完全成功
             else:
-                print(f"\n⚠️  工作流程执行完成,但SQL验证成功率较低")
+                logger.warning(f"⚠️  工作流程执行完成,但SQL验证成功率较低")
                 exit_code = 1  # 部分成功
         else:
-            print(f"\n❌ 工作流程执行失败")
+            logger.error(f"❌ 工作流程执行失败")
             exit_code = 2  # 失败
         
-        print(f"📄 主要输出文件: {report['final_outputs']['primary_output_file']}")
+        logger.info(f"📄 主要输出文件: {report['final_outputs']['primary_output_file']}")
         sys.exit(exit_code)
         
     except KeyboardInterrupt:
-        print("\n\n⏹️  用户中断,程序退出")
+        logger.info("⏹️  用户中断,程序退出")
         sys.exit(130)
     except Exception as e:
-        print(f"\n❌ 程序执行失败: {e}")
+        logger.error(f"❌ 程序执行失败: {e}")
         if args.verbose:
             import traceback
             traceback.print_exc()
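
A minimal programmatic sketch of the reworked orchestrator above: the database name is now parsed from --db-connection (the --db-name argument is gone), and step 4 (training data load) can be toggled off. Module path and constructor arguments follow the CLI epilog and main() shown in this diff; the DSN, table list and business context below are placeholders.

import asyncio

from data_pipeline.schema_workflow import SchemaWorkflowOrchestrator

async def run():
    orchestrator = SchemaWorkflowOrchestrator(
        db_connection="postgresql://user:pass@localhost:5432/highway_db",  # db_name is derived from this DSN
        table_list_file="data_pipeline/tables.txt",
        business_context="高速公路服务区管理系统",
        output_dir="./data_pipeline/training_data/",
        enable_sql_validation=True,
        enable_llm_repair=True,
        modify_original_file=True,
        enable_training_data_load=True,   # step 4; set False to mirror --skip-training-load
    )
    report = await orchestrator.execute_complete_workflow()
    # report mirrors _generate_final_report(): per-step artifacts plus step1..step4 durations
    print(report["success"], report["final_outputs"]["primary_output_file"])

asyncio.run(run())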

+ 235 - 0
data_pipeline/sql/init_tables.sql

@@ -0,0 +1,235 @@
+-- Data Pipeline API 数据库初始化脚本
+-- 
+-- 此脚本在pgvector向量数据库中创建Data Pipeline API系统所需的表和索引
+-- 注意:这些表应该创建在pgvector数据库中,而不是业务数据库中
+-- 
+-- 执行方式(使用PGVECTOR_CONFIG中的连接信息):
+-- psql -h host -p port -U username -d pgvector_database_name -f init_tables.sql
+
+-- 设置客户端编码
+SET client_encoding = 'UTF8';
+
+-- 开始事务
+BEGIN;
+
+-- ====================================================================
+-- 任务主表 (data_pipeline_tasks)
+-- ====================================================================
+CREATE TABLE IF NOT EXISTS data_pipeline_tasks (
+    -- 主键:时间戳格式的任务ID
+    task_id VARCHAR(32) PRIMARY KEY,               -- 'task_20250627_143052'
+    
+    -- 任务基本信息
+    task_name VARCHAR(255),                        -- 任务自定义名称(可选)
+    task_type VARCHAR(50) NOT NULL DEFAULT 'data_workflow',
+    status VARCHAR(20) NOT NULL DEFAULT 'pending', -- pending/in_progress/partial_completed/completed/failed
+    
+    -- 配置和结果(JSON格式)
+    parameters JSONB NOT NULL,                     -- 任务配置参数
+    result JSONB,                                  -- 最终执行结果
+    
+    -- 错误处理
+    error_message TEXT,                            -- 错误详细信息
+    
+    -- 时间戳
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    started_at TIMESTAMP,
+    completed_at TIMESTAMP,
+    
+    -- 创建者信息
+    created_type VARCHAR(50) DEFAULT 'api',        -- 'api', 'manual', 'system'
+    by_user VARCHAR(50),                           -- 'guest'或其它user_id
+    
+    -- 输出目录
+    output_directory TEXT,                         -- 任务输出目录路径
+    
+    -- 索引字段
+    db_name VARCHAR(100)                           -- 数据库名称(便于筛选)
+);
+
+-- 添加约束
+ALTER TABLE data_pipeline_tasks ADD CONSTRAINT chk_task_status 
+    CHECK (status IN ('pending', 'in_progress', 'partial_completed', 'completed', 'failed'));
+
+ALTER TABLE data_pipeline_tasks ADD CONSTRAINT chk_task_type 
+    CHECK (task_type IN ('data_workflow', 'complete_workflow'));
+
+ALTER TABLE data_pipeline_tasks ADD CONSTRAINT chk_created_type 
+    CHECK (created_type IN ('api', 'manual', 'system'));
+
+-- ====================================================================
+-- 任务步骤状态表 (data_pipeline_task_steps)
+-- ====================================================================
+CREATE TABLE IF NOT EXISTS data_pipeline_task_steps (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(task_id) ON DELETE CASCADE,
+    execution_id VARCHAR(100),                    -- 执行批次ID(可为空)
+    step_name VARCHAR(50) NOT NULL,               -- 'ddl_generation', 'qa_generation', 'sql_validation', 'training_load'
+    step_status VARCHAR(50) NOT NULL DEFAULT 'pending', -- 'pending', 'running', 'completed', 'failed'
+    started_at TIMESTAMP,
+    completed_at TIMESTAMP,
+    error_message TEXT                            -- 错误详细信息
+);
+
+-- 添加约束
+ALTER TABLE data_pipeline_task_steps ADD CONSTRAINT chk_step_status 
+    CHECK (step_status IN ('pending', 'running', 'completed', 'failed'));
+
+ALTER TABLE data_pipeline_task_steps ADD CONSTRAINT chk_step_name 
+    CHECK (step_name IN ('ddl_generation', 'qa_generation', 'sql_validation', 'training_load'));
+
+
+
+-- ====================================================================
+-- 创建索引
+-- ====================================================================
+
+-- 任务表索引
+CREATE INDEX IF NOT EXISTS idx_tasks_status ON data_pipeline_tasks(status);
+CREATE INDEX IF NOT EXISTS idx_tasks_created_at ON data_pipeline_tasks(created_at DESC);
+CREATE INDEX IF NOT EXISTS idx_tasks_db_name ON data_pipeline_tasks(db_name);
+CREATE INDEX IF NOT EXISTS idx_tasks_created_type ON data_pipeline_tasks(created_type);
+CREATE INDEX IF NOT EXISTS idx_tasks_task_type ON data_pipeline_tasks(task_type);
+CREATE INDEX IF NOT EXISTS idx_tasks_task_name ON data_pipeline_tasks(task_name);
+
+-- 步骤状态表索引
+CREATE INDEX IF NOT EXISTS idx_steps_task_id ON data_pipeline_task_steps(task_id);
+CREATE INDEX IF NOT EXISTS idx_steps_step_name ON data_pipeline_task_steps(step_name);
+CREATE INDEX IF NOT EXISTS idx_steps_step_status ON data_pipeline_task_steps(step_status);
+CREATE INDEX IF NOT EXISTS idx_steps_started_at ON data_pipeline_task_steps(started_at DESC);
+CREATE INDEX IF NOT EXISTS idx_steps_task_step ON data_pipeline_task_steps(task_id, step_name);
+
+-- ====================================================================
+-- 创建清理函数
+-- ====================================================================
+
+-- 清理旧任务的函数
+CREATE OR REPLACE FUNCTION cleanup_old_data_pipeline_tasks(days_to_keep INTEGER DEFAULT 30)
+RETURNS INTEGER AS $$
+DECLARE
+    deleted_count INTEGER;
+    cutoff_date TIMESTAMP;
+BEGIN
+    cutoff_date := NOW() - INTERVAL '1 day' * days_to_keep;
+    
+    -- 删除旧任务(级联删除相关步骤记录)
+    DELETE FROM data_pipeline_tasks 
+    WHERE created_at < cutoff_date 
+    AND status IN ('completed', 'failed');
+    
+    GET DIAGNOSTICS deleted_count = ROW_COUNT;
+    
+    RETURN deleted_count;
+END;
+$$ LANGUAGE plpgsql;
+
+-- 获取任务统计信息的函数
+CREATE OR REPLACE FUNCTION get_data_pipeline_task_stats()
+RETURNS TABLE (
+    total_tasks INTEGER,
+    pending_tasks INTEGER,
+    running_tasks INTEGER,
+    completed_tasks INTEGER,
+    failed_tasks INTEGER,
+    avg_completion_time INTERVAL
+) AS $$
+BEGIN
+    RETURN QUERY
+    SELECT 
+        COUNT(*)::INTEGER as total_tasks,
+        COUNT(*) FILTER (WHERE status = 'pending')::INTEGER as pending_tasks,
+        COUNT(*) FILTER (WHERE status IN ('in_progress'))::INTEGER as running_tasks,
+        COUNT(*) FILTER (WHERE status = 'completed')::INTEGER as completed_tasks,
+        COUNT(*) FILTER (WHERE status = 'failed')::INTEGER as failed_tasks,
+        AVG(completed_at - started_at) FILTER (WHERE status = 'completed') as avg_completion_time
+    FROM data_pipeline_tasks;
+END;
+$$ LANGUAGE plpgsql;
+
+-- 检查僵尸任务的函数
+CREATE OR REPLACE FUNCTION check_zombie_data_pipeline_tasks(timeout_hours INTEGER DEFAULT 2)
+RETURNS INTEGER AS $$
+DECLARE
+    zombie_count INTEGER;
+    cutoff_time TIMESTAMP;
+BEGIN
+    cutoff_time := NOW() - INTERVAL '1 hour' * timeout_hours;
+    
+    -- 查找超时的运行中步骤
+    UPDATE data_pipeline_task_steps 
+    SET step_status = 'failed',
+        error_message = FORMAT('步骤执行超时(超过%s小时),可能已停止运行', timeout_hours),
+        completed_at = NOW()
+    WHERE step_status = 'running' 
+    AND started_at < cutoff_time;
+    
+    GET DIAGNOSTICS zombie_count = ROW_COUNT;
+    
+    -- 更新相关任务状态
+    UPDATE data_pipeline_tasks 
+    SET status = 'failed',
+        error_message = FORMAT('任务超时(超过%s小时),可能已停止运行', timeout_hours)
+    WHERE status IN ('in_progress') 
+    AND started_at < cutoff_time;
+    
+    RETURN zombie_count;
+END;
+$$ LANGUAGE plpgsql;
+
+-- ====================================================================
+-- 插入初始数据(如果需要)
+-- ====================================================================
+
+-- 这里可以插入一些初始配置数据
+-- 目前暂时不需要
+
+-- ====================================================================
+-- 创建视图(便于查询)
+-- ====================================================================
+
+-- 任务步骤概览视图
+CREATE OR REPLACE VIEW v_task_step_overview AS
+SELECT 
+    t.task_id,
+    t.task_name,
+    t.task_type,
+    t.status as task_status,
+    t.created_at,
+    t.started_at,
+    t.completed_at,
+    t.created_type,
+    t.by_user,
+    t.db_name,
+    s.step_name,
+    s.step_status,
+    s.started_at as step_started_at,
+    s.completed_at as step_completed_at,
+    s.error_message as step_error_message
+FROM data_pipeline_tasks t
+LEFT JOIN data_pipeline_task_steps s ON t.task_id = s.task_id
+ORDER BY t.created_at DESC, 
+         CASE s.step_name 
+           WHEN 'ddl_generation' THEN 1
+           WHEN 'qa_generation' THEN 2
+           WHEN 'sql_validation' THEN 3
+           WHEN 'training_load' THEN 4
+           ELSE 5 
+         END;
+
+-- 提交事务
+COMMIT;
+
+-- 输出创建结果
+\echo 'Data Pipeline API 数据库表创建完成!'
+\echo ''
+\echo '已创建的表:'
+\echo '- data_pipeline_tasks: 任务主表'
+\echo '- data_pipeline_task_steps: 任务步骤状态表'
+\echo ''
+\echo '已创建的函数:'
+\echo '- cleanup_old_data_pipeline_tasks(days): 清理旧任务'
+\echo '- get_data_pipeline_task_stats(): 获取任务统计'
+\echo '- check_zombie_data_pipeline_tasks(hours): 检查僵尸任务'
+\echo ''
+\echo '已创建的视图:'
+\echo '- v_task_step_overview: 任务步骤概览'
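
The script above also ships maintenance helpers (cleanup_old_data_pipeline_tasks, check_zombie_data_pipeline_tasks, get_data_pipeline_task_stats) and the v_task_step_overview view. A minimal sketch of calling them from Python with psycopg2, assuming init_tables.sql has already been applied to the pgvector database; the DSN is a placeholder for the PGVECTOR_CONFIG connection.

import psycopg2

# Placeholder DSN; use the PGVECTOR_CONFIG connection in practice.
conn = psycopg2.connect("postgresql://user:pass@localhost:5432/pgvector_db")
try:
    with conn.cursor() as cur:
        cur.execute("SELECT cleanup_old_data_pipeline_tasks(30)")   # purge completed/failed tasks older than 30 days
        print("deleted tasks:", cur.fetchone()[0])

        cur.execute("SELECT check_zombie_data_pipeline_tasks(2)")   # mark steps stuck in 'running' for >2h as failed
        print("zombie steps closed:", cur.fetchone()[0])

        cur.execute("SELECT * FROM get_data_pipeline_task_stats()")
        print("task stats:", cur.fetchone())
    conn.commit()
finally:
    conn.close()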

+ 5 - 5
schema_tools/tables.txt → data_pipeline/tables.txt

@@ -5,9 +5,9 @@
 # 服务区相关表
 bss_car_day_count
 bss_business_day_data
-bss_company
-bss_section_route
-bss_section_route_area_link
-bss_service_area
-bss_service_area_mapper
+#bss_company
+#bss_section_route
+#bss_section_route_area_link
+#bss_service_area
+#bss_service_area_mapper
 

+ 77 - 0
data_pipeline/task_executor.py

@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""
+Data Pipeline 独立任务执行器
+
+专门用于subprocess调用,执行data pipeline任务
+"""
+
+import sys
+import asyncio
+import argparse
+import json
+from pathlib import Path
+
+# 确保能够导入项目模块
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from data_pipeline.api.simple_workflow import SimpleWorkflowExecutor
+
+
+def main():
+    """主执行函数"""
+    parser = argparse.ArgumentParser(description='Data Pipeline 任务执行器')
+    parser.add_argument('--task-id', required=True, help='任务ID')
+    parser.add_argument('--execution-mode', default='complete', choices=['complete', 'step'], help='执行模式')
+    parser.add_argument('--step-name', help='步骤名称(当execution-mode=step时必需)')
+    
+    args = parser.parse_args()
+    
+    # 注意:此处不初始化全局日志系统,任务内部使用data_pipeline的独立日志
+    
+    # 验证参数
+    if args.execution_mode == 'step' and not args.step_name:
+        print("错误: 步骤执行模式需要指定--step-name参数", file=sys.stderr)
+        sys.exit(1)
+    
+    try:
+        # 执行任务
+        result = asyncio.run(execute_task(args.task_id, args.execution_mode, args.step_name))
+        
+        # 输出结果到stdout(供父进程读取)
+        print(json.dumps(result, ensure_ascii=False, default=str))
+        
+        # 设置退出码
+        sys.exit(0 if result.get('success', False) else 1)
+        
+    except Exception as e:
+        error_result = {
+            "success": False,
+            "error": str(e),
+            "task_id": args.task_id,
+            "execution_mode": args.execution_mode
+        }
+        print(json.dumps(error_result, ensure_ascii=False), file=sys.stderr)
+        sys.exit(1)
+
+
+async def execute_task(task_id: str, execution_mode: str, step_name: str = None):
+    """执行任务的异步函数"""
+    executor = None
+    try:
+        executor = SimpleWorkflowExecutor(task_id)
+        
+        if execution_mode == "complete":
+            return await executor.execute_complete_workflow()
+        elif execution_mode == "step":
+            return await executor.execute_single_step(step_name)
+        else:
+            raise ValueError(f"不支持的执行模式: {execution_mode}")
+            
+    finally:
+        if executor:
+            executor.cleanup()
+
+
+if __name__ == "__main__":
+    main()
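
task_executor.py is meant to be driven as a child process: it writes a JSON result to stdout and reports success through the exit code. A minimal sketch of a parent-side call, assuming it is launched from the project root; the task id and step name are placeholders (step names follow the chk_step_name constraint in init_tables.sql above).

import json
import subprocess
import sys

proc = subprocess.run(
    [sys.executable, "-m", "data_pipeline.task_executor",
     "--task-id", "task_20250701_131627",        # placeholder task id
     "--execution-mode", "step",
     "--step-name", "ddl_generation"],           # --step-name only required in step mode
    capture_output=True,
    text=True,
)

if proc.returncode == 0:
    result = json.loads(proc.stdout)             # success payload printed by main()
    print("step finished:", result)
else:
    print("execution failed:", proc.stderr)      # error JSON / message goes to stderr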

+ 0 - 0
schema_tools/tools/__init__.py → data_pipeline/tools/__init__.py


+ 29 - 28
schema_tools/tools/base.py → data_pipeline/tools/base.py

@@ -1,52 +1,46 @@
 import asyncio
 import time
-import logging
 from abc import ABC, abstractmethod
+import logging
 from typing import Dict, Any, Optional, Type, List
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext
 
 class ToolRegistry:
     """工具注册管理器"""
     _tools: Dict[str, Type['BaseTool']] = {}
-    _instances: Dict[str, 'BaseTool'] = {}
     
     @classmethod
     def register(cls, name: str):
         """装饰器:注册工具"""
         def decorator(tool_class: Type['BaseTool']):
             cls._tools[name] = tool_class
-            logging.debug(f"注册工具: {name} -> {tool_class.__name__}")
+            logger = logging.getLogger("ToolRegistry")
+            logger.debug(f"注册工具: {name} -> {tool_class.__name__}")
             return tool_class
         return decorator
     
     @classmethod
     def get_tool(cls, name: str, **kwargs) -> 'BaseTool':
-        """获取工具实例,支持单例模式"""
-        if name not in cls._instances:
-            if name not in cls._tools:
-                raise ValueError(f"工具 '{name}' 未注册")
-            
-            tool_class = cls._tools[name]
-            
-            # 自动注入vanna实例到需要LLM的工具
-            if hasattr(tool_class, 'needs_llm') and tool_class.needs_llm:
-                from core.vanna_llm_factory import create_vanna_instance
-                kwargs['vn'] = create_vanna_instance()
-                logging.debug(f"为工具 {name} 注入LLM实例")
-            
-            cls._instances[name] = tool_class(**kwargs)
+        """获取工具实例,每次返回新实例确保参数正确传递"""
+        if name not in cls._tools:
+            raise ValueError(f"工具 '{name}' 未注册")
         
-        return cls._instances[name]
+        tool_class = cls._tools[name]
+        
+        # 自动注入vanna实例到需要LLM的工具
+        if hasattr(tool_class, 'needs_llm') and tool_class.needs_llm:
+            from core.vanna_llm_factory import create_vanna_instance
+            kwargs['vn'] = create_vanna_instance()
+            logger = logging.getLogger("ToolRegistry")
+            logger.debug(f"为工具 {name} 注入LLM实例")
+        
+        # 每次返回新实例,避免单例模式导致的数据库连接混乱
+        return tool_class(**kwargs)
     
     @classmethod
     def list_tools(cls) -> List[str]:
         """列出所有已注册的工具"""
         return list(cls._tools.keys())
-    
-    @classmethod
-    def clear_instances(cls):
-        """清除所有工具实例(用于测试)"""
-        cls._instances.clear()
 
 class BaseTool(ABC):
     """工具基类"""
@@ -55,7 +49,7 @@ class BaseTool(ABC):
     tool_name: str = ""      # 工具名称
     
     def __init__(self, **kwargs):
-        self.logger = logging.getLogger(f"schema_tools.{self.__class__.__name__}")
+        self.logger = logging.getLogger(f"tools.{self.__class__.__name__}")
         
         # 如果工具需要LLM,检查是否已注入
         if self.needs_llm and 'vn' not in kwargs:
@@ -113,7 +107,7 @@ class PipelineExecutor:
     
     def __init__(self, pipeline_config: Dict[str, List[str]]):
         self.pipeline_config = pipeline_config
-        self.logger = logging.getLogger("schema_tools.PipelineExecutor")
+        self.logger = logging.getLogger("tools.PipelineExecutor")
     
     async def execute_pipeline(self, pipeline_name: str, context: TableProcessingContext) -> Dict[str, ProcessingResult]:
         """执行指定的处理链"""
@@ -127,7 +121,14 @@ class PipelineExecutor:
         
         for step_name in steps:
             try:
-                tool = ToolRegistry.get_tool(step_name)
+                # 为工具传递数据库连接参数(从上下文中获取)
+                tool_kwargs = {}
+                if hasattr(context, 'db_connection') and context.db_connection:
+                    tool_kwargs['db_connection'] = context.db_connection
+                if hasattr(context, 'business_context') and context.business_context:
+                    tool_kwargs['business_context'] = context.business_context
+                
+                tool = ToolRegistry.get_tool(step_name, **tool_kwargs)
                 
                 # 验证输入
                 if not tool.validate_input(context):
@@ -143,7 +144,7 @@ class PipelineExecutor:
                 
                 # 如果步骤失败且不允许继续,则停止
                 if not result.success:
-                    from schema_tools.config import SCHEMA_TOOLS_CONFIG
+                    from data_pipeline.config import SCHEMA_TOOLS_CONFIG
                     if not SCHEMA_TOOLS_CONFIG["continue_on_error"]:
                         self.logger.error(f"步骤 {step_name} 失败,停止处理链执行")
                         break
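
With the singleton cache removed, ToolRegistry.get_tool() builds a fresh tool per call and PipelineExecutor forwards db_connection/business_context from the processing context, so per-task connection strings no longer leak between runs. A minimal sketch of a tool written against that contract; the RowCounterTool name and the DSNs are illustrative only, and validate_input is overridden defensively in case the base class declares it abstract.

from data_pipeline.tools.base import BaseTool, ToolRegistry
from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext

@ToolRegistry.register("row_counter")            # hypothetical tool name
class RowCounterTool(BaseTool):
    needs_llm = False
    tool_name = "行数统计"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.db_connection = kwargs.get("db_connection")   # injected by PipelineExecutor

    def validate_input(self, context: TableProcessingContext) -> bool:
        return context.table_metadata is not None

    async def execute(self, context: TableProcessingContext) -> ProcessingResult:
        # Real work would query the table via self.db_connection (e.g. with asyncpg).
        return ProcessingResult(
            success=True,
            data={"table": context.table_metadata.full_name},
            metadata={"tool": self.tool_name},
        )

# Each call returns a new instance, so different tasks can use different connections.
tool_a = ToolRegistry.get_tool("row_counter", db_connection="postgresql://u:p@host:5432/db1")
tool_b = ToolRegistry.get_tool("row_counter", db_connection="postgresql://u:p@host:5432/db2")
assert tool_a is not tool_b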

+ 24 - 7
schema_tools/tools/comment_generator.py → data_pipeline/tools/comment_generator.py

@@ -1,7 +1,7 @@
 import asyncio
 from typing import List, Dict, Any, Tuple
-from schema_tools.tools.base import BaseTool, ToolRegistry
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo
+from data_pipeline.tools.base import BaseTool, ToolRegistry
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo
 
 @ToolRegistry.register("comment_generator")
 class CommentGeneratorTool(BaseTool):
@@ -13,6 +13,7 @@ class CommentGeneratorTool(BaseTool):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.business_context = kwargs.get('business_context', '')
+        self.db_connection = kwargs.get('db_connection')  # 支持传入数据库连接字符串
         self.business_dictionary = self._load_business_dictionary()
     
     async def execute(self, context: TableProcessingContext) -> ProcessingResult:
@@ -242,7 +243,7 @@ class CommentGeneratorTool(BaseTool):
     
     async def _call_llm_with_retry(self, prompt: str, max_retries: int = 3) -> str:
         """带重试的LLM调用"""
-        from schema_tools.config import SCHEMA_TOOLS_CONFIG
+        from data_pipeline.config import SCHEMA_TOOLS_CONFIG
         
         for attempt in range(max_retries):
             try:
@@ -342,13 +343,26 @@ class CommentGeneratorTool(BaseTool):
     
     async def _validate_enum_suggestions(self, table_metadata, enum_suggestions: List[Dict]) -> List[Dict]:
         """验证枚举建议"""
-        from schema_tools.tools.database_inspector import DatabaseInspectorTool
-        from schema_tools.config import SCHEMA_TOOLS_CONFIG
+        import asyncpg
+        from data_pipeline.config import SCHEMA_TOOLS_CONFIG
         
         validated_enums = []
-        inspector = ToolRegistry.get_tool("database_inspector")
         sample_limit = SCHEMA_TOOLS_CONFIG["enum_detection_sample_limit"]
         
+        # 获取数据库连接字符串 - 优先使用传入的连接字符串
+        db_connection = self.db_connection
+        
+        # 如果没有传入连接字符串,尝试从vanna实例获取
+        if not db_connection:
+            if hasattr(self.vn, 'connection_string'):
+                db_connection = self.vn.connection_string
+            elif hasattr(self.vn, '_connection_string'):
+                db_connection = self.vn._connection_string
+        
+        if not db_connection:
+            self.logger.warning("无法获取数据库连接字符串,跳过枚举验证")
+            return validated_enums
+        
         for enum_info in enum_suggestions:
             field_name = enum_info['field_name']
             
@@ -363,7 +377,8 @@ class CommentGeneratorTool(BaseTool):
                 LIMIT {sample_limit}
                 """
                 
-                async with inspector.connection_pool.acquire() as conn:
+                conn = await asyncpg.connect(db_connection)
+                try:
                     rows = await conn.fetch(query)
                     
                     actual_values = [str(row['value']) for row in rows]
@@ -381,6 +396,8 @@ class CommentGeneratorTool(BaseTool):
                         self.logger.info(f"确认字段 {field_name} 为枚举类型,包含 {len(actual_values)} 个值")
                     else:
                         self.logger.info(f"字段 {field_name} 不同值过多({len(actual_values)}),不认为是枚举")
+                finally:
+                    await conn.close()
                         
             except Exception as e:
                 self.logger.warning(f"验证字段 {field_name} 的枚举建议失败: {e}")

+ 14 - 11
schema_tools/tools/data_sampler.py → data_pipeline/tools/data_sampler.py

@@ -1,7 +1,7 @@
 import random
 from typing import List, Dict, Any
-from schema_tools.tools.base import BaseTool, ToolRegistry
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext, TableMetadata
+from data_pipeline.tools.base import BaseTool, ToolRegistry
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext, TableMetadata
 
 @ToolRegistry.register("data_sampler")
 class DataSamplerTool(BaseTool):
@@ -17,7 +17,7 @@ class DataSamplerTool(BaseTool):
     async def execute(self, context: TableProcessingContext) -> ProcessingResult:
         """执行数据采样"""
         try:
-            from schema_tools.config import SCHEMA_TOOLS_CONFIG
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
             
             table_metadata = context.table_metadata
             sample_limit = SCHEMA_TOOLS_CONFIG["sample_data_limit"]
@@ -51,27 +51,28 @@ class DataSamplerTool(BaseTool):
     
     async def _simple_sample(self, table_metadata: TableMetadata, limit: int) -> List[Dict[str, Any]]:
         """简单采样策略"""
-        from schema_tools.tools.database_inspector import DatabaseInspectorTool
-        
-        # 复用数据库检查工具的连接
-        inspector = ToolRegistry.get_tool("database_inspector")
+        import asyncpg
         
+        # 直接使用数据库连接字符串创建连接
         query = f"SELECT * FROM {table_metadata.full_name} LIMIT {limit}"
         
-        async with inspector.connection_pool.acquire() as conn:
+        conn = await asyncpg.connect(self.db_connection)
+        try:
             rows = await conn.fetch(query)
             return [dict(row) for row in rows]
+        finally:
+            await conn.close()
     
     async def _smart_sample_large_table(self, table_metadata: TableMetadata, limit: int) -> List[Dict[str, Any]]:
         """智能采样策略(用于大表)"""
-        from schema_tools.tools.database_inspector import DatabaseInspectorTool
+        import asyncpg
         
-        inspector = ToolRegistry.get_tool("database_inspector")
         samples_per_section = max(1, limit // 3)
         
         samples = []
         
-        async with inspector.connection_pool.acquire() as conn:
+        conn = await asyncpg.connect(self.db_connection)
+        try:
             # 1. 前N行采样
             front_query = f"SELECT * FROM {table_metadata.full_name} LIMIT {samples_per_section}"
             front_rows = await conn.fetch(front_query)
@@ -118,5 +119,7 @@ class DataSamplerTool(BaseTool):
                         samples.append(row_dict)
                 except Exception as e:
                     self.logger.warning(f"尾部采样失败: {e}")
+        finally:
+            await conn.close()
         
         return samples[:limit]  # 确保不超过限制
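
The sampler above now opens a short-lived asyncpg connection per call and, for large tables, splits the sample budget into three roughly equal sections (head, middle, tail). A compressed sketch of that idea; the ORDER BY RANDOM() middle query and the OFFSET-based tail query are illustrations of the approach, not necessarily the exact statements the tool issues.

import asyncio
import asyncpg

async def sample_large_table(dsn: str, table: str, limit: int = 20):
    per_section = max(1, limit // 3)
    conn = await asyncpg.connect(dsn)
    try:
        head = await conn.fetch(f"SELECT * FROM {table} LIMIT {per_section}")
        middle = await conn.fetch(
            f"SELECT * FROM {table} ORDER BY RANDOM() LIMIT {per_section}"
        )
        tail = await conn.fetch(
            f"SELECT * FROM {table} "
            f"OFFSET GREATEST(0, (SELECT COUNT(*) FROM {table}) - {per_section})"
        )
        rows = [dict(r) for r in (*head, *middle, *tail)]
        return rows[:limit]          # never exceed the requested limit
    finally:
        await conn.close()

# asyncio.run(sample_large_table("postgresql://u:p@host:5432/db", "public.bss_car_day_count"))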

+ 2 - 2
schema_tools/tools/database_inspector.py → data_pipeline/tools/database_inspector.py

@@ -1,8 +1,8 @@
 import asyncio
 import asyncpg
 from typing import List, Dict, Any, Optional
-from schema_tools.tools.base import BaseTool, ToolRegistry
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
+from data_pipeline.tools.base import BaseTool, ToolRegistry
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
 
 @ToolRegistry.register("database_inspector")
 class DatabaseInspectorTool(BaseTool):

+ 46 - 30
schema_tools/tools/ddl_generator.py → data_pipeline/tools/ddl_generator.py

@@ -1,8 +1,8 @@
 import os
 from typing import List, Dict, Any
-from schema_tools.tools.base import BaseTool, ToolRegistry
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.tools.base import BaseTool, ToolRegistry
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
 
 @ToolRegistry.register("ddl_generator")
 class DDLGeneratorTool(BaseTool):
@@ -22,33 +22,49 @@ class DDLGeneratorTool(BaseTool):
             # 生成DDL内容
             ddl_content = self._generate_ddl_content(table_metadata)
             
-            # 确定文件名和路径
-            filename = context.file_manager.get_safe_filename(
-                table_metadata.schema_name,
-                table_metadata.table_name,
-                SCHEMA_TOOLS_CONFIG["ddl_file_suffix"]
-            )
-            
-            # 确定子目录
-            subdirectory = "ddl" if SCHEMA_TOOLS_CONFIG["create_subdirectories"] else None
-            filepath = context.file_manager.get_full_path(filename, subdirectory)
-            
-            # 写入文件
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(ddl_content)
-            
-            self.logger.info(f"DDL文件已生成: {filepath}")
-            
-            return ProcessingResult(
-                success=True,
-                data={
-                    'filename': filename,
-                    'filepath': filepath,
-                    'content_length': len(ddl_content),
-                    'ddl_content': ddl_content  # 保存内容供后续工具使用
-                },
-                metadata={'tool': self.tool_name}
-            )
+            # 如果有file_manager,则写入文件(正常的data_pipeline流程)
+            if context.file_manager:
+                # 确定文件名和路径
+                filename = context.file_manager.get_safe_filename(
+                    table_metadata.schema_name,
+                    table_metadata.table_name,
+                    SCHEMA_TOOLS_CONFIG["ddl_file_suffix"]
+                )
+                
+                # 确定子目录
+                subdirectory = "ddl" if SCHEMA_TOOLS_CONFIG["create_subdirectories"] else None
+                filepath = context.file_manager.get_full_path(filename, subdirectory)
+                
+                # 写入文件
+                with open(filepath, 'w', encoding='utf-8') as f:
+                    f.write(ddl_content)
+                
+                self.logger.info(f"DDL文件已生成: {filepath}")
+                
+                return ProcessingResult(
+                    success=True,
+                    data={
+                        'filename': filename,
+                        'filepath': filepath,
+                        'content_length': len(ddl_content),
+                        'ddl_content': ddl_content  # 保存内容供后续工具使用
+                    },
+                    metadata={'tool': self.tool_name}
+                )
+            else:
+                # 如果没有file_manager,只返回DDL内容(API调用场景)
+                self.logger.info("DDL内容已生成(API调用模式,不写入文件)")
+                
+                return ProcessingResult(
+                    success=True,
+                    data={
+                        'filename': f"{table_metadata.schema_name}_{table_metadata.table_name}.ddl",
+                        'filepath': None,  # 不写入文件
+                        'content_length': len(ddl_content),
+                        'ddl_content': ddl_content  # 保存内容供后续工具使用
+                    },
+                    metadata={'tool': self.tool_name}
+                )
             
         except Exception as e:
             self.logger.exception(f"DDL生成失败")

+ 3 - 3
schema_tools/tools/doc_generator.py → data_pipeline/tools/doc_generator.py

@@ -1,8 +1,8 @@
 import os
 from typing import List, Dict, Any
-from schema_tools.tools.base import BaseTool, ToolRegistry
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.tools.base import BaseTool, ToolRegistry
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
 
 @ToolRegistry.register("doc_generator")
 class DocGeneratorTool(BaseTool):

+ 1 - 0
data_pipeline/trainer/__init__.py

@@ -0,0 +1 @@
+# Trainer module for Vanna training

+ 57 - 26
training/run_training.py → data_pipeline/trainer/run_training.py

@@ -11,7 +11,7 @@ from pathlib import Path
 from sqlalchemy import create_engine
 
 
-from vanna_trainer import (
+from .vanna_trainer import (
     train_ddl,
     train_documentation,
     train_sql_example,
@@ -308,19 +308,39 @@ def train_json_question_sql_pairs(json_file):
     except Exception as e:
         print(f" 错误:处理JSON问答训练 - {e}")
 
-def process_training_files(data_path):
+def process_training_files(data_path, task_id=None):
     """处理指定路径下的所有训练文件
     
     Args:
         data_path (str): 训练数据目录路径
+        task_id (str): 任务ID,用于日志记录
     """
-    print(f"\n===== 扫描训练数据目录: {os.path.abspath(data_path)} =====")
+    # 初始化日志
+    if task_id:
+        from data_pipeline.dp_logging import get_logger
+        logger = get_logger("TrainingDataLoader", task_id)
+        logger.info(f"扫描训练数据目录: {os.path.abspath(data_path)}")
+    else:
+        # 兼容原有调用方式
+        print(f"\n===== 扫描训练数据目录: {os.path.abspath(data_path)} =====")
+        logger = None
     
     # 检查目录是否存在
     if not os.path.exists(data_path):
-        print(f"错误: 训练数据目录不存在: {data_path}")
+        error_msg = f"错误: 训练数据目录不存在: {data_path}"
+        if logger:
+            logger.error(error_msg)
+        else:
+            print(error_msg)
         return False
     
+    # 日志输出辅助函数
+    def log_message(message, level="info"):
+        if logger:
+            getattr(logger, level)(message)
+        else:
+            print(message)
+    
     # 初始化统计计数器
     stats = {
         "ddl": 0,
@@ -338,7 +358,7 @@ def process_training_files(data_path):
             
             # 只处理文件,跳过目录
             if not os.path.isfile(item_path):
-                print(f"跳过子目录: {item}")
+                log_message(f"跳过子目录: {item}")
                 continue
                 
             file_lower = item.lower()
@@ -346,49 +366,49 @@ def process_training_files(data_path):
             # 根据文件类型调用相应的处理函数
             try:
                 if file_lower.endswith(".ddl"):
-                    print(f"\n处理DDL文件: {item_path}")
+                    log_message(f"处理DDL文件: {item_path}")
                     train_ddl_statements(item_path)
                     stats["ddl"] += 1
                     
                 elif file_lower.endswith(".md") or file_lower.endswith(".markdown"):
-                    print(f"\n处理文档文件: {item_path}")
+                    log_message(f"处理文档文件: {item_path}")
                     train_documentation_blocks(item_path)
                     stats["documentation"] += 1
                     
                 elif file_lower.endswith("_pair.json") or file_lower.endswith("_pairs.json"):
-                    print(f"\n处理JSON问答对文件: {item_path}")
+                    log_message(f"处理JSON问答对文件: {item_path}")
                     train_json_question_sql_pairs(item_path)
                     stats["question_sql_json"] += 1
                     
                 elif file_lower.endswith("_pair.sql") or file_lower.endswith("_pairs.sql"):
-                    print(f"\n处理格式化问答对文件: {item_path}")
+                    log_message(f"处理格式化问答对文件: {item_path}")
                     train_formatted_question_sql_pairs(item_path)
                     stats["question_sql_formatted"] += 1
                     
                 elif file_lower.endswith(".sql") and not (file_lower.endswith("_pair.sql") or file_lower.endswith("_pairs.sql")):
-                    print(f"\n处理SQL示例文件: {item_path}")
+                    log_message(f"处理SQL示例文件: {item_path}")
                     train_sql_examples(item_path)
                     stats["sql_example"] += 1
                 else:
-                    print(f"跳过不支持的文件类型: {item}")
+                    log_message(f"跳过不支持的文件类型: {item}")
             except Exception as e:
-                print(f"处理文件 {item_path} 时出错: {e}")
+                log_message(f"处理文件 {item_path} 时出错: {e}", "error")
                 
     except OSError as e:
-        print(f"读取目录失败: {e}")
+        log_message(f"读取目录失败: {e}", "error")
         return False
     
     # 打印处理统计
-    print("\n===== 训练文件处理统计 =====")
-    print(f"DDL文件: {stats['ddl']}个")
-    print(f"文档文件: {stats['documentation']}个")
-    print(f"SQL示例文件: {stats['sql_example']}个")
-    print(f"格式化问答对文件: {stats['question_sql_formatted']}个")
-    print(f"JSON问答对文件: {stats['question_sql_json']}个")
+    log_message("训练文件处理统计:")
+    log_message(f"DDL文件: {stats['ddl']}个")
+    log_message(f"文档文件: {stats['documentation']}个")
+    log_message(f"SQL示例文件: {stats['sql_example']}个")
+    log_message(f"格式化问答对文件: {stats['question_sql_formatted']}个")
+    log_message(f"JSON问答对文件: {stats['question_sql_json']}个")
     
     total_files = sum(stats.values())
     if total_files == 0:
-        print(f"警告: 在目录 {data_path} 中未找到任何可训练的文件")
+        log_message(f"警告: 在目录 {data_path} 中未找到任何可训练的文件", "warning")
         return False
         
     return True
@@ -467,7 +487,13 @@ def main():
     # 获取默认路径并进行智能处理
     def resolve_training_data_path():
         """智能解析训练数据路径"""
-        config_path = getattr(app_config, 'TRAINING_DATA_PATH', './training/data')
+        # 使用data_pipeline统一配置
+        try:
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+            config_path = SCHEMA_TOOLS_CONFIG.get("output_directory", './data_pipeline/training_data/')
+        except ImportError:
+            # 如果无法导入data_pipeline配置,使用默认路径
+            config_path = './data_pipeline/training_data/'
         
         # 如果是绝对路径,直接返回
         if os.path.isabs(config_path):
@@ -475,17 +501,17 @@ def main():
         
         # 如果以 . 开头,相对于项目根目录解析
         if config_path.startswith('./') or config_path.startswith('../'):
-            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+            project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
             return os.path.join(project_root, config_path)
         
         # 其他情况,相对于项目根目录
-        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
         return os.path.join(project_root, config_path)
     
     default_path = resolve_training_data_path()
     
     parser.add_argument('--data_path', type=str, default=default_path,
-                        help='训练数据目录路径 (默认: 从app_config.TRAINING_DATA_PATH)')
+                        help='训练数据目录路径 (默认: 从data_pipeline.config.SCHEMA_TOOLS_CONFIG)')
     args = parser.parse_args()
     
     # 使用Path对象处理路径以确保跨平台兼容性
@@ -493,12 +519,17 @@ def main():
     
     # 显示路径解析结果
     print(f"\n===== 训练数据路径配置 =====")
-    print(f"配置文件中的路径: {getattr(app_config, 'TRAINING_DATA_PATH', '未配置')}")
+    try:
+        from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+        config_value = SCHEMA_TOOLS_CONFIG.get("output_directory", "未配置")
+        print(f"data_pipeline配置路径: {config_value}")
+    except ImportError:
+        print(f"data_pipeline配置: 无法导入")
     print(f"解析后的绝对路径: {os.path.abspath(data_path)}")
     print("==============================")
     
     # 设置正确的项目根目录路径
-    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
     # 检查嵌入模型连接
     check_embedding_model_connection()
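
process_training_files() now takes an optional task_id so its output goes to the task's own log, and step 4 of the orchestrator calls it directly. A minimal sketch of loading an already-generated task directory from a script, reusing the same flush/shutdown calls the orchestrator uses; the task id and directory are placeholders.

from data_pipeline.trainer.run_training import process_training_files
from data_pipeline.trainer.vanna_trainer import flush_training, shutdown_trainer

task_id = "task_20250701_131627"                       # placeholder task id
data_dir = f"./data_pipeline/training_data/{task_id}"

ok = process_training_files(data_dir, task_id)         # scans *.ddl, *.md, *_pair.json, *.sql
if ok:
    flush_training()      # push any remaining batched training items
    shutdown_trainer()    # stop the batch processor's worker threads
else:
    print(f"no trainable files found in {data_dir}")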

+ 37 - 33
training/vanna_trainer.py → data_pipeline/trainer/vanna_trainer.py

@@ -9,11 +9,15 @@ from collections import defaultdict
 from typing import List, Dict, Any, Tuple, Optional, Union, Callable
 import sys
 import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
 import app_config
+import logging
+
+# 初始化日志
+logger = logging.getLogger("VannaTrainer")
 
 # 设置正确的项目根目录路径
-project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 # 创建vanna实例
 from core.vanna_llm_factory import create_vanna_instance
@@ -27,20 +31,20 @@ try:
     embedding_config = get_current_embedding_config()
     model_info = get_current_model_info()
     
-    print(f"\n===== Embedding模型信息 =====")
-    print(f"模型类型: {model_info['embedding_type']}")
-    print(f"模型名称: {model_info['embedding_model']}")
-    print(f"向量维度: {embedding_config.get('embedding_dimension', '未知')}")
+    logger.info("===== Embedding模型信息 =====")
+    logger.info(f"模型类型: {model_info['embedding_type']}")
+    logger.info(f"模型名称: {model_info['embedding_model']}")
+    logger.info(f"向量维度: {embedding_config.get('embedding_dimension', '未知')}")
     if 'base_url' in embedding_config:
-        print(f"API服务: {embedding_config['base_url']}")
-    print("==============================")
+        logger.info(f"API服务: {embedding_config['base_url']}")
+    logger.info("==============================")
 except ImportError as e:
-    print(f"警告: 无法导入配置工具函数: {e}")
-    print("使用默认配置...")
+    logger.warning(f"无法导入配置工具函数: {e}")
+    logger.info("使用默认配置...")
     embedding_config = getattr(app_config, 'API_EMBEDDING_CONFIG', {})
-    print(f"\n===== Embedding模型信息 (默认) =====")
-    print(f"模型名称: {embedding_config.get('model_name', '未知')}")
-    print("==============================")
+    logger.info("===== Embedding模型信息 (默认) =====")
+    logger.info(f"模型名称: {embedding_config.get('model_name', '未知')}")
+    logger.info("==============================")
 
 # 从app_config获取训练批处理配置
 BATCH_PROCESSING_ENABLED = app_config.TRAINING_BATCH_PROCESSING_ENABLED
@@ -63,7 +67,7 @@ class BatchProcessor:
         # 是否启用批处理
         self.batch_enabled = BATCH_PROCESSING_ENABLED       
 
-        print(f"[DEBUG] 训练批处理器初始化: 启用={self.batch_enabled}, 批大小={self.batch_size}, 最大工作线程={self.max_workers}")
+        logger.debug(f"训练批处理器初始化: 启用={self.batch_enabled}, 批大小={self.batch_size}, 最大工作线程={self.max_workers}")
     
     def add_item(self, batch_type: str, item: Dict[str, Any]):
         """添加一个项目到批处理队列"""
@@ -91,14 +95,14 @@ class BatchProcessor:
             elif batch_type == 'question_sql':
                 vn.train(question=item['question'], sql=item['sql'])
             
-            print(f"[DEBUG] 单项处理成功: {batch_type}")
+            logger.debug(f"单项处理成功: {batch_type}")
                 
         except Exception as e:
-            print(f"[ERROR] 处理 {batch_type} 项目失败: {e}")
+            logger.error(f"处理 {batch_type} 项目失败: {e}")
     
     def _process_batch(self, batch_type: str, items: List[Dict[str, Any]]):
         """处理一批项目"""
-        print(f"[INFO] 开始批量处理 {len(items)} 个 {batch_type} 项")
+        logger.info(f"开始批量处理 {len(items)} 个 {batch_type} 项")
         start_time = time.time()
         
         try:
@@ -131,46 +135,46 @@ class BatchProcessor:
             if hasattr(vn, 'add_batch') and callable(getattr(vn, 'add_batch')):
                 success = vn.add_batch(batch_data)
                 if success:
-                    print(f"[INFO] 批量处理成功: {len(items)} 个 {batch_type} 项")
+                    logger.info(f"批量处理成功: {len(items)} 个 {batch_type} 项")
                 else:
-                    print(f"[WARNING] 批量处理部分失败: {batch_type}")
+                    logger.warning(f"批量处理部分失败: {batch_type}")
             else:
                 # 如果没有批处理方法,退回到逐条处理
-                print(f"[WARNING] 批处理不可用,使用逐条处理: {batch_type}")
+                logger.warning(f"批处理不可用,使用逐条处理: {batch_type}")
                 for item in items:
                     self._process_single_item(batch_type, item)
                 
         except Exception as e:
-            print(f"[ERROR] 批处理 {batch_type} 失败: {e}")
+            logger.error(f"批处理 {batch_type} 失败: {e}")
             # 如果批处理失败,尝试逐条处理
-            print(f"[INFO] 尝试逐条处理...")
+            logger.info(f"尝试逐条处理...")
             for item in items:
                 try:
                     self._process_single_item(batch_type, item)
                 except Exception as item_e:
-                    print(f"[ERROR] 处理项目失败: {item_e}")
+                    logger.error(f"处理项目失败: {item_e}")
         
         elapsed = time.time() - start_time
-        print(f"[INFO] 批处理完成 {len(items)} 个 {batch_type} 项,耗时 {elapsed:.2f} 秒")
+        logger.info(f"批处理完成 {len(items)} 个 {batch_type} 项,耗时 {elapsed:.2f} 秒")
     
     def flush_all(self):
         """强制处理所有剩余项目"""
         with self.lock:
             for batch_type, items in self.batches.items():
                 if items:
-                    print(f"[INFO] 正在处理剩余的 {len(items)} 个 {batch_type} 项")
+                    logger.info(f"正在处理剩余的 {len(items)} 个 {batch_type} 项")
                     self._process_batch(batch_type, items)
             
             # 清空队列
             self.batches = defaultdict(list)
         
-        print("[INFO] 所有训练批处理项目已完成")
+        logger.info("所有训练批处理项目已完成")
     
     def shutdown(self):
         """关闭处理器和线程池"""
         self.flush_all()
         self.executor.shutdown(wait=True)
-        print("[INFO] 训练批处理器已关闭")
+        logger.info("训练批处理器已关闭")
 
 # 创建全局训练批处理器实例
 # 用于所有训练函数的批处理优化
@@ -178,16 +182,16 @@ batch_processor = BatchProcessor()
 
 # 原始训练函数的批处理增强版本
 def train_ddl(ddl_sql: str):
-    print(f"[DDL] Training on DDL:\n{ddl_sql}")
+    logger.debug(f"Training on DDL:\n{ddl_sql}")
     batch_processor.add_item('ddl', {'ddl': ddl_sql})
 
 def train_documentation(doc: str):
-    print(f"[DOC] Training on documentation:\n{doc}")
+    logger.debug(f"Training on documentation:\n{doc}")
     batch_processor.add_item('documentation', {'documentation': doc})
 
 def train_sql_example(sql: str):
     """训练单个SQL示例,通过SQL生成相应的问题"""
-    print(f"[SQL] Training on SQL:\n{sql}")
+    logger.debug(f"Training on SQL:\n{sql}")
     
     try:
         # 直接调用generate_question方法
@@ -198,15 +202,15 @@ def train_sql_example(sql: str):
             question += "?"
             
     except Exception as e:
-        print(f"[ERROR] 生成问题时出错: {e}")
+        logger.error(f"生成问题时出错: {e}")
         raise Exception(f"无法为SQL生成问题: {e}")
         
-    print(f"[SQL] 生成问题: {question}")
+    logger.debug(f"生成问题: {question}")
     # 使用标准方式存储问题-SQL对
     batch_processor.add_item('question_sql', {'question': question, 'sql': sql})
 
 def train_question_sql_pair(question: str, sql: str):
-    print(f"[Q-S] Training on:\nquestion: {question}\nsql: {sql}")
+    logger.debug(f"Training on question-sql pair:\nquestion: {question}\nsql: {sql}")
     batch_processor.add_item('question_sql', {'question': question, 'sql': sql})
 
 # 完成训练后刷新所有待处理项

+ 31 - 0
data_pipeline/training_data/task_20250701_131627/bss_business_day_data.ddl

@@ -0,0 +1,31 @@
+-- 中文名: 业务支撑系统每日营业数据表
+-- 描述: 业务支撑系统每日营业数据表,记录各服务区运营统计信息,包含统计日期、服务区编码及版本控制字段。
+create table public.bss_business_day_data (
+  id varchar(32) not null     -- 主键标识符,主键,
+  version integer not null    -- 数据版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人账号,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 最后更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除操作人,
+  oper_date date              -- 统计日期,
+  service_no varchar(255)     -- 服务区编码,
+  service_name varchar(255)   -- 服务区名称,
+  branch_no varchar(255)      -- 档口编码,
+  branch_name varchar(255)    -- 档口名称,
+  wx numeric(19,4)            -- 微信支付金额,
+  wx_order integer            -- 微信订单数量,
+  zfb numeric(19,4)           -- 支付宝支付金额,
+  zf_order integer            -- 支付宝订单数,
+  rmb numeric(19,4)           -- 现金支付金额,
+  rmb_order integer           -- 现金订单数量,
+  xs numeric(19,4)            -- 行吧支付金额,
+  xs_order integer            -- 行吧订单数量,
+  jd numeric(19,4)            -- 金豆支付金额,
+  jd_order integer            -- 金豆订单数量,
+  order_sum integer           -- 订单总数,
+  pay_sum numeric(19,4)       -- 支付总金额,
+  source_type integer         -- 数据来源类别,
+  primary key (id)
+);

+ 32 - 0
data_pipeline/training_data/task_20250701_131627/bss_business_day_data_detail.md

@@ -0,0 +1,32 @@
+## bss_business_day_data(业务支撑系统每日营业数据表)
+bss_business_day_data 表业务支撑系统每日营业数据表,记录各服务区运营统计信息,包含统计日期、服务区编码及版本控制字段。
+字段列表:
+- id (varchar(32)) - 主键标识符 [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
+- version (integer) - 数据版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- created_by (varchar(50)) - 创建人账号 [示例: xingba]
+- update_ts (timestamp) - 更新时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- updated_by (varchar(50)) - 最后更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除操作人
+- oper_date (date) - 统计日期 [示例: 2023-04-01]
+- service_no (varchar(255)) - 服务区编码 [示例: 1028, H0501]
+- service_name (varchar(255)) - 服务区名称 [示例: 宜春服务区, 庐山服务区]
+- branch_no (varchar(255)) - 档口编码 [示例: 1, H05016]
+- branch_name (varchar(255)) - 档口名称 [示例: 宜春南区, 庐山鲜徕客东区]
+- wx (numeric(19,4)) - 微信支付金额 [示例: 4790.0000, 2523.0000]
+- wx_order (integer) - 微信订单数量 [示例: 253, 133]
+- zfb (numeric(19,4)) - 支付宝支付金额 [示例: 229.0000, 0.0000]
+- zf_order (integer) - 支付宝订单数 [示例: 15, 0]
+- rmb (numeric(19,4)) - 现金支付金额 [示例: 1058.5000, 124.0000]
+- rmb_order (integer) - 现金订单数量 [示例: 56, 12]
+- xs (numeric(19,4)) - 行吧支付金额 [示例: 0.0000, 40.0000]
+- xs_order (integer) - 行吧订单数量 [示例: 0, 1]
+- jd (numeric(19,4)) - 金豆支付金额 [示例: 0.0000]
+- jd_order (integer) - 金豆订单数量 [示例: 0]
+- order_sum (integer) - 订单总数 [示例: 324, 146]
+- pay_sum (numeric(19,4)) - 支付总金额 [示例: 6077.5000, 2687.0000]
+- source_type (integer) - 数据来源类别 [示例: 1, 0, 4]
+字段补充说明:
+- id 为主键
+- source_type 为枚举字段,包含取值:0、4、1、2、3

+ 17 - 0
data_pipeline/training_data/task_20250701_131627/bss_car_day_count.ddl

@@ -0,0 +1,17 @@
+-- 中文名: 服务区车辆日统计表
+-- 描述: 服务区车辆日统计表,记录各类型车辆日通行量及操作信息,用于交通流量分析和运营管理。
+create table public.bss_car_day_count (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  customer_count bigint       -- 车辆数量,
+  car_type varchar(100)       -- 车辆类别,
+  count_date date             -- 统计日期,
+  service_area_id varchar(32) -- 服务区ID,
+  primary key (id)
+);

+ 18 - 0
data_pipeline/training_data/task_20250701_131627/bss_car_day_count_detail.md

@@ -0,0 +1,18 @@
+## bss_car_day_count(服务区车辆日统计表)
+bss_car_day_count 表服务区车辆日统计表,记录各类型车辆日通行量及操作信息,用于交通流量分析和运营管理。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- created_by (varchar(50)) - 创建人
+- update_ts (timestamp) - 更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- customer_count (bigint) - 车辆数量 [示例: 1114, 295]
+- car_type (varchar(100)) - 车辆类别 [示例: 其他]
+- count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
+- service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
+字段补充说明:
+- id 为主键
+- car_type 为枚举字段,包含取值:其他、危化品、城际、过境

+ 15 - 0
data_pipeline/training_data/task_20250701_131627/bss_company.ddl

@@ -0,0 +1,15 @@
+-- 中文名: 存储高速公路服务区合作公司基础信息(含公司名称及唯一编码)
+-- 描述: 存储高速公路服务区合作公司基础信息(含公司名称及唯一编码),用于业务支撑系统中企业信息管理与业务关联支撑。
+create table public.bss_company (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  company_name varchar(255)   -- 分公司名称,
+  company_no varchar(255)     -- 公司编码,
+  primary key (id)
+);

+ 15 - 0
data_pipeline/training_data/task_20250701_131627/bss_company_detail.md

@@ -0,0 +1,15 @@
+## bss_company(存储高速公路服务区合作公司基础信息(含公司名称及唯一编码))
+bss_company 表存储高速公路服务区合作公司基础信息(含公司名称及唯一编码),用于业务支撑系统中企业信息管理与业务关联支撑。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 30675d85ba5044c31acfa243b9d16334, 47ed0bb37f5a85f3d9245e4854959b81]
+- version (integer) - 版本号 [非空] [示例: 1, 2]
+- create_ts (timestamp) - 创建时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
+- updated_by (varchar(50)) - 更新人 [示例: admin]
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- company_name (varchar(255)) - 分公司名称 [示例: 上饶分公司, 宜春分公司]
+- company_no (varchar(255)) - 公司编码 [示例: H03, H02]
+字段补充说明:
+- id 为主键

+ 16 - 0
data_pipeline/training_data/task_20250701_131627/bss_section_route.ddl

@@ -0,0 +1,16 @@
+-- 中文名: 存储高速公路路段与路线信息
+-- 描述: 存储高速公路路段与路线信息,支持服务区路线关联管理。
+create table public.bss_section_route (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  section_name varchar(255)   -- 路段名称,
+  route_name varchar(255)     -- 路线名称,
+  code varchar(255)           -- 路段编号,
+  primary key (id)
+);

+ 7 - 0
data_pipeline/training_data/task_20250701_131627/bss_section_route_area_link.ddl

@@ -0,0 +1,7 @@
+-- 中文名: 路段路线与服务区关联表
+-- 描述: 路段路线与服务区关联表,维护路线与服务区之间的归属关系。
+create table public.bss_section_route_area_link (
+  section_route_id varchar(32) not null -- 路段路线ID,主键,
+  service_area_id varchar(32) not null -- 服务区ID,主键,
+  primary key (section_route_id, service_area_id)
+);

+ 7 - 0
data_pipeline/training_data/task_20250701_131627/bss_section_route_area_link_detail.md

@@ -0,0 +1,7 @@
+## bss_section_route_area_link(路段路线与服务区关联表)
+bss_section_route_area_link 表路段路线与服务区关联表,维护路线与服务区之间的归属关系。
+字段列表:
+- section_route_id (varchar(32)) - 路段路线ID [主键, 非空] [示例: v8elrsfs5f7lt7jl8a6p87smfzesn3rz, hxzi2iim238e3s1eajjt1enmh9o4h3wp]
+- service_area_id (varchar(32)) - 服务区ID [主键, 非空] [示例: 08e01d7402abd1d6a4d9fdd5df855ef8, 091662311d2c737029445442ff198c4c]
+字段补充说明:
+- 复合主键:section_route_id, service_area_id

+ 16 - 0
data_pipeline/training_data/task_20250701_131627/bss_section_route_detail.md

@@ -0,0 +1,16 @@
+## bss_section_route(存储高速公路路段与路线信息)
+bss_section_route 表存储高速公路路段与路线信息,支持服务区路线关联管理。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 04ri3j67a806uw2c6o6dwdtz4knexczh, 0g5mnefxxtukql2cq6acul7phgskowy7]
+- version (integer) - 版本号 [非空] [示例: 1, 0]
+- create_ts (timestamp) - 创建时间 [示例: 2021-10-29 19:43:50, 2022-03-04 16:07:16]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- section_name (varchar(255)) - 路段名称 [示例: 昌栗, 昌宁]
+- route_name (varchar(255)) - 路线名称 [示例: 昌栗, 昌韶]
+- code (varchar(255)) - 路段编号 [示例: SR0001, SR0002]
+字段补充说明:
+- id 为主键

+ 19 - 0
data_pipeline/training_data/task_20250701_131627/bss_service_area.ddl

@@ -0,0 +1,19 @@
+-- 中文名: 存储高速公路服务区基础信息及版本变更记录
+-- 描述: 存储高速公路服务区基础信息及版本变更记录,支持服务区全生命周期管理。
+create table public.bss_service_area (
+  id varchar(32) not null     -- 主键标识符,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  service_area_name varchar(255) -- 服务区名称,
+  service_area_no varchar(255) -- 服务区编码,
+  company_id varchar(32)      -- 所属公司ID,
+  service_position varchar(255) -- 地理坐标,
+  service_area_type varchar(50) -- 服务区类型,
+  service_state varchar(50)   -- 运营状态,
+  primary key (id)
+);

+ 21 - 0
data_pipeline/training_data/task_20250701_131627/bss_service_area_detail.md

@@ -0,0 +1,21 @@
+## bss_service_area(存储高速公路服务区基础信息及版本变更记录)
+bss_service_area 表存储高速公路服务区基础信息及版本变更记录,支持服务区全生命周期管理。
+字段列表:
+- id (varchar(32)) - 主键标识符 [主键, 非空] [示例: 0271d68ef93de9684b7ad8c7aae600b6, 08e01d7402abd1d6a4d9fdd5df855ef8]
+- version (integer) - 版本号 [非空] [示例: 3, 6]
+- create_ts (timestamp) - 创建时间 [示例: 2021-05-21 13:26:40.589000, 2021-05-20 19:51:46.314000]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2021-07-10 15:41:28.795000, 2021-07-11 09:33:08.455000]
+- updated_by (varchar(50)) - 更新人 [示例: admin]
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人 [示例: ]
+- service_area_name (varchar(255)) - 服务区名称 [示例: 白鹭湖停车区, 南昌南服务区]
+- service_area_no (varchar(255)) - 服务区编码 [示例: H0814, H0105]
+- company_id (varchar(32)) - 所属公司ID [示例: b1629f07c8d9ac81494fbc1de61f1ea5, ee9bf1180a2b45003f96e597a4b7f15a]
+- service_position (varchar(255)) - 地理坐标 [示例: 114.574721,26.825584, 115.910549,28.396355]
+- service_area_type (varchar(50)) - 服务区类型 [示例: 信息化服务区]
+- service_state (varchar(50)) - 运营状态 [示例: 开放, 关闭]
+字段补充说明:
+- id 为主键
+- service_area_type 为枚举字段,包含取值:信息化服务区、智能化服务区
+- service_state 为枚举字段,包含取值:开放、关闭、上传数据

+ 18 - 0
data_pipeline/training_data/task_20250701_131627/bss_service_area_mapper.ddl

@@ -0,0 +1,18 @@
+-- 中文名: BSS服务区基础信息映射表
+-- 描述: BSS服务区基础信息映射表,记录服务区名称、编码及全生命周期操作日志
+create table public.bss_service_area_mapper (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  service_name varchar(255)   -- 服务区名称,
+  service_no varchar(255)     -- 服务区编码,
+  service_area_id varchar(32) -- 服务区ID,
+  source_system_type varchar(50) -- 数据来源系统类型,
+  source_type integer         -- 数据来源类别ID,
+  primary key (id)
+);

+ 19 - 0
data_pipeline/training_data/task_20250701_131627/bss_service_area_mapper_detail.md

@@ -0,0 +1,19 @@
+## bss_service_area_mapper(BSS服务区基础信息映射表)
+bss_service_area_mapper 表BSS服务区基础信息映射表,记录服务区名称、编码及全生命周期操作日志
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00e1e893909211ed8ee6fa163eaf653f, 013867f5962211ed8ee6fa163eaf653f]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-01-10 10:54:03, 2023-01-17 12:47:29]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2023-01-10 10:54:07, 2023-01-17 12:47:32]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- service_name (varchar(255)) - 服务区名称 [示例: 信丰西服务区, 南康北服务区]
+- service_no (varchar(255)) - 服务区编码 [示例: 1067, 1062]
+- service_area_id (varchar(32)) - 服务区ID [示例: 97cd6cd516a551409a4d453a58f9e170, fdbdd042962011ed8ee6fa163eaf653f]
+- source_system_type (varchar(50)) - 数据来源系统类型 [示例: 驿美, 驿购]
+- source_type (integer) - 数据来源类别ID [示例: 3, 1]
+字段补充说明:
+- id 为主键
+- source_system_type 为枚举字段,包含取值:司乘管理、商业管理、驿购、驿美、手工录入

+ 10 - 0
data_pipeline/training_data/task_20250701_131627/db_query_decision_prompt.txt

@@ -0,0 +1,10 @@
+=== 数据库业务范围 ===
+当前数据库存储的是高速公路服务区运营管理的相关数据,主要涉及服务区运营统计、车辆通行量、基础信息管理及路段关联,包含以下业务数据:
+核心业务实体:
+- 服务区:描述高速公路服务区基础信息,主要字段:服务区名称、服务区编码、地理坐标、服务区类型、运营状态
+- 车辆类型:描述通行车辆分类维度,主要字段:车辆类别(其他、危化品、城际、过境)
+- 路段路线:描述高速公路路段与路线归属关系,主要字段:路段名称、路线名称、路段编号
+- 合作公司:描述服务区所属分公司信息,主要字段:分公司名称、公司编码
+关键业务指标:
+- 营收指标:包含微信/支付宝/现金/行吧/金豆支付金额及订单数、支付总金额、订单总数
+- 车辆流量:按类型统计的日通行车辆数量

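A short sketch of how a generated decision-prompt file like the one above might be read and prepended to a classification prompt at runtime (the path comes from this task directory; the loader function itself is hypothetical, not the repository's actual implementation):

    from pathlib import Path

    def load_db_query_decision_prompt(task_dir: str) -> str:
        """Read the generated business-scope prompt for the given task directory."""
        prompt_path = Path(task_dir) / "db_query_decision_prompt.txt"
        return prompt_path.read_text(encoding="utf-8")

    scope_text = load_db_query_decision_prompt(
        "data_pipeline/training_data/task_20250701_131627"
    )
    classification_prompt = (
        scope_text
        + "\n请判断下面的问题是否需要查询数据库,只回答 yes 或 no:\n"
        + "各服务区最近一周的总营收是多少?"
    )
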
+ 10 - 0
data_pipeline/training_data/task_20250701_131627/filename_mapping.txt

@@ -0,0 +1,10 @@
+# 文件名映射报告
+# 格式: 原始表名 -> 实际文件名
+
+public.bss_business_day_data -> bss_business_day_data_detail.md
+public.bss_car_day_count -> bss_car_day_count_detail.md
+public.bss_company -> bss_company_detail.md
+public.bss_section_route -> bss_section_route_detail.md
+public.bss_section_route_area_link -> bss_section_route_area_link_detail.md
+public.bss_service_area -> bss_service_area_detail.md
+public.bss_service_area_mapper -> bss_service_area_mapper_detail.md

+ 62 - 0
data_pipeline/training_data/task_20250701_131627/metadata.txt

@@ -0,0 +1,62 @@
+-- Schema Tools生成的主题元数据
+-- 业务背景: 高速公路服务区管理系统
+-- 生成时间: 2025-07-01 13:47:36
+-- 数据库: highway_db
+
+-- 创建表(如果不存在)
+CREATE TABLE IF NOT EXISTS metadata (
+    id SERIAL PRIMARY KEY,    -- 主键
+    topic_name VARCHAR(100) NOT NULL,  -- 业务主题名称
+    description TEXT,                  -- 业务主题说明
+    related_tables TEXT[],			  -- 相关表名
+    biz_entities TEXT[],               -- 主要业务实体名称
+    biz_metrics TEXT[],                -- 主要业务指标名称
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP    -- 插入时间
+);
+
+-- 插入主题数据
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '日营收结构',
+  '分析各服务区每日营收构成及支付方式占比,优化资金管理策略',
+  ARRAY['bss_business_day_data'],
+  ARRAY['服务区','支付方式','档口'],
+  ARRAY['总营收','现金占比','移动支付比例']
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '车流高峰分析',
+  '通过车辆统计表识别服务区高峰时段及车型分布,指导资源调度',
+  ARRAY['bss_car_day_count','bss_service_area'],
+  ARRAY['服务区','车辆类型','统计日期'],
+  ARRAY['日均车流','高峰时段','危化品车辆占比']
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '分公司对比',
+  '比较不同分公司的服务区运营效率及营收能力,发现管理差异',
+  ARRAY['bss_company','bss_service_area','bss_business_day_data'],
+  ARRAY['分公司','服务区','运营指标'],
+  ARRAY['人均营收','客单价','订单密度']
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '路线关联分析',
+  '研究路段路线与服务区的关联关系,优化路线规划和服务区配置',
+  ARRAY['bss_section_route','bss_section_route_area_link','bss_car_day_count'],
+  ARRAY['路段','路线','服务区'],
+  ARRAY['路线车流','服务区覆盖率','路线营收贡献']
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '节假日效应',
+  '分析节假日前后服务区营收和车流变化,制定营销和服务方案',
+  ARRAY['bss_business_day_data','bss_car_day_count'],
+  ARRAY['服务区','节假日','支付方式'],
+  ARRAY['节前增幅','节假日营收占比','车流增长率']
+);
+

+ 20 - 0
data_pipeline/training_data/task_20250701_131627/metadata_detail.md

@@ -0,0 +1,20 @@
+## metadata(存储分析主题元数据)
+
+`metadata` 主要描述了当前数据库包含了哪些数据内容,哪些分析主题,哪些指标等等。
+
+字段列表:
+
+- `id` (serial) - 主键ID [主键, 非空]
+- `topic_name` (varchar(100)) - 业务主题名称 [非空]
+- `description` (text) - 业务主题说明
+- `related_tables` (text[]) - 涉及的数据表 [示例: bss_business_day_data, bss_section_route_area_link]
+- `biz_entities` (text[]) - 主要业务实体名称 [示例: 车辆类型, 节假日, 路线]
+- `biz_metrics` (text[]) - 主要业务指标名称 [示例: 总营收, 现金占比, 人均营收]
+- `created_at` (timestamp) - 插入时间 [默认值: `CURRENT_TIMESTAMP`]
+
+字段补充说明:
+
+- `id` 为主键,自增;
+- `related_tables` 用于建立主题与具体明细表的依赖关系;
+- `biz_entities` 表示主题关注的核心对象,例如服务区、车辆、公司;
+- `biz_metrics` 表示该主题关注的业务分析指标,例如营收对比、趋势变化、占比结构等。

+ 190 - 0
data_pipeline/training_data/task_20250701_131627/qs_highway_db_20250701_134736_pair.json

@@ -0,0 +1,190 @@
+[
+  {
+    "question": "统计2023年4月1日各服务区的总营收及现金支付金额占比",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, SUM(rmb)/SUM(pay_sum)*100 AS 现金支付占比 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "分析2023年第一季度各支付方式在总营收中的占比变化趋势",
+    "sql": "SELECT oper_date AS 统计日期, SUM(wx)/SUM(pay_sum)*100 AS 微信占比, SUM(zfb)/SUM(pay_sum)*100 AS 支付宝占比, SUM(rmb)/SUM(pay_sum)*100 AS 现金占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "查询最近7天总营收最高的前5个服务区及其移动支付比例",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, (SUM(wx)+SUM(zfb))/SUM(pay_sum)*100 AS 移动支付比例 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND oper_date < CURRENT_DATE AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "对比不同档口的现金支付订单占比并按占比排序",
+    "sql": "SELECT branch_name AS 档口名称, SUM(rmb_order)/SUM(order_sum)*100 AS 现金订单占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 现金订单占比 DESC;"
+  },
+  {
+    "question": "计算宜春服务区2023年各季度月均营收及最大单日营收",
+    "sql": "SELECT EXTRACT(QUARTER FROM oper_date) AS 季度, AVG(pay_sum) AS 月均营收, MAX(pay_sum) AS 最大单日营收 FROM bss_business_day_data WHERE service_name = '宜春服务区' AND EXTRACT(YEAR FROM oper_date) = 2023 AND delete_ts IS NULL GROUP BY 季度 ORDER BY 季度;"
+  },
+  {
+    "question": "统计2023年4月各服务区订单总数及总营收并按营收排名",
+    "sql": "SELECT service_name AS 服务区名称, SUM(order_sum) AS 订单总数, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "查询最近一天移动支付占比超过80%的服务区信息",
+    "sql": "SELECT service_name AS 服务区名称, (wx+zfb)/pay_sum*100 AS 移动支付比例 FROM bss_business_day_data WHERE oper_date = (SELECT MAX(oper_date) FROM bss_business_day_data WHERE delete_ts IS NULL) AND (wx+zfb)/pay_sum > 0.8 AND delete_ts IS NULL ORDER BY 移动支付比例 DESC;"
+  },
+  {
+    "question": "分析庐山服务区2023年各星期的营收分布情况",
+    "sql": "SELECT EXTRACT(ISODOW FROM oper_date) AS 星期, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE service_name = '庐山服务区' AND EXTRACT(YEAR FROM oper_date) = 2023 AND delete_ts IS NULL GROUP BY 星期 ORDER BY 星期;"
+  },
+  {
+    "question": "统计最近一天总营收超过1万元且现金占比低于10%的服务区",
+    "sql": "SELECT service_name AS 服务区名称, pay_sum AS 总营收, rmb/pay_sum*100 AS 现金占比 FROM bss_business_day_data WHERE oper_date = (SELECT MAX(oper_date) FROM bss_business_day_data WHERE delete_ts IS NULL) AND pay_sum > 10000 AND rmb/pay_sum < 0.1 AND delete_ts IS NULL ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "对比宜春和南昌南服务区最近30天各支付方式的平均日营收",
+    "sql": "SELECT service_name AS 服务区名称, AVG(wx) AS 日均微信营收, AVG(zfb) AS 日均支付宝营收, AVG(rmb) AS 日均现金营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 30 AND service_name IN ('宜春服务区','南昌南服务区') AND delete_ts IS NULL GROUP BY service_name ORDER BY 服务区名称;"
+  },
+  {
+    "question": "统计各服务区日均车流量并按车流由高到低排序",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, AVG(cc.customer_count) AS 日均车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 日均车流量 DESC;"
+  },
+  {
+    "question": "查询危化品车辆占比超过5%的服务区信息",
+    "sql": "SELECT sa.service_area_name, ROUND((SUM(CASE WHEN cc.car_type='危化品' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count))::numeric,2) AS 危化品占比 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name HAVING SUM(CASE WHEN cc.car_type='危化品' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count) > 5 ORDER BY 危化品占比 DESC;"
+  },
+  {
+    "question": "分析最近30天各车型日均通行量变化趋势",
+    "sql": "SELECT count_date AS 统计日期, car_type AS 车型, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date >= CURRENT_DATE - 30 AND delete_ts IS NULL GROUP BY count_date, car_type ORDER BY count_date;"
+  },
+  {
+    "question": "对比周末与工作日车流量差异",
+    "sql": "SELECT CASE WHEN EXTRACT(DOW FROM count_date) IN (0,6) THEN '周末' ELSE '工作日' END AS 时段类型, AVG(customer_count) AS 平均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY 时段类型;"
+  },
+  {
+    "question": "获取各服务区过境车辆占比TOP5",
+    "sql": "SELECT sa.service_area_name, ROUND((SUM(CASE WHEN cc.car_type='过境' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count))::numeric,2) AS 过境占比 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 过境占比 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计最近一周每日总车流量及环比增长率",
+    "sql": "WITH daily_total AS (SELECT count_date, SUM(customer_count) AS total FROM bss_car_day_count WHERE count_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY count_date) SELECT count_date, total, LAG(total) OVER(ORDER BY count_date) AS 前一日流量, ROUND(((total - LAG(total) OVER(ORDER BY count_date))*100.0/LAG(total) OVER(ORDER BY count_date))::numeric,2) AS 环比增长率 FROM daily_total;"
+  },
+  {
+    "question": "查询连续3天车流量增长的服务区",
+    "sql": "WITH daily_growth AS (SELECT service_area_id, count_date, SUM(customer_count) AS daily_count, LAG(SUM(customer_count),1) OVER(PARTITION BY service_area_id ORDER BY count_date) AS prev_count FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id, count_date) SELECT sa.service_area_name FROM (SELECT service_area_id FROM daily_growth WHERE daily_count > prev_count GROUP BY service_area_id, count_date - generate_series(0,2)) t JOIN bss_service_area sa ON t.service_area_id = sa.id;"
+  },
+  {
+    "question": "统计各车辆类型在不同时间段的分布比例",
+    "sql": "SELECT car_type AS 车型, EXTRACT(HOUR FROM create_ts)::integer AS 小时段, ROUND(AVG(customer_count)::numeric,0) AS 平均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type, 小时段 ORDER BY 小时段;"
+  },
+  {
+    "question": "获取昨日车流量最高的3个服务区及对应车型分布",
+    "sql": "SELECT sa.service_area_name, cc.car_type, cc.customer_count FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date = CURRENT_DATE - 1 AND sa.delete_ts IS NULL ORDER BY cc.customer_count DESC LIMIT 3;"
+  },
+  {
+    "question": "分析各区域城际车辆通行量与服务区开放状态的关系",
+    "sql": "SELECT sa.service_state AS 开放状态, AVG(CASE WHEN cc.car_type='城际' THEN cc.customer_count ELSE 0 END) AS 平均城际车流量 FROM bss_car_day_count cc RIGHT JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE sa.delete_ts IS NULL GROUP BY sa.service_state;"
+  },
+  {
+    "question": "各分公司2023年4月人均营收TOP5(按支付总额/车流量计算)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.pay_sum)/SUM(car.customer_count) AS 人均营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no JOIN bss_car_day_count car ON sa.id = car.service_area_id AND bd.oper_date = car.count_date WHERE bd.oper_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY c.company_name ORDER BY 人均营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "2023年Q2各分公司客单价对比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, AVG(bd.pay_sum/bd.order_sum) AS 客单价 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date BETWEEN '2023-04-01' AND '2023-06-30' GROUP BY c.company_name ORDER BY 客单价 DESC;"
+  },
+  {
+    "question": "最近一周订单密度(订单数/面积)最低的3个分公司",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.order_sum)/COUNT(DISTINCT sa.id) AS 订单密度 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date >= CURRENT_DATE - 7 GROUP BY c.company_name ORDER BY 订单密度 ASC LIMIT 3;"
+  },
+  {
+    "question": "各分公司2023年节假日营收总额环比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 1 THEN bd.pay_sum ELSE 0 END) AS 一月营收, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 2 THEN bd.pay_sum ELSE 0 END) AS 二月营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(YEAR FROM bd.oper_date) = 2023 GROUP BY c.company_name;"
+  },
+  {
+    "question": "2023-04-01当日各分公司运营指标对比(支付总额、订单数、车流量)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.pay_sum) AS 支付总额, SUM(bd.order_sum) AS 订单总数, SUM(car.customer_count) AS 车流量 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no JOIN bss_car_day_count car ON sa.id = car.service_area_id WHERE bd.oper_date = '2023-04-01' GROUP BY c.company_name ORDER BY 支付总额 DESC;"
+  },
+  {
+    "question": "各分公司微信支付占比分析(近30天)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.wx) / SUM(bd.pay_sum) * 100 AS 微信占比百分比 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date >= CURRENT_DATE - 30 GROUP BY c.company_name ORDER BY 微信占比百分比 DESC;"
+  },
+  {
+    "question": "各分公司服务区数量与营收能力关联分析",
+    "sql": "SELECT c.company_name AS 分公司名称, COUNT(sa.id) AS 服务区数量, SUM(bd.pay_sum) AS 总营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no GROUP BY c.company_name ORDER BY 服务区数量 DESC, 总营收 DESC;"
+  },
+  {
+    "question": "2023年各分公司月均订单密度趋势分析",
+    "sql": "SELECT c.company_name AS 分公司名称, EXTRACT(MONTH FROM bd.oper_date) AS 月份, AVG(bd.order_sum) AS 月均订单密度 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(YEAR FROM bd.oper_date) = 2023 GROUP BY c.company_name, 月份 ORDER BY 分公司名称, 月份;"
+  },
+  {
+    "question": "各分公司不同支付方式订单数占比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.wx_order)/SUM(bd.order_sum)*100 AS 微信占比, SUM(bd.zf_order)/SUM(bd.order_sum)*100 AS 支付宝占比 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no GROUP BY c.company_name ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "2023年Q2各分公司营收增长率分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 4 THEN bd.pay_sum ELSE 0 END) / SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 5 THEN bd.pay_sum ELSE 0 END) - 1 AS 月增长率 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(QUARTER FROM bd.oper_date) = 2 GROUP BY c.company_name ORDER BY 月增长率 DESC;"
+  },
+  {
+    "question": "统计各路线关联的服务区数量及平均车流量,按服务区数量降序排列",
+    "sql": "SELECT r.route_name AS 路线名称, COUNT(l.service_area_id) AS 服务区数量, AVG(c.customer_count) AS 平均车流量 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id LEFT JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE r.delete_ts IS NULL GROUP BY r.route_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "计算2023年Q2各路段日均车流量,筛选出日均车流量>1000的路段",
+    "sql": "SELECT s.section_name AS 路段名称, COUNT(*) AS 天数, AVG(c.customer_count) AS 日均车流量 FROM bss_section_route s JOIN bss_section_route_area_link l ON s.id = l.section_route_id JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE c.count_date BETWEEN '2023-04-01' AND '2023-06-30' AND s.delete_ts IS NULL GROUP BY s.section_name HAVING AVG(c.customer_count) > 1000;"
+  },
+  {
+    "question": "查询2023年车流量TOP5服务区及对应路线信息",
+    "sql": "SELECT a.service_area_name AS 服务区名称, r.route_name AS 路线名称, SUM(c.customer_count) AS 总车流量 FROM bss_service_area a JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_car_day_count c ON a.id = c.service_area_id WHERE c.count_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY a.service_area_name, r.route_name ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计未关联服务区的路段清单及创建时间",
+    "sql": "SELECT r.section_name AS 路段名称, r.create_ts AS 创建时间 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id WHERE l.service_area_id IS NULL AND r.delete_ts IS NULL;"
+  },
+  {
+    "question": "分析春运期间(2023-01-07至2023-02-16)各路线车流变化趋势",
+    "sql": "SELECT r.route_name AS 路线名称, c.count_date AS 日期, SUM(c.customer_count) AS 总车流量 FROM bss_section_route r JOIN bss_section_route_area_link l ON r.id = l.section_route_id JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE c.count_date BETWEEN '2023-01-07' AND '2023-02-16' GROUP BY r.route_name, c.count_date ORDER BY 日期;"
+  },
+  {
+    "question": "计算各服务区车流覆盖率(关联路段车流/总车流)TOP10",
+    "sql": "SELECT a.service_area_name AS 服务区名称, SUM(c.customer_count) AS 关联车流, (SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = a.id) AS 总车流, ROUND((SUM(c.customer_count)/(SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = a.id)) * 100)::numeric(5,2) AS 覆盖率 FROM bss_service_area a JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_car_day_count c ON a.id = c.service_area_id GROUP BY a.id, a.service_area_name ORDER BY 覆盖率 DESC LIMIT 10;"
+  },
+  {
+    "question": "分析不同分公司管辖路段的服务区密度(服务区数/路段长度)",
+    "sql": "SELECT c.company_name AS 分公司名称, COUNT(a.id) AS 服务区数量, SUM(LENGTH(s.code)) AS 路段总长度, ROUND((COUNT(a.id)/SUM(LENGTH(s.code))) * 1000)::numeric(5,2) AS 密度_每千米 FROM bss_company c JOIN bss_service_area a ON c.id = a.company_id JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_section_route s ON l.section_route_id = s.id GROUP BY c.company_name;"
+  },
+  {
+    "question": "分析2023年国庆节期间各服务区营收总额及环比增长率",
+    "sql": "WITH holiday_revenue AS (SELECT service_name, SUM(pay_sum) AS holiday_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_name), pre_holiday_revenue AS (SELECT service_name, SUM(pay_sum) AS pre_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-09-30' AND delete_ts IS NULL GROUP BY service_name) SELECT h.service_name, h.holiday_amount, ROUND((h.holiday_amount - p.pre_amount)/p.pre_amount*100, 2) AS growth_rate FROM holiday_revenue h JOIN pre_holiday_revenue p ON h.service_name = p.service_name ORDER BY growth_rate DESC;"
+  },
+  {
+    "question": "统计2023年春节期间各服务区节假日营收占Q1季度总营收比例",
+    "sql": "WITH q1_revenue AS (SELECT service_name, SUM(pay_sum) AS q1_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY service_name), lunar_revenue AS (SELECT service_name, SUM(pay_sum) AS lunar_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-20' AND '2023-01-27' AND delete_ts IS NULL GROUP BY service_name) SELECT q.service_name, ROUND(l.lunar_amount/q.q1_amount*100, 2) AS ratio FROM q1_revenue q JOIN lunar_revenue l ON q.service_name = l.service_name ORDER BY ratio DESC;"
+  },
+  {
+    "question": "对比2023年国庆节期间不同支付方式金额占比",
+    "sql": "SELECT '微信' AS pay_type, ROUND(SUM(wx)/SUM(pay_sum)*100, 2) AS ratio FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL UNION ALL SELECT '支付宝', ROUND(SUM(zfb)/SUM(pay_sum)*100, 2) FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL UNION ALL SELECT '现金', ROUND(SUM(rmb)/SUM(pay_sum)*100, 2) FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "分析节假日与非节假日各服务区日均车流量增长率",
+    "sql": "WITH holiday_avg AS (SELECT service_area_id, AVG(customer_count) AS holiday_avg FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_area_id), non_holiday_avg AS (SELECT service_area_id, AVG(customer_count) AS non_holiday_avg FROM bss_car_day_count WHERE count_date NOT BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_area_id) SELECT h.service_area_id, ROUND((h.holiday_avg - n.non_holiday_avg)/n.non_holiday_avg*100, 2) AS growth_rate FROM holiday_avg h JOIN non_holiday_avg n ON h.service_area_id = n.service_area_id ORDER BY growth_rate DESC LIMIT 10;"
+  },
+  {
+    "question": "统计节假日车流最高峰时段的车辆类型分布",
+    "sql": "SELECT car_type, SUM(customer_count) AS total_cars FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND EXTRACT(HOUR FROM create_ts) BETWEEN 8 AND 10 AND delete_ts IS NULL GROUP BY car_type ORDER BY total_cars DESC;"
+  },
+  {
+    "question": "对比2023年五一假期与清明假期营收增幅排名TOP5服务区",
+    "sql": "WITH may_revenue AS (SELECT service_name, SUM(pay_sum) AS may_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-29' AND '2023-05-03' AND delete_ts IS NULL GROUP BY service_name), qingming_revenue AS (SELECT service_name, SUM(pay_sum) AS qingming_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-05' AND '2023-04-07' AND delete_ts IS NULL GROUP BY service_name) SELECT m.service_name, ROUND((m.may_amount - q.qingming_amount)/q.qingming_amount*100, 2) AS growth_rate FROM may_revenue m JOIN qingming_revenue q ON m.service_name = q.service_name ORDER BY growth_rate DESC LIMIT 5;"
+  },
+  {
+    "question": "分析节假日现金支付比例变化趋势",
+    "sql": "SELECT oper_date, ROUND(SUM(rmb)/SUM(pay_sum)*100, 2) AS cash_ratio FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-10-07' AND delete_ts IS NULL GROUP BY oper_date ORDER BY oper_date;"
+  },
+  {
+    "question": "统计危化品车辆节假日期间通行量同比增幅",
+    "sql": "WITH holiday_2022 AS (SELECT COUNT(*) AS cnt_2022 FROM bss_car_day_count WHERE count_date BETWEEN '2022-10-01' AND '2022-10-07' AND car_type = '危化品' AND delete_ts IS NULL), holiday_2023 AS (SELECT COUNT(*) AS cnt_2023 FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND car_type = '危化品' AND delete_ts IS NULL) SELECT ROUND((cnt_2023 - cnt_2022)/cnt_2022*100, 2) AS growth_rate FROM holiday_2022, holiday_2023;"
+  },
+  {
+    "question": "查询2023年国庆节期间营收增幅超过50%的服务区清单",
+    "sql": "WITH pre_data AS (SELECT service_name, SUM(pay_sum) AS pre_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-09-30' AND delete_ts IS NULL GROUP BY service_name), holiday_data AS (SELECT service_name, SUM(pay_sum) AS holiday_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_name) SELECT h.service_name, ROUND((h.holiday_amount - p.pre_amount)/p.pre_amount*100, 2) AS growth_rate FROM holiday_data h JOIN pre_data p ON h.service_name = p.service_name WHERE (h.holiday_amount - p.pre_amount)/p.pre_amount > 0.5 ORDER BY growth_rate DESC;"
+  },
+  {
+    "question": "分析节假日期间城际车辆流量与服务区地理位置的关系",
+    "sql": "SELECT s.service_area_name, s.service_position, AVG(c.customer_count) AS avg_traffic FROM bss_car_day_count c JOIN bss_service_area s ON c.service_area_id = s.id WHERE c.car_type = '城际' AND c.count_date BETWEEN '2023-10-01' AND '2023-10-07' AND c.delete_ts IS NULL GROUP BY s.service_area_name, s.service_position ORDER BY avg_traffic DESC;"
+  }
+]

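As an illustration, a question-SQL pair file like the one above could be bulk-loaded with the batch training helpers shown earlier in this diff (the loop below is a sketch, not the pipeline's actual loader):

    import json
    from pathlib import Path

    pairs_path = Path(
        "data_pipeline/training_data/task_20250701_131627/qs_highway_db_20250701_134736_pair.json"
    )
    pairs = json.loads(pairs_path.read_text(encoding="utf-8"))

    for pair in pairs:
        # Each entry carries a natural-language question and its reference SQL.
        train_question_sql_pair(pair["question"], pair["sql"])
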
+ 202 - 0
data_pipeline/training_data/task_20250701_131627/qs_highway_db_20250701_134736_pair.json.backup

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "统计2023年4月1日各服务区的总营收及现金支付金额占比",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, SUM(rmb)/SUM(pay_sum)*100 AS 现金支付占比 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "分析2023年第一季度各支付方式在总营收中的占比变化趋势",
+    "sql": "SELECT oper_date AS 统计日期, SUM(wx)/SUM(pay_sum)*100 AS 微信占比, SUM(zfb)/SUM(pay_sum)*100 AS 支付宝占比, SUM(rmb)/SUM(pay_sum)*100 AS 现金占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "查询最近7天总营收最高的前5个服务区及其移动支付比例",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, (SUM(wx)+SUM(zfb))/SUM(pay_sum)*100 AS 移动支付比例 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND oper_date < CURRENT_DATE AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "对比不同档口的现金支付订单占比并按占比排序",
+    "sql": "SELECT branch_name AS 档口名称, SUM(rmb_order)/SUM(order_sum)*100 AS 现金订单占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 现金订单占比 DESC;"
+  },
+  {
+    "question": "计算宜春服务区2023年各季度月均营收及最大单日营收",
+    "sql": "SELECT EXTRACT(QUARTER FROM oper_date) AS 季度, AVG(pay_sum) AS 月均营收, MAX(pay_sum) AS 最大单日营收 FROM bss_business_day_data WHERE service_name = '宜春服务区' AND EXTRACT(YEAR FROM oper_date) = 2023 AND delete_ts IS NULL GROUP BY 季度 ORDER BY 季度;"
+  },
+  {
+    "question": "统计2023年4月各服务区订单总数及总营收并按营收排名",
+    "sql": "SELECT service_name AS 服务区名称, SUM(order_sum) AS 订单总数, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "查询最近一天移动支付占比超过80%的服务区信息",
+    "sql": "SELECT service_name AS 服务区名称, (wx+zfb)/pay_sum*100 AS 移动支付比例 FROM bss_business_day_data WHERE oper_date = (SELECT MAX(oper_date) FROM bss_business_day_data WHERE delete_ts IS NULL) AND (wx+zfb)/pay_sum > 0.8 AND delete_ts IS NULL ORDER BY 移动支付比例 DESC;"
+  },
+  {
+    "question": "分析庐山服务区2023年各星期的营收分布情况",
+    "sql": "SELECT EXTRACT(ISODOW FROM oper_date) AS 星期, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE service_name = '庐山服务区' AND EXTRACT(YEAR FROM oper_date) = 2023 AND delete_ts IS NULL GROUP BY 星期 ORDER BY 星期;"
+  },
+  {
+    "question": "统计最近一天总营收超过1万元且现金占比低于10%的服务区",
+    "sql": "SELECT service_name AS 服务区名称, pay_sum AS 总营收, rmb/pay_sum*100 AS 现金占比 FROM bss_business_day_data WHERE oper_date = (SELECT MAX(oper_date) FROM bss_business_day_data WHERE delete_ts IS NULL) AND pay_sum > 10000 AND rmb/pay_sum < 0.1 AND delete_ts IS NULL ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "对比宜春和南昌南服务区最近30天各支付方式的平均日营收",
+    "sql": "SELECT service_name AS 服务区名称, AVG(wx) AS 日均微信营收, AVG(zfb) AS 日均支付宝营收, AVG(rmb) AS 日均现金营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 30 AND service_name IN ('宜春服务区','南昌南服务区') AND delete_ts IS NULL GROUP BY service_name ORDER BY 服务区名称;"
+  },
+  {
+    "question": "统计各服务区日均车流量并按车流由高到低排序",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, AVG(cc.customer_count) AS 日均车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 日均车流量 DESC;"
+  },
+  {
+    "question": "查询危化品车辆占比超过5%的服务区信息",
+    "sql": "SELECT sa.service_area_name, ROUND((SUM(CASE WHEN cc.car_type='危化品' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count))::numeric,2) AS 危化品占比 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name HAVING SUM(CASE WHEN cc.car_type='危化品' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count) > 5 ORDER BY 危化品占比 DESC;"
+  },
+  {
+    "question": "分析最近30天各车型日均通行量变化趋势",
+    "sql": "SELECT count_date AS 统计日期, car_type AS 车型, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date >= CURRENT_DATE - 30 AND delete_ts IS NULL GROUP BY count_date, car_type ORDER BY count_date;"
+  },
+  {
+    "question": "对比周末与工作日车流量差异",
+    "sql": "SELECT CASE WHEN EXTRACT(DOW FROM count_date) IN (0,6) THEN '周末' ELSE '工作日' END AS 时段类型, AVG(customer_count) AS 平均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY 时段类型;"
+  },
+  {
+    "question": "获取各服务区过境车辆占比TOP5",
+    "sql": "SELECT sa.service_area_name, ROUND((SUM(CASE WHEN cc.car_type='过境' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count))::numeric,2) AS 过境占比 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 过境占比 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计最近一周每日总车流量及环比增长率",
+    "sql": "WITH daily_total AS (SELECT count_date, SUM(customer_count) AS total FROM bss_car_day_count WHERE count_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY count_date) SELECT count_date, total, LAG(total) OVER(ORDER BY count_date) AS 前一日流量, ROUND(((total - LAG(total) OVER(ORDER BY count_date))*100.0/LAG(total) OVER(ORDER BY count_date))::numeric,2) AS 环比增长率 FROM daily_total;"
+  },
+  {
+    "question": "查询连续3天车流量增长的服务区",
+    "sql": "WITH daily_growth AS (SELECT service_area_id, count_date, SUM(customer_count) AS daily_count, LAG(SUM(customer_count),1) OVER(PARTITION BY service_area_id ORDER BY count_date) AS prev_count FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id, count_date) SELECT sa.service_area_name FROM (SELECT service_area_id FROM daily_growth WHERE daily_count > prev_count GROUP BY service_area_id, count_date - generate_series(0,2)) t JOIN bss_service_area sa ON t.service_area_id = sa.id;"
+  },
+  {
+    "question": "统计各车辆类型在不同时间段的分布比例",
+    "sql": "SELECT car_type AS 车型, EXTRACT(HOUR FROM create_ts)::integer AS 小时段, ROUND(AVG(customer_count)::numeric,0) AS 平均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type, 小时段 ORDER BY 小时段;"
+  },
+  {
+    "question": "获取昨日车流量最高的3个服务区及对应车型分布",
+    "sql": "SELECT sa.service_area_name, cc.car_type, cc.customer_count FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date = CURRENT_DATE - 1 AND sa.delete_ts IS NULL ORDER BY cc.customer_count DESC LIMIT 3;"
+  },
+  {
+    "question": "分析各区域城际车辆通行量与服务区开放状态的关系",
+    "sql": "SELECT sa.service_state AS 开放状态, AVG(CASE WHEN cc.car_type='城际' THEN cc.customer_count ELSE 0 END) AS 平均城际车流量 FROM bss_car_day_count cc RIGHT JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE sa.delete_ts IS NULL GROUP BY sa.service_state;"
+  },
+  {
+    "question": "各分公司2023年4月人均营收TOP5(按支付总额/车流量计算)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.pay_sum)/SUM(car.customer_count) AS 人均营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no JOIN bss_car_day_count car ON sa.id = car.service_area_id AND bd.oper_date = car.count_date WHERE bd.oper_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY c.company_name ORDER BY 人均营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "2023年Q2各分公司客单价对比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, AVG(bd.pay_sum/bd.order_sum) AS 客单价 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date BETWEEN '2023-04-01' AND '2023-06-30' GROUP BY c.company_name ORDER BY 客单价 DESC;"
+  },
+  {
+    "question": "最近一周订单密度(订单数/面积)最低的3个分公司",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.order_sum)/COUNT(DISTINCT sa.id) AS 订单密度 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date >= CURRENT_DATE - 7 GROUP BY c.company_name ORDER BY 订单密度 ASC LIMIT 3;"
+  },
+  {
+    "question": "各分公司2023年节假日营收总额环比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 1 THEN bd.pay_sum ELSE 0 END) AS 一月营收, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 2 THEN bd.pay_sum ELSE 0 END) AS 二月营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(YEAR FROM bd.oper_date) = 2023 GROUP BY c.company_name;"
+  },
+  {
+    "question": "2023-04-01当日各分公司运营指标对比(支付总额、订单数、车流量)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.pay_sum) AS 支付总额, SUM(bd.order_sum) AS 订单总数, SUM(car.customer_count) AS 车流量 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no JOIN bss_car_day_count car ON sa.id = car.service_area_id WHERE bd.oper_date = '2023-04-01' GROUP BY c.company_name ORDER BY 支付总额 DESC;"
+  },
+  {
+    "question": "各分公司微信支付占比分析(近30天)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.wx) / SUM(bd.pay_sum) * 100 AS 微信占比百分比 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date >= CURRENT_DATE - 30 GROUP BY c.company_name ORDER BY 微信占比百分比 DESC;"
+  },
+  {
+    "question": "各分公司服务区数量与营收能力关联分析",
+    "sql": "SELECT c.company_name AS 分公司名称, COUNT(sa.id) AS 服务区数量, SUM(bd.pay_sum) AS 总营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no GROUP BY c.company_name ORDER BY 服务区数量 DESC, 总营收 DESC;"
+  },
+  {
+    "question": "2023年各分公司月均订单密度趋势分析",
+    "sql": "SELECT c.company_name AS 分公司名称, EXTRACT(MONTH FROM bd.oper_date) AS 月份, AVG(bd.order_sum) AS 月均订单密度 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(YEAR FROM bd.oper_date) = 2023 GROUP BY c.company_name, 月份 ORDER BY 分公司名称, 月份;"
+  },
+  {
+    "question": "各分公司不同支付方式订单数占比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.wx_order)/SUM(bd.order_sum)*100 AS 微信占比, SUM(bd.zf_order)/SUM(bd.order_sum)*100 AS 支付宝占比 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no GROUP BY c.company_name ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "2023年Q2各分公司营收增长率分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 4 THEN bd.pay_sum ELSE 0 END) / SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 5 THEN bd.pay_sum ELSE 0 END) - 1 AS 月增长率 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(QUARTER FROM bd.oper_date) = 2 GROUP BY c.company_name ORDER BY 月增长率 DESC;"
+  },
+  {
+    "question": "统计各路线关联的服务区数量及平均车流量,按服务区数量降序排列",
+    "sql": "SELECT r.route_name AS 路线名称, COUNT(l.service_area_id) AS 服务区数量, AVG(c.customer_count) AS 平均车流量 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id LEFT JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE r.delete_ts IS NULL GROUP BY r.route_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "计算2023年Q2各路段日均车流量,筛选出日均车流量>1000的路段",
+    "sql": "SELECT s.section_name AS 路段名称, COUNT(*) AS 天数, AVG(c.customer_count) AS 日均车流量 FROM bss_section_route s JOIN bss_section_route_area_link l ON s.id = l.section_route_id JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE c.count_date BETWEEN '2023-04-01' AND '2023-06-30' AND s.delete_ts IS NULL GROUP BY s.section_name HAVING AVG(c.customer_count) > 1000;"
+  },
+  {
+    "question": "查询2023年车流量TOP5服务区及对应路线信息",
+    "sql": "SELECT a.service_area_name AS 服务区名称, r.route_name AS 路线名称, SUM(c.customer_count) AS 总车流量 FROM bss_service_area a JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_car_day_count c ON a.id = c.service_area_id WHERE c.count_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY a.service_area_name, r.route_name ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析各路线服务区营收贡献占比,按微信支付金额排序",
+    "sql": "SELECT r.route_name AS 路线名称, SUM(b.wx) AS 微信支付总额, SUM(b.pay_sum) AS 总营收, ROUND((SUM(b.wx)/SUM(b.pay_sum))*100, 2) AS 微信占比 FROM bss_section_route r JOIN bss_section_route_area_link l ON r.id = l.section_route_id JOIN bss_business_day_data b ON l.service_area_id = b.service_area_id WHERE b.oper_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY r.route_name ORDER BY 微信支付总额 DESC;"
+  },
+  {
+    "question": "对比不同车辆类型在各路线的分布比例",
+    "sql": "SELECT r.route_name AS 路线名称, c.car_type AS 车辆类型, COUNT(*) AS 记录数, ROUND((COUNT(*)/(SELECT COUNT(*) FROM bss_car_day_count WHERE service_area_id IN (SELECT service_area_id FROM bss_section_route_area_link WHERE section_route_id = r.id))) * 100)::numeric(5,2) AS 占比百分比 FROM bss_car_day_count c JOIN bss_section_route_area_link l ON c.service_area_id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id GROUP BY r.route_name, c.car_type;"
+  },
+  {
+    "question": "统计未关联服务区的路段清单及创建时间",
+    "sql": "SELECT r.section_name AS 路段名称, r.create_ts AS 创建时间 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id WHERE l.service_area_id IS NULL AND r.delete_ts IS NULL;"
+  },
+  {
+    "question": "分析春运期间(2023-01-07至2023-02-16)各路线车流变化趋势",
+    "sql": "SELECT r.route_name AS 路线名称, c.count_date AS 日期, SUM(c.customer_count) AS 总车流量 FROM bss_section_route r JOIN bss_section_route_area_link l ON r.id = l.section_route_id JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE c.count_date BETWEEN '2023-01-07' AND '2023-02-16' GROUP BY r.route_name, c.count_date ORDER BY 日期;"
+  },
+  {
+    "question": "计算各服务区车流覆盖率(关联路段车流/总车流)TOP10",
+    "sql": "SELECT a.service_area_name AS 服务区名称, SUM(c.customer_count) AS 关联车流, (SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = a.id) AS 总车流, ROUND((SUM(c.customer_count)/(SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = a.id)) * 100)::numeric(5,2) AS 覆盖率 FROM bss_service_area a JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_car_day_count c ON a.id = c.service_area_id GROUP BY a.service_area_name ORDER BY 覆盖率 DESC LIMIT 10;"
+  },
+  {
+    "question": "查询节假日(2023-10-01至2023-10-07)营收贡献最高的TOP3服务区及对应路线",
+    "sql": "SELECT a.service_area_name AS 服务区名称, r.route_name AS 路线名称, SUM(b.pay_sum) AS 总营收 FROM bss_service_area a JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_business_day_data b ON a.id = b.service_area_id WHERE b.oper_date BETWEEN '2023-10-01' AND '2023-10-07' GROUP BY a.service_area_name, r.route_name ORDER BY 总营收 DESC LIMIT 3;"
+  },
+  {
+    "question": "分析不同分公司管辖路段的服务区密度(服务区数/路段长度)",
+    "sql": "SELECT c.company_name AS 分公司名称, COUNT(a.id) AS 服务区数量, SUM(LENGTH(s.code)) AS 路段总长度, ROUND((COUNT(a.id)/SUM(LENGTH(s.code))) * 1000)::numeric(5,2) AS 密度_每千米 FROM bss_company c JOIN bss_service_area a ON c.id = a.company_id JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_section_route s ON l.section_route_id = s.id GROUP BY c.company_name;"
+  },
+  {
+    "question": "分析2023年国庆节期间各服务区营收总额及环比增长率",
+    "sql": "WITH holiday_revenue AS (SELECT service_name, SUM(pay_sum) AS holiday_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_name), pre_holiday_revenue AS (SELECT service_name, SUM(pay_sum) AS pre_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-09-30' AND delete_ts IS NULL GROUP BY service_name) SELECT h.service_name, h.holiday_amount, ROUND((h.holiday_amount - p.pre_amount)/p.pre_amount*100, 2) AS growth_rate FROM holiday_revenue h JOIN pre_holiday_revenue p ON h.service_name = p.service_name ORDER BY growth_rate DESC;"
+  },
+  {
+    "question": "统计2023年春节期间各服务区节假日营收占Q1季度总营收比例",
+    "sql": "WITH q1_revenue AS (SELECT service_name, SUM(pay_sum) AS q1_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY service_name), lunar_revenue AS (SELECT service_name, SUM(pay_sum) AS lunar_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-20' AND '2023-01-27' AND delete_ts IS NULL GROUP BY service_name) SELECT q.service_name, ROUND(l.lunar_amount/q.q1_amount*100, 2) AS ratio FROM q1_revenue q JOIN lunar_revenue l ON q.service_name = l.service_name ORDER BY ratio DESC;"
+  },
+  {
+    "question": "对比2023年国庆节期间不同支付方式金额占比",
+    "sql": "SELECT '微信' AS pay_type, ROUND(SUM(wx)/SUM(pay_sum)*100, 2) AS ratio FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL UNION ALL SELECT '支付宝', ROUND(SUM(zfb)/SUM(pay_sum)*100, 2) FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL UNION ALL SELECT '现金', ROUND(SUM(rmb)/SUM(pay_sum)*100, 2) FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "分析节假日与非节假日各服务区日均车流量增长率",
+    "sql": "WITH holiday_avg AS (SELECT service_area_id, AVG(customer_count) AS holiday_avg FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_area_id), non_holiday_avg AS (SELECT service_area_id, AVG(customer_count) AS non_holiday_avg FROM bss_car_day_count WHERE count_date NOT BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_area_id) SELECT h.service_area_id, ROUND((h.holiday_avg - n.non_holiday_avg)/n.non_holiday_avg*100, 2) AS growth_rate FROM holiday_avg h JOIN non_holiday_avg n ON h.service_area_id = n.service_area_id ORDER BY growth_rate DESC LIMIT 10;"
+  },
+  {
+    "question": "统计节假日车流最高峰时段的车辆类型分布",
+    "sql": "SELECT car_type, SUM(customer_count) AS total_cars FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND EXTRACT(HOUR FROM create_ts) BETWEEN 8 AND 10 AND delete_ts IS NULL GROUP BY car_type ORDER BY total_cars DESC;"
+  },
+  {
+    "question": "对比2023年五一假期与清明假期营收增幅排名TOP5服务区",
+    "sql": "WITH may_revenue AS (SELECT service_name, SUM(pay_sum) AS may_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-29' AND '2023-05-03' AND delete_ts IS NULL GROUP BY service_name), qingming_revenue AS (SELECT service_name, SUM(pay_sum) AS qingming_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-05' AND '2023-04-07' AND delete_ts IS NULL GROUP BY service_name) SELECT m.service_name, ROUND((m.may_amount - q.qingming_amount)/q.qingming_amount*100, 2) AS growth_rate FROM may_revenue m JOIN qingming_revenue q ON m.service_name = q.service_name ORDER BY growth_rate DESC LIMIT 5;"
+  },
+  {
+    "question": "分析节假日现金支付比例变化趋势",
+    "sql": "SELECT oper_date, ROUND(SUM(rmb)/SUM(pay_sum)*100, 2) AS cash_ratio FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-10-07' AND delete_ts IS NULL GROUP BY oper_date ORDER BY oper_date;"
+  },
+  {
+    "question": "统计危化品车辆节假日期间通行量同比增幅",
+    "sql": "WITH holiday_2022 AS (SELECT COUNT(*) AS cnt_2022 FROM bss_car_day_count WHERE count_date BETWEEN '2022-10-01' AND '2022-10-07' AND car_type = '危化品' AND delete_ts IS NULL), holiday_2023 AS (SELECT COUNT(*) AS cnt_2023 FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND car_type = '危化品' AND delete_ts IS NULL) SELECT ROUND((cnt_2023 - cnt_2022)/cnt_2022*100, 2) AS growth_rate FROM holiday_2022, holiday_2023;"
+  },
+  {
+    "question": "查询2023年国庆节期间营收增幅超过50%的服务区清单",
+    "sql": "WITH pre_data AS (SELECT service_name, SUM(pay_sum) AS pre_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-09-30' AND delete_ts IS NULL GROUP BY service_name), holiday_data AS (SELECT service_name, SUM(pay_sum) AS holiday_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_name) SELECT h.service_name, ROUND((h.holiday_amount - p.pre_amount)/p.pre_amount*100, 2) AS growth_rate FROM holiday_data h JOIN pre_data p ON h.service_name = p.service_name WHERE (h.holiday_amount - p.pre_amount)/p.pre_amount > 0.5 ORDER BY growth_rate DESC;"
+  },
+  {
+    "question": "分析节假日期间城际车辆流量与服务区地理位置的关系",
+    "sql": "SELECT s.service_area_name, s.service_position, AVG(c.customer_count) AS avg_traffic FROM bss_car_day_count c JOIN bss_service_area s ON c.service_area_id = s.id WHERE c.car_type = '城际' AND c.count_date BETWEEN '2023-10-01' AND '2023-10-07' AND c.delete_ts IS NULL GROUP BY s.service_area_name, s.service_position ORDER BY avg_traffic DESC;"
+  }
+]

+ 14 - 0
data_pipeline/training_data/task_20250701_131627/task_config.json

@@ -0,0 +1,14 @@
+{
+  "task_id": "task_20250701_131627",
+  "created_at": "2025-07-01T05:16:27.671265",
+  "parameters": {
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:6432/highway_db",
+    "table_list_file": "data_pipeline/tables.txt",
+    "business_context": "高速公路服务区管理系统",
+    "enable_llm_repair": true,
+    "modify_original_file": true,
+    "enable_sql_validation": true,
+    "enable_training_data_load": true
+  },
+  "output_directory": "data_pipeline\\training_data\\task_20250701_131627"
+}

+ 88 - 0
data_pipeline/training_data/task_20250701_131627/task_result.json

@@ -0,0 +1,88 @@
+{
+  "success": true,
+  "workflow_summary": {
+    "total_duration": 1283.84,
+    "completed_steps": [
+      "ddl_md_generation",
+      "question_sql_generation",
+      "sql_validation",
+      "training_data_load"
+    ],
+    "failed_steps": [],
+    "total_steps": 4,
+    "workflow_started": "2025-07-01T13:30:53.267230",
+    "workflow_completed": "2025-07-01T13:52:17.112211"
+  },
+  "input_parameters": {
+    "db_connection": "postgresql://postgres:***@192.168.67.1:6432/highway_db",
+    "table_list_file": "data_pipeline/tables.txt",
+    "business_context": "高速公路服务区管理系统",
+    "db_name": "highway_db",
+    "output_directory": "data_pipeline\\training_data\\task_20250701_131627",
+    "enable_sql_validation": true,
+    "enable_llm_repair": true,
+    "modify_original_file": true,
+    "enable_training_data_load": true
+  },
+  "processing_results": {
+    "ddl_md_generation": {
+      "total_tables": 7,
+      "processed_successfully": 7,
+      "failed": 0,
+      "files_generated": 14,
+      "duration": 422.30856490135193
+    },
+    "question_sql_generation": {
+      "output_file": "data_pipeline\\training_data\\task_20250701_131627\\qs_highway_db_20250701_134736_pair.json",
+      "total_questions": 50,
+      "total_themes": 5,
+      "successful_themes": 5,
+      "failed_themes": [],
+      "duration": 607.0530173778534
+    },
+    "sql_validation": {
+      "original_sql_count": 50,
+      "valid_sql_count": 47,
+      "invalid_sql_count": 3,
+      "success_rate": 0.94,
+      "repair_stats": {
+        "attempted": 4,
+        "successful": 1,
+        "failed": 3
+      },
+      "file_modification_stats": {
+        "modified": 1,
+        "deleted": 3,
+        "failed_modifications": 0
+      },
+      "average_execution_time": 0.02947342872619629,
+      "total_retries": 0,
+      "duration": 236.6604528427124
+    },
+    "training_data_load": {
+      "training_data_dir": "data_pipeline\\training_data\\task_20250701_131627",
+      "load_successful": true,
+      "total_records": 288,
+      "data_type_counts": {
+        "sql": 254,
+        "documentation": 17,
+        "ddl": 16,
+        "error_sql": 1
+      },
+      "duration": 17.167370080947876
+    }
+  },
+  "final_outputs": {
+    "primary_output_file": "data_pipeline\\training_data\\task_20250701_131627\\qs_highway_db_20250701_134736_pair.json",
+    "output_directory": "data_pipeline\\training_data\\task_20250701_131627",
+    "final_question_count": 47,
+    "backup_files_created": true
+  },
+  "performance_metrics": {
+    "step1_duration": 422.31,
+    "step2_duration": 607.05,
+    "step3_duration": 236.66,
+    "step4_duration": 17.17,
+    "total_duration": 1283.84
+  }
+}
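
task_result.json summarizes the four workflow steps; the per-step durations account for nearly all of the 1283.84 s total, with the small remainder presumably inter-step overhead, and the 47 final questions equal the 50 generated pairs minus the 3 invalid ones deleted during validation. An illustrative summary script over this file (sketch only, field names taken from the file above):

```python
import json

# Illustrative summary of task_result.json; the field names match the file above.
with open("data_pipeline/training_data/task_20250701_131627/task_result.json", encoding="utf-8") as f:
    result = json.load(f)

validation = result["processing_results"]["sql_validation"]
kept = validation["original_sql_count"] - validation["invalid_sql_count"]
assert kept == result["final_outputs"]["final_question_count"]  # 50 - 3 == 47

metrics = result["performance_metrics"]
for step, seconds in metrics.items():
    if step == "total_duration":
        continue
    print(f"{step}: {seconds:.2f}s ({seconds / metrics['total_duration']:.0%})")
```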

+ 17 - 0
data_pipeline/training_data/task_20250701_175640/bss_car_day_count.ddl

@@ -0,0 +1,17 @@
+-- 中文名: 服务区车辆日统计表
+-- 描述: 服务区车辆日统计表,按车型统计每日车辆数量及类型,用于交通流量分析与资源调度。
+create table public.bss_car_day_count (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人ID,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人ID,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人ID,
+  customer_count bigint       -- 车辆数量,
+  car_type varchar(100)       -- 车辆类别,
+  count_date date             -- 统计日期,
+  service_area_id varchar(32) -- 服务区ID,
+  primary key (id)
+);
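
As generated, this .ddl file is annotated documentation rather than directly executable SQL: the commas that would separate the column definitions sit at the end of the trailing `--` comments. An illustrative query over the table, in the style of the generated question/SQL pairs, using the connection string from task_config.json (a sketch only, assuming network access to that database):

```python
import psycopg2

# Illustrative query against bss_car_day_count; the connection string is the one
# from task_config.json above. Sketch only, not part of the repository code.
conn = psycopg2.connect("postgresql://postgres:postgres@192.168.67.1:6432/highway_db")
with conn, conn.cursor() as cur:
    cur.execute(
        """
        SELECT car_type, ROUND(AVG(customer_count), 1) AS avg_daily_count
        FROM bss_car_day_count
        WHERE delete_ts IS NULL
        GROUP BY car_type
        ORDER BY avg_daily_count DESC;
        """
    )
    for car_type, avg_daily_count in cur.fetchall():
        print(car_type, avg_daily_count)
conn.close()
```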

+ 18 - 0
data_pipeline/training_data/task_20250701_175640/bss_car_day_count_detail.md

@@ -0,0 +1,18 @@
+## bss_car_day_count(服务区车辆日统计表)
+bss_car_day_count 表服务区车辆日统计表,按车型统计每日车辆数量及类型,用于交通流量分析与资源调度。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- created_by (varchar(50)) - 创建人ID
+- update_ts (timestamp) - 更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- updated_by (varchar(50)) - 更新人ID
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人ID
+- customer_count (bigint) - 车辆数量 [示例: 1114, 295]
+- car_type (varchar(100)) - 车辆类别 [示例: 其他]
+- count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
+- service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
+字段补充说明:
+- id 为主键
+- car_type 为枚举字段,包含取值:其他、危化品、城际、过境

+ 14 - 0
data_pipeline/training_data/task_20250701_175640/task_config.json

@@ -0,0 +1,14 @@
+{
+  "task_id": "task_20250701_175640",
+  "created_at": "2025-07-01T09:56:40.836065",
+  "parameters": {
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:6432/highway_db",
+    "table_list_file": "./data_pipeline/tables.txt",
+    "business_context": "高速公路服务区管理系统测试",
+    "enable_llm_repair": true,
+    "modify_original_file": true,
+    "enable_sql_validation": true,
+    "enable_training_data_load": true
+  },
+  "output_directory": "data_pipeline\\training_data\\task_20250701_175640"
+}

+ 14 - 0
data_pipeline/training_data/task_20250701_180014/task_config.json

@@ -0,0 +1,14 @@
+{
+  "task_id": "task_20250701_180014",
+  "created_at": "2025-07-01T10:00:14.816750",
+  "parameters": {
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:6432/highway_db",
+    "table_list_file": "data_pipeline/tables.txt",
+    "business_context": "高速公路服务区管理系统",
+    "enable_llm_repair": true,
+    "modify_original_file": true,
+    "enable_sql_validation": true,
+    "enable_training_data_load": true
+  },
+  "output_directory": "data_pipeline\\training_data\\task_20250701_180014"
+}

+ 31 - 0
data_pipeline/training_data/task_20250701_184430/bss_business_day_data.ddl

@@ -0,0 +1,31 @@
+-- 中文名: 服务区每日业务统计表(记录各SA运营数据)
+-- 描述: 服务区每日业务统计表(记录各SA运营数据)
+create table public.bss_business_day_data (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  oper_date date              -- 统计日期,
+  service_no varchar(255)     -- 服务区编码,
+  service_name varchar(255)   -- 服务区名称,
+  branch_no varchar(255)      -- 档口编码,
+  branch_name varchar(255)    -- 档口名称,
+  wx numeric(19,4)            -- 微信支付金额,
+  wx_order integer            -- 微信订单数量,
+  zfb numeric(19,4)           -- 支付宝支付金额,
+  zf_order integer            -- 支付宝订单数量,
+  rmb numeric(19,4)           -- 现金支付金额,
+  rmb_order integer           -- 现金订单数量,
+  xs numeric(19,4)            -- 行吧支付金额,
+  xs_order integer            -- 行吧订单数量,
+  jd numeric(19,4)            -- 金豆支付金额,
+  jd_order integer            -- 金豆订单数量,
+  order_sum integer           -- 订单总数,
+  pay_sum numeric(19,4)       -- 总支付金额,
+  source_type integer         -- 数据来源类别,
+  primary key (id)
+);

+ 32 - 0
data_pipeline/training_data/task_20250701_184430/bss_business_day_data_detail.md

@@ -0,0 +1,32 @@
+## bss_business_day_data(服务区每日业务统计表(记录各SA运营数据))
+bss_business_day_data 表服务区每日业务统计表(记录各SA运营数据)
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- created_by (varchar(50)) - 创建人 [示例: xingba]
+- update_ts (timestamp) - 更新时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- oper_date (date) - 统计日期 [示例: 2023-04-01]
+- service_no (varchar(255)) - 服务区编码 [示例: 1028, H0501]
+- service_name (varchar(255)) - 服务区名称 [示例: 宜春服务区, 庐山服务区]
+- branch_no (varchar(255)) - 档口编码 [示例: 1, H05016]
+- branch_name (varchar(255)) - 档口名称 [示例: 宜春南区, 庐山鲜徕客东区]
+- wx (numeric(19,4)) - 微信支付金额 [示例: 4790.0000, 2523.0000]
+- wx_order (integer) - 微信订单数量 [示例: 253, 133]
+- zfb (numeric(19,4)) - 支付宝支付金额 [示例: 229.0000, 0.0000]
+- zf_order (integer) - 支付宝订单数量 [示例: 15, 0]
+- rmb (numeric(19,4)) - 现金支付金额 [示例: 1058.5000, 124.0000]
+- rmb_order (integer) - 现金订单数量 [示例: 56, 12]
+- xs (numeric(19,4)) - 行吧支付金额 [示例: 0.0000, 40.0000]
+- xs_order (integer) - 行吧订单数量 [示例: 0, 1]
+- jd (numeric(19,4)) - 金豆支付金额 [示例: 0.0000]
+- jd_order (integer) - 金豆订单数量 [示例: 0]
+- order_sum (integer) - 订单总数 [示例: 324, 146]
+- pay_sum (numeric(19,4)) - 总支付金额 [示例: 6077.5000, 2687.0000]
+- source_type (integer) - 数据来源类别 [示例: 1, 0, 4]
+字段补充说明:
+- id 为主键
+- source_type 为枚举字段,包含取值:0、4、1、2、3
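
The sample values suggest that pay_sum is the sum of the per-channel amounts (4790 + 229 + 1058.5 + 0 + 0 = 6077.5) and that order_sum is likewise the sum of the per-channel order counts. A hedged consistency check along those lines; the relationship is inferred from the two sample rows, not documented, and the script is illustrative only:

```python
import psycopg2

# Hedged consistency check: the sample rows above suggest pay_sum equals the sum of
# the per-channel amounts (wx + zfb + rmb + xs + jd). This lists rows where that fails.
conn = psycopg2.connect("postgresql://postgres:postgres@192.168.67.1:6432/highway_db")
with conn, conn.cursor() as cur:
    cur.execute(
        """
        SELECT oper_date, service_name, branch_name, pay_sum,
               wx + zfb + rmb + xs + jd AS channel_sum
        FROM bss_business_day_data
        WHERE delete_ts IS NULL
          AND pay_sum <> wx + zfb + rmb + xs + jd
        LIMIT 20;
        """
    )
    mismatches = cur.fetchall()
print(f"{len(mismatches)} sample rows where pay_sum differs from the channel total")
conn.close()
```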

+ 17 - 0
data_pipeline/training_data/task_20250701_184430/bss_car_day_count.ddl

@@ -0,0 +1,17 @@
+-- 中文名: 高速公路服务区每日车辆统计表
+-- 描述: 高速公路服务区每日车辆统计表,记录各类型车辆流量数据,支撑交通管理与资源调度分析。
+create table public.bss_car_day_count (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  customer_count bigint       -- 车辆数量,
+  car_type varchar(100)       -- 车辆类别,
+  count_date date             -- 统计日期,
+  service_area_id varchar(32) -- 服务区ID,
+  primary key (id)
+);

+ 18 - 0
data_pipeline/training_data/task_20250701_184430/bss_car_day_count_detail.md

@@ -0,0 +1,18 @@
+## bss_car_day_count(高速公路服务区每日车辆统计表)
+bss_car_day_count 表高速公路服务区每日车辆统计表,记录各类型车辆流量数据,支撑交通管理与资源调度分析。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- created_by (varchar(50)) - 创建人
+- update_ts (timestamp) - 更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- customer_count (bigint) - 车辆数量 [示例: 1114, 295]
+- car_type (varchar(100)) - 车辆类别 [示例: 其他]
+- count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
+- service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
+字段补充说明:
+- id 为主键
+- car_type 为枚举字段,包含取值:其他、危化品、城际、过境

Some files were not shown because too many files changed in this diff