
Refactoring of the training data generation and loading modules is complete.

wangxq · 1 week ago
Parent
Commit
847e45252b
71 changed files with 1,871 additions and 4,670 deletions
  1. .claude/settings.local.json (+23 -0)
  2. CLAUDE.md (+161 -0)
  3. app_config.py (+0 -13)
  4. data_pipeline/README.md (+5 -5)
  5. data_pipeline/__init__.py (+4 -4)
  6. data_pipeline/analyzers/__init__.py (+0 -0)
  7. data_pipeline/analyzers/md_analyzer.py (+0 -0)
  8. data_pipeline/analyzers/theme_extractor.py (+1 -1)
  9. data_pipeline/config.py (+1 -1)
  10. data_pipeline/ddl_generation/__init__.py (+5 -0)
  11. data_pipeline/ddl_generation/ddl_md_generator.py (+16 -12)
  12. data_pipeline/ddl_generation/training_data_agent.py (+9 -9)
  13. data_pipeline/prompts/__init__.py (+0 -0)
  14. data_pipeline/prompts/business_dictionary.txt (+0 -0)
  15. data_pipeline/qa_generation/__init__.py (+1 -0)
  16. data_pipeline/qa_generation/qs_agent.py (+4 -4)
  17. data_pipeline/qa_generation/qs_generator.py (+5 -5)
  18. data_pipeline/schema_workflow.py (+158 -38)
  19. data_pipeline/tables.txt (+0 -0)
  20. data_pipeline/tools/__init__.py (+0 -0)
  21. data_pipeline/tools/base.py (+2 -2)
  22. data_pipeline/tools/comment_generator.py (+5 -5)
  23. data_pipeline/tools/data_sampler.py (+5 -5)
  24. data_pipeline/tools/database_inspector.py (+2 -2)
  25. data_pipeline/tools/ddl_generator.py (+3 -3)
  26. data_pipeline/tools/doc_generator.py (+3 -3)
  27. data_pipeline/trainer/__init__.py (+1 -0)
  28. data_pipeline/trainer/run_training.py (+18 -7)
  29. data_pipeline/trainer/vanna_trainer.py (+2 -2)
  30. data_pipeline/training_data/bss_business_day_data.ddl (+31 -0)
  31. data_pipeline/training_data/bss_business_day_data_detail.md (+32 -0)
  32. data_pipeline/training_data/bss_car_day_count.ddl (+17 -0)
  33. data_pipeline/training_data/bss_car_day_count_detail.md (+18 -0)
  34. data_pipeline/training_data/bss_company.ddl (+15 -0)
  35. data_pipeline/training_data/bss_company_detail.md (+15 -0)
  36. data_pipeline/training_data/bss_section_route.ddl (+16 -0)
  37. data_pipeline/training_data/bss_section_route_area_link.ddl (+7 -0)
  38. data_pipeline/training_data/bss_section_route_area_link_detail.md (+7 -0)
  39. data_pipeline/training_data/bss_section_route_detail.md (+16 -0)
  40. data_pipeline/training_data/bss_service_area.ddl (+19 -0)
  41. data_pipeline/training_data/bss_service_area_detail.md (+21 -0)
  42. data_pipeline/training_data/bss_service_area_mapper.ddl (+18 -0)
  43. data_pipeline/training_data/bss_service_area_mapper_detail.md (+19 -0)
  44. data_pipeline/training_data/filename_mapping.txt (+10 -0)
  45. data_pipeline/training_data/metadata.txt (+62 -0)
  46. data_pipeline/training_data/qs_highway_db_20250626_123202_pair.json (+198 -0)
  47. data_pipeline/training_data/qs_highway_db_20250626_123202_pair.json.backup (+202 -0)
  48. data_pipeline/utils/__init__.py (+0 -0)
  49. data_pipeline/utils/data_structures.py (+0 -0)
  50. data_pipeline/utils/file_manager.py (+0 -0)
  51. data_pipeline/utils/large_table_handler.py (+1 -1)
  52. data_pipeline/utils/logger.py (+0 -0)
  53. data_pipeline/utils/permission_checker.py (+0 -0)
  54. data_pipeline/utils/system_filter.py (+1 -1)
  55. data_pipeline/utils/table_parser.py (+0 -0)
  56. data_pipeline/validators/__init__.py (+0 -0)
  57. data_pipeline/validators/file_count_validator.py (+2 -2)
  58. data_pipeline/validators/sql_validate_cli.py (+8 -8)
  59. data_pipeline/validators/sql_validation_agent.py (+14 -5)
  60. data_pipeline/validators/sql_validation_example.py (+2 -2)
  61. data_pipeline/validators/sql_validator.py (+2 -2)
  62. docs/Data Pipeline 使用说明.md (+230 -217)
  63. docs/Schema Tools 系统概要设计说明书.md (+299 -918)
  64. docs/Schema Tools 详细设计文档.md (+0 -2579)
  65. docs/run_training说明.md (+155 -73)
  66. schema_tools/test_schema_tools.py (+0 -135)
  67. schema_tools/workflow_example.py (+0 -313)
  68. test_file_modification.json (+0 -10)
  69. test_qa_apis.py (+0 -89)
  70. test_training_data_apis.py (+0 -180)
  71. training/__init__.py (+0 -14)

+ 23 - 0
.claude/settings.local.json

@@ -0,0 +1,23 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(rg:*)",
+      "Bash(rg:*)",
+      "Bash(find:*)",
+      "Bash(mkdir:*)",
+      "Bash(cp:*)",
+      "Bash(grep:*)",
+      "Bash(python:*)",
+      "Bash(source:*)",
+      "Bash(ls:*)",
+      "Bash(.venv/Scripts/activate)",
+      "Bash(.venv/Scripts/python.exe -m data_pipeline:*)",
+      "Bash(.venv/Scripts/python.exe data_pipeline/training_data/run_training.py:*)",
+      "Bash(.venv/Scripts/python.exe:*)",
+      "Bash(mv:*)",
+      "Bash(rm:*)",
+      "Bash(.venv/bin/python:*)"
+    ],
+    "deny": []
+  }
+}

+ 161 - 0
CLAUDE.md

@@ -0,0 +1,161 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Development Commands
+
+### Start Applications
+
+```bash
+# Start Chainlit conversational interface (primary UI)
+chainlit run chainlit_app.py
+
+# Start Flask web interface (simple API)
+python flask_app.py
+
+# Start advanced Flask application with full agent APIs
+python citu_app.py
+```
+
+### Training and Data Management
+
+```bash
+# Run training pipeline with data from data_pipeline/training_data directory
+python -m data_pipeline.trainer.run_training --data_path ./data_pipeline/training_data/
+
+# Complete automated schema workflow (DDL generation → Q&A generation → SQL validation → Training data loading)
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@host:port/database_name" \
+  --table-list tables.txt \
+  --business-context "业务系统描述" \
+  --output-dir ./data_pipeline/training_data/
+
+# Generate only schema documentation without validation
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@host:port/db_name" \
+  --table-list tables.txt \
+  --business-context "系统描述" \
+  --skip-validation
+```
+
+### Testing
+
+```bash
+# Test QA feedback and conversation management APIs
+python test_qa_apis.py
+
+# Test training data management APIs  
+python test_training_data_apis.py
+```
+
+## Core Architecture
+
+### Application Entry Points
+
+- **`chainlit_app.py`** - Modern conversational UI with streaming responses, fallback mechanisms, and comprehensive error handling
+- **`citu_app.py`** - Production Flask application with full REST APIs for agent queries, conversation management, QA feedback, and health monitoring
+- **`flask_app.py`** - Simple REST API for basic database queries
+
+### Central Configuration
+
+**`app_config.py`** is the main configuration hub controlling:
+
+```python
+# Multi-provider LLM selection
+LLM_MODEL_TYPE = "api"  # api or ollama
+API_LLM_MODEL = "qianwen"  # qianwen or deepseek
+
+# Vector database selection  
+VECTOR_DB_TYPE = "pgvector"  # chromadb or pgvector
+
+# Agent routing behavior
+QUESTION_ROUTING_MODE = "hybrid"  # hybrid, database_direct, chat_direct, llm_only
+
+# Feature toggles
+ENABLE_RESULT_SUMMARY = True
+ENABLE_CONVERSATION_CONTEXT = True
+DISPLAY_RESULT_THINKING = False
+```
+
+### LLM and Vector Database Combinations
+
+The system supports 6 LLM + vector database combinations via **`common/vanna_combinations.py`**:
+- QianWen + ChromaDB/PgVector
+- DeepSeek + ChromaDB/PgVector  
+- Ollama + ChromaDB/PgVector
+
+All combinations are created through **`core/vanna_llm_factory.py`** using factory pattern.
+
+### Agent System Architecture
+
+**`agent/citu_agent.py`** implements a sophisticated LangGraph-based workflow:
+
+```
+Question → Classify → [DATABASE Path] → SQL Generation → SQL Validation → SQL Execution → Summary
+                   → [CHAT Path] → General Chat
+```
+
+**Routing Modes:**
+- `hybrid` (default) - Intelligent classification between database and chat
+- `database_direct` - Skip classification, direct SQL generation
+- `chat_direct` - Skip classification, direct chat response
+- `llm_only` - LLM-based classification only
+
+### Database Integration
+
+**Three-Database Architecture:**
+1. **Business Database** (`APP_DB_CONFIG`) - Source data for queries
+2. **Vector Database** (`PGVECTOR_CONFIG`) - Training data and embeddings
+3. **Redis Cache** (`REDIS_*`) - Conversations, QA results, embedding cache
+
+### Training Data Pipeline
+
+**Training data is managed in `data_pipeline/training_data/` directory.**
+
+**File Format Mapping:**
+- `.ddl` files → `train_ddl_statements()`
+- `.md/.markdown` → `train_documentation_blocks()`
+- `_pair.json/_pairs.json` → `train_json_question_sql_pairs()`
+- `_pair.sql/_pairs.sql` → `train_formatted_question_sql_pairs()`
+- `.sql` (other) → `train_sql_examples()`
+
+### Data Pipeline System
+
+**`data_pipeline/`** provides automated database reverse engineering:
+
+1. **Database Inspector** - Automatic schema discovery
+2. **DDL Generator** - PostgreSQL DDL with intelligent comments
+3. **Documentation Generator** - Detailed markdown documentation  
+4. **Q&A Generator** (`qa_generation/`) - LLM-generated question-SQL pairs
+5. **SQL Validator** (`validators/`) - EXPLAIN-based validation with auto-repair
+6. **Training Pipeline** (`trainer/`) - Vanna.ai training data ingestion
+
+## Key Patterns
+
+### Singleton Pattern
+**`common/vanna_instance.py`** implements thread-safe singleton for global Vanna instance management.
+
+### Caching Strategy
+Multi-layer caching via **`common/`**:
+- **`session_aware_cache.py`** - Web session-aware caching
+- **`embedding_cache_manager.py`** - High-performance embedding caching
+- **`redis_conversation_manager.py`** - Conversation lifecycle management
+
+### Error Handling
+Comprehensive fallback mechanisms throughout the stack:
+- SQL generation failures → General chat responses
+- LLM timeouts → Cached responses
+- Database connection issues → Health check endpoints
+
+### Configuration Precedence
+1. Environment variables (`.env` file)
+2. **`app_config.py`** defaults
+3. Module-specific configs (e.g., **`data_pipeline/config.py`**)
+
+## Important Notes
+
+- The system requires PostgreSQL for business data and optionally PgVector for vector storage
+- Redis is essential for conversation management and caching
+- Training data generation is resource-intensive and should be run with appropriate database permissions
+- The agent system supports both streaming and non-streaming responses based on LLM provider capabilities
+- Always test configuration changes with health check endpoints before production deployment
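Referring back to the File Format Mapping section above: the routing is purely suffix-based, so a small dispatcher is enough to illustrate it. The sketch below is illustrative only; it assumes a `trainer` object exposing the five `train_*` functions named in that list, and the dispatcher itself is not part of this commit.

```python
from pathlib import Path

# Hypothetical dispatcher illustrating the suffix-based mapping described above.
# The train_* names come from the documentation; passing them in via a `trainer`
# object is an assumption, not the project's actual interface.
def route_training_file(path: Path, trainer) -> str:
    name = path.name.lower()
    if name.endswith(".ddl"):
        trainer.train_ddl_statements(path)
        return "ddl"
    if name.endswith((".md", ".markdown")):
        trainer.train_documentation_blocks(path)
        return "documentation"
    if name.endswith(("_pair.json", "_pairs.json")):
        trainer.train_json_question_sql_pairs(path)
        return "question_sql_json"
    if name.endswith(("_pair.sql", "_pairs.sql")):
        trainer.train_formatted_question_sql_pairs(path)
        return "question_sql_formatted"
    if name.endswith(".sql"):
        trainer.train_sql_examples(path)
        return "sql_example"
    return "skipped"
```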

+ 0 - 13
app_config.py

@@ -117,19 +117,6 @@ TRAINING_BATCH_PROCESSING_ENABLED = False   # 是否启用训练数据批处理
TRAINING_BATCH_SIZE = 10                    # 每批处理的训练项目数量
TRAINING_MAX_WORKERS = 1                    # 训练批处理的最大工作线程数(设置为1确保单线程)

-# 训练数据路径配置
-# 支持以下格式:
-# 1. 相对路径(以 . 开头):
-#    "./training/data"     - 项目根目录下的training/data
-#    "../data"             - 项目根目录上级的data目录
-# 2. 绝对路径:
-#    "/home/user/data"     - Linux绝对路径
-#    "C:/data"             - Windows绝对路径
-#    "D:\\training\\data"  - Windows绝对路径(转义反斜杠)
-# 3. 相对路径(不以.开头):
-#    "training/data"       - 相对于项目根目录
-#    "my_data"             - 项目根目录下的my_data文件夹
-TRAINING_DATA_PATH = "./training/data"

# 是否启用问题重写功能,也就是上下文问题合并。
REWRITE_QUESTION_ENABLED = False

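With `TRAINING_DATA_PATH` removed, the training data directory is resolved from the data_pipeline configuration instead of `app_config`. A minimal sketch of the lookup, assuming the repository root is on `PYTHONPATH` (the default shown is the value set in `data_pipeline/config.py` later in this commit):

```python
# Minimal sketch: the training data directory now lives in SCHEMA_TOOLS_CONFIG
# rather than in the removed app_config.TRAINING_DATA_PATH setting.
from data_pipeline.config import SCHEMA_TOOLS_CONFIG

output_dir = SCHEMA_TOOLS_CONFIG.get("output_directory", "./data_pipeline/training_data/")
print(output_dir)  # expected default: ./data_pipeline/training_data/
```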
+ 5 - 5
schema_tools/README.md → data_pipeline/README.md

@@ -29,7 +29,7 @@ pip install asyncpg asyncio

#### 命令行方式
```bash
-python -m schema_tools.schema_workflow_orchestrator \
+python -m schema_tools.schema_workflow \
  --db-connection "postgresql://user:pass@localhost:5432/dbname" \
  --table-list tables.txt \
  --business-context "高速公路服务区管理系统" \
@@ -40,7 +40,7 @@ python -m schema_tools.schema_workflow_orchestrator \
#### 编程方式
```python
import asyncio
-from schema_tools.schema_workflow_orchestrator import SchemaWorkflowOrchestrator
+from schema_tools.schema_workflow import SchemaWorkflowOrchestrator

async def run_complete_workflow():
    orchestrator = SchemaWorkflowOrchestrator(
@@ -73,17 +73,17 @@ asyncio.run(run_complete_workflow())
#### 工作流编排器命令行选项
```bash
# 跳过SQL验证
-python -m schema_tools.schema_workflow_orchestrator \
+python -m schema_tools.schema_workflow \
  --db-connection "postgresql://..." --table-list tables.txt \
  --business-context "系统" --db-name test_db --skip-validation

# 禁用LLM修复
-python -m schema_tools.schema_workflow_orchestrator \
+python -m schema_tools.schema_workflow \
  --db-connection "postgresql://..." --table-list tables.txt \
  --business-context "系统" --db-name test_db --disable-llm-repair

# 详细日志
-python -m schema_tools.schema_workflow_orchestrator \
+python -m schema_tools.schema_workflow \
  --db-connection "postgresql://..." --table-list tables.txt \
  --business-context "系统" --db-name test_db --verbose
```

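For programmatic use after the rename, the orchestrator no longer takes `db_name` and gains `enable_training_data_load`. A minimal usage sketch against the new constructor shown later in this diff (connection string, table list and business context are placeholders):

```python
import asyncio

from data_pipeline.schema_workflow import SchemaWorkflowOrchestrator

async def run():
    # db_name is no longer passed in; it is derived from the connection string.
    orchestrator = SchemaWorkflowOrchestrator(
        db_connection="postgresql://user:pass@localhost:5432/highway_db",  # placeholder
        table_list_file="tables.txt",
        business_context="高速公路服务区管理系统",
        output_dir="./data_pipeline/training_data/",
        enable_sql_validation=True,
        enable_llm_repair=True,
        modify_original_file=True,
        enable_training_data_load=True,
    )
    return await orchestrator.execute_complete_workflow()

if __name__ == "__main__":
    report = asyncio.run(run())
```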
+ 4 - 4
schema_tools/__init__.py → data_pipeline/__init__.py

@@ -3,10 +3,10 @@ Schema Tools - 自动化数据库逆向工程工具
用于从PostgreSQL数据库生成vanna.ai格式的训练数据(DDL和MD文档)
"""

-from .training_data_agent import SchemaTrainingDataAgent
-from .qs_agent import QuestionSQLGenerationAgent
-from .sql_validation_agent import SQLValidationAgent
-from .schema_workflow_orchestrator import SchemaWorkflowOrchestrator
+from .ddl_generation.training_data_agent import SchemaTrainingDataAgent
+from .qa_generation.qs_agent import QuestionSQLGenerationAgent
+from .validators.sql_validation_agent import SQLValidationAgent
+from .schema_workflow import SchemaWorkflowOrchestrator
from .config import SCHEMA_TOOLS_CONFIG, get_config, update_config

__version__ = "1.0.0"

+ 0 - 0
schema_tools/analyzers/__init__.py → data_pipeline/analyzers/__init__.py


+ 0 - 0
schema_tools/analyzers/md_analyzer.py → data_pipeline/analyzers/md_analyzer.py


+ 1 - 1
schema_tools/analyzers/theme_extractor.py → data_pipeline/analyzers/theme_extractor.py

@@ -3,7 +3,7 @@ import json
import logging
from typing import List, Dict, Any

-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG


class ThemeExtractor:

+ 1 - 1
schema_tools/config.py → data_pipeline/config.py

@@ -13,7 +13,7 @@ SCHEMA_TOOLS_CONFIG = {
    # 核心配置
    "default_db_connection": None,  # 从命令行指定
    "default_business_context": "数据库管理系统", 
-    "output_directory": "training/generated_data",
+    "output_directory": "./data_pipeline/training_data/",
     
     
     # 处理链配置
     # 处理链配置
     "default_pipeline": "full",
     "default_pipeline": "full",

+ 5 - 0
data_pipeline/ddl_generation/__init__.py

@@ -0,0 +1,5 @@
+# DDL Generation module for database schema reverse engineering
+
+from .training_data_agent import SchemaTrainingDataAgent
+
+__all__ = ["SchemaTrainingDataAgent"]

+ 16 - 12
schema_tools/__main__.py → data_pipeline/ddl_generation/ddl_md_generator.py

@@ -1,3 +1,7 @@
+"""
+DDL和MD文档生成器命令行入口
+用于从PostgreSQL数据库生成DDL和MD训练数据
+"""
import argparse
import asyncio
import sys
@@ -8,21 +12,21 @@ from pathlib import Path
def setup_argument_parser():
    """设置命令行参数解析器"""
    parser = argparse.ArgumentParser(
-        description='Schema Tools - 自动生成数据库训练数据',
+        description='DDL/MD文档生成器 - 从PostgreSQL数据库生成训练数据',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例用法:
  # 基本使用
-  python -m schema_tools --db-connection "postgresql://user:pass@host:5432/db" --table-list tables.txt
+  python -m data_pipeline.ddl_md_generator --db-connection "postgresql://user:pass@host:5432/db" --table-list tables.txt --business-context "电商系统"
  
-  # 指定业务上下文和输出目录
-  python -m schema_tools --db-connection "..." --table-list tables.txt --business-context "电商系统" --output-dir output
+  # 指定输出目录
+  python -m data_pipeline.ddl_md_generator --db-connection "..." --table-list tables.txt --business-context "电商系统" --output-dir ./data_pipeline/training_data/
  
  # 仅生成DDL文件
-  python -m schema_tools --db-connection "..." --table-list tables.txt --pipeline ddl_only
+  python -m data_pipeline.ddl_md_generator --db-connection "..." --table-list tables.txt --business-context "电商系统" --pipeline ddl_only
  
  # 权限检查模式
-  python -m schema_tools --db-connection "..." --check-permissions-only
+  python -m data_pipeline.ddl_md_generator --db-connection "..." --check-permissions-only
        """
    )
    
@@ -94,7 +98,7 @@ def setup_argument_parser():

def load_config_with_overrides(args):
    """加载配置并应用命令行覆盖"""
-    from schema_tools.config import SCHEMA_TOOLS_CONFIG
+    from data_pipeline.config import SCHEMA_TOOLS_CONFIG
    
    config = SCHEMA_TOOLS_CONFIG.copy()
    
@@ -128,12 +132,12 @@ def load_business_context(args):
    if args.business_context:
        return args.business_context
    
-    from schema_tools.config import SCHEMA_TOOLS_CONFIG
+    from data_pipeline.config import SCHEMA_TOOLS_CONFIG
    return SCHEMA_TOOLS_CONFIG.get("default_business_context", "数据库管理系统")

async def check_permissions_only(db_connection: str):
    """仅检查数据库权限"""
-    from schema_tools.training_data_agent import SchemaTrainingDataAgent
+    from .training_data_agent import SchemaTrainingDataAgent
    
    print("🔍 检查数据库权限...")
    
@@ -177,7 +181,7 @@ async def main():
    args = parser.parse_args()
    
    # 设置日志
-    from schema_tools.utils.logger import setup_logging
+    from data_pipeline.utils.logger import setup_logging
    setup_logging(
        verbose=args.verbose,
        log_file=args.log_file
@@ -204,7 +208,7 @@ async def main():
        business_context = load_business_context(args)
        
        # 创建Agent
-        from schema_tools.training_data_agent import SchemaTrainingDataAgent
+        from .training_data_agent import SchemaTrainingDataAgent
        
        agent = SchemaTrainingDataAgent(
            db_connection=args.db_connection,
@@ -215,7 +219,7 @@ async def main():
        )
        
        # 执行生成
-        print("🚀 开始生成Schema训练数据...")
+        print("🚀 开始生成DDL和MD文档...")
        report = await agent.generate_training_data()
        
        # 输出结果

+ 9 - 9
schema_tools/training_data_agent.py → data_pipeline/ddl_generation/training_data_agent.py

@@ -5,14 +5,14 @@ import os
from typing import List, Dict, Any, Optional
from pathlib import Path

-from schema_tools.tools.base import ToolRegistry, PipelineExecutor
-from schema_tools.utils.data_structures import TableMetadata, TableProcessingContext, ProcessingResult
-from schema_tools.utils.file_manager import FileNameManager
-from schema_tools.utils.system_filter import SystemTableFilter
-from schema_tools.utils.permission_checker import DatabasePermissionChecker
-from schema_tools.utils.table_parser import TableListParser
-from schema_tools.utils.logger import setup_logging
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.tools.base import ToolRegistry, PipelineExecutor
+from data_pipeline.utils.data_structures import TableMetadata, TableProcessingContext, ProcessingResult
+from data_pipeline.utils.file_manager import FileNameManager
+from data_pipeline.utils.system_filter import SystemTableFilter
+from data_pipeline.utils.permission_checker import DatabasePermissionChecker
+from data_pipeline.utils.table_parser import TableListParser
+from data_pipeline.utils.logger import setup_logging
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG


class SchemaTrainingDataAgent:
    """Schema训练数据生成AI Agent"""
@@ -97,7 +97,7 @@ class SchemaTrainingDataAgent:
            os.makedirs(os.path.join(self.output_dir, "docs"), exist_ok=True)
        
        # logs目录始终创建
-        os.makedirs(os.path.join(self.output_dir, "logs"), exist_ok=True)
+        # os.makedirs(os.path.join(self.output_dir, "logs"), exist_ok=True)
        
        # 初始化数据库工具
        database_tool = ToolRegistry.get_tool("database_inspector", db_connection=self.db_connection)

+ 0 - 0
schema_tools/prompts/__init__.py → data_pipeline/prompts/__init__.py


+ 0 - 0
schema_tools/prompts/business_dictionary.txt → data_pipeline/prompts/business_dictionary.txt


+ 1 - 0
data_pipeline/qa_generation/__init__.py

@@ -0,0 +1 @@
+# QA Generation module for Vanna Q&A generation

+ 4 - 4
schema_tools/qs_agent.py → data_pipeline/qa_generation/qs_agent.py

@@ -6,10 +6,10 @@ from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional

-from schema_tools.config import SCHEMA_TOOLS_CONFIG
-from schema_tools.validators import FileCountValidator
-from schema_tools.analyzers import MDFileAnalyzer, ThemeExtractor
-from schema_tools.utils.logger import setup_logging
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.validators import FileCountValidator
+from data_pipeline.analyzers import MDFileAnalyzer, ThemeExtractor
+from data_pipeline.utils.logger import setup_logging
from core.vanna_llm_factory import create_vanna_instance



+ 5 - 5
schema_tools/qs_generator.py → data_pipeline/qa_generation/qs_generator.py

@@ -9,8 +9,8 @@ import sys
import os
from pathlib import Path

-from schema_tools.qs_agent import QuestionSQLGenerationAgent
-from schema_tools.utils.logger import setup_logging
+from .qs_agent import QuestionSQLGenerationAgent
+from data_pipeline.utils.logger import setup_logging


def setup_argument_parser():
@@ -21,13 +21,13 @@ def setup_argument_parser():
         epilog="""
         epilog="""
 示例用法:
 示例用法:
   # 基本使用
   # 基本使用
-  python -m schema_tools.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "高速公路服务区管理系统"
+  python -m data_pipeline.qa_generation.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "高速公路服务区管理系统"
   
   
   # 指定数据库名称
   # 指定数据库名称
-  python -m schema_tools.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "电商系统" --db-name ecommerce_db
+  python -m data_pipeline.qa_generation.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "电商系统" --db-name ecommerce_db
   
   
   # 启用详细日志
   # 启用详细日志
-  python -m schema_tools.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "管理系统" --verbose
+  python -m data_pipeline.qa_generation.qs_generator --output-dir ./output --table-list ./tables.txt --business-context "管理系统" --verbose
         """
         """
     )
     )
     
     

+ 158 - 38
schema_tools/schema_workflow_orchestrator.py → data_pipeline/schema_workflow.py

@@ -10,11 +10,11 @@ from typing import Dict, Any, List, Optional
from pathlib import Path
from datetime import datetime

-from schema_tools.training_data_agent import SchemaTrainingDataAgent
-from schema_tools.qs_agent import QuestionSQLGenerationAgent
-from schema_tools.sql_validation_agent import SQLValidationAgent
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
-from schema_tools.utils.logger import setup_logging
+from data_pipeline.ddl_generation.training_data_agent import SchemaTrainingDataAgent
+from data_pipeline.qa_generation.qs_agent import QuestionSQLGenerationAgent
+from data_pipeline.validators.sql_validation_agent import SQLValidationAgent
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.utils.logger import setup_logging


class SchemaWorkflowOrchestrator:
@@ -24,32 +24,33 @@ class SchemaWorkflowOrchestrator:
                 db_connection: str,
                 table_list_file: str,
                 business_context: str,
-                 db_name: str,
                 output_dir: str = None,
                 enable_sql_validation: bool = True,
                 enable_llm_repair: bool = True,
-                 modify_original_file: bool = True):
+                 modify_original_file: bool = True,
+                 enable_training_data_load: bool = True):
        """
        初始化Schema工作流编排器
        
        Args:
-            db_connection: 数据库连接字符串
+            db_connection: 数据库连接字符串 (postgresql://user:pass@host:port/dbname)
            table_list_file: 表清单文件路径
            business_context: 业务上下文描述
-            db_name: 数据库名称(用于生成文件名)
            output_dir: 输出目录
            enable_sql_validation: 是否启用SQL验证
            enable_llm_repair: 是否启用LLM修复功能
            modify_original_file: 是否修改原始JSON文件
+            enable_training_data_load: 是否启用训练数据加载
        """
        self.db_connection = db_connection
        self.table_list_file = table_list_file
        self.business_context = business_context
-        self.db_name = db_name
+        self.db_name = self._extract_db_name_from_connection(db_connection)
        self.output_dir = Path(output_dir) if output_dir else Path("./output")
        self.enable_sql_validation = enable_sql_validation
        self.enable_llm_repair = enable_llm_repair
        self.modify_original_file = modify_original_file
+        self.enable_training_data_load = enable_training_data_load
        
        # 确保输出目录存在
        self.output_dir.mkdir(parents=True, exist_ok=True)
@@ -68,6 +69,30 @@ class SchemaWorkflowOrchestrator:
             "statistics": {}
             "statistics": {}
         }
         }
     
     
+    def _extract_db_name_from_connection(self, connection_string: str) -> str:
+        """
+        从数据库连接字符串中提取数据库名称
+        
+        Args:
+            connection_string: PostgreSQL连接字符串
+            
+        Returns:
+            str: 数据库名称
+        """
+        try:
+            # 处理标准的PostgreSQL连接字符串: postgresql://user:pass@host:port/dbname
+            if '/' in connection_string:
+                # 取最后一个 '/' 后面的部分作为数据库名
+                db_name = connection_string.split('/')[-1]
+                # 移除可能的查询参数
+                if '?' in db_name:
+                    db_name = db_name.split('?')[0]
+                return db_name if db_name else "database"
+            else:
+                return "database"
+        except Exception:
+            return "database"
+    
    async def execute_complete_workflow(self) -> Dict[str, Any]:
        """
        执行完整的Schema处理工作流程
@@ -94,6 +119,12 @@ class SchemaWorkflowOrchestrator:
            else:
                self.logger.info("⏭️ 跳过SQL验证步骤")
            
+            # 步骤4: 训练数据加载(可选)
+            if self.enable_training_data_load:
+                await self._execute_step_4_training_data_load()
+            else:
+                self.logger.info("⏭️ 跳过训练数据加载步骤")
+            
            # 设置结束时间
            self.workflow_state["end_time"] = time.time()
            
@@ -216,15 +247,13 @@ class SchemaWorkflowOrchestrator:
            
            self.logger.info(f"📄 验证文件: {qs_file}")
            
-            # 动态设置验证配置
-            SCHEMA_TOOLS_CONFIG['sql_validation']['enable_sql_repair'] = self.enable_llm_repair
-            SCHEMA_TOOLS_CONFIG['sql_validation']['modify_original_file'] = self.modify_original_file
-            
-            # 创建SQL验证Agent
+            # 创建SQL验证Agent,通过参数传递配置而非修改全局配置
            sql_validator = SQLValidationAgent(
                db_connection=self.db_connection,
                input_file=str(qs_file),
-                output_dir=str(self.output_dir)
+                output_dir=str(self.output_dir),
+                enable_sql_repair=self.enable_llm_repair,
+                modify_original_file=self.modify_original_file
            )
            
            # 执行SQL验证和修正
@@ -270,6 +299,88 @@ class SchemaWorkflowOrchestrator:
             self.logger.error(f"❌ 步骤3失败: {str(e)}")
             self.logger.error(f"❌ 步骤3失败: {str(e)}")
             raise
             raise
     
     
+    async def _execute_step_4_training_data_load(self):
+        """步骤4: 训练数据加载"""
+        self.workflow_state["current_step"] = "training_data_load"
+        self.logger.info("=" * 60)
+        self.logger.info("🎯 步骤4: 开始加载训练数据")
+        self.logger.info("=" * 60)
+        
+        step_start_time = time.time()
+        
+        try:
+            # 确保输出目录存在所需的训练数据
+            training_data_dir = str(self.output_dir)
+            self.logger.info(f"📁 训练数据目录: {training_data_dir}")
+            
+            # 导入训练器模块
+            import sys
+            import os
+            sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+            
+            from data_pipeline.trainer.run_training import process_training_files
+            
+            # 执行训练数据加载
+            self.logger.info("🔄 开始处理训练文件...")
+            load_successful = process_training_files(training_data_dir)
+            
+            step_duration = time.time() - step_start_time
+            
+            if load_successful:
+                # 获取统计信息
+                from data_pipeline.trainer.vanna_trainer import flush_training, shutdown_trainer
+                
+                # 刷新批处理器
+                self.logger.info("🔄 刷新批处理器...")
+                flush_training()
+                shutdown_trainer()
+                
+                # 验证加载结果
+                try:
+                    from core.vanna_llm_factory import create_vanna_instance
+                    vn = create_vanna_instance()
+                    training_data = vn.get_training_data()
+                    
+                    if training_data is not None and not training_data.empty:
+                        total_records = len(training_data)
+                        self.logger.info(f"✅ 成功加载 {total_records} 条训练数据")
+                        
+                        # 统计数据类型
+                        if 'training_data_type' in training_data.columns:
+                            type_counts = training_data['training_data_type'].value_counts().to_dict()
+                        else:
+                            type_counts = {}
+                    else:
+                        total_records = 0
+                        type_counts = {}
+                        self.logger.warning("⚠️ 未能验证训练数据加载结果")
+                        
+                except Exception as e:
+                    self.logger.warning(f"⚠️ 验证训练数据时出错: {e}")
+                    total_records = 0
+                    type_counts = {}
+                
+                # 记录结果
+                self.workflow_state["completed_steps"].append("training_data_load")
+                self.workflow_state["artifacts"]["training_data_load"] = {
+                    "training_data_dir": training_data_dir,
+                    "load_successful": True,
+                    "total_records": total_records,
+                    "data_type_counts": type_counts,
+                    "duration": step_duration
+                }
+                self.workflow_state["statistics"]["step4_duration"] = step_duration
+                
+                self.logger.info(f"✅ 步骤4完成: 成功加载训练数据,耗时 {step_duration:.2f}秒")
+                
+            else:
+                raise Exception("训练数据加载失败:未找到可处理的训练文件")
+                
+        except Exception as e:
+            self.workflow_state["failed_steps"].append("training_data_load")
+            self.logger.error(f"❌ 步骤4失败: {str(e)}")
+            raise
+    
    async def _generate_final_report(self) -> Dict[str, Any]:
        """生成最终工作流程报告"""
        total_duration = self.workflow_state["end_time"] - self.workflow_state["start_time"]
@@ -305,12 +416,14 @@ class SchemaWorkflowOrchestrator:
                 "output_directory": str(self.output_dir),
                 "output_directory": str(self.output_dir),
                 "enable_sql_validation": self.enable_sql_validation,
                 "enable_sql_validation": self.enable_sql_validation,
                 "enable_llm_repair": self.enable_llm_repair,
                 "enable_llm_repair": self.enable_llm_repair,
-                "modify_original_file": self.modify_original_file
+                "modify_original_file": self.modify_original_file,
+                "enable_training_data_load": self.enable_training_data_load
            },
            "processing_results": {
                "ddl_md_generation": self.workflow_state["artifacts"].get("ddl_md_generation", {}),
                "question_sql_generation": self.workflow_state["artifacts"].get("question_sql_generation", {}),
-                "sql_validation": self.workflow_state["artifacts"].get("sql_validation", {})
+                "sql_validation": self.workflow_state["artifacts"].get("sql_validation", {}),
+                "training_data_load": self.workflow_state["artifacts"].get("training_data_load", {})
            },
            "final_outputs": {
                "primary_output_file": final_output_file,
@@ -322,6 +435,7 @@ class SchemaWorkflowOrchestrator:
                 "step1_duration": round(self.workflow_state["statistics"].get("step1_duration", 0), 2),
                 "step1_duration": round(self.workflow_state["statistics"].get("step1_duration", 0), 2),
                 "step2_duration": round(self.workflow_state["statistics"].get("step2_duration", 0), 2),
                 "step2_duration": round(self.workflow_state["statistics"].get("step2_duration", 0), 2),
                 "step3_duration": round(self.workflow_state["statistics"].get("step3_duration", 0), 2),
                 "step3_duration": round(self.workflow_state["statistics"].get("step3_duration", 0), 2),
+                "step4_duration": round(self.workflow_state["statistics"].get("step4_duration", 0), 2),
                 "total_duration": round(total_duration, 2)
                 "total_duration": round(total_duration, 2)
             }
             }
         }
         }
@@ -421,28 +535,32 @@ def setup_argument_parser():
         epilog="""
         epilog="""
 示例用法:
 示例用法:
   # 完整工作流程
   # 完整工作流程
-  python -m schema_tools.schema_workflow_orchestrator \\
-    --db-connection "postgresql://user:pass@localhost:5432/dbname" \\
+  python -m data_pipeline.schema_workflow \\
+    --db-connection "postgresql://user:pass@localhost:5432/highway_db" \\
     --table-list tables.txt \\
     --table-list tables.txt \\
     --business-context "高速公路服务区管理系统" \\
     --business-context "高速公路服务区管理系统" \\
-    --db-name highway_db \\
-    --output-dir ./output
+    --output-dir ./data_pipeline/training_data/
   
   
   # 跳过SQL验证
   # 跳过SQL验证
-  python -m schema_tools.schema_workflow_orchestrator \\
-    --db-connection "postgresql://user:pass@localhost:5432/dbname" \\
+  python -m data_pipeline.schema_workflow \\
+    --db-connection "postgresql://user:pass@localhost:5432/ecommerce_db" \\
     --table-list tables.txt \\
     --table-list tables.txt \\
     --business-context "电商系统" \\
     --business-context "电商系统" \\
-    --db-name ecommerce_db \\
     --skip-validation
     --skip-validation
   
   
   # 禁用LLM修复
   # 禁用LLM修复
-  python -m schema_tools.schema_workflow_orchestrator \\
-    --db-connection "postgresql://user:pass@localhost:5432/dbname" \\
+  python -m data_pipeline.schema_workflow \\
+    --db-connection "postgresql://user:pass@localhost:5432/management_db" \\
     --table-list tables.txt \\
     --table-list tables.txt \\
     --business-context "管理系统" \\
     --business-context "管理系统" \\
-    --db-name management_db \\
     --disable-llm-repair
     --disable-llm-repair
+  
+  # 跳过训练数据加载
+  python -m data_pipeline.schema_workflow \\
+    --db-connection "postgresql://user:pass@localhost:5432/management_db" \\
+    --table-list tables.txt \\
+    --business-context "管理系统" \\
+    --skip-training-load
         """
         """
     )
     )
     
     
@@ -465,17 +583,12 @@ def setup_argument_parser():
         help="业务上下文描述"
         help="业务上下文描述"
     )
     )
     
     
-    parser.add_argument(
-        "--db-name",
-        required=True,
-        help="数据库名称(用于生成文件名)"
-    )
    
    # 可选参数
    parser.add_argument(
        "--output-dir",
-        default="./output",
-        help="输出目录(默认:./output)"
+        default="./data_pipeline/training_data/",
+        help="输出目录(默认:./data_pipeline/training_data/)"
    )
    
    parser.add_argument(
@@ -496,6 +609,12 @@ def setup_argument_parser():
         help="不修改原始JSON文件(仅生成报告)"
         help="不修改原始JSON文件(仅生成报告)"
     )
     )
     
     
+    parser.add_argument(
+        "--skip-training-load",
+        action="store_true",
+        help="跳过训练数据加载步骤"
+    )
+    
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
@@ -535,11 +654,11 @@ async def main():
            db_connection=args.db_connection,
            table_list_file=args.table_list,
            business_context=args.business_context,
-            db_name=args.db_name,
            output_dir=args.output_dir,
            enable_sql_validation=not args.skip_validation,
            enable_llm_repair=not args.disable_llm_repair,
-            modify_original_file=not args.no_modify_file
+            modify_original_file=not args.no_modify_file,
+            enable_training_data_load=not args.skip_training_load
        )
        
        # 显示启动信息
@@ -547,9 +666,10 @@ async def main():
         print(f"📁 输出目录: {args.output_dir}")
         print(f"📁 输出目录: {args.output_dir}")
         print(f"📋 表清单: {args.table_list}")
         print(f"📋 表清单: {args.table_list}")
         print(f"🏢 业务背景: {args.business_context}")
         print(f"🏢 业务背景: {args.business_context}")
-        print(f"💾 数据库: {args.db_name}")
+        print(f"💾 数据库: {orchestrator.db_name}")
         print(f"🔍 SQL验证: {'启用' if not args.skip_validation else '禁用'}")
         print(f"🔍 SQL验证: {'启用' if not args.skip_validation else '禁用'}")
         print(f"🔧 LLM修复: {'启用' if not args.disable_llm_repair else '禁用'}")
         print(f"🔧 LLM修复: {'启用' if not args.disable_llm_repair else '禁用'}")
+        print(f"🎯 训练数据加载: {'启用' if not args.skip_training_load else '禁用'}")
         
         
         # 执行完整工作流程
         # 执行完整工作流程
         report = await orchestrator.execute_complete_workflow()
         report = await orchestrator.execute_complete_workflow()

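The `_extract_db_name_from_connection` helper added above splits the URL by hand. For comparison, an equivalent sketch using the standard library's `urllib.parse` (an alternative shown for clarity, not the code this commit ships):

```python
from urllib.parse import urlparse

def extract_db_name(connection_string: str) -> str:
    """Return the database name from a postgresql:// URL, or 'database' as a fallback."""
    try:
        path = urlparse(connection_string).path  # e.g. "/highway_db"
        db_name = path.lstrip("/")
        # Query parameters are already excluded by urlparse; keep the same fallback.
        return db_name or "database"
    except Exception:
        return "database"

print(extract_db_name("postgresql://user:pass@localhost:5432/highway_db?sslmode=require"))
# -> highway_db
```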
+ 0 - 0
schema_tools/tables.txt → data_pipeline/tables.txt


+ 0 - 0
schema_tools/tools/__init__.py → data_pipeline/tools/__init__.py


+ 2 - 2
schema_tools/tools/base.py → data_pipeline/tools/base.py

@@ -3,7 +3,7 @@ import time
import logging
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, Type, List
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext

class ToolRegistry:
    """工具注册管理器"""
@@ -143,7 +143,7 @@ class PipelineExecutor:
                
                # 如果步骤失败且不允许继续,则停止
                if not result.success:
-                    from schema_tools.config import SCHEMA_TOOLS_CONFIG
+                    from data_pipeline.config import SCHEMA_TOOLS_CONFIG
                     if not SCHEMA_TOOLS_CONFIG["continue_on_error"]:
                     if not SCHEMA_TOOLS_CONFIG["continue_on_error"]:
                         self.logger.error(f"步骤 {step_name} 失败,停止处理链执行")
                         self.logger.error(f"步骤 {step_name} 失败,停止处理链执行")
                         break
                         break

+ 5 - 5
schema_tools/tools/comment_generator.py → data_pipeline/tools/comment_generator.py

@@ -1,7 +1,7 @@
import asyncio
from typing import List, Dict, Any, Tuple
-from schema_tools.tools.base import BaseTool, ToolRegistry
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo
+from data_pipeline.tools.base import BaseTool, ToolRegistry
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo

@ToolRegistry.register("comment_generator")
class CommentGeneratorTool(BaseTool):
@@ -242,7 +242,7 @@ class CommentGeneratorTool(BaseTool):
    
    async def _call_llm_with_retry(self, prompt: str, max_retries: int = 3) -> str:
        """带重试的LLM调用"""
-        from schema_tools.config import SCHEMA_TOOLS_CONFIG
+        from data_pipeline.config import SCHEMA_TOOLS_CONFIG
        
        for attempt in range(max_retries):
            try:
@@ -342,8 +342,8 @@ class CommentGeneratorTool(BaseTool):
    
    async def _validate_enum_suggestions(self, table_metadata, enum_suggestions: List[Dict]) -> List[Dict]:
        """验证枚举建议"""
-        from schema_tools.tools.database_inspector import DatabaseInspectorTool
-        from schema_tools.config import SCHEMA_TOOLS_CONFIG
+        from data_pipeline.tools.database_inspector import DatabaseInspectorTool
+        from data_pipeline.config import SCHEMA_TOOLS_CONFIG
        
        validated_enums = []
        inspector = ToolRegistry.get_tool("database_inspector")

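The `_call_llm_with_retry(prompt, max_retries=3)` signature kept in this hunk implies a bounded retry loop around the LLM call. A generic sketch of that pattern, with exponential backoff added as an assumption (the project's actual implementation in `comment_generator.py` may differ):

```python
import asyncio

async def call_llm_with_retry(call, prompt: str, max_retries: int = 3, base_delay: float = 1.0):
    """Generic async retry wrapper; `call` is any awaitable LLM invocation (assumes max_retries >= 1)."""
    last_error = None
    for attempt in range(max_retries):
        try:
            return await call(prompt)
        except Exception as e:  # real code would narrow this to transient errors
            last_error = e
            await asyncio.sleep(base_delay * (2 ** attempt))  # exponential backoff
    raise last_error
```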
+ 5 - 5
schema_tools/tools/data_sampler.py → data_pipeline/tools/data_sampler.py

@@ -1,7 +1,7 @@
import random
from typing import List, Dict, Any
-from schema_tools.tools.base import BaseTool, ToolRegistry
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext, TableMetadata
+from data_pipeline.tools.base import BaseTool, ToolRegistry
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext, TableMetadata

@ToolRegistry.register("data_sampler")
class DataSamplerTool(BaseTool):
@@ -17,7 +17,7 @@ class DataSamplerTool(BaseTool):
    async def execute(self, context: TableProcessingContext) -> ProcessingResult:
        """执行数据采样"""
        try:
-            from schema_tools.config import SCHEMA_TOOLS_CONFIG
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
            
            table_metadata = context.table_metadata
            sample_limit = SCHEMA_TOOLS_CONFIG["sample_data_limit"]
@@ -51,7 +51,7 @@ class DataSamplerTool(BaseTool):
    
    async def _simple_sample(self, table_metadata: TableMetadata, limit: int) -> List[Dict[str, Any]]:
        """简单采样策略"""
-        from schema_tools.tools.database_inspector import DatabaseInspectorTool
+        from data_pipeline.tools.database_inspector import DatabaseInspectorTool
        
        # 复用数据库检查工具的连接
        inspector = ToolRegistry.get_tool("database_inspector")
@@ -64,7 +64,7 @@ class DataSamplerTool(BaseTool):
    
    async def _smart_sample_large_table(self, table_metadata: TableMetadata, limit: int) -> List[Dict[str, Any]]:
        """智能采样策略(用于大表)"""
-        from schema_tools.tools.database_inspector import DatabaseInspectorTool
+        from data_pipeline.tools.database_inspector import DatabaseInspectorTool
        
        inspector = ToolRegistry.get_tool("database_inspector")
        samples_per_section = max(1, limit // 3)

+ 2 - 2
schema_tools/tools/database_inspector.py → data_pipeline/tools/database_inspector.py

@@ -1,8 +1,8 @@
import asyncio
import asyncpg
from typing import List, Dict, Any, Optional
-from schema_tools.tools.base import BaseTool, ToolRegistry
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
+from data_pipeline.tools.base import BaseTool, ToolRegistry
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata

@ToolRegistry.register("database_inspector")
class DatabaseInspectorTool(BaseTool):

+ 3 - 3
schema_tools/tools/ddl_generator.py → data_pipeline/tools/ddl_generator.py

@@ -1,8 +1,8 @@
import os
from typing import List, Dict, Any
-from schema_tools.tools.base import BaseTool, ToolRegistry
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.tools.base import BaseTool, ToolRegistry
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG

@ToolRegistry.register("ddl_generator")
class DDLGeneratorTool(BaseTool):

+ 3 - 3
schema_tools/tools/doc_generator.py → data_pipeline/tools/doc_generator.py

@@ -1,8 +1,8 @@
import os
from typing import List, Dict, Any
-from schema_tools.tools.base import BaseTool, ToolRegistry
-from schema_tools.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.tools.base import BaseTool, ToolRegistry
+from data_pipeline.utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG

@ToolRegistry.register("doc_generator")
class DocGeneratorTool(BaseTool):

+ 1 - 0
data_pipeline/trainer/__init__.py

@@ -0,0 +1 @@
+# Trainer module for Vanna training

+ 18 - 7
training/run_training.py → data_pipeline/trainer/run_training.py

@@ -11,7 +11,7 @@ from pathlib import Path
from sqlalchemy import create_engine


-from vanna_trainer import (
+from .vanna_trainer import (
    train_ddl,
    train_documentation,
    train_sql_example,
@@ -467,7 +467,13 @@ def main():
    # 获取默认路径并进行智能处理
    def resolve_training_data_path():
        """智能解析训练数据路径"""
-        config_path = getattr(app_config, 'TRAINING_DATA_PATH', './training/data')
+        # 使用data_pipeline统一配置
+        try:
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+            config_path = SCHEMA_TOOLS_CONFIG.get("output_directory", './data_pipeline/training_data/')
+        except ImportError:
+            # 如果无法导入data_pipeline配置,使用默认路径
+            config_path = './data_pipeline/training_data/'
        
        # 如果是绝对路径,直接返回
        if os.path.isabs(config_path):
@@ -475,17 +481,17 @@ def main():
        
        # 如果以 . 开头,相对于项目根目录解析
        if config_path.startswith('./') or config_path.startswith('../'):
-            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+            project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            return os.path.join(project_root, config_path)
        
        # 其他情况,相对于项目根目录
-        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        return os.path.join(project_root, config_path)
    
    default_path = resolve_training_data_path()
    
    parser.add_argument('--data_path', type=str, default=default_path,
-                        help='训练数据目录路径 (默认: 从app_config.TRAINING_DATA_PATH)')
+                        help='训练数据目录路径 (默认: 从data_pipeline.config.SCHEMA_TOOLS_CONFIG)')
    args = parser.parse_args()
    
    # 使用Path对象处理路径以确保跨平台兼容性
@@ -493,12 +499,17 @@ def main():
    
    # 显示路径解析结果
    print(f"\n===== 训练数据路径配置 =====")
-    print(f"配置文件中的路径: {getattr(app_config, 'TRAINING_DATA_PATH', '未配置')}")
+    try:
+        from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+        config_value = SCHEMA_TOOLS_CONFIG.get("output_directory", "未配置")
+        print(f"data_pipeline配置路径: {config_value}")
+    except ImportError:
+        print(f"data_pipeline配置: 无法导入")
     print(f"解析后的绝对路径: {os.path.abspath(data_path)}")
     print(f"解析后的绝对路径: {os.path.abspath(data_path)}")
     print("==============================")
     print("==============================")
     
     
     # 设置正确的项目根目录路径
     # 设置正确的项目根目录路径
-    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    # 检查嵌入模型连接
    check_embedding_model_connection()

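The reworked `resolve_training_data_path` above reads the directory from `SCHEMA_TOOLS_CONFIG["output_directory"]` and climbs three directory levels to the project root because the module now lives under `data_pipeline/trainer/`. A condensed `pathlib` sketch of the same idea (illustrative only, not the shipped code):

```python
import os
from pathlib import Path

def resolve_training_data_path() -> str:
    """Resolve the training data directory relative to the project root."""
    try:
        from data_pipeline.config import SCHEMA_TOOLS_CONFIG
        config_path = SCHEMA_TOOLS_CONFIG.get("output_directory", "./data_pipeline/training_data/")
    except ImportError:
        config_path = "./data_pipeline/training_data/"

    if os.path.isabs(config_path):
        return config_path

    # data_pipeline/trainer/run_training.py -> project root is three levels up.
    project_root = Path(__file__).resolve().parents[2]
    return str(project_root / config_path)
```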
+ 2 - 2
training/vanna_trainer.py → data_pipeline/trainer/vanna_trainer.py

@@ -9,11 +9,11 @@ from collections import defaultdict
from typing import List, Dict, Any, Tuple, Optional, Union, Callable
import sys
import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
import app_config

# 设置正确的项目根目录路径
-project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
 # 创建vanna实例
 # 创建vanna实例
 from core.vanna_llm_factory import create_vanna_instance
 from core.vanna_llm_factory import create_vanna_instance
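
The same three-level dirname pattern appears here so that top-level modules such as app_config remain importable after the module moved under data_pipeline/trainer/. A hedged sketch of the bootstrap, for illustration only:

import os
import sys

# Assumed location: __file__ == "<root>/data_pipeline/trainer/vanna_trainer.py"
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if project_root not in sys.path:
    sys.path.append(project_root)  # make root-level modules (e.g. app_config) importable

import app_config  # resolved from the project root once sys.path is patched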

+ 31 - 0
data_pipeline/training_data/bss_business_day_data.ddl

@@ -0,0 +1,31 @@
+-- 中文名: 存储各服务区每日业务统计数据(如车流、销售等)
+-- 描述: 存储各服务区每日业务统计数据(如车流、销售等),支持经营分析
+create table public.bss_business_day_data (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建者,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新者,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除者,
+  oper_date date              -- 统计日期,
+  service_no varchar(255)     -- 服务区编码,
+  service_name varchar(255)   -- 服务区名称,
+  branch_no varchar(255)      -- 档口编码,
+  branch_name varchar(255)    -- 档口名称,
+  wx numeric(19,4)            -- 微信支付金额,
+  wx_order integer            -- 微信订单数量,
+  zfb numeric(19,4)           -- 支付宝支付金额,
+  zf_order integer            -- 支付宝订单数量,
+  rmb numeric(19,4)           -- 现金支付金额,
+  rmb_order integer           -- 现金支付订单数,
+  xs numeric(19,4)            -- 行吧支付金额,
+  xs_order integer            -- 行吧支付数量,
+  jd numeric(19,4)            -- 金豆支付金额,
+  jd_order integer            -- 金豆支付数量,
+  order_sum integer           -- 订单总数,
+  pay_sum numeric(19,4)       -- 总支付金额,
+  source_type integer         -- 数据来源类别,
+  primary key (id)
+);

+ 32 - 0
data_pipeline/training_data/bss_business_day_data_detail.md

@@ -0,0 +1,32 @@
+## bss_business_day_data(存储各服务区每日业务统计数据(如车流、销售等))
+bss_business_day_data 表存储各服务区每日业务统计数据(如车流、销售等),支持经营分析
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- created_by (varchar(50)) - 创建者 [示例: xingba]
+- update_ts (timestamp) - 更新时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- updated_by (varchar(50)) - 更新者
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除者
+- oper_date (date) - 统计日期 [示例: 2023-04-01]
+- service_no (varchar(255)) - 服务区编码 [示例: 1028, H0501]
+- service_name (varchar(255)) - 服务区名称 [示例: 宜春服务区, 庐山服务区]
+- branch_no (varchar(255)) - 档口编码 [示例: 1, H05016]
+- branch_name (varchar(255)) - 档口名称 [示例: 宜春南区, 庐山鲜徕客东区]
+- wx (numeric(19,4)) - 微信支付金额 [示例: 4790.0000, 2523.0000]
+- wx_order (integer) - 微信订单数量 [示例: 253, 133]
+- zfb (numeric(19,4)) - 支付宝支付金额 [示例: 229.0000, 0.0000]
+- zf_order (integer) - 支付宝订单数量 [示例: 15, 0]
+- rmb (numeric(19,4)) - 现金支付金额 [示例: 1058.5000, 124.0000]
+- rmb_order (integer) - 现金支付订单数 [示例: 56, 12]
+- xs (numeric(19,4)) - 行吧支付金额 [示例: 0.0000, 40.0000]
+- xs_order (integer) - 行吧支付数量 [示例: 0, 1]
+- jd (numeric(19,4)) - 金豆支付金额 [示例: 0.0000]
+- jd_order (integer) - 金豆支付数量 [示例: 0]
+- order_sum (integer) - 订单总数 [示例: 324, 146]
+- pay_sum (numeric(19,4)) - 总支付金额 [示例: 6077.5000, 2687.0000]
+- source_type (integer) - 数据来源类别 [示例: 1, 0, 4]
+字段补充说明:
+- id 为主键
+- source_type 为枚举字段,包含取值:0、4、1、2、3

+ 17 - 0
data_pipeline/training_data/bss_car_day_count.ddl

@@ -0,0 +1,17 @@
+-- 中文名: 服务区每日车辆统计表(按车型分类记录通行车流量及用户数量)
+-- 描述: 服务区每日车辆统计表(按车型分类记录通行车流量及用户数量,用于服务区运营分析与资源规划)
+create table public.bss_car_day_count (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 最后更新时间,
+  updated_by varchar(50)      -- 最后更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  customer_count bigint       -- 车辆通行量,
+  car_type varchar(100)       -- 车辆类型,
+  count_date date             -- 统计日期,
+  service_area_id varchar(32) -- 服务区ID,
+  primary key (id)
+);

+ 18 - 0
data_pipeline/training_data/bss_car_day_count_detail.md

@@ -0,0 +1,18 @@
+## bss_car_day_count(服务区每日车辆统计表(按车型分类记录通行车流量及用户数量))
+bss_car_day_count 表服务区每日车辆统计表(按车型分类记录通行车流量及用户数量,用于服务区运营分析与资源规划)
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- created_by (varchar(50)) - 创建人
+- update_ts (timestamp) - 最后更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- updated_by (varchar(50)) - 最后更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- customer_count (bigint) - 车辆通行量 [示例: 1114, 295]
+- car_type (varchar(100)) - 车辆类型 [示例: 其他]
+- count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
+- service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
+字段补充说明:
+- id 为主键
+- car_type 为枚举字段,包含取值:其他、危化品、城际、过境

+ 15 - 0
data_pipeline/training_data/bss_company.ddl

@@ -0,0 +1,15 @@
+-- 中文名: 存储高速公路管理公司信息
+-- 描述: 存储高速公路管理公司信息,用于服务区运营管理
+create table public.bss_company (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  company_name varchar(255)   -- 公司名称,
+  company_no varchar(255)     -- 公司编码,
+  primary key (id)
+);

+ 15 - 0
data_pipeline/training_data/bss_company_detail.md

@@ -0,0 +1,15 @@
+## bss_company(存储高速公路管理公司信息)
+bss_company 表存储高速公路管理公司信息,用于服务区运营管理
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 30675d85ba5044c31acfa243b9d16334, 47ed0bb37f5a85f3d9245e4854959b81]
+- version (integer) - 版本号 [非空] [示例: 1, 2]
+- create_ts (timestamp) - 创建时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
+- updated_by (varchar(50)) - 更新人 [示例: admin]
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- company_name (varchar(255)) - 公司名称 [示例: 上饶分公司, 宜春分公司]
+- company_no (varchar(255)) - 公司编码 [示例: H03, H02]
+字段补充说明:
+- id 为主键

+ 16 - 0
data_pipeline/training_data/bss_section_route.ddl

@@ -0,0 +1,16 @@
+-- 中文名: 业务支撑系统路段与路线基础信息表
+-- 描述: 业务支撑系统路段与路线基础信息表
+create table public.bss_section_route (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 数据版本号,
+  create_ts timestamp         -- 创建时间戳,
+  created_by varchar(50)      -- 创建人标识,
+  update_ts timestamp         -- 最后更新时间,
+  updated_by varchar(50)      -- 最后更新人,
+  delete_ts timestamp         -- 删除时间戳,
+  deleted_by varchar(50)      -- 删除操作人,
+  section_name varchar(255)   -- 所属路段名称,
+  route_name varchar(255)     -- 关联路线名称,
+  code varchar(255)           -- 路段编码编号,
+  primary key (id)
+);

+ 7 - 0
data_pipeline/training_data/bss_section_route_area_link.ddl

@@ -0,0 +1,7 @@
+-- 中文名: 记录路段路线与服务区的绑定关系
+-- 描述: 记录路段路线与服务区的绑定关系,用于路径导航及服务区资源管理。
+create table public.bss_section_route_area_link (
+  section_route_id varchar(32) not null -- 路段路线ID,主键,
+  service_area_id varchar(32) not null -- 服务区ID,主键,
+  primary key (section_route_id, service_area_id)
+);

+ 7 - 0
data_pipeline/training_data/bss_section_route_area_link_detail.md

@@ -0,0 +1,7 @@
+## bss_section_route_area_link(记录路段路线与服务区的绑定关系)
+bss_section_route_area_link 表记录路段路线与服务区的绑定关系,用于路径导航及服务区资源管理。
+字段列表:
+- section_route_id (varchar(32)) - 路段路线ID [主键, 非空] [示例: v8elrsfs5f7lt7jl8a6p87smfzesn3rz, hxzi2iim238e3s1eajjt1enmh9o4h3wp]
+- service_area_id (varchar(32)) - 服务区ID [主键, 非空] [示例: 08e01d7402abd1d6a4d9fdd5df855ef8, 091662311d2c737029445442ff198c4c]
+字段补充说明:
+- 复合主键:section_route_id, service_area_id

+ 16 - 0
data_pipeline/training_data/bss_section_route_detail.md

@@ -0,0 +1,16 @@
+## bss_section_route(业务支撑系统路段与路线基础信息表)
+bss_section_route 表业务支撑系统路段与路线基础信息表
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 04ri3j67a806uw2c6o6dwdtz4knexczh, 0g5mnefxxtukql2cq6acul7phgskowy7]
+- version (integer) - 数据版本号 [非空] [示例: 1, 0]
+- create_ts (timestamp) - 创建时间戳 [示例: 2021-10-29 19:43:50, 2022-03-04 16:07:16]
+- created_by (varchar(50)) - 创建人标识 [示例: admin]
+- update_ts (timestamp) - 最后更新时间
+- updated_by (varchar(50)) - 最后更新人
+- delete_ts (timestamp) - 删除时间戳
+- deleted_by (varchar(50)) - 删除操作人
+- section_name (varchar(255)) - 所属路段名称 [示例: 昌栗, 昌宁]
+- route_name (varchar(255)) - 关联路线名称 [示例: 昌栗, 昌韶]
+- code (varchar(255)) - 路段编码编号 [示例: SR0001, SR0002]
+字段补充说明:
+- id 为主键

+ 19 - 0
data_pipeline/training_data/bss_service_area.ddl

@@ -0,0 +1,19 @@
+-- 中文名: 存储高速公路服务区基本信息
+-- 描述: 存储高速公路服务区基本信息,包含服务区名称、编码及版本控制字段,用于管理服务区全生命周期信息。
+create table public.bss_service_area (
+  id varchar(32) not null     -- 主键标识,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  service_area_name varchar(255) -- 服务区名称,
+  service_area_no varchar(255) -- 服务区编码,
+  company_id varchar(32)      -- 所属公司ID,
+  service_position varchar(255) -- 地理位置坐标,
+  service_area_type varchar(50) -- 服务区类型,
+  service_state varchar(50)   -- 运营状态,
+  primary key (id)
+);

+ 21 - 0
data_pipeline/training_data/bss_service_area_detail.md

@@ -0,0 +1,21 @@
+## bss_service_area(存储高速公路服务区基本信息)
+bss_service_area 表存储高速公路服务区基本信息,包含服务区名称、编码及版本控制字段,用于管理服务区全生命周期信息。
+字段列表:
+- id (varchar(32)) - 主键标识 [主键, 非空] [示例: 0271d68ef93de9684b7ad8c7aae600b6, 08e01d7402abd1d6a4d9fdd5df855ef8]
+- version (integer) - 版本号 [非空] [示例: 3, 6]
+- create_ts (timestamp) - 创建时间 [示例: 2021-05-21 13:26:40.589000, 2021-05-20 19:51:46.314000]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2021-07-10 15:41:28.795000, 2021-07-11 09:33:08.455000]
+- updated_by (varchar(50)) - 更新人 [示例: admin]
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人 [示例: ]
+- service_area_name (varchar(255)) - 服务区名称 [示例: 白鹭湖停车区, 南昌南服务区]
+- service_area_no (varchar(255)) - 服务区编码 [示例: H0814, H0105]
+- company_id (varchar(32)) - 所属公司ID [示例: b1629f07c8d9ac81494fbc1de61f1ea5, ee9bf1180a2b45003f96e597a4b7f15a]
+- service_position (varchar(255)) - 地理位置坐标 [示例: 114.574721,26.825584, 115.910549,28.396355]
+- service_area_type (varchar(50)) - 服务区类型 [示例: 信息化服务区]
+- service_state (varchar(50)) - 运营状态 [示例: 开放, 关闭]
+字段补充说明:
+- id 为主键
+- service_area_type 为枚举字段,包含取值:信息化服务区、智能化服务区
+- service_state 为枚举字段,包含取值:开放、关闭、上传数据

+ 18 - 0
data_pipeline/training_data/bss_service_area_mapper.ddl

@@ -0,0 +1,18 @@
+-- 中文名: 服务区信息映射表
+-- 描述: 服务区信息映射表,用于唯一标识和版本管理,支撑服务区数据维护。
+create table public.bss_service_area_mapper (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人账号,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人账号,
+  delete_ts timestamp         -- 删除时间(软删除),
+  deleted_by varchar(50)      -- 删除人账号,
+  service_name varchar(255)   -- 服务区名称,
+  service_no varchar(255)     -- 服务区编码,
+  service_area_id varchar(32) -- 关联服务区ID,
+  source_system_type varchar(50) -- 数据来源系统类别,
+  source_type integer         -- 数据来源类别ID,
+  primary key (id)
+);

+ 19 - 0
data_pipeline/training_data/bss_service_area_mapper_detail.md

@@ -0,0 +1,19 @@
+## bss_service_area_mapper(服务区信息映射表)
+bss_service_area_mapper 表服务区信息映射表,用于唯一标识和版本管理,支撑服务区数据维护。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00e1e893909211ed8ee6fa163eaf653f, 013867f5962211ed8ee6fa163eaf653f]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-01-10 10:54:03, 2023-01-17 12:47:29]
+- created_by (varchar(50)) - 创建人账号 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2023-01-10 10:54:07, 2023-01-17 12:47:32]
+- updated_by (varchar(50)) - 更新人账号
+- delete_ts (timestamp) - 删除时间(软删除)
+- deleted_by (varchar(50)) - 删除人账号
+- service_name (varchar(255)) - 服务区名称 [示例: 信丰西服务区, 南康北服务区]
+- service_no (varchar(255)) - 服务区编码 [示例: 1067, 1062]
+- service_area_id (varchar(32)) - 关联服务区ID [示例: 97cd6cd516a551409a4d453a58f9e170, fdbdd042962011ed8ee6fa163eaf653f]
+- source_system_type (varchar(50)) - 数据来源系统类别 [示例: 驿美, 驿购]
+- source_type (integer) - 数据来源类别ID [示例: 3, 1]
+字段补充说明:
+- id 为主键
+- source_system_type 为枚举字段,包含取值:司乘管理、商业管理、驿购、驿美、手工录入

+ 10 - 0
data_pipeline/training_data/filename_mapping.txt

@@ -0,0 +1,10 @@
+# 文件名映射报告
+# 格式: 原始表名 -> 实际文件名
+
+public.bss_business_day_data -> bss_business_day_data_detail.md
+public.bss_car_day_count -> bss_car_day_count_detail.md
+public.bss_company -> bss_company_detail.md
+public.bss_section_route -> bss_section_route_detail.md
+public.bss_section_route_area_link -> bss_section_route_area_link_detail.md
+public.bss_service_area -> bss_service_area_detail.md
+public.bss_service_area_mapper -> bss_service_area_mapper_detail.md
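
The mapping report uses a simple "原始表名 -> 实际文件名" line format with # comments. A minimal sketch of how such a file could be parsed; the helper below is an assumption for illustration, not part of this commit:

from pathlib import Path

def load_filename_mapping(path: str) -> dict:
    """Parse lines of the form 'public.table -> table_detail.md', skipping comments."""
    mapping = {}
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        table, _, filename = line.partition("->")
        mapping[table.strip()] = filename.strip()
    return mapping

# mapping = load_filename_mapping("data_pipeline/training_data/filename_mapping.txt")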

+ 62 - 0
data_pipeline/training_data/metadata.txt

@@ -0,0 +1,62 @@
+-- Schema Tools生成的主题元数据
+-- 业务背景: 高速公路服务区管理系统
+-- 生成时间: 2025-06-26 12:32:02
+-- 数据库: highway_db
+
+-- 创建表(如果不存在)
+CREATE TABLE IF NOT EXISTS metadata (
+    id SERIAL PRIMARY KEY,
+    topic_name VARCHAR(100) NOT NULL,
+    description TEXT,
+    related_tables TEXT[],
+    keywords TEXT[],
+    focus_areas TEXT[],
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+-- 插入主题数据
+INSERT INTO metadata(topic_name, description, related_tables, keywords, focus_areas) VALUES
+(
+  '日营收分析',
+  '基于bss_business_day_data表分析各服务区每日营收结构及支付方式占比,优化资金管理策略',
+  '{bss_business_day_data,bss_service_area}',
+  '{营收,支付方式,日统计,服务区}',
+  '{收入趋势,服务区对比,支付方式分布}'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, keywords, focus_areas) VALUES
+(
+  '车流特征分析',
+  '通过bss_car_day_count表研究不同车型通行规律,为服务区设施配置和营销策略提供数据支持',
+  '{bss_car_day_count,bss_service_area}',
+  '{车流量,车型分类,通行规律,运营规划}',
+  '{高峰时段分析,车型分布,车流与营收关联}'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, keywords, focus_areas) VALUES
+(
+  '公司效能对比',
+  '结合bss_company和bss_service_area表,评估各管理公司下属服务区运营效能及资源利用率',
+  '{bss_company,bss_service_area,bss_business_day_data}',
+  '{分公司,效能评估,资源利用,横向对比}',
+  '{营收能力对比,服务区密度分析,运营成本关联}'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, keywords, focus_areas) VALUES
+(
+  '路线关联分析',
+  '通过bss_section_route_area_link表分析路段-服务区关联关系,优化路网导航与资源调度',
+  '{bss_section_route_area_link,bss_section_route,bss_car_day_count}',
+  '{路段关联,资源调度,导航优化,车流分布}',
+  '{路段车流分布,服务区覆盖分析,路线-营收关联}'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, keywords, focus_areas) VALUES
+(
+  '运营状态监测',
+  '基于bss_service_area表监控服务区运营状态变化,分析服务关闭对周边路网的影响',
+  '{bss_service_area,bss_car_day_count,bss_business_day_data}',
+  '{运营状态,服务关闭,影响评估,地理位置}',
+  '{状态变更趋势,地理分布影响,替代服务区效应}'
+);
+
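
Because related_tables, keywords and focus_areas are PostgreSQL TEXT[] columns, topic rows can be filtered with array operators. A hedged sketch, assuming psycopg2 and a reachable highway_db; the DSN and keyword are placeholders:

import psycopg2

# Placeholder DSN -- adjust to the actual highway_db connection.
conn = psycopg2.connect("postgresql://user:pass@localhost:5432/highway_db")
with conn, conn.cursor() as cur:
    # Find topics whose keyword array contains a given keyword.
    cur.execute(
        "SELECT topic_name, description FROM metadata WHERE %s = ANY(keywords)",
        ("营收",),
    )
    for topic_name, description in cur.fetchall():
        print(topic_name, description)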

+ 198 - 0
data_pipeline/training_data/qs_highway_db_20250626_123202_pair.json

@@ -0,0 +1,198 @@
+[
+  {
+    "question": "统计各服务区2023年4月1日当日总营收金额并按金额降序排列",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收金额 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_name ORDER BY SUM(pay_sum) DESC;"
+  },
+  {
+    "question": "分析2023年4月期间各支付方式(微信/支付宝/现金)金额占比分布情况",
+    "sql": "SELECT '微信' AS 支付方式, SUM(wx)/SUM(pay_sum)*100 AS 占比百分比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL UNION ALL SELECT '支付宝', SUM(zfb)/SUM(pay_sum)*100 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL UNION ALL SELECT '现金', SUM(rmb)/SUM(pay_sum)*100 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "查询近7天各服务区日均营收金额超过1万元的记录",
+    "sql": "SELECT service_name AS 服务区名称, oper_date AS 统计日期, pay_sum AS 营收金额 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND oper_date < CURRENT_DATE AND delete_ts IS NULL AND pay_sum > 10000 ORDER BY oper_date DESC;"
+  },
+  {
+    "question": "对比2023年4月1日与2023年3月31日各服务区营收金额变化率",
+    "sql": "WITH yesterday AS (SELECT service_name, pay_sum FROM bss_business_day_data WHERE oper_date = '2023-04-01'), today AS (SELECT service_name, pay_sum FROM bss_business_day_data WHERE oper_date = '2023-03-31') SELECT y.service_name, (t.pay_sum - y.pay_sum)/y.pay_sum*100 AS 变化率百分比 FROM yesterday y JOIN today t ON y.service_name = t.service_name;"
+  },
+  {
+    "question": "统计各服务区现金支付占比超过30%的记录并按占比排序",
+    "sql": "SELECT service_name AS 服务区名称, (SUM(rmb)/SUM(pay_sum))*100 AS 现金占比百分比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name HAVING (SUM(rmb)/SUM(pay_sum))*100 > 30 ORDER BY 现金占比百分比 DESC;"
+  },
+  {
+    "question": "查询宜春服务区2023年4月1日各支付方式明细金额",
+    "sql": "SELECT '微信' AS 支付方式, wx AS 金额 FROM bss_business_day_data WHERE service_name = '宜春服务区' AND oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '支付宝', zfb FROM bss_business_day_data WHERE service_name = '宜春服务区' AND oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '现金', rmb FROM bss_business_day_data WHERE service_name = '宜春服务区' AND oper_date = '2023-04-01' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "统计各公司管辖服务区的月度平均营收金额",
+    "sql": "SELECT c.company_name AS 管理公司, AVG(b.pay_sum) AS 平均营收金额 FROM bss_business_day_data b JOIN bss_service_area s ON b.service_name = s.service_area_name JOIN bss_company c ON s.company_id = c.id WHERE b.oper_date >= DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month' AND b.oper_date < DATE_TRUNC('month', CURRENT_DATE) AND b.delete_ts IS NULL GROUP BY c.company_name;"
+  },
+  {
+    "question": "查询微信支付占比最高的前5个服务区及具体占比",
+    "sql": "SELECT service_name AS 服务区名称, (SUM(wx)/SUM(pay_sum))*100 AS 微信占比百分比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name ORDER BY 微信占比百分比 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计各服务区日订单数与日营收金额的线性相关性系数",
+    "sql": "SELECT service_name AS 服务区名称, CORR(order_sum, pay_sum) AS 相关系数 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name HAVING CORR(order_sum, pay_sum) IS NOT NULL;"
+  },
+  {
+    "question": "查询国庆假期期间(2023-10-01至2023-10-07)各服务区总营收排名",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收金额 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_name ORDER BY SUM(pay_sum) DESC LIMIT 10;"
+  },
+  {
+    "question": "统计各服务区2023年4月车流量总和及日均车流量,并按日均车流量降序排列",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, SUM(bcc.customer_count) AS 总车流量, AVG(bcc.customer_count) AS 日均车流量 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id WHERE bcc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND bcc.delete_ts IS NULL GROUP BY bsa.service_area_name ORDER BY 日均车流量 DESC;"
+  },
+  {
+    "question": "查询危化品车辆在各路段的通行量占比,筛选出占比超过5%的路段",
+    "sql": "SELECT bsrl.route_name AS 路段名称, (COUNT(CASE WHEN bcc.car_type = '危化品' THEN 1 END) * 100.0 / COUNT(*)) AS 危化品占比 FROM bss_car_day_count bcc JOIN bss_section_route_area_link bsral ON bcc.service_area_id = bsral.service_area_id JOIN bss_section_route bsrl ON bsral.section_route_id = bsrl.id WHERE bcc.delete_ts IS NULL GROUP BY bsrl.route_name HAVING (COUNT(CASE WHEN bcc.car_type = '危化品' THEN 1 END) * 100.0 / COUNT(*)) > 5;"
+  },
+  {
+    "question": "分析近7天各时段(小时级)车流变化趋势,按小时聚合展示平均车流量",
+    "sql": "SELECT EXTRACT(HOUR FROM bcc.create_ts) AS 小时段, AVG(bcc.customer_count) AS 平均车流量 FROM bss_car_day_count bcc WHERE bcc.count_date >= CURRENT_DATE - 7 AND bcc.delete_ts IS NULL GROUP BY 小时段 ORDER BY 小时段;"
+  },
+  {
+    "question": "对比城际车辆与过境车辆在不同服务区类型的日均通行量差异",
+    "sql": "SELECT bsa.service_area_type AS 服务区类型, bcc.car_type AS 车辆类型, AVG(bcc.customer_count) AS 日均通行量 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id WHERE bcc.delete_ts IS NULL AND bcc.car_type IN ('城际', '过境') GROUP BY 服务区类型, 车辆类型 ORDER BY 服务区类型, 日均通行量 DESC;"
+  },
+  {
+    "question": "找出最近一个月车流量波动最大的5个服务区(使用标准差衡量波动)",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, STDDEV(bcc.customer_count) AS 车流量标准差 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id WHERE bcc.count_date >= CURRENT_DATE - 30 AND bcc.delete_ts IS NULL GROUP BY 服务区名称 ORDER BY 车流量标准差 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析各车型在不同运营状态服务区的通行分布,筛选出关闭状态服务区中其他类型车辆占比超过20%的记录",
+    "sql": "SELECT 服务区名称, 车型, 占比 FROM (SELECT bsa.service_area_name AS 服务区名称, bcc.car_type AS 车型, (COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(PARTITION BY bsa.id)) AS 占比 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id WHERE bsa.service_state = '关闭' AND bcc.delete_ts IS NULL GROUP BY bsa.service_area_name, bcc.car_type, bsa.id) AS sub WHERE 车型 = '其他' AND 占比 > 20;"
+  },
+  {
+    "question": "统计各公司管辖服务区的月度车流增长率(对比最近两个月数据)",
+    "sql": "WITH monthly AS (SELECT bc.company_name AS 公司名称, DATE_TRUNC('month', bcc.count_date) AS 月份, SUM(bcc.customer_count) AS 总车流量 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id JOIN bss_company bc ON bsa.company_id = bc.id WHERE bcc.delete_ts IS NULL GROUP BY 公司名称, 月份) SELECT 公司名称, 月份, 总车流量, LAG(总车流量) OVER(PARTITION BY 公司名称 ORDER BY 月份) AS 上月车流, ROUND((总车流量 - LAG(总车流量) OVER(PARTITION BY 公司名称 ORDER BY 月份)) * 100.0 / LAG(总车流量) OVER(PARTITION BY 公司名称 ORDER BY 月份), 2) AS 增长率 FROM monthly WHERE 月份 >= DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month' ORDER BY 月份 DESC;"
+  },
+  {
+    "question": "查询节假日(假设2023-04-01至2023-04-08为节假日)与平日车流量对比,按车型分类统计",
+    "sql": "SELECT 车型, 节日日均, 平日日均, ROUND((节日日均 - 平日日均) * 100.0 / 平日日均, 2) AS 变化率 FROM (SELECT car_type AS 车型, AVG(CASE WHEN count_date BETWEEN '2023-04-01' AND '2023-04-08' THEN customer_count END) AS 节日日均, AVG(CASE WHEN count_date NOT BETWEEN '2023-04-01' AND '2023-04-08' THEN customer_count END) AS 平日日均 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type) AS sub;"
+  },
+  {
+    "question": "找出车流高峰时段(07:00-09:00,17:00-19:00)车流量占比超过60%的服务区TOP10",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, (SUM(CASE WHEN EXTRACT(HOUR FROM bcc.create_ts) BETWEEN 7 AND 9 OR EXTRACT(HOUR FROM bcc.create_ts) BETWEEN 17 AND 19 THEN customer_count ELSE 0 END) * 100.0 / SUM(customer_count)) AS 高峰占比 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id WHERE bcc.delete_ts IS NULL GROUP BY 服务区名称 HAVING (SUM(CASE WHEN EXTRACT(HOUR FROM bcc.create_ts) BETWEEN 7 AND 9 OR EXTRACT(HOUR FROM bcc.create_ts) BETWEEN 17 AND 19 THEN customer_count ELSE 0 END) * 100.0 / SUM(customer_count)) > 60 ORDER BY 高峰占比 DESC LIMIT 10;"
+  },
+  {
+    "question": "统计各管理公司下属开放状态的服务区数量,并按数量降序排列",
+    "sql": "SELECT c.company_name AS 公司名称, COUNT(s.id) AS 服务区数量 FROM bss_service_area s JOIN bss_company c ON s.company_id = c.id WHERE s.delete_ts IS NULL AND c.delete_ts IS NULL AND s.service_state = '开放' GROUP BY c.company_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "查询2023年Q2季度各公司日均营业额TOP5",
+    "sql": "SELECT c.company_name AS 公司名称, AVG(b.pay_sum) AS 日均营业额 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id WHERE b.oper_date BETWEEN '2023-04-01' AND '2023-06-30' AND s.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY c.company_name ORDER BY 日均营业额 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析不同服务区类型(信息化/智能化)的平均订单金额差异",
+    "sql": "SELECT s.service_area_type AS 服务区类型, AVG(b.pay_sum / NULLIF(b.order_sum, 0)) AS 平均订单金额 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id WHERE b.oper_date >= CURRENT_DATE - 30 AND s.delete_ts IS NULL GROUP BY s.service_area_type;"
+  },
+  {
+    "question": "统计最近一个月各公司车辆通行总量并计算单车流量收益",
+    "sql": "SELECT c.company_name AS 公司名称, SUM(car.customer_count) AS 总车流量, SUM(b.pay_sum) / NULLIF(SUM(car.customer_count), 0) AS 单车收益 FROM bss_car_day_count car JOIN bss_service_area s ON car.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id JOIN bss_business_day_data b ON s.service_area_no = b.service_no AND car.count_date = b.oper_date WHERE car.count_date >= CURRENT_DATE - 30 GROUP BY c.company_name;"
+  },
+  {
+    "question": "对比各公司在工作日与非工作日的营收差异(以周五至周日为非工作日)",
+    "sql": "SELECT c.company_name AS 公司名称, AVG(CASE WHEN EXTRACT(ISODOW FROM b.oper_date) IN (5,6,7) THEN b.pay_sum ELSE NULL END) AS 非工作日均值, AVG(CASE WHEN EXTRACT(ISODOW FROM b.oper_date) IN (1,2,3,4) THEN b.pay_sum ELSE NULL END) AS 工作日均值 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id GROUP BY c.company_name;"
+  },
+  {
+    "question": "计算各公司现金支付占比超过15%的服务区数量",
+    "sql": "SELECT c.company_name AS 公司名称, COUNT(*) AS 高现金占比服务区 FROM (SELECT s.company_id, m.service_no, SUM(b.rmb) / NULLIF(SUM(b.pay_sum), 0) AS 现金占比 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id WHERE b.oper_date >= CURRENT_DATE - 90 GROUP BY s.company_id, m.service_no HAVING SUM(b.rmb)/NULLIF(SUM(b.pay_sum), 0) > 0.15) t JOIN bss_company c ON t.company_id = c.id GROUP BY c.company_name;"
+  },
+  {
+    "question": "分析各公司服务区档口利用率(档口数量/服务区面积)TOP3",
+    "sql": "SELECT c.company_name AS 公司名称, s.service_area_name AS 服务区名称, COUNT(DISTINCT b.branch_no) / NULLIF((LENGTH(s.service_position) - LENGTH(REPLACE(s.service_position, ',', ''))) / 2, 0) AS 档口密度 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id GROUP BY c.company_name, s.service_area_name, s.service_position ORDER BY 档口密度 DESC LIMIT 3;"
+  },
+  {
+    "question": "统计最近7天无业务数据产生的服务区清单及所属公司",
+    "sql": "SELECT s.service_area_name AS 服务区名称, c.company_name AS 公司名称 FROM bss_service_area s JOIN bss_company c ON s.company_id = c.id WHERE s.delete_ts IS NULL AND NOT EXISTS (SELECT 1 FROM bss_business_day_data b WHERE b.service_no = s.service_area_no AND b.oper_date >= CURRENT_DATE - 7) ORDER BY c.company_name;"
+  },
+  {
+    "question": "分析各公司不同支付方式的订单占比分布",
+    "sql": "SELECT c.company_name AS 公司名称, '微信' AS 支付方式, SUM(b.wx_order)/NULLIF(SUM(b.order_sum), 0) AS 占比 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id GROUP BY c.company_name UNION ALL SELECT c.company_name, '支付宝', SUM(b.zf_order)/NULLIF(SUM(b.order_sum), 0) FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id GROUP BY c.company_name ORDER BY 公司名称, 占比 DESC;"
+  },
+  {
+    "question": "计算各公司服务区营收标准差评估运营稳定性",
+    "sql": "SELECT c.company_name AS 公司名称, STDDEV_SAMP(b.pay_sum) AS 营收波动率 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id WHERE b.oper_date >= CURRENT_DATE - 30 GROUP BY c.company_name ORDER BY 营收波动率;"
+  },
+  {
+    "question": "统计各路段关联的服务区数量,按服务区数量降序排列",
+    "sql": "SELECT r.section_name AS 路段名称, COUNT(l.service_area_id) AS 服务区数量 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id WHERE r.delete_ts IS NULL GROUP BY r.section_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "查询2023年1月各路段下辖服务区总车流量TOP10",
+    "sql": "SELECT r.section_name AS 路段名称, SUM(c.customer_count) AS 总通行量 FROM bss_car_day_count c JOIN bss_section_route_area_link l ON c.service_area_id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id WHERE c.count_date BETWEEN '2023-01-01' AND '2023-01-31' AND c.delete_ts IS NULL GROUP BY r.section_name ORDER BY 总通行量 DESC LIMIT 10;"
+  },
+  {
+    "question": "分析不同车型在各服务区的平均通行量分布",
+    "sql": "SELECT car_type AS 车辆类型, AVG(customer_count) AS 平均通行量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type;"
+  },
+  {
+    "question": "对比昌栗路段与昌韶路段下辖服务区2023年1月总营收额",
+    "sql": "SELECT r.section_name AS 路段名称, SUM(b.pay_sum) AS 总营收额 FROM bss_section_route r JOIN bss_section_route_area_link l ON r.id = l.section_route_id JOIN bss_service_area_mapper m ON l.service_area_id = m.service_area_id JOIN bss_business_day_data b ON m.service_no = b.service_no WHERE b.oper_date BETWEEN '2023-01-01' AND '2023-01-31' AND r.section_name IN ('昌栗', '昌韶') GROUP BY r.section_name;"
+  },
+  {
+    "question": "找出最近一周日车流量最高的3个服务区及其所属路段",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, r.section_name AS 所属路段, SUM(c.customer_count) AS 周通行量 FROM bss_car_day_count c JOIN bss_section_route_area_link l ON c.service_area_id = l.service_area_id JOIN bss_service_area sa ON c.service_area_id = sa.id JOIN bss_section_route r ON l.section_route_id = r.id WHERE c.count_date >= CURRENT_DATE - 7 GROUP BY sa.service_area_name, r.section_name ORDER BY 周通行量 DESC LIMIT 3;"
+  },
+  {
+    "question": "统计各路段下辖服务区日均车流与日均营收的相关性系数",
+    "sql": "SELECT r.section_name AS 路段名称, CORR(c.customer_count, b.pay_sum) AS 相关性系数 FROM bss_car_day_count c JOIN bss_section_route_area_link l ON c.service_area_id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_service_area_mapper m ON c.service_area_id = m.service_area_id JOIN bss_business_day_data b ON m.service_no = b.service_no AND c.count_date = b.oper_date GROUP BY r.section_name;"
+  },
+  {
+    "question": "查询未绑定任何服务区的路段清单",
+    "sql": "SELECT r.section_name AS 路段名称 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id WHERE l.service_area_id IS NULL AND r.delete_ts IS NULL;"
+  },
+  {
+    "question": "分析宜春分公司管理路段下各服务区月度车流变化趋势",
+    "sql": "SELECT EXTRACT(MONTH FROM c.count_date) AS 月份, sa.service_area_name AS 服务区名称, SUM(c.customer_count) AS 月度车流量 FROM bss_car_day_count c JOIN bss_section_route_area_link l ON c.service_area_id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_service_area sa ON c.service_area_id = sa.id JOIN bss_company cp ON sa.company_id = cp.id WHERE cp.company_name = '宜春分公司' GROUP BY 月份, sa.service_area_name ORDER BY 月份;"
+  },
+  {
+    "question": "统计各公司管理路段覆盖服务区数量及车流总量",
+    "sql": "SELECT cp.company_name AS 管理公司, COUNT(DISTINCT l.service_area_id) AS 覆盖服务区数, SUM(c.customer_count) AS 总车流量 FROM bss_section_route_area_link l JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_service_area sa ON l.service_area_id = sa.id JOIN bss_company cp ON sa.company_id = cp.id LEFT JOIN bss_car_day_count c ON sa.id = c.service_area_id GROUP BY cp.company_name;"
+  },
+  {
+    "question": "找出车流密度(车流量/路段长度)最高的5个路段",
+    "sql": "SELECT r.section_name AS 路段名称, SUM(c.customer_count) / MAX(CAST(r.code AS numeric)) AS 车流密度 FROM bss_section_route r JOIN bss_section_route_area_link l ON r.id = l.section_route_id JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id GROUP BY r.section_name ORDER BY 车流密度 DESC LIMIT 5;"
+  },
+  {
+    "question": "当前各地区关闭的服务区数量及占比统计?",
+    "sql": "SELECT area.service_position AS 地理位置, COUNT(*) AS 关闭数量, COUNT(*) * 100.0 / (SELECT COUNT(*) FROM bss_service_area WHERE service_state = '关闭' AND delete_ts IS NULL) AS 占比百分比 FROM bss_service_area area WHERE area.service_state = '关闭' AND area.delete_ts IS NULL GROUP BY area.service_position;"
+  },
+  {
+    "question": "最近一周各服务区日均车流量排名TOP10?",
+    "sql": "SELECT area.service_area_name AS 服务区名称, AVG(car.customer_count) AS 日均车流量 FROM bss_car_day_count car JOIN bss_service_area area ON car.service_area_id = area.id WHERE car.count_date >= CURRENT_DATE - 7 AND car.delete_ts IS NULL AND area.delete_ts IS NULL GROUP BY area.service_area_name ORDER BY 日均车流量 DESC LIMIT 10;"
+  },
+  {
+    "question": "最近一个月订单总额最高的服务区明细?",
+    "sql": "SELECT service_name AS 服务区名称, SUM(order_sum) AS 总订单量, SUM(pay_sum) AS 总支付金额 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 30 AND delete_ts IS NULL GROUP BY service_name ORDER BY 总支付金额 DESC LIMIT 10;"
+  },
+  {
+    "question": "各管理公司关闭服务区数量对比分析?",
+    "sql": "SELECT comp.company_name AS 管理公司, COUNT(area.id) AS 关闭服务区数量 FROM bss_service_area area JOIN bss_company comp ON area.company_id = comp.id WHERE area.service_state = '关闭' AND area.delete_ts IS NULL GROUP BY comp.company_name;"
+  },
+  {
+    "question": "昨日关闭服务区的相邻服务区车流变化率?",
+    "sql": "SELECT closed.service_area_name AS 关闭服务区, neighbor.service_area_name AS 相邻服务区, (curr.customer_count - prev.customer_count) * 100.0 / prev.customer_count AS 车流变化率 FROM bss_service_area closed JOIN bss_section_route_area_link link ON closed.id = link.service_area_id JOIN bss_section_route_area_link neighbor_link ON link.section_route_id = neighbor_link.section_route_id JOIN bss_service_area neighbor ON neighbor_link.service_area_id = neighbor.id JOIN bss_car_day_count curr ON neighbor.id = curr.service_area_id AND curr.count_date = CURRENT_DATE - 1 JOIN bss_car_day_count prev ON neighbor.id = prev.service_area_id AND prev.count_date = CURRENT_DATE - 2 WHERE closed.service_state = '关闭' AND closed.delete_ts IS NULL;"
+  },
+  {
+    "question": "不同服务区类型的车辆通行量分布情况?",
+    "sql": "SELECT area.service_area_type AS 服务区类型, car.car_type AS 车辆类型, AVG(car.customer_count) AS 平均车流量 FROM bss_car_day_count car JOIN bss_service_area area ON car.service_area_id = area.id WHERE area.delete_ts IS NULL GROUP BY area.service_area_type, car.car_type;"
+  },
+  {
+    "question": "过去7天各支付方式日均占比趋势分析?",
+    "sql": "SELECT oper_date AS 统计日期, SUM(wx) / SUM(pay_sum) * 100 AS 微信占比, SUM(zfb) / SUM(pay_sum) * 100 AS 支付宝占比, SUM(rmb) / SUM(pay_sum) * 100 AS 现金占比 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "最近一周每日新增关闭服务区数量趋势?",
+    "sql": "SELECT DATE(update_ts) AS 操作日期, COUNT(*) AS 新增关闭数 FROM bss_service_area WHERE service_state = '关闭' AND update_ts >= CURRENT_DATE - 7 GROUP BY DATE(update_ts) ORDER BY 操作日期;"
+  },
+  {
+    "question": "与关闭服务区同路线的替代服务区推荐列表?",
+    "sql": "SELECT DISTINCT route.route_name AS 路线名称, closed.service_area_name AS 关闭服务区, active.service_area_name AS 替代服务区 FROM bss_section_route_area_link closed_link JOIN bss_section_route route ON closed_link.section_route_id = route.id JOIN bss_section_route_area_link active_link ON closed_link.section_route_id = active_link.section_route_id JOIN bss_service_area closed ON closed_link.service_area_id = closed.id JOIN bss_service_area active ON active_link.service_area_id = active.id WHERE closed.service_state = '关闭' AND active.service_state = '开放' AND closed.delete_ts IS NULL LIMIT 10;"
+  },
+  {
+    "question": "关闭前后周边服务区车流变化对比分析?",
+    "sql": "SELECT area.service_area_name AS 服务区, COUNT(CASE WHEN car.count_date < area.update_ts THEN 1 ELSE NULL END) AS 关闭前车流, COUNT(CASE WHEN car.count_date >= area.update_ts THEN 1 ELSE NULL END) AS 关闭后车流 FROM bss_service_area area LEFT JOIN bss_car_day_count car ON area.id = car.service_area_id AND car.count_date BETWEEN area.update_ts - INTERVAL '7 days' AND area.update_ts + INTERVAL '7 days' WHERE area.service_state = '关闭' GROUP BY area.service_area_name;"
+  }
+]
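
Each entry in the generated qs_*_pair.json file is a {"question", "sql"} pair. A minimal sketch of loading the pairs for training, assuming the file name generated above; the trainer call in the comment is an assumed API, shown only as an example:

import json
from pathlib import Path

pair_file = Path("data_pipeline/training_data/qs_highway_db_20250626_123202_pair.json")
pairs = json.loads(pair_file.read_text(encoding="utf-8"))

for pair in pairs:
    question, sql = pair["question"], pair["sql"]
    # e.g. feed into the trainer: vn.train(question=question, sql=sql)  (assumed API)
    print(question, "->", sql[:60])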

+ 202 - 0
data_pipeline/training_data/qs_highway_db_20250626_123202_pair.json.backup

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "统计各服务区2023年4月1日当日总营收金额并按金额降序排列",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收金额 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_name ORDER BY SUM(pay_sum) DESC;"
+  },
+  {
+    "question": "分析2023年4月期间各支付方式(微信/支付宝/现金)金额占比分布情况",
+    "sql": "SELECT '微信' AS 支付方式, SUM(wx)/SUM(pay_sum)*100 AS 占比百分比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL UNION ALL SELECT '支付宝', SUM(zfb)/SUM(pay_sum)*100 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL UNION ALL SELECT '现金', SUM(rmb)/SUM(pay_sum)*100 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "查询近7天各服务区日均营收金额超过1万元的记录",
+    "sql": "SELECT service_name AS 服务区名称, oper_date AS 统计日期, pay_sum AS 营收金额 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND oper_date < CURRENT_DATE AND delete_ts IS NULL AND pay_sum > 10000 ORDER BY oper_date DESC;"
+  },
+  {
+    "question": "对比2023年4月1日与2023年3月31日各服务区营收金额变化率",
+    "sql": "WITH yesterday AS (SELECT service_name, pay_sum FROM bss_business_day_data WHERE oper_date = '2023-04-01'), today AS (SELECT service_name, pay_sum FROM bss_business_day_data WHERE oper_date = '2023-03-31') SELECT y.service_name, (t.pay_sum - y.pay_sum)/y.pay_sum*100 AS 变化率百分比 FROM yesterday y JOIN today t ON y.service_name = t.service_name WHERE y.delete_ts IS NULL AND t.delete_ts IS NULL;"
+  },
+  {
+    "question": "统计各服务区现金支付占比超过30%的记录并按占比排序",
+    "sql": "SELECT service_name AS 服务区名称, (SUM(rmb)/SUM(pay_sum))*100 AS 现金占比百分比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name HAVING (SUM(rmb)/SUM(pay_sum))*100 > 30 ORDER BY 现金占比百分比 DESC;"
+  },
+  {
+    "question": "查询宜春服务区2023年4月1日各支付方式明细金额",
+    "sql": "SELECT '微信' AS 支付方式, wx AS 金额 FROM bss_business_day_data WHERE service_name = '宜春服务区' AND oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '支付宝', zfb FROM bss_business_day_data WHERE service_name = '宜春服务区' AND oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '现金', rmb FROM bss_business_day_data WHERE service_name = '宜春服务区' AND oper_date = '2023-04-01' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "统计各公司管辖服务区的月度平均营收金额",
+    "sql": "SELECT c.company_name AS 管理公司, AVG(b.pay_sum) AS 平均营收金额 FROM bss_business_day_data b JOIN bss_service_area s ON b.service_name = s.service_area_name JOIN bss_company c ON s.company_id = c.id WHERE b.oper_date >= DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month' AND b.oper_date < DATE_TRUNC('month', CURRENT_DATE) AND b.delete_ts IS NULL GROUP BY c.company_name;"
+  },
+  {
+    "question": "查询微信支付占比最高的前5个服务区及具体占比",
+    "sql": "SELECT service_name AS 服务区名称, (SUM(wx)/SUM(pay_sum))*100 AS 微信占比百分比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name ORDER BY 微信占比百分比 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计各服务区日订单数与日营收金额的线性相关性系数",
+    "sql": "SELECT service_name AS 服务区名称, CORR(order_sum, pay_sum) AS 相关系数 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name HAVING CORR(order_sum, pay_sum) IS NOT NULL;"
+  },
+  {
+    "question": "查询国庆假期期间(2023-10-01至2023-10-07)各服务区总营收排名",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收金额 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_name ORDER BY SUM(pay_sum) DESC LIMIT 10;"
+  },
+  {
+    "question": "统计各服务区2023年4月车流量总和及日均车流量,并按日均车流量降序排列",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, SUM(bcc.customer_count) AS 总车流量, AVG(bcc.customer_count) AS 日均车流量 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id WHERE bcc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND bcc.delete_ts IS NULL GROUP BY bsa.service_area_name ORDER BY 日均车流量 DESC;"
+  },
+  {
+    "question": "查询危化品车辆在各路段的通行量占比,筛选出占比超过5%的路段",
+    "sql": "SELECT bsrl.route_name AS 路段名称, (COUNT(CASE WHEN bcc.car_type = '危化品' THEN 1 END) * 100.0 / COUNT(*)) AS 危化品占比 FROM bss_car_day_count bcc JOIN bss_section_route_area_link bsral ON bcc.service_area_id = bsral.service_area_id JOIN bss_section_route bsrl ON bsral.section_route_id = bsrl.id WHERE bcc.delete_ts IS NULL GROUP BY bsrl.route_name HAVING (COUNT(CASE WHEN bcc.car_type = '危化品' THEN 1 END) * 100.0 / COUNT(*)) > 5;"
+  },
+  {
+    "question": "分析近7天各时段(小时级)车流变化趋势,按小时聚合展示平均车流量",
+    "sql": "SELECT EXTRACT(HOUR FROM bcc.create_ts) AS 小时段, AVG(bcc.customer_count) AS 平均车流量 FROM bss_car_day_count bcc WHERE bcc.count_date >= CURRENT_DATE - 7 AND bcc.delete_ts IS NULL GROUP BY 小时段段 ORDER BY 小时段段;"
+  },
+  {
+    "question": "对比城际车辆与过境车辆在不同服务区类型的日均通行量差异",
+    "sql": "SELECT bsa.service_area_type AS 服务区类型, bcc.car_type AS 车辆类型, AVG(bcc.customer_count) AS 日均通行量 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id WHERE bcc.delete_ts IS NULL AND bcc.car_type IN ('城际', '过境') GROUP BY 服务区类型, 车辆类型 ORDER BY 服务区类型, 日均通行量 DESC;"
+  },
+  {
+    "question": "找出最近一个月车流量波动最大的5个服务区(使用标准差衡量波动)",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, STDDEV(bcc.customer_count) AS 车流量标准差 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id WHERE bcc.count_date >= CURRENT_DATE - 30 AND bcc.delete_ts IS NULL GROUP BY 服务区名称 ORDER BY 车流量标准差 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析各车型在不同运营状态服务区的通行分布,筛选出关闭状态服务区中其他类型车辆占比超过20%的记录",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, bcc.car_type AS 车型, (COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(PARTITION BY bsa.id)) AS 占比 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id WHERE bsa.service_state = '关闭' AND bcc.delete_ts IS NULL GROUP BY bsa.service_area_name, bcc.car_type HAVING bcc.car_type = '其他' AND (COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(PARTITION BY bsa.id)) > 20;"
+  },
+  {
+    "question": "统计各公司管辖服务区的月度车流增长率(对比最近两个月数据)",
+    "sql": "WITH monthly AS (SELECT bc.company_name AS 公司名称, DATE_TRUNC('month', bcc.count_date) AS 月份, SUM(bcc.customer_count) AS 总车流量 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id JOIN bss_company bc ON bsa.company_id = bc.id WHERE bcc.delete_ts IS NULL GROUP BY 公司名称, 月份) SELECT 公司名称, 月份, 总车流量, LAG(总车流量) OVER(PARTITION BY 公司名称 ORDER BY 月份) AS 上月车流, ROUND((总车流量 - LAG(总车流量) OVER(PARTITION BY 公司名称 ORDER BY 月份)) * 100.0 / LAG(总车流量) OVER(PARTITION BY 公司名称 ORDER BY 月份), 2) AS 增长率 FROM monthly WHERE 月份 >= DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month' ORDER BY 月份 DESC;"
+  },
+  {
+    "question": "查询节假日(假设2023-04-01至2023-04-08为节假日)与平日车流量对比,按车型分类统计",
+    "sql": "SELECT car_type AS 车型, AVG(CASE WHEN count_date BETWEEN '2023-04-01' AND '2023-04-08' THEN customer_count END) AS 节日日均, AVG(CASE WHEN count_date NOT BETWEEN '2023-04-01' AND '2023-04-08' THEN customer_count END) AS 平日日均, ROUND((节日日均 - 平日日均) * 100.0 / 平日日均, 2) AS 变化率 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type;"
+  },
+  {
+    "question": "找出车流高峰时段(07:00-09:00,17:00-19:00)车流量占比超过60%的服务区TOP10",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, (SUM(CASE WHEN EXTRACT(HOUR FROM bcc.create_ts) BETWEEN 7 AND 9 OR EXTRACT(HOUR FROM bcc.create_ts) BETWEEN 17 AND 19 THEN customer_count ELSE 0 END) * 100.0 / SUM(customer_count)) AS 高峰占比 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id WHERE bcc.delete_ts IS NULL GROUP BY 服务区名称 HAVING (SUM(CASE WHEN EXTRACT(HOUR FROM bcc.create_ts) BETWEEN 7 AND 9 OR EXTRACT(HOUR FROM bcc.create_ts) BETWEEN 17 AND 19 THEN customer_count ELSE 0 END) * 100.0 / SUM(customer_count)) > 60 ORDER BY 高峰占比 DESC LIMIT 10;"
+  },
+  {
+    "question": "分析服务区车流量与非油收入(以微信+支付宝金额为准)的相关性(取最近一个月数据)",
+    "sql": "SELECT bcc.service_area_id AS 服务区ID, CORR(bcc.customer_count, (b.business.wx + b.business.zfb)) AS 相关性系数 FROM (SELECT service_area_id, count_date, SUM(customer_count) AS customer_count FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id, count_date) bcc JOIN (SELECT service_no, oper_date, (COALESCE(wx,0) + COALESCE(zfb,0)) AS 非油收入 FROM bss_business_day_data) business ON bcc.service_area_id = business.service_no::varchar AND bcc.count_date = business.oper_date WHERE bcc.count_date >= CURRENT_DATE - 30 GROUP BY 服务区ID HAVING COUNT(*) > 10;"
+  },
+  {
+    "question": "统计各管理公司下属开放状态的服务区数量,并按数量降序排列",
+    "sql": "SELECT c.company_name AS 公司名称, COUNT(s.id) AS 服务区数量 FROM bss_service_area s JOIN bss_company c ON s.company_id = c.id WHERE s.delete_ts IS NULL AND c.delete_ts IS NULL AND s.service_state = '开放' GROUP BY c.company_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "查询2023年Q2季度各公司日均营业额TOP5",
+    "sql": "SELECT c.company_name AS 公司名称, AVG(b.pay_sum) AS 日均营业额 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id WHERE b.oper_date BETWEEN '2023-04-01' AND '2023-06-30' AND s.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY c.company_name ORDER BY 日均营业额 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析不同服务区类型(信息化/智能化)的平均订单金额差异",
+    "sql": "SELECT s.service_area_type AS 服务区类型, AVG(b.pay_sum / NULLIF(b.order_sum, 0)) AS 平均订单金额 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id WHERE b.oper_date >= CURRENT_DATE - 30 AND s.delete_ts IS NULL GROUP BY s.service_area_type;"
+  },
+  {
+    "question": "统计最近一个月各公司车辆通行总量并计算单车流量收益",
+    "sql": "SELECT c.company_name AS 公司名称, SUM(car.customer_count) AS 总车流量, SUM(b.pay_sum) / NULLIF(SUM(car.customer_count), 0) AS 单车收益 FROM bss_car_day_count car JOIN bss_service_area s ON car.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id JOIN bss_business_day_data b ON s.service_area_no = b.service_no AND car.count_date = b.oper_date WHERE car.count_date >= CURRENT_DATE - 30 GROUP BY c.company_name;"
+  },
+  {
+    "question": "对比各公司在工作日与非工作日的营收差异(以周五至周日为非工作日)",
+    "sql": "SELECT c.company_name AS 公司名称, AVG(CASE WHEN EXTRACT(ISODOW FROM b.oper_date) IN (5,6,7) THEN b.pay_sum ELSE NULL END) AS 非工作日均值, AVG(CASE WHEN EXTRACT(ISODOW FROM b.oper_date) IN (1,2,3,4) THEN b.pay_sum ELSE NULL END) AS 工作日均值 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id GROUP BY c.company_name;"
+  },
+  {
+    "question": "计算各公司现金支付占比超过15%的服务区数量",
+    "sql": "SELECT c.company_name AS 公司名称, COUNT(*) AS 高现金占比服务区 FROM (SELECT s.company_id, m.service_no, SUM(b.rmb) / NULLIF(SUM(b.pay_sum), 0) AS 现金占比 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id WHERE b.oper_date >= CURRENT_DATE - 90 GROUP BY s.company_id, m.service_no HAVING SUM(b.rmb)/NULLIF(SUM(b.pay_sum), 0) > 0.15) t JOIN bss_company c ON t.company_id = c.id GROUP BY c.company_name;"
+  },
+  {
+    "question": "分析各公司服务区档口利用率(档口数量/服务区面积)TOP3",
+    "sql": "SELECT c.company_name AS 公司名称, s.service_area_name AS 服务区名称, COUNT(DISTINCT b.branch_no) / NULLIF((LENGTH(s.service_position) - LENGTH(REPLACE(s.service_position, ',', ''))) / 2, 0) AS 档口密度 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id GROUP BY c.company_name, s.service_area_name ORDER BY 档口密度 DESC LIMIT 3;"
+  },
+  {
+    "question": "统计最近7天无业务数据产生的服务区清单及所属公司",
+    "sql": "SELECT s.service_area_name AS 服务区名称, c.company_name AS 公司名称 FROM bss_service_area s JOIN bss_company c ON s.company_id = c.id WHERE s.delete_ts IS NULL AND NOT EXISTS (SELECT 1 FROM bss_business_day_data b WHERE b.service_no = s.service_area_no AND b.oper_date >= CURRENT_DATE - 7) ORDER BY c.company_name;"
+  },
+  {
+    "question": "分析各公司不同支付方式的订单占比分布",
+    "sql": "SELECT c.company_name AS 公司名称, '微信' AS 支付方式, SUM(b.wx_order)/NULLIF(SUM(b.order_sum), 0) AS 占比 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id GROUP BY c.company_name UNION ALL SELECT c.company_name, '支付宝', SUM(b.zf_order)/NULLIF(SUM(b.order_sum), 0) FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id GROUP BY c.company_name ORDER BY 公司名称, 占比 DESC;"
+  },
+  {
+    "question": "计算各公司服务区营收标准差评估运营稳定性",
+    "sql": "SELECT c.company_name AS 公司名称, STDDEV_SAMP(b.pay_sum) AS 营收波动率 FROM bss_business_day_data b JOIN bss_service_area_mapper m ON b.service_no = m.service_no JOIN bss_service_area s ON m.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id WHERE b.oper_date >= CURRENT_DATE - 30 GROUP BY c.company_name ORDER BY 营收波动率;"
+  },
+  {
+    "question": "统计各路段关联的服务区数量,按服务区数量降序排列",
+    "sql": "SELECT r.section_name AS 路段名称, COUNT(l.service_area_id) AS 服务区数量 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id WHERE r.delete_ts IS NULL GROUP BY r.section_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "查询2023年1月各路段下辖服务区总车流量TOP10",
+    "sql": "SELECT r.section_name AS 路段名称, SUM(c.customer_count) AS 总通行量 FROM bss_car_day_count c JOIN bss_section_route_area_link l ON c.service_area_id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id WHERE c.count_date BETWEEN '2023-01-01' AND '2023-01-31' AND c.delete_ts IS NULL GROUP BY r.section_name ORDER BY 总通行量 DESC LIMIT 10;"
+  },
+  {
+    "question": "分析不同车型在各服务区的平均通行量分布",
+    "sql": "SELECT car_type AS 车辆类型, AVG(customer_count) AS 平均通行量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type;"
+  },
+  {
+    "question": "对比昌栗路段与昌韶路段下辖服务区2023年1月总营收额",
+    "sql": "SELECT r.section_name AS 路段名称, SUM(b.pay_sum) AS 总营收额 FROM bss_section_route r JOIN bss_section_route_area_link l ON r.id = l.section_route_id JOIN bss_service_area_mapper m ON l.service_area_id = m.service_area_id JOIN bss_business_day_data b ON m.service_no = b.service_no WHERE b.oper_date BETWEEN '2023-01-01' AND '2023-01-31' AND r.section_name IN ('昌栗', '昌韶') GROUP BY r.section_name;"
+  },
+  {
+    "question": "找出最近一周日车流量最高的3个服务区及其所属路段",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, r.section_name AS 所属路段, SUM(c.customer_count) AS 周通行量 FROM bss_car_day_count c JOIN bss_section_route_area_link l ON c.service_area_id = l.service_area_id JOIN bss_service_area sa ON c.service_area_id = sa.id JOIN bss_section_route r ON l.section_route_id = r.id WHERE c.count_date >= CURRENT_DATE - 7 GROUP BY sa.service_area_name, r.section_name ORDER BY 周通行量 DESC LIMIT 3;"
+  },
+  {
+    "question": "统计各路段下辖服务区日均车流与日均营收的相关性系数",
+    "sql": "SELECT r.section_name AS 路段名称, CORR(c.customer_count, b.pay_sum) AS 相关性系数 FROM bss_car_day_count c JOIN bss_section_route_area_link l ON c.service_area_id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_service_area_mapper m ON c.service_area_id = m.service_area_id JOIN bss_business_day_data b ON m.service_no = b.service_no AND c.count_date = b.oper_date GROUP BY r.section_name;"
+  },
+  {
+    "question": "查询未绑定任何服务区的路段清单",
+    "sql": "SELECT r.section_name AS 路段名称 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id WHERE l.service_area_id IS NULL AND r.delete_ts IS NULL;"
+  },
+  {
+    "question": "分析宜春分公司管理路段下各服务区月度车流变化趋势",
+    "sql": "SELECT EXTRACT(MONTH FROM c.count_date) AS 月份, sa.service_area_name AS 服务区名称, SUM(c.customer_count) AS 月度车流量 FROM bss_car_day_count c JOIN bss_section_route_area_link l ON c.service_area_id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_service_area sa ON c.service_area_id = sa.id JOIN bss_company cp ON sa.company_id = cp.id WHERE cp.company_name = '宜春分公司' GROUP BY 月份, sa.service_area_name ORDER BY 月份;"
+  },
+  {
+    "question": "统计各公司管理路段覆盖服务区数量及车流总量",
+    "sql": "SELECT cp.company_name AS 管理公司, COUNT(DISTINCT l.service_area_id) AS 覆盖服务区数, SUM(c.customer_count) AS 总车流量 FROM bss_section_route_area_link l JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_service_area sa ON l.service_area_id = sa.id JOIN bss_company cp ON sa.company_id = cp.id LEFT JOIN bss_car_day_count c ON sa.id = c.service_area_id GROUP BY cp.company_name;"
+  },
+  {
+    "question": "找出车流密度(车流量/路段长度)最高的5个路段",
+    "sql": "SELECT r.section_name AS 路段名称, SUM(c.customer_count) / MAX(CAST(r.code AS numeric)) AS 车流密度 FROM bss_section_route r JOIN bss_section_route_area_link l ON r.id = l.section_route_id JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id GROUP BY r.section_name ORDER BY 车流密度 DESC LIMIT 5;"
+  },
+  {
+    "question": "当前各地区关闭的服务区数量及占比统计?",
+    "sql": "SELECT area.service_position AS 地理位置, COUNT(*) AS 关闭数量, COUNT(*) * 100.0 / (SELECT COUNT(*) FROM bss_service_area WHERE service_state = '关闭' AND delete_ts IS NULL) AS 占比百分比 FROM bss_service_area area WHERE area.service_state = '关闭' AND area.delete_ts IS NULL GROUP BY area.service_position;"
+  },
+  {
+    "question": "最近一周各服务区日均车流量排名TOP10?",
+    "sql": "SELECT area.service_area_name AS 服务区名称, AVG(car.customer_count) AS 日均车流量 FROM bss_car_day_count car JOIN bss_service_area area ON car.service_area_id = area.id WHERE car.count_date >= CURRENT_DATE - 7 AND car.delete_ts IS NULL AND area.delete_ts IS NULL GROUP BY area.service_area_name ORDER BY 日均车流量 DESC LIMIT 10;"
+  },
+  {
+    "question": "最近一个月订单总额最高的服务区明细?",
+    "sql": "SELECT service_name AS 服务区名称, SUM(order_sum) AS 总订单量, SUM(pay_sum) AS 总支付金额 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 30 AND delete_ts IS NULL GROUP BY service_name ORDER BY 总支付金额 DESC LIMIT 10;"
+  },
+  {
+    "question": "各管理公司关闭服务区数量对比分析?",
+    "sql": "SELECT comp.company_name AS 管理公司, COUNT(area.id) AS 关闭服务区数量 FROM bss_service_area area JOIN bss_company comp ON area.company_id = comp.id WHERE area.service_state = '关闭' AND area.delete_ts IS NULL GROUP BY comp.company_name;"
+  },
+  {
+    "question": "昨日关闭服务区的相邻服务区车流变化率?",
+    "sql": "SELECT closed.service_area_name AS 关闭服务区, neighbor.service_area_name AS 相邻服务区, (curr.customer_count - prev.customer_count) * 100.0 / prev.customer_count AS 车流变化率 FROM bss_service_area closed JOIN bss_section_route_area_link link ON closed.id = link.service_area_id JOIN bss_section_route_area_link neighbor_link ON link.section_route_id = neighbor_link.section_route_id JOIN bss_service_area neighbor ON neighbor_link.service_area_id = neighbor.id JOIN bss_car_day_count curr ON neighbor.id = curr.service_area_id AND curr.count_date = CURRENT_DATE - 1 JOIN bss_car_day_count prev ON neighbor.id = prev.service_area_id AND prev.count_date = CURRENT_DATE - 2 WHERE closed.service_state = '关闭' AND closed.delete_ts IS NULL;"
+  },
+  {
+    "question": "不同服务区类型的车辆通行量分布情况?",
+    "sql": "SELECT area.service_area_type AS 服务区类型, car.car_type AS 车辆类型, AVG(car.customer_count) AS 平均车流量 FROM bss_car_day_count car JOIN bss_service_area area ON car.service_area_id = area.id WHERE area.delete_ts IS NULL GROUP BY area.service_area_type, car.car_type;"
+  },
+  {
+    "question": "过去7天各支付方式日均占比趋势分析?",
+    "sql": "SELECT oper_date AS 统计日期, SUM(wx) / SUM(pay_sum) * 100 AS 微信占比, SUM(zfb) / SUM(pay_sum) * 100 AS 支付宝占比, SUM(rmb) / SUM(pay_sum) * 100 AS 现金占比 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "最近一周每日新增关闭服务区数量趋势?",
+    "sql": "SELECT DATE(update_ts) AS 操作日期, COUNT(*) AS 新增关闭数 FROM bss_service_area WHERE service_state = '关闭' AND update_ts >= CURRENT_DATE - 7 GROUP BY DATE(update_ts) ORDER BY 操作日期;"
+  },
+  {
+    "question": "与关闭服务区同路线的替代服务区推荐列表?",
+    "sql": "SELECT DISTINCT route.route_name AS 路线名称, closed.service_area_name AS 关闭服务区, active.service_area_name AS 替代服务区 FROM bss_section_route_area_link closed_link JOIN bss_section_route route ON closed_link.section_route_id = route.id JOIN bss_section_route_area_link active_link ON closed_link.section_route_id = active_link.section_route_id JOIN bss_service_area closed ON closed_link.service_area_id = closed.id JOIN bss_service_area active ON active_link.service_area_id = active.id WHERE closed.service_state = '关闭' AND active.service_state = '开放' AND closed.delete_ts IS NULL LIMIT 10;"
+  },
+  {
+    "question": "关闭前后周边服务区车流变化对比分析?",
+    "sql": "SELECT area.service_area_name AS 服务区, COUNT(CASE WHEN car.count_date < area.update_ts THEN 1 ELSE NULL END) AS 关闭前车流, COUNT(CASE WHEN car.count_date >= area.update_ts THEN 1 ELSE NULL END) AS 关闭后车流 FROM bss_service_area area LEFT JOIN bss_car_day_count car ON area.id = car.service_area_id AND car.count_date BETWEEN area.update_ts - 7 AND area.update_ts + 7 WHERE area.service_state = '关闭' GROUP BY area.service_area_name;"
+  }
+]

+ 0 - 0
schema_tools/utils/__init__.py → data_pipeline/utils/__init__.py


+ 0 - 0
schema_tools/utils/data_structures.py → data_pipeline/utils/data_structures.py


+ 0 - 0
schema_tools/utils/file_manager.py → data_pipeline/utils/file_manager.py


+ 1 - 1
schema_tools/utils/large_table_handler.py → data_pipeline/utils/large_table_handler.py

@@ -1,7 +1,7 @@
 import logging
 import random
 from typing import List, Dict, Any, Optional
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
 
 class LargeTableHandler:
     """大表处理策略"""

+ 0 - 0
schema_tools/utils/logger.py → data_pipeline/utils/logger.py


+ 0 - 0
schema_tools/utils/permission_checker.py → data_pipeline/utils/permission_checker.py


+ 1 - 1
schema_tools/utils/system_filter.py → data_pipeline/utils/system_filter.py

@@ -1,6 +1,6 @@
 import logging
 from typing import List, Set
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG

 class SystemTableFilter:
     """系统表过滤器"""

+ 0 - 0
schema_tools/utils/table_parser.py → data_pipeline/utils/table_parser.py


+ 0 - 0
schema_tools/validators/__init__.py → data_pipeline/validators/__init__.py


+ 2 - 2
schema_tools/validators/file_count_validator.py → data_pipeline/validators/file_count_validator.py

@@ -3,8 +3,8 @@ from pathlib import Path
 from typing import Dict, List, Tuple, Set
 from dataclasses import dataclass, field
 
-from schema_tools.utils.table_parser import TableListParser
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.utils.table_parser import TableListParser
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG


 @dataclass

+ 8 - 8
schema_tools/sql_validator.py → data_pipeline/validators/sql_validate_cli.py

@@ -9,8 +9,8 @@ import sys
 import os
 from pathlib import Path
 
-from schema_tools.sql_validation_agent import SQLValidationAgent
-from schema_tools.utils.logger import setup_logging
+from .sql_validation_agent import SQLValidationAgent
+from data_pipeline.utils.logger import setup_logging


 def setup_argument_parser():
@@ -21,19 +21,19 @@ def setup_argument_parser():
         epilog="""
 示例用法:
   # 基本使用(仅验证,不修改文件)
-  python -m schema_tools.sql_validator --db-connection "postgresql://user:pass@localhost:5432/dbname" --input-file ./data.json
+  python -m data_pipeline.validators.sql_validate_cli --db-connection "postgresql://user:pass@localhost:5432/dbname" --input-file ./data.json
   
   # 启用文件修改,但禁用LLM修复(仅删除无效SQL)
-  python -m schema_tools.sql_validator --db-connection "postgresql://user:pass@localhost:5432/dbname" --input-file ./data.json --modify-original-file --disable-llm-repair
+  python -m data_pipeline.validators.sql_validate_cli --db-connection "postgresql://user:pass@localhost:5432/dbname" --input-file ./data.json --modify-original-file --disable-llm-repair
   
   # 启用文件修改和LLM修复功能
-  python -m schema_tools.sql_validator --db-connection "postgresql://user:pass@localhost:5432/dbname" --input-file ./data.json --modify-original-file
+  python -m data_pipeline.validators.sql_validate_cli --db-connection "postgresql://user:pass@localhost:5432/dbname" --input-file ./data.json --modify-original-file
   
   # 指定输出目录
-  python -m schema_tools.sql_validator --db-connection "postgresql://user:pass@localhost:5432/dbname" --input-file ./data.json --output-dir ./reports
+  python -m data_pipeline.validators.sql_validate_cli --db-connection "postgresql://user:pass@localhost:5432/dbname" --input-file ./data.json --output-dir ./reports
   
   # 启用详细日志
-  python -m schema_tools.sql_validator --db-connection "postgresql://user:pass@localhost:5432/dbname" --input-file ./data.json --verbose
+  python -m data_pipeline.validators.sql_validate_cli --db-connection "postgresql://user:pass@localhost:5432/dbname" --input-file ./data.json --verbose
         """
     )
     
@@ -128,7 +128,7 @@ def setup_argument_parser():
 
 
 def apply_config_overrides(args):
     """应用命令行参数覆盖配置"""
-    from schema_tools.config import SCHEMA_TOOLS_CONFIG
+    from data_pipeline.config import SCHEMA_TOOLS_CONFIG
     
     sql_config = SCHEMA_TOOLS_CONFIG['sql_validation']
     

+ 14 - 5
schema_tools/sql_validation_agent.py → data_pipeline/validators/sql_validation_agent.py

@@ -6,9 +6,9 @@ from datetime import datetime
 from pathlib import Path
 from typing import List, Dict, Any, Optional
 
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
-from schema_tools.validators import SQLValidator, SQLValidationResult, ValidationStats
-from schema_tools.utils.logger import setup_logging
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.validators import SQLValidator, SQLValidationResult, ValidationStats
+from data_pipeline.utils.logger import setup_logging


 class SQLValidationAgent:
@@ -17,7 +17,9 @@ class SQLValidationAgent:
     def __init__(self, 
                  db_connection: str,
                  input_file: str,
-                 output_dir: str = None):
+                 output_dir: str = None,
+                 enable_sql_repair: bool = None,
+                 modify_original_file: bool = None):
         """
         初始化SQL验证Agent
         
@@ -25,12 +27,19 @@ class SQLValidationAgent:
             db_connection: 数据库连接字符串
             input_file: 输入的JSON文件路径(包含Question-SQL对)
             output_dir: 输出目录(默认为输入文件同目录)
+            enable_sql_repair: 是否启用SQL修复(覆盖配置文件)
+            modify_original_file: 是否修改原始文件(覆盖配置文件)
         """
         self.db_connection = db_connection
         self.input_file = Path(input_file)
         self.output_dir = Path(output_dir) if output_dir else self.input_file.parent
         
-        self.config = SCHEMA_TOOLS_CONFIG['sql_validation']
+        # 加载配置并允许参数覆盖
+        self.config = SCHEMA_TOOLS_CONFIG['sql_validation'].copy()
+        if enable_sql_repair is not None:
+            self.config['enable_sql_repair'] = enable_sql_repair
+        if modify_original_file is not None:
+            self.config['modify_original_file'] = modify_original_file
         self.logger = logging.getLogger("schema_tools.SQLValidationAgent")
         
         # 初始化验证器
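
A minimal usage sketch of the two new override parameters (assumptions: the async `validate()` entry-point name and the sample file path are not shown in this hunk; values not overridden fall back to `SCHEMA_TOOLS_CONFIG['sql_validation']`):

```python
import asyncio
from data_pipeline.validators.sql_validation_agent import SQLValidationAgent

async def main():
    agent = SQLValidationAgent(
        db_connection="postgresql://user:pass@localhost:5432/highway_db",
        input_file="./data_pipeline/training_data/qs_highway_db_20240123_143052_pair.json",
        enable_sql_repair=True,       # overrides the config default (False)
        modify_original_file=False,   # report only, keep the original JSON untouched
    )
    report = await agent.validate()   # assumed entry point; the CLI wrapper lives in sql_validate_cli.py
    print(report)

asyncio.run(main())
```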

+ 2 - 2
schema_tools/sql_validation_example.py → data_pipeline/validators/sql_validation_example.py

@@ -13,7 +13,7 @@ from pathlib import Path
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from schema_tools import SQLValidationAgent
-from schema_tools.utils.logger import setup_logging
+from data_pipeline.utils.logger import setup_logging


 async def example_basic_validation():
@@ -145,7 +145,7 @@ async def example_configuration_demo():
     print("配置选项演示")
     print("=" * 60)
     
-    from schema_tools.config import SCHEMA_TOOLS_CONFIG
+    from data_pipeline.config import SCHEMA_TOOLS_CONFIG
     
     print("当前SQL验证配置:")
     sql_config = SCHEMA_TOOLS_CONFIG['sql_validation']

+ 2 - 2
schema_tools/validators/sql_validator.py → data_pipeline/validators/sql_validator.py

@@ -4,7 +4,7 @@ import time
 from typing import Dict, Any, List, Optional
 from dataclasses import dataclass, field
 
-from schema_tools.config import SCHEMA_TOOLS_CONFIG
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG


 @dataclass
@@ -59,7 +59,7 @@ class SQLValidator:
         if not self.connection_pool:
             if self.config['reuse_connection_pool'] and self.db_connection:
                 # 复用现有的DatabaseInspector连接池
-                from schema_tools.tools.base import ToolRegistry
+                from data_pipeline.tools.base import ToolRegistry
                 
                 db_tool = ToolRegistry.get_tool("database_inspector", 
                                                db_connection=self.db_connection)

+ 230 - 217
docs/Schema Tools 使用说明.md → docs/Data Pipeline 使用说明.md

@@ -1,4 +1,4 @@
-# Schema Tools 使用说明
+# Data Pipeline 使用说明
 
 
 ## 目录
 ## 目录
 
 
@@ -7,12 +7,15 @@
 3. [一键执行完整工作流(推荐)](#3-一键执行完整工作流推荐)
 3. [一键执行完整工作流(推荐)](#3-一键执行完整工作流推荐)
 4. [生成DDL和MD文档](#4-生成ddl和md文档)
 4. [生成DDL和MD文档](#4-生成ddl和md文档)
 5. [生成Question-SQL训练数据](#5-生成question-sql训练数据)
 5. [生成Question-SQL训练数据](#5-生成question-sql训练数据)
-6. [配置详解](#6-配置详解)
-7. [常见问题](#7-常见问题)
+6. [SQL验证和修复](#6-sql验证和修复)
+7. [训练数据管理](#7-训练数据管理)
+8. [配置详解](#8-配置详解)
+9. [常见问题](#9-常见问题)
+10. [最佳实践](#10-最佳实践)
 
 
 ## 1. 功能概述
 ## 1. 功能概述
 
 
-Schema Tools 提供两个主要功能:
+Data Pipeline 是一个完整的数据库逆向工程和训练数据生成系统,提供以下核心功能:
 
 
 ### 1.1 DDL和MD文档生成
 ### 1.1 DDL和MD文档生成
 - 自动连接PostgreSQL数据库
 - 自动连接PostgreSQL数据库
@@ -27,7 +30,18 @@ Schema Tools 提供两个主要功能:
 - 为每个主题生成高质量的Question-SQL对
 - 为每个主题生成高质量的Question-SQL对
 - 支持中断恢复和并行处理
 - 支持中断恢复和并行处理
 
 
-### 1.3 一键工作流编排器(推荐)
+### 1.3 SQL验证和修复
+- 自动验证生成的SQL语句
+- 使用EXPLAIN语法检查SQL有效性
+- LLM自动修复无效SQL语句
+- 详细的验证报告和统计信息
+
+### 1.4 训练数据管理
+- 自动识别多种训练数据格式
+- 统一的训练数据加载和处理
+- 支持DDL、文档、Q&A对等多种数据类型
+
+### 1.5 一键工作流编排器(推荐)
 - 端到端自动化执行完整流程
 - 端到端自动化执行完整流程
 - DDL/MD生成 → Question-SQL生成 → SQL验证修复
 - DDL/MD生成 → Question-SQL生成 → SQL验证修复
 - 详细的执行报告和性能指标
 - 详细的执行报告和性能指标
@@ -43,77 +57,94 @@ pip install asyncpg asyncio
 
 
 ### 2.2 基本配置
 ### 2.2 基本配置
 
 
-Schema Tools 使用项目现有的 LLM 配置,无需额外配置数据库连接。
+Data Pipeline 使用项目现有的 LLM 配置,无需额外配置数据库连接。
+
+### 2.3 目录结构
+
+```
+data_pipeline/
+├── ddl_generation/          # DDL/MD文档生成工具
+│   ├── ddl_md_generator.py
+│   └── training_data_agent.py
+├── qa_generation/           # Q&A生成工具
+│   ├── qs_agent.py
+│   └── qs_generator.py
+├── validators/              # SQL验证工具  
+│   ├── sql_validate_cli.py
+│   ├── sql_validation_agent.py
+│   └── sql_validator.py
+├── trainer/                 # 训练数据管道
+│   ├── run_training.py
+│   └── vanna_trainer.py
+├── training_data/           # 训练数据存储目录
+├── tools/                   # 核心工具
+├── utils/                   # 工具函数
+├── config.py               # 配置文件
+└── schema_workflow.py      # 工作流编排器
+```
 
 
 ## 3. 一键执行完整工作流(推荐)
 ## 3. 一键执行完整工作流(推荐)
 
 
 ### 3.1 工作流编排器概述
 ### 3.1 工作流编排器概述
 
 
-`SchemaWorkflowOrchestrator` 是 Schema Tools 的核心组件,提供端到端的自动化处理流程:
+`SchemaWorkflowOrchestrator` 是 Data Pipeline 的核心组件,提供端到端的自动化处理流程:
 
 
 1. **DDL和MD文档生成** - 连接数据库,生成表结构文档
 1. **DDL和MD文档生成** - 连接数据库,生成表结构文档
 2. **Question-SQL对生成** - 基于文档生成训练数据
 2. **Question-SQL对生成** - 基于文档生成训练数据
 3. **SQL验证和修复** - 验证SQL有效性并自动修复错误
 3. **SQL验证和修复** - 验证SQL有效性并自动修复错误
+4. **训练数据加载** - 将生成的数据加载到向量数据库中
 
 
 ### 3.2 命令行使用
 ### 3.2 命令行使用
 
 
 #### 基本使用(完整工作流)
 #### 基本使用(完整工作流)
 ```bash
 ```bash
-python -m schema_tools.schema_workflow_orchestrator \
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@localhost:5432/highway_db" \
   --table-list tables.txt \
   --table-list tables.txt \
   --business-context "高速公路服务区管理系统" \
   --business-context "高速公路服务区管理系统" \
-  --db-name highway_db \
-  --output-dir ./output
+  --output-dir ./data_pipeline/training_data/
 ```
 ```
 
 
 #### 跳过SQL验证
 #### 跳过SQL验证
 ```bash
 ```bash
-python -m schema_tools.schema_workflow_orchestrator \
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@localhost:5432/ecommerce_db" \
   --table-list tables.txt \
   --table-list tables.txt \
   --business-context "电商系统" \
   --business-context "电商系统" \
-  --db-name ecommerce_db \
   --skip-validation
   --skip-validation
 ```
 ```
 
 
 #### 禁用LLM修复
 #### 禁用LLM修复
 ```bash
 ```bash
-python -m schema_tools.schema_workflow_orchestrator \
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@localhost:5432/management_db" \
   --table-list tables.txt \
   --table-list tables.txt \
   --business-context "管理系统" \
   --business-context "管理系统" \
-  --db-name management_db \
   --disable-llm-repair
   --disable-llm-repair
 ```
 ```
 
 
 #### 不修改原始文件(仅生成报告)
 #### 不修改原始文件(仅生成报告)
 ```bash
 ```bash
-python -m schema_tools.schema_workflow_orchestrator \
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@localhost:5432/business_db" \
   --table-list tables.txt \
   --table-list tables.txt \
   --business-context "业务系统" \
   --business-context "业务系统" \
-  --db-name business_db \
   --no-modify-file
   --no-modify-file
 ```
 ```
 
 
-python -m schema_tools.schema_workflow_orchestrator --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" --table-list ./schema_tools/tables.txt --business-context "高速公路服务区管理系统"  --db-name highway_db --output-dir ./output
-
-
 ### 3.3 编程方式使用
 ### 3.3 编程方式使用
 
 
 ```python
 ```python
 import asyncio
 import asyncio
-from schema_tools.schema_workflow_orchestrator import SchemaWorkflowOrchestrator
+from data_pipeline.schema_workflow import SchemaWorkflowOrchestrator
 
 
 async def run_complete_workflow():
 async def run_complete_workflow():
     # 创建工作流编排器
     # 创建工作流编排器
     orchestrator = SchemaWorkflowOrchestrator(
     orchestrator = SchemaWorkflowOrchestrator(
-        db_connection="postgresql://user:pass@localhost:5432/dbname",
+        db_connection="postgresql://user:pass@localhost:5432/highway_db",
         table_list_file="tables.txt",
         table_list_file="tables.txt",
         business_context="高速公路服务区管理系统",
         business_context="高速公路服务区管理系统",
-        db_name="highway_db",
-        output_dir="./output",
+        output_dir="./data_pipeline/training_data/",
         enable_sql_validation=True,      # 启用SQL验证
         enable_sql_validation=True,      # 启用SQL验证
         enable_llm_repair=True,          # 启用LLM修复
         enable_llm_repair=True,          # 启用LLM修复
         modify_original_file=True        # 修改原始JSON文件
         modify_original_file=True        # 修改原始JSON文件
@@ -140,14 +171,14 @@ asyncio.run(run_complete_workflow())
 
 
 | 参数 | 说明 | 默认值 |
 | 参数 | 说明 | 默认值 |
 |------|------|--------|
 |------|------|--------|
-| `--db-connection` | 数据库连接字符串 | 必需 |
+| `--db-connection` | 数据库连接字符串(包含数据库名) | 必需 |
 | `--table-list` | 表清单文件路径 | 必需 |
 | `--table-list` | 表清单文件路径 | 必需 |
 | `--business-context` | 业务上下文描述 | 必需 |
 | `--business-context` | 业务上下文描述 | 必需 |
-| `--db-name` | 数据库名称(用于文件命名) | 必需 |
-| `--output-dir` | 输出目录 | `./output` |
+| `--output-dir` | 输出目录 | `./data_pipeline/training_data/` |
 | `--skip-validation` | 跳过SQL验证步骤 | `False`(默认执行SQL验证) |
 | `--skip-validation` | 跳过SQL验证步骤 | `False`(默认执行SQL验证) |
 | `--disable-llm-repair` | 禁用LLM修复功能 | `False`(默认启用LLM修复) |
 | `--disable-llm-repair` | 禁用LLM修复功能 | `False`(默认启用LLM修复) |
 | `--no-modify-file` | 不修改原始JSON文件 | `False`(默认修改原文件) |
 | `--no-modify-file` | 不修改原始JSON文件 | `False`(默认修改原文件) |
+| `--skip-training-load` | 跳过训练数据加载步骤 | `False`(默认执行训练数据加载) |
 | `--verbose` | 启用详细日志 | `False` |
 | `--verbose` | 启用详细日志 | `False` |
 | `--log-file` | 日志文件路径 | 无 |
 | `--log-file` | 日志文件路径 | 无 |
 
 
@@ -159,9 +190,9 @@ asyncio.run(run_complete_workflow())
 {
 {
     "success": True,
     "success": True,
     "workflow_summary": {
     "workflow_summary": {
-        "total_duration": 245.67,
-        "completed_steps": ["ddl_md_generation", "question_sql_generation", "sql_validation"],
-        "total_steps": 3
+        "total_duration": 285.34,
+        "completed_steps": ["ddl_md_generation", "question_sql_generation", "sql_validation", "training_data_load"],
+        "total_steps": 4
     },
     },
     "processing_results": {
     "processing_results": {
         "ddl_md_generation": {
         "ddl_md_generation": {
@@ -179,35 +210,37 @@ asyncio.run(run_complete_workflow())
             "valid_sql_count": 47,
             "valid_sql_count": 47,
             "invalid_sql_count": 3,
             "invalid_sql_count": 3,
             "duration": 32.99
             "duration": 32.99
+        },
+        "training_data_load": {
+            "total_records": 195,
+            "data_type_counts": {
+                "ddl": 8,
+                "documentation": 8,
+                "sql": 47
+            },
+            "duration": 39.67
         }
         }
     },
     },
     "final_outputs": {
     "final_outputs": {
-        "primary_output_file": "./output/qs_highway_db_20240123_143052_pair.json",
+        "primary_output_file": "./data_pipeline/training_data/qs_highway_db_20240123_143052_pair.json",
         "final_question_count": 47
         "final_question_count": 47
     },
     },
     "performance_metrics": {
     "performance_metrics": {
         "step1_duration": 89.23,
         "step1_duration": 89.23,
         "step2_duration": 123.45,
         "step2_duration": 123.45,
         "step3_duration": 32.99,
         "step3_duration": 32.99,
-        "total_duration": 245.67
+        "step4_duration": 39.67,
+        "total_duration": 285.34
     }
     }
 }
 }
 ```
 ```
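
A sketch of consuming this report from the programmatic example above (the `execute_complete_workflow()` method name is an assumption — the actual call is cut off by the hunk — and the error handling is illustrative only):

```python
async def run_and_summarize(orchestrator) -> None:
    """Run the workflow and print a short summary of the report structure shown above."""
    report = await orchestrator.execute_complete_workflow()  # assumed entry-point name
    if report["success"]:
        summary = report["workflow_summary"]
        outputs = report["final_outputs"]
        print(f"completed {len(summary['completed_steps'])}/{summary['total_steps']} steps "
              f"in {summary['total_duration']:.1f}s")
        print(f"primary output: {outputs['primary_output_file']} "
              f"({outputs['final_question_count']} questions)")
    else:
        # partial outputs stay on disk, so a failed step can be re-run individually
        print("workflow failed; inspect report['processing_results'] and the logs")
```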
 
 
-### 3.6 优势特性
-
-- **自动化流程**:一个命令完成所有步骤
-- **错误恢复**:失败时保留已完成步骤的输出
-- **灵活配置**:可选择跳过验证、禁用修复等
-- **详细报告**:提供完整的执行状态和性能指标
-- **向后兼容**:支持所有现有参数和配置
-
 ## 4. 生成DDL和MD文档(分步执行)
 ## 4. 生成DDL和MD文档(分步执行)
 
 
 ### 4.1 命令格式
 ### 4.1 命令格式
 
 
 ```bash
 ```bash
-python -m schema_tools \
+python -m data_pipeline.ddl_generation.ddl_md_generator \
   --db-connection <数据库连接字符串> \
   --db-connection <数据库连接字符串> \
   --table-list <表清单文件> \
   --table-list <表清单文件> \
   --business-context <业务上下文> \
   --business-context <业务上下文> \
@@ -226,7 +259,7 @@ python -m schema_tools \
 
 
 | 参数 | 说明 | 默认值 |
 | 参数 | 说明 | 默认值 |
 |------|------|--------|
 |------|------|--------|
-| `--output-dir` | 输出目录路径 | `training/generated_data` |
+| `--output-dir` | 输出目录路径 | `./data_pipeline/training_data/` |
 | `--pipeline` | 处理链类型 | `full` |
 | `--pipeline` | 处理链类型 | `full` |
 | `--max-concurrent` | 最大并发表数量 | `1` |
 | `--max-concurrent` | 最大并发表数量 | `1` |
 | `--verbose` | 启用详细日志 | `False` |
 | `--verbose` | 启用详细日志 | `False` |
@@ -234,75 +267,33 @@ python -m schema_tools \
 | `--no-filter-system-tables` | 禁用系统表过滤 | `False` |
 | `--no-filter-system-tables` | 禁用系统表过滤 | `False` |
 | `--check-permissions-only` | 仅检查数据库权限 | `False` |
 | `--check-permissions-only` | 仅检查数据库权限 | `False` |
 
 
-### 4.4 处理链类型
-
-- **full**: 完整处理链(默认)- 生成DDL和MD文档
-- **ddl_only**: 仅生成DDL文件
-- **analysis_only**: 仅分析不生成文件
-
-### 4.5 使用示例
+### 4.4 使用示例
 
 
 #### 基本使用
 #### 基本使用
 ```bash
 ```bash
-python -m schema_tools \
-  --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" \
-  --table-list ./schema_tools/tables.txt \
+python -m data_pipeline.ddl_generation.ddl_md_generator \
+  --db-connection "postgresql://postgres:postgres@localhost:5432/highway_db" \
+  --table-list ./tables.txt \
   --business-context "高速公路服务区管理系统"
   --business-context "高速公路服务区管理系统"
 ```
 ```
 
 
 #### 指定输出目录和启用详细日志
 #### 指定输出目录和启用详细日志
 ```bash
 ```bash
-python -m schema_tools \
-  --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" \
-  --table-list ./schema_tools/tables.txt \
+python -m data_pipeline.ddl_generation.ddl_md_generator \
+  --db-connection "postgresql://postgres:postgres@localhost:5432/highway_db" \
+  --table-list ./tables.txt \
   --business-context "高速公路服务区管理系统" \
   --business-context "高速公路服务区管理系统" \
-  --output-dir ./output \
+  --output-dir ./data_pipeline/training_data/ \
   --verbose
   --verbose
 ```
 ```
 
 
-#### 仅生成DDL文件
-```bash
-python -m schema_tools \
-  --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" \
-  --table-list ./schema_tools/tables.txt \
-  --business-context "高速公路服务区管理系统" \
-  --pipeline ddl_only
-```
-
 #### 权限检查
 #### 权限检查
 ```bash
 ```bash
-python -m schema_tools \
-  --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" \
+python -m data_pipeline.ddl_generation.ddl_md_generator \
+  --db-connection "postgresql://postgres:postgres@localhost:5432/highway_db" \
   --check-permissions-only
   --check-permissions-only
 ```
 ```
 
 
-### 4.6 表清单文件格式
-
-创建一个文本文件(如 `tables.txt`),每行一个表名:
-
-```text
-# 这是注释行
-public.bss_service_area
-public.bss_company
-bss_car_day_count  # 默认为public schema
-hr.employees       # 指定schema
-```
-
-### 4.7 输出文件
-
-生成的文件都放在输出目录下(不创建子目录):
-
-```
-output/
-├── bss_service_area.ddl              # DDL文件
-├── bss_service_area_detail.md        # MD文档
-├── bss_company.ddl
-├── bss_company_detail.md
-├── filename_mapping.txt              # 文件名映射
-└── logs/                            # 日志目录
-    └── schema_tools_20240123.log
-```
-
 ## 5. 生成Question-SQL训练数据(分步执行)
 ## 5. 生成Question-SQL训练数据(分步执行)
 
 
 ### 5.1 前置条件
 ### 5.1 前置条件
@@ -312,7 +303,7 @@ output/
 ### 5.2 命令格式
 ### 5.2 命令格式
 
 
 ```bash
 ```bash
-python -m schema_tools.qs_generator \
+python -m data_pipeline.qa_generation.qs_generator \
   --output-dir <输出目录> \
   --output-dir <输出目录> \
   --table-list <表清单文件> \
   --table-list <表清单文件> \
   --business-context <业务上下文> \
   --business-context <业务上下文> \
@@ -323,51 +314,45 @@ python -m schema_tools.qs_generator \
 
 
 | 参数 | 说明 | 示例 |
 | 参数 | 说明 | 示例 |
 |------|------|------|
 |------|------|------|
-| `--output-dir` | 包含DDL和MD文件的目录 | `./output` |
+| `--output-dir` | 包含DDL和MD文件的目录 | `./data_pipeline/training_data/` |
 | `--table-list` | 表清单文件路径(用于验证) | `./tables.txt` |
 | `--table-list` | 表清单文件路径(用于验证) | `./tables.txt` |
 | `--business-context` | 业务上下文描述 | `"高速公路服务区管理系统"` |
 | `--business-context` | 业务上下文描述 | `"高速公路服务区管理系统"` |
 
 
-### 5.4 可选参数说明
-
-| 参数 | 说明 | 默认值 |
-|------|------|--------|
-| `--db-name` | 数据库名称(用于文件命名) | `db` |
-| `--verbose` | 启用详细日志 | `False` |
-| `--log-file` | 日志文件路径 | `无` |
-
-### 5.5 使用示例
+### 5.4 使用示例
 
 
 #### 基本使用
 #### 基本使用
 ```bash
 ```bash
-python -m schema_tools.qs_generator \
-  --output-dir ./output \
-  --table-list ./schema_tools/tables.txt \
+python -m data_pipeline.qa_generation.qs_generator \
+  --output-dir ./data_pipeline/training_data/ \
+  --table-list ./tables.txt \
   --business-context "高速公路服务区管理系统" \
   --business-context "高速公路服务区管理系统" \
-  --db-name highway_db
+   highway_db
 ```
 ```
 
 
 #### 启用详细日志
 #### 启用详细日志
 ```bash
 ```bash
-python -m schema_tools.qs_generator \
-  --output-dir ./output \
-  --table-list ./schema_tools/tables.txt \
+python -m data_pipeline.qa_generation.qs_generator \
+  --output-dir ./data_pipeline/training_data/ \
+  --table-list ./tables.txt \
   --business-context "高速公路服务区管理系统" \
   --business-context "高速公路服务区管理系统" \
-  --db-name highway_db \
+   highway_db \
   --verbose
   --verbose
 ```
 ```
 
 
-### 5.6 SQL验证和修复
+## 6. SQL验证和修复
+
+### 6.1 命令格式
 
 
 生成Question-SQL对后,可以使用SQL验证功能。**注意:命令行使用时,默认启用LLM修复和文件修改功能**。
 生成Question-SQL对后,可以使用SQL验证功能。**注意:命令行使用时,默认启用LLM修复和文件修改功能**。
 
 
 ```bash
 ```bash
-python -m schema_tools.sql_validator \
+python -m data_pipeline.validators.sql_validate_cli \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --input-file ./qs_highway_db_20240123_143052_pair.json \
   --input-file ./qs_highway_db_20240123_143052_pair.json \
   --output-dir ./validation_reports
   --output-dir ./validation_reports
 ```
 ```
 
 
-#### SQL验证参数说明
+### 6.2 SQL验证参数说明
 
 
 | 参数 | 说明 | 默认值 |
 | 参数 | 说明 | 默认值 |
 |------|------|--------|
 |------|------|--------|
@@ -383,28 +368,28 @@ python -m schema_tools.sql_validator \
 | `--dry-run` | 仅解析文件不执行验证 | `False` |
 | `--dry-run` | 仅解析文件不执行验证 | `False` |
 | `--save-json` | 保存详细JSON报告 | `False` |
 | `--save-json` | 保存详细JSON报告 | `False` |
 
 
-#### SQL验证使用示例
+### 6.3 SQL验证使用示例
 
 
 ```bash
 ```bash
 # 基本验证(默认:启用LLM修复和文件修改)
 # 基本验证(默认:启用LLM修复和文件修改)
-python -m schema_tools.sql_validator \
+python -m data_pipeline.validators.sql_validate_cli \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --input-file ./data.json
   --input-file ./data.json
 
 
 # 仅生成报告,不修改文件
 # 仅生成报告,不修改文件
-python -m schema_tools.sql_validator \
+python -m data_pipeline.validators.sql_validate_cli \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --input-file ./data.json \
   --input-file ./data.json \
   --no-modify-file
   --no-modify-file
 
 
 # 启用文件修改,但禁用LLM修复(仅删除无效SQL)
 # 启用文件修改,但禁用LLM修复(仅删除无效SQL)
-python -m schema_tools.sql_validator \
+python -m data_pipeline.validators.sql_validate_cli \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --input-file ./data.json \
   --input-file ./data.json \
   --disable-llm-repair
   --disable-llm-repair
 
 
 # 性能调优参数
 # 性能调优参数
-python -m schema_tools.sql_validator \
+python -m data_pipeline.validators.sql_validate_cli \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --db-connection "postgresql://user:pass@localhost:5432/dbname" \
   --input-file ./data.json \
   --input-file ./data.json \
   --max-concurrent 10 \
   --max-concurrent 10 \
@@ -413,87 +398,84 @@ python -m schema_tools.sql_validator \
   --verbose
   --verbose
 ```
 ```
 
 
-### 5.7 执行流程
-
-1. **文件验证**:检查DDL和MD文件数量是否正确
-2. **表数量限制**:最多处理20个表(可配置)
-3. **主题提取**:LLM分析表结构,提取5个业务分析主题
-4. **Question-SQL生成**:每个主题生成10个问题
-5. **结果保存**:输出到 `qs_<db_name>_<时间戳>_pair.json`
-6. **SQL验证**:验证生成的SQL语句有效性
-7. **自动修复**:使用LLM修复无效的SQL语句(可选)
+## 7. 训练数据管理
 
 
-### 5.8 输出文件
+### 7.1 训练数据加载
 
 
+```bash
+# 使用训练数据管道
+python -m data_pipeline.trainer.run_training \
+  --data_path ./data_pipeline/training_data/
 ```
 ```
-output/
-├── qs_highway_db_20240123_143052_pair.json  # 最终结果
-├── qs_highway_db_20240123_143052_pair.json.backup  # 原文件备份(如果启用文件修改)
-├── qs_intermediate_20240123_143052.json     # 中间结果(成功后自动删除)
-├── qs_recovery_20240123_143052.json         # 恢复文件(异常中断时生成)
-├── sql_validation_20240123_150000_summary.txt  # SQL验证报告
-└── file_modifications_20240123_150000.log  # 文件修改日志(如果启用文件修改)
-```
 
 
-### 5.9 输出格式示例
-
-```json
-[
-  {
-    "question": "按服务区统计每日营收趋势(最近30天)?",
-    "sql": "SELECT service_name AS 服务区, oper_date AS 营业日期, SUM(pay_sum) AS 每日营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - INTERVAL '30 day' AND delete_ts IS NULL GROUP BY service_name, oper_date ORDER BY 营业日期 ASC;"
-  },
-  {
-    "question": "哪个服务区的车流量最大?",
-    "sql": "SELECT service_area_id, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id ORDER BY 总车流量 DESC LIMIT 1;"
-  }
-]
+### 7.2 支持的文件格式
+
+Data Pipeline 自动识别以下训练数据格式:
+
+- **`.ddl`** 文件 → `train_ddl_statements()`
+- **`.md/.markdown`** → `train_documentation_blocks()`
+- **`_pair.json/_pairs.json`** → `train_json_question_sql_pairs()`
+- **`_pair.sql/_pairs.sql`** → `train_formatted_question_sql_pairs()`
+- **`.sql`** (其他) → `train_sql_examples()`
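
The mapping above can be read as a simple suffix-based dispatch; a self-contained sketch (the returned names mirror the trainer functions listed above, and the check order matters because `_pair.sql` must match before the generic `.sql` rule):

```python
from pathlib import Path

def dispatch_training_file(path: Path) -> str:
    """Map a training-data file to the trainer function named in the list above."""
    name = path.name.lower()
    if name.endswith(".ddl"):
        return "train_ddl_statements"
    if name.endswith((".md", ".markdown")):
        return "train_documentation_blocks"
    if name.endswith(("_pair.json", "_pairs.json")):
        return "train_json_question_sql_pairs"
    if name.endswith(("_pair.sql", "_pairs.sql")):
        return "train_formatted_question_sql_pairs"
    if name.endswith(".sql"):
        return "train_sql_examples"
    return "skip"

for f in sorted(Path("./data_pipeline/training_data/").glob("*.*")):
    print(f.name, "->", dispatch_training_file(f))
```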
+
+### 7.3 训练数据目录结构
+
+```
+data_pipeline/training_data/
+├── bss_service_area.ddl              # DDL文件
+├── bss_service_area_detail.md        # MD文档
+├── bss_company.ddl
+├── bss_company_detail.md
+├── qs_highway_db_20240123_143052_pair.json  # Q&A训练数据
+├── filename_mapping.txt              # 文件名映射
+└── logs/                            # 日志目录
+    └── data_pipeline_20240123.log
 ```
 ```
 
 
-## 6. 配置详解
+## 8. 配置详解
 
 
-### 6.1 主要配置项
+### 8.1 主要配置项
 
 
-配置文件位于 `schema_tools/config.py`:
+配置文件位于 `data_pipeline/config.py`:
 
 
 ```python
 ```python
 # DDL/MD生成相关配置
 # DDL/MD生成相关配置
-"output_directory": "training/generated_data",     # 输出目录
-"create_subdirectories": False,                    # 不创建子目录
-"max_concurrent_tables": 1,                        # 最大并发数(避免LLM并发问题)
-"sample_data_limit": 20,                          # 数据采样量
-"large_table_threshold": 1000000,                 # 大表阈值(100万行)
-"filter_system_tables": True,                      # 过滤系统表
-"continue_on_error": True,                         # 错误后继续
+"output_directory": "./data_pipeline/training_data/",   # 输出目录
+"create_subdirectories": False,                        # 不创建子目录
+"max_concurrent_tables": 1,                            # 最大并发数(避免LLM并发问题)
+"sample_data_limit": 20,                              # 数据采样量
+"large_table_threshold": 1000000,                     # 大表阈值(100万行)
+"filter_system_tables": True,                          # 过滤系统表
+"continue_on_error": True,                             # 错误后继续
 
 
 # Question-SQL生成配置
 # Question-SQL生成配置
 "qs_generation": {
 "qs_generation": {
-    "max_tables": 20,                             # 最大表数量限制
-    "theme_count": 5,                             # 主题数量
-    "questions_per_theme": 10,                    # 每主题问题数
-    "max_concurrent_themes": 1,                   # 并行主题数(避免LLM并发问题)
-    "continue_on_theme_error": True,              # 主题失败继续
-    "save_intermediate": True,                    # 保存中间结果
+    "max_tables": 20,                                 # 最大表数量限制
+    "theme_count": 5,                                 # 主题数量
+    "questions_per_theme": 10,                        # 每主题问题数
+    "max_concurrent_themes": 1,                       # 并行主题数(避免LLM并发问题)
+    "continue_on_theme_error": True,                  # 主题失败继续
+    "save_intermediate": True,                        # 保存中间结果
 }
 }
 
 
 # SQL验证配置
 # SQL验证配置
 "sql_validation": {
 "sql_validation": {
-    "max_concurrent_validations": 5,              # 并发验证数
-    "validation_timeout": 30,                     # 单个验证超时(秒)
-    "batch_size": 10,                             # 批处理大小
-    "enable_sql_repair": False,                   # SQL修复功能(命令行覆盖为True)
-    "modify_original_file": False,                # 文件修改功能(命令行覆盖为True)
-    "readonly_mode": True,                        # 启用只读模式
+    "max_concurrent_validations": 5,                  # 并发验证数
+    "validation_timeout": 30,                         # 单个验证超时(秒)
+    "batch_size": 10,                                 # 批处理大小
+    "enable_sql_repair": False,                       # SQL修复功能(命令行覆盖为True)
+    "modify_original_file": False,                    # 文件修改功能(命令行覆盖为True)
+    "readonly_mode": True,                            # 启用只读模式
 }
 }
 ```
 ```
 
 
-### 6.2 修改配置
+### 8.2 修改配置
 
 
-可以通过编辑 `schema_tools/config.py` 文件来修改默认配置。
+可以通过编辑 `data_pipeline/config.py` 文件来修改默认配置。
 
 
-## 7. 常见问题
+## 9. 常见问题
 
 
-### 7.1 表数量超过20个怎么办?
+### 9.1 表数量超过20个怎么办?
 
 
 **错误信息**:
 **错误信息**:
 ```
 ```
@@ -504,7 +486,7 @@ output/
 1. 分批处理:将表清单分成多个文件,每个不超过20个表
 1. 分批处理:将表清单分成多个文件,每个不超过20个表
 2. 修改配置:在 `config.py` 中增加 `max_tables` 限制
 2. 修改配置:在 `config.py` 中增加 `max_tables` 限制
 
 
-### 7.2 DDL和MD文件数量不一致
+### 9.2 DDL和MD文件数量不一致
 
 
 **错误信息**:
 **错误信息**:
 ```
 ```
@@ -516,7 +498,7 @@ DDL文件数量(5)与表数量(6)不一致
 2. 查看日志文件找出失败的表
 2. 查看日志文件找出失败的表
 3. 重新运行DDL/MD生成
 3. 重新运行DDL/MD生成
 
 
-### 7.3 LLM调用失败
+### 9.3 LLM调用失败
 
 
 **可能原因**:
 **可能原因**:
 - 网络连接问题
 - 网络连接问题
@@ -528,7 +510,7 @@ DDL文件数量(5)与表数量(6)不一致
 2. 查看中间结果文件,从断点继续
 2. 查看中间结果文件,从断点继续
 3. 减少表数量或分批处理
 3. 减少表数量或分批处理
 
 
-### 7.4 权限不足
+### 9.4 权限不足
 
 
 **错误信息**:
 **错误信息**:
 ```
 ```
@@ -538,25 +520,9 @@ DDL文件数量(5)与表数量(6)不一致
 **解决方案**:
 **解决方案**:
 1. 使用 `--check-permissions-only` 检查权限
 1. 使用 `--check-permissions-only` 检查权限
 2. 确保数据库用户有SELECT权限
 2. 确保数据库用户有SELECT权限
-3. Schema Tools支持只读数据库
-
-### 7.5 如何处理大表?
-
-Schema Tools会自动检测大表(超过100万行)并使用智能采样策略:
-- **前N行采样**:使用 `SELECT * FROM table LIMIT N` 获取前N行
-- **随机中间采样**:使用 `TABLESAMPLE SYSTEM` 进行随机采样(失败时回退到OFFSET采样)
-- **后N行采样**:使用ROW_NUMBER窗口函数获取最后N行
-- 三段采样确保数据的代表性,有效处理大表的多样性
+3. Data Pipeline支持只读数据库
 
 
-**大表阈值**:默认为100万行(可在config.py中修改`large_table_threshold`)
-
-### 7.6 生成的SQL语法错误
-
-目前生成的SQL使用PostgreSQL语法。如果需要其他数据库语法:
-1. 在业务上下文中明确指定目标数据库
-2. 未来版本将支持MySQL等其他数据库
-
-### 7.7 工作流编排器相关问题
+### 9.5 工作流编排器相关问题
 
 
 **Q: 工作流中途失败,如何恢复?**
 **Q: 工作流中途失败,如何恢复?**
 A: 工作流编排器会保留已完成步骤的输出文件,可以手动从失败步骤开始重新执行。
 A: 工作流编排器会保留已完成步骤的输出文件,可以手动从失败步骤开始重新执行。
@@ -567,7 +533,7 @@ A: 使用 `--skip-validation` 跳过SQL验证,或使用分步执行方式调
 **Q: 工作流执行时间过长怎么办?**
 **Q: 工作流执行时间过长怎么办?**
 A: 可以通过减少表数量、调整并发参数、或分批处理来优化执行时间。
 A: 可以通过减少表数量、调整并发参数、或分批处理来优化执行时间。
 
 
-### 7.8 SQL验证器默认行为说明
+### 9.6 SQL验证器默认行为说明
 
 
 **重要**:SQL验证器的命令行模式与配置文件中的默认值不同:
 **重要**:SQL验证器的命令行模式与配置文件中的默认值不同:
 
 
@@ -577,25 +543,24 @@ A: 可以通过减少表数量、调整并发参数、或分批处理来优化
 
 
 如需禁用,请明确使用 `--disable-llm-repair` 或 `--no-modify-file` 参数。
 如需禁用,请明确使用 `--disable-llm-repair` 或 `--no-modify-file` 参数。
 
 
-## 8. 最佳实践
+## 10. 最佳实践
 
 
-### 8.1 推荐工作流程
+### 10.1 推荐工作流程
 
 
 **方式一:一键执行(推荐)**
 **方式一:一键执行(推荐)**
 ```bash
 ```bash
 # 完整工作流程,一个命令搞定
 # 完整工作流程,一个命令搞定
-python -m schema_tools.schema_workflow_orchestrator \
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@localhost:5432/highway_db" \
   --table-list tables.txt \
   --table-list tables.txt \
   --business-context "高速公路服务区管理系统" \
   --business-context "高速公路服务区管理系统" \
-  --db-name highway_db \
-  --output-dir ./output
+  --output-dir ./data_pipeline/training_data/
 ```
 ```
 
 
 **方式二:分步执行(调试时使用)**
 **方式二:分步执行(调试时使用)**
 1. **第一步**:生成DDL和MD文档
 1. **第一步**:生成DDL和MD文档
    ```bash
    ```bash
-   python -m schema_tools --db-connection "..." --table-list tables.txt --business-context "..." --output-dir ./output
+   python -m data_pipeline.ddl_generation.ddl_md_generator --db-connection "..." --table-list tables.txt --business-context "..." --output-dir ./data_pipeline/training_data/
    ```
    ```
 
 
 2. **第二步**:人工检查
 2. **第二步**:人工检查
@@ -605,35 +570,83 @@ python -m schema_tools.schema_workflow_orchestrator \
 
 
 3. **第三步**:生成Question-SQL
 3. **第三步**:生成Question-SQL
    ```bash
    ```bash
-   python -m schema_tools.qs_generator --output-dir ./output --table-list tables.txt --business-context "..."
+   python -m data_pipeline.qa_generation.qs_generator --output-dir ./data_pipeline/training_data/ --table-list tables.txt --business-context "..."
    ```
    ```
 
 
 4. **第四步**:验证SQL(可选)
 4. **第四步**:验证SQL(可选)
    ```bash
    ```bash
-   python -m schema_tools.sql_validator --db-connection "..." --input-file ./qs_xxx.json
+   python -m data_pipeline.validators.sql_validate_cli --db-connection "..." --input-file ./qs_xxx.json
+   ```
+
+5. **第五步**:训练数据加载
+   ```bash
+   python -m data_pipeline.trainer.run_training --data_path ./data_pipeline/training_data/
    ```
    ```
 
 
-### 8.2 表清单组织
+### 10.2 表清单组织
 
 
 - 按业务模块分组
 - 按业务模块分组
 - 每组不超过15-20个表
 - 每组不超过15-20个表
 - 使用注释说明每组的用途
 - 使用注释说明每组的用途
 
 
-### 8.3 业务上下文优化
+### 10.3 业务上下文优化
 
 
 - 提供准确的业务背景描述
 - 提供准确的业务背景描述
 - 包含行业特定术语
 - 包含行业特定术语
 - 说明主要业务流程
 - 说明主要业务流程
 
 
-### 8.4 输出文件管理
+### 10.4 输出文件管理
 
 
 - 定期备份生成的文件
 - 定期备份生成的文件
 - 使用版本控制管理DDL文件
 - 使用版本控制管理DDL文件
 - 保留中间结果用于调试
 - 保留中间结果用于调试
+- 统一使用 `./data_pipeline/training_data/` 目录
 
 
-### 8.5 工作流编排器最佳实践
+### 10.5 工作流编排器最佳实践
 
 
 - **首次使用**:建议启用详细日志(`--verbose`)观察执行过程
 - **首次使用**:建议启用详细日志(`--verbose`)观察执行过程
 - **生产环境**:使用默认参数,启用SQL验证和修复
 - **生产环境**:使用默认参数,启用SQL验证和修复
 - **调试阶段**:可以使用 `--skip-validation` 跳过验证步骤加快执行
 - **调试阶段**:可以使用 `--skip-validation` 跳过验证步骤加快执行
-- **质量要求高**:使用 `--no-modify-file` 仅生成报告,手动审查后再决定是否修改 
+- **质量要求高**:使用 `--no-modify-file` 仅生成报告,手动审查后再决定是否修改
+
+### 10.6 数据管道集成
+
+- **训练数据统一管理**:所有生成的数据都存储在 `data_pipeline/training_data/` 目录
+- **自动化训练**:可以定期运行工作流编排器更新训练数据
+- **版本控制**:建议对训练数据进行版本管理
+- **监控和报告**:利用详细的执行报告监控数据质量
+
+## 总结
+
+Data Pipeline 提供了完整的数据库逆向工程解决方案,从原始数据库schema到可用的训练数据,整个流程完全自动化。通过工作流编排器,用户可以一键完成所有步骤,也可以根据需要分步执行和调试。系统设计考虑了容错性、可扩展性和易用性,适合各种规模的数据处理需求。
+
+-------
+  一键执行(推荐):
+
+  # 完整的4步流程
+  python -m data_pipeline.schema_workflow \
+    --db-connection "postgresql://user:pass@localhost:5432/highway_db" \
+    --table-list tables.txt \
+    --business-context "高速公路服务区管理系统" \
+    --output-dir ./data_pipeline/training_data/
+
+  # 如需跳过训练数据加载
+  python -m data_pipeline.schema_workflow \
+    --db-connection "postgresql://user:pass@localhost:5432/test_db" \
+    --table-list tables.txt \
+    --business-context "测试系统" \
+    --skip-training-load
+
+  分步执行:
+
+  # 第1步:DDL/MD生成
+  python -m data_pipeline.ddl_generation.ddl_md_generator --db-connection "..." --table-list tables.txt --business-context "..."
+
+  # 第2步:Q&A生成
+  python -m data_pipeline.qa_generation.qs_generator --output-dir ./data_pipeline/training_data/ --table-list tables.txt --business-context "..."
+
+  # 第3步:SQL验证
+  python -m data_pipeline.validators.sql_validate_cli --db-connection "..." --input-file ./qs_xxx.json
+
+  # 第4步:训练数据加载
+  python -m data_pipeline.trainer.run_training --data_path ./data_pipeline/training_data/

+ 299 - 918
docs/Schema Tools 系统概要设计说明书.md

@@ -1,102 +1,121 @@
-# Schema Tools 系统概要设计说明书
+# Data Pipeline 系统概要设计说明书
 
 
 ## 1. 项目概述
 ## 1. 项目概述
 
 
 ### 1.1 项目目标
 ### 1.1 项目目标
 
 
-扩展现有的training模块,新增自动化数据库逆向工程功能,将PostgreSQL数据库结构转换为vanna.ai格式的训练数据(DDL和MD文档)
+**Data Pipeline** 是一个完整的数据库逆向工程和训练数据生成系统,将PostgreSQL数据库结构转换为vanna.ai格式的训练数据,包括DDL文档、Markdown说明、Question-SQL问答对以及训练数据加载
 
 
 ### 1.2 核心功能
 ### 1.2 核心功能
 
 
 - 自动连接PostgreSQL数据库
 - 自动连接PostgreSQL数据库
-- 批量处理表清单
+- 批量处理表清单,支持多Schema
 - 生成带中文注释的DDL文件
 - 生成带中文注释的DDL文件
 - 生成详细的MD格式说明文档
 - 生成详细的MD格式说明文档
 - LLM辅助的智能注释生成和枚举检测
 - LLM辅助的智能注释生成和枚举检测
-- 并发处理提高效率
+- Question-SQL训练数据生成
+- SQL验证和自动修复
+- 训练数据加载到向量数据库
+- 端到端工作流编排
 - 完整的错误处理和日志记录
 - 完整的错误处理和日志记录
-- **新增**: Question-SQL训练数据生成功能
 
 
 ## 2. 系统架构
 ## 2. 系统架构
 
 
 ### 2.1 整体架构
 ### 2.1 整体架构
 
 
 ```
 ```
-schema_tools/                    # 独立的schema工具模块
+data_pipeline/                   # 数据管道模块
 ├── __init__.py                 # 模块入口
 ├── __init__.py                 # 模块入口
-├── config.py                   # 配置文件
-├── training_data_agent.py      # 主AI Agent
-├── qs_agent.py                 # Question-SQL生成Agent (新增)
-├── qs_generator.py             # Question-SQL命令行入口 (新增)
-├── sql_validation_agent.py     # SQL验证Agent (新增)
-├── sql_validator.py            # SQL验证命令行入口 (新增)
-├── schema_workflow_orchestrator.py  # 端到端工作流编排器 (新增)
+├── config.py                   # 统一配置文件
+├── schema_workflow.py          # 端到端工作流编排器
+├── ddl_generation/             # DDL/MD生成模块
+│   ├── __init__.py
+│   ├── ddl_md_generator.py     # 命令行入口
+│   └── training_data_agent.py  # 主AI Agent
+├── qa_generation/              # Question-SQL生成模块
+│   ├── __init__.py
+│   ├── qs_agent.py            # Q&A生成Agent
+│   └── qs_generator.py        # Q&A命令行入口
+├── validators/                 # SQL验证模块
+│   ├── __init__.py
+│   ├── sql_validation_agent.py # SQL验证Agent
+│   ├── sql_validate_cli.py    # SQL验证命令行入口
+│   ├── sql_validator.py       # SQL验证器核心
+│   └── file_count_validator.py # 文件数量验证器
+├── trainer/                    # 训练数据管理模块
+│   ├── __init__.py
+│   ├── run_training.py        # 训练数据加载脚本
+│   └── vanna_trainer.py       # 训练器核心模块
 ├── tools/                      # Agent工具集
 ├── tools/                      # Agent工具集
 │   ├── __init__.py
 │   ├── __init__.py
-│   ├── base.py                 # 基础工具类和注册机制
-│   ├── database_inspector.py   # 数据库元数据检查工具
-│   ├── data_sampler.py         # 数据采样工具
-│   ├── comment_generator.py    # LLM注释生成工具
-│   ├── ddl_generator.py        # DDL格式生成工具
-│   └── doc_generator.py        # MD文档生成工具
-├── validators/                 # 验证器模块 (新增)
+│   ├── base.py                # 基础工具类和注册机制
+│   ├── database_inspector.py  # 数据库元数据检查工具
+│   ├── data_sampler.py        # 数据采样工具
+│   ├── comment_generator.py   # LLM注释生成工具
+│   ├── ddl_generator.py       # DDL格式生成工具
+│   └── doc_generator.py       # MD文档生成工具
+├── analyzers/                  # 分析器模块
 │   ├── __init__.py
 │   ├── __init__.py
-│   ├── file_count_validator.py # 文件数量验证器
-│   └── sql_validator.py        # SQL验证器核心模块
-├── analyzers/                  # 分析器模块 (新增)
+│   ├── md_analyzer.py         # MD文件分析
+│   └── theme_extractor.py     # 主题提取器
+├── utils/                      # 工具函数
 │   ├── __init__.py
 │   ├── __init__.py
-│   ├── md_analyzer.py          # MD文件分析器
-│   └── theme_extractor.py      # 主题提取器
+│   ├── table_parser.py        # 表清单解析器
+│   ├── logger.py              # 日志管理
+│   ├── file_manager.py        # 文件管理器
+│   ├── data_structures.py     # 数据结构定义
+│   ├── large_table_handler.py # 大表处理
+│   ├── permission_checker.py  # 权限检查器
+│   └── system_filter.py       # 系统表过滤器
 ├── prompts/                    # 提示词和业务词典
 ├── prompts/                    # 提示词和业务词典
-│   ├── table_comment_template.txt
-│   ├── field_comment_template.txt
-│   ├── enum_detection_template.txt
-│   ├── business_context.txt
-│   └── business_dictionary.txt
-├── utils/                      # 工具函数
 │   ├── __init__.py
 │   ├── __init__.py
-│   ├── table_parser.py         # 表清单解析器
-│   ├── logger.py               # 日志管理
-│   └── file_utils.py           # 文件操作工具
-└── __main__.py                 # 命令行入口
+│   └── business_dictionary.txt
+└── training_data/              # 训练数据存储目录
+    ├── *.ddl                   # DDL文件
+    ├── *_detail.md            # MD文档
+    ├── qs_*_pair.json         # 问答对文件
+    └── filename_mapping.txt    # 文件映射
 ```
 ```
 
 
 ### 2.2 核心组件
 ### 2.2 核心组件
 
 
-#### 2.2.1 主AI Agent
+#### 2.2.1 工作流编排器
+- **类名**: `SchemaWorkflowOrchestrator`
+- **文件**: `schema_workflow.py`
+- **职责**: 端到端执行完整的数据管道流程
+- **特点**: 统一管理4个步骤的执行,支持配置覆盖
 
 
+#### 2.2.2 DDL/MD生成Agent
 - **类名**: `SchemaTrainingDataAgent`
 - **类名**: `SchemaTrainingDataAgent`
-- **职责**: 整体流程控制、工具调度、并发管理
+- **文件**: `ddl_generation/training_data_agent.py`
+- **职责**: 数据库逆向工程,生成DDL和MD文档
 - **特点**: 单一Agent管理多工具的架构
 - **特点**: 单一Agent管理多工具的架构
 
 
-#### 2.2.2 Question-SQL生成Agent(新增)
-
+#### 2.2.3 Question-SQL生成Agent
 - **类名**: `QuestionSQLGenerationAgent`
 - **类名**: `QuestionSQLGenerationAgent`
-- **职责**: 生成Question-SQL训练数据对
-- **特点**: 独立的功能模块,可在DDL/MD生成后单独执行
-
-#### 2.2.3 SQL验证Agent(新增)
+- **文件**: `qa_generation/qs_agent.py`
+- **职责**: 基于DDL/MD生成Question-SQL训练数据对
+- **特点**: 独立的功能模块,支持主题分析和并发生成
 
 
+#### 2.2.4 SQL验证Agent
 - **类名**: `SQLValidationAgent`
 - **类名**: `SQLValidationAgent`
-- **职责**: 验证Question-SQL对中的SQL语句有效性,自动修复错误SQL
-- **特点**: 支持并发验证、LLM自动修复、原文件自动修改
-
-#### 2.2.4 工作流编排器(新增)
-
-- **类名**: `SchemaWorkflowOrchestrator`
-- **职责**: 端到端执行完整的Schema处理流程
-- **特点**: 统一管理DDL/MD生成、Question-SQL生成、SQL验证三个步骤
+- **文件**: `validators/sql_validation_agent.py`
+- **职责**: 验证Question-SQL对中的SQL语句有效性,支持LLM自动修复
+- **特点**: 支持并发验证、自动修复、原文件修改
 
 
-#### 2.2.5 Agent工具集(基于装饰器注册)
+#### 2.2.5 训练数据管理
+- **文件**: `trainer/run_training.py`
+- **职责**: 将生成的训练数据加载到向量数据库
+- **特点**: 支持多种文件格式,自动识别和分类
 
 
+#### 2.2.6 Agent工具集(基于装饰器注册)
 1. **DatabaseInspectorTool**: 获取表元数据
 1. **DatabaseInspectorTool**: 获取表元数据
 2. **DataSamplerTool**: 采样表数据
 2. **DataSamplerTool**: 采样表数据
 3. **CommentGeneratorTool**: LLM生成注释和枚举建议
 3. **CommentGeneratorTool**: LLM生成注释和枚举建议
 4. **DDLGeneratorTool**: 生成DDL格式文件
 4. **DDLGeneratorTool**: 生成DDL格式文件
 5. **DocGeneratorTool**: 生成MD文档
 5. **DocGeneratorTool**: 生成MD文档
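
The decorator-based registration these tools share looks roughly like the sketch below (pattern taken from the registration example in the earlier revision of this document; `BaseTool`/`ToolRegistry` live in `data_pipeline/tools/base.py`, and the `execute()` body here is only a placeholder):

```python
from data_pipeline.tools.base import BaseTool, ToolRegistry

@ToolRegistry.register("database_inspector")
class DatabaseInspectorTool(BaseTool):
    needs_llm = False  # tools that need an LLM get the shared vanna instance injected

    async def execute(self, input_data, context):
        ...  # placeholder: return a ProcessingResult in the real implementation

# consumers fetch a configured instance by name, as validators/sql_validator.py does:
inspector = ToolRegistry.get_tool("database_inspector",
                                  db_connection="postgresql://user:pass@localhost:5432/highway_db")
```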
 
 
-#### 2.2.6 验证器和分析器(新增)
-
+#### 2.2.7 验证器和分析器
 1. **FileCountValidator**: 验证DDL和MD文件数量
 1. **FileCountValidator**: 验证DDL和MD文件数量
 2. **SQLValidator**: 验证SQL语句有效性,支持LLM自动修复
 2. **SQLValidator**: 验证SQL语句有效性,支持LLM自动修复
 3. **MDFileAnalyzer**: 读取和分析MD文件内容
 3. **MDFileAnalyzer**: 读取和分析MD文件内容
@@ -104,7 +123,32 @@ schema_tools/                    # 独立的schema工具模块
 
 
 ## 3. 详细设计
 ## 3. 详细设计
 
 
-### 3.1 DDL/MD生成流程
+### 3.1 完整工作流程(4步骤)
+
+```mermaid
+graph TD
+    A[开始] --> B[步骤1: DDL/MD生成]
+    B --> C{成功?}
+    C -->|否| D[生成错误报告]
+    C -->|是| E[步骤2: Question-SQL生成]
+    E --> F{成功?}
+    F -->|否| D
+    F -->|是| G{启用SQL验证?}
+    G -->|否| H[步骤4: 训练数据加载]
+    G -->|是| I[步骤3: SQL验证和修复]
+    I --> J{成功?}
+    J -->|否| D
+    J -->|是| K{启用训练数据加载?}
+    K -->|否| L[生成最终报告]
+    K -->|是| H
+    H --> M{成功?}
+    M -->|否| D
+    M -->|是| L
+    L --> N[完成]
+    D --> N
+```
+
+### 3.2 DDL/MD生成流程
 
 
 ```mermaid
 ```mermaid
 graph TD
 graph TD
@@ -119,7 +163,7 @@ graph TD
     I --> J[完成]
     I --> J[完成]
 ```
 ```
 
 
-### 3.2 Question-SQL生成流程(新增)
+### 3.3 Question-SQL生成流程
 
 
 ```mermaid
 ```mermaid
 graph TD
 graph TD
@@ -134,7 +178,7 @@ graph TD
     I --> J[完成]
     I --> J[完成]
 ```
 ```
 
 
-### 3.3 SQL验证和修复流程(新增)
+### 3.4 SQL验证和修复流程
 
 
 ```mermaid
 ```mermaid
 graph TD
 graph TD
@@ -157,222 +201,40 @@ graph TD
     H --> F
     H --> F
 ```
 ```
 
 
-### 3.4 端到端工作流编排流程(新增)
+### 3.5 训练数据加载流程
 
 
 ```mermaid
 ```mermaid
 graph TD
 graph TD
-    A[开始] --> B[步骤1: DDL/MD生成]
-    B --> C{成功?}
-    C -->|否| D[生成错误报告]
-    C -->|是| E[步骤2: Question-SQL生成]
-    E --> F{成功?}
-    F -->|否| D
-    F -->|是| G{启用SQL验证?}
-    G -->|否| H[生成最终报告]
-    G -->|是| I[步骤3: SQL验证和修复]
-    I --> J{成功?}
-    J -->|否| D
-    J -->|是| H
-    H --> K[完成]
-    D --> K
-```
-
-### 3.5 模块间接口规范
-
-#### 3.5.1 统一数据结构定义
-
-```python
-from dataclasses import dataclass
-from typing import List, Dict, Optional, Any
-
-@dataclass
-class FieldInfo:
-    """字段信息标准结构"""
-    name: str
-    type: str
-    nullable: bool
-    default_value: Optional[str]
-    comment: Optional[str]
-    is_primary_key: bool = False
-    is_foreign_key: bool = False
-    is_enum: bool = False
-    enum_values: Optional[List[str]] = None
-
-@dataclass
-class TableMetadata:
-    """表元数据标准结构"""
-    schema_name: str
-    table_name: str
-    full_name: str  # schema.table_name
-    comment: Optional[str]
-    fields: List[FieldInfo]
-    sample_data: List[Dict[str, Any]]
-    row_count: Optional[int]
-
-@dataclass
-class ProcessingResult:
-    """工具处理结果标准结构"""
-    success: bool
-    data: Optional[Any]
-    error_message: Optional[str]
-    warnings: List[str] = None
-```
-
-#### 3.5.2 工具接口规范
-
-```python
-class BaseTool:
-    async def execute(self, input_data: Dict[str, Any], context: Dict[str, Any]) -> ProcessingResult:
-        """
-        统一工具执行接口
-        Args:
-            input_data: 输入数据字典
-            context: 全局上下文
-        Returns:
-            ProcessingResult: 标准化处理结果
-        """
-        pass
-```
-
-### 3.6 可插拔处理链设计
-
-#### 3.6.1 Pipeline配置
-
-```python
-# 支持灵活的处理链配置
-PROCESSING_PIPELINES = {
-    "full": [  # 完整处理链
-        "database_inspector",
-        "data_sampler", 
-        "comment_generator",
-        "ddl_generator",
-        "doc_generator"
-    ],
-    "ddl_only": [  # 仅生成DDL
-        "database_inspector",
-        "data_sampler",
-        "comment_generator", 
-        "ddl_generator"
-    ],
-    "analysis_only": [  # 仅分析不生成文件
-        "database_inspector",
-        "data_sampler",
-        "comment_generator"
-    ]
-}
-```
-
-#### 3.3.2 Pipeline执行器
-
-```python
-class PipelineExecutor:
-    def __init__(self, pipeline_name: str):
-        self.steps = PROCESSING_PIPELINES[pipeline_name]
-        self.tools = {name: ToolRegistry.get_tool(name) for name in self.steps}
-    
-    async def execute(self, table_metadata: TableMetadata, context: Dict) -> Dict[str, ProcessingResult]:
-        """按配置的处理链执行工具"""
-        results = {}
-        current_data = {"table_metadata": table_metadata}
-        
-        for step_name in self.steps:
-            result = await self.tools[step_name].execute(current_data, context)
-            results[step_name] = result
-            if result.success:
-                current_data[f"{step_name}_result"] = result.data
-            else:
-                # 根据配置决定是否继续
-                if not SCHEMA_TOOLS_CONFIG["continue_on_error"]:
-                    break
-        
-        return results
-```
-
-### 3.4 并发处理策略
-
-#### 3.4.1 表级并发
-
-- 最大并发表数: 可配置(默认1个,避免LLM并发问题)
-- 使用asyncio.Semaphore控制并发数
-- 单表内工具串行执行
-
-#### 3.4.2 工具执行顺序
-
-1. `DatabaseInspectorTool` → 获取表结构元数据
-2. `DataSamplerTool` → 采样20条数据
-3. `CommentGeneratorTool` → LLM生成注释和枚举建议
-4. 枚举字段验证 → SELECT DISTINCT确认枚举值
-5. `DDLGeneratorTool` → 生成DDL文件
-6. `DocGeneratorTool` → 基于DDL结果生成MD文档
-
-### 3.3 LLM集成设计
-
-#### 3.3.1 LLM实例复用
-
-```python
-# 复用现有vanna实例,支持qwen/deepseek/ollama
-from core.vanna_llm_factory import create_vanna_instance
-vn = create_vanna_instance()
-```
-
-#### 3.3.2 智能注释生成
-
-**输入**: 表元数据 + 20条样例数据 + 业务上下文 **输出**: 中文注释 + 枚举字段建议 **特点**: 一次LLM调用完成注释生成和枚举检测
-
-#### 3.3.3 枚举检测策略
-
-1. **规则预筛选**: ENUM类型、VARCHAR+关键词
-2. **LLM智能判断**: 基于字段名、注释、样例数据
-3. **数据验证**: SELECT DISTINCT确认实际枚举值
-
-### 3.4 工具注册机制
-
-#### 3.4.1 装饰器注册
-
-```python
-@ToolRegistry.register("database_inspector")
-class DatabaseInspectorTool(BaseTool):
-    needs_llm = False  # 是否需要LLM实例
-```
-
-#### 3.4.2 自动依赖注入
-
-- 自动为需要LLM的工具注入vanna实例
-- 确保所有工具使用相同的LLM配置
-
-### 3.5 数据流设计
-
-#### 3.5.1 工具间数据传递
-
-- 方案B: 工具间直接传递数据
-- DDLGeneratorTool的结果作为DocGeneratorTool的输入
-- 通过input_data字典传递中间结果
-
-#### 3.5.2 上下文管理
-
-```python
-context = {
-    'business_context': '高速公路服务区管理系统',
-    'table_name': 'bss_service_area',
-    'output_dir': 'training/generated_data',
-    'vn': vanna_instance
-}
+    A[开始] --> B[扫描训练数据目录]
+    B --> C[识别文件类型]
+    C --> D[处理DDL文件]
+    C --> E[处理MD文件]
+    C --> F[处理JSON问答对]
+    C --> G[处理SQL文件]
+    D --> H[加载到向量数据库]
+    E --> H
+    F --> H
+    G --> H
+    H --> I[验证加载结果]
+    I --> J[生成统计报告]
+    J --> K[完成]
 ```
 ```
 
 
 ## 4. 配置设计
 ## 4. 配置设计
 
 
-### 4.1 配置文件结构
+### 4.1 统一配置架构
+
+所有data_pipeline相关配置统一在 `data_pipeline/config.py` 中:
 
 
 ```python
 ```python
-# schema_tools/config.py
 SCHEMA_TOOLS_CONFIG = {
 SCHEMA_TOOLS_CONFIG = {
     # 核心配置
     # 核心配置
     "default_db_connection": None,
     "default_db_connection": None,
     "default_business_context": "数据库管理系统", 
     "default_business_context": "数据库管理系统", 
-    "output_directory": "training/generated_data",
+    "output_directory": "./data_pipeline/training_data/",
     
     
     # 处理链配置
     # 处理链配置
-    "default_pipeline": "full",  # full, ddl_only, analysis_only
+    "default_pipeline": "full",
     "available_pipelines": {
     "available_pipelines": {
         "full": ["database_inspector", "data_sampler", "comment_generator", "ddl_generator", "doc_generator"],
         "full": ["database_inspector", "data_sampler", "comment_generator", "ddl_generator", "doc_generator"],
         "ddl_only": ["database_inspector", "data_sampler", "comment_generator", "ddl_generator"],
         "ddl_only": ["database_inspector", "data_sampler", "comment_generator", "ddl_generator"],
@@ -383,75 +245,30 @@ SCHEMA_TOOLS_CONFIG = {
     "sample_data_limit": 20,
     "sample_data_limit": 20,
     "enum_detection_sample_limit": 5000,
     "enum_detection_sample_limit": 5000,
     "enum_max_distinct_values": 20,
     "enum_max_distinct_values": 20,
-    "enum_varchar_keywords": ["性别", "状态", "类型", "级别", "方向", "品类"],
-    "large_table_threshold": 1000000,  # 大表阈值
+    "large_table_threshold": 1000000,
     
     
     # 并发配置
     # 并发配置
-    "max_concurrent_tables": 1,  # 建议保持1,避免LLM并发调用问题
-    
-    # LLM配置
-    "use_app_config_llm": True,
-    "comment_generation_timeout": 30,
-    "max_llm_retries": 3,
-    
-    # 系统表过滤配置
-    "filter_system_tables": True,
-    "custom_system_prefixes": [],  # 用户自定义的系统表前缀
-    "custom_system_schemas": [],   # 用户自定义的系统schema
-    
-    # 权限与安全配置
-    "check_permissions": True,
-    "require_select_permission": True,
-    "allow_readonly_database": True,
-    
-    # 错误处理配置
-    "continue_on_error": True,
-    "max_table_failures": 5,
-    "skip_large_tables": False,  # 是否跳过超大表
-    "max_table_size": 10000000,  # 最大表行数限制
+    "max_concurrent_tables": 1,  # 避免LLM并发调用问题
     
     
-    # 文件配置
-    "ddl_file_suffix": ".ddl",
-    "doc_file_suffix": "_detail.md",
-    "log_file": "schema_tools.log",
-    "create_subdirectories": False,  # 不创建子目录,所有文件放在output目录下
-    
-    # 输出格式配置
-    "include_sample_data_in_comments": True,  # 注释中是否包含示例数据
-    "max_comment_length": 500,  # 最大注释长度
-    "include_field_statistics": True,  # 是否包含字段统计信息
-    
-    # Question-SQL生成配置(新增)
+    # Question-SQL生成配置
     "qs_generation": {
     "qs_generation": {
-        "max_tables": 20,                    # 最大表数量限制
-        "theme_count": 5,                    # LLM生成的主题数量
-        "questions_per_theme": 10,           # 每个主题生成的问题数
-        "max_concurrent_themes": 1,          # 并行处理的主题数量(建议保持1)
-        "continue_on_theme_error": True,     # 主题生成失败是否继续
-        "save_intermediate": True,           # 是否保存中间结果
-        "output_file_prefix": "qs",          # 输出文件前缀
+        "max_tables": 20,
+        "theme_count": 5,
+        "questions_per_theme": 10,
+        "max_concurrent_themes": 1,
+        "continue_on_theme_error": True,
+        "save_intermediate": True,
+        "output_file_prefix": "qs",
     },
     },
     
     
-    # SQL验证配置(新增)
+    # SQL验证配置
     "sql_validation": {
     "sql_validation": {
-        "reuse_connection_pool": True,       # 复用现有连接池
-        "max_concurrent_validations": 5,     # 并发验证数
-        "validation_timeout": 30,            # 单个验证超时(秒)
-        "batch_size": 10,                    # 批处理大小
-        "continue_on_error": True,           # 错误时是否继续
-        "save_validation_report": True,      # 保存验证报告
-        "save_detailed_json_report": False,  # 保存详细JSON报告(可选)
-        "readonly_mode": True,               # 启用只读模式
-        "max_retry_count": 2,                # 验证失败重试次数
-        "report_file_prefix": "sql_validation",  # 报告文件前缀
-        
-        # SQL修复配置
-        "enable_sql_repair": False,          # 启用SQL修复功能(默认禁用)
-        "llm_repair_timeout": 120,           # LLM修复超时时间(秒)
-        "repair_batch_size": 2,              # 修复批处理大小
-        
-        # 文件修改配置
-        "modify_original_file": False,       # 是否修改原始JSON文件(默认禁用)
+        "max_concurrent_validations": 5,
+        "validation_timeout": 30,
+        "batch_size": 10,
+        "enable_sql_repair": False,  # 默认禁用
+        "modify_original_file": False,  # 默认禁用
+        "readonly_mode": True,
     }
     }
 }
 }
 ```
 ```
@@ -459,328 +276,103 @@ SCHEMA_TOOLS_CONFIG = {
 ### 4.2 配置优先级
 ### 4.2 配置优先级
 
 
 ```
 ```
-命令行参数 > schema_tools/config.py > app_config.py默认值
+命令行参数 > data_pipeline/config.py > 默认值
 ```
 ```
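
In code the precedence amounts to copying the module config and overwriting it with any explicitly supplied CLI values; an illustrative sketch (the real `apply_config_overrides()` in `validators/sql_validate_cli.py` writes into the shared config rather than a copy):

```python
from data_pipeline.config import SCHEMA_TOOLS_CONFIG

def resolve_sql_validation_config(args) -> dict:
    """Apply command-line overrides on top of data_pipeline/config.py defaults."""
    cfg = SCHEMA_TOOLS_CONFIG['sql_validation'].copy()
    if getattr(args, "max_concurrent", None) is not None:
        cfg['max_concurrent_validations'] = args.max_concurrent
    if getattr(args, "disable_llm_repair", False):
        cfg['enable_sql_repair'] = False
    return cfg
```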
 
 
-### 4.3 多Schema场景处理
-
-#### 4.3.1 文件命名防冲突机制
-
-```python
-def generate_safe_filename(schema_name: str, table_name: str, suffix: str) -> str:
-    """
-    生成安全的文件名,避免冲突
-    
-    规则:
-    - public.table_name → table_name.ddl
-    - schema.table_name → schema__table_name.ddl  
-    - 特殊字符替换: . → __, - → _, 空格 → _
-    """
-    if schema_name.lower() == 'public':
-        safe_name = table_name
-    else:
-        safe_name = f"{schema_name}__{table_name}"
-    
-    # 替换特殊字符
-    safe_name = safe_name.replace('.', '__').replace('-', '_').replace(' ', '_')
-    
-    return f"{safe_name}{suffix}"
+### 4.3 与全局配置的分离
 
 
-# 示例:
-# public.users → users.ddl
-# hr.employees → hr__employees.ddl
-# sales.order-items → sales__order_items.ddl
-```
+- **系统级配置** (`app_config.py`): LLM配置、向量数据库配置、全局开关
+- **模块级配置** (`data_pipeline/config.py`): 数据管道专用配置
+- **清晰分工**: 避免配置冲突和重复
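
In practice the split simply means the two layers are imported from different modules (a sketch; `app_config`'s contents are not part of this commit):

```python
import app_config                                      # system level: LLM / vector-store settings
from data_pipeline.config import SCHEMA_TOOLS_CONFIG   # module level: pipeline-specific settings

output_dir = SCHEMA_TOOLS_CONFIG["output_directory"]   # "./data_pipeline/training_data/"
```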
 
 
-#### 4.3.2 输出目录结构
+## 5. 使用方式
 
 
-```
-training/generated_data/
-├── users.ddl                        # public.users
-├── hr__employees.ddl                # hr.employees  
-├── sales__order_items.ddl           # sales.order-items
-├── users_detail.md                  # 对应的MD文档
-├── hr__employees_detail.md
-├── sales__order_items_detail.md
-├── qs_highway_db_20240101_pair.json # Question-SQL对文件
-├── metadata.txt                     # 主题元数据
-├── sql_validation_20240101_summary.txt  # SQL验证报告
-└── logs/
-    └── schema_tools.log
-```
+### 5.1 端到端工作流(推荐)
 
 
-**注意**: 配置已更新为不创建ddl/和docs/子目录,所有文件直接放在output目录下。
+```bash
+# 完整4步流程
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@localhost:5432/database_name" \
+  --table-list tables.txt \
+  --business-context "高速公路服务区管理系统" \
+  --output-dir ./data_pipeline/training_data/
 
 
-#### 4.3.3 重名检测与处理
+# 跳过SQL验证
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@localhost:5432/database_name" \
+  --table-list tables.txt \
+  --business-context "电商系统" \
+  --skip-validation
 
 
-```python
-class FileNameManager:
-    def __init__(self, output_dir: str):
-        self.output_dir = output_dir
-        self.used_names = set()
-    
-    def get_unique_filename(self, schema_name: str, table_name: str, suffix: str) -> str:
-        """确保文件名唯一性"""
-        base_name = generate_safe_filename(schema_name, table_name, suffix)
-        
-        if base_name not in self.used_names:
-            self.used_names.add(base_name)
-            return base_name
-        
-        # 如果重名,添加数字后缀
-        counter = 1
-        while True:
-            name_parts = base_name.rsplit('.', 1)
-            if len(name_parts) == 2:
-                unique_name = f"{name_parts[0]}_{counter}.{name_parts[1]}"
-            else:
-                unique_name = f"{base_name}_{counter}"
-            
-            if unique_name not in self.used_names:
-                self.used_names.add(unique_name)
-                return unique_name
-            counter += 1
+# 跳过训练数据加载
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@localhost:5432/database_name" \
+  --table-list tables.txt \
+  --business-context "管理系统" \
+  --skip-training-load
 ```
 
 
-### 4.4 边界情况处理
+### 5.2 分步执行
 
 
-#### 4.4.1 系统表过滤
+```bash
+# 第1步:DDL/MD生成
+python -m data_pipeline.ddl_generation.ddl_md_generator \
+  --db-connection "postgresql://user:pass@localhost:5432/database" \
+  --table-list tables.txt \
+  --business-context "业务描述"
 
 
-```python
-class SystemTableFilter:
-    """系统表过滤器"""
-    
-    # PostgreSQL系统表前缀
-    PG_SYSTEM_PREFIXES = [
-        'pg_', 'information_schema', 'sql_', 'cardinal_number',
-        'character_data', 'sql_identifier', 'time_stamp', 'yes_or_no'
-    ]
-    
-    # 系统schema
-    SYSTEM_SCHEMAS = [
-        'information_schema', 'pg_catalog', 'pg_toast', 
-        'pg_temp_1', 'pg_toast_temp_1'
-    ]
-    
-    @classmethod
-    def is_system_table(cls, schema_name: str, table_name: str) -> bool:
-        """判断是否为系统表"""
-        # 检查系统schema
-        if schema_name.lower() in cls.SYSTEM_SCHEMAS:
-            return True
-        
-        # 检查表名前缀
-        table_lower = table_name.lower()
-        return any(table_lower.startswith(prefix) for prefix in cls.PG_SYSTEM_PREFIXES)
-    
-    @classmethod
-    def filter_user_tables(cls, table_list: List[str]) -> List[str]:
-        """过滤出用户表"""
-        user_tables = []
-        filtered_count = 0
-        
-        for table_spec in table_list:
-            if '.' in table_spec:
-                schema, table = table_spec.split('.', 1)
-            else:
-                schema, table = 'public', table_spec
-            
-            if not cls.is_system_table(schema, table):
-                user_tables.append(table_spec)
-            else:
-                filtered_count += 1
-                logging.info(f"过滤系统表: {table_spec}")
-        
-        logging.info(f"过滤了 {filtered_count} 个系统表,保留 {len(user_tables)} 个用户表")
-        return user_tables
-```
+# 第2步:Q&A生成
+python -m data_pipeline.qa_generation.qs_generator \
+  --output-dir ./data_pipeline/training_data/ \
+  --table-list tables.txt \
+  --business-context "业务描述"
 
 
-#### 4.4.2 数据库权限检查
+# 第3步:SQL验证
+python -m data_pipeline.validators.sql_validate_cli \
+  --db-connection "postgresql://user:pass@localhost:5432/database" \
+  --input-file ./qs_xxx.json
 
 
-```python
-class DatabasePermissionChecker:
-    """数据库权限检查器"""
-    
-    def __init__(self, db_inspector):
-        self.db_inspector = db_inspector
-    
-    async def check_permissions(self) -> Dict[str, bool]:
-        """检查数据库权限"""
-        permissions = {
-            'connect': False,
-            'select_metadata': False,
-            'select_data': False,
-            'is_readonly': False
-        }
-        
-        try:
-            # 检查连接权限
-            await self.db_inspector.test_connection()
-            permissions['connect'] = True
-            
-            # 检查元数据查询权限
-            await self.db_inspector.get_schemas()
-            permissions['select_metadata'] = True
-            
-            # 检查数据查询权限(测试一个已知表)
-            try:
-                await self.db_inspector.execute_query("SELECT 1 LIMIT 1")
-                permissions['select_data'] = True
-            except Exception as e:
-                logging.warning(f"数据查询权限受限: {e}")
-            
-            # 检查是否为只读库(尝试创建临时表)
-            try:
-                await self.db_inspector.execute_query("CREATE TEMP TABLE test_write_permission (id int)")
-                await self.db_inspector.execute_query("DROP TABLE test_write_permission")
-            except Exception:
-                permissions['is_readonly'] = True
-                logging.info("检测到只读数据库,这是正常的")
-            
-        except Exception as e:
-            logging.error(f"权限检查失败: {e}")
-            
-        return permissions
+# 第4步:训练数据加载
+python -m data_pipeline.trainer.run_training \
+  --data_path ./data_pipeline/training_data/
 ```
 
 
-#### 4.4.3 大表处理策略
+### 5.3 编程方式
 
 
 ```python
-class LargeTableHandler:
-    """大表处理策略"""
+import asyncio
+from data_pipeline.schema_workflow import SchemaWorkflowOrchestrator
+
+async def run_complete_workflow():
+    orchestrator = SchemaWorkflowOrchestrator(
+        db_connection="postgresql://user:pass@localhost:5432/database_name",
+        table_list_file="tables.txt",
+        business_context="高速公路服务区管理系统",
+        output_dir="./data_pipeline/training_data/",
+        enable_sql_validation=True,
+        enable_llm_repair=True,
+        modify_original_file=True,
+        enable_training_data_load=True
+    )
     
     
-    @staticmethod
-    async def get_smart_sample(db_inspector, table_name: str, limit: int = 20) -> List[Dict]:
-        """智能采样策略"""
-        # 1. 先检查表大小
-        row_count = await db_inspector.get_table_row_count(table_name)
-        
-        if row_count <= limit * 10:  # 小表,直接采样
-            return await db_inspector.sample_table_data(table_name, limit)
-        
-        # 2. 大表使用分层采样
-        logging.info(f"表 {table_name} 有 {row_count} 行,使用智能采样")
-        
-        # 前N行 + 随机中间N行 + 后N行
-        samples_per_section = limit // 3
-        
-        samples = []
-        
-        # 前N行
-        front_samples = await db_inspector.execute_query(
-            f"SELECT * FROM {table_name} LIMIT {samples_per_section}"
-        )
-        samples.extend(front_samples)
-        
-        # 随机中间N行
-        if row_count > samples_per_section * 2:
-            middle_samples = await db_inspector.execute_query(f"""
-                SELECT * FROM {table_name} 
-                TABLESAMPLE SYSTEM(1) 
-                LIMIT {samples_per_section}
-            """)
-            samples.extend(middle_samples)
-        
-        # 后N行
-        remaining = limit - len(samples)
-        if remaining > 0:
-            tail_samples = await db_inspector.execute_query(f"""
-                SELECT * FROM (
-                    SELECT * FROM {table_name} 
-                    ORDER BY CTID DESC 
-                    LIMIT {remaining}
-                ) sub ORDER BY CTID
-            """)
-            samples.extend(tail_samples)
-        
-        return samples[:limit]
-```
+    report = await orchestrator.execute_complete_workflow()
+    return report
 
 
-#### 4.4.4 异常表结构处理
-
-```python
-class TableStructureValidator:
-    """表结构验证器"""
-    
-    @staticmethod
-    def validate_table_structure(table_metadata: TableMetadata) -> List[str]:
-        """验证表结构,返回警告信息"""
-        warnings = []
-        
-        # 检查是否有字段
-        if not table_metadata.fields:
-            warnings.append("表没有字段定义")
-            return warnings
-        
-        # 检查是否有主键
-        has_primary_key = any(field.is_primary_key for field in table_metadata.fields)
-        if not has_primary_key:
-            warnings.append("表没有主键")
-        
-        # 检查字段名是否合规
-        for field in table_metadata.fields:
-            if not field.name or field.name.strip() == '':
-                warnings.append(f"发现空字段名")
-            
-            # 检查特殊字符
-            if any(char in field.name for char in [' ', '-', '.']):
-                warnings.append(f"字段名包含特殊字符: {field.name}")
-        
-        # 检查超大字段数量
-        if len(table_metadata.fields) > 100:
-            warnings.append(f"表字段数量过多: {len(table_metadata.fields)} 个字段")
-        
-        return warnings
+# 运行工作流程
+asyncio.run(run_complete_workflow())
 ```
 
 
-## 5. 错误处理与日志
-
-### 5.1 错误处理策略
-
-#### 5.1.1 表级错误处理
-
-- 某表处理失败 → 记录错误,继续下一表
-- 失败表数超过阈值 → 警告但继续执行
-
-#### 5.1.2 工具级错误处理
-
-- `DatabaseInspectorTool`失败 → 跳过该表
-- `CommentGeneratorTool`失败 → 使用原始注释
-- `EnumDetectorTool`失败 → 跳过枚举检测
-- 所有失败都记录到日志
-
-#### 5.1.3 LLM调用错误处理
-
-- 超时/失败 → 自动重试(最大3次)
-- 重试失败 → 使用原始注释或默认注释
-
-### 5.2 日志设计
-
-#### 5.2.1 日志级别
-
-- **INFO**: 正常处理流程
-- **WARNING**: 可恢复错误(使用默认值、重试成功)
-- **ERROR**: 影响结果的错误(表处理失败、工具异常)
-
-#### 5.2.2 日志输出
-
-- **控制台**: 进度信息和关键错误
-- **文件**: 详细执行日志(schema_tools.log)
-- **格式**: 文本格式,包含时间戳、级别、消息
-
 ## 6. 文件格式设计
 
 
 ### 6.1 DDL文件格式
 
 
 ```sql
--- 中文名: 服务区基础信息表
--- 描述: 记录高速公路服务区的基础属性,包括服务区编码、名称、方向、公司归属、地理位置、服务类型和状态,是业务分析与服务区定位的核心表。
-create table bss_service_area (
-  id varchar(32) not null,              -- 服务区唯一标识(主键,UUID)
-  version integer not null,             -- 版本号
-  service_area_name varchar(255),       -- 服务区名称
-  service_area_no varchar(255),         -- 服务区编码(业务唯一标识)
-  service_area_type varchar(50),        -- 服务区类型(枚举:信息化服务区、智能化服务区)
+-- 中文名: 存储高速公路管理公司信息
+-- 描述: 存储高速公路管理公司信息,用于服务区运营管理
+create table public.bss_company (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  company_name varchar(255)   -- 公司名称,
   primary key (id)
 );
 ```
@@ -788,359 +380,148 @@ create table bss_service_area (
 ### 6.2 MD文档格式
 
 
 ```markdown
-## bss_service_area(服务区基础信息表
-bss_service_area 表记录高速公路服务区的基础属性...
+## bss_company(存储高速公路管理公司信息)
+bss_company 表存储高速公路管理公司信息,用于服务区运营管理
 
 
 字段列表:
-- id (varchar(32)) - 服务区唯一标识(主键,UUID) [示例: 0271d68ef93de9684b7ad8c7aae600b6]
-- service_area_type (varchar(50)) - 服务区类型(枚举:信息化服务区、智能化服务区)[示例: 信息化服务区]
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 30675d85ba5044c31acfa243b9d16334]
+- company_name (varchar(255)) - 公司名称 [示例: 上饶分公司, 宜春分公司]
 
 
 字段补充说明:
-- service_area_type 为枚举字段,包含两个取值:信息化服务区、智能化服务区。
+- id 为主键
 ```
 
 
-### 6.3 Question-SQL文件格式(新增)
+### 6.3 Question-SQL文件格式
 
 
 ```json
 ```json
 [
 [
   {
   {
-    "question": "按服务区统计每日营收趋势(最近30天)?",
-    "sql": "SELECT service_name AS 服务区, oper_date AS 营业日期, SUM(pay_sum) AS 每日营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - INTERVAL '30 day' AND delete_ts IS NULL GROUP BY service_name, oper_date ORDER BY 营业日期 ASC NULLS LAST;"
+    "question": "查询所有公司信息",
+    "sql": "SELECT * FROM bss_company WHERE delete_ts IS NULL"
   },
   {
-    "question": "按月统计服务区营收趋势?",
-    "sql": "SELECT service_name AS 服务区, DATE_TRUNC('month', oper_date) AS 月份, SUM(pay_sum) AS 月营收 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name, 月份 ORDER BY 月份 ASC NULLS LAST;"
+    "question": "按公司统计服务区数量",
+    "sql": "SELECT company_name, COUNT(*) FROM bss_service_area GROUP BY company_name"
   }
 ]
 ```
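+
+生成的 Question-SQL 对可直接用于第4步的训练数据加载。下面给出一个读取该 JSON 并逐条训练的最小示意(假设 `vn` 为项目中已初始化的 vanna 实例,文件字段与上述格式一致):
+
+```python
+import json
+
+def train_from_question_sql(json_file: str, vn) -> int:
+    """读取 qs_*_pair.json 并逐条调用 vanna 的 Question-SQL 训练接口(示意)"""
+    with open(json_file, "r", encoding="utf-8") as f:
+        pairs = json.load(f)
+
+    for pair in pairs:
+        vn.train(question=pair["question"], sql=pair["sql"])
+
+    return len(pairs)
+```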
 
 
-## 7. 使用方式
-
-### 7.1 命令行方式
-
-#### 7.1.1 生成DDL和MD文档
-
-```bash
-# 基本使用
-python -m schema_tools \
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \
-  --table-list tables.txt \
-  --business-context "高速公路服务区管理系统"
+## 7. 多Schema支持
 
 
-# 指定处理链
-python -m schema_tools \
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \
-  --table-list tables.txt \
-  --pipeline ddl_only \
-  --business-context "高速公路服务区管理系统"
+### 7.1 文件命名防冲突机制
 
 
-# 高级参数
-python -m schema_tools \
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \
-  --table-list tables.txt \
-  --business-context-file business_context.txt \
-  --output-dir custom_output \
-  --max-concurrent 5 \
-  --pipeline full \
-  --no-filter-system-tables
-
-# 权限检查模式
-python -m schema_tools \
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \
-  --check-permissions-only
+```python
+def generate_safe_filename(schema_name: str, table_name: str, suffix: str) -> str:
+    """
+    生成安全的文件名,避免冲突
+    
+    规则:
+    - public.table_name → table_name.ddl
+    - schema.table_name → schema__table_name.ddl  
+    - 特殊字符替换: . → __, - → _, 空格 → _
+    """
+    if schema_name.lower() == 'public':
+        safe_name = table_name
+    else:
+        safe_name = f"{schema_name}__{table_name}"
+    
+    # 替换特殊字符
+    safe_name = safe_name.replace('.', '__').replace('-', '_').replace(' ', '_')
+    
+    return f"{safe_name}{suffix}"
 ```
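+
+按上述规则,几个典型的映射结果如下(直接调用上面的 `generate_safe_filename` 即可验证):
+
+```python
+generate_safe_filename("public", "bss_company", ".ddl")       # -> "bss_company.ddl"
+generate_safe_filename("hr", "employees", ".ddl")             # -> "hr__employees.ddl"
+generate_safe_filename("sales", "order-items", "_detail.md")  # -> "sales__order_items_detail.md"
+```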
 
 
-#### 7.1.2 生成Question-SQL训练数据(新增)
+### 7.2 输出目录结构
 
 
-```bash
-# 基本使用(在生成DDL/MD文件后执行)
-python -m schema_tools.qs_generator \
-  --output-dir ./output \
-  --table-list ./schema_tools/tables.txt \
-  --business-context "高速公路服务区管理系统" \
-  --db-name highway_db
-
-# 启用详细日志
-python -m schema_tools.qs_generator \
-  --output-dir ./output \
-  --table-list ./tables.txt \
-  --business-context "电商系统" \
-  --db-name ecommerce_db \
-  --verbose
 ```
 ```
-
-### 7.2 编程方式
-
-```python
-from schema_tools import SchemaTrainingDataAgent
-
-# 基本使用
-agent = SchemaTrainingDataAgent(
-    db_connection="postgresql://user:pass@localhost:5432/dbname",
-    table_list_file="tables.txt",
-    business_context="高速公路服务区管理系统"
-)
-
-await agent.generate_training_data()
-
-# 指定处理链
-agent = SchemaTrainingDataAgent(
-    db_connection="postgresql://user:pass@localhost:5432/dbname",
-    table_list_file="tables.txt",
-    business_context="高速公路服务区管理系统",
-    pipeline="ddl_only"
-)
-
-await agent.generate_training_data()
-
-# 权限检查
-permissions = await agent.check_database_permissions()
-if permissions['select_data']:
-    await agent.generate_training_data()
-else:
-    print("数据库权限不足")
+data_pipeline/training_data/
+├── bss_company.ddl                          # public.bss_company
+├── hr__employees.ddl                        # hr.employees  
+├── sales__order_items.ddl                   # sales.order_items
+├── bss_company_detail.md                    # 对应的MD文档
+├── hr__employees_detail.md
+├── sales__order_items_detail.md
+├── qs_highway_db_20240626_pair.json         # Question-SQL对文件
+├── filename_mapping.txt                     # 文件名映射
+└── sql_validation_20240626_summary.log      # SQL验证报告
 ```
 
 
-### 7.3 表清单文件格式
+## 8. 错误处理与容错
 
 
-```text
-bss_service_area
-bss_branch
-public.bss_company
-highway.bss_car_day_count
-```
-
-## 8. 扩展性设计
+### 8.1 错误处理策略
 
 
-### 8.1 数据库适配
+#### 8.1.1 工作流级错误处理
+- 步骤失败 → 生成详细错误报告,包含失败原因和已完成步骤
+- 支持从失败步骤重新开始
+- 保留中间结果用于调试
 
 
-- 当前支持: PostgreSQL
-- 预留扩展: MySQL适配接口
-- 设计原则: 数据库特定代码隔离在DatabaseInspectorTool中
+#### 8.1.2 表级错误处理
+- 某表处理失败 → 记录错误,继续下一表
+- 失败表数超过阈值 → 警告但继续执行
 
 
-### 8.2 LLM模型适配
+#### 8.1.3 LLM调用错误处理
+- 超时/失败 → 自动重试(最大3次)
+- 重试失败 → 使用原始注释或默认注释
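+
+重试逻辑可以参考如下最小示意(失败后短暂等待再重试,超过次数则抛出,由上层回退到原始注释或默认注释;`vn` 为 vanna 实例,代码仅为草图):
+
+```python
+import asyncio
+
+async def call_llm_with_retry(vn, prompt: str, max_retries: int = 3) -> str:
+    """带重试的LLM调用(示意):最多重试 max_retries 次"""
+    for attempt in range(max_retries):
+        try:
+            response = await asyncio.to_thread(vn.ask, prompt)
+            if response and response.strip():
+                return response.strip()
+            raise ValueError("LLM返回空响应")
+        except Exception:
+            if attempt == max_retries - 1:
+                raise  # 由调用方决定回退策略
+            await asyncio.sleep(1)  # 短暂等待后重试
+```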
 
 
-- 当前支持: qwen/deepseek/ollama
-- 复用现有vanna配置,自动适配不同模型
-- 提示词模板支持不同模型的特点
+### 8.2 日志设计
 
 
-### 8.3 输出格式扩展
+#### 8.2.1 日志级别
+- **INFO**: 正常处理流程
+- **WARNING**: 可恢复错误
+- **ERROR**: 影响结果的错误
 
 
-- 当前: DDL + MD
-- 预留: JSON格式、Excel格式等
+#### 8.2.2 日志输出
+- **控制台**: 进度信息和关键错误
+- **文件**: 详细执行日志
+- **格式**: 包含时间戳、级别、组件、消息
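+
+一个可能的日志初始化方式如下(仅为示意,文件名与格式串为假设,实际以各模块的 logger 初始化代码为准):
+
+```python
+import logging
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",  # 时间戳、级别、组件、消息
+    handlers=[
+        logging.StreamHandler(),  # 控制台:进度信息和关键错误
+        logging.FileHandler("data_pipeline.log", encoding="utf-8"),  # 文件:详细执行日志
+    ],
+)
+```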
 
 
 ## 9. 性能考虑
 
 
 ### 9.1 并发控制
-
 - 表级并发: 控制数据库连接数
-- LLM调用: 避免过于频繁的API调用
+- LLM调用: 避免过于频繁的API调用(建议最大并发1)
 - 内存管理: 及时释放大数据集
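+
+表级并发可以用信号量做简单限制,LLM 调用同理(把并发数设为 1)。以下为示意代码:
+
+```python
+import asyncio
+
+async def process_tables(tables, process_one, max_concurrent: int = 3):
+    """用信号量限制同时处理的表数量(示意)"""
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def worker(table):
+        async with semaphore:
+            return await process_one(table)
+
+    return await asyncio.gather(*(worker(t) for t in tables))
+```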
 
 
 ### 9.2 数据采样优化
-
 - 限制采样数量避免大表性能问题
-- 智能采样策略(如分页采样)
+- 智能采样策略(分层采样)
+- 大表检测和特殊处理
 
 
 ### 9.3 缓存策略
-
 - 表元数据缓存(单次运行内)
 - LLM结果缓存(避免重复调用)
+- 连接池复用
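+
+LLM结果缓存可以按提示词内容做简单记忆化(示意,仅在单次运行内有效;`vn` 为 vanna 实例):
+
+```python
+import asyncio
+import hashlib
+
+_llm_cache: dict = {}
+
+async def cached_llm_call(vn, prompt: str) -> str:
+    """按提示词哈希缓存LLM结果,避免同一运行内的重复调用(示意)"""
+    key = hashlib.md5(prompt.encode("utf-8")).hexdigest()
+    if key not in _llm_cache:
+        _llm_cache[key] = await asyncio.to_thread(vn.ask, prompt)
+    return _llm_cache[key]
+```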
 
 
-## 10. 测试策略
-
-### 10.1 单元测试
-
-#### 10.1.1 工具测试
-
-- **DatabaseInspectorTool**: Mock数据库连接,测试元数据提取
-- **CommentGeneratorTool**: Mock LLM调用,测试注释生成逻辑
-- **DDLGeneratorTool**: 测试DDL格式生成的正确性
-- **DocGeneratorTool**: 测试MD文档格式和内容
-
-#### 10.1.2 边界条件测试
-
-- **文件名冲突**: 测试多schema重名表的文件名生成
-- **系统表过滤**: 测试各种系统表的正确过滤
-- **大表处理**: 测试智能采样策略
-- **权限异常**: 测试只读库、权限不足等场景
-
-#### 10.1.3 数据结构测试
-
-- **TableMetadata**: 测试各种表结构的正确解析
-- **FieldInfo**: 测试枚举字段检测逻辑
-- **ProcessingResult**: 测试错误处理和结果传递
-
-### 10.2 集成测试
-
-#### 10.2.1 完整流程测试
-
-- **标准流程**: 正常表的完整处理链测试
-- **Pipeline测试**: 不同处理链配置的正确执行
-- **并发测试**: 多表并发处理的稳定性
-
-#### 10.2.2 数据库适配测试
-
-- **不同PostgreSQL版本**: 9.6, 10, 11, 12, 13, 14, 15
-- **不同schema配置**: public, 多schema, 复杂层级
-- **特殊表结构**: 无主键表、超多字段表、复杂数据类型
-
-#### 10.2.3 LLM集成测试
-
-- **不同模型**: qwen, deepseek, ollama各版本
-- **网络异常**: 超时、重试、降级处理
-- **Token限制**: 超长输入的分段处理
-
-#### 10.2.4 错误场景测试
-
-- **网络中断**: 数据库连接中断恢复
-- **权限变化**: 运行中权限被收回
-- **磁盘空间**: 输出目录空间不足
-- **内存限制**: 大量表并发处理
-
-### 10.3 性能测试
-
-#### 10.3.1 规模测试
-
-- **小规模**: 10个表,验证基本功能
-- **中规模**: 100个表,验证并发效率
-- **大规模**: 1000+个表,验证内存和稳定性
-
-#### 10.3.2 压力测试
-
-- **长时间运行**: 24小时连续处理
-- **资源限制**: 低内存、低CPU环境
-- **高并发**: 最大并发数的边界测试
+## 10. 扩展性设计
 
 
-#### 10.3.3 性能基准
-
-- **处理速度**: 平均每表处理时间
-- **资源消耗**: 内存、CPU、网络使用率
-- **LLM调用**: API调用频率和响应时间
-
-### 10.4 用户验收测试
-
-#### 10.4.1 功能验收
-
-- **输出质量**: DDL和MD文档的正确性和可读性
-- **中文注释**: 翻译质量和业务相关性
-- **枚举检测**: 准确率和误报率
-- **错误处理**: 异常情况的友好提示
-
-#### 10.4.2 易用性测试
-
-- **命令行接口**: 参数的直观性和错误提示
-- **配置文件**: 配置项的完整性和说明
-- **日志输出**: 进度显示和错误定位
-
-#### 10.4.3 兼容性测试
-
-- **操作系统**: Windows, Linux, macOS
-- **Python版本**: 3.8, 3.9, 3.10, 3.11
-- **依赖版本**: 各关键依赖库的版本兼容性
-
-### 10.5 测试环境配置
-
-#### 10.5.1 测试数据准备
+### 10.1 数据库适配
+- 当前支持: PostgreSQL
+- 预留扩展: MySQL适配接口
+- 设计原则: 数据库特定代码隔离在DatabaseInspectorTool中
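+
+数据库隔离可以通过统一的检查器接口实现,未来新增 MySQL 适配时只需提供新的实现类。以下为简化草图(类名与方法签名仅作说明,非实际代码):
+
+```python
+from abc import ABC, abstractmethod
+
+class BaseDatabaseInspector(ABC):
+    """数据库元数据检查的统一接口(示意)"""
+
+    @abstractmethod
+    async def get_table_fields(self, schema: str, table: str) -> list:
+        ...
+
+class PostgreSQLInspector(BaseDatabaseInspector):
+    async def get_table_fields(self, schema: str, table: str) -> list:
+        ...  # 当前实现:基于 information_schema / pg_catalog
+
+class MySQLInspector(BaseDatabaseInspector):
+    async def get_table_fields(self, schema: str, table: str) -> list:
+        ...  # 预留:基于 MySQL 的 information_schema
+```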
 
 
-```sql
--- 创建测试schema和表
-CREATE SCHEMA test_schema;
-CREATE SCHEMA hr;
-
--- 标准表(有主键、外键、注释)
-CREATE TABLE test_schema.standard_table (
-    id SERIAL PRIMARY KEY,
-    name VARCHAR(100) NOT NULL COMMENT '姓名',
-    status VARCHAR(20) DEFAULT 'active' COMMENT '状态'
-);
+### 10.2 LLM模型适配
+- 当前支持: qwen/deepseek/ollama
+- 复用现有vanna配置,自动适配不同模型
+- 提示词模板支持不同模型的特点
 
 
--- 枚举字段表
-CREATE TABLE test_schema.enum_table (
-    id SERIAL PRIMARY KEY,
-    gender VARCHAR(10), -- 性别: 男/女
-    education VARCHAR(20), -- 学历: 本科/硕士/博士
-    type INTEGER -- 类型: 1,2,3
-);
+### 10.3 输出格式扩展
+- 当前: DDL + MD + JSON
+- 预留: Excel格式、其他结构化格式
 
 
--- 无主键表
-CREATE TABLE test_schema.no_pk_table (
-    col1 VARCHAR(50),
-    col2 INTEGER
-);
+### 10.4 工作流扩展
+- 模块化设计,可添加新的处理步骤
+- 插件式架构,支持自定义工具
+- 配置驱动,灵活组合处理链
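+
+插件式的自定义工具注册可以用如下方式示意(简化草图,仅说明扩展方式,非实际实现):
+
+```python
+from typing import Dict
+
+TOOLS: Dict[str, type] = {}
+
+def register_tool(name: str):
+    """装饰器:把自定义处理步骤注册进处理链(示意)"""
+    def decorator(cls):
+        TOOLS[name] = cls
+        return cls
+    return decorator
+
+@register_tool("my_custom_step")
+class MyCustomStep:
+    async def execute(self, context):
+        ...  # 自定义处理逻辑
+```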
 
 
--- 大字段表
-CREATE TABLE test_schema.wide_table (
-    id SERIAL PRIMARY KEY
-    -- 动态生成100个字段
-);
-```
+## 11. 总结
 
 
-#### 10.5.2 Mock数据配置
+Data Pipeline系统提供了完整的数据库逆向工程解决方案,从原始数据库schema到可用的训练数据,整个流程完全自动化。系统设计遵循以下原则:
 
 
-```python
-# 测试用的Mock LLM响应
-MOCK_LLM_RESPONSES = {
-    "table_comment": {
-        "standard_table": "标准测试表,用于验证基本功能",
-        "enum_table": "枚举字段测试表,包含多种枚举类型字段"
-    },
-    "field_comments": {
-        "gender": "性别字段,枚举值:男、女",
-        "education": "教育程度,枚举值:本科、硕士、博士"
-    },
-    "enum_suggestions": {
-        "gender": ["男", "女"],
-        "education": ["本科", "硕士", "博士"],
-        "type": ["1", "2", "3"]
-    }
-}
-```
+1. **模块化**: 清晰的模块划分,便于维护和扩展
+2. **可配置**: 丰富的配置选项,适应不同场景需求
+3. **容错性**: 完善的错误处理,确保系统稳定运行
+4. **可观测**: 详细的日志和报告,便于问题定位
+5. **高性能**: 合理的并发控制和资源管理
+6. **可扩展**: 预留扩展接口,支持未来功能增强
 
 
-#### 10.5.3 CI/CD集成
-
-```yaml
-# .github/workflows/test.yml
-name: Schema Tools Tests
-
-on: [push, pull_request]
-
-jobs:
-  test:
-    runs-on: ubuntu-latest
-    services:
-      postgres:
-        image: postgres:13
-        env:
-          POSTGRES_PASSWORD: postgres
-        options: >-
-          --health-cmd pg_isready
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 5
-    
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.9
-    
-    - name: Install dependencies
-      run: |
-        pip install -r requirements.txt
-        pip install pytest pytest-asyncio
-    
-    - name: Setup test database
-      run: |
-        python tests/setup_test_db.py
-    
-    - name: Run unit tests
-      run: pytest tests/unit/
-    
-    - name: Run integration tests
-      run: pytest tests/integration/
-    
-    - name: Run performance tests
-      run: pytest tests/performance/ --benchmark-only
-```
+通过工作流编排器,用户可以一键完成所有步骤,也可以根据需要分步执行和调试。系统适合各种规模的数据处理需求,是vanna.ai训练数据准备的理想解决方案。

+ 0 - 2579
docs/Schema Tools 详细设计文档.md

@@ -1,2579 +0,0 @@
-# Schema Tools 详细设计文档
-
-## 1. 项目结构与模块设计
-
-### 1.1 完整目录结构
-
-```
-schema_tools/
-├── __init__.py                     # 模块入口,导出主要接口
-├── __main__.py                     # 命令行入口
-├── config.py                       # 配置管理
-├── training_data_agent.py          # 主AI Agent
-├── qs_agent.py                     # Question-SQL生成Agent (新增)
-├── qs_generator.py                 # Question-SQL命令行入口 (新增)
-├── sql_validation_agent.py         # SQL验证Agent (新增)
-├── sql_validator.py                # SQL验证命令行入口 (新增)
-├── schema_workflow_orchestrator.py # 端到端工作流编排器 (新增)
-├── tools/                          # Agent工具集
-│   ├── __init__.py                 # 工具模块初始化
-│   ├── base.py                     # 基础工具类和注册机制
-│   ├── database_inspector.py       # 数据库元数据检查工具
-│   ├── data_sampler.py             # 数据采样工具
-│   ├── comment_generator.py        # LLM注释生成工具
-│   ├── ddl_generator.py            # DDL格式生成工具
-│   └── doc_generator.py            # MD文档生成工具
-├── validators/                     # 验证器模块 (新增)
-│   ├── __init__.py
-│   ├── file_count_validator.py     # 文件数量验证器
-│   └── sql_validator.py            # SQL验证器核心模块
-├── analyzers/                      # 分析器模块 (新增)
-│   ├── __init__.py
-│   ├── md_analyzer.py              # MD文件分析器
-│   └── theme_extractor.py          # 主题提取器
-├── utils/                          # 工具函数
-│   ├── __init__.py
-│   ├── data_structures.py          # 数据结构定义
-│   ├── table_parser.py             # 表清单解析器
-│   ├── file_manager.py             # 文件管理器
-│   ├── system_filter.py            # 系统表过滤器
-│   ├── permission_checker.py       # 权限检查器
-│   ├── large_table_handler.py      # 大表处理器
-│   └── logger.py                   # 日志管理
-├── prompts/                        # 提示词模板
-│   ├── table_comment_template.txt
-│   ├── field_comment_template.txt
-│   ├── enum_detection_template.txt
-│   ├── business_context.txt
-│   └── business_dictionary.txt
-└── tests/                          # 测试用例
-    ├── unit/
-    ├── integration/
-    └── fixtures/
-```
-
-## 2. 核心数据结构设计
-
-### 2.1 数据结构定义 (`utils/data_structures.py`)
-
-```python
-from dataclasses import dataclass, field
-from typing import List, Dict, Optional, Any, Union
-from enum import Enum
-import hashlib
-import json
-
-class FieldType(Enum):
-    """字段类型枚举"""
-    INTEGER = "integer"
-    VARCHAR = "varchar"
-    TEXT = "text"
-    TIMESTAMP = "timestamp"
-    DATE = "date"
-    BOOLEAN = "boolean"
-    NUMERIC = "numeric"
-    ENUM = "enum"
-    JSON = "json"
-    UUID = "uuid"
-    OTHER = "other"
-
-class ProcessingStatus(Enum):
-    """处理状态枚举"""
-    PENDING = "pending"
-    RUNNING = "running"
-    SUCCESS = "success"
-    FAILED = "failed"
-    SKIPPED = "skipped"
-
-@dataclass
-class FieldInfo:
-    """字段信息标准结构"""
-    name: str
-    type: str
-    nullable: bool
-    default_value: Optional[str] = None
-    comment: Optional[str] = None
-    original_comment: Optional[str] = None  # 原始注释
-    generated_comment: Optional[str] = None  # LLM生成的注释
-    is_primary_key: bool = False
-    is_foreign_key: bool = False
-    is_enum: bool = False
-    enum_values: Optional[List[str]] = None
-    enum_description: Optional[str] = None
-    max_length: Optional[int] = None
-    precision: Optional[int] = None
-    scale: Optional[int] = None
-    
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典格式"""
-        return {
-            'name': self.name,
-            'type': self.type,
-            'nullable': self.nullable,
-            'default_value': self.default_value,
-            'comment': self.comment,
-            'is_primary_key': self.is_primary_key,
-            'is_foreign_key': self.is_foreign_key,
-            'is_enum': self.is_enum,
-            'enum_values': self.enum_values
-        }
-
-@dataclass
-class TableMetadata:
-    """表元数据标准结构"""
-    schema_name: str
-    table_name: str
-    full_name: str  # schema.table_name
-    comment: Optional[str] = None
-    original_comment: Optional[str] = None  # 原始注释
-    generated_comment: Optional[str] = None  # LLM生成的注释
-    fields: List[FieldInfo] = field(default_factory=list)
-    sample_data: List[Dict[str, Any]] = field(default_factory=list)
-    row_count: Optional[int] = None
-    table_size: Optional[str] = None  # 表大小(如 "1.2 MB")
-    created_date: Optional[str] = None
-    
-    @property
-    def safe_file_name(self) -> str:
-        """生成安全的文件名"""
-        if self.schema_name.lower() == 'public':
-            return self.table_name
-        return f"{self.schema_name}__{self.table_name}".replace('.', '__').replace('-', '_').replace(' ', '_')
-    
-    def get_metadata_hash(self) -> str:
-        """计算元数据哈希值,用于增量更新判断"""
-        hash_data = {
-            'schema_name': self.schema_name,
-            'table_name': self.table_name,
-            'fields': [f.to_dict() for f in self.fields],
-            'comment': self.original_comment
-        }
-        return hashlib.md5(json.dumps(hash_data, sort_keys=True).encode()).hexdigest()
-
-@dataclass
-class ProcessingResult:
-    """工具处理结果标准结构"""
-    success: bool
-    data: Optional[Any] = None
-    error_message: Optional[str] = None
-    warnings: List[str] = field(default_factory=list)
-    execution_time: Optional[float] = None
-    metadata: Dict[str, Any] = field(default_factory=dict)
-    
-    def add_warning(self, warning: str):
-        """添加警告信息"""
-        self.warnings.append(warning)
-    
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典格式"""
-        return {
-            'success': self.success,
-            'data': self.data,
-            'error_message': self.error_message,
-            'warnings': self.warnings,
-            'execution_time': self.execution_time,
-            'metadata': self.metadata
-        }
-
-@dataclass
-class TableProcessingContext:
-    """表处理上下文"""
-    table_metadata: TableMetadata
-    business_context: str
-    output_dir: str
-    pipeline: str
-    vn: Any  # vanna实例
-    file_manager: Any
-    current_step: str = "initialized"
-    step_results: Dict[str, ProcessingResult] = field(default_factory=dict)
-    start_time: Optional[float] = None
-    
-    def update_step(self, step_name: str, result: ProcessingResult):
-        """更新步骤结果"""
-        self.current_step = step_name
-        self.step_results[step_name] = result
-```
-
-## 3. 工具注册与管理系统
-
-### 3.1 基础工具类 (`tools/base.py`)
-
-```python
-import asyncio
-import time
-import logging
-from abc import ABC, abstractmethod
-from typing import Dict, Any, Optional, Type
-from utils.data_structures import ProcessingResult, TableProcessingContext
-
-class ToolRegistry:
-    """工具注册管理器"""
-    _tools: Dict[str, Type['BaseTool']] = {}
-    _instances: Dict[str, 'BaseTool'] = {}
-    
-    @classmethod
-    def register(cls, name: str):
-        """装饰器:注册工具"""
-        def decorator(tool_class: Type['BaseTool']):
-            cls._tools[name] = tool_class
-            logging.debug(f"注册工具: {name} -> {tool_class.__name__}")
-            return tool_class
-        return decorator
-    
-    @classmethod
-    def get_tool(cls, name: str, **kwargs) -> 'BaseTool':
-        """获取工具实例,支持单例模式"""
-        if name not in cls._instances:
-            if name not in cls._tools:
-                raise ValueError(f"工具 '{name}' 未注册")
-            
-            tool_class = cls._tools[name]
-            
-            # 自动注入vanna实例到需要LLM的工具
-            if hasattr(tool_class, 'needs_llm') and tool_class.needs_llm:
-                from core.vanna_llm_factory import create_vanna_instance
-                kwargs['vn'] = create_vanna_instance()
-                logging.debug(f"为工具 {name} 注入LLM实例")
-            
-            cls._instances[name] = tool_class(**kwargs)
-        
-        return cls._instances[name]
-    
-    @classmethod
-    def list_tools(cls) -> List[str]:
-        """列出所有已注册的工具"""
-        return list(cls._tools.keys())
-    
-    @classmethod
-    def clear_instances(cls):
-        """清除所有工具实例(用于测试)"""
-        cls._instances.clear()
-
-class BaseTool(ABC):
-    """工具基类"""
-    
-    needs_llm: bool = False  # 是否需要LLM实例
-    tool_name: str = ""      # 工具名称
-    
-    def __init__(self, **kwargs):
-        self.logger = logging.getLogger(f"schema_tools.{self.__class__.__name__}")
-        
-        # 如果工具需要LLM,检查是否已注入
-        if self.needs_llm and 'vn' not in kwargs:
-            raise ValueError(f"工具 {self.__class__.__name__} 需要LLM实例但未提供")
-        
-        # 存储vanna实例
-        if 'vn' in kwargs:
-            self.vn = kwargs['vn']
-    
-    @abstractmethod
-    async def execute(self, context: TableProcessingContext) -> ProcessingResult:
-        """
-        执行工具逻辑
-        Args:
-            context: 表处理上下文
-        Returns:
-            ProcessingResult: 处理结果
-        """
-        pass
-    
-    async def _execute_with_timing(self, context: TableProcessingContext) -> ProcessingResult:
-        """带计时的执行包装器"""
-        start_time = time.time()
-        
-        try:
-            self.logger.info(f"开始执行工具: {self.tool_name}")
-            result = await self.execute(context)
-            execution_time = time.time() - start_time
-            result.execution_time = execution_time
-            
-            if result.success:
-                self.logger.info(f"工具 {self.tool_name} 执行成功,耗时: {execution_time:.2f}秒")
-            else:
-                self.logger.error(f"工具 {self.tool_name} 执行失败: {result.error_message}")
-            
-            return result
-            
-        except Exception as e:
-            execution_time = time.time() - start_time
-            self.logger.exception(f"工具 {self.tool_name} 执行异常")
-            
-            return ProcessingResult(
-                success=False,
-                error_message=f"工具执行异常: {str(e)}",
-                execution_time=execution_time
-            )
-    
-    def validate_input(self, context: TableProcessingContext) -> bool:
-        """输入验证(子类可重写)"""
-        return context.table_metadata is not None
-```
-
-### 3.2 Pipeline执行器 (`training_data_agent.py` 的一部分)
-
-```python
-class PipelineExecutor:
-    """处理链执行器"""
-    
-    def __init__(self, pipeline_config: Dict[str, List[str]]):
-        self.pipeline_config = pipeline_config
-        self.logger = logging.getLogger("schema_tools.PipelineExecutor")
-    
-    async def execute_pipeline(self, pipeline_name: str, context: TableProcessingContext) -> Dict[str, ProcessingResult]:
-        """执行指定的处理链"""
-        if pipeline_name not in self.pipeline_config:
-            raise ValueError(f"未知的处理链: {pipeline_name}")
-        
-        steps = self.pipeline_config[pipeline_name]
-        results = {}
-        
-        self.logger.info(f"开始执行处理链 '{pipeline_name}': {' -> '.join(steps)}")
-        
-        for step_name in steps:
-            try:
-                tool = ToolRegistry.get_tool(step_name)
-                
-                # 验证输入
-                if not tool.validate_input(context):
-                    result = ProcessingResult(
-                        success=False,
-                        error_message=f"工具 {step_name} 输入验证失败"
-                    )
-                else:
-                    result = await tool._execute_with_timing(context)
-                
-                results[step_name] = result
-                context.update_step(step_name, result)
-                
-                # 如果步骤失败且不允许继续,则停止
-                if not result.success:
-                    from config import SCHEMA_TOOLS_CONFIG
-                    if not SCHEMA_TOOLS_CONFIG["continue_on_error"]:
-                        self.logger.error(f"步骤 {step_name} 失败,停止处理链执行")
-                        break
-                    else:
-                        self.logger.warning(f"步骤 {step_name} 失败,继续执行下一步")
-                
-            except Exception as e:
-                self.logger.exception(f"执行步骤 {step_name} 时发生异常")
-                results[step_name] = ProcessingResult(
-                    success=False,
-                    error_message=f"步骤执行异常: {str(e)}"
-                )
-                break
-        
-        return results
-```
-
-## 4. 核心工具实现
-
-### 4.1 数据库检查工具 (`tools/database_inspector.py`)
-
-```python
-import asyncio
-import asyncpg
-from typing import List, Dict, Any, Optional
-from tools.base import BaseTool, ToolRegistry
-from utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo, TableMetadata
-
-@ToolRegistry.register("database_inspector")
-class DatabaseInspectorTool(BaseTool):
-    """数据库元数据检查工具"""
-    
-    needs_llm = False
-    tool_name = "数据库检查器"
-    
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.db_connection = kwargs.get('db_connection')
-        self.connection_pool = None
-    
-    async def execute(self, context: TableProcessingContext) -> ProcessingResult:
-        """执行数据库元数据检查"""
-        try:
-            # 建立数据库连接
-            if not self.connection_pool:
-                await self._create_connection_pool()
-            
-            table_name = context.table_metadata.table_name
-            schema_name = context.table_metadata.schema_name
-            
-            # 获取表的基本信息
-            table_info = await self._get_table_info(schema_name, table_name)
-            if not table_info:
-                return ProcessingResult(
-                    success=False,
-                    error_message=f"表 {schema_name}.{table_name} 不存在或无权限访问"
-                )
-            
-            # 获取字段信息
-            fields = await self._get_table_fields(schema_name, table_name)
-            
-            # 获取表注释
-            table_comment = await self._get_table_comment(schema_name, table_name)
-            
-            # 获取表统计信息
-            stats = await self._get_table_statistics(schema_name, table_name)
-            
-            # 更新表元数据
-            context.table_metadata.original_comment = table_comment
-            context.table_metadata.comment = table_comment
-            context.table_metadata.fields = fields
-            context.table_metadata.row_count = stats.get('row_count')
-            context.table_metadata.table_size = stats.get('table_size')
-            
-            return ProcessingResult(
-                success=True,
-                data={
-                    'fields_count': len(fields),
-                    'table_comment': table_comment,
-                    'row_count': stats.get('row_count'),
-                    'table_size': stats.get('table_size')
-                },
-                metadata={'tool': self.tool_name}
-            )
-            
-        except Exception as e:
-            self.logger.exception(f"数据库检查失败")
-            return ProcessingResult(
-                success=False,
-                error_message=f"数据库检查失败: {str(e)}"
-            )
-    
-    async def _create_connection_pool(self):
-        """创建数据库连接池"""
-        try:
-            self.connection_pool = await asyncpg.create_pool(
-                self.db_connection,
-                min_size=1,
-                max_size=5,
-                command_timeout=30
-            )
-            self.logger.info("数据库连接池创建成功")
-        except Exception as e:
-            self.logger.error(f"创建数据库连接池失败: {e}")
-            raise
-    
-    async def _get_table_info(self, schema_name: str, table_name: str) -> Optional[Dict]:
-        """获取表基本信息"""
-        query = """
-        SELECT schemaname, tablename, tableowner, tablespace, hasindexes, hasrules, hastriggers
-        FROM pg_tables 
-        WHERE schemaname = $1 AND tablename = $2
-        """
-        async with self.connection_pool.acquire() as conn:
-            result = await conn.fetchrow(query, schema_name, table_name)
-            return dict(result) if result else None
-    
-    async def _get_table_fields(self, schema_name: str, table_name: str) -> List[FieldInfo]:
-        """获取表字段信息"""
-        query = """
-        SELECT 
-            c.column_name,
-            c.data_type,
-            c.is_nullable,
-            c.column_default,
-            c.character_maximum_length,
-            c.numeric_precision,
-            c.numeric_scale,
-            pd.description as column_comment,
-            CASE WHEN pk.column_name IS NOT NULL THEN true ELSE false END as is_primary_key,
-            CASE WHEN fk.column_name IS NOT NULL THEN true ELSE false END as is_foreign_key
-        FROM information_schema.columns c
-        LEFT JOIN pg_description pd ON pd.objsubid = c.ordinal_position 
-            AND pd.objoid = (
-                SELECT oid FROM pg_class 
-                WHERE relname = c.table_name 
-                AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = c.table_schema)
-            )
-        LEFT JOIN (
-            SELECT ku.column_name
-            FROM information_schema.table_constraints tc
-            JOIN information_schema.key_column_usage ku ON tc.constraint_name = ku.constraint_name
-            WHERE tc.table_schema = $1 AND tc.table_name = $2 AND tc.constraint_type = 'PRIMARY KEY'
-        ) pk ON pk.column_name = c.column_name
-        LEFT JOIN (
-            SELECT ku.column_name
-            FROM information_schema.table_constraints tc
-            JOIN information_schema.key_column_usage ku ON tc.constraint_name = ku.constraint_name
-            WHERE tc.table_schema = $1 AND tc.table_name = $2 AND tc.constraint_type = 'FOREIGN KEY'
-        ) fk ON fk.column_name = c.column_name
-        WHERE c.table_schema = $1 AND c.table_name = $2
-        ORDER BY c.ordinal_position
-        """
-        
-        fields = []
-        async with self.connection_pool.acquire() as conn:
-            rows = await conn.fetch(query, schema_name, table_name)
-            
-            for row in rows:
-                field = FieldInfo(
-                    name=row['column_name'],
-                    type=row['data_type'],
-                    nullable=row['is_nullable'] == 'YES',
-                    default_value=row['column_default'],
-                    original_comment=row['column_comment'],
-                    comment=row['column_comment'],
-                    is_primary_key=row['is_primary_key'],
-                    is_foreign_key=row['is_foreign_key'],
-                    max_length=row['character_maximum_length'],
-                    precision=row['numeric_precision'],
-                    scale=row['numeric_scale']
-                )
-                fields.append(field)
-        
-        return fields
-    
-    async def _get_table_comment(self, schema_name: str, table_name: str) -> Optional[str]:
-        """获取表注释"""
-        query = """
-        SELECT obj_description(oid) as table_comment
-        FROM pg_class 
-        WHERE relname = $2 
-        AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = $1)
-        """
-        async with self.connection_pool.acquire() as conn:
-            result = await conn.fetchval(query, schema_name, table_name)
-            return result
-    
-    async def _get_table_statistics(self, schema_name: str, table_name: str) -> Dict[str, Any]:
-        """获取表统计信息"""
-        stats_query = """
-        SELECT 
-            schemaname,
-            tablename,
-            attname,
-            n_distinct,
-            most_common_vals,
-            most_common_freqs,
-            histogram_bounds
-        FROM pg_stats 
-        WHERE schemaname = $1 AND tablename = $2
-        """
-        
-        size_query = """
-        SELECT pg_size_pretty(pg_total_relation_size($1)) as table_size,
-               pg_relation_size($1) as table_size_bytes
-        """
-        
-        count_query = f"SELECT COUNT(*) as row_count FROM {schema_name}.{table_name}"
-        
-        stats = {}
-        async with self.connection_pool.acquire() as conn:
-            try:
-                # 获取行数
-                row_count = await conn.fetchval(count_query)
-                stats['row_count'] = row_count
-                
-                # 获取表大小
-                table_oid = await conn.fetchval(
-                    "SELECT oid FROM pg_class WHERE relname = $1 AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = $2)",
-                    table_name, schema_name
-                )
-                if table_oid:
-                    size_result = await conn.fetchrow(size_query, table_oid)
-                    stats['table_size'] = size_result['table_size']
-                    stats['table_size_bytes'] = size_result['table_size_bytes']
-                
-            except Exception as e:
-                self.logger.warning(f"获取表统计信息失败: {e}")
-        
-        return stats
-```
-
-### 4.2 数据采样工具 (`tools/data_sampler.py`)
-
-```python
-import random
-from typing import List, Dict, Any
-from tools.base import BaseTool, ToolRegistry
-from utils.data_structures import ProcessingResult, TableProcessingContext
-
-@ToolRegistry.register("data_sampler")
-class DataSamplerTool(BaseTool):
-    """数据采样工具"""
-    
-    needs_llm = False
-    tool_name = "数据采样器"
-    
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.db_connection = kwargs.get('db_connection')
-    
-    async def execute(self, context: TableProcessingContext) -> ProcessingResult:
-        """执行数据采样"""
-        try:
-            from config import SCHEMA_TOOLS_CONFIG
-            
-            table_metadata = context.table_metadata
-            sample_limit = SCHEMA_TOOLS_CONFIG["sample_data_limit"]
-            large_table_threshold = SCHEMA_TOOLS_CONFIG["large_table_threshold"]
-            
-            # 判断是否为大表,使用不同的采样策略
-            if table_metadata.row_count and table_metadata.row_count > large_table_threshold:
-                sample_data = await self._smart_sample_large_table(table_metadata, sample_limit)
-                self.logger.info(f"大表 {table_metadata.full_name} 使用智能采样策略")
-            else:
-                sample_data = await self._simple_sample(table_metadata, sample_limit)
-            
-            # 更新上下文中的采样数据
-            context.table_metadata.sample_data = sample_data
-            
-            return ProcessingResult(
-                success=True,
-                data={
-                    'sample_count': len(sample_data),
-                    'sampling_strategy': 'smart' if table_metadata.row_count and table_metadata.row_count > large_table_threshold else 'simple'
-                },
-                metadata={'tool': self.tool_name}
-            )
-            
-        except Exception as e:
-            self.logger.exception(f"数据采样失败")
-            return ProcessingResult(
-                success=False,
-                error_message=f"数据采样失败: {str(e)}"
-            )
-    
-    async def _simple_sample(self, table_metadata: TableMetadata, limit: int) -> List[Dict[str, Any]]:
-        """简单采样策略"""
-        from tools.database_inspector import DatabaseInspectorTool
-        
-        # 复用数据库检查工具的连接
-        inspector = ToolRegistry.get_tool("database_inspector")
-        
-        query = f"SELECT * FROM {table_metadata.full_name} LIMIT {limit}"
-        
-        async with inspector.connection_pool.acquire() as conn:
-            rows = await conn.fetch(query)
-            return [dict(row) for row in rows]
-    
-    async def _smart_sample_large_table(self, table_metadata: TableMetadata, limit: int) -> List[Dict[str, Any]]:
-        """智能采样策略(用于大表)"""
-        from tools.database_inspector import DatabaseInspectorTool
-        
-        inspector = ToolRegistry.get_tool("database_inspector")
-        samples_per_section = max(1, limit // 3)
-        
-        samples = []
-        
-        async with inspector.connection_pool.acquire() as conn:
-            # 1. 前N行采样
-            front_query = f"SELECT * FROM {table_metadata.full_name} LIMIT {samples_per_section}"
-            front_rows = await conn.fetch(front_query)
-            samples.extend([dict(row) for row in front_rows])
-            
-            # 2. 随机中间采样(使用TABLESAMPLE)
-            if table_metadata.row_count > samples_per_section * 2:
-                try:
-                    # 计算采样百分比
-                    sample_percent = min(1.0, (samples_per_section * 100.0) / table_metadata.row_count)
-                    middle_query = f"""
-                    SELECT * FROM {table_metadata.full_name} 
-                    TABLESAMPLE SYSTEM({sample_percent}) 
-                    LIMIT {samples_per_section}
-                    """
-                    middle_rows = await conn.fetch(middle_query)
-                    samples.extend([dict(row) for row in middle_rows])
-                except Exception as e:
-                    self.logger.warning(f"TABLESAMPLE采样失败,使用OFFSET采样: {e}")
-                    # 回退到OFFSET采样
-                    offset = random.randint(samples_per_section, table_metadata.row_count - samples_per_section)
-                    offset_query = f"SELECT * FROM {table_metadata.full_name} OFFSET {offset} LIMIT {samples_per_section}"
-                    offset_rows = await conn.fetch(offset_query)
-                    samples.extend([dict(row) for row in offset_rows])
-            
-            # 3. 后N行采样
-            remaining = limit - len(samples)
-            if remaining > 0:
-                # 使用ORDER BY ... DESC来获取最后的行
-                tail_query = f"""
-                SELECT * FROM (
-                    SELECT *, ROW_NUMBER() OVER() as rn 
-                    FROM {table_metadata.full_name}
-                ) sub 
-                WHERE sub.rn > (SELECT COUNT(*) FROM {table_metadata.full_name}) - {remaining}
-                ORDER BY sub.rn
-                """
-                try:
-                    tail_rows = await conn.fetch(tail_query)
-                    # 移除ROW_NUMBER列
-                    for row in tail_rows:
-                        row_dict = dict(row)
-                        row_dict.pop('rn', None)
-                        samples.append(row_dict)
-                except Exception as e:
-                    self.logger.warning(f"尾部采样失败: {e}")
-        
-        return samples[:limit]  # 确保不超过限制
-```
-
-### 4.3 LLM注释生成工具 (`tools/comment_generator.py`)
-
-~~~python
-import asyncio
-from typing import List, Dict, Any, Tuple
-from tools.base import BaseTool, ToolRegistry
-from utils.data_structures import ProcessingResult, TableProcessingContext, FieldInfo
-
-@ToolRegistry.register("comment_generator")
-class CommentGeneratorTool(BaseTool):
-    """LLM注释生成工具"""
-    
-    needs_llm = True
-    tool_name = "注释生成器"
-    
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.business_context = kwargs.get('business_context', '')
-        self.business_dictionary = self._load_business_dictionary()
-    
-    async def execute(self, context: TableProcessingContext) -> ProcessingResult:
-        """执行注释生成"""
-        try:
-            table_metadata = context.table_metadata
-            
-            # 生成表注释
-            table_comment_result = await self._generate_table_comment(table_metadata, context.business_context)
-            
-            # 生成字段注释和枚举建议
-            field_results = await self._generate_field_comments_and_enums(table_metadata, context.business_context)
-            
-            # 更新表元数据
-            if table_comment_result['success']:
-                table_metadata.generated_comment = table_comment_result['comment']
-                table_metadata.comment = table_comment_result['comment']
-            
-            # 更新字段信息
-            enum_suggestions = []
-            for i, field in enumerate(table_metadata.fields):
-                if i < len(field_results) and field_results[i]['success']:
-                    field.generated_comment = field_results[i]['comment']
-                    field.comment = field_results[i]['comment']
-                    
-                    # 处理枚举建议
-                    if field_results[i].get('is_enum'):
-                        field.is_enum = True
-                        enum_suggestions.append({
-                            'field_name': field.name,
-                            'suggested_values': field_results[i].get('enum_values', []),
-                            'enum_description': field_results[i].get('enum_description', '')
-                        })
-            
-            # 验证枚举建议
-            if enum_suggestions:
-                validated_enums = await self._validate_enum_suggestions(table_metadata, enum_suggestions)
-                
-                # 更新验证后的枚举信息
-                for enum_info in validated_enums:
-                    field_name = enum_info['field_name']
-                    for field in table_metadata.fields:
-                        if field.name == field_name:
-                            field.enum_values = enum_info['actual_values']
-                            field.enum_description = enum_info['description']
-                            break
-            
-            return ProcessingResult(
-                success=True,
-                data={
-                    'table_comment_generated': table_comment_result['success'],
-                    'field_comments_generated': sum(1 for r in field_results if r['success']),
-                    'enum_fields_detected': len([f for f in table_metadata.fields if f.is_enum]),
-                    'enum_suggestions': enum_suggestions
-                },
-                metadata={'tool': self.tool_name}
-            )
-            
-        except Exception as e:
-            self.logger.exception(f"注释生成失败")
-            return ProcessingResult(
-                success=False,
-                error_message=f"注释生成失败: {str(e)}"
-            )
-    
-    async def _generate_table_comment(self, table_metadata, business_context: str) -> Dict[str, Any]:
-        """生成表注释"""
-        try:
-            prompt = self._build_table_comment_prompt(table_metadata, business_context)
-            
-            # 调用LLM
-            response = await self._call_llm_with_retry(prompt)
-            
-            # 解析响应
-            comment = self._extract_table_comment(response)
-            
-            return {
-                'success': True,
-                'comment': comment,
-                'original_response': response
-            }
-            
-        except Exception as e:
-            self.logger.error(f"表注释生成失败: {e}")
-            return {
-                'success': False,
-                'comment': table_metadata.original_comment or f"{table_metadata.table_name}表",
-                'error': str(e)
-            }
-    
-    async def _generate_field_comments_and_enums(self, table_metadata, business_context: str) -> List[Dict[str, Any]]:
-        """批量生成字段注释和枚举建议"""
-        try:
-            # 构建批量处理的提示词
-            prompt = self._build_field_batch_prompt(table_metadata, business_context)
-            
-            # 调用LLM
-            response = await self._call_llm_with_retry(prompt)
-            
-            # 解析批量响应
-            field_results = self._parse_field_batch_response(response, table_metadata.fields)
-            
-            return field_results
-            
-        except Exception as e:
-            self.logger.error(f"字段注释批量生成失败: {e}")
-            # 返回默认结果
-            return [
-                {
-                    'success': False,
-                    'comment': field.original_comment or field.name,
-                    'is_enum': False,
-                    'error': str(e)
-                }
-                for field in table_metadata.fields
-            ]
-    
-    def _build_table_comment_prompt(self, table_metadata, business_context: str) -> str:
-        """构建表注释生成提示词"""
-        # 准备字段信息摘要
-        fields_summary = []
-        for field in table_metadata.fields[:10]:  # 只显示前10个字段避免过长
-            field_desc = f"- {field.name} ({field.type})"
-            if field.comment:
-                field_desc += f": {field.comment}"
-            fields_summary.append(field_desc)
-        
-        # 准备样例数据摘要
-        sample_summary = ""
-        if table_metadata.sample_data:
-            sample_count = min(3, len(table_metadata.sample_data))
-            sample_summary = f"\n样例数据({sample_count}条):\n"
-            for i, sample in enumerate(table_metadata.sample_data[:sample_count]):
-                sample_str = ", ".join([f"{k}={v}" for k, v in list(sample.items())[:5]])
-                sample_summary += f"{i+1}. {sample_str}\n"
-        
-        prompt = f"""你是一个数据库文档专家。请根据以下信息为数据库表生成简洁、准确的中文注释。
-
-业务背景: {business_context}
-{self.business_dictionary}
-
-表信息:
-- 表名: {table_metadata.table_name}
-- Schema: {table_metadata.schema_name}
-- 现有注释: {table_metadata.original_comment or "无"}
-- 字段数量: {len(table_metadata.fields)}
-- 数据行数: {table_metadata.row_count or "未知"}
-
-主要字段:
-{chr(10).join(fields_summary)}
-
-{sample_summary}
-
-请生成一个简洁、准确的中文表注释,要求:
-1. 如果现有注释是英文,请翻译为中文并改进
-2. 根据字段名称和样例数据推断表的业务用途
-3. 注释长度控制在50字以内
-4. 突出表的核心业务价值
-
-表注释:"""
-        
-        return prompt
-    
-    def _build_field_batch_prompt(self, table_metadata, business_context: str) -> str:
-        """构建字段批量处理提示词"""
-        # 准备字段信息
-        fields_info = []
-        sample_values = {}
-        
-        # 收集字段的样例值
-        for sample in table_metadata.sample_data[:5]:
-            for field_name, value in sample.items():
-                if field_name not in sample_values:
-                    sample_values[field_name] = []
-                if value is not None and len(sample_values[field_name]) < 5:
-                    sample_values[field_name].append(str(value))
-        
-        # 构建字段信息列表
-        for field in table_metadata.fields:
-            field_info = f"{field.name} ({field.type})"
-            if field.original_comment:
-                field_info += f" - 原注释: {field.original_comment}"
-            
-            # 添加样例值
-            if field.name in sample_values and sample_values[field.name]:
-                values_str = ", ".join(sample_values[field.name][:3])
-                field_info += f" - 样例值: {values_str}"
-            
-            fields_info.append(field_info)
-        
-        prompt = f"""你是一个数据库文档专家。请为以下表的所有字段生成中文注释,并识别可能的枚举字段。
-
-业务背景: {business_context}
-{self.business_dictionary}
-
-表名: {table_metadata.schema_name}.{table_metadata.table_name}
-表注释: {table_metadata.comment or "无"}
-
-字段列表:
-{chr(10).join([f"{i+1}. {info}" for i, info in enumerate(fields_info)])}
-
-请按以下JSON格式输出每个字段的分析结果:
-```json
-{{
-  "fields": [
-    {{
-      "name": "字段名",
-      "comment": "中文注释(简洁明确,15字以内)",
-      "is_enum": true/false,
-      "enum_values": ["值1", "值2", "值3"] (如果是枚举),
-      "enum_description": "枚举含义说明" (如果是枚举)
-    }}
-  ]
-}}
-~~~
-
-注释生成要求:
-
-1. 如果原注释是英文,翻译为中文并改进
-2. 根据字段名、类型和样例值推断字段含义
-3. 识别可能的枚举字段(如状态、类型、级别等)
-4. 枚举判断标准: VARCHAR类型 + 样例值重复度高 + 字段名暗示分类
-5. 注释要贴近{business_context}的业务场景
-
-请输出JSON格式的分析结果:"""
-
-```
-    return prompt
-
-async def _call_llm_with_retry(self, prompt: str, max_retries: int = 3) -> str:
-    """带重试的LLM调用"""
-    from config import SCHEMA_TOOLS_CONFIG
-    
-    for attempt in range(max_retries):
-        try:
-            # 使用vanna实例调用LLM
-            response = await asyncio.to_thread(self.vn.ask, prompt)
-            
-            if response and response.strip():
-                return response.strip()
-            else:
-                raise ValueError("LLM返回空响应")
-                
-        except Exception as e:
-            self.logger.warning(f"LLM调用失败 (尝试 {attempt + 1}/{max_retries}): {e}")
-            if attempt == max_retries - 1:
-                raise
-            await asyncio.sleep(1)  # 等待1秒后重试
-    
-    raise Exception("LLM调用达到最大重试次数")
-
-def _extract_table_comment(self, llm_response: str) -> str:
-    """从LLM响应中提取表注释"""
-    # 简单的文本清理和提取逻辑
-    lines = llm_response.strip().split('\n')
-    
-    # 查找包含实际注释的行
-    for line in lines:
-        line = line.strip()
-        if line and not line.startswith('#') and not line.startswith('*'):
-            # 移除可能的前缀
-            prefixes = ['表注释:', '注释:', '说明:', '表说明:']
-            for prefix in prefixes:
-                if line.startswith(prefix):
-                    line = line[len(prefix):].strip()
-            
-            if line:
-                return line[:200]  # 限制长度
-    
-    return llm_response.strip()[:200]
-
-def _parse_field_batch_response(self, llm_response: str, fields: List[FieldInfo]) -> List[Dict[str, Any]]:
-    """解析字段批量处理响应"""
-    import json
-    import re
-    
-    try:
-        # 尝试提取JSON部分
-        json_match = re.search(r'```json\s*(.*?)\s*```', llm_response, re.DOTALL)
-        if json_match:
-            json_str = json_match.group(1)
-        else:
-            # 如果没有代码块,尝试直接解析
-            json_str = llm_response
-        
-        # 解析JSON
-        parsed_data = json.loads(json_str)
-        field_data = parsed_data.get('fields', [])
-        
-        # 映射到字段结果
-        results = []
-        for i, field in enumerate(fields):
-            if i < len(field_data):
-                data = field_data[i]
-                results.append({
-                    'success': True,
-                    'comment': data.get('comment', field.name),
-                    'is_enum': data.get('is_enum', False),
-                    'enum_values': data.get('enum_values', []),
-                    'enum_description': data.get('enum_description', '')
-                })
-            else:
-                # 默认结果
-                results.append({
-                    'success': False,
-                    'comment': field.original_comment or field.name,
-                    'is_enum': False
-                })
-        
-        return results
-        
-    except Exception as e:
-        self.logger.error(f"解析字段批量响应失败: {e}")
-        # 返回默认结果
-        return [
-            {
-                'success': False,
-                'comment': field.original_comment or field.name,
-                'is_enum': False,
-                'error': str(e)
-            }
-            for field in fields
-        ]
-
-async def _validate_enum_suggestions(self, table_metadata, enum_suggestions: List[Dict]) -> List[Dict]:
-    """验证枚举建议"""
-    from tools.database_inspector import DatabaseInspectorTool
-    from config import SCHEMA_TOOLS_CONFIG
-    
-    validated_enums = []
-    inspector = ToolRegistry.get_tool("database_inspector")
-    sample_limit = SCHEMA_TOOLS_CONFIG["enum_detection_sample_limit"]
-    
-    for enum_info in enum_suggestions:
-        field_name = enum_info['field_name']
-        
-        try:
-            # 查询字段的所有不同值
-            query = f"""
-            SELECT DISTINCT {field_name} as value, COUNT(*) as count
-            FROM {table_metadata.full_name}
-            WHERE {field_name} IS NOT NULL
-            GROUP BY {field_name}
-            ORDER BY count DESC
-            LIMIT {sample_limit}
-            """
-            
-            async with inspector.connection_pool.acquire() as conn:
-                rows = await conn.fetch(query)
-                
-                actual_values = [str(row['value']) for row in rows]
-                
-                # 验证是否真的是枚举(不同值数量合理)
-                max_enum_values = SCHEMA_TOOLS_CONFIG["enum_max_distinct_values"]
-                if len(actual_values) <= max_enum_values:
-                    validated_enums.append({
-                        'field_name': field_name,
-                        'actual_values': actual_values,
-                        'suggested_values': enum_info['suggested_values'],
-                        'description': enum_info['enum_description'],
-                        'value_counts': [(row['value'], row['count']) for row in rows]
-                    })
-                    self.logger.info(f"确认字段 {field_name} 为枚举类型,包含 {len(actual_values)} 个值")
-                else:
-                    self.logger.info(f"字段 {field_name} 不同值过多({len(actual_values)}),不认为是枚举")
-                    
-        except Exception as e:
-            self.logger.warning(f"验证字段 {field_name} 的枚举建议失败: {e}")
-    
-    return validated_enums
-
-def _load_business_dictionary(self) -> str:
-    """加载业务词典"""
-    try:
-        import os
-        dict_file = os.path.join(os.path.dirname(__file__), '..', 'prompts', 'business_dictionary.txt')
-        if os.path.exists(dict_file):
-            with open(dict_file, 'r', encoding='utf-8') as f:
-                content = f.read().strip()
-                return f"\n业务词典:\n{content}\n" if content else ""
-        return ""
-    except Exception as e:
-        self.logger.warning(f"加载业务词典失败: {e}")
-        return ""
-```
-## 5. 主AI Agent实现
-
-### 5.1 主Agent核心代码 (`training_data_agent.py`)
-
-```python
-import asyncio
-import time
-import logging
-import os
-from typing import List, Dict, Any, Optional
-from pathlib import Path
-
-from tools.base import ToolRegistry, PipelineExecutor
-from utils.data_structures import TableMetadata, TableProcessingContext, ProcessingResult
-from utils.file_manager import FileNameManager
-from utils.system_filter import SystemTableFilter
-from utils.permission_checker import DatabasePermissionChecker
-from utils.table_parser import TableListParser
-from utils.logger import setup_logging
-
-class SchemaTrainingDataAgent:
-    """Schema训练数据生成AI Agent"""
-    
-    def __init__(self, 
-                 db_connection: str,
-                 table_list_file: str,
-                 business_context: str = None,
-                 output_dir: str = None,
-                 pipeline: str = "full"):
-        
-        self.db_connection = db_connection
-        self.table_list_file = table_list_file
-        self.business_context = business_context or "数据库管理系统"
-        self.pipeline = pipeline
-        
-        # 配置管理
-        from config import SCHEMA_TOOLS_CONFIG
-        self.config = SCHEMA_TOOLS_CONFIG
-        self.output_dir = output_dir or self.config["output_directory"]
-        
-        # 初始化组件
-        self.file_manager = FileNameManager(self.output_dir)
-        self.system_filter = SystemTableFilter()
-        self.table_parser = TableListParser()
-        self.pipeline_executor = PipelineExecutor(self.config["available_pipelines"])
-        
-        # 统计信息
-        self.stats = {
-            'total_tables': 0,
-            'processed_tables': 0,
-            'failed_tables': 0,
-            'skipped_tables': 0,
-            'start_time': None,
-            'end_time': None
-        }
-        
-        self.failed_tables = []
-        self.logger = logging.getLogger("schema_tools.Agent")
-    
-    async def generate_training_data(self) -> Dict[str, Any]:
-        """主入口:生成训练数据"""
-        try:
-            self.stats['start_time'] = time.time()
-            self.logger.info("🚀 开始生成Schema训练数据")
-            
-            # 1. 初始化
-            await self._initialize()
-            
-            # 2. 检查数据库权限
-            await self._check_database_permissions()
-            
-            # 3. 解析表清单
-            tables = await self._parse_table_list()
-            
-            # 4. 过滤系统表
-            user_tables = self._filter_system_tables(tables)
-            
-            # 5. 并发处理表
-            results = await self._process_tables_concurrently(user_tables)
-            
-            # 6. 生成总结报告
-            report = self._generate_summary_report(results)
-            
-            self.stats['end_time'] = time.time()
-            self.logger.info("✅ Schema训练数据生成完成")
-            
-            return report
-            
-        except Exception as e:
-            self.stats['end_time'] = time.time()
-            self.logger.exception("❌ Schema训练数据生成失败")
-            raise
-    
-    async def _initialize(self):
-        """初始化Agent"""
-        # 创建输出目录
-        os.makedirs(self.output_dir, exist_ok=True)
-        if self.config["create_subdirectories"]:
-            os.makedirs(os.path.join(self.output_dir, "ddl"), exist_ok=True)
-            os.makedirs(os.path.join(self.output_dir, "docs"), exist_ok=True)
-            os.makedirs(os.path.join(self.output_dir, "logs"), exist_ok=True)
-        
-        # 初始化数据库工具
-        database_tool = ToolRegistry.get_tool("database_inspector", db_connection=self.db_connection)
-        await database_tool._create_connection_pool()
-        
-        self.logger.info(f"初始化完成,输出目录: {self.output_dir}")
-    
-    async def _check_database_permissions(self):
-        """检查数据库权限"""
-        if not self.config["check_permissions"]:
-            return
-        
-        inspector = ToolRegistry.get_tool("database_inspector")
-        checker = DatabasePermissionChecker(inspector)
-        
-        permissions = await checker.check_permissions()
-        
-        if not permissions['connect']:
-            raise Exception("无法连接到数据库")
-        
-        if self.config["require_select_permission"] and not permissions['select_data']:
-            if not self.config["allow_readonly_database"]:
-                raise Exception("数据库查询权限不足")
-            else:
-                self.logger.warning("数据库为只读或权限受限,部分功能可能受影响")
-        
-        self.logger.info(f"数据库权限检查完成: {permissions}")
-    
-    async def _parse_table_list(self) -> List[str]:
-        """解析表清单文件"""
-        tables = self.table_parser.parse_file(self.table_list_file)
-        self.stats['total_tables'] = len(tables)
-        self.logger.info(f"📋 从清单文件读取到 {len(tables)} 个表")
-        return tables
-    
-    def _filter_system_tables(self, tables: List[str]) -> List[str]:
-        """过滤系统表"""
-        if not self.config["filter_system_tables"]:
-            return tables
-        
-        user_tables = self.system_filter.filter_user_tables(tables)
-        filtered_count = len(tables) - len(user_tables)
-        
-        if filtered_count > 0:
-            self.logger.info(f"🔍 过滤了 {filtered_count} 个系统表,保留 {len(user_tables)} 个用户表")
-            self.stats['skipped_tables'] += filtered_count
-        
-        return user_tables
-    
-    async def _process_tables_concurrently(self, tables: List[str]) -> List[Dict[str, Any]]:
-        """并发处理表"""
-        max_concurrent = self.config["max_concurrent_tables"]
-        semaphore = asyncio.Semaphore(max_concurrent)
-        
-        self.logger.info(f"🔄 开始并发处理 {len(tables)} 个表 (最大并发: {max_concurrent})")
-        
-        # 创建任务
-        tasks = [
-            self._process_single_table_with_semaphore(semaphore, table_spec)
-            for table_spec in tables
-        ]
-        
-        # 并发执行
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-        
-        # 统计结果
-        successful = sum(1 for r in results if isinstance(r, dict) and r.get('success', False))
-        failed = len(results) - successful
-        
-        self.stats['processed_tables'] = successful
-        self.stats['failed_tables'] = failed
-        
-        self.logger.info(f"📊 处理完成: 成功 {successful} 个,失败 {failed} 个")
-        
-        return [r for r in results if isinstance(r, dict)]
-    
-    async def _process_single_table_with_semaphore(self, semaphore: asyncio.Semaphore, table_spec: str) -> Dict[str, Any]:
-        """带信号量的单表处理"""
-        async with semaphore:
-            return await self._process_single_table(table_spec)
-    
-    async def _process_single_table(self, table_spec: str) -> Dict[str, Any]:
-        """处理单个表"""
-        start_time = time.time()
-        
-        try:
-            # 解析表名
-            if '.' in table_spec:
-                schema_name, table_name = table_spec.split('.', 1)
-            else:
-                schema_name, table_name = 'public', table_spec
-            
-            full_name = f"{schema_name}.{table_name}"
-            self.logger.info(f"🔍 开始处理表: {full_name}")
-            
-            # 创建表元数据
-            table_metadata = TableMetadata(
-                schema_name=schema_name,
-                table_name=table_name,
-                full_name=full_name
-            )
-            
-            # 创建处理上下文
-            context = TableProcessingContext(
-                table_metadata=table_metadata,
-                business_context=self.business_context,
-                output_dir=self.output_dir,
-                pipeline=self.pipeline,
-                vn=None,  # 将在工具中注入
-                file_manager=self.file_manager,
-                start_time=start_time
-            )
-            
-            # 执行处理链
-            step_results = await self.pipeline_executor.execute_pipeline(self.pipeline, context)
-            
-            # 计算总体成功状态
-            success = all(result.success for result in step_results.values())
-            
-            execution_time = time.time() - start_time
-            
-            if success:
-                self.logger.info(f"✅ 表 {full_name} 处理成功,耗时: {execution_time:.2f}秒")
-            else:
-                self.logger.error(f"❌ 表 {full_name} 处理失败,耗时: {execution_time:.2f}秒")
-                self.failed_tables.append(full_name)
-            
-            return {
-                'success': success,
-                'table_name': full_name,
-                'execution_time': execution_time,
-                'step_results': {k: v.to_dict() for k, v in step_results.items()},
-                'metadata': {
-                    'fields_count': len(table_metadata.fields),
-                    'row_count': table_metadata.row_count,
-                    'enum_fields': len([f for f in table_metadata.fields if f.is_enum])
-                }
-            }
-            
-        except Exception as e:
-            execution_time = time.time() - start_time
-            error_msg = f"表 {table_spec} 处理异常: {str(e)}"
-            self.logger.exception(error_msg)
-            self.failed_tables.append(table_spec)
-            
-            return {
-                'success': False,
-                'table_name': table_spec,
-                'execution_time': execution_time,
-                'error_message': error_msg,
-                'step_results': {}
-            }
-    
-    def _generate_summary_report(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """生成总结报告"""
-        total_time = self.stats['end_time'] - self.stats['start_time']
-        
-        # 计算统计信息
-        successful_results = [r for r in results if r.get('success', False)]
-        failed_results = [r for r in results if not r.get('success', False)]
-        
-        total_fields = sum(r.get('metadata', {}).get('fields_count', 0) for r in successful_results)
-        total_enum_fields = sum(r.get('metadata', {}).get('enum_fields', 0) for r in successful_results)
-        
-        avg_execution_time = sum(r.get('execution_time', 0) for r in results) / len(results) if results else 0
-        
-        report = {
-            'summary': {
-                'total_tables': self.stats['total_tables'],
-                'processed_successfully': len(successful_results),
-                'failed': len(failed_results),
-                'skipped_system_tables': self.stats['skipped_tables'],
-                'total_execution_time': total_time,
-                'average_table_time': avg_execution_time
-            },
-            'statistics': {
-                'total_fields_processed': total_fields,
-                'enum_fields_detected': total_enum_fields,
-                'files_generated': len(successful_results) * (2 if self.pipeline == 'full' else 1)
-            },
-            'failed_tables': self.failed_tables,
-            'detailed_results': results,
-            'configuration': {
-                'pipeline': self.pipeline,
-                'business_context': self.business_context,
-                'output_directory': self.output_dir,
-                'max_concurrent_tables': self.config['max_concurrent_tables']
-            }
-        }
-        
-        # 输出总结
-        self.logger.info(f"📊 处理总结:")
-        self.logger.info(f"  ✅ 成功: {report['summary']['processed_successfully']} 个表")
-        self.logger.info(f"  ❌ 失败: {report['summary']['failed']} 个表")
-        self.logger.info(f"  ⏭️  跳过: {report['summary']['skipped_system_tables']} 个系统表")
-        self.logger.info(f"  📁 生成文件: {report['statistics']['files_generated']} 个")
-        self.logger.info(f"  🕐 总耗时: {total_time:.2f} 秒")
-        
-        if self.failed_tables:
-            self.logger.warning(f"❌ 失败的表: {', '.join(self.failed_tables)}")
-        
-        return report
-    
-    async def check_database_permissions(self) -> Dict[str, bool]:
-        """检查数据库权限(供外部调用)"""
-        inspector = ToolRegistry.get_tool("database_inspector", db_connection=self.db_connection)
-        checker = DatabasePermissionChecker(inspector)
-        return await checker.check_permissions()
-```
-
-## 6. 命令行接口实现
-
-### 6.1 命令行入口 (`__main__.py`)
-
-```python
-import argparse
-import asyncio
-import sys
-import os
-import logging
-from pathlib import Path
-
-def setup_argument_parser():
-    """设置命令行参数解析器"""
-    parser = argparse.ArgumentParser(
-        description='Schema Tools - 自动生成数据库训练数据',
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-示例用法:
-  # 基本使用
-  python -m schema_tools --db-connection "postgresql://user:pass@host:5432/db" --table-list tables.txt
-  
-  # 指定业务上下文和输出目录
-  python -m schema_tools --db-connection "..." --table-list tables.txt --business-context "电商系统" --output-dir output
-  
-  # 仅生成DDL文件
-  python -m schema_tools --db-connection "..." --table-list tables.txt --pipeline ddl_only
-  
-  # 权限检查模式
-  python -m schema_tools --db-connection "..." --check-permissions-only
-        """
-    )
-    
-    # 必需参数
-    parser.add_argument(
-        '--db-connection',
-        required=True,
-        help='数据库连接字符串 (例如: postgresql://user:pass@localhost:5432/dbname)'
-    )
-    
-    # 可选参数
-    parser.add_argument(
-        '--table-list',
-        help='表清单文件路径'
-    )
-    
-    parser.add_argument(
-        '--business-context',
-        help='业务上下文描述'
-    )
-    
-    parser.add_argument(
-        '--business-context-file',
-        help='业务上下文文件路径'
-    )
-    
-    parser.add_argument(
-        '--output-dir',
-        help='输出目录路径'
-    )
-    
-    parser.add_argument(
-        '--pipeline',
-        choices=['full', 'ddl_only', 'analysis_only'],
-        help='处理链类型'
-    )
-    
-    parser.add_argument(
-        '--max-concurrent',
-        type=int,
-        help='最大并发表数量'
-    )
-    
-    # 功能开关
-    parser.add_argument(
-        '--no-filter-system-tables',
-        action='store_true',
-        help='禁用系统表过滤'
-    )
-    
-    parser.add_argument(
-        '--check-permissions-only',
-        action='store_true',
-        help='仅检查数据库权限,不处理表'
-    )
-    
-    parser.add_argument(
-        '--verbose', '-v',
-        action='store_true',
-        help='启用详细日志输出'
-    )
-    
-    parser.add_argument(
-        '--log-file',
-        help='日志文件路径'
-    )
-    
-    return parser
-
-def load_config_with_overrides(args):
-    """加载配置并应用命令行覆盖"""
-    from config import SCHEMA_TOOLS_CONFIG
-    
-    config = SCHEMA_TOOLS_CONFIG.copy()
-    
-    # 命令行参数覆盖配置
-    if args.output_dir:
-        config["output_directory"] = args.output_dir
-    
-    if args.pipeline:
-        config["default_pipeline"] = args.pipeline
-    
-    if args.max_concurrent:
-        config["max_concurrent_tables"] = args.max_concurrent
-    
-    if args.no_filter_system_tables:
-        config["filter_system_tables"] = False
-    
-    if args.log_file:
-        config["log_file"] = args.log_file
-    
-    return config
-
-def load_business_context(args):
-    """加载业务上下文"""
-    if args.business_context_file:
-        try:
-            with open(args.business_context_file, 'r', encoding='utf-8') as f:
-                return f.read().strip()
-        except Exception as e:
-            print(f"警告: 无法读取业务上下文文件 {args.business_context_file}: {e}")
-    
-    if args.business_context:
-        return args.business_context
-    
-    from config import SCHEMA_TOOLS_CONFIG
-    return SCHEMA_TOOLS_CONFIG.get("default_business_context", "数据库管理系统")
-
-async def check_permissions_only(db_connection: str):
-    """仅检查数据库权限"""
-    from training_data_agent import SchemaTrainingDataAgent
-    
-    print("🔍 检查数据库权限...")
-    
-    try:
-        agent = SchemaTrainingDataAgent(
-            db_connection=db_connection,
-            table_list_file="",  # 不需要表清单
-            business_context=""   # 不需要业务上下文
-        )
-        
-        # 初始化Agent以建立数据库连接
-        await agent._initialize()
-        
-        # 检查权限
-        permissions = await agent.check_database_permissions()
-        
-        print("\n📋 权限检查结果:")
-        print(f"  ✅ 数据库连接: {'可用' if permissions['connect'] else '不可用'}")
-        print(f"  ✅ 元数据查询: {'可用' if permissions['select_metadata'] else '不可用'}")
-        print(f"  ✅ 数据查询: {'可用' if permissions['select_data'] else '不可用'}")
-        print(f"  ℹ️  数据库类型: {'只读' if permissions['is_readonly'] else '读写'}")
-        
-        required = ['connect', 'select_metadata', 'select_data']
-        if all(permissions.get(key) for key in required):
-            print("\n✅ 数据库权限检查通过,可以开始处理")
-            return True
-        else:
-            print("\n❌ 数据库权限不足,请检查配置")
-            return False
-            
-    except Exception as e:
-        print(f"\n❌ 权限检查失败: {e}")
-        return False
-
-async def main():
-    """主入口函数"""
-    parser = setup_argument_parser()
-    args = parser.parse_args()
-    
-    # 设置日志
-    from utils.logger import setup_logging
-    setup_logging(
-        verbose=args.verbose,
-        log_file=args.log_file
-    )
-    
-    # 仅权限检查模式
-    if args.check_permissions_only:
-        success = await check_permissions_only(args.db_connection)
-        sys.exit(0 if success else 1)
-    
-    # 验证必需参数
-    if not args.table_list:
-        print("错误: 需要指定 --table-list 参数")
-        parser.print_help()
-        sys.exit(1)
-    
-    if not os.path.exists(args.table_list):
-        print(f"错误: 表清单文件不存在: {args.table_list}")
-        sys.exit(1)
-    
-    try:
-        # 加载配置和业务上下文
-        config = load_config_with_overrides(args)
-        business_context = load_business_context(args)
-        
-        # 创建Agent
-        from training_data_agent import SchemaTrainingDataAgent
-        
-        agent = SchemaTrainingDataAgent(
-            db_connection=args.db_connection,
-            table_list_file=args.table_list,
-            business_context=business_context,
-            output_dir=config["output_directory"],
-            pipeline=config["default_pipeline"]
-        )
-        
-        # 执行生成
-        print("🚀 开始生成Schema训练数据...")
-        report = await agent.generate_training_data()
-        
-        # 输出结果
-        if report['summary']['failed'] == 0:
-            print("\n🎉 所有表处理成功!")
-        else:
-            print(f"\n⚠️  处理完成,但有 {report['summary']['failed']} 个表失败")
-        
-        print(f"📁 输出目录: {config['output_directory']}")
-        
-        # 如果有失败的表,返回非零退出码
-        sys.exit(1 if report['summary']['failed'] > 0 else 0)
-        
-    except KeyboardInterrupt:
-        print("\n\n⏹️  用户中断,程序退出")
-        sys.exit(130)
-    except Exception as e:
-        print(f"\n❌ 程序执行失败: {e}")
-        if args.verbose:
-            import traceback
-            traceback.print_exc()
-        sys.exit(1)
-
-if __name__ == "__main__":
-    asyncio.run(main())
-```
-
-### 6.2 实际输出样例(基于高速公路服务区业务)
-
-#### 6.2.1 DDL文件输出样例 (`bss_service_area.ddl`)
-
-```sql
--- 中文名: 服务区基础信息表
--- 描述: 记录高速公路服务区的基础属性,包括服务区编码、名称、方向、公司归属、地理位置、服务类型和状态,是业务分析与服务区定位的核心表。
-create table bss_service_area (
-  id varchar(32) not null,              -- 服务区唯一标识(主键,UUID格式)
-  version integer not null,             -- 数据版本号
-  create_ts timestamp(3),               -- 创建时间
-  created_by varchar(50),               -- 创建人
-  update_ts timestamp(3),               -- 更新时间
-  updated_by varchar(50),               -- 更新人
-  delete_ts timestamp(3),               -- 删除时间
-  deleted_by varchar(50),               -- 删除人
-  service_area_name varchar(255),       -- 服务区名称
-  service_area_no varchar(255),         -- 服务区编码(业务唯一标识)
-  company_id varchar(32),               -- 公司ID(外键关联bss_company.id)
-  service_position varchar(255),        -- 经纬度坐标
-  service_area_type varchar(50),        -- 服务区类型(枚举:信息化服务区、智能化服务区)
-  service_state varchar(50),            -- 服务区状态(枚举:开放、关闭、上传数据)
-  primary key (id)
-);
-```
-
-#### 6.2.2 MD文档输出样例 (`bss_service_area_detail.md`)
-
-```markdown
-## bss_service_area(服务区基础信息表)
-bss_service_area 表记录高速公路服务区的基础属性,包括服务区编码、名称、方向、公司归属、地理位置、服务类型和状态,是业务分析与服务区定位的核心表。
-
-字段列表:
-- id (varchar(32)) - 服务区唯一标识(主键,UUID格式)[示例: 0271d68ef93de9684b7ad8c7aae600b6]
-- version (integer) - 数据版本号 [示例: 3]
-- create_ts (timestamp(3)) - 创建时间 [示例: 2021-05-21 13:26:40.589]
-- created_by (varchar(50)) - 创建人 [示例: admin]
-- update_ts (timestamp(3)) - 更新时间 [示例: 2021-07-10 15:41:28.795]
-- updated_by (varchar(50)) - 更新人 [示例: admin]
-- delete_ts (timestamp(3)) - 删除时间
-- deleted_by (varchar(50)) - 删除人
-- service_area_name (varchar(255)) - 服务区名称 [示例: 鄱阳湖服务区]
-- service_area_no (varchar(255)) - 服务区编码(业务唯一标识)[示例: H0509]
-- company_id (varchar(32)) - 公司ID(外键关联bss_company.id)[示例: b1629f07c8d9ac81494fbc1de61f1ea5]
-- service_position (varchar(255)) - 经纬度坐标 [示例: 114.574721,26.825584]
-- service_area_type (varchar(50)) - 服务区类型(枚举:信息化服务区、智能化服务区)[示例: 信息化服务区]
-- service_state (varchar(50)) - 服务区状态(枚举:开放、关闭、上传数据)[示例: 开放]
-
-字段补充说明:
-- id 为主键,使用 UUID 编码,唯一标识每个服务区
-- company_id 外键关联服务区管理公司表(bss_company.id)
-- service_position 经纬度格式为"经度,纬度"
-- service_area_type 为枚举字段,包含两个取值:信息化服务区、智能化服务区
-- service_state 为枚举字段,包含三个取值:开放、关闭、上传数据
-- 本表是多个表(bss_branch, bss_car_day_count等)的核心关联实体
-```
-
-#### 6.2.3 复杂表样例 (`bss_business_day_data.ddl`)
-
-```sql
--- 中文名: 档口日营业数据表
--- 描述: 记录每天每个档口的营业情况,包含微信、支付宝、现金、金豆等支付方式的金额与订单数,是核心交易数据表。
-create table bss_business_day_data (
-  id varchar(32) not null,        -- 主键ID
-  version integer not null,       -- 数据版本号
-  create_ts timestamp(3),         -- 创建时间
-  created_by varchar(50),         -- 创建人
-  update_ts timestamp(3),         -- 更新时间
-  updated_by varchar(50),         -- 更新人
-  delete_ts timestamp(3),         -- 删除时间
-  deleted_by varchar(50),         -- 删除人
-  oper_date date,                 -- 统计日期
-  service_no varchar(255),        -- 服务区编码
-  service_name varchar(255),      -- 服务区名称
-  branch_no varchar(255),         -- 档口编码
-  branch_name varchar(255),       -- 档口名称
-  wx numeric(19,4),               -- 微信支付金额
-  wx_order integer,               -- 微信支付订单数量
-  zfb numeric(19,4),              -- 支付宝支付金额
-  zf_order integer,               -- 支付宝支付订单数量
-  rmb numeric(19,4),              -- 现金支付金额
-  rmb_order integer,              -- 现金支付订单数量
-  xs numeric(19,4),               -- 行吧支付金额
-  xs_order integer,               -- 行吧支付订单数量
-  jd numeric(19,4),               -- 金豆支付金额
-  jd_order integer,               -- 金豆支付订单数量
-  order_sum integer,              -- 订单总数
-  pay_sum numeric(19,4),          -- 支付总金额
-  source_type integer,            -- 数据来源类型ID
-  primary key (id)
-);
-```
-
-### 6.3 输出格式关键特征
-
-#### 6.3.1 DDL格式特征
-
-- **中文表头注释**: 包含表中文名和业务描述
-- **字段注释**: 每个字段都有中文注释说明
-- **枚举标识**: 对于枚举字段,在注释中明确标出可选值
-- **外键关系**: 明确标出外键关联关系
-- **业务标识**: 特殊业务字段(如编码、ID)有详细说明
-
-#### 6.3.2 MD格式特征
-
-- **表级描述**: 详细的表业务用途说明
-- **字段示例值**: 每个字段都提供真实的示例数据
-- **枚举值详解**: 枚举字段的所有可能取值完整列出
-- **补充说明**: 重要字段的额外业务逻辑说明
-- **关联关系**: 与其他表的关联关系说明
-
-
-
-## 7. 配置文件完整实现
-
-### 7.1 配置文件 (`config.py`)
-
-```python
-import os
-import sys
-
-# 导入app_config获取数据库等配置
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-try:
-    import app_config
-except ImportError:
-    app_config = None
-
-# Schema Tools专用配置
-SCHEMA_TOOLS_CONFIG = {
-    # 核心配置
-    "default_db_connection": None,  # 从命令行指定
-    "default_business_context": "数据库管理系统", 
-    "output_directory": "training/generated_data",
-    
-    # 处理链配置
-    "default_pipeline": "full",
-    "available_pipelines": {
-        "full": [
-            "database_inspector", 
-            "data_sampler", 
-            "comment_generator", 
-            "ddl_generator", 
-            "doc_generator"
-        ],
-        "ddl_only": [
-            "database_inspector", 
-            "data_sampler", 
-            "comment_generator", 
-            "ddl_generator"
-        ],
-        "analysis_only": [
-            "database_inspector", 
-            "data_sampler", 
-            "comment_generator"
-        ]
-    },
-    
-    # 数据处理配置
-    "sample_data_limit": 20,                    # 用于LLM分析的采样数据量
-    "enum_detection_sample_limit": 5000,        # 枚举检测时的采样限制
-    "enum_max_distinct_values": 20,             # 枚举字段最大不同值数量
-    "enum_varchar_keywords": [                  # VARCHAR枚举关键词
-        "性别", "gender", "状态", "status", "类型", "type", 
-        "级别", "level", "方向", "direction", "品类", "classify",
-        "模式", "mode", "格式", "format"
-    ],
-    "large_table_threshold": 1000000,           # 大表阈值(行数)
-    
-    # 并发配置
-    "max_concurrent_tables": 1,  # 建议保持1,避免LLM并发调用问题                 # 最大并发处理表数
-    
-    # LLM配置
-    "use_app_config_llm": True,                # 是否使用app_config中的LLM配置
-    "comment_generation_timeout": 30,          # LLM调用超时时间(秒)
-    "max_llm_retries": 3,                      # LLM调用最大重试次数
-    
-    # 系统表过滤配置
-    "filter_system_tables": True,              # 是否过滤系统表
-    "custom_system_prefixes": [],              # 用户自定义系统表前缀
-    "custom_system_schemas": [],               # 用户自定义系统schema
-    
-    # 权限与安全配置
-    "check_permissions": True,                 # 是否检查数据库权限
-    "require_select_permission": True,         # 是否要求SELECT权限
-    "allow_readonly_database": True,           # 是否允许只读数据库
-    
-    # 错误处理配置
-    "continue_on_error": True,                 # 遇到错误是否继续
-    "max_table_failures": 5,                  # 最大允许失败表数
-    "skip_large_tables": False,               # 是否跳过超大表
-    "max_table_size": 10000000,               # 最大表行数限制
-    
-    # 文件配置
-    "ddl_file_suffix": ".ddl",
-    "doc_file_suffix": "_detail.md",
-    "log_file": "schema_tools.log",
-    "create_subdirectories": True,            # 是否创建ddl/docs子目录
-    
-    # 输出格式配置
-    "include_sample_data_in_comments": True,  # 注释中是否包含示例数据
-    "max_comment_length": 500,                # 最大注释长度
-    "include_field_statistics": True,         # 是否包含字段统计信息
-    
-    # 调试配置
-    "debug_mode": False,                      # 调试模式
-    "save_llm_prompts": False,               # 是否保存LLM提示词
-    "save_llm_responses": False,             # 是否保存LLM响应
-}
-
-# 从app_config获取相关配置(如果可用)
-if app_config:
-    # 继承数据库配置
-    if hasattr(app_config, 'PGVECTOR_CONFIG'):
-        pgvector_config = app_config.PGVECTOR_CONFIG
-        if not SCHEMA_TOOLS_CONFIG["default_db_connection"]:
-            SCHEMA_TOOLS_CONFIG["default_db_connection"] = (
-                f"postgresql://{pgvector_config['user']}:{pgvector_config['password']}"
-                f"@{pgvector_config['host']}:{pgvector_config['port']}/{pgvector_config['dbname']}"
-            )
-
-def get_config():
-    """获取当前配置"""
-    return SCHEMA_TOOLS_CONFIG
-
-def update_config(**kwargs):
-    """更新配置"""
-    SCHEMA_TOOLS_CONFIG.update(kwargs)
-
-def validate_config():
-    """验证配置有效性"""
-    errors = []
-    
-    # 检查必要配置
-    if SCHEMA_TOOLS_CONFIG["max_concurrent_tables"] <= 0:
-        errors.append("max_concurrent_tables 必须大于0")
-    
-    if SCHEMA_TOOLS_CONFIG["sample_data_limit"] <= 0:
-        errors.append("sample_data_limit 必须大于0")
-    
-    # 检查处理链配置
-    default_pipeline = SCHEMA_TOOLS_CONFIG["default_pipeline"]
-    available_pipelines = SCHEMA_TOOLS_CONFIG["available_pipelines"]
-    
-    if default_pipeline not in available_pipelines:
-        errors.append(f"default_pipeline '{default_pipeline}' 不在 available_pipelines 中")
-    
-    if errors:
-        raise ValueError("配置验证失败:\n" + "\n".join(f"  - {error}" for error in errors))
-    
-    return True
-
-# 启动时验证配置
-try:
-    validate_config()
-except ValueError as e:
-    print(f"警告: {e}")
-```
-
-这个详细设计文档涵盖了Schema Tools的完整实现,包括:
-
-## 核心特性
-
-1. **完整的数据结构设计** - 标准化的数据模型
-2. **工具注册机制** - 装饰器注册和自动依赖注入
-3. **Pipeline处理链** - 可配置的处理流程
-4. **并发处理** - 表级并发和错误处理
-5. **LLM集成** - 智能注释生成和枚举检测
-6. **权限管理** - 数据库权限检查和只读适配
-7. **命令行接口** - 完整的CLI支持
-
-## 实现亮点
-
-- **类型安全**: 使用dataclass定义明确的数据结构
-- **错误处理**: 完善的异常处理和重试机制
-- **可扩展性**: 工具注册机制便于添加新功能
-- **配置灵活**: 多层次配置支持
-- **日志完整**: 详细的执行日志和统计报告
-
-## 8. Question-SQL生成功能详细设计(新增)
-
-### 8.1 功能概述
-
-Question-SQL生成功能是Schema Tools的扩展模块,用于从已生成的DDL和MD文件自动生成高质量的Question-SQL训练数据对。该功能可以独立运行,支持人工检查DDL/MD文件后再执行。
-
-### 8.2 核心组件设计
-
-#### 8.2.1 QuestionSQLGenerationAgent (`qs_agent.py`)
-
-```python
-class QuestionSQLGenerationAgent:
-    """Question-SQL生成Agent"""
-    
-    def __init__(self, 
-                 output_dir: str,
-                 table_list_file: str,
-                 business_context: str,
-                 db_name: str = None):
-        """
-        初始化Agent
-        
-        Args:
-            output_dir: 输出目录(包含DDL和MD文件)
-            table_list_file: 表清单文件路径
-            business_context: 业务上下文
-            db_name: 数据库名称(用于输出文件命名)
-        """
-        self.output_dir = Path(output_dir)
-        self.table_list_file = table_list_file
-        self.business_context = business_context
-        self.db_name = db_name or "db"
-        
-        # 初始化组件
-        self.validator = FileCountValidator()
-        self.md_analyzer = MDFileAnalyzer(output_dir)
-        self.theme_extractor = None  # 延迟初始化
-        
-        # 中间结果存储
-        self.intermediate_results = []
-        self.intermediate_file = None
-```
-
-#### 8.2.2 文件数量验证器 (`validators/file_count_validator.py`)
-
-```python
-@dataclass
-class ValidationResult:
-    """验证结果"""
-    is_valid: bool
-    table_count: int
-    ddl_count: int
-    md_count: int
-    error: str = ""
-    missing_ddl: List[str] = field(default_factory=list)
-    missing_md: List[str] = field(default_factory=list)
-
-class FileCountValidator:
-    """文件数量验证器"""
-    
-    def validate(self, table_list_file: str, output_dir: str) -> ValidationResult:
-        """
-        验证生成的文件数量是否与表数量一致
-        
-        主要验证:
-        1. 表数量是否超过20个限制
-        2. DDL文件数量是否与表数量一致
-        3. MD文件数量是否与表数量一致
-        """
-        # 解析表清单
-        tables = self.table_parser.parse_file(table_list_file)
-        table_count = len(tables)
-        
-        # 检查表数量限制
-        max_tables = self.config['qs_generation']['max_tables']
-        if table_count > max_tables:
-            return ValidationResult(
-                is_valid=False,
-                table_count=table_count,
-                ddl_count=0,
-                md_count=0,
-                error=f"表数量({table_count})超过限制({max_tables})"
-            )
-```
-
-#### 8.2.3 MD文件分析器 (`analyzers/md_analyzer.py`)
-
-```python
-class MDFileAnalyzer:
-    """MD文件分析器"""
-    
-    async def read_all_md_files(self) -> str:
-        """
-        读取所有MD文件的完整内容
-        
-        Returns:
-            所有MD文件内容的组合字符串
-        """
-        md_files = sorted(self.output_dir.glob("*_detail.md"))
-        
-        all_contents = []
-        all_contents.append(f"# 数据库表结构文档汇总\n")
-        all_contents.append(f"共包含 {len(md_files)} 个表\n\n")
-        
-        for md_file in md_files:
-            content = md_file.read_text(encoding='utf-8')
-            
-            # 添加分隔符,便于LLM区分不同表
-            all_contents.append("=" * 80)
-            all_contents.append(f"# 文件: {md_file.name}")
-            all_contents.append("=" * 80)
-            all_contents.append(content)
-            all_contents.append("\n")
-        
-        combined_content = "\n".join(all_contents)
-        
-        # 检查内容大小(预估token数)
-        estimated_tokens = len(combined_content) / 4
-        if estimated_tokens > 100000:
-            self.logger.warning(f"MD内容可能过大,预估tokens: {estimated_tokens:.0f}")
-        
-        return combined_content
-```
-
-#### 8.2.4 主题提取器 (`analyzers/theme_extractor.py`)
-
-```python
-class ThemeExtractor:
-    """主题提取器"""
-    
-    async def extract_themes(self, md_contents: str) -> List[Dict[str, Any]]:
-        """
-        从MD内容中提取分析主题
-        """
-        prompt = f"""你是一位经验丰富的业务数据分析师,正在分析{self.business_context}的数据库。
-
-以下是数据库中所有表的详细结构说明:
-
-{md_contents}
-
-基于对这些表结构的理解,请从业务分析的角度提出 {theme_count} 个数据查询分析主题。
-
-要求:
-1. 每个主题应该有明确的业务价值和分析目标
-2. 主题之间应该有所区别,覆盖不同的业务领域  
-3. 你需要自行决定每个主题应该涉及哪些表
-4. 主题应该体现实际业务场景的数据分析需求
-5. 考虑时间维度、对比分析、排名统计等多种分析角度
-
-请以JSON格式输出:
-```json
-{{
-  "themes": [
-    {{
-      "name": "经营收入分析",
-      "description": "分析服务区的营业收入情况,包括日收入趋势、月度对比、服务区排名等",
-      "focus_areas": ["收入趋势", "服务区对比", "时间维度分析"],
-      "related_tables": ["bss_business_day_data", "其他相关表名"]
-    }}
-  ]
-}}
-```"""
-        
-        response = await self._call_llm(prompt)
-        themes = self._parse_theme_response(response)
-        
-        return themes
-```
-
-### 8.3 执行流程详细设计
-
-#### 8.3.1 主流程
-
-```python
-async def generate(self) -> Dict[str, Any]:
-    """生成Question-SQL对"""
-    
-    # 1. 验证文件数量
-    validation_result = self.validator.validate(self.table_list_file, str(self.output_dir))
-    if not validation_result.is_valid:
-        raise ValueError(f"文件验证失败: {validation_result.error}")
-    
-    # 2. 读取所有MD文件内容
-    md_contents = await self.md_analyzer.read_all_md_files()
-    
-    # 3. 初始化LLM组件
-    self._initialize_llm_components()
-    
-    # 4. 提取分析主题
-    themes = await self.theme_extractor.extract_themes(md_contents)
-    
-    # 5. 初始化中间结果文件
-    self._init_intermediate_file()
-    
-    # 6. 处理每个主题
-    if self.config['qs_generation']['max_concurrent_themes'] > 1:
-        results = await self._process_themes_parallel(themes, md_contents)
-    else:
-        results = await self._process_themes_serial(themes, md_contents)
-    
-    # 7. 汇总所有主题的问答对并保存最终结果
-    all_qs_pairs = [pair for r in results if r.get('success') for pair in r.get('qs_pairs', [])]
-    output_file = await self._save_final_results(all_qs_pairs)
-    
-    # 8. 生成并返回执行报告(report 的构建细节此处省略)
-    return report
-```
-
-#### 8.3.2 主题处理
-
-```python
-async def _process_single_theme(self, theme: Dict, md_contents: str) -> Dict:
-    """处理单个主题"""
-    
-    prompt = f"""你是一位业务数据分析师,正在为{self.business_context}设计数据查询。
-
-当前分析主题:{theme['name']}
-主题描述:{theme['description']}
-关注领域:{', '.join(theme['focus_areas'])}
-相关表:{', '.join(theme['related_tables'])}
-
-数据库表结构信息:
-{md_contents}
-
-请为这个主题生成 {questions_count} 个业务问题和对应的SQL查询。
-
-要求:
-1. 问题应该从业务角度出发,贴合主题要求,具有实际分析价值
-2. SQL必须使用PostgreSQL语法
-3. 考虑实际业务逻辑(如软删除使用 delete_ts IS NULL 条件)
-4. 使用中文别名提高可读性(使用 AS 指定列别名)
-5. 问题应该多样化,覆盖不同的分析角度
-6. 包含时间筛选、分组统计、排序、限制等不同类型的查询
-7. SQL语句末尾必须以分号结束
-
-输出JSON格式:
-```json
-[
-  {{
-    "question": "具体的业务问题?",
-    "sql": "SELECT column AS 中文名 FROM table WHERE condition;"
-  }}
-]
-```"""
-    
-    response = await self._call_llm(prompt)
-    qs_pairs = self._parse_qs_response(response)
-    validated_pairs = self._validate_qs_pairs(qs_pairs, theme['name'])
-    
-    # 保存中间结果
-    await self._save_theme_results(theme['name'], validated_pairs)
-    
-    return {
-        'success': True,
-        'theme_name': theme['name'],
-        'qs_pairs': validated_pairs
-    }
-```
-
-### 8.4 中间结果保存机制
-
-```python
-def _init_intermediate_file(self):
-    """初始化中间结果文件"""
-    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-    self.intermediate_file = self.output_dir / f"qs_intermediate_{timestamp}.json"
-    self.intermediate_results = []
-
-async def _save_theme_results(self, theme_name: str, qs_pairs: List[Dict]):
-    """保存单个主题的结果"""
-    theme_result = {
-        "theme": theme_name,
-        "timestamp": datetime.now().isoformat(),
-        "questions_count": len(qs_pairs),
-        "questions": qs_pairs
-    }
-    
-    self.intermediate_results.append(theme_result)
-    
-    # 立即保存到中间文件
-    if self.config['qs_generation']['save_intermediate']:
-        with open(self.intermediate_file, 'w', encoding='utf-8') as f:
-            json.dump(self.intermediate_results, f, ensure_ascii=False, indent=2)
-```
-
-## 9. SQL验证器核心模块
-
-### 9.1 SQL验证器设计 (`validators/sql_validator.py`)
-
-```python
-@dataclass
-class SQLValidationResult:
-    """SQL验证结果"""
-    sql: str
-    valid: bool
-    error_message: str = ""
-    execution_time: float = 0.0
-    retry_count: int = 0
-    
-    # SQL修复相关字段
-    repair_attempted: bool = False
-    repair_successful: bool = False
-    repaired_sql: str = ""
-    repair_error: str = ""
-
-@dataclass
-class ValidationStats:
-    """验证统计信息"""
-    total_sqls: int = 0
-    valid_sqls: int = 0
-    invalid_sqls: int = 0
-    total_time: float = 0.0
-    avg_time_per_sql: float = 0.0
-    retry_count: int = 0
-    
-    # SQL修复统计
-    repair_attempted: int = 0
-    repair_successful: int = 0
-    repair_failed: int = 0
-
-class SQLValidator:
-    """SQL验证器核心类"""
-    
-    def __init__(self, db_connection: str = None):
-        self.db_connection = db_connection
-        self.connection_pool = None
-        self.config = SCHEMA_TOOLS_CONFIG['sql_validation']
-        
-    async def validate_sql(self, sql: str, retry_count: int = 0) -> SQLValidationResult:
-        """验证单个SQL语句"""
-        start_time = time.time()
-        
-        try:
-            if not self.connection_pool:
-                await self._get_connection_pool()
-            
-            # 使用EXPLAIN验证SQL语法和表结构
-            explain_sql = f"EXPLAIN {sql}"
-            
-            async with self.connection_pool.acquire() as conn:
-                # 设置只读模式
-                if self.config['readonly_mode']:
-                    await conn.execute("SET default_transaction_read_only = on")
-                
-                # 执行EXPLAIN
-                await asyncio.wait_for(
-                    conn.fetch(explain_sql),
-                    timeout=self.config['validation_timeout']
-                )
-            
-            execution_time = time.time() - start_time
-            
-            return SQLValidationResult(
-                sql=sql,
-                valid=True,
-                execution_time=execution_time,
-                retry_count=retry_count
-            )
-            
-        except asyncio.TimeoutError:
-            execution_time = time.time() - start_time
-            error_msg = f"SQL验证超时({self.config['validation_timeout']}秒)"
-            
-            return SQLValidationResult(
-                sql=sql,
-                valid=False,
-                error_message=error_msg,
-                execution_time=execution_time,
-                retry_count=retry_count
-            )
-            
-        except Exception as e:
-            execution_time = time.time() - start_time
-            error_msg = str(e)
-            
-            # 检查是否需要重试
-            if retry_count < self.config['max_retry_count'] and self._should_retry(e):
-                await asyncio.sleep(0.5)  # 短暂延迟
-                return await self.validate_sql(sql, retry_count + 1)
-            
-            return SQLValidationResult(
-                sql=sql,
-                valid=False,
-                error_message=error_msg,
-                execution_time=execution_time,
-                retry_count=retry_count
-            )
-    
-    async def validate_sqls_batch(self, sqls: List[str]) -> List[SQLValidationResult]:
-        """批量验证SQL语句"""
-        max_concurrent = self.config['max_concurrent_validations']
-        semaphore = asyncio.Semaphore(max_concurrent)
-        
-        async def validate_with_semaphore(sql):
-            async with semaphore:
-                return await self.validate_sql(sql)
-        
-        # 并发执行验证
-        tasks = [validate_with_semaphore(sql) for sql in sqls]
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-        
-        # 处理异常结果
-        processed_results = []
-        for i, result in enumerate(results):
-            if isinstance(result, Exception):
-                processed_results.append(SQLValidationResult(
-                    sql=sqls[i],
-                    valid=False,
-                    error_message=f"验证异常: {str(result)}"
-                ))
-            else:
-                processed_results.append(result)
-        
-        return processed_results
-    
-    def calculate_stats(self, results: List[SQLValidationResult]) -> ValidationStats:
-        """计算验证统计信息"""
-        stats = ValidationStats()
-        
-        stats.total_sqls = len(results)
-        stats.valid_sqls = sum(1 for r in results if r.valid)
-        stats.invalid_sqls = stats.total_sqls - stats.valid_sqls
-        stats.total_time = sum(r.execution_time for r in results)
-        stats.avg_time_per_sql = stats.total_time / stats.total_sqls if stats.total_sqls > 0 else 0.0
-        stats.retry_count = sum(r.retry_count for r in results)
-        
-        # 修复统计
-        stats.repair_attempted = sum(1 for r in results if r.repair_attempted)
-        stats.repair_successful = sum(1 for r in results if r.repair_successful)
-        stats.repair_failed = stats.repair_attempted - stats.repair_successful
-        
-        return stats
-```
-
-### 9.2 SQL验证Agent (`sql_validation_agent.py`)
-
-```python
-class SQLValidationAgent:
-    """SQL验证Agent - 管理SQL验证的完整流程"""
-    
-    async def validate(self) -> Dict[str, Any]:
-        """执行SQL验证流程"""
-        
-        # 1. 读取输入文件
-        questions_sqls = await self._load_questions_sqls()
-        
-        # 2. 提取SQL语句
-        sqls = [item['sql'] for item in questions_sqls]
-        
-        # 3. 执行验证
-        validation_results = await self._validate_sqls_with_batching(sqls)
-        
-        # 4. 计算统计信息
-        stats = self.validator.calculate_stats(validation_results)
-        
-        # 5. 尝试修复失败的SQL(如果启用LLM修复)
-        if self.config.get('enable_sql_repair', False) and self.vn:
-            validation_results = await self._attempt_sql_repair(questions_sqls, validation_results)
-            stats = self.validator.calculate_stats(validation_results)
-        
-        # 6. 修改原始JSON文件(如果启用文件修改)
-        file_modification_stats = {'modified': 0, 'deleted': 0, 'failed_modifications': 0}
-        if self.config.get('modify_original_file', False):
-            file_modification_stats = await self._modify_original_json_file(questions_sqls, validation_results)
-        
-        # 7. 生成详细报告
-        report = await self._generate_report(questions_sqls, validation_results, stats, file_modification_stats)
-        
-        # 8. 保存验证报告
-        if self.config['save_validation_report']:
-            await self._save_validation_report(report)
-        
-        return report
-    
-    async def _attempt_sql_repair(self, questions_sqls: List[Dict], validation_results: List[SQLValidationResult]) -> List[SQLValidationResult]:
-        """尝试修复失败的SQL"""
-        
-        failed_indices = [i for i, result in enumerate(validation_results) if not result.valid]
-        
-        if not failed_indices:
-            return validation_results
-        
-        # 批量修复
-        batch_size = self.config.get('repair_batch_size', 5)
-        updated_results = validation_results.copy()
-        
-        for i in range(0, len(failed_indices), batch_size):
-            batch_indices = failed_indices[i:i + batch_size]
-            
-            # 准备批次数据
-            batch_data = []
-            for idx in batch_indices:
-                batch_data.append({
-                    'index': idx,
-                    'question': questions_sqls[idx]['question'],
-                    'sql': validation_results[idx].sql,
-                    'error': validation_results[idx].error_message
-                })
-            
-            # 调用LLM修复
-            repaired_sqls = await self._repair_sqls_with_llm(batch_data)
-            
-            # 验证修复后的SQL
-            for j, idx in enumerate(batch_indices):
-                original_result = updated_results[idx]
-                original_result.repair_attempted = True
-                
-                if j < len(repaired_sqls) and repaired_sqls[j]:
-                    repaired_sql = repaired_sqls[j]
-                    
-                    # 验证修复后的SQL
-                    repair_result = await self.validator.validate_sql(repaired_sql)
-                    
-                    if repair_result.valid:
-                        # 修复成功
-                        original_result.repair_successful = True
-                        original_result.repaired_sql = repaired_sql
-                        original_result.valid = True  # 更新为有效
-                    else:
-                        # 修复失败
-                        original_result.repair_successful = False
-                        original_result.repair_error = repair_result.error_message
-                else:
-                    # LLM修复失败
-                    original_result.repair_successful = False
-                    original_result.repair_error = "LLM修复失败或返回空结果"
-        
-        return updated_results
-    
-    async def _modify_original_json_file(self, questions_sqls: List[Dict], validation_results: List[SQLValidationResult]) -> Dict[str, int]:
-        """修改原始JSON文件"""
-        stats = {'modified': 0, 'deleted': 0, 'failed_modifications': 0}
-        
-        try:
-            # 读取原始JSON文件
-            with open(self.input_file, 'r', encoding='utf-8') as f:
-                original_data = json.load(f)
-            
-            # 创建备份文件
-            backup_file = Path(str(self.input_file) + '.backup')
-            with open(backup_file, 'w', encoding='utf-8') as f:
-                json.dump(original_data, f, ensure_ascii=False, indent=2)
-            
-            # 构建修改计划
-            modifications = []
-            deletions = []
-            
-            for i, (qs, result) in enumerate(zip(questions_sqls, validation_results)):
-                if result.repair_successful and result.repaired_sql:
-                    # 修复成功的SQL
-                    modifications.append({
-                        'index': i,
-                        'original_sql': result.sql,
-                        'repaired_sql': result.repaired_sql,
-                        'question': qs['question']
-                    })
-                elif not result.valid and not result.repair_successful:
-                    # 无法修复的SQL,标记删除
-                    deletions.append({
-                        'index': i,
-                        'question': qs['question'],
-                        'sql': result.sql,
-                        'error': result.error_message
-                    })
-            
-            # 执行修改(从后往前,避免索引变化)
-            new_data = original_data.copy()
-            
-            # 先删除无效项(从后往前删除)
-            for deletion in sorted(deletions, key=lambda x: x['index'], reverse=True):
-                if deletion['index'] < len(new_data):
-                    new_data.pop(deletion['index'])
-                    stats['deleted'] += 1
-            
-            # 再修改SQL(需要重新计算索引)
-            for modification in sorted(modifications, key=lambda x: x['index']):
-                # 计算删除后的新索引
-                new_index = modification['index']
-                for deletion in deletions:
-                    if deletion['index'] < modification['index']:
-                        new_index -= 1
-                
-                if new_index < len(new_data):
-                    new_data[new_index]['sql'] = modification['repaired_sql']
-                    stats['modified'] += 1
-            
-            # 写入修改后的文件
-            with open(self.input_file, 'w', encoding='utf-8') as f:
-                json.dump(new_data, f, ensure_ascii=False, indent=2)
-            
-            # 记录详细修改信息到日志文件
-            await self._write_modification_log(modifications, deletions)
-            
-        except Exception as e:
-            stats['failed_modifications'] = 1
-        
-        return stats
-```
-
-## 10. 工作流编排器设计
-
-### 10.1 SchemaWorkflowOrchestrator核心功能
-
-```python
-class SchemaWorkflowOrchestrator:
-    """端到端的Schema处理编排器"""
-    
-    async def execute_complete_workflow(self) -> Dict[str, Any]:
-        """执行完整的Schema处理工作流程"""
-        
-        # 步骤1: 生成DDL和MD文件
-        await self._execute_step_1_ddl_md_generation()
-        
-        # 步骤2: 生成Question-SQL对
-        await self._execute_step_2_question_sql_generation()
-        
-        # 步骤3: 验证和修正SQL(可选)
-        if self.enable_sql_validation:
-            await self._execute_step_3_sql_validation()
-        
-        # 生成最终报告
-        final_report = await self._generate_final_report()
-        
-        return final_report
-```
-
-这样,文档就与当前代码完全一致了,包含了所有新增的SQL验证、LLM修复、文件修改等功能的详细设计说明。

+ 155 - 73
docs/run_training说明.md

@@ -1,7 +1,28 @@
-## 文件扩展名与处理函数对应关系
+# 训练数据管理系统说明
+
+## 概述
+
+训练数据管理系统位于 `data_pipeline/trainer/` 目录下,负责将生成的训练数据文件加载到向量数据库中。该系统支持多种文件格式的自动识别和处理。
+
+## 主要组件
+
+### 1. 核心文件
+- **`run_training.py`** - 主训练脚本,支持命令行调用
+- **`vanna_trainer.py`** - 训练器核心模块,封装训练逻辑
+
+### 2. 配置来源
+训练数据路径配置现已统一到 `data_pipeline/config.py`:
+```python
+SCHEMA_TOOLS_CONFIG = {
+    "output_directory": "./data_pipeline/training_data/",
+    # 其他配置...
+}
+```
+
+## 文件格式与处理逻辑
 
 ### 文件处理优先级和判断逻辑
-代码中的文件类型判断按以下顺序进行:
+代码按以下顺序判断文件类型
 
 1. **`.ddl`** → DDL文件
 2. **`.md` 或 `.markdown`** → 文档文件  
@@ -10,7 +31,7 @@
 5. **`.sql` (但不以 `_pair.sql` 或 `_pairs.sql` 结尾)** → SQL示例文件
 6. **其他** → 跳过处理
 
-### 1. **DDL文件** (`.ddl`)
+### 1. DDL文件 (`.ddl`)
 - **处理函数**: `train_ddl_statements()`
 - **调用的训练函数**: `train_ddl()`
 - **文件格式**: 
@@ -18,17 +39,15 @@
   - 每个DDL语句之间用分号分隔
   - 示例格式:
     ```sql
-    CREATE TABLE users (
-        id INT PRIMARY KEY,
-        name VARCHAR(100)
-    );
-    CREATE TABLE orders (
-        id INT PRIMARY KEY,
-        user_id INT REFERENCES users(id)
+    create table public.bss_company (
+      id varchar(32) not null     -- 主键ID,主键,
+      version integer not null    -- 版本号,
+      company_name varchar(255)   -- 公司名称,
+      primary key (id)
     );
     ```
 
-### 2. **文档文件** (`.md`, `.markdown`)
+### 2. 文档文件 (`.md`, `.markdown`)
 - **处理函数**: `train_documentation_blocks()`
 - **调用的训练函数**: `train_documentation()`
 - **文件格式**:
@@ -36,18 +55,15 @@
   - **非Markdown文件**: 使用 `---` 作为分隔符
   - 示例格式:
     ```markdown
-    # 用户表说明
-    用户表存储系统中所有用户的基本信息...
-    
-    ## 字段说明
-    - id: 用户唯一标识符
-    - name: 用户姓名
+    ## bss_company(存储高速公路管理公司信息)
+    bss_company 表存储高速公路管理公司信息,用于服务区运营管理
     
-    ### 注意事项
-    用户名不能重复...
+    字段列表:
+    - id (varchar(32)) - 主键ID [主键, 非空]
+    - company_name (varchar(255)) - 公司名称
     ```
 
-### 3. **SQL示例文件** (`.sql`, 但排除 `_pair.sql` 和 `_pairs.sql`)
+### 3. SQL示例文件 (`.sql`, 但排除 `_pair.sql` 和 `_pairs.sql`)
 - **处理函数**: `train_sql_examples()`
 - **调用的训练函数**: `train_sql_example()`
 - **文件格式**:
@@ -55,12 +71,11 @@
   - 每个SQL示例之间用分号分隔
   - 示例格式:
     ```sql
-    SELECT * FROM users WHERE age > 18;
-    SELECT COUNT(*) FROM orders WHERE status = 'completed';
-    SELECT u.name, COUNT(o.id) FROM users u LEFT JOIN orders o ON u.id = o.user_id GROUP BY u.id;
+    SELECT * FROM bss_company WHERE delete_ts IS NULL;
+    SELECT company_name, company_no FROM bss_company ORDER BY company_name;
     ```
 
-### 4. **格式化问答对文件** (`_pair.sql`, `_pairs.sql`)
+### 4. 格式化问答对文件 (`_pair.sql`, `_pairs.sql`)
 - **处理函数**: `train_formatted_question_sql_pairs()`
 - **调用的训练函数**: `train_question_sql_pair()`
 - **文件格式**:
@@ -69,18 +84,19 @@
   - 支持单行和多行SQL
   - 示例格式:
     ```
-    Question: 查询所有成年用户
-    SQL: SELECT * FROM users WHERE age >= 18;
+    Question: 查询所有公司信息
+    SQL: SELECT * FROM bss_company WHERE delete_ts IS NULL;
 
-    Question: 统计每个用户的订单数量
+    Question: 统计每个公司的服务区数量
     SQL: 
-    SELECT u.name, COUNT(o.id) as order_count
-    FROM users u 
-    LEFT JOIN orders o ON u.id = o.user_id 
-    GROUP BY u.id, u.name;
+    SELECT c.company_name, COUNT(sa.id) as area_count
+    FROM bss_company c 
+    LEFT JOIN bss_service_area sa ON c.id = sa.company_id 
+    WHERE c.delete_ts IS NULL
+    GROUP BY c.company_name;
     ```
 
-### 5. **JSON格式问答对文件** (`_pair.json`, `_pairs.json`)
+### 5. JSON格式问答对文件 (`_pair.json`, `_pairs.json`)
 - **处理函数**: `train_json_question_sql_pairs()`
 - **调用的训练函数**: `train_question_sql_pair()`
 - **文件格式**:
@@ -90,58 +106,124 @@
     ```json
     [
         {
-            "question": "查询所有成年用户",
-            "sql": "SELECT * FROM users WHERE age >= 18"
+            "question": "查询所有公司信息",
+            "sql": "SELECT * FROM bss_company WHERE delete_ts IS NULL"
         },
         {
-            "question": "统计每个用户的订单数量",
-            "sql": "SELECT u.name, COUNT(o.id) as order_count FROM users u LEFT JOIN orders o ON u.id = o.user_id GROUP BY u.id, u.name"
+            "question": "按公司统计服务区数量",
+            "sql": "SELECT company_name, COUNT(*) FROM bss_service_area GROUP BY company_name"
         }
     ]
     ```
 
-### 6. **传统问答对文件** (其他格式,通过 `train_question_sql_pairs()` 处理)
-- **处理函数**: `train_question_sql_pairs()`
-- **调用的训练函数**: `train_question_sql_pair()`
-- **文件格式**:
-  - 每行一个问答对
-  - 使用 `::` 分隔问题和SQL
-  - 示例格式:
-    ```
-    查询所有成年用户::SELECT * FROM users WHERE age >= 18
-    统计订单总数::SELECT COUNT(*) FROM orders
-    ```
-
-
+## 使用方式
+
+### 1. 命令行使用
+
+#### 基本使用
+```bash
+# 使用默认配置路径
+python -m data_pipeline.trainer.run_training
+
+# 指定训练数据目录
+python -m data_pipeline.trainer.run_training --data_path ./data_pipeline/training_data/
+```
+
+### 2. 在工作流中调用
+训练数据加载已集成到工作流编排器中,作为第4步自动执行:
+```bash
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://user:pass@localhost:5432/database" \
+  --table-list tables.txt \
+  --business-context "业务描述"
+```
+
+### 3. 编程方式调用
+```python
+from data_pipeline.trainer.run_training import process_training_files
+
+# 处理训练文件
+success = process_training_files("./data_pipeline/training_data/")
+if success:
+    print("训练数据加载成功")
+```
+
+## 扫描策略
+
+### 目录扫描范围
+- **只扫描指定目录的直接文件**,不递归扫描子目录
+- 跳过所有子目录,只处理文件
+
+### 不支持的文件类型
+- **`.txt` 文件** - 不被处理
+- **其他扩展名文件** - 被跳过
+- **子目录** - 被忽略
+
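+下面是一个遵循上述扫描策略的最小示意(仅作说明的假设性代码,函数名与分组方式并非实际实现,实际逻辑以 `data_pipeline/trainer/run_training.py` 为准):
+
+```python
+from pathlib import Path
+
+def scan_training_files(data_path: str) -> dict:
+    """只扫描指定目录的直接文件,跳过子目录与不支持的扩展名"""
+    groups = {"ddl": [], "doc": [], "sql": [], "pair_sql": [], "pair_json": []}
+    for item in Path(data_path).iterdir():
+        if item.is_dir():
+            continue                      # 子目录被忽略
+        name = item.name.lower()
+        if name.endswith(".ddl"):
+            groups["ddl"].append(item)
+        elif name.endswith((".md", ".markdown")):
+            groups["doc"].append(item)
+        elif name.endswith(("_pair.json", "_pairs.json")):
+            groups["pair_json"].append(item)
+        elif name.endswith(("_pair.sql", "_pairs.sql")):
+            groups["pair_sql"].append(item)
+        elif name.endswith(".sql"):
+            groups["sql"].append(item)
+        # 其他扩展名(包括 .txt)不被处理
+    return groups
+```
+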
+## 配置架构
+
+### 配置优先级
+```
+命令行参数 > data_pipeline/config.py > 默认值
+```
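+
+上述优先级的取值逻辑大致等价于下面的示意(假设性示例,`resolve_output_dir` 并非实际函数名):
+
+```python
+from typing import Optional
+
+from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+
+def resolve_output_dir(cli_value: Optional[str] = None) -> str:
+    """按 命令行参数 > data_pipeline/config.py > 默认值 的顺序取值"""
+    if cli_value:                                   # 1. 命令行参数优先
+        return cli_value
+    configured = SCHEMA_TOOLS_CONFIG.get("output_directory")
+    if configured:                                  # 2. 其次读取统一配置
+        return configured
+    return "./data_pipeline/training_data/"         # 3. 兜底默认值
+```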
+
+### 统一配置
+所有数据管道相关配置现统一在 `data_pipeline/config.py`:
+```python
+SCHEMA_TOOLS_CONFIG = {
+    "output_directory": "./data_pipeline/training_data/",
+    # 训练相关配置...
+}
+```
 
 ## 统计信息
 
-训练完成后会显示以下统计:
+训练完成后显示统计:
 - DDL文件数量
 - 文档文件数量  
 - SQL示例文件数量
 - 格式化问答对文件数量
 - JSON问答对文件数量
-
-这个设计使得训练系统能够灵活处理多种不同格式的训练数据,满足不同场景下的数据准备需求。
-
-
-# 训练脚本批处理配置
-# 这些配置仅用于 training/run_training.py 训练脚本的批处理优化
-# 批处理可以提高训练效率,但会增加内存使用和复杂度
-# 
-# TRAINING_BATCH_PROCESSING_ENABLED: 
-#   - True: 启用批处理,将多个训练项目打包一起处理
-#   - False: 逐项处理,每个训练项目单独处理(更稳定但较慢)
-# 
-# TRAINING_BATCH_SIZE: 每批处理的训练项目数量
-#   - 较大值: 处理更快但占用更多内存
-#   - 较小值: 内存占用少但处理较慢
-#   - 建议范围: 5-20
-# 
-# TRAINING_MAX_WORKERS: 训练批处理的最大工作线程数
-#   - 建议设置为CPU核心数的1-2倍
-#   - 过多线程可能导致资源竞争
-TRAINING_BATCH_PROCESSING_ENABLED = True    # 是否启用训练数据批处理
-TRAINING_BATCH_SIZE = 10                    # 每批处理的训练项目数量
-TRAINING_MAX_WORKERS = 4                    # 训练批处理的最大工作线程数
+- 总训练记录数
+- 各数据类型分布
+
+## 向量数据库集成
+
+### 支持的向量数据库
+- **ChromaDB** - 本地文件存储
+- **PgVector** - PostgreSQL向量扩展
+
+### 数据类型标识
+训练数据加载时会自动标记数据类型:
+- `ddl` - DDL语句
+- `documentation` - 文档内容
+- `sql` - SQL示例和问答对
+
+### 验证机制
+训练完成后自动验证:
+- 从向量数据库检索训练数据
+- 统计各类型数据数量
+- 确保数据成功加载
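+
+验证过程大致相当于下面的示意(假设性示例;`vn` 为已初始化的 Vanna 实例,`get_training_data()` 返回的列名以实际使用的 Vanna 版本为准):
+
+```python
+def verify_training_load(vn) -> dict:
+    """从向量数据库取回训练数据,按 training_data_type 统计各类数量"""
+    df = vn.get_training_data()   # 通常返回包含 training_data_type 列的 DataFrame
+    if df is None or len(df) == 0:
+        return {}
+    return df["training_data_type"].value_counts().to_dict()
+```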
+
+## 最佳实践
+
+### 文件组织
+```
+data_pipeline/training_data/
+├── *.ddl              # DDL文件
+├── *_detail.md        # MD文档文件
+├── qs_*_pair.json     # 问答对文件
+├── filename_mapping.txt  # 文件映射
+└── logs/              # 日志目录(如果需要)
+```
+
+### 命名规范
+- DDL文件: `table_name.ddl` 或 `schema__table_name.ddl`
+- MD文件: `table_name_detail.md` 或 `schema__table_name_detail.md`  
+- JSON问答对: `qs_dbname_timestamp_pair.json`
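+
+这些命名规范可以用下面的正则粗略表达(仅作说明,文件名只是演示用例,实际加载时可能只按扩展名与后缀判断):
+
+```python
+import re
+
+DDL_PATTERN  = re.compile(r"^(?:\w+__)?\w+\.ddl$")
+MD_PATTERN   = re.compile(r"^(?:\w+__)?\w+_detail\.md$")
+PAIR_PATTERN = re.compile(r"^qs_\w+_\d{8}_\d{6}_pair\.json$")
+
+assert DDL_PATTERN.match("bss_company.ddl")
+assert MD_PATTERN.match("bss_service_area_detail.md")
+assert PAIR_PATTERN.match("qs_highway_db_20250101_120000_pair.json")
+```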
+
+### 性能优化
+- 批处理配置现已移至 `app_config.py` 中的训练配置区域
+- 单线程处理确保稳定性
+- 自动识别文件格式,无需手动分类
+
+这个训练数据管理系统为完整的数据管道提供了最后一环,确保生成的训练数据能够有效地加载到向量数据库中供AI模型使用。

+ 0 - 135
schema_tools/test_schema_tools.py

@@ -1,135 +0,0 @@
-"""
-测试Schema Tools模块
-"""
-import asyncio
-import os
-import sys
-from pathlib import Path
-
-# 添加项目根目录到Python路径
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-
-async def test_basic_functionality():
-    """测试基本功能"""
-    print("===== 测试 Schema Tools =====")
-    
-    # 1. 测试配置
-    from schema_tools.config import SCHEMA_TOOLS_CONFIG, validate_config
-    print("\n1. 测试配置验证...")
-    try:
-        validate_config()
-        print("✅ 配置验证通过")
-    except Exception as e:
-        print(f"❌ 配置验证失败: {e}")
-        return
-    
-    # 2. 测试工具注册
-    from schema_tools.tools import ToolRegistry
-    print("\n2. 已注册的工具:")
-    tools = ToolRegistry.list_tools()
-    for tool in tools:
-        print(f"  - {tool}")
-    
-    # 3. 创建测试表清单文件
-    test_tables_file = "test_tables.txt"
-    with open(test_tables_file, 'w', encoding='utf-8') as f:
-        f.write("# 测试表清单\n")
-        f.write("public.users\n")
-        f.write("public.orders\n")
-        f.write("hr.employees\n")
-    print(f"\n3. 创建测试表清单文件: {test_tables_file}")
-    
-    # 4. 测试权限检查(仅模拟)
-    print("\n4. 测试数据库权限检查...")
-    
-    # 这里需要真实的数据库连接字符串
-    # 从环境变量或app_config获取
-    try:
-        import app_config
-        if hasattr(app_config, 'PGVECTOR_CONFIG'):
-            pg_config = app_config.PGVECTOR_CONFIG
-            db_connection = f"postgresql://{pg_config['user']}:{pg_config['password']}@{pg_config['host']}:{pg_config['port']}/{pg_config['dbname']}"
-            print(f"使用PgVector数据库配置")
-        else:
-            print("⚠️ 未找到数据库配置,跳过权限测试")
-            db_connection = None
-    except:
-        print("⚠️ 无法导入app_config,跳过权限测试")
-        db_connection = None
-    
-    if db_connection:
-        from schema_tools.training_data_agent import SchemaTrainingDataAgent
-        
-        try:
-            agent = SchemaTrainingDataAgent(
-                db_connection=db_connection,
-                table_list_file=test_tables_file,
-                business_context="测试业务系统"
-            )
-            
-            permissions = await agent.check_database_permissions()
-            print(f"数据库权限: {permissions}")
-        except Exception as e:
-            print(f"❌ 权限检查失败: {e}")
-    
-    # 清理测试文件
-    if os.path.exists(test_tables_file):
-        os.remove(test_tables_file)
-    
-    print("\n===== 测试完成 =====")
-
-async def test_table_parser():
-    """测试表清单解析器"""
-    print("\n===== 测试表清单解析器 =====")
-    
-    from schema_tools.utils.table_parser import TableListParser
-    
-    parser = TableListParser()
-    
-    # 测试字符串解析
-    test_cases = [
-        "public.users",
-        "hr.employees,sales.orders",
-        "users\norders\nproducts",
-        "schema.table_name"
-    ]
-    
-    for test_str in test_cases:
-        result = parser.parse_string(test_str)
-        print(f"输入: {repr(test_str)}")
-        print(f"结果: {result}")
-        print()
-
-async def test_system_filter():
-    """测试系统表过滤器"""
-    print("\n===== 测试系统表过滤器 =====")
-    
-    from schema_tools.utils.system_filter import SystemTableFilter
-    
-    filter = SystemTableFilter()
-    
-    test_tables = [
-        "pg_class",
-        "information_schema.tables",
-        "public.users",
-        "hr.employees",
-        "pg_temp_1.temp_table",
-        "my_table"
-    ]
-    
-    for table in test_tables:
-        if '.' in table:
-            schema, name = table.split('.', 1)
-        else:
-            schema, name = 'public', table
-        
-        is_system = filter.is_system_table(schema, name)
-        print(f"{table}: {'系统表' if is_system else '用户表'}")
-
-if __name__ == "__main__":
-    print("Schema Tools 测试脚本\n")
-    
-    # 运行测试
-    asyncio.run(test_basic_functionality())
-    asyncio.run(test_table_parser())
-    asyncio.run(test_system_filter())

+ 0 - 313
schema_tools/workflow_example.py

@@ -1,313 +0,0 @@
-"""
-Schema工作流编排器使用示例
-演示如何使用SchemaWorkflowOrchestrator执行完整的工作流程
-"""
-
-import asyncio
-import sys
-import os
-from pathlib import Path
-
-# 添加项目根目录到Python路径
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from schema_tools.schema_workflow_orchestrator import SchemaWorkflowOrchestrator
-from schema_tools.utils.logger import setup_logging
-
-
-async def example_complete_workflow():
-    """完整工作流程示例"""
-    print("=" * 60)
-    print("完整工作流程示例")
-    print("=" * 60)
-    
-    # 设置日志
-    setup_logging(verbose=True)
-    
-    # 配置参数
-    db_connection = "postgresql://user:password@localhost:5432/test_db"
-    table_list_file = "schema_tools/tables.txt"
-    business_context = "高速公路服务区管理系统"
-    db_name = "highway_db"
-    output_dir = "./example_output"
-    
-    try:
-        # 创建工作流编排器
-        orchestrator = SchemaWorkflowOrchestrator(
-            db_connection=db_connection,
-            table_list_file=table_list_file,
-            business_context=business_context,
-            db_name=db_name,
-            output_dir=output_dir,
-            enable_sql_validation=True,
-            enable_llm_repair=True,
-            modify_original_file=True
-        )
-        
-        print(f"🚀 开始执行完整工作流程...")
-        print(f"📁 输出目录: {output_dir}")
-        print(f"🏢 业务背景: {business_context}")
-        print(f"💾 数据库: {db_name}")
-        
-        # 执行完整工作流程
-        report = await orchestrator.execute_complete_workflow()
-        
-        # 打印详细摘要
-        orchestrator.print_final_summary(report)
-        
-        # 分析结果
-        if report["success"]:
-            print(f"\n🎉 工作流程执行成功!")
-            
-            # 显示各步骤详情
-            results = report["processing_results"]
-            
-            if "ddl_md_generation" in results:
-                ddl_md = results["ddl_md_generation"]
-                print(f"📋 步骤1 - DDL/MD生成:")
-                print(f"   处理表数: {ddl_md.get('processed_successfully', 0)}")
-                print(f"   生成文件: {ddl_md.get('files_generated', 0)}")
-                print(f"   耗时: {ddl_md.get('duration', 0):.2f}秒")
-            
-            if "question_sql_generation" in results:
-                qs = results["question_sql_generation"]
-                print(f"🤖 步骤2 - Question-SQL生成:")
-                print(f"   生成主题: {qs.get('total_themes', 0)}")
-                print(f"   成功主题: {qs.get('successful_themes', 0)}")
-                print(f"   问答对数: {qs.get('total_questions', 0)}")
-                print(f"   耗时: {qs.get('duration', 0):.2f}秒")
-            
-            if "sql_validation" in results:
-                validation = results["sql_validation"]
-                print(f"🔍 步骤3 - SQL验证:")
-                print(f"   原始SQL数: {validation.get('original_sql_count', 0)}")
-                print(f"   有效SQL数: {validation.get('valid_sql_count', 0)}")
-                print(f"   成功率: {validation.get('success_rate', 0):.1%}")
-                print(f"   耗时: {validation.get('duration', 0):.2f}秒")
-            
-            outputs = report["final_outputs"]
-            print(f"\n📄 最终输出:")
-            print(f"   主要文件: {outputs['primary_output_file']}")
-            print(f"   问题总数: {outputs['final_question_count']}")
-            
-        else:
-            print(f"\n❌ 工作流程执行失败:")
-            error = report["error"]
-            print(f"   失败步骤: {error['failed_step']}")
-            print(f"   错误信息: {error['message']}")
-            
-            # 显示已完成的步骤
-            completed = report["workflow_summary"]["completed_steps"]
-            if completed:
-                print(f"   已完成步骤: {', '.join(completed)}")
-        
-    except Exception as e:
-        print(f"\n❌ 示例执行失败: {e}")
-        import traceback
-        traceback.print_exc()
-
-
-async def example_skip_validation():
-    """跳过验证的工作流程示例"""
-    print("=" * 60)
-    print("跳过验证的工作流程示例")
-    print("=" * 60)
-    
-    # 设置日志
-    setup_logging(verbose=True)
-    
-    # 配置参数(跳过SQL验证)
-    db_connection = "postgresql://user:password@localhost:5432/test_db"
-    table_list_file = "schema_tools/tables.txt"
-    business_context = "电商系统"
-    db_name = "ecommerce_db"
-    output_dir = "./example_output_no_validation"
-    
-    try:
-        # 创建工作流编排器(跳过验证)
-        orchestrator = SchemaWorkflowOrchestrator(
-            db_connection=db_connection,
-            table_list_file=table_list_file,
-            business_context=business_context,
-            db_name=db_name,
-            output_dir=output_dir,
-            enable_sql_validation=False,  # 跳过SQL验证
-            enable_llm_repair=False,
-            modify_original_file=False
-        )
-        
-        print(f"🚀 开始执行工作流程(跳过验证)...")
-        
-        # 执行工作流程
-        report = await orchestrator.execute_complete_workflow()
-        
-        # 打印摘要
-        orchestrator.print_final_summary(report)
-        
-        print(f"\n📊 执行结果:")
-        print(f"   成功: {'是' if report['success'] else '否'}")
-        print(f"   完成步骤数: {len(report['workflow_summary']['completed_steps'])}")
-        print(f"   总耗时: {report['workflow_summary']['total_duration']}秒")
-        
-    except Exception as e:
-        print(f"\n❌ 示例执行失败: {e}")
-
-
-async def example_error_handling():
-    """错误处理示例"""
-    print("=" * 60)
-    print("错误处理示例")
-    print("=" * 60)
-    
-    # 设置日志
-    setup_logging(verbose=True)
-    
-    # 故意使用错误的配置来演示错误处理
-    db_connection = "postgresql://invalid:invalid@localhost:5432/invalid_db"
-    table_list_file = "nonexistent_tables.txt"
-    business_context = "测试系统"
-    db_name = "test_db"
-    output_dir = "./example_error_output"
-    
-    try:
-        # 创建工作流编排器
-        orchestrator = SchemaWorkflowOrchestrator(
-            db_connection=db_connection,
-            table_list_file=table_list_file,
-            business_context=business_context,
-            db_name=db_name,
-            output_dir=output_dir
-        )
-        
-        print(f"🚀 开始执行工作流程(故意触发错误)...")
-        
-        # 执行工作流程
-        report = await orchestrator.execute_complete_workflow()
-        
-        # 分析错误报告
-        if not report["success"]:
-            print(f"\n🔍 错误分析:")
-            error = report["error"]
-            print(f"   错误类型: {error['type']}")
-            print(f"   错误信息: {error['message']}")
-            print(f"   失败步骤: {error['failed_step']}")
-            
-            # 显示部分结果
-            partial = report.get("partial_results", {})
-            if partial:
-                print(f"   部分结果: {list(partial.keys())}")
-        
-    except Exception as e:
-        print(f"\n❌ 预期的错误: {e}")
-        print("这是演示错误处理的正常情况")
-
-
-def show_usage_examples():
-    """显示使用示例"""
-    print("=" * 60)
-    print("SchemaWorkflowOrchestrator 使用示例")
-    print("=" * 60)
-    
-    examples = [
-        {
-            "title": "1. 编程方式 - 完整工作流程",
-            "code": """
-import asyncio
-from schema_tools.schema_workflow_orchestrator import SchemaWorkflowOrchestrator
-
-async def run_complete_workflow():
-    orchestrator = SchemaWorkflowOrchestrator(
-        db_connection="postgresql://user:pass@localhost:5432/dbname",
-        table_list_file="tables.txt",
-        business_context="高速公路服务区管理系统",
-        db_name="highway_db",
-        output_dir="./output"
-    )
-    
-    # 一键执行完整流程
-    report = await orchestrator.execute_complete_workflow()
-    
-    if report["success"]:
-        print(f"✅ 编排完成!最终生成 {report['final_outputs']['final_question_count']} 个问答对")
-        print(f"📄 输出文件: {report['final_outputs']['primary_output_file']}")
-    else:
-        print(f"❌ 编排失败: {report['error']['message']}")
-
-asyncio.run(run_complete_workflow())
-            """
-        },
-        {
-            "title": "2. 命令行方式 - 完整工作流程",
-            "code": """
-python -m schema_tools.schema_workflow_orchestrator \\
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \\
-  --table-list tables.txt \\
-  --business-context "高速公路服务区管理系统" \\
-  --db-name highway_db \\
-  --output-dir ./output
-            """
-        },
-        {
-            "title": "3. 命令行方式 - 跳过验证",
-            "code": """
-python -m schema_tools.schema_workflow_orchestrator \\
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \\
-  --table-list tables.txt \\
-  --business-context "电商系统" \\
-  --db-name ecommerce_db \\
-  --skip-validation
-            """
-        },
-        {
-            "title": "4. 命令行方式 - 禁用LLM修复",
-            "code": """
-python -m schema_tools.schema_workflow_orchestrator \\
-  --db-connection "postgresql://user:pass@localhost:5432/dbname" \\
-  --table-list tables.txt \\
-  --business-context "管理系统" \\
-  --db-name management_db \\
-  --disable-llm-repair \\
-  --verbose
-            """
-        }
-    ]
-    
-    for example in examples:
-        print(f"\n{example['title']}:")
-        print(example['code'])
-
-
-async def main():
-    """主函数"""
-    print("Schema工作流编排器使用示例")
-    print("请选择要运行的示例:")
-    print("1. 完整工作流程示例")
-    print("2. 跳过验证的工作流程示例")
-    print("3. 错误处理示例")
-    print("4. 显示使用示例代码")
-    print("0. 退出")
-    
-    try:
-        choice = input("\n请输入选择 (0-4): ").strip()
-        
-        if choice == "1":
-            await example_complete_workflow()
-        elif choice == "2":
-            await example_skip_validation()
-        elif choice == "3":
-            await example_error_handling()
-        elif choice == "4":
-            show_usage_examples()
-        elif choice == "0":
-            print("退出示例程序")
-        else:
-            print("无效选择")
-    
-    except KeyboardInterrupt:
-        print("\n\n用户中断,退出程序")
-    except Exception as e:
-        print(f"\n示例执行失败: {e}")
-
-
-if __name__ == "__main__":
-    asyncio.run(main()) 

+ 0 - 10
test_file_modification.json

@@ -1,10 +0,0 @@
-[
-  {
-    "question": "查询所有服务区名称",
-    "sql": "SELECT service_area_name FROM bss_service_area WHERE delete_ts IS NULL;"
-  },
-  {
-    "question": "测试无效SQL",
-    "sql": "SELECT * FROM non_existent_table WHERE id = 1;"
-  }
-] 

+ 0 - 89
test_qa_apis.py

@@ -1,89 +0,0 @@
-#!/usr/bin/env python
-"""
-QA反馈API测试脚本
-用于验证所有API端点是否正常工作
-"""
-
-import requests
-import json
-
-# 配置
-BASE_URL = "http://localhost:8084"  # 根据你的端口配置
-API_PREFIX = "/api/v0/qa_feedback"
-
-def test_api(method, endpoint, data=None, expected_status=200):
-    """测试API端点"""
-    url = f"{BASE_URL}{API_PREFIX}{endpoint}"
-    
-    try:
-        if method == "GET":
-            response = requests.get(url)
-        elif method == "POST":
-            response = requests.post(url, json=data, headers={'Content-Type': 'application/json'})
-        elif method == "PUT":
-            response = requests.put(url, json=data, headers={'Content-Type': 'application/json'})
-        elif method == "DELETE":
-            response = requests.delete(url)
-        
-        print(f"\n{'='*60}")
-        print(f"测试: {method} {endpoint}")
-        print(f"URL: {url}")
-        print(f"状态码: {response.status_code}")
-        print(f"响应:")
-        try:
-            print(json.dumps(response.json(), indent=2, ensure_ascii=False))
-        except:
-            print(response.text)
-        
-        return response.status_code == expected_status
-        
-    except Exception as e:
-        print(f"❌ 测试失败: {e}")
-        return False
-
-def main():
-    """主测试函数"""
-    print("🚀 开始测试QA反馈模块API...")
-    
-    # 1. 测试统计API (GET)
-    print("\n📊 测试统计API")
-    test_api("GET", "/stats")
-    
-    # 2. 测试查询API (POST)
-    print("\n🔍 测试查询API")
-    test_api("POST", "/query", {
-        "page": 1,
-        "page_size": 10
-    })
-    
-    # 3. 测试添加反馈API (POST)
-    print("\n➕ 测试添加反馈API")
-    add_result = test_api("POST", "/add", {
-        "question": "测试问题",
-        "sql": "SELECT 1 as test",
-        "is_thumb_up": True,
-        "user_id": "test_user"
-    })
-    
-    # 4. 测试训练API (POST) - 重点测试
-    print("\n⭐ 测试训练API (重点)")
-    test_api("POST", "/add_to_training", {
-        "feedback_ids": [1, 2, 3]
-    }, expected_status=404)  # 可能没有这些ID,但API应该存在
-    
-    # 5. 测试更新API (PUT)
-    print("\n✏️ 测试更新API")
-    test_api("PUT", "/update/1", {
-        "question": "更新的问题"
-    }, expected_status=404)  # 可能没有ID=1的记录
-    
-    # 6. 测试删除API (DELETE)
-    print("\n🗑️ 测试删除API")
-    test_api("DELETE", "/delete/999", expected_status=404)  # 测试不存在的ID
-    
-    print(f"\n{'='*60}")
-    print("🎯 测试完成!")
-    print("📝 重点关注训练API是否返回正确的错误信息而不是'API not ported'")
-
-if __name__ == "__main__":
-    main() 

+ 0 - 180
test_training_data_apis.py

@@ -1,180 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-训练数据管理API测试脚本
-用于测试新增的训练数据管理接口
-"""
-
-import requests
-import json
-import sys
-
-# API基础URL
-BASE_URL = "http://localhost:8084"
-API_PREFIX = "/api/v0/training_data"
-
-def test_api(method: str, endpoint: str, data=None, expected_status=200):
-    """测试API的通用函数"""
-    url = f"{BASE_URL}{API_PREFIX}{endpoint}"
-    
-    try:
-        if method == "GET":
-            response = requests.get(url)
-        elif method == "POST":
-            response = requests.post(url, json=data, headers={'Content-Type': 'application/json'})
-        elif method == "DELETE":
-            response = requests.delete(url, json=data, headers={'Content-Type': 'application/json'})
-        else:
-            print(f"❌ 不支持的HTTP方法: {method}")
-            return False
-        
-        print(f"📤 {method} {endpoint}")
-        if data:
-            print(f"📋 请求数据: {json.dumps(data, ensure_ascii=False, indent=2)}")
-        
-        print(f"📥 状态码: {response.status_code}")
-        
-        if response.status_code == expected_status:
-            print("✅ 状态码正确")
-        else:
-            print(f"⚠️ 期望状态码: {expected_status}, 实际状态码: {response.status_code}")
-        
-        try:
-            response_json = response.json()
-            print(f"📄 响应: {json.dumps(response_json, ensure_ascii=False, indent=2)}")
-            return True
-        except:
-            print(f"📄 响应: {response.text}")
-            return False
-            
-    except requests.ConnectionError:
-        print(f"❌ 连接失败: 请确保服务器运行在 {BASE_URL}")
-        return False
-    except Exception as e:
-        print(f"❌ 请求失败: {str(e)}")
-        return False
-
-def main():
-    """主测试函数"""
-    print("🚀 开始测试训练数据管理API...")
-    print(f"🔗 服务器地址: {BASE_URL}")
-    print("="*60)
-    
-    # 1. 测试统计API (GET)
-    print("\n📊 测试统计API")
-    test_api("GET", "/stats")
-    
-    # 2. 测试查询API (POST) - 基础查询
-    print("\n🔍 测试查询API - 基础查询")
-    test_api("POST", "/query", {
-        "page": 1,
-        "page_size": 10
-    })
-    
-    # 3. 测试查询API (POST) - 带筛选
-    print("\n🔍 测试查询API - 带筛选")
-    test_api("POST", "/query", {
-        "page": 1,
-        "page_size": 5,
-        "training_data_type": "sql",
-        "search_keyword": "用户"
-    })
-    
-    # 4. 测试创建API (POST) - 单条SQL记录
-    print("\n➕ 测试创建API - 单条SQL记录")
-    test_api("POST", "/create", {
-        "data": {
-            "training_data_type": "sql",
-            "question": "查询所有测试用户",
-            "sql": "SELECT * FROM users WHERE status = 'test'"
-        }
-    })
-    
-    # 5. 测试创建API (POST) - 批量记录
-    print("\n➕ 测试创建API - 批量记录")
-    test_api("POST", "/create", {
-        "data": [
-            {
-                "training_data_type": "documentation",
-                "content": "这是一个测试文档,用于说明用户表的结构和用途。"
-            },
-            {
-                "training_data_type": "ddl",
-                "ddl": "CREATE TABLE test_table (id INT PRIMARY KEY, name VARCHAR(100));"
-            }
-        ]
-    })
-    
-    # 6. 测试创建API (POST) - SQL语法错误
-    print("\n➕ 测试创建API - SQL语法错误")
-    test_api("POST", "/create", {
-        "data": {
-            "training_data_type": "sql",
-            "question": "测试错误SQL",
-            "sql": "INVALID SQL SYNTAX"
-        }
-    }, expected_status=200)  # 批量操作中的错误仍返回200,但results中会有错误信息
-    
-    # 6.1. 测试创建API (POST) - 危险SQL操作检查
-    print("\n➕ 测试创建API - 危险SQL操作检查")
-    test_api("POST", "/create", {
-        "data": [
-            {
-                "training_data_type": "sql",
-                "question": "测试UPDATE操作",
-                "sql": "UPDATE users SET status = 'inactive' WHERE id = 1"
-            },
-            {
-                "training_data_type": "sql",
-                "question": "测试DELETE操作",
-                "sql": "DELETE FROM users WHERE id = 1"
-            },
-            {
-                "training_data_type": "sql",
-                "question": "测试DROP操作",
-                "sql": "DROP TABLE test_table"
-            }
-        ]
-    }, expected_status=200)  # 批量操作返回200,但会有错误信息
-    
-    # 7. 测试删除API (POST) - 不存在的ID
-    print("\n🗑️ 测试删除API - 不存在的ID")
-    test_api("POST", "/delete", {
-        "ids": ["non-existent-id-1", "non-existent-id-2"],
-        "confirm": True
-    })
-    
-    # 8. 测试删除API (POST) - 缺少确认
-    print("\n🗑️ 测试删除API - 缺少确认")
-    test_api("POST", "/delete", {
-        "ids": ["test-id"],
-        "confirm": False
-    }, expected_status=400)
-    
-    # 9. 测试参数验证 - 页码错误
-    print("\n⚠️ 测试参数验证 - 页码错误")
-    test_api("POST", "/query", {
-        "page": 0,
-        "page_size": 10
-    }, expected_status=400)
-    
-    # 10. 测试参数验证 - 页面大小错误
-    print("\n⚠️ 测试参数验证 - 页面大小错误")
-    test_api("POST", "/query", {
-        "page": 1,
-        "page_size": 150
-    }, expected_status=400)
-    
-    print(f"\n{'='*60}")
-    print("🎯 测试完成!")
-    print("\n📝 说明:")
-    print("- ✅ 表示API响应正常")
-    print("- ⚠️ 表示状态码不符合预期")
-    print("- ❌ 表示连接或请求失败")
-    print("\n💡 提示:")
-    print("- 首次运行时可能没有训练数据,这是正常的")
-    print("- 创建操作成功后,再次查询可以看到新增的数据")
-    print("- 删除不存在的ID会返回成功,但failed_count会显示失败数量")
-
-if __name__ == "__main__":
-    main() 

+ 0 - 14
training/__init__.py

@@ -1,14 +0,0 @@
-# training_tools 模块
-# 包含用于训练Vanna模型的工具和实用程序
-
-__version__ = '0.1.0'
-
-# 导出关键的训练函数
-from .vanna_trainer import (
-    train_ddl,
-    train_documentation,
-    train_sql_example,
-    train_question_sql_pair,
-    flush_training,
-    shutdown_trainer
-)