Przeglądaj źródła

已经为./data_pipeline模块的命令行模式添加了truncate/backup参数,现在准备为API添加这两个参数。

wangxq 1 miesiąc temu
rodzic
commit
1946fe5ac3
54 zmienionych plików z 2594 dodań i 80 usunięć
  1. 15 0
      data_pipeline/config.py
  2. 143 3
      data_pipeline/schema_workflow.py
  3. 70 4
      data_pipeline/trainer/run_training.py
  4. 358 0
      data_pipeline/trainer/vector_table_manager.py
  5. 31 0
      data_pipeline/training_data/manual_20250721_002320/bss_business_day_data.ddl
  6. 32 0
      data_pipeline/training_data/manual_20250721_002320/bss_business_day_data_detail.md
  7. 17 0
      data_pipeline/training_data/manual_20250721_002320/bss_car_day_count.ddl
  8. 18 0
      data_pipeline/training_data/manual_20250721_002320/bss_car_day_count_detail.md
  9. 15 0
      data_pipeline/training_data/manual_20250721_002320/bss_company.ddl
  10. 16 0
      data_pipeline/training_data/manual_20250721_002320/bss_company_detail.md
  11. 16 0
      data_pipeline/training_data/manual_20250721_002320/bss_section_route.ddl
  12. 7 0
      data_pipeline/training_data/manual_20250721_002320/bss_section_route_area_link.ddl
  13. 7 0
      data_pipeline/training_data/manual_20250721_002320/bss_section_route_area_link_detail.md
  14. 16 0
      data_pipeline/training_data/manual_20250721_002320/bss_section_route_detail.md
  15. 19 0
      data_pipeline/training_data/manual_20250721_002320/bss_service_area.ddl
  16. 21 0
      data_pipeline/training_data/manual_20250721_002320/bss_service_area_detail.md
  17. 18 0
      data_pipeline/training_data/manual_20250721_002320/bss_service_area_mapper.ddl
  18. 20 0
      data_pipeline/training_data/manual_20250721_002320/bss_service_area_mapper_detail.md
  19. 11 0
      data_pipeline/training_data/manual_20250721_002320/db_query_decision_prompt.txt
  20. 10 0
      data_pipeline/training_data/manual_20250721_002320/filename_mapping.txt
  21. 62 0
      data_pipeline/training_data/manual_20250721_002320/metadata.txt
  22. 20 0
      data_pipeline/training_data/manual_20250721_002320/metadata_detail.md
  23. 202 0
      data_pipeline/training_data/manual_20250721_002320/qs_highway_db_20250721_002747_pair.json
  24. 202 0
      data_pipeline/training_data/manual_20250721_002320/qs_highway_db_20250721_002747_pair.json.backup
  25. 5 0
      data_pipeline/training_data/manual_20250721_002320/vector_bak/langchain_pg_collection_20250721_002757.csv
  26. 1 0
      data_pipeline/training_data/manual_20250721_002320/vector_bak/langchain_pg_embedding_20250721_002757.csv
  27. 11 0
      data_pipeline/training_data/manual_20250721_002320/vector_bak/vector_backup_log.txt
  28. 31 0
      data_pipeline/training_data/manual_20250721_010214/bss_business_day_data.ddl
  29. 32 0
      data_pipeline/training_data/manual_20250721_010214/bss_business_day_data_detail.md
  30. 17 0
      data_pipeline/training_data/manual_20250721_010214/bss_car_day_count.ddl
  31. 18 0
      data_pipeline/training_data/manual_20250721_010214/bss_car_day_count_detail.md
  32. 15 0
      data_pipeline/training_data/manual_20250721_010214/bss_company.ddl
  33. 19 0
      data_pipeline/training_data/manual_20250721_010214/bss_company_detail.md
  34. 16 0
      data_pipeline/training_data/manual_20250721_010214/bss_section_route.ddl
  35. 7 0
      data_pipeline/training_data/manual_20250721_010214/bss_section_route_area_link.ddl
  36. 7 0
      data_pipeline/training_data/manual_20250721_010214/bss_section_route_area_link_detail.md
  37. 16 0
      data_pipeline/training_data/manual_20250721_010214/bss_section_route_detail.md
  38. 19 0
      data_pipeline/training_data/manual_20250721_010214/bss_service_area.ddl
  39. 21 0
      data_pipeline/training_data/manual_20250721_010214/bss_service_area_detail.md
  40. 18 0
      data_pipeline/training_data/manual_20250721_010214/bss_service_area_mapper.ddl
  41. 20 0
      data_pipeline/training_data/manual_20250721_010214/bss_service_area_mapper_detail.md
  42. 11 0
      data_pipeline/training_data/manual_20250721_010214/db_query_decision_prompt.txt
  43. 10 0
      data_pipeline/training_data/manual_20250721_010214/filename_mapping.txt
  44. 62 0
      data_pipeline/training_data/manual_20250721_010214/metadata.txt
  45. 20 0
      data_pipeline/training_data/manual_20250721_010214/metadata_detail.md
  46. 202 0
      data_pipeline/training_data/manual_20250721_010214/qs_highway_db_20250721_010658_pair.json
  47. 202 0
      data_pipeline/training_data/manual_20250721_010214/qs_highway_db_20250721_010658_pair.json.backup
  48. 5 0
      data_pipeline/training_data/manual_20250721_010214/vector_bak/langchain_pg_collection_20250721_010708.csv
  49. 1 0
      data_pipeline/training_data/manual_20250721_010214/vector_bak/langchain_pg_embedding_20250721_010708.csv
  50. 11 0
      data_pipeline/training_data/manual_20250721_010214/vector_bak/vector_backup_log.txt
  51. 124 15
      docs/data_pipeline_API调用指南.md
  52. 171 53
      docs/data_pipeline_脚本化调用指南.md
  53. 36 5
      docs/vector_table_management_design.md
  54. 120 0
      test_vector_backup_only.py

+ 15 - 0
data_pipeline/config.py

@@ -133,6 +133,21 @@ SCHEMA_TOOLS_CONFIG = {
         "max_lines": 1000,                   # 最大行数限制
         "encoding": "utf-8",                 # 文件编码
         "allow_overwrite": True,             # 是否允许覆盖已存在的文件
+    },
+    
+    # Vector表管理配置
+    "vector_table_management": {
+        "backup_enabled": True,
+        "backup_directory": "vector_bak",
+        "supported_tables": [
+            "langchain_pg_collection",
+            "langchain_pg_embedding"
+        ],
+        "truncate_tables": [
+            "langchain_pg_embedding"  # 只清空embedding表
+        ],
+        "timestamp_format": "%Y%m%d_%H%M%S",
+        "backup_temp_suffix": ".tmp"
     }
 }
 

+ 143 - 3
data_pipeline/schema_workflow.py

@@ -30,7 +30,9 @@ class SchemaWorkflowOrchestrator:
                  enable_sql_validation: bool = True,
                  enable_llm_repair: bool = True,
                  modify_original_file: bool = True,
-                 enable_training_data_load: bool = True):
+                 enable_training_data_load: bool = True,
+                 backup_vector_tables: bool = False,
+                 truncate_vector_tables: bool = False):
         """
         初始化Schema工作流编排器
         
@@ -44,6 +46,8 @@ class SchemaWorkflowOrchestrator:
             enable_llm_repair: 是否启用LLM修复功能
             modify_original_file: 是否修改原始JSON文件
             enable_training_data_load: 是否启用训练数据加载
+            backup_vector_tables: 是否备份vector表数据
+            truncate_vector_tables: 是否清空vector表数据(自动启用备份)
         """
         self.db_connection = db_connection
         self.table_list_file = table_list_file
@@ -54,6 +58,14 @@ class SchemaWorkflowOrchestrator:
         self.modify_original_file = modify_original_file
         self.enable_training_data_load = enable_training_data_load
         
+        # 处理vector表管理参数
+        # 参数验证和自动启用逻辑:如果启用truncate,自动启用backup
+        if truncate_vector_tables:
+            backup_vector_tables = True
+            
+        self.backup_vector_tables = backup_vector_tables
+        self.truncate_vector_tables = truncate_vector_tables
+        
         # 处理task_id
         if task_id is None:
             # 脚本模式:自动生成manual开头的task_id
@@ -142,6 +154,10 @@ class SchemaWorkflowOrchestrator:
             else:
                 self.logger.info("⏭️ 跳过SQL验证步骤")
             
+            # 新增:独立的Vector表管理(在训练加载之前或替代训练加载)
+            if self.backup_vector_tables or self.truncate_vector_tables:
+                await self._execute_vector_table_management()
+            
             # 步骤4: 训练数据加载(可选)
             if self.enable_training_data_load:
                 await self._execute_step_4_training_data_load()
@@ -354,6 +370,51 @@ class SchemaWorkflowOrchestrator:
             self.logger.error(f"❌ 步骤3失败: {str(e)}")
             raise
     
+    async def _execute_vector_table_management(self):
+        """独立执行Vector表管理(支持--skip-training-load场景)"""
+        if not (self.backup_vector_tables or self.truncate_vector_tables):
+            return
+            
+        self.logger.info("=" * 60)
+        self.logger.info("🗂️ 开始执行Vector表管理")
+        self.logger.info("=" * 60)
+        
+        vector_stats = None
+        try:
+            from data_pipeline.trainer.vector_table_manager import VectorTableManager
+            
+            vector_manager = VectorTableManager(
+                task_output_dir=str(self.output_dir),
+                task_id=self.task_id
+            )
+            
+            # 执行vector表管理
+            vector_stats = await vector_manager.execute_vector_management(
+                backup=self.backup_vector_tables,
+                truncate=self.truncate_vector_tables
+            )
+            
+            # 记录结果到工作流状态(无论成功失败都记录)
+            self.workflow_state["artifacts"]["vector_management"] = vector_stats
+            
+            if vector_stats.get("errors"):
+                self.logger.warning(f"⚠️ Vector表管理完成,但有错误: {'; '.join(vector_stats['errors'])}")
+            else:
+                self.logger.info("✅ Vector表管理完成")
+            
+        except Exception as e:
+            self.logger.error(f"❌ Vector表管理失败: {e}")
+            # 即使异常也要记录基本状态
+            if vector_stats is None:
+                vector_stats = {
+                    "backup_performed": self.backup_vector_tables,
+                    "truncate_performed": False,
+                    "errors": [f"执行异常: {str(e)}"],
+                    "duration": 0
+                }
+                self.workflow_state["artifacts"]["vector_management"] = vector_stats
+            raise
+    
     async def _execute_step_4_training_data_load(self):
         """步骤4: 训练数据加载"""
         self.workflow_state["current_step"] = "training_data_load"
@@ -377,7 +438,10 @@ class SchemaWorkflowOrchestrator:
             
             # 执行训练数据加载
             self.logger.info("🔄 开始处理训练文件...")
-            load_successful = process_training_files(training_data_dir, self.task_id)
+            # 禁用vector管理参数以避免重复执行
+            load_successful, _ = process_training_files(training_data_dir, self.task_id, 
+                                                       backup_vector_tables=False, 
+                                                       truncate_vector_tables=False)
             
             step_duration = time.time() - step_start_time
             
@@ -608,6 +672,31 @@ class SchemaWorkflowOrchestrator:
             else:
                 self.logger.info(f"  📚 训练数据加载: 启用")
             
+            # Vector表管理总结
+            vector_stats = report.get("workflow_state", {}).get("artifacts", {}).get("vector_management")
+            if vector_stats:
+                self.logger.info("📊 Vector表管理:")
+                if vector_stats.get("backup_performed", False):
+                    tables_count = len(vector_stats.get("tables_backed_up", {}))
+                    total_size = sum(
+                        self._parse_file_size(info.get("file_size", "0 B")) 
+                        for info in vector_stats.get("tables_backed_up", {}).values() 
+                        if info.get("success", False)
+                    )
+                    self.logger.info(f"   ✅ 备份执行: {tables_count}个表,总大小: {self._format_size(total_size)}")
+                else:
+                    self.logger.info("   - 备份执行: 未执行")
+                    
+                if vector_stats.get("truncate_performed", False):
+                    self.logger.info("   ✅ 清空执行: langchain_pg_embedding表已清空")
+                else:
+                    self.logger.info("   - 清空执行: 未执行")
+                    
+                duration = vector_stats.get("duration", 0)
+                self.logger.info(f"   ⏱️  执行耗时: {duration:.1f}秒")
+            else:
+                self.logger.info("📊 Vector表管理: 未执行(未启用相关参数)")
+            
         else:
             error = report["error"]
             summary = report["workflow_summary"]
@@ -620,6 +709,43 @@ class SchemaWorkflowOrchestrator:
         
         self.logger.info("=" * 80)
     
+    def _parse_file_size(self, size_str: str) -> float:
+        """解析文件大小字符串为字节数"""
+        import re
+        
+        # 匹配数字和单位的正则表达式
+        match = re.match(r'(\d+\.?\d*)\s*([KMGT]?B)', size_str.upper())
+        if not match:
+            return 0.0
+            
+        size, unit = match.groups()
+        size = float(size)
+        
+        unit_multipliers = {
+            'B': 1,
+            'KB': 1024,
+            'MB': 1024**2,
+            'GB': 1024**3,
+            'TB': 1024**4
+        }
+        
+        return size * unit_multipliers.get(unit, 1)
+    
+    def _format_size(self, size_bytes: float) -> str:
+        """格式化字节数为可读的大小字符串"""
+        if size_bytes == 0:
+            return "0 B"
+        
+        size_names = ["B", "KB", "MB", "GB"]
+        i = 0
+        size = float(size_bytes)
+        
+        while size >= 1024.0 and i < len(size_names) - 1:
+            size /= 1024.0
+            i += 1
+        
+        return f"{size:.1f} {size_names[i]}"
+    
     def _parse_db_connection(self, db_connection: str) -> Dict[str, str]:
         """
         解析PostgreSQL连接字符串
@@ -741,6 +867,18 @@ def setup_argument_parser():
         help="跳过训练数据加载步骤"
     )
     
+    parser.add_argument(
+        "--backup-vector-tables",
+        action="store_true",
+        help="备份vector表数据到任务目录"
+    )
+    
+    parser.add_argument(
+        "--truncate-vector-tables",
+        action="store_true",
+        help="清空vector表数据(自动启用备份)"
+    )
+    
     parser.add_argument(
         "--verbose", "-v",
         action="store_true",
@@ -790,7 +928,9 @@ async def main():
             enable_sql_validation=not args.skip_validation,
             enable_llm_repair=not args.disable_llm_repair,
             modify_original_file=not args.no_modify_file,
-            enable_training_data_load=not args.skip_training_load
+            enable_training_data_load=not args.skip_training_load,
+            backup_vector_tables=args.backup_vector_tables,
+            truncate_vector_tables=args.truncate_vector_tables
         )
         
         # 获取logger用于启动信息

+ 70 - 4
data_pipeline/trainer/run_training.py

@@ -333,12 +333,14 @@ def train_json_question_sql_pairs(json_file):
     except Exception as e:
         print(f" 错误:处理JSON问答训练 - {e}")
 
-def process_training_files(data_path, task_id=None):
+def process_training_files(data_path, task_id=None, backup_vector_tables=False, truncate_vector_tables=False):
     """处理指定路径下的所有训练文件
     
     Args:
         data_path (str): 训练数据目录路径
         task_id (str): 任务ID,用于日志记录
+        backup_vector_tables (bool): 是否备份vector表数据
+        truncate_vector_tables (bool): 是否清空vector表数据
     """
     # 初始化日志
     if task_id:
@@ -366,6 +368,28 @@ def process_training_files(data_path, task_id=None):
         else:
             print(message)
     
+    # Vector表管理(前置步骤)
+    vector_stats = None
+    if backup_vector_tables or truncate_vector_tables:
+        # 参数验证和自动启用逻辑
+        if truncate_vector_tables:
+            backup_vector_tables = True
+        
+        try:
+            import asyncio
+            from data_pipeline.trainer.vector_table_manager import VectorTableManager
+            
+            log_message("🗂️ 开始执行Vector表管理...")
+            
+            vector_manager = VectorTableManager(data_path, task_id)
+            vector_stats = asyncio.run(vector_manager.execute_vector_management(backup_vector_tables, truncate_vector_tables))
+            
+            log_message("✅ Vector表管理完成")
+            
+        except Exception as e:
+            log_message(f"❌ Vector表管理失败: {e}", "error")
+            return False
+    
     # 初始化统计计数器
     stats = {
         "ddl": 0,
@@ -434,9 +458,9 @@ def process_training_files(data_path, task_id=None):
     total_files = sum(stats.values())
     if total_files == 0:
         log_message(f"警告: 在目录 {data_path} 中未找到任何可训练的文件", "warning")
-        return False
+        return False, vector_stats
         
-    return True
+    return True, vector_stats
 
 def check_pgvector_connection():
     """检查 PgVector 数据库连接是否可用
@@ -537,6 +561,13 @@ def main():
     
     parser.add_argument('--data_path', type=str, default=default_path,
                         help='训练数据目录路径 (默认: 从data_pipeline.config.SCHEMA_TOOLS_CONFIG)')
+    
+    parser.add_argument('--backup-vector-tables', action='store_true',
+                        help='备份vector表数据')
+    
+    parser.add_argument('--truncate-vector-tables', action='store_true',
+                        help='清空vector表数据(自动启用备份)')
+    
     args = parser.parse_args()
     
     # 使用Path对象处理路径以确保跨平台兼容性
@@ -605,7 +636,9 @@ def main():
         print(f"\n===== 未知的向量数据库类型: {vector_db_type} =====\n")
     
     # 处理训练文件
-    process_successful = process_training_files(data_path)
+    process_successful, vector_stats = process_training_files(data_path, None, 
+                                                             args.backup_vector_tables, 
+                                                             args.truncate_vector_tables)
     
     if process_successful:
         # 训练结束,刷新和关闭批处理器
@@ -642,6 +675,39 @@ def main():
     else:
         print("\n===== 未能找到或处理任何训练文件,训练过程终止 =====")
     
+    # Vector表管理总结
+    print("\n===== Vector表管理统计 =====")
+    if vector_stats:
+        if vector_stats.get("backup_performed", False):
+            tables_info = vector_stats.get("tables_backed_up", {})
+            print(f"✓ 备份执行: 成功备份 {len(tables_info)} 个表")
+            for table_name, info in tables_info.items():
+                if info.get("success", False):
+                    print(f"  - {table_name}: {info['row_count']}行 -> {info['backup_file']} ({info['file_size']})")
+                else:
+                    print(f"  - {table_name}: 备份失败 - {info.get('error', '未知错误')}")
+        else:
+            print("- 备份执行: 未执行")
+            
+        if vector_stats.get("truncate_performed", False):
+            truncate_info = vector_stats.get("truncate_results", {})
+            print("✓ 清空执行: langchain_pg_embedding表已清空")
+            for table_name, info in truncate_info.items():
+                if info.get("success", False):
+                    print(f"  - {table_name}: {info['rows_before']}行 -> 0行")
+                else:
+                    print(f"  - {table_name}: 清空失败 - {info.get('error', '未知错误')}")
+        else:
+            print("- 清空执行: 未执行")
+            
+        print(f"✓ 总耗时: {vector_stats.get('duration', 0):.1f}秒")
+        
+        if vector_stats.get("errors"):
+            print(f"⚠ 错误: {'; '.join(vector_stats['errors'])}")
+    else:
+        print("- 未执行vector表管理操作")
+    print("===========================")
+    
     # 输出embedding模型信息
     print("\n===== Embedding模型信息 =====")
     try:

+ 358 - 0
data_pipeline/trainer/vector_table_manager.py

@@ -0,0 +1,358 @@
+import asyncio
+import time
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Any, List
+import psycopg2
+import logging
+
+
class VectorTableManager:
    """Manages the pgvector tables used by the training pipeline.

    Two operations are supported, driven by the
    SCHEMA_TOOLS_CONFIG["vector_table_management"] section:

    * backup   - stream each supported table into a timestamped CSV file under
                 <task_output_dir>/<backup_directory>
    * truncate - empty the configured tables (normally only
                 langchain_pg_embedding), and only after a successful backup
    """

    def __init__(self, task_output_dir: str, task_id: str = None):
        """
        Args:
            task_output_dir: Task output directory (backup files are written here)
            task_id: Task id (routes logs to the task-scoped logger)
        Note:
            Database connection settings are resolved automatically from
            data_pipeline.config.SCHEMA_TOOLS_CONFIG (see get_connection()).
        """
        self.task_output_dir = task_output_dir
        self.task_id = task_id

        # Pull this manager's configuration section from the pipeline config.
        from data_pipeline.config import SCHEMA_TOOLS_CONFIG
        self.config = SCHEMA_TOOLS_CONFIG.get("vector_table_management", {})

        # Task-scoped logger when a task id is known, plain logger otherwise.
        if task_id:
            from data_pipeline.dp_logging import get_logger
            self.logger = get_logger("VectorTableManager", task_id)
        else:
            import logging
            self.logger = logging.getLogger("VectorTableManager")

    async def execute_vector_management(self, backup: bool, truncate: bool) -> Dict[str, Any]:
        """Entry point: run the requested backup/truncate operations.

        Truncate implies backup (a destructive operation is never performed
        without a safety copy first), and truncation is cancelled when any
        table backup fails.

        Returns:
            Stats dict with keys: backup_performed, truncate_performed,
            tables_backed_up, truncate_results, errors, backup_directory,
            duration.

        Raises:
            Exception: re-raised after recording the failure in the stats.
        """
        start_time = time.time()

        # Truncating without a backup would be unrecoverable -> force backup on.
        if truncate and not backup:
            backup = True
            self.logger.info("🔄 启用truncate时自动启用backup")

        if not backup and not truncate:
            self.logger.info("⏭️ 未启用vector表管理,跳过操作")
            # Return the same shape as a normal result so callers can read any
            # key with .get() without special-casing the no-op path.
            return {
                "backup_performed": False,
                "truncate_performed": False,
                "tables_backed_up": {},
                "truncate_results": {},
                "errors": [],
                "backup_directory": None,
                "duration": 0
            }

        result = {
            "backup_performed": backup,
            "truncate_performed": truncate,
            "tables_backed_up": {},
            "truncate_results": {},
            "errors": [],
            "backup_directory": None,
            "duration": 0
        }

        try:
            # The backup directory lives inside the task output directory.
            backup_dir = Path(self.task_output_dir) / self.config.get("backup_directory", "vector_bak")
            if backup:
                backup_dir.mkdir(parents=True, exist_ok=True)
                result["backup_directory"] = str(backup_dir)
                self.logger.info(f"📁 备份目录: {backup_dir}")

                self.logger.info("🗂️ 开始备份vector表...")
                backup_results = await self.backup_vector_tables()
                result["tables_backed_up"] = backup_results

                # Any failed table cancels a pending truncate: never wipe data
                # that was not safely copied first.
                if any(not r.get("success", False) for r in backup_results.values()):
                    result["errors"].append("部分表备份失败")
                    if truncate:
                        self.logger.error("❌ 备份失败,取消清空操作")
                        result["truncate_performed"] = False
                        truncate = False

            if truncate:
                self.logger.info("🗑️ 开始清空vector表...")
                truncate_results = await self.truncate_vector_tables()
                result["truncate_results"] = truncate_results

                if any(not r.get("success", False) for r in truncate_results.values()):
                    result["errors"].append("部分表清空失败")

            # Persist a human-readable log next to the backup files.
            if backup and backup_dir.exists():
                self._write_backup_log(backup_dir, result)

            result["duration"] = time.time() - start_time

            if result["errors"]:
                self.logger.warning(f"⚠️ Vector表管理完成,但有错误: {'; '.join(result['errors'])}")
            else:
                self.logger.info(f"✅ Vector表管理完成,耗时: {result['duration']:.2f}秒")

            return result

        except Exception as e:
            result["duration"] = time.time() - start_time
            result["errors"].append(f"执行失败: {str(e)}")
            self.logger.error(f"❌ Vector表管理失败: {e}")
            raise

    async def backup_vector_tables(self) -> Dict[str, Any]:
        """Export every supported vector table to a timestamped CSV file.

        Each table is first written to a ".tmp" file and renamed to ".csv"
        only after the export completed, so a crash never leaves a
        half-written file that looks like a valid backup.

        Returns:
            Mapping of table name -> result dict ("success" plus either
            row/file stats or an "error" message).
        """
        import csv

        backup_dir = Path(self.task_output_dir) / self.config.get("backup_directory", "vector_bak")
        backup_dir.mkdir(parents=True, exist_ok=True)

        timestamp = datetime.now().strftime(self.config.get("timestamp_format", "%Y%m%d_%H%M%S"))

        results = {}
        supported_tables = self.config.get("supported_tables", ["langchain_pg_collection", "langchain_pg_embedding"])

        for table_name in supported_tables:
            # Bound up-front so the except-clause cleanup can never hit an
            # unbound name (which would mask the original error).
            temp_file = None
            try:
                temp_file = backup_dir / f"{table_name}_{timestamp}.csv.tmp"
                final_file = backup_dir / f"{table_name}_{timestamp}.csv"

                # Absolute path keeps behaviour stable regardless of CWD.
                temp_file_abs = temp_file.resolve()

                start_time = time.time()
                row_count = 0
                batch_size = 10000  # rows fetched per round trip

                conn = self.get_connection()
                try:
                    # Temporarily disable autocommit so the export runs inside
                    # a single transaction (consistent snapshot of the table).
                    old_autocommit = conn.autocommit
                    conn.autocommit = False
                    try:
                        with conn.cursor() as cursor:
                            cursor.itersize = batch_size
                            cursor.execute("SET client_encoding TO 'UTF8'")
                            # Table names come from trusted config, not user input.
                            cursor.execute(f"SELECT * FROM {table_name}")

                            colnames = [desc[0] for desc in cursor.description]

                            # Stream rows batch-by-batch so memory stays flat
                            # for arbitrarily large embedding tables.
                            with open(temp_file_abs, 'w', newline='', encoding='utf-8') as csvfile:
                                writer = csv.writer(csvfile)
                                writer.writerow(colnames)

                                while True:
                                    rows = cursor.fetchmany(batch_size)
                                    if not rows:
                                        break
                                    for row in rows:
                                        writer.writerow(row)
                                        row_count += 1
                                    # Progress note roughly every 50k rows.
                                    if row_count % (batch_size * 5) == 0:
                                        self.logger.info(f"📊 {table_name} 已导出 {row_count} 行数据...")

                        conn.commit()
                    finally:
                        conn.autocommit = old_autocommit
                finally:
                    # "with conn:" on a psycopg2 connection only scopes a
                    # transaction and does NOT close it; close explicitly to
                    # avoid leaking one connection per exported table.
                    conn.close()

                self.logger.info(f"📊 {table_name} 流式导出完成,总计 {row_count} 行")

                # Atomically publish the finished export (.tmp -> .csv).
                if temp_file.exists():
                    temp_file.rename(final_file)

                    file_stat = final_file.stat()
                    duration = time.time() - start_time

                    results[table_name] = {
                        "success": True,
                        "row_count": row_count,
                        "file_size": self._format_file_size(file_stat.st_size),
                        "backup_file": final_file.name,
                        "duration": duration
                    }

                    self.logger.info(f"✅ {table_name} 备份成功: {row_count}行 -> {final_file.name}")
                else:
                    raise Exception(f"临时文件 {temp_file} 未生成")

            except Exception as e:
                results[table_name] = {
                    "success": False,
                    "error": str(e)
                }
                self.logger.error(f"❌ {table_name} 备份失败: {e}")

                # Remove a possibly half-written temp file; guarded in case the
                # failure happened before temp_file was assigned.
                if temp_file is not None and temp_file.exists():
                    temp_file.unlink()

        return results

    async def truncate_vector_tables(self) -> Dict[str, Any]:
        """Truncate the configured vector tables (normally only langchain_pg_embedding).

        Returns:
            Mapping of table name -> result dict with before/after row counts,
            or an "error" message on failure.
        """
        results = {}

        # Only the tables explicitly configured for truncation are touched.
        truncate_tables = self.config.get("truncate_tables", ["langchain_pg_embedding"])

        for table_name in truncate_tables:
            try:
                count_sql = f"SELECT COUNT(*) FROM {table_name}"

                start_time = time.time()
                conn = self.get_connection()
                try:
                    with conn.cursor() as cursor:
                        # Row count before, for the stats report.
                        cursor.execute(count_sql)
                        rows_before = cursor.fetchone()[0]

                        cursor.execute(f"TRUNCATE TABLE {table_name}")

                        # Verify the table really is empty now.
                        cursor.execute(count_sql)
                        rows_after = cursor.fetchone()[0]
                finally:
                    # Close explicitly; the connection object is not managed by
                    # a with-block and would otherwise linger.
                    conn.close()

                duration = time.time() - start_time

                if rows_after == 0:
                    results[table_name] = {
                        "success": True,
                        "rows_before": rows_before,
                        "rows_after": rows_after,
                        "duration": duration
                    }
                    self.logger.info(f"✅ {table_name} 清空成功: {rows_before}行 -> 0行")
                else:
                    raise Exception(f"清空失败,表中仍有 {rows_after} 行数据")

            except Exception as e:
                results[table_name] = {
                    "success": False,
                    "error": str(e)
                }
                self.logger.error(f"❌ {table_name} 清空失败: {e}")

        return results

    def get_connection(self):
        """Open a new pgvector database connection (caller must close it).

        Connection settings come from SCHEMA_TOOLS_CONFIG["default_db_connection"]
        when present, otherwise from app_config.PGVECTOR_CONFIG.
        """
        import psycopg2

        try:
            # Preferred: a full connection string from the pipeline config.
            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
            connection_string = SCHEMA_TOOLS_CONFIG.get("default_db_connection")
            if connection_string:
                conn = psycopg2.connect(connection_string)
            else:
                # Fallback: discrete pgvector settings from app_config.
                import app_config
                pgvector_config = app_config.PGVECTOR_CONFIG
                conn = psycopg2.connect(
                    host=pgvector_config.get('host'),
                    port=pgvector_config.get('port'),
                    database=pgvector_config.get('dbname'),
                    user=pgvector_config.get('user'),
                    password=pgvector_config.get('password')
                )

            # Autocommit by default; backup temporarily disables it to get a
            # single-transaction snapshot.
            conn.autocommit = True
            return conn

        except Exception as e:
            self.logger.error(f"pgvector数据库连接失败: {e}")
            raise

    def _write_backup_log(self, backup_dir: Path, result: Dict[str, Any]):
        """Write a human-readable summary of the run next to the backup files.

        Best-effort: a failure to write the log is logged but never raised.
        """
        log_file = backup_dir / "vector_backup_log.txt"

        try:
            with open(log_file, 'w', encoding='utf-8') as f:
                f.write("=== Vector Table Backup Log ===\n")
                f.write(f"Backup Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"Task ID: {self.task_id or 'Unknown'}\n")
                f.write(f"Duration: {result.get('duration', 0):.2f}s\n\n")

                f.write("Tables Backup Status:\n")
                for table_name, info in result.get("tables_backed_up", {}).items():
                    if info.get("success", False):
                        f.write(f"✓ {table_name}: {info['row_count']} rows -> {info['backup_file']} ({info['file_size']})\n")
                    else:
                        f.write(f"✗ {table_name}: FAILED - {info.get('error', 'Unknown error')}\n")

                if result.get("truncate_performed", False):
                    f.write("\nTruncate Status:\n")
                    for table_name, info in result.get("truncate_results", {}).items():
                        if info.get("success", False):
                            f.write(f"✓ {table_name}: TRUNCATED ({info['rows_before']} rows removed)\n")
                        else:
                            f.write(f"✗ {table_name}: FAILED - {info.get('error', 'Unknown error')}\n")
                else:
                    f.write("\nTruncate Status:\n- Not performed\n")

                if result.get("errors"):
                    f.write(f"\nErrors: {'; '.join(result['errors'])}\n")

        except Exception as e:
            self.logger.warning(f"写入备份日志失败: {e}")

    def _format_file_size(self, size_bytes: int) -> str:
        """Render a byte count as a human-readable string (B/KB/MB/GB)."""
        if size_bytes == 0:
            return "0 B"

        size_names = ["B", "KB", "MB", "GB"]
        i = 0
        size = float(size_bytes)

        while size >= 1024.0 and i < len(size_names) - 1:
            size /= 1024.0
            i += 1

        return f"{size:.1f} {size_names[i]}"

+ 31 - 0
data_pipeline/training_data/manual_20250721_002320/bss_business_day_data.ddl

@@ -0,0 +1,31 @@
+-- 中文名: `bss_business_day_data` 表用于记录高速公路服务区每日经营数据
+-- 描述: `bss_business_day_data` 表用于记录高速公路服务区每日经营数据,支持业务分析与统计。
+create table public.bss_business_day_data (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  oper_date date              -- 统计日期,
+  service_no varchar(255)     -- 服务区编码,
+  service_name varchar(255)   -- 服务区名称,
+  branch_no varchar(255)      -- 档口编码,
+  branch_name varchar(255)    -- 档口名称,
+  wx numeric(19,4)            -- 微信支付金额,
+  wx_order integer            -- 微信订单数量,
+  zfb numeric(19,4)           -- 支付宝支付金额,
+  zf_order integer            -- 支付宝订单数量,
+  rmb numeric(19,4)           -- 现金支付金额,
+  rmb_order integer           -- 现金订单数量,
+  xs numeric(19,4)            -- 行吧支付金额,
+  xs_order integer            -- 行吧订单数量,
+  jd numeric(19,4)            -- 金豆支付金额,
+  jd_order integer            -- 金豆订单数量,
+  order_sum integer           -- 订单总数,
+  pay_sum numeric(19,4)       -- 总支付金额,
+  source_type integer         -- 数据来源类别,
+  primary key (id)
+);

+ 32 - 0
data_pipeline/training_data/manual_20250721_002320/bss_business_day_data_detail.md

@@ -0,0 +1,32 @@
+## bss_business_day_data(高速公路服务区每日经营数据表)
+bss_business_day_data 表用于记录高速公路服务区每日经营数据,支持业务分析与统计。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- created_by (varchar(50)) - 创建人 [示例: xingba]
+- update_ts (timestamp) - 更新时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- oper_date (date) - 统计日期 [示例: 2023-04-01]
+- service_no (varchar(255)) - 服务区编码 [示例: 1028, H0501]
+- service_name (varchar(255)) - 服务区名称 [示例: 宜春服务区, 庐山服务区]
+- branch_no (varchar(255)) - 档口编码 [示例: 1, H05016]
+- branch_name (varchar(255)) - 档口名称 [示例: 宜春南区, 庐山鲜徕客东区]
+- wx (numeric(19,4)) - 微信支付金额 [示例: 4790.0000, 2523.0000]
+- wx_order (integer) - 微信订单数量 [示例: 253, 133]
+- zfb (numeric(19,4)) - 支付宝支付金额 [示例: 229.0000, 0.0000]
+- zf_order (integer) - 支付宝订单数量 [示例: 15, 0]
+- rmb (numeric(19,4)) - 现金支付金额 [示例: 1058.5000, 124.0000]
+- rmb_order (integer) - 现金订单数量 [示例: 56, 12]
+- xs (numeric(19,4)) - 行吧支付金额 [示例: 0.0000, 40.0000]
+- xs_order (integer) - 行吧订单数量 [示例: 0, 1]
+- jd (numeric(19,4)) - 金豆支付金额 [示例: 0.0000]
+- jd_order (integer) - 金豆订单数量 [示例: 0]
+- order_sum (integer) - 订单总数 [示例: 324, 146]
+- pay_sum (numeric(19,4)) - 总支付金额 [示例: 6077.5000, 2687.0000]
+- source_type (integer) - 数据来源类别 [示例: 1, 0, 4]
+字段补充说明:
+- id 为主键
+- source_type 为枚举字段,包含取值:0、4、1、2、3

+ 17 - 0
data_pipeline/training_data/manual_20250721_002320/bss_car_day_count.ddl

@@ -0,0 +1,17 @@
+-- 中文名: 记录高速公路服务区每日车辆统计信息
+-- 描述: 记录高速公路服务区每日车辆统计信息,用于车流分析与运营决策。
+create table public.bss_car_day_count (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  customer_count bigint       -- 车辆数量,
+  car_type varchar(100)       -- 车辆类别,
+  count_date date             -- 统计日期,
+  service_area_id varchar(32) -- 服务区ID,
+  primary key (id)
+);

+ 18 - 0
data_pipeline/training_data/manual_20250721_002320/bss_car_day_count_detail.md

@@ -0,0 +1,18 @@
+## bss_car_day_count(记录高速公路服务区每日车辆统计信息)
+bss_car_day_count 表记录高速公路服务区每日车辆统计信息,用于车流分析与运营决策。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- created_by (varchar(50)) - 创建人
+- update_ts (timestamp) - 更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- customer_count (bigint) - 车辆数量 [示例: 1114, 295]
+- car_type (varchar(100)) - 车辆类别 [示例: 其他]
+- count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
+- service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
+字段补充说明:
+- id 为主键
+- car_type 为枚举字段,包含取值:其他、危化品、城际、过境

+ 15 - 0
data_pipeline/training_data/manual_20250721_002320/bss_company.ddl

@@ -0,0 +1,15 @@
+-- 中文名: 公司信息表
+-- 描述: 公司信息表,存储服务区合作企业基础信息。
+create table public.bss_company (
+  id varchar(32) not null     -- 公司唯一标识,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  company_name varchar(255)   -- 公司名称,
+  company_no varchar(255)     -- 公司编码,
+  primary key (id)
+);

+ 16 - 0
data_pipeline/training_data/manual_20250721_002320/bss_company_detail.md

@@ -0,0 +1,16 @@
+## bss_company(公司信息表)
+bss_company 表存储服务区合作企业基础信息。
+字段列表:
+- id (varchar(32)) - 公司唯一标识 [主键, 非空] [示例: 30675d85ba5044c31acfa243b9d16334, 47ed0bb37f5a85f3d9245e4854959b81]
+- version (integer) - 版本号 [非空] [示例: 1, 2]
+- create_ts (timestamp) - 创建时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
+- updated_by (varchar(50)) - 更新人 [示例: admin]
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- company_name (varchar(255)) - 公司名称 [示例: 上饶分公司, 宜春分公司]
+- company_no (varchar(255)) - 公司编码 [示例: H03, H02, H07]
+字段补充说明:
+- id 为主键
+- company_no 为枚举字段,包含取值:H01、H02、H03、H04、H05、H06、H07、H08、Q01

+ 16 - 0
data_pipeline/training_data/manual_20250721_002320/bss_section_route.ddl

@@ -0,0 +1,16 @@
+-- 中文名: 路段路线信息表
+-- 描述: 路段路线信息表,用于存储高速公路服务区关联的路段与路线名称等基础信息。
+create table public.bss_section_route (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  section_name varchar(255)   -- 路段名称,
+  route_name varchar(255)     -- 路线名称,
+  code varchar(255)           -- 编号,
+  primary key (id)
+);

+ 7 - 0
data_pipeline/training_data/manual_20250721_002320/bss_section_route_area_link.ddl

@@ -0,0 +1,7 @@
+-- 中文名: 路线与服务区关联表
+-- 描述: 路线与服务区关联表,记录高速公路路线对应的服务区信息。
+create table public.bss_section_route_area_link (
+  section_route_id varchar(32) not null -- 路段路线ID,主键,
+  service_area_id varchar(32) not null -- 服务区ID,主键,
+  primary key (section_route_id, service_area_id)
+);

+ 7 - 0
data_pipeline/training_data/manual_20250721_002320/bss_section_route_area_link_detail.md

@@ -0,0 +1,7 @@
+## bss_section_route_area_link(路线与服务区关联表)
+bss_section_route_area_link 表路线与服务区关联表,记录高速公路路线对应的服务区信息。
+字段列表:
+- section_route_id (varchar(32)) - 路段路线ID [主键, 非空] [示例: v8elrsfs5f7lt7jl8a6p87smfzesn3rz, hxzi2iim238e3s1eajjt1enmh9o4h3wp]
+- service_area_id (varchar(32)) - 服务区ID [主键, 非空] [示例: 08e01d7402abd1d6a4d9fdd5df855ef8, 091662311d2c737029445442ff198c4c]
+字段补充说明:
+- 复合主键:section_route_id, service_area_id

+ 16 - 0
data_pipeline/training_data/manual_20250721_002320/bss_section_route_detail.md

@@ -0,0 +1,16 @@
+## bss_section_route(路段路线信息表)
+bss_section_route 表路段路线信息表,用于存储高速公路服务区关联的路段与路线名称等基础信息。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 04ri3j67a806uw2c6o6dwdtz4knexczh, 0g5mnefxxtukql2cq6acul7phgskowy7]
+- version (integer) - 版本号 [非空] [示例: 1, 0]
+- create_ts (timestamp) - 创建时间 [示例: 2021-10-29 19:43:50, 2022-03-04 16:07:16]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- section_name (varchar(255)) - 路段名称 [示例: 昌栗, 昌宁, 昌九]
+- route_name (varchar(255)) - 路线名称 [示例: 昌栗, 昌韶, /]
+- code (varchar(255)) - 编号 [示例: SR0001, SR0002, SR0147]
+字段补充说明:
+- id 为主键

+ 19 - 0
data_pipeline/training_data/manual_20250721_002320/bss_service_area.ddl

@@ -0,0 +1,19 @@
+-- 中文名: `bss_service_area` 表用于存储高速公路服务区的基本信息
+-- 描述: `bss_service_area` 表用于存储高速公路服务区的基本信息,如名称、编码及操作记录,为核心业务提供数据支撑。
+create table public.bss_service_area (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  service_area_name varchar(255) -- 服务区名称,
+  service_area_no varchar(255) -- 服务区编码,
+  company_id varchar(32)      -- 所属公司ID,
+  service_position varchar(255) -- 服务区经纬度,
+  service_area_type varchar(50) -- 服务区类型,
+  service_state varchar(50)   -- 服务区状态,
+  primary key (id)
+);

+ 21 - 0
data_pipeline/training_data/manual_20250721_002320/bss_service_area_detail.md

@@ -0,0 +1,21 @@
+## bss_service_area(高速公路服务区基本信息表)
+bss_service_area 表用于存储高速公路服务区的基本信息,如名称、编码及操作记录,为核心业务提供数据支撑。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 0271d68ef93de9684b7ad8c7aae600b6, 08e01d7402abd1d6a4d9fdd5df855ef8]
+- version (integer) - 版本号 [非空] [示例: 3, 6]
+- create_ts (timestamp) - 创建时间 [示例: 2021-05-21 13:26:40.589000, 2021-05-20 19:51:46.314000]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2021-07-10 15:41:28.795000, 2021-07-11 09:33:08.455000]
+- updated_by (varchar(50)) - 更新人 [示例: admin]
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人 [示例: ]
+- service_area_name (varchar(255)) - 服务区名称 [示例: 白鹭湖停车区, 南昌南服务区]
+- service_area_no (varchar(255)) - 服务区编码 [示例: H0814, H0105]
+- company_id (varchar(32)) - 所属公司ID [示例: b1629f07c8d9ac81494fbc1de61f1ea5, ee9bf1180a2b45003f96e597a4b7f15a]
+- service_position (varchar(255)) - 服务区经纬度 [示例: 114.574721,26.825584, 115.910549,28.396355]
+- service_area_type (varchar(50)) - 服务区类型 [示例: 信息化服务区]
+- service_state (varchar(50)) - 服务区状态 [示例: 开放, 关闭]
+字段补充说明:
+- id 为主键
+- service_area_type 为枚举字段,包含取值:信息化服务区、智能化服务区
+- service_state 为枚举字段,包含取值:开放、关闭、上传数据

+ 18 - 0
data_pipeline/training_data/manual_20250721_002320/bss_service_area_mapper.ddl

@@ -0,0 +1,18 @@
+-- 中文名: 服务区基础信息表
+-- 描述: 服务区基础信息表,记录全国高速公路服务区的编码、名称及生命周期信息。
+create table public.bss_service_area_mapper (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  service_name varchar(255)   -- 服务区名称,
+  service_no varchar(255)     -- 服务区编码,
+  service_area_id varchar(32) -- 服务区ID,
+  source_system_type varchar(50) -- 数据来源类别名称,
+  source_type integer         -- 数据来源类别ID,
+  primary key (id)
+);

+ 20 - 0
data_pipeline/training_data/manual_20250721_002320/bss_service_area_mapper_detail.md

@@ -0,0 +1,20 @@
+## bss_service_area_mapper(服务区基础信息表)
+bss_service_area_mapper 表服务区基础信息表,记录全国高速公路服务区的编码、名称及生命周期信息。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00e1e893909211ed8ee6fa163eaf653f, 013867f5962211ed8ee6fa163eaf653f]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-01-10 10:54:03, 2023-01-17 12:47:29]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2023-01-10 10:54:07, 2023-01-17 12:47:32]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- service_name (varchar(255)) - 服务区名称 [示例: 信丰西服务区, 南康北服务区]
+- service_no (varchar(255)) - 服务区编码 [示例: 1067, 1062]
+- service_area_id (varchar(32)) - 服务区ID [示例: 97cd6cd516a551409a4d453a58f9e170, fdbdd042962011ed8ee6fa163eaf653f]
+- source_system_type (varchar(50)) - 数据来源类别名称 [示例: 驿美, 驿购]
+- source_type (integer) - 数据来源类别ID [示例: 3, 1]
+字段补充说明:
+- id 为主键
+- source_system_type 为枚举字段,包含取值:司乘管理、商业管理、驿购、驿美、手工录入
+- source_type 为枚举字段,包含取值:5、0、1、3、4

+ 11 - 0
data_pipeline/training_data/manual_20250721_002320/db_query_decision_prompt.txt

@@ -0,0 +1,11 @@
+=== 数据库业务范围 ===
+当前数据库存储的是高速公路服务区运营管理的相关数据,主要涉及经营流水、车流统计、公司信息、路段路线、服务区基础信息等,包含以下业务数据:
+核心业务实体:
+- 服务区:描述高速公路沿线提供停车、加油、餐饮等服务的地点,主要字段:service_area_name、service_area_no、service_state、service_position
+- 经营档口:描述服务区内的具体经营单元,主要字段:branch_no、branch_name
+- 车辆:描述经过服务区的车辆类型与数量,主要字段:car_type、customer_count
+- 公司:描述服务区所属的管理公司,主要字段:company_name、company_no
+- 支付方式:描述经营数据中的支付类型,主要字段:wx、zfb、rmb、xs、jd
+关键业务指标:
+- 日经营金额:记录每个档口或服务区每日的支付金额总和,可用于分析经营趋势,字段:pay_sum
+- 日订单数量:记录每个档口或服务区每日的订单数量,可用于分析消费频次,字段:order_sum

+ 10 - 0
data_pipeline/training_data/manual_20250721_002320/filename_mapping.txt

@@ -0,0 +1,10 @@
+# 文件名映射报告
+# 格式: 原始表名 -> 实际文件名
+
+public.bss_business_day_data -> bss_business_day_data_detail.md
+public.bss_car_day_count -> bss_car_day_count_detail.md
+public.bss_company -> bss_company_detail.md
+public.bss_section_route -> bss_section_route_detail.md
+public.bss_section_route_area_link -> bss_section_route_area_link_detail.md
+public.bss_service_area -> bss_service_area_detail.md
+public.bss_service_area_mapper -> bss_service_area_mapper_detail.md

+ 62 - 0
data_pipeline/training_data/manual_20250721_002320/metadata.txt

@@ -0,0 +1,62 @@
+-- Schema Tools生成的主题元数据
+-- 业务背景: 高速公路服务区管理系统
+-- 生成时间: 2025-07-21 00:27:47
+-- 数据库: highway_db
+
+-- 创建表(如果不存在)
+CREATE TABLE IF NOT EXISTS metadata (
+    id SERIAL PRIMARY KEY,    -- 主键
+    topic_name VARCHAR(100) NOT NULL,  -- 业务主题名称
+    description TEXT,                  -- 业务主体说明
+    related_tables TEXT[],			  -- 相关表名
+    biz_entities TEXT[],               -- 主要业务实体名称
+    biz_metrics TEXT[],                -- 主要业务指标名称
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP    -- 插入时间
+);
+
+-- 插入主题数据
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '日营收分析',
+  '基于 bss_business_day_data 表,分析各服务区每日营收、订单结构及支付方式分布,用于经营优化。',
+  'bss_business_day_data,bss_service_area',
+  '服务区,档口,支付方式,统计日期',
+  '总营收,订单数量,支付方式占比,日营收趋势'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '车流统计分析',
+  '基于 bss_car_day_count 表,分析各服务区每日车辆类型及数量变化,用于车流与运营关联分析。',
+  'bss_car_day_count,bss_service_area',
+  '服务区,车辆类型,统计日期',
+  '车流量趋势,车辆类型分布,车流量排名'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '公司业绩对比',
+  '结合 bss_company 和 bss_service_area,分析各公司下属服务区营收与车流表现,支持公司级管理决策。',
+  'bss_company,bss_service_area,bss_business_day_data,bss_car_day_count',
+  '公司,服务区,统计日期',
+  '平均营收,车流总量,营收排名,车流排名'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '路线关联分析',
+  '基于 bss_section_route 和 bss_section_route_area_link,分析路线与服务区的关联关系,支持路网运营优化。',
+  'bss_section_route,bss_section_route_area_link,bss_service_area',
+  '路段,路线,服务区',
+  '服务区数量,路段覆盖,路线关联度'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '服务区状态分析',
+  '基于 bss_service_area 表,分析服务区类型、状态分布及地理位置,支持服务区布局与运营策略制定。',
+  'bss_service_area,bss_company',
+  '服务区,类型,状态,公司',
+  '服务区数量,开放比例,区域分布,公司覆盖率'
+);
+

+ 20 - 0
data_pipeline/training_data/manual_20250721_002320/metadata_detail.md

@@ -0,0 +1,20 @@
+## metadata(存储分析主题元数据)
+
+`metadata` 主要描述了当前数据库包含了哪些数据内容,哪些分析主题,哪些指标等等。
+
+字段列表:
+
+- `id` (serial) - 主键ID [主键, 非空]
+- `topic_name` (varchar(100)) - 业务主题名称 [非空]
+- `description` (text) - 业务主题说明
+- `related_tables` (text[]) - 涉及的数据表 [示例: bss_business_day_data, bss_section_route]
+- `biz_entities` (text[]) - 主要业务实体名称 [示例: 服务区, 路段, 状态]
+- `biz_metrics` (text[]) - 主要业务指标名称 [示例: 车流量排名, 平均营收, 车流总量]
+- `created_at` (timestamp) - 插入时间 [默认值: `CURRENT_TIMESTAMP`]
+
+字段补充说明:
+
+- `id` 为主键,自增;
+- `related_tables` 用于建立主题与具体明细表的依赖关系;
+- `biz_entities` 表示主题关注的核心对象,例如服务区、车辆、公司;
+- `biz_metrics` 表示该主题关注的业务分析指标,例如营收对比、趋势变化、占比结构等。

+ 202 - 0
data_pipeline/training_data/manual_20250721_002320/qs_highway_db_20250721_002747_pair.json

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "查询2023年4月1日各服务区的总营收和订单数量,并按总营收降序排序。",
+    "sql": "SELECT service_name AS 服务区名称, pay_sum AS 总营收, order_sum AS 订单数量 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL ORDER BY pay_sum DESC;"
+  },
+  {
+    "question": "统计2023年4月各服务区每日总营收,按日期和服务区分组。",
+    "sql": "SELECT oper_date AS 统计日期, service_name AS 服务区名称, SUM(pay_sum) AS 日总营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY oper_date, service_name ORDER BY oper_date;"
+  },
+  {
+    "question": "查询2023年4月1日各支付方式的总金额及订单数量,并按支付方式分类。",
+    "sql": "SELECT '微信' AS 支付方式, SUM(wx) AS 支付金额, SUM(wx_order) AS 订单数量 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '支付宝', SUM(zfb), SUM(zf_order) FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '现金', SUM(rmb), SUM(rmb_order) FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "统计2023年4月各档口的总营收TOP 10,并显示对应的支付方式占比。",
+    "sql": "SELECT branch_name AS 档口名称, pay_sum AS 总营收, wx / pay_sum * 100 AS 微信占比, zfb / pay_sum * 100 AS 支付宝占比, rmb / pay_sum * 100 AS 现金占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL ORDER BY pay_sum DESC LIMIT 10;"
+  },
+  {
+    "question": "查询2023年4月各服务区现金支付金额占比,并按占比降序排序。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(rmb) / SUM(pay_sum) * 100 AS 现金支付占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 现金支付占比 DESC;"
+  },
+  {
+    "question": "统计2023年4月各周的总营收趋势,按周分组。",
+    "sql": "SELECT date_trunc('week', oper_date) AS 周, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY 周 ORDER BY 周;"
+  },
+  {
+    "question": "查询2023年4月每日订单数量趋势,并按日期升序排序。",
+    "sql": "SELECT oper_date AS 统计日期, SUM(order_sum) AS 订单数量 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY oper_date ORDER BY oper_date ASC;"
+  },
+  {
+    "question": "统计2023年4月各服务区的微信支付占比,并按占比降序排序。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx) / SUM(pay_sum) * 100 AS 微信支付占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 微信支付占比 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日各服务区现金支付金额超过1000元的服务区名称及金额。",
+    "sql": "SELECT service_name AS 服务区名称, rmb AS 现金支付金额 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND rmb > 1000 AND delete_ts IS NULL ORDER BY rmb DESC;"
+  },
+  {
+    "question": "统计2023年4月各服务区的平均每日营收,并按平均值降序排序。",
+    "sql": "SELECT service_name AS 服务区名称, AVG(pay_sum) AS 平均每日营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 平均每日营收 DESC;"
+  },
+  {
+    "question": "统计各服务区每日总车流量,并按日期和服务区排序。",
+    "sql": "SELECT count_date AS 统计日期, service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY count_date, service_area_id ORDER BY count_date, service_area_id;"
+  },
+  {
+    "question": "查询2023年4月各服务区每日车流趋势,仅显示城际车辆。",
+    "sql": "SELECT count_date AS 统计日期, service_area_id AS 服务区ID, customer_count AS 车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND car_type = '城际' AND count_date BETWEEN '2023-04-01' AND '2023-04-30' ORDER BY count_date;"
+  },
+  {
+    "question": "统计各车辆类型在所有服务区的总占比。",
+    "sql": "SELECT car_type AS 车辆类型, SUM(customer_count) AS 总车流量, ROUND(SUM(customer_count) * 100.0 / (SELECT SUM(customer_count) FROM bss_car_day_count WHERE delete_ts IS NULL), 2) AS 占比百分比 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type;"
+  },
+  {
+    "question": "找出2023年4月车流量最高的5个服务区。",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY service_area_id ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析某特定服务区(如ID为'17461166e7fa3ecda03534a5795ce985')2023年4月每日车流量变化趋势。",
+    "sql": "SELECT count_date AS 统计日期, customer_count AS 车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND service_area_id = '17461166e7fa3ecda03534a5795ce985' AND count_date BETWEEN '2023-04-01' AND '2023-04-30' ORDER BY count_date;"
+  },
+  {
+    "question": "比较不同车辆类型在2023年4月的平均日车流量。",
+    "sql": "SELECT car_type AS 车辆类型, AVG(customer_count) AS 平均日车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY car_type ORDER BY 平均日车流量 DESC;"
+  },
+  {
+    "question": "统计各服务区在2023年4月每日车流量的标准差,以评估车流波动性。",
+    "sql": "SELECT service_area_id AS 服务区ID, STDDEV(customer_count) AS 车流标准差 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY service_area_id ORDER BY 车流标准差 DESC;"
+  },
+  {
+    "question": "查询2023年4月每天总车流量的变化趋势。",
+    "sql": "SELECT count_date AS 统计日期, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY count_date ORDER BY count_date;"
+  },
+  {
+    "question": "找出2023年4月平均日车流量最高的5个服务区。",
+    "sql": "SELECT service_area_id AS 服务区ID, AVG(customer_count) AS 平均日车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY service_area_id ORDER BY 平均日车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计2023年4月各服务区每日车流量与经营数据的关联性(需联合bss_business_day_data表)。",
+    "sql": "SELECT a.count_date AS 统计日期, a.service_area_id AS 服务区ID, SUM(a.customer_count) AS 总车流量, SUM(b.pay_sum) AS 总支付金额 FROM bss_car_day_count a LEFT JOIN bss_business_day_data b ON a.service_area_id = b.service_no AND a.count_date = b.oper_date WHERE a.delete_ts IS NULL AND a.count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY a.count_date, a.service_area_id ORDER BY 总车流量 DESC;"
+  },
+  {
+    "question": "统计各公司在2023年4月1日的平均营收,并按平均营收降序排列。",
+    "sql": "SELECT bco.company_name AS 公司名称, AVG(bbd.pay_sum) AS 平均营收 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date = '2023-04-01' AND bbd.delete_ts IS NULL GROUP BY bco.company_name ORDER BY 平均营收 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日各公司的车流总量,并按车流总量降序排列。",
+    "sql": "SELECT bco.company_name AS 公司名称, SUM(bcc.customer_count) AS 车流总量 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id JOIN bss_company bco ON bsa.company_id = bco.id WHERE bcc.count_date = '2023-04-01' AND bcc.delete_ts IS NULL GROUP BY bco.company_name ORDER BY 车流总量 DESC;"
+  },
+  {
+    "question": "列出2023年4月1日营收排名前5的服务区及其所属公司。",
+    "sql": "SELECT bbd.service_name AS 服务区名称, bco.company_name AS 公司名称, SUM(bbd.pay_sum) AS 营收总额 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date = '2023-04-01' AND bbd.delete_ts IS NULL GROUP BY bbd.service_name, bco.company_name ORDER BY 营收总额 DESC LIMIT 5;"
+  },
+  {
+    "question": "列出2023年4月1日车流排名前5的服务区及其所属公司。",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, bco.company_name AS 公司名称, SUM(bcc.customer_count) AS 车流总量 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id JOIN bss_company bco ON bsa.company_id = bco.id WHERE bcc.count_date = '2023-04-01' AND bcc.delete_ts IS NULL GROUP BY bsa.service_area_name, bco.company_name ORDER BY 车流总量 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析2023年4月1日各公司平均营收与车流总量的关系,按公司分组。",
+    "sql": "SELECT bco.company_name AS 公司名称, AVG(bbd.pay_sum) AS 平均营收, SUM(bcc.customer_count) AS 车流总量 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id JOIN bss_car_day_count bcc ON bsa.id = bcc.service_area_id AND bbd.oper_date = bcc.count_date WHERE bbd.oper_date = '2023-04-01' AND bbd.delete_ts IS NULL AND bcc.delete_ts IS NULL GROUP BY bco.company_name;"
+  },
+  {
+    "question": "统计2023年4月各公司每日平均营收,并按日期升序、平均营收降序排列。",
+    "sql": "SELECT bbd.oper_date AS 统计日期, bco.company_name AS 公司名称, AVG(bbd.pay_sum) AS 平均营收 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND bbd.delete_ts IS NULL GROUP BY bbd.oper_date, bco.company_name ORDER BY 统计日期 ASC, 平均营收 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日各公司服务区的营收明细,并按营收降序排列。",
+    "sql": "SELECT bbd.service_name AS 服务区名称, bco.company_name AS 公司名称, bbd.pay_sum AS 营收 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date = '2023-04-01' AND bbd.delete_ts IS NULL ORDER BY 营收 DESC;"
+  },
+  {
+    "question": "统计2023年4月各公司总营收与车流总量,并计算营收占比。",
+    "sql": "WITH company_revenue AS (SELECT bco.company_name AS 公司名称, SUM(bbd.pay_sum) AS 总营收 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND bbd.delete_ts IS NULL GROUP BY bco.company_name), company_traffic AS (SELECT bco.company_name AS 公司名称, SUM(bcc.customer_count) AS 总车流 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id JOIN bss_company bco ON bsa.company_id = bco.id WHERE bcc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND bcc.delete_ts IS NULL GROUP BY bco.company_name) SELECT cr.公司名称, cr.总营收, ct.总车流, (cr.总营收 / SUM(cr.总营收) OVER ()) * 100 AS 营收占比百分比 FROM company_revenue cr JOIN company_traffic ct ON cr.公司名称 = ct.公司名称;"
+  },
+  {
+    "question": "列出2023年4月1日各公司下辖服务区的营收排名。",
+    "sql": "SELECT bco.company_name AS 公司名称, bbd.service_name AS 服务区名称, SUM(bbd.pay_sum) AS 营收总额, RANK() OVER (PARTITION BY bco.company_name ORDER BY SUM(bbd.pay_sum) DESC) AS 排名 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date = '2023-04-01' AND bbd.delete_ts IS NULL GROUP BY bco.company_name, bbd.service_name;"
+  },
+  {
+    "question": "列出2023年4月1日各公司下辖服务区的车流排名。",
+    "sql": "SELECT bco.company_name AS 公司名称, bsa.service_area_name AS 服务区名称, SUM(bcc.customer_count) AS 车流总量, RANK() OVER (PARTITION BY bco.company_name ORDER BY SUM(bcc.customer_count) DESC) AS 排名 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id JOIN bss_company bco ON bsa.company_id = bco.id WHERE bcc.count_date = '2023-04-01' AND bcc.delete_ts IS NULL GROUP BY bco.company_name, bsa.service_area_name;"
+  },
+  {
+    "question": "统计每条路线关联的服务区数量,并按数量降序排列。",
+    "sql": "SELECT bsr.route_name AS 路线名称, COUNT(bsral.service_area_id) AS 服务区数量 FROM bss_section_route bsr LEFT JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id AND bsral.service_area_id IS NOT NULL GROUP BY bsr.route_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "查询每个路段覆盖的服务区数量,并列出覆盖最少的5个路段。",
+    "sql": "SELECT bsr.section_name AS 路段名称, COUNT(bsral.service_area_id) AS 服务区数量 FROM bss_section_route bsr LEFT JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id AND bsral.service_area_id IS NOT NULL GROUP BY bsr.section_name ORDER BY 服务区数量 ASC LIMIT 5;"
+  },
+  {
+    "question": "找出与最多服务区关联的路线,并列出其关联的服务区名称。",
+    "sql": "SELECT bsr.route_name AS 路线名称, bsa.service_area_name AS 服务区名称 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id WHERE (bsr.id IN (SELECT section_route_id FROM bss_section_route_area_link GROUP BY section_route_id ORDER BY COUNT(service_area_id) DESC LIMIT 1)) ORDER BY 服务区名称;"
+  },
+  {
+    "question": "统计每条路线在2023年4月1日当天的总支付金额,并按路线名称排序。",
+    "sql": "SELECT bsr.route_name AS 路线名称, SUM(bdd.pay_sum) AS 总支付金额 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_business_day_data bdd ON bsa.service_area_no = bdd.service_no WHERE bdd.oper_date = '2023-04-01' GROUP BY bsr.route_name ORDER BY 路线名称;"
+  },
+  {
+    "question": "列出2022年3月2日当天车辆数量最多的前5个服务区及其关联路线。",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, bsr.route_name AS 路线名称 FROM bss_service_area bsa JOIN bss_section_route_area_link bsral ON bsa.id = bsral.service_area_id JOIN bss_section_route bsr ON bsral.section_route_id = bsr.id JOIN bss_car_day_count bcdc ON bsa.id = bcdc.service_area_id WHERE bcdc.count_date = '2022-03-02' ORDER BY bcdc.customer_count DESC LIMIT 5;"
+  },
+  {
+    "question": "统计每个服务区所属公司的路线覆盖情况,并列出覆盖最少的公司。",
+    "sql": "SELECT bc.company_name AS 公司名称, COUNT(DISTINCT bsr.route_name) AS 路线数量 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_company bc ON bsa.company_id = bc.id GROUP BY bc.company_name ORDER BY 路线数量 ASC LIMIT 1;"
+  },
+  {
+    "question": "查询2023年4月1日当天,每条路线的订单总数,并按订单总数降序排列。",
+    "sql": "SELECT bsr.route_name AS 路线名称, SUM(bdd.order_sum) AS 订单总数 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_business_day_data bdd ON bsa.service_area_no = bdd.service_no WHERE bdd.oper_date = '2023-04-01' GROUP BY bsr.route_name ORDER BY 订单总数 DESC;"
+  },
+  {
+    "question": "列出所有路线及其关联的服务区数量,仅包括服务区状态为开放的记录。",
+    "sql": "SELECT bsr.route_name AS 路线名称, COUNT(bsa.id) AS 服务区数量 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id WHERE bsa.service_state = '开放' GROUP BY bsr.route_name;"
+  },
+  {
+    "question": "统计每个服务区的微信支付金额占比,并按路线分组列出占比最高的服务区。",
+    "sql": "WITH wx_sum_per_route AS (SELECT bsr.route_name, SUM(bdd.wx) AS total_wx FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_business_day_data bdd ON bsa.service_area_no = bdd.service_no GROUP BY bsr.route_name), wx_per_area AS (SELECT bsr.route_name, bsa.service_area_name, SUM(bdd.wx) AS area_wx FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_business_day_data bdd ON bsa.service_area_no = bdd.service_no GROUP BY bsr.route_name, bsa.service_area_name) SELECT wpa.route_name AS 路线名称, wpa.service_area_name AS 服务区名称, (wpa.area_wx / wsp.total_wx * 100) AS 微信占比 FROM wx_per_area wpa JOIN wx_sum_per_route wsp ON wpa.route_name = wsp.route_name ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "列出2022年2月2日当天,车辆类型为过境的车流数量超过1000的路线及其服务区。",
+    "sql": "SELECT bsr.route_name AS 路线名称, bsa.service_area_name AS 服务区名称, bcdc.customer_count AS 车辆数量 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_car_day_count bcdc ON bsa.id = bcdc.service_area_id WHERE bcdc.count_date = '2022-02-02' AND bcdc.car_type = '过境' AND bcdc.customer_count > 1000;"
+  },
+  {
+    "question": "统计各类型服务区的数量及占比,仅考虑未删除的服务区",
+    "sql": "SELECT service_area_type AS 服务区类型, COUNT(*) AS 服务区数量, ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM bss_service_area WHERE delete_ts IS NULL), 2) AS 占比 FROM bss_service_area WHERE delete_ts IS NULL GROUP BY service_area_type;"
+  },
+  {
+    "question": "统计各公司管理的服务区数量及开放比例",
+    "sql": "SELECT c.company_name AS 公司名称, COUNT(sa.id) AS 服务区总数, SUM(CASE WHEN sa.service_state = '开放' THEN 1 ELSE 0 END) AS 开放数量, ROUND(SUM(CASE WHEN sa.service_state = '开放' THEN 1 ELSE 0 END) * 100.0 / COUNT(sa.id), 2) AS 开放比例 FROM bss_service_area sa JOIN bss_company c ON sa.company_id = c.id WHERE sa.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY c.company_name;"
+  },
+  {
+    "question": "查询最近一周新增的服务区列表及其所属公司",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, c.company_name AS 所属公司, sa.create_ts AS 创建时间 FROM bss_service_area sa LEFT JOIN bss_company c ON sa.company_id = c.id WHERE sa.create_ts >= NOW() - INTERVAL '7 days' AND sa.delete_ts IS NULL ORDER BY sa.create_ts DESC;"
+  },
+  {
+    "question": "统计不同状态的服务区数量分布",
+    "sql": "SELECT service_state AS 服务区状态, COUNT(*) AS 数量 FROM bss_service_area WHERE delete_ts IS NULL GROUP BY service_state;"
+  },
+  {
+    "question": "列出所有关闭的服务区及其所属公司名称",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, c.company_name AS 所属公司 FROM bss_service_area sa LEFT JOIN bss_company c ON sa.company_id = c.id WHERE sa.service_state = '关闭' AND sa.delete_ts IS NULL;"
+  },
+  {
+    "question": "按省份划分服务区数量(假设服务区编码前两位代表省份)",
+    "sql": "SELECT LEFT(service_area_no, 2) AS 省份编码, COUNT(*) AS 服务区数量 FROM bss_service_area WHERE delete_ts IS NULL GROUP BY LEFT(service_area_no, 2) ORDER BY 服务区数量 DESC LIMIT 10;"
+  },
+  {
+    "question": "找出最近一个月更新过的服务区及其最后更新时间",
+    "sql": "SELECT service_area_name AS 服务区名称, update_ts AS 最后更新时间 FROM bss_service_area WHERE update_ts >= NOW() - INTERVAL '1 month' AND delete_ts IS NULL ORDER BY update_ts DESC;"
+  },
+  {
+    "question": "列出所有服务区的经纬度信息及其所属公司名称",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, sa.service_position AS 经纬度, c.company_name AS 所属公司 FROM bss_service_area sa LEFT JOIN bss_company c ON sa.company_id = c.id WHERE sa.delete_ts IS NULL;"
+  },
+  {
+    "question": "按公司统计其管理的服务区数量,并按数量降序排列",
+    "sql": "SELECT c.company_name AS 公司名称, COUNT(sa.id) AS 服务区数量 FROM bss_service_area sa JOIN bss_company c ON sa.company_id = c.id WHERE sa.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY c.company_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "统计各类型服务区中关闭的数量及占比",
+    "sql": "SELECT service_area_type AS 服务区类型, COUNT(*) AS 总数量, SUM(CASE WHEN service_state = '关闭' THEN 1 ELSE 0 END) AS 关闭数量, ROUND(SUM(CASE WHEN service_state = '关闭' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS 关闭比例 FROM bss_service_area WHERE delete_ts IS NULL GROUP BY service_area_type;"
+  }
+]

+ 202 - 0
data_pipeline/training_data/manual_20250721_002320/qs_highway_db_20250721_002747_pair.json.backup

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "查询2023年4月1日各服务区的总营收和订单数量,并按总营收降序排序。",
+    "sql": "SELECT service_name AS 服务区名称, pay_sum AS 总营收, order_sum AS 订单数量 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL ORDER BY pay_sum DESC;"
+  },
+  {
+    "question": "统计2023年4月各服务区每日总营收,按日期和服务区分组。",
+    "sql": "SELECT oper_date AS 统计日期, service_name AS 服务区名称, SUM(pay_sum) AS 日总营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY oper_date, service_name ORDER BY oper_date;"
+  },
+  {
+    "question": "查询2023年4月1日各支付方式的总金额及订单数量,并按支付方式分类。",
+    "sql": "SELECT '微信' AS 支付方式, SUM(wx) AS 支付金额, SUM(wx_order) AS 订单数量 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '支付宝', SUM(zfb), SUM(zf_order) FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '现金', SUM(rmb), SUM(rmb_order) FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "统计2023年4月各档口的总营收TOP 10,并显示对应的支付方式占比。",
+    "sql": "SELECT branch_name AS 档口名称, pay_sum AS 总营收, wx / pay_sum * 100 AS 微信占比, zfb / pay_sum * 100 AS 支付宝占比, rmb / pay_sum * 100 AS 现金占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL ORDER BY pay_sum DESC LIMIT 10;"
+  },
+  {
+    "question": "查询2023年4月各服务区现金支付金额占比,并按占比降序排序。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(rmb) / SUM(pay_sum) * 100 AS 现金支付占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 现金支付占比 DESC;"
+  },
+  {
+    "question": "统计2023年4月各周的总营收趋势,按周分组。",
+    "sql": "SELECT date_trunc('week', oper_date) AS 周, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY 周 ORDER BY 周;"
+  },
+  {
+    "question": "查询2023年4月每日订单数量趋势,并按日期升序排序。",
+    "sql": "SELECT oper_date AS 统计日期, SUM(order_sum) AS 订单数量 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY oper_date ORDER BY oper_date ASC;"
+  },
+  {
+    "question": "统计2023年4月各服务区的微信支付占比,并按占比降序排序。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx) / SUM(pay_sum) * 100 AS 微信支付占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 微信支付占比 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日各服务区现金支付金额超过1000元的服务区名称及金额。",
+    "sql": "SELECT service_name AS 服务区名称, rmb AS 现金支付金额 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND rmb > 1000 AND delete_ts IS NULL ORDER BY rmb DESC;"
+  },
+  {
+    "question": "统计2023年4月各服务区的平均每日营收,并按平均值降序排序。",
+    "sql": "SELECT service_name AS 服务区名称, AVG(pay_sum) AS 平均每日营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 平均每日营收 DESC;"
+  },
+  {
+    "question": "统计各服务区每日总车流量,并按日期和服务区排序。",
+    "sql": "SELECT count_date AS 统计日期, service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY count_date, service_area_id ORDER BY count_date, service_area_id;"
+  },
+  {
+    "question": "查询2023年4月各服务区每日车流趋势,仅显示城际车辆。",
+    "sql": "SELECT count_date AS 统计日期, service_area_id AS 服务区ID, customer_count AS 车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND car_type = '城际' AND count_date BETWEEN '2023-04-01' AND '2023-04-30' ORDER BY count_date;"
+  },
+  {
+    "question": "统计各车辆类型在所有服务区的总占比。",
+    "sql": "SELECT car_type AS 车辆类型, SUM(customer_count) AS 总车流量, ROUND(SUM(customer_count) * 100.0 / (SELECT SUM(customer_count) FROM bss_car_day_count WHERE delete_ts IS NULL), 2) AS 占比百分比 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type;"
+  },
+  {
+    "question": "找出2023年4月车流量最高的5个服务区。",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY service_area_id ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析某特定服务区(如ID为'17461166e7fa3ecda03534a5795ce985')2023年4月每日车流量变化趋势。",
+    "sql": "SELECT count_date AS 统计日期, customer_count AS 车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND service_area_id = '17461166e7fa3ecda03534a5795ce985' AND count_date BETWEEN '2023-04-01' AND '2023-04-30' ORDER BY count_date;"
+  },
+  {
+    "question": "比较不同车辆类型在2023年4月的平均日车流量。",
+    "sql": "SELECT car_type AS 车辆类型, AVG(customer_count) AS 平均日车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY car_type ORDER BY 平均日车流量 DESC;"
+  },
+  {
+    "question": "统计各服务区在2023年4月每日车流量的标准差,以评估车流波动性。",
+    "sql": "SELECT service_area_id AS 服务区ID, STDDEV(customer_count) AS 车流标准差 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY service_area_id ORDER BY 车流标准差 DESC;"
+  },
+  {
+    "question": "查询2023年4月每天总车流量的变化趋势。",
+    "sql": "SELECT count_date AS 统计日期, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY count_date ORDER BY count_date;"
+  },
+  {
+    "question": "找出2023年4月平均日车流量最高的5个服务区。",
+    "sql": "SELECT service_area_id AS 服务区ID, AVG(customer_count) AS 平均日车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY service_area_id ORDER BY 平均日车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计2023年4月各服务区每日车流量与经营数据的关联性(需联合bss_business_day_data表)。",
+    "sql": "SELECT a.count_date AS 统计日期, a.service_area_id AS 服务区ID, SUM(a.customer_count) AS 总车流量, SUM(b.pay_sum) AS 总支付金额 FROM bss_car_day_count a LEFT JOIN bss_business_day_data b ON a.service_area_id = b.service_no AND a.count_date = b.oper_date WHERE a.delete_ts IS NULL AND a.count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY a.count_date, a.service_area_id ORDER BY 总车流量 DESC;"
+  },
+  {
+    "question": "统计各公司在2023年4月1日的平均营收,并按平均营收降序排列。",
+    "sql": "SELECT bco.company_name AS 公司名称, AVG(bbd.pay_sum) AS 平均营收 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date = '2023-04-01' AND bbd.delete_ts IS NULL GROUP BY bco.company_name ORDER BY 平均营收 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日各公司的车流总量,并按车流总量降序排列。",
+    "sql": "SELECT bco.company_name AS 公司名称, SUM(bcc.customer_count) AS 车流总量 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id JOIN bss_company bco ON bsa.company_id = bco.id WHERE bcc.count_date = '2023-04-01' AND bcc.delete_ts IS NULL GROUP BY bco.company_name ORDER BY 车流总量 DESC;"
+  },
+  {
+    "question": "列出2023年4月1日营收排名前5的服务区及其所属公司。",
+    "sql": "SELECT bbd.service_name AS 服务区名称, bco.company_name AS 公司名称, SUM(bbd.pay_sum) AS 营收总额 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date = '2023-04-01' AND bbd.delete_ts IS NULL GROUP BY bbd.service_name, bco.company_name ORDER BY 营收总额 DESC LIMIT 5;"
+  },
+  {
+    "question": "列出2023年4月1日车流排名前5的服务区及其所属公司。",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, bco.company_name AS 公司名称, SUM(bcc.customer_count) AS 车流总量 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id JOIN bss_company bco ON bsa.company_id = bco.id WHERE bcc.count_date = '2023-04-01' AND bcc.delete_ts IS NULL GROUP BY bsa.service_area_name, bco.company_name ORDER BY 车流总量 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析2023年4月1日各公司平均营收与车流总量的关系,按公司分组。",
+    "sql": "SELECT bco.company_name AS 公司名称, AVG(bbd.pay_sum) AS 平均营收, SUM(bcc.customer_count) AS 车流总量 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id JOIN bss_car_day_count bcc ON bsa.id = bcc.service_area_id AND bbd.oper_date = bcc.count_date WHERE bbd.oper_date = '2023-04-01' AND bbd.delete_ts IS NULL AND bcc.delete_ts IS NULL GROUP BY bco.company_name;"
+  },
+  {
+    "question": "统计2023年4月各公司每日平均营收,并按日期升序、平均营收降序排列。",
+    "sql": "SELECT bbd.oper_date AS 统计日期, bco.company_name AS 公司名称, AVG(bbd.pay_sum) AS 平均营收 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND bbd.delete_ts IS NULL GROUP BY bbd.oper_date, bco.company_name ORDER BY 统计日期 ASC, 平均营收 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日各公司服务区的营收明细,并按营收降序排列。",
+    "sql": "SELECT bbd.service_name AS 服务区名称, bco.company_name AS 公司名称, bbd.pay_sum AS 营收 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date = '2023-04-01' AND bbd.delete_ts IS NULL ORDER BY 营收 DESC;"
+  },
+  {
+    "question": "统计2023年4月各公司总营收与车流总量,并计算营收占比。",
+    "sql": "WITH company_revenue AS (SELECT bco.company_name AS 公司名称, SUM(bbd.pay_sum) AS 总营收 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND bbd.delete_ts IS NULL GROUP BY bco.company_name), company_traffic AS (SELECT bco.company_name AS 公司名称, SUM(bcc.customer_count) AS 总车流 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id JOIN bss_company bco ON bsa.company_id = bco.id WHERE bcc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND bcc.delete_ts IS NULL GROUP BY bco.company_name) SELECT cr.公司名称, cr.总营收, ct.总车流, (cr.总营收 / SUM(cr.总营收) OVER ()) * 100 AS 营收占比百分比 FROM company_revenue cr JOIN company_traffic ct ON cr.公司名称 = ct.公司名称;"
+  },
+  {
+    "question": "列出2023年4月1日各公司下辖服务区的营收排名。",
+    "sql": "SELECT bco.company_name AS 公司名称, bbd.service_name AS 服务区名称, SUM(bbd.pay_sum) AS 营收总额, RANK() OVER (PARTITION BY bco.company_name ORDER BY SUM(bbd.pay_sum) DESC) AS 排名 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_no = bsa.service_area_no JOIN bss_company bco ON bsa.company_id = bco.id WHERE bbd.oper_date = '2023-04-01' AND bbd.delete_ts IS NULL GROUP BY bco.company_name, bbd.service_name;"
+  },
+  {
+    "question": "列出2023年4月1日各公司下辖服务区的车流排名。",
+    "sql": "SELECT bco.company_name AS 公司名称, bsa.service_area_name AS 服务区名称, SUM(bcc.customer_count) AS 车流总量, RANK() OVER (PARTITION BY bco.company_name ORDER BY SUM(bcc.customer_count) DESC) AS 排名 FROM bss_car_day_count bcc JOIN bss_service_area bsa ON bcc.service_area_id = bsa.id JOIN bss_company bco ON bsa.company_id = bco.id WHERE bcc.count_date = '2023-04-01' AND bcc.delete_ts IS NULL GROUP BY bco.company_name, bsa.service_area_name;"
+  },
+  {
+    "question": "统计每条路线关联的服务区数量,并按数量降序排列。",
+    "sql": "SELECT bsr.route_name AS 路线名称, COUNT(bsral.service_area_id) AS 服务区数量 FROM bss_section_route bsr LEFT JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id AND bsral.service_area_id IS NOT NULL GROUP BY bsr.route_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "查询每个路段覆盖的服务区数量,并列出覆盖最少的5个路段。",
+    "sql": "SELECT bsr.section_name AS 路段名称, COUNT(bsral.service_area_id) AS 服务区数量 FROM bss_section_route bsr LEFT JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id AND bsral.service_area_id IS NOT NULL GROUP BY bsr.section_name ORDER BY 服务区数量 ASC LIMIT 5;"
+  },
+  {
+    "question": "找出与最多服务区关联的路线,并列出其关联的服务区名称。",
+    "sql": "SELECT bsr.route_name AS 路线名称, bsa.service_area_name AS 服务区名称 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id WHERE (bsr.id IN (SELECT section_route_id FROM bss_section_route_area_link GROUP BY section_route_id ORDER BY COUNT(service_area_id) DESC LIMIT 1)) ORDER BY 服务区名称;"
+  },
+  {
+    "question": "统计每条路线在2023年4月1日当天的总支付金额,并按路线名称排序。",
+    "sql": "SELECT bsr.route_name AS 路线名称, SUM(bdd.pay_sum) AS 总支付金额 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_business_day_data bdd ON bsa.service_area_no = bdd.service_no WHERE bdd.oper_date = '2023-04-01' GROUP BY bsr.route_name ORDER BY 路线名称;"
+  },
+  {
+    "question": "列出2022年3月2日当天车辆数量最多的前5个服务区及其关联路线。",
+    "sql": "SELECT bsa.service_area_name AS 服务区名称, bsr.route_name AS 路线名称 FROM bss_service_area bsa JOIN bss_section_route_area_link bsral ON bsa.id = bsral.service_area_id JOIN bss_section_route bsr ON bsral.section_route_id = bsr.id JOIN bss_car_day_count bcdc ON bsa.id = bcdc.service_area_id WHERE bcdc.count_date = '2022-03-02' ORDER BY bcdc.customer_count DESC LIMIT 5;"
+  },
+  {
+    "question": "统计每个服务区所属公司的路线覆盖情况,并列出覆盖最少的公司。",
+    "sql": "SELECT bc.company_name AS 公司名称, COUNT(DISTINCT bsr.route_name) AS 路线数量 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_company bc ON bsa.company_id = bc.id GROUP BY bc.company_name ORDER BY 路线数量 ASC LIMIT 1;"
+  },
+  {
+    "question": "查询2023年4月1日当天,每条路线的订单总数,并按订单总数降序排列。",
+    "sql": "SELECT bsr.route_name AS 路线名称, SUM(bdd.order_sum) AS 订单总数 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_business_day_data bdd ON bsa.service_area_no = bdd.service_no WHERE bdd.oper_date = '2023-04-01' GROUP BY bsr.route_name ORDER BY 订单总数 DESC;"
+  },
+  {
+    "question": "列出所有路线及其关联的服务区数量,仅包括服务区状态为开放的记录。",
+    "sql": "SELECT bsr.route_name AS 路线名称, COUNT(bsa.id) AS 服务区数量 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id WHERE bsa.service_state = '开放' GROUP BY bsr.route_name;"
+  },
+  {
+    "question": "统计每个服务区的微信支付金额占比,并按路线分组列出占比最高的服务区。",
+    "sql": "WITH wx_sum_per_route AS (SELECT bsr.route_name, SUM(bdd.wx) AS total_wx FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_business_day_data bdd ON bsa.service_area_no = bdd.service_no GROUP BY bsr.route_name), wx_per_area AS (SELECT bsr.route_name, bsa.service_area_name, SUM(bdd.wx) AS area_wx FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_business_day_data bdd ON bsa.service_area_no = bdd.service_no GROUP BY bsr.route_name, bsa.service_area_name) SELECT wpa.route_name AS 路线名称, wpa.service_area_name AS 服务区名称, (wpa.area_wx / wsp.total_wx * 100) AS 微信占比 FROM wx_per_area wpa JOIN wx_sum_per_route wsp ON wpa.route_name = wsp.route_name ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "列出2022年2月2日当天,车辆类型为过境的车流数量超过1000的路线及其服务区。",
+    "sql": "SELECT bsr.route_name AS 路线名称, bsa.service_area_name AS 服务区名称, bcdc.customer_count AS 车辆数量 FROM bss_section_route bsr JOIN bss_section_route_area_link bsral ON bsr.id = bsral.section_route_id JOIN bss_service_area bsa ON bsral.service_area_id = bsa.id JOIN bss_car_day_count bcdc ON bsa.id = bcdc.service_area_id WHERE bcdc.count_date = '2022-02-02' AND bcdc.car_type = '过境' AND bcdc.customer_count > 1000;"
+  },
+  {
+    "question": "统计各类型服务区的数量及占比,仅考虑未删除的服务区",
+    "sql": "SELECT service_area_type AS 服务区类型, COUNT(*) AS 服务区数量, ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM bss_service_area WHERE delete_ts IS NULL), 2) AS 占比 FROM bss_service_area WHERE delete_ts IS NULL GROUP BY service_area_type;"
+  },
+  {
+    "question": "统计各公司管理的服务区数量及开放比例",
+    "sql": "SELECT c.company_name AS 公司名称, COUNT(sa.id) AS 服务区总数, SUM(CASE WHEN sa.service_state = '开放' THEN 1 ELSE 0 END) AS 开放数量, ROUND(SUM(CASE WHEN sa.service_state = '开放' THEN 1 ELSE 0 END) * 100.0 / COUNT(sa.id), 2) AS 开放比例 FROM bss_service_area sa JOIN bss_company c ON sa.company_id = c.id WHERE sa.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY c.company_name;"
+  },
+  {
+    "question": "查询最近一周新增的服务区列表及其所属公司",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, c.company_name AS 所属公司, sa.create_ts AS 创建时间 FROM bss_service_area sa LEFT JOIN bss_company c ON sa.company_id = c.id WHERE sa.create_ts >= NOW() - INTERVAL '7 days' AND sa.delete_ts IS NULL ORDER BY sa.create_ts DESC;"
+  },
+  {
+    "question": "统计不同状态的服务区数量分布",
+    "sql": "SELECT service_state AS 服务区状态, COUNT(*) AS 数量 FROM bss_service_area WHERE delete_ts IS NULL GROUP BY service_state;"
+  },
+  {
+    "question": "列出所有关闭的服务区及其所属公司名称",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, c.company_name AS 所属公司 FROM bss_service_area sa LEFT JOIN bss_company c ON sa.company_id = c.id WHERE sa.service_state = '关闭' AND sa.delete_ts IS NULL;"
+  },
+  {
+    "question": "按省份划分服务区数量(假设服务区编码前两位代表省份)",
+    "sql": "SELECT LEFT(service_area_no, 2) AS 省份编码, COUNT(*) AS 服务区数量 FROM bss_service_area WHERE delete_ts IS NULL GROUP BY LEFT(service_area_no, 2) ORDER BY 服务区数量 DESC LIMIT 10;"
+  },
+  {
+    "question": "找出最近一个月更新过的服务区及其最后更新时间",
+    "sql": "SELECT service_area_name AS 服务区名称, update_ts AS 最后更新时间 FROM bss_service_area WHERE update_ts >= NOW() - INTERVAL '1 month' AND delete_ts IS NULL ORDER BY update_ts DESC;"
+  },
+  {
+    "question": "列出所有服务区的经纬度信息及其所属公司名称",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, sa.service_position AS 经纬度, c.company_name AS 所属公司 FROM bss_service_area sa LEFT JOIN bss_company c ON sa.company_id = c.id WHERE sa.delete_ts IS NULL;"
+  },
+  {
+    "question": "按公司统计其管理的服务区数量,并按数量降序排列",
+    "sql": "SELECT c.company_name AS 公司名称, COUNT(sa.id) AS 服务区数量 FROM bss_service_area sa JOIN bss_company c ON sa.company_id = c.id WHERE sa.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY c.company_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "统计各类型服务区中关闭的数量及占比",
+    "sql": "SELECT service_area_type AS 服务区类型, COUNT(*) AS 总数量, SUM(CASE WHEN service_state = '关闭' THEN 1 ELSE 0 END) AS 关闭数量, ROUND(SUM(CASE WHEN service_state = '关闭' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS 关闭比例 FROM bss_service_area WHERE delete_ts IS NULL GROUP BY service_area_type;"
+  }
+]

+ 5 - 0
data_pipeline/training_data/manual_20250721_002320/vector_bak/langchain_pg_collection_20250721_002757.csv

@@ -0,0 +1,5 @@
+uuid,name,cmetadata
+f4e11877-44e7-4741-b511-fa0e2e399395,sql,
+f0b714ca-44a9-433a-8768-390740bd1a18,ddl,
+98b97e3a-752d-4115-9667-7635687dbc1c,documentation,
+ab83ab0a-5649-4722-984d-b093227cdb02,error_sql,

Plik diff jest za duży
+ 1 - 0
data_pipeline/training_data/manual_20250721_002320/vector_bak/langchain_pg_embedding_20250721_002757.csv


+ 11 - 0
data_pipeline/training_data/manual_20250721_002320/vector_bak/vector_backup_log.txt

@@ -0,0 +1,11 @@
+=== Vector Table Backup Log ===
+Backup Time: 2025-07-21 00:27:58
+Task ID: manual_20250721_002320
+Duration: 0.00s
+
+Tables Backup Status:
+✓ langchain_pg_collection: 4 rows -> langchain_pg_collection_20250721_002757.csv (209.0 B)
+✓ langchain_pg_embedding: 814 rows -> langchain_pg_embedding_20250721_002757.csv (10.4 MB)
+
+Truncate Status:
+✓ langchain_pg_embedding: TRUNCATED (814 rows removed)

+ 31 - 0
data_pipeline/training_data/manual_20250721_010214/bss_business_day_data.ddl

@@ -0,0 +1,31 @@
+-- 中文名: `bss_business_day_data` 表用于记录高速公路服务区每日经营数据
+-- 描述: `bss_business_day_data` 表用于记录高速公路服务区每日经营数据,支持业务分析与统计。
+create table public.bss_business_day_data (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  oper_date date              -- 统计日期,
+  service_no varchar(255)     -- 服务区编码,
+  service_name varchar(255)   -- 服务区名称,
+  branch_no varchar(255)      -- 档口编码,
+  branch_name varchar(255)    -- 档口名称,
+  wx numeric(19,4)            -- 微信支付金额,
+  wx_order integer            -- 微信订单数量,
+  zfb numeric(19,4)           -- 支付宝支付金额,
+  zf_order integer            -- 支付宝订单数量,
+  rmb numeric(19,4)           -- 现金支付金额,
+  rmb_order integer           -- 现金订单数量,
+  xs numeric(19,4)            -- 行吧支付金额,
+  xs_order integer            -- 行吧订单数量,
+  jd numeric(19,4)            -- 金豆支付金额,
+  jd_order integer            -- 金豆订单数量,
+  order_sum integer           -- 订单总数,
+  pay_sum numeric(19,4)       -- 支付总金额,
+  source_type integer         -- 数据来源类别,
+  primary key (id)
+);

+ 32 - 0
data_pipeline/training_data/manual_20250721_010214/bss_business_day_data_detail.md

@@ -0,0 +1,32 @@
+## bss_business_day_data(`bss_business_day_data` 表用于记录高速公路服务区每日经营数据)
+bss_business_day_data 表`bss_business_day_data` 表用于记录高速公路服务区每日经营数据,支持业务分析与统计。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- created_by (varchar(50)) - 创建人 [示例: xingba]
+- update_ts (timestamp) - 更新时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- oper_date (date) - 统计日期 [示例: 2023-04-01]
+- service_no (varchar(255)) - 服务区编码 [示例: 1028, H0501]
+- service_name (varchar(255)) - 服务区名称 [示例: 宜春服务区, 庐山服务区]
+- branch_no (varchar(255)) - 档口编码 [示例: 1, H05016]
+- branch_name (varchar(255)) - 档口名称 [示例: 宜春南区, 庐山鲜徕客东区]
+- wx (numeric(19,4)) - 微信支付金额 [示例: 4790.0000, 2523.0000]
+- wx_order (integer) - 微信订单数量 [示例: 253, 133]
+- zfb (numeric(19,4)) - 支付宝支付金额 [示例: 229.0000, 0.0000]
+- zf_order (integer) - 支付宝订单数量 [示例: 15, 0]
+- rmb (numeric(19,4)) - 现金支付金额 [示例: 1058.5000, 124.0000]
+- rmb_order (integer) - 现金订单数量 [示例: 56, 12]
+- xs (numeric(19,4)) - 行吧支付金额 [示例: 0.0000, 40.0000]
+- xs_order (integer) - 行吧订单数量 [示例: 0, 1]
+- jd (numeric(19,4)) - 金豆支付金额 [示例: 0.0000]
+- jd_order (integer) - 金豆订单数量 [示例: 0]
+- order_sum (integer) - 订单总数 [示例: 324, 146]
+- pay_sum (numeric(19,4)) - 支付总金额 [示例: 6077.5000, 2687.0000]
+- source_type (integer) - 数据来源类别 [示例: 1, 0, 4]
+字段补充说明:
+- id 为主键
+- source_type 为枚举字段,包含取值:0、4、1、2、3

+ 17 - 0
data_pipeline/training_data/manual_20250721_010214/bss_car_day_count.ddl

@@ -0,0 +1,17 @@
+-- 中文名: `bss_car_day_count` 表用于按天统计进入服务区的车辆数量及类型
+-- 描述: `bss_car_day_count` 表用于按天统计进入服务区的车辆数量及类型,支持车流分析与运营决策。
+create table public.bss_car_day_count (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  customer_count bigint       -- 车辆数量,
+  car_type varchar(100)       -- 车辆类别,
+  count_date date             -- 统计日期,
+  service_area_id varchar(32) -- 服务区ID,
+  primary key (id)
+);

+ 18 - 0
data_pipeline/training_data/manual_20250721_010214/bss_car_day_count_detail.md

@@ -0,0 +1,18 @@
+## bss_car_day_count(`bss_car_day_count` 表用于按天统计进入服务区的车辆数量及类型)
+bss_car_day_count 表`bss_car_day_count` 表用于按天统计进入服务区的车辆数量及类型,支持车流分析与运营决策。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- created_by (varchar(50)) - 创建人
+- update_ts (timestamp) - 更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- customer_count (bigint) - 车辆数量 [示例: 1114, 295]
+- car_type (varchar(100)) - 车辆类别 [示例: 其他]
+- count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
+- service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
+字段补充说明:
+- id 为主键
+- car_type 为枚举字段,包含取值:其他、危化品、城际、过境

+ 15 - 0
data_pipeline/training_data/manual_20250721_010214/bss_company.ddl

@@ -0,0 +1,15 @@
+-- 中文名: `bss_company` 表用于存储高速公路服务区相关企业的基本信息
+-- 描述: `bss_company` 表用于存储高速公路服务区相关企业的基本信息,包括公司名称、编码及操作记录,支撑服务区运营管理。
+create table public.bss_company (
+  id varchar(32) not null     -- 公司唯一标识,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  company_name varchar(255)   -- 公司名称,
+  company_no varchar(255)     -- 公司编码,
+  primary key (id)
+);

+ 19 - 0
data_pipeline/training_data/manual_20250721_010214/bss_company_detail.md

@@ -0,0 +1,19 @@
+## bss_company(`bss_company` 表用于存储高速公路服务区相关企业的基本信息)
+bss_company 表`bss_company` 表用于存储高速公路服务区相关企业的基本信息,包括公司名称、编码及操作记录,支撑服务区运营管理。
+字段列表:
+- id (varchar(32)) - 公司唯一标识 [主键, 非空] [示例: 30675d85ba5044c31acfa243b9d16334, 47ed0bb37f5a85f3d9245e4854959b81]
+- version (integer) - 版本号 [非空] [示例: 1, 2]
+- create_ts (timestamp) - 创建时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
+- updated_by (varchar(50)) - 更新人 [示例: admin]
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- company_name (varchar(255)) - 公司名称 [示例: 上饶分公司, 宜春分公司, 景德镇分公司]
+- company_no (varchar(255)) - 公司编码 [示例: H03, H02, H07]
+字段补充说明:
+- id 为主键
+- created_by 为枚举字段,包含取值:admin
+- updated_by 为枚举字段,包含取值:admin
+- company_name 为枚举字段,包含取值:抚州分公司、赣州分公司、吉安分公司、景德镇分公司、九江分公司、南昌分公司、其他公司管辖、上饶分公司、宜春分公司
+- company_no 为枚举字段,包含取值:H01、H02、H03、H04、H05、H06、H07、H08、Q01

+ 16 - 0
data_pipeline/training_data/manual_20250721_010214/bss_section_route.ddl

@@ -0,0 +1,16 @@
+-- 中文名: 路段与路线信息表
+-- 描述: 路段与路线信息表,用于管理高速公路服务区所属路段及路线名称。
+create table public.bss_section_route (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  section_name varchar(255)   -- 路段名称,
+  route_name varchar(255)     -- 路线名称,
+  code varchar(255)           -- 编号,
+  primary key (id)
+);

+ 7 - 0
data_pipeline/training_data/manual_20250721_010214/bss_section_route_area_link.ddl

@@ -0,0 +1,7 @@
+-- 中文名: 路段路线与服务区关联表
+-- 描述: 路段路线与服务区关联表,记录路线与服务区之间的绑定关系。
+create table public.bss_section_route_area_link (
+  section_route_id varchar(32) not null -- 路段路线ID,主键,
+  service_area_id varchar(32) not null -- 服务区ID,主键,
+  primary key (section_route_id, service_area_id)
+);

+ 7 - 0
data_pipeline/training_data/manual_20250721_010214/bss_section_route_area_link_detail.md

@@ -0,0 +1,7 @@
+## bss_section_route_area_link(路段路线与服务区关联表)
+bss_section_route_area_link 表路段路线与服务区关联表,记录路线与服务区之间的绑定关系。
+字段列表:
+- section_route_id (varchar(32)) - 路段路线ID [主键, 非空] [示例: v8elrsfs5f7lt7jl8a6p87smfzesn3rz, hxzi2iim238e3s1eajjt1enmh9o4h3wp]
+- service_area_id (varchar(32)) - 服务区ID [主键, 非空] [示例: 08e01d7402abd1d6a4d9fdd5df855ef8, 091662311d2c737029445442ff198c4c]
+字段补充说明:
+- 复合主键:section_route_id, service_area_id

+ 16 - 0
data_pipeline/training_data/manual_20250721_010214/bss_section_route_detail.md

@@ -0,0 +1,16 @@
+## bss_section_route(路段与路线信息表)
+bss_section_route 表路段与路线信息表,用于管理高速公路服务区所属路段及路线名称。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 04ri3j67a806uw2c6o6dwdtz4knexczh, 0g5mnefxxtukql2cq6acul7phgskowy7]
+- version (integer) - 版本号 [非空] [示例: 1, 0]
+- create_ts (timestamp) - 创建时间 [示例: 2021-10-29 19:43:50, 2022-03-04 16:07:16]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- section_name (varchar(255)) - 路段名称 [示例: 昌栗, 昌宁, 昌九]
+- route_name (varchar(255)) - 路线名称 [示例: 昌栗, 昌韶, /]
+- code (varchar(255)) - 编号 [示例: SR0001, SR0002, SR0147]
+字段补充说明:
+- id 为主键

+ 19 - 0
data_pipeline/training_data/manual_20250721_010214/bss_service_area.ddl

@@ -0,0 +1,19 @@
+-- 中文名: `bss_service_area` 表用于存储高速公路服务区的基本信息
+-- 描述: `bss_service_area` 表用于存储高速公路服务区的基本信息,包括服务区名称、编码及操作记录,为核心业务提供数据支撑。
+create table public.bss_service_area (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  service_area_name varchar(255) -- 服务区名称,
+  service_area_no varchar(255) -- 服务区编码,
+  company_id varchar(32)      -- 所属公司ID,
+  service_position varchar(255) -- 服务区经纬度,
+  service_area_type varchar(50) -- 服务区类型,
+  service_state varchar(50)   -- 服务区状态,
+  primary key (id)
+);

+ 21 - 0
data_pipeline/training_data/manual_20250721_010214/bss_service_area_detail.md

@@ -0,0 +1,21 @@
+## bss_service_area(`bss_service_area` 表用于存储高速公路服务区的基本信息)
+bss_service_area 表`bss_service_area` 表用于存储高速公路服务区的基本信息,包括服务区名称、编码及操作记录,为核心业务提供数据支撑。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 0271d68ef93de9684b7ad8c7aae600b6, 08e01d7402abd1d6a4d9fdd5df855ef8]
+- version (integer) - 版本号 [非空] [示例: 3, 6]
+- create_ts (timestamp) - 创建时间 [示例: 2021-05-21 13:26:40.589000, 2021-05-20 19:51:46.314000]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2021-07-10 15:41:28.795000, 2021-07-11 09:33:08.455000]
+- updated_by (varchar(50)) - 更新人 [示例: admin]
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人 [示例: ]
+- service_area_name (varchar(255)) - 服务区名称 [示例: 白鹭湖停车区, 南昌南服务区]
+- service_area_no (varchar(255)) - 服务区编码 [示例: H0814, H0105]
+- company_id (varchar(32)) - 所属公司ID [示例: b1629f07c8d9ac81494fbc1de61f1ea5, ee9bf1180a2b45003f96e597a4b7f15a]
+- service_position (varchar(255)) - 服务区经纬度 [示例: 114.574721,26.825584, 115.910549,28.396355]
+- service_area_type (varchar(50)) - 服务区类型 [示例: 信息化服务区]
+- service_state (varchar(50)) - 服务区状态 [示例: 开放, 关闭]
+字段补充说明:
+- id 为主键
+- service_area_type 为枚举字段,包含取值:信息化服务区、智能化服务区
+- service_state 为枚举字段,包含取值:开放、关闭、上传数据

+ 18 - 0
data_pipeline/training_data/manual_20250721_010214/bss_service_area_mapper.ddl

@@ -0,0 +1,18 @@
+-- 中文名: 服务区基础信息映射表
+-- 描述: 服务区基础信息映射表,用于统一服务区名称与编码的关联关系及生命周期管理。
+create table public.bss_service_area_mapper (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  service_name varchar(255)   -- 服务区名称,
+  service_no varchar(255)     -- 服务区编码,
+  service_area_id varchar(32) -- 服务区ID,
+  source_system_type varchar(50) -- 数据来源类别名称,
+  source_type integer         -- 数据来源类别ID,
+  primary key (id)
+);

+ 20 - 0
data_pipeline/training_data/manual_20250721_010214/bss_service_area_mapper_detail.md

@@ -0,0 +1,20 @@
+## bss_service_area_mapper(服务区基础信息映射表)
+`bss_service_area_mapper` 表是服务区基础信息映射表,用于统一服务区名称与编码的关联关系及生命周期管理。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00e1e893909211ed8ee6fa163eaf653f, 013867f5962211ed8ee6fa163eaf653f]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-01-10 10:54:03, 2023-01-17 12:47:29]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2023-01-10 10:54:07, 2023-01-17 12:47:32]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- service_name (varchar(255)) - 服务区名称 [示例: 信丰西服务区, 南康北服务区]
+- service_no (varchar(255)) - 服务区编码 [示例: 1067, 1062]
+- service_area_id (varchar(32)) - 服务区ID [示例: 97cd6cd516a551409a4d453a58f9e170, fdbdd042962011ed8ee6fa163eaf653f]
+- source_system_type (varchar(50)) - 数据来源类别名称 [示例: 驿美, 驿购]
+- source_type (integer) - 数据来源类别ID [示例: 3, 1]
+字段补充说明:
+- id 为主键
+- source_system_type 为枚举字段,包含取值:司乘管理、商业管理、驿购、驿美、手工录入
+- source_type 为枚举字段,包含取值:5、0、1、3、4

+ 11 - 0
data_pipeline/training_data/manual_20250721_010214/db_query_decision_prompt.txt

@@ -0,0 +1,11 @@
+=== 数据库业务范围 ===
+当前数据库存储的是高速公路服务区运营管理的相关数据,主要涉及服务区经营、车辆统计、公司管理及路段路线关联,包含以下业务数据:
+核心业务实体:
+- 服务区:表示高速公路的服务区域,主要字段:service_area_name、service_area_no、service_state
+- 档口:表示服务区内的具体经营点,主要字段:branch_no、branch_name
+- 公司:表示服务区所属的管理公司,主要字段:company_name、company_no
+- 车辆:表示进入服务区的车辆类型,主要字段:car_type、customer_count
+- 路段与路线:表示服务区所属的路段和路线信息,主要字段:section_name、route_name
+关键业务指标:
+- 经营数据:包括支付总金额(pay_sum)、订单总数(order_sum)、各类支付方式金额(wx、zfb、rmb)及订单数(wx_order、zf_order、rmb_order)
+- 车流统计:进入服务区的各类车辆数量(customer_count),按车辆类别(car_type)进行统计

+ 10 - 0
data_pipeline/training_data/manual_20250721_010214/filename_mapping.txt

@@ -0,0 +1,10 @@
+# 文件名映射报告
+# 格式: 原始表名 -> 实际文件名
+
+public.bss_business_day_data -> bss_business_day_data_detail.md
+public.bss_car_day_count -> bss_car_day_count_detail.md
+public.bss_company -> bss_company_detail.md
+public.bss_section_route -> bss_section_route_detail.md
+public.bss_section_route_area_link -> bss_section_route_area_link_detail.md
+public.bss_service_area -> bss_service_area_detail.md
+public.bss_service_area_mapper -> bss_service_area_mapper_detail.md

+ 62 - 0
data_pipeline/training_data/manual_20250721_010214/metadata.txt

@@ -0,0 +1,62 @@
+-- Schema Tools生成的主题元数据
+-- 业务背景: 高速公路服务区管理系统
+-- 生成时间: 2025-07-21 01:06:58
+-- 数据库: highway_db
+
+-- 创建表(如果不存在)
+CREATE TABLE IF NOT EXISTS metadata (
+    id SERIAL PRIMARY KEY,    -- 主键
+    topic_name VARCHAR(100) NOT NULL,  -- 业务主题名称
+    description TEXT,                  -- 业务主体说明
+    related_tables TEXT[],			  -- 相关表名
+    biz_entities TEXT[],               -- 主要业务实体名称
+    biz_metrics TEXT[],                -- 主要业务指标名称
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP    -- 插入时间
+);
+
+-- 插入主题数据
+-- 注意: related_tables/biz_entities/biz_metrics 为 TEXT[] 列,必须使用 '{...}' 数组字面量
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '日营业数据分析',
+  '分析每个服务区和档口每日的营业收入、订单数量及支付方式分布,支撑经营优化决策。',
+  '{bss_business_day_data}',
+  '{服务区,档口,支付方式,统计日期}',
+  '{收入趋势,订单数量,支付方式占比,服务区对比}'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '车流统计分析',
+  '基于车辆进入服务区的数据,分析车流数量及类型分布,为设施规划和运营管理提供依据。',
+  '{bss_car_day_count,bss_service_area}',
+  '{服务区,车辆类型,统计日期}',
+  '{车流趋势,车辆类型占比,服务区车流排名}'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '公司运营对比',
+  '对比不同公司管辖服务区的营业收入和车流数据,评估各公司的运营绩效。',
+  '{bss_company,bss_service_area,bss_business_day_data,bss_car_day_count}',
+  '{公司,服务区,支付方式}',
+  '{公司营收排名,公司车流排名,单位车流营收对比}'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '服务区路段关联分析',
+  '分析服务区与路段路线的关联关系,评估路段车流对服务区业务的影响。',
+  '{bss_section_route,bss_section_route_area_link,bss_service_area,bss_car_day_count}',
+  '{路段,路线,服务区}',
+  '{路段服务区数量,路段车流总量,服务区车流分布}'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '服务区状态与营收关系',
+  '分析服务区开放与关闭状态对营业收入的影响,优化服务区运营策略。',
+  '{bss_service_area,bss_business_day_data}',
+  '{服务区,服务状态,统计日期}',
+  '{开放状态营收,关闭状态营收,营收状态对比}'
+);

+ 20 - 0
data_pipeline/training_data/manual_20250721_010214/metadata_detail.md

@@ -0,0 +1,20 @@
+## metadata(存储分析主题元数据)
+
+`metadata` 主要描述了当前数据库包含了哪些数据内容,哪些分析主题,哪些指标等等。
+
+字段列表:
+
+- `id` (serial) - 主键ID [主键, 非空]
+- `topic_name` (varchar(100)) - 业务主题名称 [非空]
+- `description` (text) - 业务主题说明
+- `related_tables` (text[]) - 涉及的数据表 [示例: bss_section_route_area_link, bss_company]
+- `biz_entities` (text[]) - 主要业务实体名称 [示例: 路段, 公司, 路线]
+- `biz_metrics` (text[]) - 主要业务指标名称 [示例: 路段服务区数量, 车流趋势, 关闭状态营收]
+- `created_at` (timestamp) - 插入时间 [默认值: `CURRENT_TIMESTAMP`]
+
+字段补充说明:
+
+- `id` 为主键,自增;
+- `related_tables` 用于建立主题与具体明细表的依赖关系;
+- `biz_entities` 表示主题关注的核心对象,例如服务区、车辆、公司;
+- `biz_metrics` 表示该主题关注的业务分析指标,例如营收对比、趋势变化、占比结构等。

+ 202 - 0
data_pipeline/training_data/manual_20250721_010214/qs_highway_db_20250721_010658_pair.json

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "统计最近7天每个服务区的总营业收入和订单数量,并按营业收入降序排列。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营业收入, SUM(order_sum) AS 总订单数量 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - INTERVAL '7 days' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营业收入 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日各档口的现金支付金额及订单数量,并按金额降序排序。",
+    "sql": "SELECT branch_name AS 档口名称, rmb AS 现金支付金额, rmb_order AS 现金订单数量 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL ORDER BY 现金支付金额 DESC;"
+  },
+  {
+    "question": "分析2023年4月各服务区微信支付与支付宝支付的占比情况。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx) / SUM(pay_sum) * 100 AS 微信支付占比, SUM(zfb) / SUM(pay_sum) * 100 AS 支付宝支付占比 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "找出2023年4月营业收入最高的前5个服务区。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营业收入 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计2023年4月每天的营业收入趋势,显示每日总收入。",
+    "sql": "SELECT oper_date AS 统计日期, SUM(pay_sum) AS 日营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "比较不同服务区2023年4月的平均每日营业收入。",
+    "sql": "SELECT service_name AS 服务区名称, AVG(pay_sum) AS 平均每日营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 平均每日营业收入 DESC;"
+  },
+  {
+    "question": "查询2023年4月营业收入最低的3个服务区。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营业收入 ASC LIMIT 3;"
+  },
+  {
+    "question": "统计2023年4月各服务区不同支付方式的订单数量分布。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx_order) AS 微信订单数, SUM(zf_order) AS 支付宝订单数, SUM(rmb_order) AS 现金订单数 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "查询2023年4月营业收入超过10000元的服务区。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name HAVING SUM(pay_sum) > 10000 ORDER BY 总营业收入 DESC;"
+  },
+  {
+    "question": "统计2023年4月各服务区每日营业收入的波动情况。",
+    "sql": "SELECT service_name AS 服务区名称, oper_date AS 统计日期, SUM(pay_sum) AS 日营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name, oper_date ORDER BY 服务区名称, 统计日期;"
+  },
+  {
+    "question": "统计2023年每个月进入宜春服务区的车辆总数及平均每日车流量,并按月份排序。",
+    "sql": "SELECT EXTRACT(MONTH FROM count_date) AS 月份, SUM(customer_count) AS 总车流量, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-12-31' AND service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '宜春服务区') AND delete_ts IS NULL GROUP BY EXTRACT(MONTH FROM count_date) ORDER BY 月份;"
+  },
+  {
+    "question": "2023年4月,各服务区车辆总数排名前5的服务区名称及车流量。",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, SUM(cc.customer_count) AS 总车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "2023年,各车辆类型在所有服务区的占比情况。",
+    "sql": "SELECT car_type AS 车辆类型, SUM(customer_count) AS 总车流量, ROUND(SUM(customer_count) * 100.0 / (SELECT SUM(customer_count) FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-12-31' AND delete_ts IS NULL), 2) AS 占比百分比 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-12-31' AND delete_ts IS NULL GROUP BY car_type ORDER BY 总车流量 DESC;"
+  },
+  {
+    "question": "2023年,平均每日车流量最低的3个服务区名称及平均车流量。",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, AVG(cc.customer_count) AS 日均车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date BETWEEN '2023-01-01' AND '2023-12-31' AND cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 日均车流量 ASC LIMIT 3;"
+  },
+  {
+    "question": "2023年4月,每天进入庐山服务区的车流量趋势变化。",
+    "sql": "SELECT count_date AS 统计日期, customer_count AS 车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-30' AND service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '庐山服务区') AND delete_ts IS NULL ORDER BY count_date;"
+  },
+  {
+    "question": "2023年4月,各公司管辖的服务区平均每日车流量对比。",
+    "sql": "SELECT co.company_name AS 公司名称, AVG(cc.customer_count) AS 日均车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id JOIN bss_company co ON sa.company_id = co.id WHERE cc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND cc.delete_ts IS NULL AND sa.delete_ts IS NULL AND co.delete_ts IS NULL GROUP BY co.company_name ORDER BY 日均车流量 DESC;"
+  },
+  {
+    "question": "2023年4月,过境车辆最多的前3个服务区名称及数量。",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, SUM(cc.customer_count) AS 过境车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND cc.car_type = '过境' AND cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 过境车流量 DESC LIMIT 3;"
+  },
+  {
+    "question": "2023年4月,每个星期几在宜春服务区的平均车流量,并按星期排序。",
+    "sql": "SELECT EXTRACT(DOW FROM count_date) AS 星期, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-30' AND service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '宜春服务区') AND delete_ts IS NULL GROUP BY EXTRACT(DOW FROM count_date) ORDER BY 星期;"
+  },
+  {
+    "question": "2023年4月,所有服务区中,每日车流量超过1000的日期及服务区名称。",
+    "sql": "SELECT cc.count_date AS 统计日期, sa.service_area_name AS 服务区名称, cc.customer_count AS 车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND cc.customer_count > 1000 AND cc.delete_ts IS NULL AND sa.delete_ts IS NULL ORDER BY cc.count_date;"
+  },
+  {
+    "question": "2023年4月,各车辆类型在宜春服务区的每日车流量明细。",
+    "sql": "SELECT count_date AS 统计日期, car_type AS 车辆类型, customer_count AS 车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-30' AND service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '宜春服务区') AND delete_ts IS NULL ORDER BY count_date, car_type;"
+  },
+  {
+    "question": "统计各公司2023年4月1日的总营业收入,并按公司名称分组。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.pay_sum) AS 总营业收入 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name;"
+  },
+  {
+    "question": "统计各公司2023年4月1日的车辆总数,并按公司名称排序。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(car_count) AS 车辆总数 FROM (SELECT service_area_id, SUM(customer_count) AS car_count FROM bss_car_day_count WHERE count_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_area_id) t JOIN bss_service_area s ON t.service_area_id = s.id JOIN bss_company b ON s.company_id = b.id GROUP BY b.company_name ORDER BY 车辆总数 DESC;"
+  },
+  {
+    "question": "计算各公司单位车流的平均营业收入,并按公司名称排序。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.pay_sum) / SUM(car_count) AS 单位车流营收 FROM (SELECT service_area_id, SUM(customer_count) AS car_count FROM bss_car_day_count WHERE count_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_area_id) t JOIN bss_service_area s ON t.service_area_id = s.id JOIN bss_company b ON s.company_id = b.id JOIN bss_business_day_data a ON s.service_area_no = a.service_no WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name;"
+  },
+  {
+    "question": "找出2023年4月1日营业收入最高的前5家公司。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.pay_sum) AS 总营业收入 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name ORDER BY 总营业收入 DESC LIMIT 5;"
+  },
+  {
+    "question": "找出2023年4月1日车流量最高的前5家公司。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(car_count) AS 车辆总数 FROM (SELECT service_area_id, SUM(customer_count) AS car_count FROM bss_car_day_count WHERE count_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_area_id) t JOIN bss_service_area s ON t.service_area_id = s.id JOIN bss_company b ON s.company_id = b.id GROUP BY b.company_name ORDER BY 车辆总数 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计各公司在2023年4月1日的现金支付总金额,并按公司名称排序。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.rmb) AS 现金支付总额 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name ORDER BY 现金支付总额 DESC;"
+  },
+  {
+    "question": "统计各公司在2023年4月1日使用微信支付的订单数量,并按公司名称排序。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.wx_order) AS 微信订单数量 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name;"
+  },
+  {
+    "question": "比较各公司在2023年4月1日的支付宝支付金额占比。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.zfb) / SUM(a.pay_sum) * 100 AS 支付宝占比百分比 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name;"
+  },
+  {
+    "question": "统计各公司在2023年4月1日的订单总数,并按订单数量排序。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.order_sum) AS 订单总数 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name ORDER BY 订单总数 DESC;"
+  },
+  {
+    "question": "找出2023年4月1日车流和营收均排名前五的公司。",
+    "sql": "WITH car_count_rank AS (SELECT b.company_name AS 公司名称, SUM(car_count) AS 车辆总数 FROM (SELECT service_area_id, SUM(customer_count) AS car_count FROM bss_car_day_count WHERE count_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_area_id) t JOIN bss_service_area s ON t.service_area_id = s.id JOIN bss_company b ON s.company_id = b.id GROUP BY b.company_name ORDER BY 车辆总数 DESC LIMIT 5), revenue_rank AS (SELECT b.company_name AS 公司名称, SUM(a.pay_sum) AS 总营业收入 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name ORDER BY 总营业收入 DESC LIMIT 5) SELECT car_count_rank.公司名称 FROM car_count_rank INNER JOIN revenue_rank ON car_count_rank.公司名称 = revenue_rank.公司名称;"
+  },
+  {
+    "question": "统计各路段关联的服务区数量,并按数量降序排列。",
+    "sql": "SELECT section_name AS 路段名称, COUNT(service_area_id) AS 服务区数量 FROM bss_section_route LEFT JOIN bss_section_route_area_link ON bss_section_route.id = bss_section_route_area_link.section_route_id WHERE bss_section_route.delete_ts IS NULL GROUP BY section_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日各服务区的车流总量,并按车流降序排列。",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE count_date = '2023-04-01' GROUP BY service_area_id ORDER BY 总车流量 DESC;"
+  },
+  {
+    "question": "找出2023年4月1日车流最少的5个服务区。",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE count_date = '2023-04-01' GROUP BY service_area_id ORDER BY 总车流量 ASC LIMIT 5;"
+  },
+  {
+    "question": "分析2022年各路段的月均车流情况,并按路段排序。",
+    "sql": "SELECT bss_section_route.section_name AS 路段名称, AVG(monthly_count) AS 月均车流量 FROM (SELECT bss_section_route_area_link.section_route_id, DATE_TRUNC('month', count_date) AS 月份, SUM(customer_count) AS monthly_count FROM bss_car_day_count JOIN bss_section_route_area_link ON bss_car_day_count.service_area_id = bss_section_route_area_link.service_area_id WHERE count_date BETWEEN '2022-01-01' AND '2022-12-31' GROUP BY bss_section_route_area_link.section_route_id, 月份) AS monthly_data JOIN bss_section_route ON monthly_data.section_route_id = bss_section_route.id GROUP BY bss_section_route.section_name ORDER BY 路段名称;"
+  },
+  {
+    "question": "查询所有开放状态的服务区及其所属路段。",
+    "sql": "SELECT bss_service_area.service_area_name AS 服务区名称, bss_section_route.section_name AS 路段名称 FROM bss_service_area JOIN bss_section_route_area_link ON bss_service_area.id = bss_section_route_area_link.service_area_id JOIN bss_section_route ON bss_section_route_area_link.section_route_id = bss_section_route.id WHERE bss_service_area.delete_ts IS NULL AND bss_service_area.service_state = '开放';"
+  },
+  {
+    "question": "统计2023年4月1日各车辆类型在各服务区的分布情况。",
+    "sql": "SELECT service_area_id AS 服务区ID, car_type AS 车辆类型, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE count_date = '2023-04-01' GROUP BY service_area_id, car_type ORDER BY 服务区ID, 总车流量 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日车流最多的3个路段及其总车流量。",
+    "sql": "SELECT bss_section_route.section_name AS 路段名称, SUM(bss_car_day_count.customer_count) AS 总车流量 FROM bss_car_day_count JOIN bss_section_route_area_link ON bss_car_day_count.service_area_id = bss_section_route_area_link.service_area_id JOIN bss_section_route ON bss_section_route_area_link.section_route_id = bss_section_route.id WHERE bss_car_day_count.count_date = '2023-04-01' GROUP BY bss_section_route.section_name ORDER BY 总车流量 DESC LIMIT 3;"
+  },
+  {
+    "question": "计算2023年4月1日各服务区车流占所属路段车流的百分比。",
+    "sql": "WITH section_total AS (SELECT bss_section_route.id AS section_id, SUM(bss_car_day_count.customer_count) AS total_count FROM bss_car_day_count JOIN bss_section_route_area_link ON bss_car_day_count.service_area_id = bss_section_route_area_link.service_area_id JOIN bss_section_route ON bss_section_route_area_link.section_route_id = bss_section_route.id WHERE bss_car_day_count.count_date = '2023-04-01' GROUP BY bss_section_route.id), area_count AS (SELECT bss_section_route_area_link.section_route_id, bss_car_day_count.service_area_id, SUM(bss_car_day_count.customer_count) AS area_count FROM bss_car_day_count JOIN bss_section_route_area_link ON bss_car_day_count.service_area_id = bss_section_route_area_link.service_area_id WHERE bss_car_day_count.count_date = '2023-04-01' GROUP BY bss_section_route_area_link.section_route_id, bss_car_day_count.service_area_id) SELECT area_count.service_area_id AS 服务区ID, bss_section_route.section_name AS 路段名称, area_count.area_count AS 服务区车流量, section_total.total_count AS 路段总车流量, (area_count.area_count::numeric / section_total.total_count) * 100 AS 占比百分比 FROM area_count JOIN section_total ON area_count.section_route_id = section_total.section_id JOIN bss_section_route ON area_count.section_route_id = bss_section_route.id;"
+  },
+  {
+    "question": "统计各公司管辖路段的平均服务区数量。",
+    "sql": "SELECT bss_company.company_name AS 公司名称, AVG(area_count) AS 平均服务区数量 FROM (SELECT bss_section_route.section_name, COUNT(bss_section_route_area_link.service_area_id) AS area_count FROM bss_section_route LEFT JOIN bss_section_route_area_link ON bss_section_route.id = bss_section_route_area_link.section_route_id GROUP BY bss_section_route.section_name) AS section_area_count JOIN bss_section_route ON section_area_count.section_name = bss_section_route.section_name JOIN bss_service_area ON bss_section_route.id = (SELECT section_route_id FROM bss_section_route_area_link WHERE bss_section_route_area_link.service_area_id = bss_service_area.id LIMIT 1) JOIN bss_company ON bss_service_area.company_id = bss_company.id GROUP BY bss_company.company_name;"
+  },
+  {
+    "question": "查询2023年4月1日所有服务区的车流数据,包括车辆类型明细。",
+    "sql": "SELECT service_area_id AS 服务区ID, car_type AS 车辆类型, customer_count AS 车流量 FROM bss_car_day_count WHERE count_date = '2023-04-01' ORDER BY 服务区ID, 车流量 DESC;"
+  },
+  {
+    "question": "统计最近7天内每天各服务区的营业总额,并按日期和服务区名称排序。",
+    "sql": "SELECT oper_date AS 统计日期, service_name AS 服务区名称, SUM(pay_sum) AS 营业总额 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - 7 GROUP BY oper_date, service_name ORDER BY oper_date, service_name;"
+  },
+  {
+    "question": "统计各服务区在2023年4月期间的总营收,并按营收从高到低排序。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE delete_ts IS NULL AND EXTRACT(YEAR FROM oper_date) = 2023 AND EXTRACT(MONTH FROM oper_date) = 4 GROUP BY service_name ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "统计2023年4月期间,开放状态和关闭状态的服务区的平均日营收,并按状态排序。",
+    "sql": "SELECT sa.service_state AS 服务区状态, AVG(bd.pay_sum) AS 平均日营收 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE bd.delete_ts IS NULL AND sa.delete_ts IS NULL AND EXTRACT(YEAR FROM bd.oper_date) = 2023 AND EXTRACT(MONTH FROM bd.oper_date) = 4 GROUP BY sa.service_state ORDER BY 平均日营收 DESC;"
+  },
+  {
+    "question": "找出2023年4月营收最高的前5个服务区。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE delete_ts IS NULL AND EXTRACT(YEAR FROM oper_date) = 2023 AND EXTRACT(MONTH FROM oper_date) = 4 GROUP BY service_name ORDER BY 总营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计2023年4月每天开放状态服务区的总营收,并按日期排序。",
+    "sql": "SELECT bd.oper_date AS 统计日期, SUM(bd.pay_sum) AS 开放状态总营收 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE sa.service_state = '开放' AND bd.delete_ts IS NULL AND sa.delete_ts IS NULL AND EXTRACT(YEAR FROM bd.oper_date) = 2023 AND EXTRACT(MONTH FROM bd.oper_date) = 4 GROUP BY bd.oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "统计2023年4月每天关闭状态服务区的总营收,并按日期排序。",
+    "sql": "SELECT bd.oper_date AS 统计日期, SUM(bd.pay_sum) AS 关闭状态总营收 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE sa.service_state = '关闭' AND bd.delete_ts IS NULL AND sa.delete_ts IS NULL AND EXTRACT(YEAR FROM bd.oper_date) = 2023 AND EXTRACT(MONTH FROM bd.oper_date) = 4 GROUP BY bd.oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "比较2023年4月开放和关闭状态服务区的总营收差异。",
+    "sql": "SELECT sa.service_state AS 服务区状态, SUM(bd.pay_sum) AS 总营收 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE bd.delete_ts IS NULL AND sa.delete_ts IS NULL AND EXTRACT(YEAR FROM bd.oper_date) = 2023 AND EXTRACT(MONTH FROM bd.oper_date) = 4 GROUP BY sa.service_state;"
+  },
+  {
+    "question": "统计2023年4月每天所有服务区的总营收,并按日期排序。",
+    "sql": "SELECT oper_date AS 统计日期, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE delete_ts IS NULL AND EXTRACT(YEAR FROM oper_date) = 2023 AND EXTRACT(MONTH FROM oper_date) = 4 GROUP BY oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "列出2023年4月期间,每个服务区的总营收、订单总数,并按总营收从高到低排序。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, SUM(order_sum) AS 订单总数 FROM bss_business_day_data WHERE delete_ts IS NULL AND EXTRACT(YEAR FROM oper_date) = 2023 AND EXTRACT(MONTH FROM oper_date) = 4 GROUP BY service_name ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "统计2023年4月期间,开放状态服务区的总营收、订单总数,并按总营收从高到低排序。",
+    "sql": "SELECT bd.service_name AS 服务区名称, SUM(bd.pay_sum) AS 总营收, SUM(bd.order_sum) AS 订单总数 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE sa.service_state = '开放' AND bd.delete_ts IS NULL AND sa.delete_ts IS NULL AND EXTRACT(YEAR FROM bd.oper_date) = 2023 AND EXTRACT(MONTH FROM bd.oper_date) = 4 GROUP BY bd.service_name ORDER BY 总营收 DESC;"
+  }
+]

+ 202 - 0
data_pipeline/training_data/manual_20250721_010214/qs_highway_db_20250721_010658_pair.json.backup

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "统计最近7天每个服务区的总营业收入和订单数量,并按营业收入降序排列。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营业收入, SUM(order_sum) AS 总订单数量 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - INTERVAL '7 days' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营业收入 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日各档口的现金支付金额及订单数量,并按金额降序排序。",
+    "sql": "SELECT branch_name AS 档口名称, rmb AS 现金支付金额, rmb_order AS 现金订单数量 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL ORDER BY 现金支付金额 DESC;"
+  },
+  {
+    "question": "分析2023年4月各服务区微信支付与支付宝支付的占比情况。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx) / SUM(pay_sum) * 100 AS 微信支付占比, SUM(zfb) / SUM(pay_sum) * 100 AS 支付宝支付占比 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "找出2023年4月营业收入最高的前5个服务区。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营业收入 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计2023年4月每天的营业收入趋势,显示每日总收入。",
+    "sql": "SELECT oper_date AS 统计日期, SUM(pay_sum) AS 日营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "比较不同服务区2023年4月的平均每日营业收入。",
+    "sql": "SELECT service_name AS 服务区名称, AVG(pay_sum) AS 平均每日营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 平均每日营业收入 DESC;"
+  },
+  {
+    "question": "查询2023年4月营业收入最低的3个服务区。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营业收入 ASC LIMIT 3;"
+  },
+  {
+    "question": "统计2023年4月各服务区不同支付方式的订单数量分布。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx_order) AS 微信订单数, SUM(zf_order) AS 支付宝订单数, SUM(rmb_order) AS 现金订单数 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "查询2023年4月营业收入超过10000元的服务区。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name HAVING SUM(pay_sum) > 10000 ORDER BY 总营业收入 DESC;"
+  },
+  {
+    "question": "统计2023年4月各服务区每日营业收入的波动情况。",
+    "sql": "SELECT service_name AS 服务区名称, oper_date AS 统计日期, SUM(pay_sum) AS 日营业收入 FROM bss_business_day_data WHERE oper_date >= '2023-04-01' AND oper_date <= '2023-04-30' AND delete_ts IS NULL GROUP BY service_name, oper_date ORDER BY 服务区名称, 统计日期;"
+  },
+  {
+    "question": "统计2023年每个月进入宜春服务区的车辆总数及平均每日车流量,并按月份排序。",
+    "sql": "SELECT EXTRACT(MONTH FROM count_date) AS 月份, SUM(customer_count) AS 总车流量, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-12-31' AND service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '宜春服务区') AND delete_ts IS NULL GROUP BY EXTRACT(MONTH FROM count_date) ORDER BY 月份;"
+  },
+  {
+    "question": "2023年4月,各服务区车辆总数排名前5的服务区名称及车流量。",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, SUM(cc.customer_count) AS 总车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "2023年,各车辆类型在所有服务区的占比情况。",
+    "sql": "SELECT car_type AS 车辆类型, SUM(customer_count) AS 总车流量, ROUND(SUM(customer_count) * 100.0 / (SELECT SUM(customer_count) FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-12-31' AND delete_ts IS NULL), 2) AS 占比百分比 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-12-31' AND delete_ts IS NULL GROUP BY car_type ORDER BY 总车流量 DESC;"
+  },
+  {
+    "question": "2023年,平均每日车流量最低的3个服务区名称及平均车流量。",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, AVG(cc.customer_count) AS 日均车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date BETWEEN '2023-01-01' AND '2023-12-31' AND cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 日均车流量 ASC LIMIT 3;"
+  },
+  {
+    "question": "2023年4月,每天进入庐山服务区的车流量趋势变化。",
+    "sql": "SELECT count_date AS 统计日期, customer_count AS 车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-30' AND service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '庐山服务区') AND delete_ts IS NULL ORDER BY count_date;"
+  },
+  {
+    "question": "2023年4月,各公司管辖的服务区平均每日车流量对比。",
+    "sql": "SELECT co.company_name AS 公司名称, AVG(cc.customer_count) AS 日均车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id JOIN bss_company co ON sa.company_id = co.id WHERE cc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND cc.delete_ts IS NULL AND sa.delete_ts IS NULL AND co.delete_ts IS NULL GROUP BY co.company_name ORDER BY 日均车流量 DESC;"
+  },
+  {
+    "question": "2023年4月,过境车辆最多的前3个服务区名称及数量。",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, SUM(cc.customer_count) AS 过境车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND cc.car_type = '过境' AND cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 过境车流量 DESC LIMIT 3;"
+  },
+  {
+    "question": "2023年4月,每个星期几在宜春服务区的平均车流量,并按星期排序。",
+    "sql": "SELECT EXTRACT(DOW FROM count_date) AS 星期, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-30' AND service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '宜春服务区') AND delete_ts IS NULL GROUP BY EXTRACT(DOW FROM count_date) ORDER BY 星期;"
+  },
+  {
+    "question": "2023年4月,所有服务区中,每日车流量超过1000的日期及服务区名称。",
+    "sql": "SELECT cc.count_date AS 统计日期, sa.service_area_name AS 服务区名称, cc.customer_count AS 车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date BETWEEN '2023-04-01' AND '2023-04-30' AND cc.customer_count > 1000 AND cc.delete_ts IS NULL AND sa.delete_ts IS NULL ORDER BY cc.count_date;"
+  },
+  {
+    "question": "2023年4月,各车辆类型在宜春服务区的每日车流量明细。",
+    "sql": "SELECT count_date AS 统计日期, car_type AS 车辆类型, customer_count AS 车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-30' AND service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '宜春服务区') AND delete_ts IS NULL ORDER BY count_date, car_type;"
+  },
+  {
+    "question": "统计各公司2023年4月1日的总营业收入,并按公司名称分组。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.pay_sum) AS 总营业收入 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name;"
+  },
+  {
+    "question": "统计各公司2023年4月1日的车辆总数,并按公司名称排序。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(car_count) AS 车辆总数 FROM (SELECT service_area_id, SUM(customer_count) AS car_count FROM bss_car_day_count WHERE count_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_area_id) t JOIN bss_service_area s ON t.service_area_id = s.id JOIN bss_company b ON s.company_id = b.id GROUP BY b.company_name ORDER BY 车辆总数 DESC;"
+  },
+  {
+    "question": "计算各公司单位车流的平均营业收入,并按公司名称排序。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.pay_sum) / SUM(car_count) AS 单位车流营收 FROM (SELECT service_area_id, SUM(customer_count) AS car_count FROM bss_car_day_count WHERE count_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_area_id) t JOIN bss_service_area s ON t.service_area_id = s.id JOIN bss_company b ON s.company_id = b.id JOIN bss_business_day_data a ON s.service_area_no = a.service_no WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name;"
+  },
+  {
+    "question": "找出2023年4月1日营业收入最高的前5家公司。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.pay_sum) AS 总营业收入 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name ORDER BY 总营业收入 DESC LIMIT 5;"
+  },
+  {
+    "question": "找出2023年4月1日车流量最高的前5家公司。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(car_count) AS 车辆总数 FROM (SELECT service_area_id, SUM(customer_count) AS car_count FROM bss_car_day_count WHERE count_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_area_id) t JOIN bss_service_area s ON t.service_area_id = s.id JOIN bss_company b ON s.company_id = b.id GROUP BY b.company_name ORDER BY 车辆总数 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计各公司在2023年4月1日的现金支付总金额,并按公司名称排序。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.rmb) AS 现金支付总额 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name ORDER BY 现金支付总额 DESC;"
+  },
+  {
+    "question": "统计各公司在2023年4月1日使用微信支付的订单数量,并按公司名称排序。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.wx_order) AS 微信订单数量 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name;"
+  },
+  {
+    "question": "比较各公司在2023年4月1日的支付宝支付金额占比。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.zfb) / SUM(a.pay_sum) * 100 AS 支付宝占比百分比 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name;"
+  },
+  {
+    "question": "统计各公司在2023年4月1日的订单总数,并按订单数量排序。",
+    "sql": "SELECT b.company_name AS 公司名称, SUM(a.order_sum) AS 订单总数 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name ORDER BY 订单总数 DESC;"
+  },
+  {
+    "question": "找出2023年4月1日车流和营收均排名前五的公司。",
+    "sql": "WITH car_count_rank AS (SELECT b.company_name AS 公司名称, SUM(car_count) AS 车辆总数 FROM (SELECT service_area_id, SUM(customer_count) AS car_count FROM bss_car_day_count WHERE count_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_area_id) t JOIN bss_service_area s ON t.service_area_id = s.id JOIN bss_company b ON s.company_id = b.id GROUP BY b.company_name ORDER BY 车辆总数 DESC LIMIT 5), revenue_rank AS (SELECT b.company_name AS 公司名称, SUM(a.pay_sum) AS 总营业收入 FROM bss_business_day_data a JOIN bss_service_area c ON a.service_no = c.service_area_no JOIN bss_company b ON c.company_id = b.id WHERE a.oper_date = '2023-04-01' AND a.delete_ts IS NULL GROUP BY b.company_name ORDER BY 总营业收入 DESC LIMIT 5) SELECT car_count_rank.公司名称 FROM car_count_rank INNER JOIN revenue_rank ON car_count_rank.公司名称 = revenue_rank.公司名称;"
+  },
+  {
+    "question": "统计各路段关联的服务区数量,并按数量降序排列。",
+    "sql": "SELECT section_name AS 路段名称, COUNT(service_area_id) AS 服务区数量 FROM bss_section_route LEFT JOIN bss_section_route_area_link ON bss_section_route.id = bss_section_route_area_link.section_route_id WHERE bss_section_route.delete_ts IS NULL GROUP BY section_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日各服务区的车流总量,并按车流降序排列。",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE count_date = '2023-04-01' GROUP BY service_area_id ORDER BY 总车流量 DESC;"
+  },
+  {
+    "question": "找出2023年4月1日车流最少的5个服务区。",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE count_date = '2023-04-01' GROUP BY service_area_id ORDER BY 总车流量 ASC LIMIT 5;"
+  },
+  {
+    "question": "分析2022年各路段的月均车流情况,并按路段排序。",
+    "sql": "SELECT bss_section_route.section_name AS 路段名称, AVG(monthly_count) AS 月均车流量 FROM (SELECT bss_section_route_area_link.section_route_id, DATE_TRUNC('month', count_date) AS 月份, SUM(customer_count) AS monthly_count FROM bss_car_day_count JOIN bss_section_route_area_link ON bss_car_day_count.service_area_id = bss_section_route_area_link.service_area_id WHERE count_date BETWEEN '2022-01-01' AND '2022-12-31' GROUP BY bss_section_route_area_link.section_route_id, 月份) AS monthly_data JOIN bss_section_route ON monthly_data.section_route_id = bss_section_route.id GROUP BY bss_section_route.section_name ORDER BY 路段名称;"
+  },
+  {
+    "question": "查询所有开放状态的服务区及其所属路段。",
+    "sql": "SELECT bss_service_area.service_area_name AS 服务区名称, bss_section_route.section_name AS 路段名称 FROM bss_service_area JOIN bss_section_route_area_link ON bss_service_area.id = bss_section_route_area_link.service_area_id JOIN bss_section_route ON bss_section_route_area_link.section_route_id = bss_section_route.id WHERE bss_service_area.delete_ts IS NULL AND bss_service_area.service_state = '开放';"
+  },
+  {
+    "question": "统计2023年4月1日各车辆类型在各服务区的分布情况。",
+    "sql": "SELECT service_area_id AS 服务区ID, car_type AS 车辆类型, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE count_date = '2023-04-01' GROUP BY service_area_id, car_type ORDER BY 服务区ID, 总车流量 DESC;"
+  },
+  {
+    "question": "查询2023年4月1日车流最多的3个路段及其总车流量。",
+    "sql": "SELECT bss_section_route.section_name AS 路段名称, SUM(bss_car_day_count.customer_count) AS 总车流量 FROM bss_car_day_count JOIN bss_section_route_area_link ON bss_car_day_count.service_area_id = bss_section_route_area_link.service_area_id JOIN bss_section_route ON bss_section_route_area_link.section_route_id = bss_section_route.id WHERE bss_car_day_count.count_date = '2023-04-01' GROUP BY bss_section_route.section_name ORDER BY 总车流量 DESC LIMIT 3;"
+  },
+  {
+    "question": "计算2023年4月1日各服务区车流占所属路段车流的百分比。",
+    "sql": "WITH section_total AS (SELECT bss_section_route.id AS section_id, SUM(bss_car_day_count.customer_count) AS total_count FROM bss_car_day_count JOIN bss_section_route_area_link ON bss_car_day_count.service_area_id = bss_section_route_area_link.service_area_id JOIN bss_section_route ON bss_section_route_area_link.section_route_id = bss_section_route.id WHERE bss_car_day_count.count_date = '2023-04-01' GROUP BY bss_section_route.id), area_count AS (SELECT bss_section_route_area_link.section_route_id, bss_car_day_count.service_area_id, SUM(bss_car_day_count.customer_count) AS area_count FROM bss_car_day_count JOIN bss_section_route_area_link ON bss_car_day_count.service_area_id = bss_section_route_area_link.service_area_id WHERE bss_car_day_count.count_date = '2023-04-01' GROUP BY bss_section_route_area_link.section_route_id, bss_car_day_count.service_area_id) SELECT area_count.service_area_id AS 服务区ID, bss_section_route.section_name AS 路段名称, area_count.area_count AS 服务区车流量, section_total.total_count AS 路段总车流量, (area_count.area_count::numeric / section_total.total_count) * 100 AS 占比百分比 FROM area_count JOIN section_total ON area_count.section_route_id = section_total.section_id JOIN bss_section_route ON area_count.section_route_id = bss_section_route.id;"
+  },
+  {
+    "question": "统计各公司管辖路段的平均服务区数量。",
+    "sql": "SELECT bss_company.company_name AS 公司名称, AVG(area_count) AS 平均服务区数量 FROM (SELECT bss_section_route.section_name, COUNT(bss_section_route_area_link.service_area_id) AS area_count FROM bss_section_route LEFT JOIN bss_section_route_area_link ON bss_section_route.id = bss_section_route_area_link.section_route_id GROUP BY bss_section_route.section_name) AS section_area_count JOIN bss_section_route ON section_area_count.section_name = bss_section_route.section_name JOIN bss_service_area ON bss_section_route.id = (SELECT section_route_id FROM bss_section_route_area_link WHERE bss_section_route_area_link.service_area_id = bss_service_area.id LIMIT 1) JOIN bss_company ON bss_service_area.company_id = bss_company.id GROUP BY bss_company.company_name;"
+  },
+  {
+    "question": "查询2023年4月1日所有服务区的车流数据,包括车辆类型明细。",
+    "sql": "SELECT service_area_id AS 服务区ID, car_type AS 车辆类型, customer_count AS 车流量 FROM bss_car_day_count WHERE count_date = '2023-04-01' ORDER BY 服务区ID, 车流量 DESC;"
+  },
+  {
+    "question": "统计最近7天内每天各服务区的营业总额,并按日期和服务区名称排序。",
+    "sql": "SELECT oper_date AS 统计日期, service_name AS 服务区名称, SUM(pay_sum) AS 营业总额 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - 7 GROUP BY oper_date, service_name ORDER BY oper_date, service_name;"
+  },
+  {
+    "question": "统计各服务区在2023年4月期间的总营收,并按营收从高到低排序。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE delete_ts IS NULL AND EXTRACT(YEAR FROM oper_date) = 2023 AND EXTRACT(MONTH FROM oper_date) = 4 GROUP BY service_name ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "统计2023年4月期间,开放状态和关闭状态的服务区的平均日营收,并按状态排序。",
+    "sql": "SELECT sa.service_state AS 服务区状态, AVG(bd.pay_sum) AS 平均日营收 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE bd.delete_ts IS NULL AND sa.delete_ts IS NULL AND EXTRACT(YEAR FROM bd.oper_date) = 2023 AND EXTRACT(MONTH FROM bd.oper_date) = 4 GROUP BY sa.service_state ORDER BY 平均日营收 DESC;"
+  },
+  {
+    "question": "找出2023年4月营收最高的前5个服务区。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE delete_ts IS NULL AND EXTRACT(YEAR FROM oper_date) = 2023 AND EXTRACT(MONTH FROM oper_date) = 4 GROUP BY service_name ORDER BY 总营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计2023年4月每天开放状态服务区的总营收,并按日期排序。",
+    "sql": "SELECT bd.oper_date AS 统计日期, SUM(bd.pay_sum) AS 开放状态总营收 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE sa.service_state = '开放' AND bd.delete_ts IS NULL AND sa.delete_ts IS NULL AND EXTRACT(YEAR FROM bd.oper_date) = 2023 AND EXTRACT(MONTH FROM bd.oper_date) = 4 GROUP BY bd.oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "统计2023年4月每天关闭状态服务区的总营收,并按日期排序。",
+    "sql": "SELECT bd.oper_date AS 统计日期, SUM(bd.pay_sum) AS 关闭状态总营收 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE sa.service_state = '关闭' AND bd.delete_ts IS NULL AND sa.delete_ts IS NULL AND EXTRACT(YEAR FROM bd.oper_date) = 2023 AND EXTRACT(MONTH FROM bd.oper_date) = 4 GROUP BY bd.oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "比较2023年4月开放和关闭状态服务区的总营收差异。",
+    "sql": "SELECT sa.service_state AS 服务区状态, SUM(bd.pay_sum) AS 总营收 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE bd.delete_ts IS NULL AND sa.delete_ts IS NULL AND EXTRACT(YEAR FROM bd.oper_date) = 2023 AND EXTRACT(MONTH FROM bd.oper_date) = 4 GROUP BY sa.service_state;"
+  },
+  {
+    "question": "统计2023年4月每天所有服务区的总营收,并按日期排序。",
+    "sql": "SELECT oper_date AS 统计日期, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE delete_ts IS NULL AND EXTRACT(YEAR FROM oper_date) = 2023 AND EXTRACT(MONTH FROM oper_date) = 4 GROUP BY oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "列出2023年4月期间,每个服务区的总营收、订单总数,并按总营收从高到低排序。",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, SUM(order_sum) AS 订单总数 FROM bss_business_day_data WHERE delete_ts IS NULL AND EXTRACT(YEAR FROM oper_date) = 2023 AND EXTRACT(MONTH FROM oper_date) = 4 GROUP BY service_name ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "统计2023年4月期间,开放状态服务区的总营收、订单总数,并按总营收从高到低排序。",
+    "sql": "SELECT bd.service_name AS 服务区名称, SUM(bd.pay_sum) AS 总营收, SUM(bd.order_sum) AS 订单总数 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE sa.service_state = '开放' AND bd.delete_ts IS NULL AND sa.delete_ts IS NULL AND EXTRACT(YEAR FROM bd.oper_date) = 2023 AND EXTRACT(MONTH FROM bd.oper_date) = 4 GROUP BY bd.service_name ORDER BY 总营收 DESC;"
+  }
+]

+ 5 - 0
data_pipeline/training_data/manual_20250721_010214/vector_bak/langchain_pg_collection_20250721_010708.csv

@@ -0,0 +1,5 @@
+uuid,name,cmetadata
+f4e11877-44e7-4741-b511-fa0e2e399395,sql,
+f0b714ca-44a9-433a-8768-390740bd1a18,ddl,
+98b97e3a-752d-4115-9667-7635687dbc1c,documentation,
+ab83ab0a-5649-4722-984d-b093227cdb02,error_sql,

+ 1 - 0
data_pipeline/training_data/manual_20250721_010214/vector_bak/langchain_pg_embedding_20250721_010708.csv

@@ -0,0 +1 @@
+id,collection_id,embedding,document,cmetadata

+ 11 - 0
data_pipeline/training_data/manual_20250721_010214/vector_bak/vector_backup_log.txt

@@ -0,0 +1,11 @@
+=== Vector Table Backup Log ===
+Backup Time: 2025-07-21 01:07:08
+Task ID: manual_20250721_010214
+Duration: 0.00s
+
+Tables Backup Status:
+✓ langchain_pg_collection: 4 rows -> langchain_pg_collection_20250721_010708.csv (209.0 B)
+✓ langchain_pg_embedding: 0 rows -> langchain_pg_embedding_20250721_010708.csv (47.0 B)
+
+Truncate Status:
+✓ langchain_pg_embedding: TRUNCATED (0 rows removed)

+ 124 - 15
docs/data_pipeline_API调用指南.md

@@ -57,6 +57,7 @@ unified_api.py (Flask应用)
 | **完整工作流** | 一次性执行4个步骤 | 生产环境,自动化处理 |
 | **分步执行** | 逐步执行各个步骤 | 调试,质量控制 |
 | **后台执行** | 使用subprocess独立进程 | 长时间任务,不阻塞API |
+| **Vector表管理** | 备份和清空vector表数据 | 重新训练前清理旧数据 |
 
 ## 2. 核心API端点
 
@@ -106,6 +107,28 @@ unified_api.py (Flask应用)
 | `409` | 冲突 | 任务已在执行 |
 | `500` | 服务器错误 | 内部错误 |
 
+### 2.4 Vector表管理功能
+
+data_pipeline API现在支持Vector表管理功能,用于备份和清空向量数据:
+
+#### 关键参数
+- `backup_vector_tables`: 备份vector表数据到任务目录
+- `truncate_vector_tables`: 清空vector表数据(自动启用备份)
+
+#### 参数依赖关系
+- ✅ 可以单独使用 `backup_vector_tables`(仅备份,不清空)
+- ❌ `truncate_vector_tables` 不会在未备份的情况下执行清空
+- 🔄 指定 `truncate_vector_tables` 时系统会自动启用 `backup_vector_tables`,先备份再清空
+
+#### 影响的表
+- `langchain_pg_collection`: 只备份,不清空
+- `langchain_pg_embedding`: 备份并清空
+
+#### 应用场景
+- **重新训练**: 在加载新训练数据前清空旧的embedding数据
+- **数据迁移**: 备份vector数据用于环境迁移
+- **版本管理**: 保留不同版本的vector数据备份
+
 ## 3. 任务管理API
 
 ### 3.1 创建任务
@@ -125,6 +148,8 @@ unified_api.py (Flask应用)
 | `enable_llm_repair` | ❌ | boolean | `true` | 是否启用LLM修复 |
 | `modify_original_file` | ❌ | boolean | `true` | 是否修改原始文件 |
 | `enable_training_data_load` | ❌ | boolean | `true` | 是否启用训练数据加载 |
+| `backup_vector_tables` | ❌ | boolean | `false` | 是否备份vector表数据 |
+| `truncate_vector_tables` | ❌ | boolean | `false` | 是否清空vector表数据(自动启用备份) |
 
 #### 请求示例
 
@@ -139,7 +164,9 @@ curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks \
     "enable_sql_validation": true,
     "enable_llm_repair": true,
     "modify_original_file": true,
-    "enable_training_data_load": true
+    "enable_training_data_load": true,
+    "backup_vector_tables": false,
+    "truncate_vector_tables": false
   }'
 ```
 
@@ -169,6 +196,8 @@ curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks \
 |------|------|------|--------|------|
 | `execution_mode` | ❌ | enum | `complete` | 执行模式:`complete`/`step` |
 | `step_name` | ❌ | string | - | 步骤名称,步骤模式时必需 |
+| `backup_vector_tables` | ❌ | boolean | `false` | 是否备份vector表数据 |
+| `truncate_vector_tables` | ❌ | boolean | `false` | 是否清空vector表数据(自动启用备份) |
 
 #### 有效步骤名称
 
@@ -184,7 +213,17 @@ curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks \
 curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/execute \
   -H "Content-Type: application/json" \
   -d '{
-    "execution_mode": "complete"
+    "execution_mode": "complete",
+    "backup_vector_tables": false,
+    "truncate_vector_tables": false
+  }'
+
+# 执行完整工作流并清空vector表
+curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/execute \
+  -H "Content-Type: application/json" \
+  -d '{
+    "execution_mode": "complete",
+    "truncate_vector_tables": true
   }'
 
 # 执行单个步骤
@@ -192,7 +231,9 @@ curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_1430
   -H "Content-Type: application/json" \
   -d '{
     "execution_mode": "step",
-    "step_name": "ddl_generation"
+    "step_name": "ddl_generation",
+    "backup_vector_tables": false,
+    "truncate_vector_tables": false
   }'
 ```
 
@@ -238,7 +279,9 @@ curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_1430
         "completed_at": null,
         "parameters": {
             "business_context": "高速公路服务区管理系统",
-            "enable_sql_validation": true
+            "enable_sql_validation": true,
+            "backup_vector_tables": false,
+            "truncate_vector_tables": false
         },
         "current_step": {
             "execution_id": "task_20250627_143052_step_qa_generation_exec_20250627143521",
@@ -855,7 +898,9 @@ curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks \
     "table_list_file": "tables.txt",
     "business_context": "高速公路服务区管理系统",
     "db_name": "highway_db",
-    "task_name": "高速公路数据处理"
+    "task_name": "高速公路数据处理",
+    "backup_vector_tables": false,
+    "truncate_vector_tables": false
   }'
 ```
 
@@ -864,7 +909,9 @@ curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks \
 curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/execute \
   -H "Content-Type: application/json" \
   -d '{
-    "execution_mode": "complete"
+    "execution_mode": "complete",
+    "backup_vector_tables": false,
+    "truncate_vector_tables": false
   }'
 ```
 
@@ -894,7 +941,49 @@ curl "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/file
 curl -O "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/files/qs_highway_db_20250627_143052_pair.json"
 ```
 
-### 8.2 分步执行示例
+### 8.2 Vector表管理示例
+
+#### 带Vector表管理的完整工作流
+```bash
+# 创建任务并启用vector表清空
+curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks \
+  -H "Content-Type: application/json" \
+  -d '{
+    "table_list_file": "tables.txt",
+    "business_context": "高速公路服务区管理系统",
+    "db_name": "highway_db",
+    "task_name": "高速公路数据处理_清空vector",
+    "truncate_vector_tables": true
+  }'
+
+# 执行工作流(truncate_vector_tables会自动启用backup_vector_tables)
+curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/execute \
+  -H "Content-Type: application/json" \
+  -d '{
+    "execution_mode": "complete",
+    "truncate_vector_tables": true
+  }'
+
+# 检查vector表管理结果
+curl "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052"
+
+# 下载备份文件(如果有)
+curl "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/files" | \
+  jq -r '.data.files[] | select(.file_name | contains("langchain_")) | .download_url'
+```
+
+#### 仅备份Vector表
+```bash
+curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/execute \
+  -H "Content-Type: application/json" \
+  -d '{
+    "execution_mode": "complete",
+    "backup_vector_tables": true,
+    "truncate_vector_tables": false
+  }'
+```
+
+### 8.3 分步执行示例
 
 #### 步骤1: 创建任务(无表清单)
 ```bash
@@ -932,7 +1021,7 @@ curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_1430
   }'
 ```
 
-### 8.3 数据库工具使用示例
+### 8.4 数据库工具使用示例
 
 #### 获取数据库表列表
 ```bash
@@ -955,7 +1044,7 @@ curl -X POST http://localhost:8084/api/v0/database/table/ddl \
   }'
 ```
 
-### 8.4 JavaScript客户端示例
+### 8.5 JavaScript客户端示例
 
 ```javascript
 class DataPipelineAPI {
@@ -1041,7 +1130,9 @@ async function runDataPipelineWorkflow() {
             table_list_file: 'tables.txt',
             business_context: '高速公路服务区管理系统',
             db_name: 'highway_db',
-            task_name: '高速公路数据处理'
+            task_name: '高速公路数据处理',
+            backup_vector_tables: false,
+            truncate_vector_tables: false
         });
         
         const taskId = createResult.data.task_id;
@@ -1065,7 +1156,7 @@ async function runDataPipelineWorkflow() {
 }
 ```
 
-### 8.5 Python客户端示例
+### 8.6 Python客户端示例
 
 ```python
 import requests
@@ -1171,7 +1262,9 @@ def run_data_pipeline_workflow():
             table_list_file='tables.txt',
             business_context='高速公路服务区管理系统',
             db_name='highway_db',
-            task_name='高速公路数据处理'
+            task_name='高速公路数据处理',
+            backup_vector_tables=False,
+            truncate_vector_tables=False
         )
         
         task_id = create_result['data']['task_id']
@@ -1218,6 +1311,8 @@ if __name__ == '__main__':
 | `INVALID_STEP_NAME` | 400 | 无效的步骤名称 | 使用有效的步骤名称 |
 | `FILE_NOT_FOUND` | 404 | 文件不存在 | 检查文件名是否正确 |
 | `DATABASE_CONNECTION_ERROR` | 500 | 数据库连接失败 | 检查数据库配置 |
+| `VECTOR_BACKUP_FAILED` | 500 | Vector表备份失败 | 检查数据库连接和磁盘空间 |
+| `VECTOR_TRUNCATE_FAILED` | 500 | Vector表清空失败 | 检查数据库权限 |
 
 ### 9.2 错误响应格式
 
@@ -1370,19 +1465,33 @@ task_name_patterns = {
         "enable_sql_validation": true,
         "enable_llm_repair": true,
         "modify_original_file": true,
-        "enable_training_data_load": true
+        "enable_training_data_load": true,
+        "backup_vector_tables": false,
+        "truncate_vector_tables": false
     },
     "调试配置": {
         "enable_sql_validation": false,
         "enable_llm_repair": false,
         "modify_original_file": false,
-        "enable_training_data_load": false
+        "enable_training_data_load": false,
+        "backup_vector_tables": false,
+        "truncate_vector_tables": false
     },
     "快速配置": {
         "enable_sql_validation": true,
         "enable_llm_repair": false,
         "modify_original_file": false,
-        "enable_training_data_load": true
+        "enable_training_data_load": true,
+        "backup_vector_tables": false,
+        "truncate_vector_tables": false
+    },
+    "Vector清理配置": {
+        "enable_sql_validation": true,
+        "enable_llm_repair": true,
+        "modify_original_file": true,
+        "enable_training_data_load": true,
+        "backup_vector_tables": true,
+        "truncate_vector_tables": true
     }
 }
 ```

+ 171 - 53
docs/data_pipeline_脚本化调用指南.md

@@ -9,11 +9,12 @@
 1. [模块架构概览](#1-模块架构概览)
 2. [核心脚本入口](#2-核心脚本入口)
 3. [一键工作流脚本](#3-一键工作流脚本)
-4. [分步执行脚本](#4-分步执行脚本)
-5. [日志配置](#5-日志配置)
-6. [配置文件](#6-配置文件)
-7. [使用示例](#7-使用示例)
-8. [故障排查](#8-故障排查)
+4. [Vector表管理功能](#4-vector表管理功能)
+5. [分步执行脚本](#5-分步执行脚本)
+6. [日志配置](#6-日志配置)
+7. [配置文件](#7-配置文件)
+8. [使用示例](#8-使用示例)
+9. [故障排查](#9-故障排查)
 
 ## 1. 模块架构概览
 
@@ -93,6 +94,8 @@ python data_pipeline/schema_workflow.py [参数]
 | `--disable-llm-repair` | flag | `False` | 禁用LLM修复功能 |
 | `--no-modify-file` | flag | `False` | 不修改原始JSON文件 |
 | `--skip-training-load` | flag | `False` | 跳过训练数据加载步骤 |
+| `--backup-vector-tables` | flag | `False` | 备份vector表数据到任务目录 |
+| `--truncate-vector-tables` | flag | `False` | 清空vector表数据(自动启用备份) |
 | `--verbose` | flag | `False` | 启用详细日志输出 |
 | `--log-file` | string | 无 | 指定日志文件路径 |
 
@@ -134,9 +137,105 @@ python -m data_pipeline.schema_workflow \
   --log-file ./logs/workflow.log
 ```
 
-## 4. 分步执行脚本
+#### Vector表管理
+```bash
+# 仅备份vector表
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" \
+  --table-list ./data_pipeline/tables.txt \
+  --business-context "高速公路服务区管理系统" \
+  --skip-training-load \
+  --backup-vector-tables
+
+# 备份并清空vector表(自动启用备份)
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" \
+  --table-list ./data_pipeline/tables.txt \
+  --business-context "高速公路服务区管理系统" \
+  --skip-training-load \
+  --truncate-vector-tables
+```
+
+## 4. Vector表管理功能
+
+### 4.1 功能概述
+
+data_pipeline现在支持Vector表管理功能,用于备份和清空向量数据:
 
-### 4.1 DDL/MD文档生成
+#### 关键参数
+- `--backup-vector-tables`: 备份vector表数据到任务目录
+- `--truncate-vector-tables`: 清空vector表数据(自动启用备份)
+
+#### 参数依赖关系
+- ✅ 可以单独使用 `--backup-vector-tables`
+- ❌ 不能单独使用 `--truncate-vector-tables`(清空操作前必须先完成备份)
+- 🔄 使用 `--truncate-vector-tables` 时自动启用 `--backup-vector-tables`
+
+#### 影响的表
+- `langchain_pg_collection`: 只备份,不清空
+- `langchain_pg_embedding`: 备份并清空
+
+#### 应用场景
+- **重新训练**: 在加载新训练数据前清空旧的embedding数据
+- **数据迁移**: 备份vector数据用于环境迁移
+- **版本管理**: 保留不同版本的vector数据备份
+
+### 4.2 支持的脚本
+
+| 脚本 | 支持备份 | 支持清空 | 说明 |
+|------|----------|----------|------|
+| `schema_workflow.py` | ✅ | ✅ | 完整工作流,独立执行Vector管理 |
+| `trainer/run_training.py` | ✅ | ✅ | 训练前执行Vector管理 |
+
+### 4.3 使用示例
+
+#### 一键工作流 + Vector管理
+```bash
+# 完整工作流 + 清空vector表
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" \
+  --table-list ./data_pipeline/tables.txt \
+  --business-context "高速公路服务区管理系统" \
+  --truncate-vector-tables
+
+# 跳过训练加载,仅执行vector管理
+python -m data_pipeline.schema_workflow \
+  --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" \
+  --table-list ./data_pipeline/tables.txt \
+  --business-context "高速公路服务区管理系统" \
+  --skip-training-load \
+  --backup-vector-tables
+```
+
+#### 独立训练脚本 + Vector管理
+```bash
+# 训练前清空vector表
+python -m data_pipeline.trainer.run_training \
+  --data_path ./data_pipeline/training_data/manual_20250627_143052/ \
+  --truncate-vector-tables
+
+# 仅备份vector表
+python -m data_pipeline.trainer.run_training \
+  --data_path ./data_pipeline/training_data/manual_20250627_143052/ \
+  --backup-vector-tables
+```
+
+### 4.4 输出文件
+
+启用Vector表管理后,会在任务目录下生成备份文件:
+
+```
+data_pipeline/training_data/manual_20250627_143052/
+├── vector_bak/                                 # Vector表备份目录
+│   ├── langchain_pg_collection_20250627_143052.csv
+│   ├── langchain_pg_embedding_20250627_143052.csv
+│   └── vector_backup_log.txt                  # 备份操作详细日志
+└── data_pipeline.log                          # 包含Vector管理的执行日志
+```
+
+## 5. 分步执行脚本
+
+### 5.1 DDL/MD文档生成
 
 **入口点**: `data_pipeline/ddl_generation/ddl_md_generator.py`
 
@@ -177,7 +276,7 @@ python -m data_pipeline.ddl_generation.ddl_md_generator \
   --check-permissions-only
 ```
 
-### 4.2 Question-SQL对生成
+### 5.2 Question-SQL对生成
 
 **入口点**: `data_pipeline/qa_generation/qs_generator.py`
 
@@ -210,7 +309,7 @@ python -m data_pipeline.qa_generation.qs_generator \
   --verbose
 ```
 
-### 4.3 SQL验证工具
+### 5.3 SQL验证工具
 
 **入口点**: `data_pipeline/validators/sql_validate_cli.py`
 
@@ -254,7 +353,7 @@ python -m data_pipeline.validators.sql_validate_cli \
   --verbose
 ```
 
-### 4.4 训练数据加载
+### 5.4 训练数据加载
 
 **入口点**: `data_pipeline/trainer/run_training.py`
 
@@ -263,6 +362,8 @@ python -m data_pipeline.validators.sql_validate_cli \
 | 参数 | 必需 | 类型 | 默认值 | 说明 |
 |------|------|------|--------|------|
 | `--data_path` | ❌ | string | config值 | 训练数据目录路径 |
+| `--backup-vector-tables` | ❌ | flag | `False` | 备份vector表数据 |
+| `--truncate-vector-tables` | ❌ | flag | `False` | 清空vector表数据(自动启用备份) |
 
 #### 支持的文件格式
 
@@ -281,18 +382,24 @@ python -m data_pipeline.trainer.run_training
 # 指定路径
 python -m data_pipeline.trainer.run_training \
   --data_path ./data_pipeline/training_data/task_20250627_143052/
+
+# 带vector表管理
+python -m data_pipeline.trainer.run_training \
+  --data_path ./data_pipeline/training_data/task_20250627_143052/ \
+  --backup-vector-tables \
+  --truncate-vector-tables
 ```
 
-## 5. 日志配置
+## 6. 日志配置
 
-### 5.1 日志系统架构
+### 6.1 日志系统架构
 
 data_pipeline使用统一的日志管理系统,支持两种模式:
 
 1. **脚本模式**: 生成`manual_YYYYMMDD_HHMMSS`格式的task_id
 2. **API模式**: 使用传入的task_id
 
-### 5.2 日志文件位置
+### 6.2 日志文件位置
 
 #### 脚本模式日志
 ```
@@ -315,9 +422,9 @@ logs/
 └── data_pipeline.log                    # data_pipeline模块日志(已弃用)
 ```
 
-### 5.3 日志配置方式
+### 6.3 日志配置方式
 
-#### 5.3.1 使用内置日志系统
+#### 6.3.1 使用内置日志系统
 
 ```python
 from data_pipeline.dp_logging import get_logger
@@ -330,7 +437,7 @@ logger = get_logger("SchemaWorkflow", task_id)
 logger = get_logger("SchemaWorkflow", "task_20250627_143052")
 ```
 
-#### 5.3.2 日志级别配置
+#### 6.3.2 日志级别配置
 
 | 级别 | 用途 | 输出位置 |
 |------|------|----------|
@@ -340,7 +447,7 @@ logger = get_logger("SchemaWorkflow", "task_20250627_143052")
 | `ERROR` | 错误信息 | 控制台 + 文件 |
 | `CRITICAL` | 严重错误 | 控制台 + 文件 |
 
-#### 5.3.3 日志格式
+#### 6.3.3 日志格式
 
 ```
 2025-07-01 14:30:52 [INFO] [SchemaWorkflowOrchestrator] schema_workflow.py:123 - 🚀 开始执行Schema工作流编排
@@ -349,9 +456,9 @@ logger = get_logger("SchemaWorkflow", "task_20250627_143052")
 2025-07-01 14:31:25 [ERROR] [TrainingDataLoader] run_training.py:234 - 训练数据加载失败: 连接超时
 ```
 
-### 5.4 日志配置参数
+### 6.4 日志配置参数
 
-#### 5.4.1 命令行参数
+#### 6.4.1 命令行参数
 
 ```bash
 # 启用详细日志
@@ -365,7 +472,7 @@ python -m data_pipeline.schema_workflow \
   [其他参数]
 ```
 
-#### 5.4.2 环境变量配置
+#### 6.4.2 环境变量配置
 
 ```bash
 # 设置日志级别
@@ -375,7 +482,7 @@ export DATA_PIPELINE_LOG_LEVEL=DEBUG
 export DATA_PIPELINE_LOG_DIR=./logs/data_pipeline/
 ```
 
-#### 5.4.3 编程方式配置
+#### 6.4.3 编程方式配置
 
 ```python
 import logging
@@ -389,16 +496,16 @@ logger = get_logger("CustomModule", "manual_20250627_143052")
 logger.info("自定义日志消息")
 ```
 
-## 6. 配置文件
+## 7. 配置文件
 
-### 6.1 主配置文件
+### 7.1 主配置文件
 
 **位置**: `data_pipeline/config.py`  
 **变量**: `SCHEMA_TOOLS_CONFIG`
 
-### 6.2 主要配置项
+### 7.2 主要配置项
 
-#### 6.2.1 核心配置
+#### 7.2.1 核心配置
 
 ```python
 {
@@ -408,7 +515,7 @@ logger.info("自定义日志消息")
 }
 ```
 
-#### 6.2.2 处理链配置
+#### 7.2.2 处理链配置
 
 ```python
 {
@@ -420,7 +527,7 @@ logger.info("自定义日志消息")
 }
 ```
 
-#### 6.2.3 数据处理配置
+#### 7.2.3 数据处理配置
 
 ```python
 {
@@ -431,7 +538,7 @@ logger.info("自定义日志消息")
 }
 ```
 
-#### 6.2.4 并发配置
+#### 7.2.4 并发配置
 
 ```python
 {
@@ -440,7 +547,7 @@ logger.info("自定义日志消息")
 }
 ```
 
-#### 6.2.5 Question-SQL生成配置
+#### 7.2.5 Question-SQL生成配置
 
 ```python
 {
@@ -455,7 +562,7 @@ logger.info("自定义日志消息")
 }
 ```
 
-#### 6.2.6 SQL验证配置
+#### 7.2.6 SQL验证配置
 
 ```python
 {
@@ -470,13 +577,13 @@ logger.info("自定义日志消息")
 }
 ```
 
-### 6.3 配置优先级
+### 7.3 配置优先级
 
 ```
 命令行参数 > data_pipeline/config.py > 默认值
 ```
 
-### 6.4 修改配置
+### 7.4 修改配置
 
 #### 方法1: 直接修改配置文件
 
@@ -497,9 +604,9 @@ update_config({
 })
 ```
 
-## 7. 使用示例
+## 8. 使用示例
 
-### 7.1 典型工作流场景
+### 8.1 典型工作流场景
 
 #### 场景1: 首次处理新数据库
 ```bash
@@ -537,6 +644,12 @@ python -m data_pipeline.validators.sql_validate_cli \
   --db-connection "postgresql://user:pass@localhost:5432/db" \
   --input-file ./debug_output/qs_test_db_*.json \
   --verbose
+
+# 4. 仅训练数据加载 + vector表管理
+python -m data_pipeline.trainer.run_training \
+  --data_path ./debug_output/ \
+  --backup-vector-tables \
+  --truncate-vector-tables
 ```
 
 #### 场景3: 生产环境批量处理
@@ -556,6 +669,7 @@ for db in "${DATABASES[@]}"; do
       --table-list "./tables_${db}.txt" \
       --business-context "${db}业务系统" \
       --output-dir "./output/${db}/" \
+      --truncate-vector-tables \
       --verbose
     
     if [ $? -eq 0 ]; then
@@ -570,7 +684,7 @@ chmod +x process_databases.sh
 ./process_databases.sh
 ```
 
-### 7.2 表清单文件格式
+### 8.2 表清单文件格式
 
 #### 基本格式
 ```
@@ -602,7 +716,7 @@ bss_company              # 公司信息
 bss_service_area         # 服务区信息
 ```
 
-### 7.3 输出文件结构
+### 8.3 输出文件结构
 
 ```
 data_pipeline/training_data/manual_20250627_143052/
@@ -616,15 +730,19 @@ data_pipeline/training_data/manual_20250627_143052/
 ├── sql_validation_20250627_143052_summary.log  # SQL验证摘要
 ├── sql_validation_20250627_143052_report.json  # SQL验证详细报告
 ├── file_modifications_20250627_143052.log      # 文件修改日志
+├── vector_bak/                                 # Vector表备份目录(如果启用)
+│   ├── langchain_pg_collection_20250627_143052.csv
+│   ├── langchain_pg_embedding_20250627_143052.csv
+│   └── vector_backup_log.txt
 ├── metadata.txt                                # 元数据文件
 └── filename_mapping.txt                        # 文件名映射
 ```
 
-## 8. 故障排查
+## 9. 故障排查
 
-### 8.1 常见错误
+### 9.1 常见错误
 
-#### 8.1.1 表数量超过限制
+#### 9.1.1 表数量超过限制
 ```
 错误信息: 表数量(25)超过限制(20)。请分批处理或调整配置中的max_tables参数。
 
@@ -633,7 +751,7 @@ data_pipeline/training_data/manual_20250627_143052/
 2. 修改配置:在config.py中增加max_tables限制
 ```
 
-#### 8.1.2 DDL和MD文件数量不一致
+#### 9.1.2 DDL和MD文件数量不一致
 ```
 错误信息: DDL文件数量(5)与表数量(6)不一致
 
@@ -643,7 +761,7 @@ data_pipeline/training_data/manual_20250627_143052/
 3. 检查数据库权限
 ```
 
-#### 8.1.3 LLM调用失败
+#### 9.1.3 LLM调用失败
 ```
 错误信息: LLM调用超时或失败
 
@@ -654,7 +772,7 @@ data_pipeline/training_data/manual_20250627_143052/
 4. 检查LLM服务配置
 ```
 
-#### 8.1.4 权限不足
+#### 9.1.4 权限不足
 ```
 错误信息: 数据库查询权限不足
 
@@ -664,9 +782,9 @@ data_pipeline/training_data/manual_20250627_143052/
 3. Data Pipeline支持只读数据库
 ```
 
-### 8.2 日志分析
+### 9.2 日志分析
 
-#### 8.2.1 查看详细日志
+#### 9.2.1 查看详细日志
 ```bash
 # 查看最新的任务日志
 find data_pipeline/training_data/ -name "data_pipeline.log" -exec ls -t {} + | head -1 | xargs tail -f
@@ -678,7 +796,7 @@ grep -i "error" data_pipeline/training_data/manual_*/data_pipeline.log
 grep "bss_company" data_pipeline/training_data/manual_*/data_pipeline.log
 ```
 
-#### 8.2.2 日志级别调整
+#### 9.2.2 日志级别调整
 ```bash
 # 启用DEBUG级别日志
 python -m data_pipeline.schema_workflow \
@@ -686,9 +804,9 @@ python -m data_pipeline.schema_workflow \
   [其他参数]
 ```
 
-### 8.3 性能优化
+### 9.3 性能优化
 
-#### 8.3.1 并发配置调优
+#### 9.3.1 并发配置调优
 ```python
 # 在config.py中调整
 "max_concurrent_tables": 2,              # 增加并发数(谨慎)
@@ -696,14 +814,14 @@ python -m data_pipeline.schema_workflow \
 "batch_size": 20                         # 增加批处理大小
 ```
 
-#### 8.3.2 数据采样优化
+#### 9.3.2 数据采样优化
 ```python
 # 减少采样数据量
 "sample_data_limit": 10,                 # 从20减少到10
 "enum_detection_sample_limit": 1000      # 从5000减少到1000
 ```
 
-#### 8.3.3 跳过耗时步骤
+#### 9.3.3 跳过耗时步骤
 ```bash
 # 跳过SQL验证
 python -m data_pipeline.schema_workflow \
@@ -716,9 +834,9 @@ python -m data_pipeline.schema_workflow \
   [其他参数]
 ```
 
-### 8.4 环境检查
+### 9.4 环境检查
 
-#### 8.4.1 依赖检查
+#### 9.4.1 依赖检查
 ```bash
 # 检查Python版本
 python --version
@@ -738,7 +856,7 @@ asyncio.run(test())
 "
 ```
 
-#### 8.4.2 权限检查
+#### 9.4.2 权限检查
 ```bash
 # 使用内置权限检查工具
 python -m data_pipeline.ddl_generation.ddl_md_generator \
@@ -746,7 +864,7 @@ python -m data_pipeline.ddl_generation.ddl_md_generator \
   --check-permissions-only
 ```
 
-#### 8.4.3 磁盘空间检查
+#### 9.4.3 磁盘空间检查
 ```bash
 # 检查输出目录空间
 df -h data_pipeline/training_data/

+ 36 - 5
docs/vector_table_management_design.md

@@ -3,15 +3,15 @@
 ## 概述
 
 为 data_pipeline 添加两个新参数来管理 vector 表数据:
-- `--backup_vector_tables`: 备份vector表数据
-- `--truncate_vector_tables`: 清空vector表数据(自动启用备份)
+- `--backup-vector-tables`: 备份vector表数据
+- `--truncate-vector-tables`: 清空vector表数据(自动启用备份)
 
 ## 需求分析
 
 ### 1. 参数依赖关系
-- 可以单独使用 `--backup_vector_tables`
-- 不可以单独使用 `--truncate_vector_tables`
-- 使用 `--truncate_vector_tables` 时自动启用 `--backup_vector_tables`
+- 可以单独使用 `--backup-vector-tables`
+- 不可以单独使用 `--truncate-vector-tables`(清空操作前必须先完成备份)
+- 使用 `--truncate-vector-tables` 时自动启用 `--backup-vector-tables`
 
 ### 2. 支持的执行入口
 1. `python -m data_pipeline.schema_workflow`(包括使用 `--skip-training-load` 的情况)
@@ -947,4 +947,35 @@ SCHEMA_TOOLS_CONFIG = {
     - 传递给 `run_training` 时禁用vector管理参数(设为False)
     - 确保操作只执行一次
 
+#### 第四轮修正:
+13. **参数命名一致性**: 根据实际代码修正了文档中的命令行参数写法
+    - 统一使用连字符格式:`--backup-vector-tables` 和 `--truncate-vector-tables`
+    - 修正了概述和需求分析部分的参数名称
+    - 确保文档与实际代码实现的一致性
+
+## 🎯 **正确的使用示例**
+
+### 命令行使用 (注意使用连字符):
+
+```bash
+# 1. 完整工作流 + 备份和清空vector表
+python -m data_pipeline.schema_workflow --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" --table-list ./data_pipeline/tables.txt --business-context "高速公路服务区管理系统" --truncate-vector-tables
+
+# 2. 跳过训练但执行vector表管理
+python -m data_pipeline.schema_workflow --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" --table-list ./data_pipeline/tables.txt --business-context "高速公路服务区管理系统" --skip-training-load --backup-vector-tables
+
+# 3. 跳过训练并清空vector表
+python -m data_pipeline.schema_workflow --db-connection "postgresql://postgres:postgres@localhost:6432/highway_db" --table-list ./data_pipeline/tables.txt --business-context "高速公路服务区管理系统" --skip-training-load --truncate-vector-tables
+
+# 4. 独立训练脚本 + vector表管理
+python -m data_pipeline.trainer.run_training --data_path "./training_data/" --backup-vector-tables --truncate-vector-tables
+
+# 5. 只备份不清空
+python -m data_pipeline.trainer.run_training --data_path "./training_data/" --backup-vector-tables
+```
+
+### 参数说明:
+- `--backup-vector-tables`: 备份 langchain_pg_collection 和 langchain_pg_embedding 表
+- `--truncate-vector-tables`: 清空 langchain_pg_embedding 表(自动启用备份)
+
 核心原则是**安全优先**,确保在任何情况下都不会意外丢失数据。 

+ 120 - 0
test_vector_backup_only.py

@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+"""
+独立测试Vector表备份功能
+只备份langchain_pg_collection和langchain_pg_embedding表
+"""
+
import asyncio
import os
import sys
from datetime import datetime
from pathlib import Path
+
+
async def test_vector_backup():
    """Standalone check of the vector-table backup feature.

    Creates a local output directory, runs VectorTableManager in
    backup-only mode (truncate disabled) against the database configured
    in ``data_pipeline.config``, prints a human-readable summary of the
    result and of the files produced under ``vector_bak/``.

    Returns:
        bool: True when the backup ran to completion, False on any
        failure (the exception is printed with its traceback, never
        re-raised, so this script always reports instead of crashing).
    """
    print("🧪 开始测试Vector表备份功能...")
    print("=" * 50)

    # 1. Prepare the test output directory.
    #    parents=True so the script also works from a fresh checkout where
    #    intermediate directories may not exist yet.
    test_dir = Path("./test_vector_backup_output")
    test_dir.mkdir(parents=True, exist_ok=True)

    print(f"📁 测试输出目录: {test_dir.resolve()}")

    try:
        # 2. Import lazily so a missing/broken module is reported as a
        #    test failure below instead of crashing at module import time.
        from data_pipeline.trainer.vector_table_manager import VectorTableManager

        # 3. Create the manager with a unique, timestamped task id.
        task_id = f"test_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        vector_manager = VectorTableManager(
            task_output_dir=str(test_dir),
            task_id=task_id
        )

        print(f"🆔 任务ID: {task_id}")
        print("🔧 VectorTableManager 创建成功")

        # 4. Execute backup only — truncate is intentionally off so this
        #    test never destroys data.
        print("\n🗂️ 开始执行备份...")
        result = await vector_manager.execute_vector_management(
            backup=True,    # perform the backup
            truncate=False  # do NOT clear the tables
        )

        # 5. Report the outcome.
        print("\n📊 备份结果:")
        print("=" * 30)

        if result.get("backup_performed", False):
            print("✅ 备份状态: 已执行")

            tables_info = result.get("tables_backed_up", {})
            for table_name, info in tables_info.items():
                if info.get("success", False):
                    # Use .get() for every key: a partially-filled entry
                    # must not abort the whole report with a KeyError.
                    row_count = info.get("row_count", "?")
                    backup_file = info.get("backup_file", "?")
                    file_size = info.get("file_size", "?")
                    print(f"  ✅ {table_name}: {row_count}行 -> {backup_file} ({file_size})")
                else:
                    print(f"  ❌ {table_name}: 失败 - {info.get('error', '未知错误')}")
        else:
            print("❌ 备份状态: 未执行")

        duration = result.get("duration", 0)
        print(f"⏱️  总耗时: {duration:.2f}秒")

        errors = result.get("errors", [])
        if errors:
            print(f"⚠️  错误信息: {'; '.join(errors)}")

        # 6. Inspect the files actually produced on disk, independently of
        #    what the result dict claims.
        backup_dir = test_dir / "vector_bak"
        if backup_dir.exists():
            print(f"\n📂 备份文件目录: {backup_dir.resolve()}")
            backup_files = list(backup_dir.glob("*.csv"))
            if backup_files:
                print("📄 生成的备份文件:")
                for file in backup_files:
                    file_size = file.stat().st_size
                    print(f"  📄 {file.name} ({file_size} bytes)")
            else:
                print("⚠️  未找到CSV备份文件")

            log_files = list(backup_dir.glob("*.txt"))
            if log_files:
                print("📋 日志文件:")
                for file in log_files:
                    print(f"  📋 {file.name}")
        else:
            print("❌ 备份目录不存在")

        print("\n🎉 测试完成!")
        return True

    except Exception as e:
        print(f"\n❌ 测试失败: {e}")
        import traceback
        print("详细错误信息:")
        print(traceback.format_exc())
        return False
+
+
def main():
    """CLI entry point: run the async backup test and exit with a status code.

    Exit status 0 means the backup test passed; 1 means it failed.
    """
    print("Vector表备份功能独立测试")
    print("测试目标: langchain_pg_collection, langchain_pg_embedding")
    print("数据库: 从 data_pipeline.config 自动获取连接配置")
    print()

    # Drive the coroutine to completion on a fresh event loop.
    success = asyncio.run(test_vector_backup())

    if success:
        print("\n✅ 所有测试通过!")
        # sys.exit, not builtins exit(): exit() is only injected by the
        # site module and is not guaranteed in every interpreter setup.
        sys.exit(0)
    else:
        print("\n❌ 测试失败!")
        sys.exit(1)


if __name__ == "__main__":
    main()

Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików