
Changed the data_pipeline calling API. Ran into problems with log management and plan to refactor this module's logging.

wangxq, 1 week ago
commit 6313111c3c
37 changed files with 6077 additions and 6 deletions
  1. .claude/settings.local.json (+2 -1)
  2. citu_app.py (+474 -1)
  3. config/logging_config.yaml (+9 -4)
  4. core/logging/log_manager.py (+13 -0)
  5. data_pipeline/api/__init__.py (+9 -0)
  6. data_pipeline/api/simple_db_manager.py (+334 -0)
  7. data_pipeline/api/simple_file_manager.py (+182 -0)
  8. data_pipeline/api/simple_workflow.py (+521 -0)
  9. data_pipeline/sql/init_tables.sql (+346 -0)
  10. data_pipeline/task_executor.py (+78 -0)
  11. data_pipeline/training_data/task_20250701_131627/bss_business_day_data.ddl (+31 -0)
  12. data_pipeline/training_data/task_20250701_131627/bss_business_day_data_detail.md (+32 -0)
  13. data_pipeline/training_data/task_20250701_131627/bss_car_day_count.ddl (+17 -0)
  14. data_pipeline/training_data/task_20250701_131627/bss_car_day_count_detail.md (+18 -0)
  15. data_pipeline/training_data/task_20250701_131627/bss_company.ddl (+15 -0)
  16. data_pipeline/training_data/task_20250701_131627/bss_company_detail.md (+15 -0)
  17. data_pipeline/training_data/task_20250701_131627/bss_section_route.ddl (+16 -0)
  18. data_pipeline/training_data/task_20250701_131627/bss_section_route_area_link.ddl (+7 -0)
  19. data_pipeline/training_data/task_20250701_131627/bss_section_route_area_link_detail.md (+7 -0)
  20. data_pipeline/training_data/task_20250701_131627/bss_section_route_detail.md (+16 -0)
  21. data_pipeline/training_data/task_20250701_131627/bss_service_area.ddl (+19 -0)
  22. data_pipeline/training_data/task_20250701_131627/bss_service_area_detail.md (+21 -0)
  23. data_pipeline/training_data/task_20250701_131627/bss_service_area_mapper.ddl (+18 -0)
  24. data_pipeline/training_data/task_20250701_131627/bss_service_area_mapper_detail.md (+19 -0)
  25. data_pipeline/training_data/task_20250701_131627/db_query_decision_prompt.txt (+10 -0)
  26. data_pipeline/training_data/task_20250701_131627/filename_mapping.txt (+10 -0)
  27. data_pipeline/training_data/task_20250701_131627/metadata.txt (+62 -0)
  28. data_pipeline/training_data/task_20250701_131627/metadata_detail.md (+20 -0)
  29. data_pipeline/training_data/task_20250701_131627/qs_highway_db_20250701_134736_pair.json (+190 -0)
  30. data_pipeline/training_data/task_20250701_131627/qs_highway_db_20250701_134736_pair.json.backup (+202 -0)
  31. data_pipeline/training_data/task_20250701_131627/task_config.json (+14 -0)
  32. data_pipeline/training_data/task_20250701_131627/task_result.json (+88 -0)
  33. docs/data_pipeline_api_config_changes.md (+179 -0)
  34. docs/data_pipeline_api_design.md (+1204 -0)
  35. docs/data_pipeline_api_detailed_design.md (+1136 -0)
  36. docs/data_pipeline_api_usage_guide.md (+637 -0)
  37. test_api_changes.py (+106 -0)

+ 2 - 1
.claude/settings.local.json

@@ -17,7 +17,8 @@
       "Bash(mv:*)",
       "Bash(rm:*)",
       "Bash(.venv/bin/python:*)",
-      "Bash(./.venv/Scripts/python.exe:*)"
+      "Bash(./.venv/Scripts/python.exe:*)",
+      "Bash(sed:*)"
     ],
     "deny": []
   }

+ 474 - 1
citu_app.py

@@ -2755,5 +2755,478 @@ const chatSession = new ChatSession();
 chatSession.askQuestion("各年龄段客户的流失率如何?");
 """
 
+# ==================== Data Pipeline API ====================
+
+# 导入简化的Data Pipeline模块
+import asyncio
+import os
+from threading import Thread
+from flask import send_file
+
+from data_pipeline.api.simple_workflow import SimpleWorkflowManager
+from data_pipeline.api.simple_file_manager import SimpleFileManager
+
+# 创建简化的管理器
+data_pipeline_manager = None
+data_pipeline_file_manager = None
+
+def get_data_pipeline_manager():
+    """获取Data Pipeline管理器单例"""
+    global data_pipeline_manager
+    if data_pipeline_manager is None:
+        data_pipeline_manager = SimpleWorkflowManager()
+    return data_pipeline_manager
+
+def get_data_pipeline_file_manager():
+    """获取Data Pipeline文件管理器单例"""
+    global data_pipeline_file_manager
+    if data_pipeline_file_manager is None:
+        data_pipeline_file_manager = SimpleFileManager()
+    return data_pipeline_file_manager
+
+# ==================== 简化的Data Pipeline API端点 ====================
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks', methods=['POST'])
+def create_data_pipeline_task():
+    """创建数据管道任务"""
+    try:
+        req = request.get_json(force=True)
+        
+        # 验证必需参数 - 移除db_connection,改为使用app_config配置
+        required_params = ['table_list_file', 'business_context']
+        missing_params = [param for param in required_params if not req.get(param)]
+        
+        if missing_params:
+            return jsonify(bad_request_response(
+                response_text=f"缺少必需参数: {', '.join(missing_params)}",
+                missing_params=missing_params
+            )), 400
+        
+        # 创建任务(自动使用app_config中的数据库配置)
+        manager = get_data_pipeline_manager()
+        task_id = manager.create_task(
+            table_list_file=req.get('table_list_file'),
+            business_context=req.get('business_context'),
+            db_name=req.get('db_name'),  # 可选参数,用于指定特定数据库名称
+            enable_sql_validation=req.get('enable_sql_validation', True),
+            enable_llm_repair=req.get('enable_llm_repair', True),
+            modify_original_file=req.get('modify_original_file', True),
+            enable_training_data_load=req.get('enable_training_data_load', True)
+        )
+        
+        # 获取任务信息
+        task_info = manager.get_task_status(task_id)
+        
+        response_data = {
+            "task_id": task_id,
+            "status": task_info.get('status'),
+            "created_at": task_info.get('created_at').isoformat() if task_info.get('created_at') else None
+        }
+        
+        return jsonify(success_response(
+            response_text="任务创建成功",
+            data=response_data
+        )), 201
+        
+    except Exception as e:
+        logger.error(f"创建数据管道任务失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="创建任务失败,请稍后重试"
+        )), 500
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/execute', methods=['POST'])
+def execute_data_pipeline_task(task_id):
+    """执行数据管道任务"""
+    try:
+        req = request.get_json(force=True) if request.is_json else {}
+        execution_mode = req.get('execution_mode', 'complete')
+        step_name = req.get('step_name')
+        
+        # 验证执行模式
+        if execution_mode not in ['complete', 'step']:
+            return jsonify(bad_request_response(
+                response_text="无效的执行模式,必须是 'complete' 或 'step'",
+                invalid_params=['execution_mode']
+            )), 400
+        
+        # 如果是步骤执行模式,验证步骤名称
+        if execution_mode == 'step':
+            if not step_name:
+                return jsonify(bad_request_response(
+                    response_text="步骤执行模式需要指定step_name",
+                    missing_params=['step_name']
+                )), 400
+            
+            valid_steps = ['ddl_generation', 'qa_generation', 'sql_validation', 'training_load']
+            if step_name not in valid_steps:
+                return jsonify(bad_request_response(
+                    response_text=f"无效的步骤名称,支持的步骤: {', '.join(valid_steps)}",
+                    invalid_params=['step_name']
+                )), 400
+        
+        # 检查任务是否存在
+        manager = get_data_pipeline_manager()
+        task_info = manager.get_task_status(task_id)
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 使用subprocess启动独立进程执行任务
+        def run_task_subprocess():
+            try:
+                import subprocess
+                import sys
+                from pathlib import Path
+                
+                # 构建执行命令
+                python_executable = sys.executable
+                script_path = Path(__file__).parent / "data_pipeline" / "task_executor.py"
+                
+                cmd = [
+                    python_executable,
+                    str(script_path),
+                    "--task-id", task_id,
+                    "--execution-mode", execution_mode
+                ]
+                
+                if step_name:
+                    cmd.extend(["--step-name", step_name])
+                
+                logger.info(f"启动任务进程: {' '.join(cmd)}")
+                
+                # 启动后台进程(不等待完成)
+                process = subprocess.Popen(
+                    cmd,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    text=True,
+                    cwd=Path(__file__).parent
+                )
+                
+                logger.info(f"任务进程已启动: PID={process.pid}, task_id={task_id}")
+                
+            except Exception as e:
+                logger.error(f"启动任务进程失败: {task_id}, 错误: {str(e)}")
+        
+        # 在新线程中启动subprocess(避免阻塞API响应)
+        thread = Thread(target=run_task_subprocess, daemon=True)
+        thread.start()
+        
+        response_data = {
+            "task_id": task_id,
+            "execution_mode": execution_mode,
+            "step_name": step_name if execution_mode == 'step' else None,
+            "message": "任务正在后台执行,请通过状态接口查询进度"
+        }
+        
+        return jsonify(success_response(
+            response_text="任务执行已启动",
+            data=response_data
+        )), 202
+        
+    except Exception as e:
+        logger.error(f"启动数据管道任务执行失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="启动任务执行失败,请稍后重试"
+        )), 500
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>', methods=['GET'])
+def get_data_pipeline_task_status(task_id):
+    """
+    获取数据管道任务状态
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "获取任务状态成功",
+        "data": {
+            "task_id": "task_20250627_143052",
+            "status": "in_progress",
+            "step_status": {
+                "ddl_generation": "completed",
+                "qa_generation": "running",
+                "sql_validation": "pending",
+                "training_load": "pending"
+            },
+            "created_at": "2025-06-27T14:30:52",
+            "started_at": "2025-06-27T14:31:00",
+            "parameters": {...},
+            "current_execution": {...},
+            "total_executions": 2
+        }
+    }
+    """
+    try:
+        manager = get_data_pipeline_manager()
+        task_info = manager.get_task_status(task_id)
+        
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 获取执行记录
+        executions = manager.get_task_executions(task_id)
+        current_execution = executions[0] if executions else None
+        
+        response_data = {
+            "task_id": task_info['id'],
+            "status": task_info['status'],
+            "step_status": task_info.get('step_status', {}),
+            "created_at": task_info['created_at'].isoformat() if task_info.get('created_at') else None,
+            "started_at": task_info['started_at'].isoformat() if task_info.get('started_at') else None,
+            "completed_at": task_info['completed_at'].isoformat() if task_info.get('completed_at') else None,
+            "parameters": task_info.get('parameters', {}),
+            "result": task_info.get('result'),
+            "error_message": task_info.get('error_message'),
+            "current_execution": {
+                "execution_id": current_execution['execution_id'],
+                "step": current_execution['execution_step'],
+                "status": current_execution['status'],
+                "started_at": current_execution['started_at'].isoformat() if current_execution.get('started_at') else None
+            } if current_execution else None,
+            "total_executions": len(executions)
+        }
+        
+        return jsonify(success_response(
+            response_text="获取任务状态成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"获取数据管道任务状态失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="获取任务状态失败,请稍后重试"
+        )), 500
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/logs', methods=['GET'])
+def get_data_pipeline_task_logs(task_id):
+    """
+    获取数据管道任务日志
+    
+    查询参数:
+    - limit: 日志数量限制,默认100
+    - level: 日志级别过滤,可选
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "获取任务日志成功",
+        "data": {
+            "task_id": "task_20250627_143052",
+            "logs": [
+                {
+                    "timestamp": "2025-06-27T14:30:52",
+                    "level": "INFO",
+                    "message": "任务开始执行",
+                    "step_name": "ddl_generation",
+                    "execution_id": "task_20250627_143052_step_ddl_generation_exec_20250627_143100"
+                }
+            ],
+            "total": 15
+        }
+    }
+    """
+    try:
+        limit = request.args.get('limit', 100, type=int)
+        level = request.args.get('level')
+        
+        # 限制最大查询数量
+        limit = min(limit, 1000)
+        
+        manager = get_data_pipeline_manager()
+        
+        # 验证任务是否存在
+        task_info = manager.get_task_status(task_id)
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 获取日志
+        logs = manager.get_task_logs(task_id, limit=limit)
+        
+        # 过滤日志级别
+        if level:
+            logs = [log for log in logs if log.get('log_level') == level.upper()]
+        
+        # 格式化日志
+        formatted_logs = []
+        for log in logs:
+            formatted_logs.append({
+                "timestamp": log['timestamp'].isoformat() if log.get('timestamp') else None,
+                "level": log.get('log_level'),
+                "message": log.get('message'),
+                "step_name": log.get('step_name'),
+                "execution_id": log.get('execution_id'),
+                "module_name": log.get('module_name'),
+                "function_name": log.get('function_name'),
+                "extra_data": log.get('extra_data')
+            })
+        
+        response_data = {
+            "task_id": task_id,
+            "logs": formatted_logs,
+            "total": len(formatted_logs)
+        }
+        
+        return jsonify(success_response(
+            response_text="获取任务日志成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"获取数据管道任务日志失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="获取任务日志失败,请稍后重试"
+        )), 500
+
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks', methods=['GET'])
+def list_data_pipeline_tasks():
+    """获取数据管道任务列表"""
+    try:
+        limit = request.args.get('limit', 50, type=int)
+        offset = request.args.get('offset', 0, type=int)
+        status_filter = request.args.get('status')
+        
+        # 限制查询数量
+        limit = min(limit, 100)
+        
+        manager = get_data_pipeline_manager()
+        tasks = manager.get_tasks_list(
+            limit=limit,
+            offset=offset,
+            status_filter=status_filter
+        )
+        
+        # 格式化任务列表
+        formatted_tasks = []
+        for task in tasks:
+            formatted_tasks.append({
+                "task_id": task.get('id'),
+                "status": task.get('status'),
+                "step_status": task.get('step_status'),
+                "created_at": task['created_at'].isoformat() if task.get('created_at') else None,
+                "started_at": task['started_at'].isoformat() if task.get('started_at') else None,
+                "completed_at": task['completed_at'].isoformat() if task.get('completed_at') else None,
+                "created_by": task.get('created_by'),
+                "db_name": task.get('db_name'),
+                "business_context": task.get('business_context')
+            })
+        
+        response_data = {
+            "tasks": formatted_tasks,
+            "total": len(formatted_tasks),
+            "limit": limit,
+            "offset": offset
+        }
+        
+        return jsonify(success_response(
+            response_text="获取任务列表成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"获取数据管道任务列表失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="获取任务列表失败,请稍后重试"
+        )), 500
+
+# ==================== Data Pipeline 文件管理 API ====================
+
+from flask import send_file
+
+# 创建文件管理器
+data_pipeline_file_manager = None
+
+def get_data_pipeline_file_manager():
+    """获取Data Pipeline文件管理器单例"""
+    global data_pipeline_file_manager
+    if data_pipeline_file_manager is None:
+        data_pipeline_file_manager = SimpleFileManager()
+    return data_pipeline_file_manager
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/files', methods=['GET'])
+def get_data_pipeline_task_files(task_id):
+    """获取任务文件列表"""
+    try:
+        file_manager = get_data_pipeline_file_manager()
+        
+        # 获取任务文件
+        files = file_manager.get_task_files(task_id)
+        directory_info = file_manager.get_directory_info(task_id)
+        
+        # 格式化文件信息
+        formatted_files = []
+        for file_info in files:
+            formatted_files.append({
+                "file_name": file_info['file_name'],
+                "file_type": file_info['file_type'],
+                "file_size": file_info['file_size'],
+                "file_size_formatted": file_info['file_size_formatted'],
+                "created_at": file_info['created_at'].isoformat() if file_info.get('created_at') else None,
+                "modified_at": file_info['modified_at'].isoformat() if file_info.get('modified_at') else None,
+                "is_readable": file_info['is_readable']
+            })
+        
+        response_data = {
+            "task_id": task_id,
+            "files": formatted_files,
+            "directory_info": directory_info
+        }
+        
+        return jsonify(success_response(
+            response_text="获取任务文件列表成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"获取任务文件列表失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="获取任务文件列表失败,请稍后重试"
+        )), 500
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/files/<file_name>', methods=['GET'])
+def download_data_pipeline_task_file(task_id, file_name):
+    """下载任务文件"""
+    try:
+        file_manager = get_data_pipeline_file_manager()
+        
+        # 验证文件存在且安全
+        if not file_manager.file_exists(task_id, file_name):
+            return jsonify(not_found_response(
+                response_text=f"文件不存在: {file_name}"
+            )), 404
+        
+        if not file_manager.is_file_safe(task_id, file_name):
+            return jsonify(bad_request_response(
+                response_text="非法的文件路径"
+            )), 400
+        
+        # 获取文件路径
+        file_path = file_manager.get_file_path(task_id, file_name)
+        
+        # 检查文件是否可读
+        if not os.access(file_path, os.R_OK):
+            return jsonify(bad_request_response(
+                response_text="文件不可读"
+            )), 400
+        
+        return send_file(
+            file_path,
+            as_attachment=True,
+            download_name=file_name
+        )
+        
+    except Exception as e:
+        logger.error(f"下载任务文件失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="下载文件失败,请稍后重试"
+        )), 500
+
 logger.info("正在启动Flask应用: http://localhost:8084")
-app.run(host="0.0.0.0", port=8084, debug=True)
+app.run(host="0.0.0.0", port=8084, debug=True)
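
For reference, a minimal client-side sketch of the endpoints added above. It assumes the Flask app is reachable on the port configured in app.run; the table-list path and business context are placeholder values, and the table list file must exist on the server.

# Hypothetical usage sketch for the Data Pipeline API (not part of this commit).
import time
import requests

BASE = "http://localhost:8084/api/v0/data_pipeline"

# 1. Create a task; db_connection is no longer sent, the server reads app_config
resp = requests.post(f"{BASE}/tasks", json={
    "table_list_file": "data_pipeline/tables.txt",        # placeholder, server-side path
    "business_context": "highway service area business data"
})
task_id = resp.json()["data"]["task_id"]

# 2. Kick off the complete workflow in a background process
requests.post(f"{BASE}/tasks/{task_id}/execute", json={"execution_mode": "complete"})

# 3. Poll the status endpoint until the task leaves the in_progress state
while True:
    status = requests.get(f"{BASE}/tasks/{task_id}").json()["data"]["status"]
    if status in ("completed", "partial_completed", "failed"):
        break
    time.sleep(10)

# 4. List generated files and download one of them
files = requests.get(f"{BASE}/tasks/{task_id}/files").json()["data"]["files"]
if files:
    name = files[0]["file_name"]
    content = requests.get(f"{BASE}/tasks/{task_id}/files/{name}").content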

+ 9 - 4
config/logging_config.yaml

@@ -40,6 +40,8 @@ modules:
         backup_count: 10
   
   data_pipeline:
+    # 注意:data_pipeline的日志文件路径会在运行时动态设置到任务目录
+    # 这里的file配置主要用于格式和级别设置
     level: DEBUG
     console:
       enabled: true
@@ -48,12 +50,15 @@ modules:
     file:
       enabled: true
       level: DEBUG
-      filename: "data_pipeline.log"
+      # filename 将在运行时动态设置,不在这里指定
+      # filename: "data_pipeline.log"  # 移除固定路径
       format: "%(asctime)s [%(levelname)s] [%(name)s] %(filename)s:%(lineno)d - %(message)s"
       rotation:
-        enabled: true
-        max_size: "30MB"
-        backup_count: 8
+        # 对于任务特定的日志,通常不需要rotation
+        # 但保留配置以防单个任务产生大量日志
+        enabled: false  # 禁用rotation,因为每个任务的日志是独立的
+        max_size: "10MB"    # 如果启用,限制为10MB
+        backup_count: 2     # 如果启用,只保留2个备份
   
   agent:
     level: DEBUG
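
The comments above mean the log file location is no longer fixed in YAML: a file handler pointed at the task directory is attached at runtime (the SimpleWorkflowExecutor added later in this commit does this in _setup_task_directory_logger). A minimal sketch of the idea, assuming only the standard logging module:

# Sketch: attach a per-task file handler at runtime instead of a fixed filename.
# Directory layout and format string follow this commit; the helper itself is illustrative.
import logging
from pathlib import Path

def attach_task_log_handler(logger: logging.Logger, task_id: str) -> logging.Handler:
    task_dir = Path("./data_pipeline/training_data") / task_id
    task_dir.mkdir(parents=True, exist_ok=True)
    handler = logging.FileHandler(task_dir / "data_pipeline.log", encoding="utf-8")
    handler.setFormatter(logging.Formatter(
        "%(asctime)s [%(levelname)s] [%(name)s] %(filename)s:%(lineno)d - %(message)s"
    ))
    logger.addHandler(handler)  # no rotation: each task keeps its own log file
    return handler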

+ 13 - 0
core/logging/log_manager.py

@@ -123,6 +123,19 @@ class LogManager:
     
     def _configure_logger(self, logger: logging.Logger, module: str):
         """配置具体的logger"""
+        # 如果配置未初始化,使用默认的控制台日志配置
+        if self.config is None:
+            logger.setLevel(logging.INFO)
+            if not logger.handlers:
+                console_handler = logging.StreamHandler()
+                formatter = logging.Formatter(
+                    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+                )
+                console_handler.setFormatter(formatter)
+                logger.addHandler(console_handler)
+                logger.propagate = False
+            return
+            
         module_config = self.config.get('modules', {}).get(module, self.config['default'])
         
         # 设置日志级别
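
Seen from the caller's side, the fallback above means a module logger requested before the YAML config is loaded now gets a plain console handler instead of failing (assuming get_data_pipeline_logger routes through LogManager._configure_logger):

# Illustrative only: exercises the new fallback path when the config is not yet initialized.
from core.logging import get_data_pipeline_logger

logger = get_data_pipeline_logger("task_executor")  # config may not be loaded yet
logger.info("emitted through the default console StreamHandler at INFO level")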

+ 9 - 0
data_pipeline/api/__init__.py

@@ -0,0 +1,9 @@
+"""
+Data Pipeline API模块
+
+提供数据管道任务的API支持,包括:
+- 任务管理
+- 执行跟踪
+- 日志记录
+- 文件管理
+"""

+ 334 - 0
data_pipeline/api/simple_db_manager.py

@@ -0,0 +1,334 @@
+"""
+Data Pipeline API 简化数据库管理器
+
+复用现有的pgvector数据库连接机制,提供Data Pipeline任务的数据库操作功能
+"""
+
+import json
+from datetime import datetime
+from typing import Dict, Any, List, Optional, Tuple
+
+import psycopg2
+from psycopg2.extras import RealDictCursor, Json
+
+from app_config import PGVECTOR_CONFIG
+from core.logging import get_data_pipeline_logger
+
+
+class SimpleTaskManager:
+    """简化的任务管理器,复用现有pgvector连接"""
+    
+    def __init__(self):
+        """初始化任务管理器"""
+        self.logger = get_data_pipeline_logger("SimpleTaskManager")
+        self._connection = None
+    
+    def _get_connection(self):
+        """获取pgvector数据库连接"""
+        if self._connection is None or self._connection.closed:
+            try:
+                self._connection = psycopg2.connect(
+                    host=PGVECTOR_CONFIG.get('host'),
+                    port=PGVECTOR_CONFIG.get('port'),
+                    database=PGVECTOR_CONFIG.get('dbname'),
+                    user=PGVECTOR_CONFIG.get('user'),
+                    password=PGVECTOR_CONFIG.get('password')
+                )
+                self._connection.autocommit = True
+            except Exception as e:
+                self.logger.error(f"pgvector数据库连接失败: {e}")
+                raise
+        return self._connection
+    
+    def close_connection(self):
+        """关闭数据库连接"""
+        if self._connection and not self._connection.closed:
+            self._connection.close()
+            self._connection = None
+    
+    def generate_task_id(self) -> str:
+        """生成任务ID,格式: task_YYYYMMDD_HHMMSS"""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        return f"task_{timestamp}"
+    
+    def create_task(self, 
+                   table_list_file: str,
+                   business_context: str,
+                   db_name: str = None,
+                   **kwargs) -> str:
+        """创建新任务"""
+        task_id = self.generate_task_id()
+        
+        # 从 app_config 获取业务数据库连接信息
+        from app_config import APP_DB_CONFIG
+        
+        # 构建业务数据库连接字符串(用于参数记录)
+        business_db_connection = self._build_db_connection_string(APP_DB_CONFIG)
+        
+        # 使用传入的db_name或从APP_DB_CONFIG提取
+        if not db_name:
+            db_name = APP_DB_CONFIG.get('dbname', 'business_db')
+        
+        # 构建参数
+        parameters = {
+            "db_connection": business_db_connection,  # 业务数据库连接(用于schema_workflow执行)
+            "table_list_file": table_list_file,
+            "business_context": business_context,
+            **kwargs
+        }
+        
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    INSERT INTO data_pipeline_tasks (
+                        id, task_type, status, parameters, created_by, 
+                        db_name, business_context, output_directory
+                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
+                """, (
+                    task_id, 
+                    'data_workflow', 
+                    'pending', 
+                    Json(parameters),
+                    'api',
+                    db_name,
+                    business_context,
+                    f"./data_pipeline/training_data/{task_id}"
+                ))
+                
+            self.logger.info(f"任务创建成功: {task_id}")
+            return task_id
+            
+        except Exception as e:
+            self.logger.error(f"任务创建失败: {e}")
+            raise
+    
+    def get_task(self, task_id: str) -> Optional[Dict[str, Any]]:
+        """获取任务信息"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                cursor.execute("SELECT * FROM data_pipeline_tasks WHERE id = %s", (task_id,))
+                result = cursor.fetchone()
+                return dict(result) if result else None
+        except Exception as e:
+            self.logger.error(f"获取任务信息失败: {e}")
+            raise
+    
+    def update_task_status(self, task_id: str, status: str, error_message: Optional[str] = None):
+        """更新任务状态"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                update_fields = ["status = %s"]
+                values = [status]
+                
+                if status == 'in_progress' and not self._get_task_started_at(task_id):
+                    update_fields.append("started_at = CURRENT_TIMESTAMP")
+                
+                if status in ['completed', 'failed']:
+                    update_fields.append("completed_at = CURRENT_TIMESTAMP")
+                
+                if error_message:
+                    update_fields.append("error_message = %s")
+                    values.append(error_message)
+                
+                values.append(task_id)
+                
+                cursor.execute(f"""
+                    UPDATE data_pipeline_tasks 
+                    SET {', '.join(update_fields)}
+                    WHERE id = %s
+                """, values)
+                
+                self.logger.info(f"任务状态更新: {task_id} -> {status}")
+        except Exception as e:
+            self.logger.error(f"任务状态更新失败: {e}")
+            raise
+    
+    def update_step_status(self, task_id: str, step_name: str, step_status: str):
+        """更新步骤状态"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    UPDATE data_pipeline_tasks 
+                    SET step_status = jsonb_set(step_status, %s, %s)
+                    WHERE id = %s
+                """, ([step_name], json.dumps(step_status), task_id))
+                
+                self.logger.debug(f"步骤状态更新: {task_id}.{step_name} -> {step_status}")
+        except Exception as e:
+            self.logger.error(f"步骤状态更新失败: {e}")
+            raise
+    
+    def create_execution(self, task_id: str, execution_step: str) -> str:
+        """创建执行记录"""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        execution_id = f"{task_id}_step_{execution_step}_exec_{timestamp}"
+        
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    INSERT INTO data_pipeline_task_executions (
+                        task_id, execution_step, status, execution_id
+                    ) VALUES (%s, %s, %s, %s)
+                """, (task_id, execution_step, 'running', execution_id))
+                
+                self.logger.info(f"执行记录创建: {execution_id}")
+                return execution_id
+        except Exception as e:
+            self.logger.error(f"执行记录创建失败: {e}")
+            raise
+    
+    def complete_execution(self, execution_id: str, status: str, error_message: Optional[str] = None):
+        """完成执行记录"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                # 计算执行时长
+                cursor.execute("""
+                    SELECT started_at FROM data_pipeline_task_executions 
+                    WHERE execution_id = %s
+                """, (execution_id,))
+                result = cursor.fetchone()
+                
+                duration_seconds = None
+                if result and result[0]:
+                    duration_seconds = int((datetime.now() - result[0]).total_seconds())
+                
+                # 更新执行记录
+                update_fields = ["status = %s", "completed_at = CURRENT_TIMESTAMP"]
+                values = [status]
+                
+                if duration_seconds is not None:
+                    update_fields.append("duration_seconds = %s")
+                    values.append(duration_seconds)
+                
+                if error_message:
+                    update_fields.append("error_message = %s")
+                    values.append(error_message)
+                
+                values.append(execution_id)
+                
+                cursor.execute(f"""
+                    UPDATE data_pipeline_task_executions 
+                    SET {', '.join(update_fields)}
+                    WHERE execution_id = %s
+                """, values)
+                
+                self.logger.info(f"执行记录完成: {execution_id} -> {status}")
+        except Exception as e:
+            self.logger.error(f"执行记录完成失败: {e}")
+            raise
+    
+    def record_log(self, task_id: str, log_level: str, message: str, 
+                   execution_id: Optional[str] = None, step_name: Optional[str] = None):
+        """记录日志到数据库"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    INSERT INTO data_pipeline_task_logs (
+                        task_id, execution_id, log_level, message, step_name
+                    ) VALUES (%s, %s, %s, %s, %s)
+                """, (task_id, execution_id, log_level, message, step_name))
+        except Exception as e:
+            self.logger.error(f"日志记录失败: {e}")
+    
+    def get_task_logs(self, task_id: str, limit: int = 100) -> List[Dict[str, Any]]:
+        """获取任务日志"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                cursor.execute("""
+                    SELECT * FROM data_pipeline_task_logs 
+                    WHERE task_id = %s 
+                    ORDER BY timestamp DESC 
+                    LIMIT %s
+                """, (task_id, limit))
+                
+                return [dict(row) for row in cursor.fetchall()]
+        except Exception as e:
+            self.logger.error(f"获取任务日志失败: {e}")
+            raise
+    
+    def get_task_executions(self, task_id: str) -> List[Dict[str, Any]]:
+        """获取任务执行记录"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                cursor.execute("""
+                    SELECT * FROM data_pipeline_task_executions 
+                    WHERE task_id = %s 
+                    ORDER BY started_at DESC
+                """, (task_id,))
+                
+                return [dict(row) for row in cursor.fetchall()]
+        except Exception as e:
+            self.logger.error(f"获取执行记录失败: {e}")
+            raise
+    
+    def get_tasks_list(self, limit: int = 50, offset: int = 0, status_filter: Optional[str] = None) -> List[Dict[str, Any]]:
+        """获取任务列表"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                where_clause = ""
+                params = []
+                
+                if status_filter:
+                    where_clause = "WHERE status = %s"
+                    params.append(status_filter)
+                
+                params.extend([limit, offset])
+                
+                cursor.execute(f"""
+                    SELECT * FROM data_pipeline_tasks 
+                    {where_clause}
+                    ORDER BY created_at DESC 
+                    LIMIT %s OFFSET %s
+                """, params)
+                
+                return [dict(row) for row in cursor.fetchall()]
+        except Exception as e:
+            self.logger.error(f"获取任务列表失败: {e}")
+            raise
+    
+    def _get_task_started_at(self, task_id: str) -> Optional[datetime]:
+        """获取任务开始时间"""
+        try:
+            conn = self._get_connection()
+            with conn.cursor() as cursor:
+                cursor.execute("SELECT started_at FROM data_pipeline_tasks WHERE id = %s", (task_id,))
+                result = cursor.fetchone()
+                return result[0] if result and result[0] else None
+        except Exception:
+            return None
+    
+    def _build_db_connection_string(self, db_config: dict) -> str:
+        """构建数据库连接字符串"""
+        try:
+            host = db_config.get('host', 'localhost')
+            port = db_config.get('port', 5432)
+            dbname = db_config.get('dbname', 'database')
+            user = db_config.get('user', 'postgres')
+            password = db_config.get('password', '')
+            
+            return f"postgresql://{user}:{password}@{host}:{port}/{dbname}"
+        except Exception:
+            return "postgresql://localhost:5432/database"
+    
+    def _extract_db_name(self, connection_string: str) -> str:
+        """从连接字符串提取数据库名称"""
+        try:
+            if '/' in connection_string:
+                db_name = connection_string.split('/')[-1]
+                if '?' in db_name:
+                    db_name = db_name.split('?')[0]
+                return db_name if db_name else "database"
+            else:
+                return "database"
+        except Exception:
+            return "database"

+ 182 - 0
data_pipeline/api/simple_file_manager.py

@@ -0,0 +1,182 @@
+"""
+Data Pipeline API 简化文件管理器
+
+提供简单的文件列表和下载功能,无压缩等复杂功能
+"""
+
+import os
+from pathlib import Path
+from typing import Dict, Any, List
+from datetime import datetime
+
+from core.logging import get_data_pipeline_logger
+
+
+class SimpleFileManager:
+    """简化的文件管理器"""
+    
+    def __init__(self, base_output_dir: str = "./data_pipeline/training_data/"):
+        """
+        初始化文件管理器
+        
+        Args:
+            base_output_dir: 基础输出目录
+        """
+        self.base_output_dir = Path(base_output_dir)
+        self.logger = get_data_pipeline_logger("SimpleFileManager")
+        
+        # 确保基础目录存在
+        self.base_output_dir.mkdir(parents=True, exist_ok=True)
+    
+    def get_task_directory(self, task_id: str) -> Path:
+        """获取任务目录路径"""
+        return self.base_output_dir / task_id
+    
+    def create_task_directory(self, task_id: str) -> bool:
+        """创建任务目录"""
+        try:
+            task_dir = self.get_task_directory(task_id)
+            task_dir.mkdir(parents=True, exist_ok=True)
+            self.logger.info(f"任务目录已创建: {task_dir}")
+            return True
+        except Exception as e:
+            self.logger.error(f"创建任务目录失败: {e}")
+            return False
+    
+    def get_task_files(self, task_id: str) -> List[Dict[str, Any]]:
+        """获取任务目录下的所有文件信息"""
+        try:
+            task_dir = self.get_task_directory(task_id)
+            if not task_dir.exists():
+                return []
+            
+            files_info = []
+            for file_path in task_dir.iterdir():
+                if file_path.is_file():
+                    file_info = self._get_file_info(file_path)
+                    files_info.append(file_info)
+            
+            # 按修改时间排序(最新的在前)
+            files_info.sort(key=lambda x: x['modified_at'], reverse=True)
+            return files_info
+            
+        except Exception as e:
+            self.logger.error(f"获取任务文件失败: {e}")
+            return []
+    
+    def _get_file_info(self, file_path: Path) -> Dict[str, Any]:
+        """获取单个文件的基本信息"""
+        try:
+            stat = file_path.stat()
+            
+            return {
+                "file_name": file_path.name,
+                "file_path": str(file_path),
+                "file_type": self._determine_file_type(file_path),
+                "file_size": stat.st_size,
+                "file_size_formatted": self._format_file_size(stat.st_size),
+                "created_at": datetime.fromtimestamp(stat.st_ctime),
+                "modified_at": datetime.fromtimestamp(stat.st_mtime),
+                "is_readable": os.access(file_path, os.R_OK)
+            }
+            
+        except Exception as e:
+            self.logger.error(f"获取文件信息失败: {e}")
+            return {
+                "file_name": file_path.name,
+                "file_path": str(file_path),
+                "file_type": "unknown",
+                "file_size": 0,
+                "file_size_formatted": "0 B",
+                "created_at": datetime.now(),
+                "modified_at": datetime.now(),
+                "is_readable": False
+            }
+    
+    def _determine_file_type(self, file_path: Path) -> str:
+        """根据文件扩展名确定文件类型"""
+        suffix = file_path.suffix.lower()
+        
+        type_mapping = {
+            '.ddl': 'ddl',
+            '.sql': 'sql',
+            '.md': 'markdown',
+            '.markdown': 'markdown',
+            '.json': 'json',
+            '.txt': 'text',
+            '.log': 'log'
+        }
+        
+        return type_mapping.get(suffix, 'other')
+    
+    def _format_file_size(self, size_bytes: int) -> str:
+        """格式化文件大小显示"""
+        if size_bytes == 0:
+            return "0 B"
+        
+        size_names = ["B", "KB", "MB", "GB"]
+        i = 0
+        size = float(size_bytes)
+        
+        while size >= 1024.0 and i < len(size_names) - 1:
+            size /= 1024.0
+            i += 1
+        
+        return f"{size:.1f} {size_names[i]}"
+    
+    def get_file_path(self, task_id: str, file_name: str) -> Path:
+        """获取文件的完整路径"""
+        task_dir = self.get_task_directory(task_id)
+        return task_dir / file_name
+    
+    def file_exists(self, task_id: str, file_name: str) -> bool:
+        """检查文件是否存在"""
+        file_path = self.get_file_path(task_id, file_name)
+        return file_path.exists() and file_path.is_file()
+    
+    def is_file_safe(self, task_id: str, file_name: str) -> bool:
+        """检查文件路径是否安全(防止路径遍历攻击)"""
+        try:
+            task_dir = self.get_task_directory(task_id)
+            file_path = task_dir / file_name
+            
+            # 确保文件在任务目录内
+            file_path.resolve().relative_to(task_dir.resolve())
+            return True
+        except ValueError:
+            return False
+    
+    def get_directory_info(self, task_id: str) -> Dict[str, Any]:
+        """获取任务目录信息"""
+        try:
+            task_dir = self.get_task_directory(task_id)
+            
+            if not task_dir.exists():
+                return {
+                    "exists": False,
+                    "directory_path": str(task_dir),
+                    "total_files": 0,
+                    "total_size": 0,
+                    "total_size_formatted": "0 B"
+                }
+            
+            files = self.get_task_files(task_id)
+            total_size = sum(file_info['file_size'] for file_info in files)
+            
+            return {
+                "exists": True,
+                "directory_path": str(task_dir),
+                "total_files": len(files),
+                "total_size": total_size,
+                "total_size_formatted": self._format_file_size(total_size)
+            }
+            
+        except Exception as e:
+            self.logger.error(f"获取目录信息失败: {e}")
+            return {
+                "exists": False,
+                "directory_path": str(self.get_task_directory(task_id)),
+                "total_files": 0,
+                "total_size": 0,
+                "total_size_formatted": "0 B"
+            }
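
The is_file_safe check above is what the download endpoint relies on to block path traversal. A short sketch of the expected behaviour; the task ID reuses the directory from this commit, and the file names are placeholders.

# Illustrative check of SimpleFileManager's path-traversal guard.
from data_pipeline.api.simple_file_manager import SimpleFileManager

fm = SimpleFileManager()
task_id = "task_20250701_131627"

print(fm.is_file_safe(task_id, "metadata.txt"))         # True: resolves inside the task directory
print(fm.is_file_safe(task_id, "../../app_config.py"))  # False: resolves outside the task directory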

+ 521 - 0
data_pipeline/api/simple_workflow.py

@@ -0,0 +1,521 @@
+"""
+Data Pipeline API 简化任务工作流
+
+集成简化后的数据库管理器和文件管理器,提供任务执行功能
+"""
+
+import asyncio
+import json
+import os
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Any, Optional, List
+from contextlib import contextmanager
+
+from data_pipeline.schema_workflow import SchemaWorkflowOrchestrator
+from data_pipeline.api.simple_db_manager import SimpleTaskManager
+from data_pipeline.api.simple_file_manager import SimpleFileManager
+from core.logging import get_data_pipeline_logger
+
+
+class SimpleWorkflowExecutor:
+    """简化的任务工作流执行器"""
+    
+    def __init__(self, task_id: str):
+        """
+        初始化工作流执行器
+        
+        Args:
+            task_id: 任务ID
+        """
+        self.task_id = task_id
+        self.logger = get_data_pipeline_logger("SimpleWorkflowExecutor")
+        
+        # 初始化管理器
+        self.task_manager = SimpleTaskManager()
+        self.file_manager = SimpleFileManager()
+        
+        # 任务目录日志记录器
+        self.task_dir_logger = None
+        
+        # 加载任务信息
+        self.task_info = None
+        self.task_params = None
+        self._load_task_info()
+    
+    def _load_task_info(self):
+        """加载任务信息"""
+        try:
+            self.task_info = self.task_manager.get_task(self.task_id)
+            if self.task_info:
+                self.task_params = self.task_info.get('parameters', {})
+            else:
+                raise ValueError(f"任务不存在: {self.task_id}")
+        except Exception as e:
+            self.logger.error(f"加载任务信息失败: {e}")
+            raise
+    
+    def _ensure_task_directory(self) -> bool:
+        """确保任务目录存在"""
+        try:
+            success = self.file_manager.create_task_directory(self.task_id)
+            if success:
+                # 写入任务配置文件
+                self._write_task_config()
+                # 初始化任务目录日志记录器
+                self._setup_task_directory_logger()
+            return success
+        except Exception as e:
+            self.logger.error(f"创建任务目录失败: {e}")
+            return False
+    
+    def _write_task_config(self):
+        """写入任务配置文件"""
+        try:
+            task_dir = self.file_manager.get_task_directory(self.task_id)
+            config_file = task_dir / "task_config.json"
+            
+            config_data = {
+                "task_id": self.task_id,
+                "created_at": self.task_info.get('created_at').isoformat() if self.task_info.get('created_at') else None,
+                "parameters": self.task_params,
+                "output_directory": str(task_dir)
+            }
+            
+            with open(config_file, 'w', encoding='utf-8') as f:
+                json.dump(config_data, f, ensure_ascii=False, indent=2, default=str)
+                
+        except Exception as e:
+            self.logger.error(f"写入任务配置失败: {e}")
+    
+    def _setup_task_directory_logger(self):
+        """设置任务目录日志记录器"""
+        try:
+            task_dir = self.file_manager.get_task_directory(self.task_id)
+            log_file = task_dir / "data_pipeline.log"
+            
+            # 创建专门的任务目录日志记录器
+            self.task_dir_logger = logging.getLogger(f"TaskDir_{self.task_id}")
+            self.task_dir_logger.setLevel(logging.DEBUG)
+            
+            # 清除已有处理器
+            self.task_dir_logger.handlers.clear()
+            self.task_dir_logger.propagate = False
+            
+            # 创建文件处理器
+            file_handler = logging.FileHandler(log_file, encoding='utf-8')
+            file_handler.setLevel(logging.DEBUG)
+            
+            # 设置详细的日志格式
+            formatter = logging.Formatter(
+                '%(asctime)s [%(levelname)s] %(name)s: %(message)s',
+                datefmt='%Y-%m-%d %H:%M:%S'
+            )
+            file_handler.setFormatter(formatter)
+            
+            self.task_dir_logger.addHandler(file_handler)
+            
+            # 记录初始化信息
+            self.task_dir_logger.info(f"任务目录日志初始化完成 - 任务ID: {self.task_id}")
+            self.task_dir_logger.info(f"任务参数: {json.dumps(self.task_params, ensure_ascii=False, default=str)}")
+            
+        except Exception as e:
+            self.logger.error(f"设置任务目录日志记录器失败: {e}")
+    
+    def _log_to_task_directory(self, level: str, message: str, step_name: str = None):
+        """记录日志到任务目录"""
+        if self.task_dir_logger:
+            try:
+                if step_name:
+                    message = f"[{step_name}] {message}"
+                
+                log_level = getattr(logging, level.upper(), logging.INFO)
+                self.task_dir_logger.log(log_level, message)
+            except Exception as e:
+                self.logger.error(f"记录任务目录日志失败: {e}")
+    
+    def _create_orchestrator(self) -> SchemaWorkflowOrchestrator:
+        """创建工作流编排器"""
+        task_dir = self.file_manager.get_task_directory(self.task_id)
+        
+        return SchemaWorkflowOrchestrator(
+            db_connection=self.task_params['db_connection'],
+            table_list_file=self.task_params['table_list_file'],
+            business_context=self.task_params['business_context'],
+            output_dir=str(task_dir),
+            enable_sql_validation=self.task_params.get('enable_sql_validation', True),
+            enable_llm_repair=self.task_params.get('enable_llm_repair', True),
+            modify_original_file=self.task_params.get('modify_original_file', True),
+            enable_training_data_load=self.task_params.get('enable_training_data_load', True)
+        )
+    
+    @contextmanager
+    def _step_execution(self, step_name: str):
+        """步骤执行上下文管理器"""
+        execution_id = None
+        
+        try:
+            # 开始执行
+            execution_id = self.task_manager.create_execution(self.task_id, step_name)
+            self.task_manager.update_step_status(self.task_id, step_name, "running")
+            self.task_manager.record_log(self.task_id, "INFO", f"开始执行步骤: {step_name}", execution_id, step_name)
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("INFO", f"开始执行步骤: {step_name}", step_name)
+            
+            yield execution_id
+            
+            # 成功完成
+            self.task_manager.complete_execution(execution_id, 'completed')
+            self.task_manager.update_step_status(self.task_id, step_name, "completed")
+            self.task_manager.record_log(self.task_id, "INFO", f"步骤执行完成: {step_name}", execution_id, step_name)
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("INFO", f"步骤执行完成: {step_name}", step_name)
+            
+        except Exception as e:
+            # 执行失败
+            error_msg = str(e)
+            
+            if execution_id:
+                self.task_manager.complete_execution(execution_id, 'failed', error_msg)
+            
+            self.task_manager.update_step_status(self.task_id, step_name, "failed")
+            self.task_manager.record_log(self.task_id, "ERROR", f"步骤执行失败: {step_name} - {error_msg}", execution_id, step_name)
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("ERROR", f"步骤执行失败: {step_name} - {error_msg}", step_name)
+            raise
+    
+    async def execute_complete_workflow(self) -> Dict[str, Any]:
+        """执行完整工作流"""
+        try:
+            # 确保任务目录存在
+            if not self._ensure_task_directory():
+                raise Exception("无法创建任务目录")
+            
+            # 开始任务
+            self.task_manager.update_task_status(self.task_id, 'in_progress')
+            self.task_manager.record_log(self.task_id, "INFO", "任务开始执行")
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("INFO", "完整工作流任务开始执行")
+            
+            # 创建工作流编排器
+            orchestrator = self._create_orchestrator()
+            
+            # 执行完整工作流
+            with self._step_execution("complete") as execution_id:
+                self.task_manager.record_log(self.task_id, "INFO", "开始执行完整工作流", execution_id, "complete")
+                
+                # 重定向SchemaWorkflowOrchestrator的日志到任务目录
+                self._redirect_orchestrator_logs(orchestrator)
+                
+                result = await orchestrator.execute_complete_workflow()
+                
+                # 写入结果文件
+                self._write_result_file(result)
+                
+                self.task_manager.record_log(self.task_id, "INFO", "完整工作流执行完成", execution_id, "complete")
+            
+            # 更新所有子步骤状态为完成
+            self._update_all_step_status_for_complete_workflow(result)
+            
+            # 完成任务
+            self.task_manager.update_task_status(self.task_id, 'completed')
+            self.task_manager.record_log(self.task_id, "INFO", "任务执行完成")
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("INFO", "完整工作流任务执行完成")
+            
+            return {
+                "success": True,
+                "task_id": self.task_id,
+                "execution_mode": "complete",
+                "result": result
+            }
+            
+        except Exception as e:
+            # 记录错误
+            error_msg = str(e)
+            self.task_manager.record_log(self.task_id, "ERROR", f"任务执行失败: {error_msg}")
+            self.task_manager.update_task_status(self.task_id, 'failed', error_msg)
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("ERROR", f"完整工作流任务执行失败: {error_msg}")
+            
+            return {
+                "success": False,
+                "task_id": self.task_id,
+                "execution_mode": "complete",
+                "error": error_msg
+            }
+    
+    async def execute_single_step(self, step_name: str) -> Dict[str, Any]:
+        """执行单个步骤"""
+        try:
+            # 确保任务目录存在
+            if not self._ensure_task_directory():
+                raise Exception("无法创建任务目录")
+            
+            # 更新任务状态
+            self.task_manager.update_task_status(self.task_id, 'in_progress')
+            
+            # 创建工作流编排器
+            orchestrator = self._create_orchestrator()
+            
+            # 重定向SchemaWorkflowOrchestrator的日志到任务目录
+            self._redirect_orchestrator_logs(orchestrator)
+            
+            # 执行指定步骤
+            result = None
+            with self._step_execution(step_name) as execution_id:
+                if step_name == "ddl_generation":
+                    await orchestrator._execute_step_1_ddl_md_generation()
+                    result = orchestrator.workflow_state["artifacts"].get("ddl_md_generation", {})
+                    
+                elif step_name == "qa_generation":
+                    await orchestrator._execute_step_2_question_sql_generation()
+                    result = orchestrator.workflow_state["artifacts"].get("question_sql_generation", {})
+                    
+                elif step_name == "sql_validation":
+                    await orchestrator._execute_step_3_sql_validation()
+                    result = orchestrator.workflow_state["artifacts"].get("sql_validation", {})
+                    
+                elif step_name == "training_load":
+                    await orchestrator._execute_step_4_training_data_load()
+                    result = orchestrator.workflow_state["artifacts"].get("training_data_load", {})
+                    
+                else:
+                    raise ValueError(f"不支持的步骤: {step_name}")
+                
+                # 写入步骤结果文件
+                self._write_step_result_file(step_name, result)
+            
+            # 检查是否所有步骤都已完成
+            self._update_overall_task_status()
+            
+            return {
+                "success": True,
+                "task_id": self.task_id,
+                "execution_mode": "step",
+                "step_name": step_name,
+                "result": result
+            }
+            
+        except Exception as e:
+            # 记录错误
+            error_msg = str(e)
+            self.task_manager.record_log(self.task_id, "ERROR", f"步骤执行失败: {step_name} - {error_msg}")
+            self.task_manager.update_task_status(self.task_id, 'failed', error_msg)
+            
+            # 记录到任务目录日志
+            self._log_to_task_directory("ERROR", f"步骤执行失败: {step_name} - {error_msg}", step_name)
+            
+            return {
+                "success": False,
+                "task_id": self.task_id,
+                "execution_mode": "step",
+                "step_name": step_name,
+                "error": error_msg
+            }
+    
+    def _write_result_file(self, result: Dict[str, Any]):
+        """写入完整结果文件"""
+        try:
+            task_dir = self.file_manager.get_task_directory(self.task_id)
+            result_file = task_dir / "task_result.json"
+            
+            with open(result_file, 'w', encoding='utf-8') as f:
+                json.dump(result, f, ensure_ascii=False, indent=2, default=str)
+                
+        except Exception as e:
+            self.logger.error(f"写入结果文件失败: {e}")
+    
+    def _write_step_result_file(self, step_name: str, result: Dict[str, Any]):
+        """写入步骤结果文件"""
+        try:
+            task_dir = self.file_manager.get_task_directory(self.task_id)
+            result_file = task_dir / f"{step_name}_result.json"
+            
+            with open(result_file, 'w', encoding='utf-8') as f:
+                json.dump(result, f, ensure_ascii=False, indent=2, default=str)
+                
+        except Exception as e:
+            self.logger.error(f"写入步骤结果文件失败: {e}")
+    
+    def _update_overall_task_status(self):
+        """更新整体任务状态"""
+        try:
+            # 检查所有步骤的完成情况
+            executions = self.task_manager.get_task_executions(self.task_id)
+            
+            completed_steps = set()
+            failed_steps = set()
+            
+            for execution in executions:
+                if execution['status'] == 'completed':
+                    completed_steps.add(execution['execution_step'])
+                elif execution['status'] == 'failed':
+                    failed_steps.add(execution['execution_step'])
+            
+            # 检查是否有失败的步骤
+            if failed_steps:
+                self.task_manager.update_task_status(self.task_id, 'failed')
+                return
+            
+            # 检查是否完成了必要步骤
+            required_steps = {"ddl_generation", "qa_generation"}
+            if required_steps.issubset(completed_steps):
+                # 检查是否有可选步骤完成
+                optional_steps = {"sql_validation", "training_load"}
+                if completed_steps.intersection(optional_steps):
+                    if len(completed_steps) >= 3:
+                        self.task_manager.update_task_status(self.task_id, 'completed')
+                    else:
+                        self.task_manager.update_task_status(self.task_id, 'partial_completed')
+                else:
+                    self.task_manager.update_task_status(self.task_id, 'partial_completed')
+            
+        except Exception as e:
+            self.logger.error(f"更新任务状态失败: {e}")
+    
+    def _redirect_orchestrator_logs(self, orchestrator):
+        """重定向SchemaWorkflowOrchestrator的日志到任务目录"""
+        if self.task_dir_logger and hasattr(orchestrator, 'logger'):
+            try:
+                # 为orchestrator的logger添加任务目录文件处理器
+                for handler in self.task_dir_logger.handlers:
+                    if isinstance(handler, logging.FileHandler):
+                        orchestrator.logger.addHandler(handler)
+                        break
+            except Exception as e:
+                self.logger.error(f"重定向orchestrator日志失败: {e}")
+    
+    def _update_all_step_status_for_complete_workflow(self, result: Dict[str, Any]):
+        """完整工作流成功后,更新所有子步骤状态为完成"""
+        try:
+            # 定义完整工作流包含的所有步骤
+            workflow_steps = ["ddl_generation", "qa_generation", "sql_validation", "training_load"]
+            
+            # 记录日志
+            self._log_to_task_directory("INFO", "开始更新完整工作流各步骤状态为完成")
+            
+            # 逐一更新每个步骤的状态为完成
+            for step_name in workflow_steps:
+                try:
+                    self.task_manager.update_step_status(self.task_id, step_name, "completed")
+                    self.task_manager.record_log(
+                        self.task_id, 
+                        "INFO", 
+                        f"完整工作流执行成功,更新步骤状态为完成: {step_name}",
+                        step_name=step_name
+                    )
+                    self._log_to_task_directory("INFO", f"更新步骤状态为完成: {step_name}", step_name)
+                except Exception as step_error:
+                    self.logger.error(f"更新步骤状态失败 {step_name}: {step_error}")
+                    self._log_to_task_directory("ERROR", f"更新步骤状态失败: {step_name} - {step_error}", step_name)
+            
+            self._log_to_task_directory("INFO", "完整工作流各步骤状态更新完成")
+            
+        except Exception as e:
+            self.logger.error(f"更新完整工作流步骤状态失败: {e}")
+            self._log_to_task_directory("ERROR", f"更新完整工作流步骤状态失败: {e}")
+    
+    def cleanup(self):
+        """清理资源"""
+        try:
+            # 清理任务目录日志记录器
+            if self.task_dir_logger:
+                for handler in self.task_dir_logger.handlers:
+                    handler.close()
+                self.task_dir_logger.handlers.clear()
+                
+            self.task_manager.close_connection()
+        except Exception as e:
+            self.logger.error(f"清理资源失败: {e}")
+
+
+class SimpleWorkflowManager:
+    """简化的任务工作流管理器"""
+    
+    def __init__(self):
+        """初始化工作流管理器"""
+        self.task_manager = SimpleTaskManager()
+        self.file_manager = SimpleFileManager()
+        self.logger = get_data_pipeline_logger("SimpleWorkflowManager")
+    
+    def create_task(self, 
+                   table_list_file: str,
+                   business_context: str,
+                   db_name: str = None,
+                   **kwargs) -> str:
+        """创建新任务"""
+        try:
+            # 验证表清单文件存在
+            if not os.path.exists(table_list_file):
+                raise FileNotFoundError(f"表清单文件不存在: {table_list_file}")
+            
+            # 创建任务(使用app_config中的数据库配置)
+            task_id = self.task_manager.create_task(
+                table_list_file=table_list_file,
+                business_context=business_context,
+                db_name=db_name,
+                **kwargs
+            )
+            
+            return task_id
+            
+        except Exception as e:
+            self.logger.error(f"创建任务失败: {e}")
+            raise
+    
+    async def execute_task(self, 
+                          task_id: str,
+                          execution_mode: str = "complete",
+                          step_name: Optional[str] = None) -> Dict[str, Any]:
+        """执行任务"""
+        executor = None
+        try:
+            executor = SimpleWorkflowExecutor(task_id)
+            
+            if execution_mode == "complete":
+                return await executor.execute_complete_workflow()
+            elif execution_mode == "step":
+                if not step_name:
+                    raise ValueError("步骤执行模式需要指定step_name")
+                return await executor.execute_single_step(step_name)
+            else:
+                raise ValueError(f"不支持的执行模式: {execution_mode}")
+                
+        finally:
+            if executor:
+                executor.cleanup()
+    
+    def get_task_status(self, task_id: str) -> Optional[Dict[str, Any]]:
+        """获取任务状态"""
+        return self.task_manager.get_task(task_id)
+    
+    def get_task_logs(self, task_id: str, limit: int = 100) -> List[Dict[str, Any]]:
+        """获取任务日志"""
+        return self.task_manager.get_task_logs(task_id, limit)
+    
+    def get_task_files(self, task_id: str) -> List[Dict[str, Any]]:
+        """获取任务文件列表"""
+        return self.file_manager.get_task_files(task_id)
+    
+    def get_task_executions(self, task_id: str) -> List[Dict[str, Any]]:
+        """获取任务执行记录"""
+        return self.task_manager.get_task_executions(task_id)
+    
+    def get_tasks_list(self, **kwargs) -> List[Dict[str, Any]]:
+        """获取任务列表"""
+        return self.task_manager.get_tasks_list(**kwargs)
+    
+    def cleanup(self):
+        """清理资源"""
+        try:
+            self.task_manager.close_connection()
+        except Exception as e:
+            self.logger.error(f"清理资源失败: {e}")

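(示例说明,非本次提交文件内容)上面的 SimpleWorkflowManager 封装了任务的创建与执行入口,下面给出一个最小调用草图;其中表清单路径、业务上下文等参数均为假设值,仅用于说明调用方式:

    import asyncio

    from data_pipeline.api.simple_workflow import SimpleWorkflowManager


    async def demo():
        manager = SimpleWorkflowManager()
        try:
            # 创建任务:table_list_file 与 business_context 为必填参数
            task_id = manager.create_task(
                table_list_file="data_pipeline/tables.txt",  # 假设的表清单路径
                business_context="高速公路服务区管理系统",
                db_name="highway_db",
            )
            # 完整执行;也可传 execution_mode="step" 并指定 step_name
            result = await manager.execute_task(task_id, execution_mode="complete")
            print(result.get("success"), manager.get_task_status(task_id))
        finally:
            manager.cleanup()


    if __name__ == "__main__":
        asyncio.run(demo())
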
+ 346 - 0
data_pipeline/sql/init_tables.sql

@@ -0,0 +1,346 @@
+-- Data Pipeline API 数据库初始化脚本
+-- 
+-- 此脚本在pgvector向量数据库中创建Data Pipeline API系统所需的表和索引
+-- 注意:这些表应该创建在pgvector数据库中,而不是业务数据库中
+-- 
+-- 执行方式(使用PGVECTOR_CONFIG中的连接信息):
+-- psql -h host -p port -U username -d pgvector_database_name -f init_tables.sql
+
+-- 设置客户端编码
+SET client_encoding = 'UTF8';
+
+-- 开始事务
+BEGIN;
+
+-- ====================================================================
+-- 任务主表 (data_pipeline_tasks)
+-- ====================================================================
+CREATE TABLE IF NOT EXISTS data_pipeline_tasks (
+    -- 主键:时间戳格式的任务ID
+    id VARCHAR(32) PRIMARY KEY,                    -- 'task_20250627_143052'
+    
+    -- 任务基本信息
+    task_type VARCHAR(50) NOT NULL DEFAULT 'data_workflow',
+    status VARCHAR(20) NOT NULL DEFAULT 'pending', -- pending/in_progress/partial_completed/completed/failed
+    
+    -- 配置和结果(JSON格式)
+    parameters JSONB NOT NULL,                     -- 任务配置参数
+    result JSONB,                                  -- 最终执行结果
+    
+    -- 错误处理
+    error_message TEXT,                            -- 错误详细信息
+    
+    -- 步骤状态跟踪
+    step_status JSONB DEFAULT '{
+        "ddl_generation": "pending",
+        "qa_generation": "pending", 
+        "sql_validation": "pending",
+        "training_load": "pending"
+    }'::jsonb,
+    
+    -- 时间戳
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    started_at TIMESTAMP,
+    completed_at TIMESTAMP,
+    
+    -- 创建者信息
+    created_by VARCHAR(50) DEFAULT 'api',          -- 'api', 'manual', 'system'
+    
+    -- 输出目录
+    output_directory TEXT,                         -- 任务输出目录路径
+    
+    -- 索引字段
+    db_name VARCHAR(100),                          -- 数据库名称(便于筛选)
+    business_context TEXT                          -- 业务上下文(便于搜索)
+);
+
+-- 添加约束
+ALTER TABLE data_pipeline_tasks ADD CONSTRAINT chk_task_status 
+    CHECK (status IN ('pending', 'in_progress', 'partial_completed', 'completed', 'failed'));
+
+ALTER TABLE data_pipeline_tasks ADD CONSTRAINT chk_task_type 
+    CHECK (task_type IN ('data_workflow', 'complete_workflow'));
+
+ALTER TABLE data_pipeline_tasks ADD CONSTRAINT chk_created_by 
+    CHECK (created_by IN ('api', 'manual', 'system'));
+
+-- ====================================================================
+-- 任务执行记录表 (data_pipeline_task_executions)
+-- ====================================================================
+CREATE TABLE IF NOT EXISTS data_pipeline_task_executions (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    execution_step VARCHAR(50) NOT NULL,          -- 'ddl_generation', 'qa_generation', 'sql_validation', 'training_load', 'complete'
+    status VARCHAR(20) NOT NULL,                  -- 'running', 'completed', 'failed'
+    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    completed_at TIMESTAMP,
+    error_message TEXT,
+    execution_result JSONB,                       -- 步骤执行结果
+    execution_id VARCHAR(100) UNIQUE,             -- {task_id}_step_{step_name}_exec_{timestamp}
+    force_executed BOOLEAN DEFAULT FALSE,         -- 是否强制执行
+    files_cleaned BOOLEAN DEFAULT FALSE,          -- 是否清理了旧文件
+    duration_seconds INTEGER                      -- 执行时长(秒)
+);
+
+-- 添加约束
+ALTER TABLE data_pipeline_task_executions ADD CONSTRAINT chk_execution_status 
+    CHECK (status IN ('running', 'completed', 'failed'));
+
+ALTER TABLE data_pipeline_task_executions ADD CONSTRAINT chk_execution_step 
+    CHECK (execution_step IN ('ddl_generation', 'qa_generation', 'sql_validation', 'training_load', 'complete'));
+
+ALTER TABLE data_pipeline_task_executions ADD CONSTRAINT chk_duration_positive 
+    CHECK (duration_seconds IS NULL OR duration_seconds >= 0);
+
+-- ====================================================================
+-- 任务日志表 (data_pipeline_task_logs)
+-- ====================================================================
+CREATE TABLE IF NOT EXISTS data_pipeline_task_logs (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    execution_id VARCHAR(100) REFERENCES data_pipeline_task_executions(execution_id) ON DELETE SET NULL,
+    
+    -- 日志内容
+    log_level VARCHAR(10) NOT NULL,               -- 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
+    message TEXT NOT NULL,                        -- 日志消息内容
+    
+    -- 上下文信息
+    step_name VARCHAR(50),                        -- 执行步骤名称
+    module_name VARCHAR(100),                     -- 模块名称
+    function_name VARCHAR(100),                   -- 函数名称
+    
+    -- 时间戳
+    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    
+    -- 额外信息(JSON格式)
+    extra_data JSONB DEFAULT '{}'::jsonb          -- 额外的结构化信息
+);
+
+-- 添加约束
+ALTER TABLE data_pipeline_task_logs ADD CONSTRAINT chk_log_level 
+    CHECK (log_level IN ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'));
+
+-- ====================================================================
+-- 任务输出文件表 (data_pipeline_task_outputs)
+-- ====================================================================
+CREATE TABLE IF NOT EXISTS data_pipeline_task_outputs (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    execution_id VARCHAR(100) REFERENCES data_pipeline_task_executions(execution_id) ON DELETE SET NULL,
+    
+    -- 文件信息
+    file_type VARCHAR(50) NOT NULL,               -- 'ddl', 'md', 'json', 'log', 'report'
+    file_name VARCHAR(255) NOT NULL,              -- 文件名
+    file_path TEXT NOT NULL,                      -- 相对路径
+    file_size BIGINT DEFAULT 0,                   -- 文件大小(字节)
+    
+    -- 文件内容摘要
+    content_hash VARCHAR(64),                     -- 文件内容hash
+    description TEXT,                             -- 文件描述
+    
+    -- 时间戳
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    modified_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    
+    -- 状态
+    is_primary BOOLEAN DEFAULT FALSE,             -- 是否为主要输出文件
+    is_downloadable BOOLEAN DEFAULT TRUE          -- 是否可下载
+);
+
+-- 添加约束
+ALTER TABLE data_pipeline_task_outputs ADD CONSTRAINT chk_file_type 
+    CHECK (file_type IN ('ddl', 'md', 'json', 'log', 'report', 'txt', 'other'));
+
+ALTER TABLE data_pipeline_task_outputs ADD CONSTRAINT chk_file_size_positive 
+    CHECK (file_size >= 0);
+
+-- ====================================================================
+-- 创建索引
+-- ====================================================================
+
+-- 任务表索引
+CREATE INDEX IF NOT EXISTS idx_tasks_status ON data_pipeline_tasks(status);
+CREATE INDEX IF NOT EXISTS idx_tasks_created_at ON data_pipeline_tasks(created_at DESC);
+CREATE INDEX IF NOT EXISTS idx_tasks_db_name ON data_pipeline_tasks(db_name);
+CREATE INDEX IF NOT EXISTS idx_tasks_created_by ON data_pipeline_tasks(created_by);
+CREATE INDEX IF NOT EXISTS idx_tasks_task_type ON data_pipeline_tasks(task_type);
+
+-- 执行记录表索引
+CREATE INDEX IF NOT EXISTS idx_executions_task_id ON data_pipeline_task_executions(task_id);
+CREATE INDEX IF NOT EXISTS idx_executions_step ON data_pipeline_task_executions(execution_step);
+CREATE INDEX IF NOT EXISTS idx_executions_status ON data_pipeline_task_executions(status);
+CREATE INDEX IF NOT EXISTS idx_executions_started_at ON data_pipeline_task_executions(started_at DESC);
+CREATE INDEX IF NOT EXISTS idx_executions_task_step ON data_pipeline_task_executions(task_id, execution_step);
+
+-- 日志表索引
+CREATE INDEX IF NOT EXISTS idx_logs_task_id ON data_pipeline_task_logs(task_id);
+CREATE INDEX IF NOT EXISTS idx_logs_execution_id ON data_pipeline_task_logs(execution_id);
+CREATE INDEX IF NOT EXISTS idx_logs_timestamp ON data_pipeline_task_logs(timestamp DESC);
+CREATE INDEX IF NOT EXISTS idx_logs_level ON data_pipeline_task_logs(log_level);
+CREATE INDEX IF NOT EXISTS idx_logs_step ON data_pipeline_task_logs(step_name);
+CREATE INDEX IF NOT EXISTS idx_logs_task_timestamp ON data_pipeline_task_logs(task_id, timestamp DESC);
+
+-- 文件输出表索引
+CREATE INDEX IF NOT EXISTS idx_outputs_task_id ON data_pipeline_task_outputs(task_id);
+CREATE INDEX IF NOT EXISTS idx_outputs_execution_id ON data_pipeline_task_outputs(execution_id);
+CREATE INDEX IF NOT EXISTS idx_outputs_file_type ON data_pipeline_task_outputs(file_type);
+CREATE INDEX IF NOT EXISTS idx_outputs_primary ON data_pipeline_task_outputs(is_primary) WHERE is_primary = TRUE;
+CREATE INDEX IF NOT EXISTS idx_outputs_downloadable ON data_pipeline_task_outputs(is_downloadable) WHERE is_downloadable = TRUE;
+
+-- ====================================================================
+-- 创建清理函数
+-- ====================================================================
+
+-- 清理旧任务的函数
+CREATE OR REPLACE FUNCTION cleanup_old_data_pipeline_tasks(days_to_keep INTEGER DEFAULT 30)
+RETURNS INTEGER AS $$
+DECLARE
+    deleted_count INTEGER;
+    cutoff_date TIMESTAMP;
+BEGIN
+    cutoff_date := NOW() - INTERVAL '1 day' * days_to_keep;
+    
+    -- 删除旧任务(级联删除相关日志和文件记录)
+    DELETE FROM data_pipeline_tasks 
+    WHERE created_at < cutoff_date 
+    AND status IN ('completed', 'failed');
+    
+    GET DIAGNOSTICS deleted_count = ROW_COUNT;
+    
+    -- 记录清理操作(task_id 置为 NULL:该表的 task_id 外键引用 data_pipeline_tasks,不存在名为 'system' 的任务)
+    INSERT INTO data_pipeline_task_logs (task_id, log_level, message, step_name)
+    VALUES (NULL, 'INFO',
+            FORMAT('清理了 %s 个超过 %s 天的旧任务', deleted_count, days_to_keep),
+            'cleanup');
+    
+    RETURN deleted_count;
+END;
+$$ LANGUAGE plpgsql;
+
+-- 获取任务统计信息的函数
+CREATE OR REPLACE FUNCTION get_data_pipeline_task_stats()
+RETURNS TABLE (
+    total_tasks INTEGER,
+    pending_tasks INTEGER,
+    running_tasks INTEGER,
+    completed_tasks INTEGER,
+    failed_tasks INTEGER,
+    avg_completion_time INTERVAL
+) AS $$
+BEGIN
+    RETURN QUERY
+    SELECT 
+        COUNT(*)::INTEGER as total_tasks,
+        COUNT(*) FILTER (WHERE status = 'pending')::INTEGER as pending_tasks,
+        COUNT(*) FILTER (WHERE status IN ('in_progress'))::INTEGER as running_tasks,
+        COUNT(*) FILTER (WHERE status = 'completed')::INTEGER as completed_tasks,
+        COUNT(*) FILTER (WHERE status = 'failed')::INTEGER as failed_tasks,
+        AVG(completed_at - started_at) FILTER (WHERE status = 'completed') as avg_completion_time
+    FROM data_pipeline_tasks;
+END;
+$$ LANGUAGE plpgsql;
+
+-- 检查僵尸任务的函数
+CREATE OR REPLACE FUNCTION check_zombie_data_pipeline_tasks(timeout_hours INTEGER DEFAULT 2)
+RETURNS INTEGER AS $$
+DECLARE
+    zombie_count INTEGER;
+    cutoff_time TIMESTAMP;
+BEGIN
+    cutoff_time := NOW() - INTERVAL '1 hour' * timeout_hours;
+    
+    -- 查找超时的运行中执行
+    UPDATE data_pipeline_task_executions 
+    SET status = 'failed',
+        error_message = FORMAT('执行超时(超过%s小时),可能已停止运行', timeout_hours),
+        completed_at = NOW()
+    WHERE status = 'running' 
+    AND started_at < cutoff_time;
+    
+    GET DIAGNOSTICS zombie_count = ROW_COUNT;
+    
+    -- 更新相关任务状态
+    UPDATE data_pipeline_tasks 
+    SET status = 'failed',
+        error_message = FORMAT('任务超时(超过%s小时),可能已停止运行', timeout_hours)
+    WHERE status IN ('in_progress') 
+    AND started_at < cutoff_time;
+    
+    -- 记录检查操作(task_id 置为 NULL,原因同 cleanup_old_data_pipeline_tasks 中的说明)
+    IF zombie_count > 0 THEN
+        INSERT INTO data_pipeline_task_logs (task_id, log_level, message, step_name)
+        VALUES (NULL, 'WARNING',
+                FORMAT('发现并处理了 %s 个僵尸执行', zombie_count),
+                'zombie_check');
+    END IF;
+    
+    RETURN zombie_count;
+END;
+$$ LANGUAGE plpgsql;
+
+-- ====================================================================
+-- 插入初始数据(如果需要)
+-- ====================================================================
+
+-- 这里可以插入一些初始配置数据
+-- 目前暂时不需要
+
+-- ====================================================================
+-- 创建视图(便于查询)
+-- ====================================================================
+
+-- 任务执行概览视图
+CREATE OR REPLACE VIEW v_task_execution_overview AS
+SELECT 
+    t.id as task_id,
+    t.task_type,
+    t.status as task_status,
+    t.step_status,
+    t.created_at,
+    t.started_at,
+    t.completed_at,
+    t.created_by,
+    t.db_name,
+    COALESCE(e.current_execution, '{}') as current_execution,
+    COALESCE(e.execution_count, 0) as total_executions
+FROM data_pipeline_tasks t
+LEFT JOIN (
+    SELECT 
+        task_id,
+        COUNT(*) as execution_count,
+        json_build_object(
+            'execution_id', e1.execution_id,
+            'step', e1.execution_step,
+            'status', e1.status,
+            'started_at', e1.started_at
+        ) as current_execution
+    FROM data_pipeline_task_executions e1
+    WHERE e1.id = (
+        SELECT e2.id 
+        FROM data_pipeline_task_executions e2 
+        WHERE e2.task_id = e1.task_id 
+        ORDER BY e2.started_at DESC 
+        LIMIT 1
+    )
+    GROUP BY task_id, e1.execution_id, e1.execution_step, e1.status, e1.started_at
+) e ON t.id = e.task_id;
+
+-- 提交事务
+COMMIT;
+
+-- 输出创建结果
+\echo 'Data Pipeline API 数据库表创建完成!'
+\echo ''
+\echo '已创建的表:'
+\echo '- data_pipeline_tasks: 任务主表'
+\echo '- data_pipeline_task_executions: 任务执行记录表'
+\echo '- data_pipeline_task_logs: 任务日志表'
+\echo '- data_pipeline_task_outputs: 任务输出文件表'
+\echo ''
+\echo '已创建的函数:'
+\echo '- cleanup_old_data_pipeline_tasks(days): 清理旧任务'
+\echo '- get_data_pipeline_task_stats(): 获取任务统计'
+\echo '- check_zombie_data_pipeline_tasks(hours): 检查僵尸任务'
+\echo ''
+\echo '已创建的视图:'
+\echo '- v_task_execution_overview: 任务执行概览'

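(示例说明,非本次提交文件内容)上述脚本还创建了三个运维函数,下面是一个调用草图,假设使用 psycopg2 连接 PGVECTOR_CONFIG 所指向的向量库;连接参数均为示意值:

    import psycopg2

    # 连接参数仅为示意,应替换为 PGVECTOR_CONFIG 中的实际配置
    conn = psycopg2.connect(host="localhost", port=5432,
                            dbname="pgvector_db", user="postgres", password="***")
    conn.autocommit = True
    with conn.cursor() as cur:
        # 清理 30 天前已完成/失败的旧任务
        cur.execute("SELECT cleanup_old_data_pipeline_tasks(%s)", (30,))
        print("cleaned:", cur.fetchone()[0])

        # 任务统计概览
        cur.execute("SELECT * FROM get_data_pipeline_task_stats()")
        print("stats:", cur.fetchone())

        # 将超过 2 小时仍为 running 的执行标记为失败
        cur.execute("SELECT check_zombie_data_pipeline_tasks(%s)", (2,))
        print("zombies:", cur.fetchone()[0])
    conn.close()
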
+ 78 - 0
data_pipeline/task_executor.py

@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+"""
+Data Pipeline 独立任务执行器
+
+专门用于subprocess调用,执行data pipeline任务
+"""
+
+import sys
+import asyncio
+import argparse
+import json
+from pathlib import Path
+
+# 确保能够导入项目模块
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from data_pipeline.api.simple_workflow import SimpleWorkflowExecutor
+from core.logging import initialize_logging
+
+
+def main():
+    """主执行函数"""
+    parser = argparse.ArgumentParser(description='Data Pipeline 任务执行器')
+    parser.add_argument('--task-id', required=True, help='任务ID')
+    parser.add_argument('--execution-mode', default='complete', choices=['complete', 'step'], help='执行模式')
+    parser.add_argument('--step-name', help='步骤名称(当execution-mode=step时必需)')
+    
+    args = parser.parse_args()
+    
+    # 初始化日志系统
+    initialize_logging()
+    
+    # 验证参数
+    if args.execution_mode == 'step' and not args.step_name:
+        print("错误: 步骤执行模式需要指定--step-name参数", file=sys.stderr)
+        sys.exit(1)
+    
+    try:
+        # 执行任务
+        result = asyncio.run(execute_task(args.task_id, args.execution_mode, args.step_name))
+        
+        # 输出结果到stdout(供父进程读取)
+        print(json.dumps(result, ensure_ascii=False, default=str))
+        
+        # 设置退出码
+        sys.exit(0 if result.get('success', False) else 1)
+        
+    except Exception as e:
+        error_result = {
+            "success": False,
+            "error": str(e),
+            "task_id": args.task_id,
+            "execution_mode": args.execution_mode
+        }
+        print(json.dumps(error_result, ensure_ascii=False), file=sys.stderr)
+        sys.exit(1)
+
+
+async def execute_task(task_id: str, execution_mode: str, step_name: str = None):
+    """执行任务的异步函数"""
+    executor = None
+    try:
+        executor = SimpleWorkflowExecutor(task_id)
+        
+        if execution_mode == "complete":
+            return await executor.execute_complete_workflow()
+        elif execution_mode == "step":
+            return await executor.execute_single_step(step_name)
+        else:
+            raise ValueError(f"不支持的执行模式: {execution_mode}")
+            
+    finally:
+        if executor:
+            executor.cleanup()
+
+
+if __name__ == "__main__":
+    main()

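(示例说明,非本次提交文件内容)task_executor.py 设计为由父进程通过 subprocess 调用,执行结果以 JSON 形式写到 stdout。下面是一个调用草图,task_id 仅为示意:

    import json
    import subprocess
    import sys

    proc = subprocess.run(
        [sys.executable, "data_pipeline/task_executor.py",
         "--task-id", "task_20250701_131627",
         "--execution-mode", "step",
         "--step-name", "ddl_generation"],
        capture_output=True, text=True)

    if proc.returncode == 0:
        result = json.loads(proc.stdout)
        print("执行成功:", result.get("step_name"))
    else:
        # 失败时错误 JSON 输出在 stderr,退出码为 1
        print("执行失败:", proc.stderr.strip())
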
+ 31 - 0
data_pipeline/training_data/task_20250701_131627/bss_business_day_data.ddl

@@ -0,0 +1,31 @@
+-- 中文名: 业务支撑系统每日营业数据表
+-- 描述: 业务支撑系统每日营业数据表,记录各服务区运营统计信息,包含统计日期、服务区编码及版本控制字段。
+create table public.bss_business_day_data (
+  id varchar(32) not null     -- 主键标识符,主键,
+  version integer not null    -- 数据版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人账号,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 最后更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除操作人,
+  oper_date date              -- 统计日期,
+  service_no varchar(255)     -- 服务区编码,
+  service_name varchar(255)   -- 服务区名称,
+  branch_no varchar(255)      -- 档口编码,
+  branch_name varchar(255)    -- 档口名称,
+  wx numeric(19,4)            -- 微信支付金额,
+  wx_order integer            -- 微信订单数量,
+  zfb numeric(19,4)           -- 支付宝支付金额,
+  zf_order integer            -- 支付宝订单数,
+  rmb numeric(19,4)           -- 现金支付金额,
+  rmb_order integer           -- 现金订单数量,
+  xs numeric(19,4)            -- 行吧支付金额,
+  xs_order integer            -- 行吧订单数量,
+  jd numeric(19,4)            -- 金豆支付金额,
+  jd_order integer            -- 金豆订单数量,
+  order_sum integer           -- 订单总数,
+  pay_sum numeric(19,4)       -- 支付总金额,
+  source_type integer         -- 数据来源类别,
+  primary key (id)
+);

+ 32 - 0
data_pipeline/training_data/task_20250701_131627/bss_business_day_data_detail.md

@@ -0,0 +1,32 @@
+## bss_business_day_data(业务支撑系统每日营业数据表)
+bss_business_day_data 表业务支撑系统每日营业数据表,记录各服务区运营统计信息,包含统计日期、服务区编码及版本控制字段。
+字段列表:
+- id (varchar(32)) - 主键标识符 [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
+- version (integer) - 数据版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- created_by (varchar(50)) - 创建人账号 [示例: xingba]
+- update_ts (timestamp) - 更新时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- updated_by (varchar(50)) - 最后更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除操作人
+- oper_date (date) - 统计日期 [示例: 2023-04-01]
+- service_no (varchar(255)) - 服务区编码 [示例: 1028, H0501]
+- service_name (varchar(255)) - 服务区名称 [示例: 宜春服务区, 庐山服务区]
+- branch_no (varchar(255)) - 档口编码 [示例: 1, H05016]
+- branch_name (varchar(255)) - 档口名称 [示例: 宜春南区, 庐山鲜徕客东区]
+- wx (numeric(19,4)) - 微信支付金额 [示例: 4790.0000, 2523.0000]
+- wx_order (integer) - 微信订单数量 [示例: 253, 133]
+- zfb (numeric(19,4)) - 支付宝支付金额 [示例: 229.0000, 0.0000]
+- zf_order (integer) - 支付宝订单数 [示例: 15, 0]
+- rmb (numeric(19,4)) - 现金支付金额 [示例: 1058.5000, 124.0000]
+- rmb_order (integer) - 现金订单数量 [示例: 56, 12]
+- xs (numeric(19,4)) - 行吧支付金额 [示例: 0.0000, 40.0000]
+- xs_order (integer) - 行吧订单数量 [示例: 0, 1]
+- jd (numeric(19,4)) - 金豆支付金额 [示例: 0.0000]
+- jd_order (integer) - 金豆订单数量 [示例: 0]
+- order_sum (integer) - 订单总数 [示例: 324, 146]
+- pay_sum (numeric(19,4)) - 支付总金额 [示例: 6077.5000, 2687.0000]
+- source_type (integer) - 数据来源类别 [示例: 1, 0, 4]
+字段补充说明:
+- id 为主键
+- source_type 为枚举字段,包含取值:0、4、1、2、3

+ 17 - 0
data_pipeline/training_data/task_20250701_131627/bss_car_day_count.ddl

@@ -0,0 +1,17 @@
+-- 中文名: 服务区车辆日统计表
+-- 描述: 服务区车辆日统计表,记录各类型车辆日通行量及操作信息,用于交通流量分析和运营管理。
+create table public.bss_car_day_count (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  customer_count bigint       -- 车辆数量,
+  car_type varchar(100)       -- 车辆类别,
+  count_date date             -- 统计日期,
+  service_area_id varchar(32) -- 服务区ID,
+  primary key (id)
+);

+ 18 - 0
data_pipeline/training_data/task_20250701_131627/bss_car_day_count_detail.md

@@ -0,0 +1,18 @@
+## bss_car_day_count(服务区车辆日统计表)
+bss_car_day_count 表服务区车辆日统计表,记录各类型车辆日通行量及操作信息,用于交通流量分析和运营管理。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- created_by (varchar(50)) - 创建人
+- update_ts (timestamp) - 更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- customer_count (bigint) - 车辆数量 [示例: 1114, 295]
+- car_type (varchar(100)) - 车辆类别 [示例: 其他]
+- count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
+- service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
+字段补充说明:
+- id 为主键
+- car_type 为枚举字段,包含取值:其他、危化品、城际、过境

+ 15 - 0
data_pipeline/training_data/task_20250701_131627/bss_company.ddl

@@ -0,0 +1,15 @@
+-- 中文名: 存储高速公路服务区合作公司基础信息(含公司名称及唯一编码)
+-- 描述: 存储高速公路服务区合作公司基础信息(含公司名称及唯一编码),用于业务支撑系统中企业信息管理与业务关联支撑。
+create table public.bss_company (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  company_name varchar(255)   -- 分公司名称,
+  company_no varchar(255)     -- 公司编码,
+  primary key (id)
+);

+ 15 - 0
data_pipeline/training_data/task_20250701_131627/bss_company_detail.md

@@ -0,0 +1,15 @@
+## bss_company(存储高速公路服务区合作公司基础信息(含公司名称及唯一编码))
+bss_company 表存储高速公路服务区合作公司基础信息(含公司名称及唯一编码),用于业务支撑系统中企业信息管理与业务关联支撑。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 30675d85ba5044c31acfa243b9d16334, 47ed0bb37f5a85f3d9245e4854959b81]
+- version (integer) - 版本号 [非空] [示例: 1, 2]
+- create_ts (timestamp) - 创建时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
+- updated_by (varchar(50)) - 更新人 [示例: admin]
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- company_name (varchar(255)) - 分公司名称 [示例: 上饶分公司, 宜春分公司]
+- company_no (varchar(255)) - 公司编码 [示例: H03, H02]
+字段补充说明:
+- id 为主键

+ 16 - 0
data_pipeline/training_data/task_20250701_131627/bss_section_route.ddl

@@ -0,0 +1,16 @@
+-- 中文名: 存储高速公路路段与路线信息
+-- 描述: 存储高速公路路段与路线信息,支持服务区路线关联管理。
+create table public.bss_section_route (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  section_name varchar(255)   -- 路段名称,
+  route_name varchar(255)     -- 路线名称,
+  code varchar(255)           -- 路段编号,
+  primary key (id)
+);

+ 7 - 0
data_pipeline/training_data/task_20250701_131627/bss_section_route_area_link.ddl

@@ -0,0 +1,7 @@
+-- 中文名: 路段路线与服务区关联表
+-- 描述: 路段路线与服务区关联表,维护路线与服务区之间的归属关系。
+create table public.bss_section_route_area_link (
+  section_route_id varchar(32) not null -- 路段路线ID,主键,
+  service_area_id varchar(32) not null -- 服务区ID,主键,
+  primary key (section_route_id, service_area_id)
+);

+ 7 - 0
data_pipeline/training_data/task_20250701_131627/bss_section_route_area_link_detail.md

@@ -0,0 +1,7 @@
+## bss_section_route_area_link(路段路线与服务区关联表)
+bss_section_route_area_link 表路段路线与服务区关联表,维护路线与服务区之间的归属关系。
+字段列表:
+- section_route_id (varchar(32)) - 路段路线ID [主键, 非空] [示例: v8elrsfs5f7lt7jl8a6p87smfzesn3rz, hxzi2iim238e3s1eajjt1enmh9o4h3wp]
+- service_area_id (varchar(32)) - 服务区ID [主键, 非空] [示例: 08e01d7402abd1d6a4d9fdd5df855ef8, 091662311d2c737029445442ff198c4c]
+字段补充说明:
+- 复合主键:section_route_id, service_area_id

+ 16 - 0
data_pipeline/training_data/task_20250701_131627/bss_section_route_detail.md

@@ -0,0 +1,16 @@
+## bss_section_route(存储高速公路路段与路线信息)
+bss_section_route 表存储高速公路路段与路线信息,支持服务区路线关联管理。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 04ri3j67a806uw2c6o6dwdtz4knexczh, 0g5mnefxxtukql2cq6acul7phgskowy7]
+- version (integer) - 版本号 [非空] [示例: 1, 0]
+- create_ts (timestamp) - 创建时间 [示例: 2021-10-29 19:43:50, 2022-03-04 16:07:16]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- section_name (varchar(255)) - 路段名称 [示例: 昌栗, 昌宁]
+- route_name (varchar(255)) - 路线名称 [示例: 昌栗, 昌韶]
+- code (varchar(255)) - 路段编号 [示例: SR0001, SR0002]
+字段补充说明:
+- id 为主键

+ 19 - 0
data_pipeline/training_data/task_20250701_131627/bss_service_area.ddl

@@ -0,0 +1,19 @@
+-- 中文名: 存储高速公路服务区基础信息及版本变更记录
+-- 描述: 存储高速公路服务区基础信息及版本变更记录,支持服务区全生命周期管理。
+create table public.bss_service_area (
+  id varchar(32) not null     -- 主键标识符,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  service_area_name varchar(255) -- 服务区名称,
+  service_area_no varchar(255) -- 服务区编码,
+  company_id varchar(32)      -- 所属公司ID,
+  service_position varchar(255) -- 地理坐标,
+  service_area_type varchar(50) -- 服务区类型,
+  service_state varchar(50)   -- 运营状态,
+  primary key (id)
+);

+ 21 - 0
data_pipeline/training_data/task_20250701_131627/bss_service_area_detail.md

@@ -0,0 +1,21 @@
+## bss_service_area(存储高速公路服务区基础信息及版本变更记录)
+bss_service_area 表存储高速公路服务区基础信息及版本变更记录,支持服务区全生命周期管理。
+字段列表:
+- id (varchar(32)) - 主键标识符 [主键, 非空] [示例: 0271d68ef93de9684b7ad8c7aae600b6, 08e01d7402abd1d6a4d9fdd5df855ef8]
+- version (integer) - 版本号 [非空] [示例: 3, 6]
+- create_ts (timestamp) - 创建时间 [示例: 2021-05-21 13:26:40.589000, 2021-05-20 19:51:46.314000]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2021-07-10 15:41:28.795000, 2021-07-11 09:33:08.455000]
+- updated_by (varchar(50)) - 更新人 [示例: admin]
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人 [示例: ]
+- service_area_name (varchar(255)) - 服务区名称 [示例: 白鹭湖停车区, 南昌南服务区]
+- service_area_no (varchar(255)) - 服务区编码 [示例: H0814, H0105]
+- company_id (varchar(32)) - 所属公司ID [示例: b1629f07c8d9ac81494fbc1de61f1ea5, ee9bf1180a2b45003f96e597a4b7f15a]
+- service_position (varchar(255)) - 地理坐标 [示例: 114.574721,26.825584, 115.910549,28.396355]
+- service_area_type (varchar(50)) - 服务区类型 [示例: 信息化服务区]
+- service_state (varchar(50)) - 运营状态 [示例: 开放, 关闭]
+字段补充说明:
+- id 为主键
+- service_area_type 为枚举字段,包含取值:信息化服务区、智能化服务区
+- service_state 为枚举字段,包含取值:开放、关闭、上传数据

+ 18 - 0
data_pipeline/training_data/task_20250701_131627/bss_service_area_mapper.ddl

@@ -0,0 +1,18 @@
+-- 中文名: BSS服务区基础信息映射表
+-- 描述: BSS服务区基础信息映射表,记录服务区名称、编码及全生命周期操作日志
+create table public.bss_service_area_mapper (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  service_name varchar(255)   -- 服务区名称,
+  service_no varchar(255)     -- 服务区编码,
+  service_area_id varchar(32) -- 服务区ID,
+  source_system_type varchar(50) -- 数据来源系统类型,
+  source_type integer         -- 数据来源类别ID,
+  primary key (id)
+);

+ 19 - 0
data_pipeline/training_data/task_20250701_131627/bss_service_area_mapper_detail.md

@@ -0,0 +1,19 @@
+## bss_service_area_mapper(BSS服务区基础信息映射表)
+bss_service_area_mapper 表BSS服务区基础信息映射表,记录服务区名称、编码及全生命周期操作日志
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00e1e893909211ed8ee6fa163eaf653f, 013867f5962211ed8ee6fa163eaf653f]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-01-10 10:54:03, 2023-01-17 12:47:29]
+- created_by (varchar(50)) - 创建人 [示例: admin]
+- update_ts (timestamp) - 更新时间 [示例: 2023-01-10 10:54:07, 2023-01-17 12:47:32]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- service_name (varchar(255)) - 服务区名称 [示例: 信丰西服务区, 南康北服务区]
+- service_no (varchar(255)) - 服务区编码 [示例: 1067, 1062]
+- service_area_id (varchar(32)) - 服务区ID [示例: 97cd6cd516a551409a4d453a58f9e170, fdbdd042962011ed8ee6fa163eaf653f]
+- source_system_type (varchar(50)) - 数据来源系统类型 [示例: 驿美, 驿购]
+- source_type (integer) - 数据来源类别ID [示例: 3, 1]
+字段补充说明:
+- id 为主键
+- source_system_type 为枚举字段,包含取值:司乘管理、商业管理、驿购、驿美、手工录入

+ 10 - 0
data_pipeline/training_data/task_20250701_131627/db_query_decision_prompt.txt

@@ -0,0 +1,10 @@
+=== 数据库业务范围 ===
+当前数据库存储的是高速公路服务区运营管理的相关数据,主要涉及服务区运营统计、车辆通行量、基础信息管理及路段关联,包含以下业务数据:
+核心业务实体:
+- 服务区:描述高速公路服务区基础信息,主要字段:服务区名称、服务区编码、地理坐标、服务区类型、运营状态
+- 车辆类型:描述通行车辆分类维度,主要字段:车辆类别(其他、危化品、城际、过境)
+- 路段路线:描述高速公路路段与路线归属关系,主要字段:路段名称、路线名称、路段编号
+- 合作公司:描述服务区所属分公司信息,主要字段:分公司名称、公司编码
+关键业务指标:
+- 营收指标:包含微信/支付宝/现金/行吧/金豆支付金额及订单数、支付总金额、订单总数
+- 车辆流量:按类型统计的日通行车辆数量

+ 10 - 0
data_pipeline/training_data/task_20250701_131627/filename_mapping.txt

@@ -0,0 +1,10 @@
+# 文件名映射报告
+# 格式: 原始表名 -> 实际文件名
+
+public.bss_business_day_data -> bss_business_day_data_detail.md
+public.bss_car_day_count -> bss_car_day_count_detail.md
+public.bss_company -> bss_company_detail.md
+public.bss_section_route -> bss_section_route_detail.md
+public.bss_section_route_area_link -> bss_section_route_area_link_detail.md
+public.bss_service_area -> bss_service_area_detail.md
+public.bss_service_area_mapper -> bss_service_area_mapper_detail.md

+ 62 - 0
data_pipeline/training_data/task_20250701_131627/metadata.txt

@@ -0,0 +1,62 @@
+-- Schema Tools生成的主题元数据
+-- 业务背景: 高速公路服务区管理系统
+-- 生成时间: 2025-07-01 13:47:36
+-- 数据库: highway_db
+
+-- 创建表(如果不存在)
+CREATE TABLE IF NOT EXISTS metadata (
+    id SERIAL PRIMARY KEY,    -- 主键
+    topic_name VARCHAR(100) NOT NULL,  -- 业务主题名称
+    description TEXT,                  -- 业务主体说明
+    related_tables TEXT[],			  -- 相关表名
+    biz_entities TEXT[],               -- 主要业务实体名称
+    biz_metrics TEXT[],                -- 主要业务指标名称
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP    -- 插入时间
+);
+
+-- 插入主题数据
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '日营收结构',
+  '分析各服务区每日营收构成及支付方式占比,优化资金管理策略',
+  'bss_business_day_data',
+  '服务区,支付方式,档口',
+  '总营收,现金占比,移动支付比例'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '车流高峰分析',
+  '通过车辆统计表识别服务区高峰时段及车型分布,指导资源调度',
+  'bss_car_day_count,bss_service_area',
+  '服务区,车辆类型,统计日期',
+  '日均车流,高峰时段,危化品车辆占比'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '分公司对比',
+  '比较不同分公司的服务区运营效率及营收能力,发现管理差异',
+  'bss_company,bss_service_area,bss_business_day_data',
+  '分公司,服务区,运营指标',
+  '人均营收,客单价,订单密度'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '路线关联分析',
+  '研究路段路线与服务区的关联关系,优化路线规划和服务区配置',
+  'bss_section_route,bss_section_route_area_link,bss_car_day_count',
+  '路段,路线,服务区',
+  '路线车流,服务区覆盖率,路线营收贡献'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '节假日效应',
+  '分析节假日前后服务区营收和车流变化,制定营销和服务方案',
+  'bss_business_day_data,bss_car_day_count',
+  '服务区,节假日,支付方式',
+  '节前增幅,节假日营收占比,车流增长率'
+);
+

+ 20 - 0
data_pipeline/training_data/task_20250701_131627/metadata_detail.md

@@ -0,0 +1,20 @@
+## metadata(存储分析主题元数据)
+
+`metadata` 主要描述了当前数据库包含了哪些数据内容,哪些分析主题,哪些指标等等。
+
+字段列表:
+
+- `id` (serial) - 主键ID [主键, 非空]
+- `topic_name` (varchar(100)) - 业务主题名称 [非空]
+- `description` (text) - 业务主题说明
+- `related_tables` (text[]) - 涉及的数据表 [示例: bss_business_day_data, bss_section_route_area_link]
+- `biz_entities` (text[]) - 主要业务实体名称 [示例: 车辆类型, 节假日, 路线]
+- `biz_metrics` (text[]) - 主要业务指标名称 [示例: 总营收, 现金占比, 人均营收]
+- `created_at` (timestamp) - 插入时间 [默认值: `CURRENT_TIMESTAMP`]
+
+字段补充说明:
+
+- `id` 为主键,自增;
+- `related_tables` 用于建立主题与具体明细表的依赖关系;
+- `biz_entities` 表示主题关注的核心对象,例如服务区、车辆、公司;
+- `biz_metrics` 表示该主题关注的业务分析指标,例如营收对比、趋势变化、占比结构等。

+ 190 - 0
data_pipeline/training_data/task_20250701_131627/qs_highway_db_20250701_134736_pair.json

@@ -0,0 +1,190 @@
+[
+  {
+    "question": "统计2023年4月1日各服务区的总营收及现金支付金额占比",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, SUM(rmb)/SUM(pay_sum)*100 AS 现金支付占比 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "分析2023年第一季度各支付方式在总营收中的占比变化趋势",
+    "sql": "SELECT oper_date AS 统计日期, SUM(wx)/SUM(pay_sum)*100 AS 微信占比, SUM(zfb)/SUM(pay_sum)*100 AS 支付宝占比, SUM(rmb)/SUM(pay_sum)*100 AS 现金占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "查询最近7天总营收最高的前5个服务区及其移动支付比例",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, (SUM(wx)+SUM(zfb))/SUM(pay_sum)*100 AS 移动支付比例 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND oper_date < CURRENT_DATE AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "对比不同档口的现金支付订单占比并按占比排序",
+    "sql": "SELECT branch_name AS 档口名称, SUM(rmb_order)/SUM(order_sum)*100 AS 现金订单占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 现金订单占比 DESC;"
+  },
+  {
+    "question": "计算宜春服务区2023年各季度月均营收及最大单日营收",
+    "sql": "SELECT EXTRACT(QUARTER FROM oper_date) AS 季度, AVG(pay_sum) AS 月均营收, MAX(pay_sum) AS 最大单日营收 FROM bss_business_day_data WHERE service_name = '宜春服务区' AND EXTRACT(YEAR FROM oper_date) = 2023 AND delete_ts IS NULL GROUP BY 季度 ORDER BY 季度;"
+  },
+  {
+    "question": "统计2023年4月各服务区订单总数及总营收并按营收排名",
+    "sql": "SELECT service_name AS 服务区名称, SUM(order_sum) AS 订单总数, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "查询最近一天移动支付占比超过80%的服务区信息",
+    "sql": "SELECT service_name AS 服务区名称, (wx+zfb)/pay_sum*100 AS 移动支付比例 FROM bss_business_day_data WHERE oper_date = (SELECT MAX(oper_date) FROM bss_business_day_data WHERE delete_ts IS NULL) AND (wx+zfb)/pay_sum > 0.8 AND delete_ts IS NULL ORDER BY 移动支付比例 DESC;"
+  },
+  {
+    "question": "分析庐山服务区2023年各星期的营收分布情况",
+    "sql": "SELECT EXTRACT(ISODOW FROM oper_date) AS 星期, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE service_name = '庐山服务区' AND EXTRACT(YEAR FROM oper_date) = 2023 AND delete_ts IS NULL GROUP BY 星期 ORDER BY 星期;"
+  },
+  {
+    "question": "统计最近一天总营收超过1万元且现金占比低于10%的服务区",
+    "sql": "SELECT service_name AS 服务区名称, pay_sum AS 总营收, rmb/pay_sum*100 AS 现金占比 FROM bss_business_day_data WHERE oper_date = (SELECT MAX(oper_date) FROM bss_business_day_data WHERE delete_ts IS NULL) AND pay_sum > 10000 AND rmb/pay_sum < 0.1 AND delete_ts IS NULL ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "对比宜春和南昌南服务区最近30天各支付方式的平均日营收",
+    "sql": "SELECT service_name AS 服务区名称, AVG(wx) AS 日均微信营收, AVG(zfb) AS 日均支付宝营收, AVG(rmb) AS 日均现金营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 30 AND service_name IN ('宜春服务区','南昌南服务区') AND delete_ts IS NULL GROUP BY service_name ORDER BY 服务区名称;"
+  },
+  {
+    "question": "统计各服务区日均车流量并按车流由高到低排序",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, AVG(cc.customer_count) AS 日均车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 日均车流量 DESC;"
+  },
+  {
+    "question": "查询危化品车辆占比超过5%的服务区信息",
+    "sql": "SELECT sa.service_area_name, ROUND((SUM(CASE WHEN cc.car_type='危化品' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count))::numeric,2) AS 危化品占比 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name HAVING SUM(CASE WHEN cc.car_type='危化品' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count) > 5 ORDER BY 危化品占比 DESC;"
+  },
+  {
+    "question": "分析最近30天各车型日均通行量变化趋势",
+    "sql": "SELECT count_date AS 统计日期, car_type AS 车型, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date >= CURRENT_DATE - 30 AND delete_ts IS NULL GROUP BY count_date, car_type ORDER BY count_date;"
+  },
+  {
+    "question": "对比周末与工作日车流量差异",
+    "sql": "SELECT CASE WHEN EXTRACT(DOW FROM count_date) IN (0,6) THEN '周末' ELSE '工作日' END AS 时段类型, AVG(customer_count) AS 平均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY 时段类型;"
+  },
+  {
+    "question": "获取各服务区过境车辆占比TOP5",
+    "sql": "SELECT sa.service_area_name, ROUND((SUM(CASE WHEN cc.car_type='过境' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count))::numeric,2) AS 过境占比 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 过境占比 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计最近一周每日总车流量及环比增长率",
+    "sql": "WITH daily_total AS (SELECT count_date, SUM(customer_count) AS total FROM bss_car_day_count WHERE count_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY count_date) SELECT count_date, total, LAG(total) OVER(ORDER BY count_date) AS 前一日流量, ROUND(((total - LAG(total) OVER(ORDER BY count_date))*100.0/LAG(total) OVER(ORDER BY count_date))::numeric,2) AS 环比增长率 FROM daily_total;"
+  },
+  {
+    "question": "查询连续3天车流量增长的服务区",
+    "sql": "WITH daily_growth AS (SELECT service_area_id, count_date, SUM(customer_count) AS daily_count, LAG(SUM(customer_count),1) OVER(PARTITION BY service_area_id ORDER BY count_date) AS prev_count FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id, count_date) SELECT sa.service_area_name FROM (SELECT service_area_id FROM daily_growth WHERE daily_count > prev_count GROUP BY service_area_id, count_date - generate_series(0,2)) t JOIN bss_service_area sa ON t.service_area_id = sa.id;"
+  },
+  {
+    "question": "统计各车辆类型在不同时间段的分布比例",
+    "sql": "SELECT car_type AS 车型, EXTRACT(HOUR FROM create_ts)::integer AS 小时段, ROUND(AVG(customer_count)::numeric,0) AS 平均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type, 小时段 ORDER BY 小时段;"
+  },
+  {
+    "question": "获取昨日车流量最高的3个服务区及对应车型分布",
+    "sql": "SELECT sa.service_area_name, cc.car_type, cc.customer_count FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date = CURRENT_DATE - 1 AND sa.delete_ts IS NULL ORDER BY cc.customer_count DESC LIMIT 3;"
+  },
+  {
+    "question": "分析各区域城际车辆通行量与服务区开放状态的关系",
+    "sql": "SELECT sa.service_state AS 开放状态, AVG(CASE WHEN cc.car_type='城际' THEN cc.customer_count ELSE 0 END) AS 平均城际车流量 FROM bss_car_day_count cc RIGHT JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE sa.delete_ts IS NULL GROUP BY sa.service_state;"
+  },
+  {
+    "question": "各分公司2023年4月人均营收TOP5(按支付总额/车流量计算)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.pay_sum)/SUM(car.customer_count) AS 人均营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no JOIN bss_car_day_count car ON sa.id = car.service_area_id AND bd.oper_date = car.count_date WHERE bd.oper_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY c.company_name ORDER BY 人均营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "2023年Q2各分公司客单价对比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, AVG(bd.pay_sum/bd.order_sum) AS 客单价 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date BETWEEN '2023-04-01' AND '2023-06-30' GROUP BY c.company_name ORDER BY 客单价 DESC;"
+  },
+  {
+    "question": "最近一周订单密度(订单数/面积)最低的3个分公司",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.order_sum)/COUNT(DISTINCT sa.id) AS 订单密度 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date >= CURRENT_DATE - 7 GROUP BY c.company_name ORDER BY 订单密度 ASC LIMIT 3;"
+  },
+  {
+    "question": "各分公司2023年节假日营收总额环比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 1 THEN bd.pay_sum ELSE 0 END) AS 一月营收, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 2 THEN bd.pay_sum ELSE 0 END) AS 二月营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(YEAR FROM bd.oper_date) = 2023 GROUP BY c.company_name;"
+  },
+  {
+    "question": "2023-04-01当日各分公司运营指标对比(支付总额、订单数、车流量)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.pay_sum) AS 支付总额, SUM(bd.order_sum) AS 订单总数, SUM(car.customer_count) AS 车流量 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no JOIN bss_car_day_count car ON sa.id = car.service_area_id WHERE bd.oper_date = '2023-04-01' GROUP BY c.company_name ORDER BY 支付总额 DESC;"
+  },
+  {
+    "question": "各分公司微信支付占比分析(近30天)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.wx) / SUM(bd.pay_sum) * 100 AS 微信占比百分比 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date >= CURRENT_DATE - 30 GROUP BY c.company_name ORDER BY 微信占比百分比 DESC;"
+  },
+  {
+    "question": "各分公司服务区数量与营收能力关联分析",
+    "sql": "SELECT c.company_name AS 分公司名称, COUNT(sa.id) AS 服务区数量, SUM(bd.pay_sum) AS 总营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no GROUP BY c.company_name ORDER BY 服务区数量 DESC, 总营收 DESC;"
+  },
+  {
+    "question": "2023年各分公司月均订单密度趋势分析",
+    "sql": "SELECT c.company_name AS 分公司名称, EXTRACT(MONTH FROM bd.oper_date) AS 月份, AVG(bd.order_sum) AS 月均订单密度 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(YEAR FROM bd.oper_date) = 2023 GROUP BY c.company_name, 月份 ORDER BY 分公司名称, 月份;"
+  },
+  {
+    "question": "各分公司不同支付方式订单数占比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.wx_order)/SUM(bd.order_sum)*100 AS 微信占比, SUM(bd.zf_order)/SUM(bd.order_sum)*100 AS 支付宝占比 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no GROUP BY c.company_name ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "2023年Q2各分公司营收增长率分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 4 THEN bd.pay_sum ELSE 0 END) / SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 5 THEN bd.pay_sum ELSE 0 END) - 1 AS 月增长率 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(QUARTER FROM bd.oper_date) = 2 GROUP BY c.company_name ORDER BY 月增长率 DESC;"
+  },
+  {
+    "question": "统计各路线关联的服务区数量及平均车流量,按服务区数量降序排列",
+    "sql": "SELECT r.route_name AS 路线名称, COUNT(l.service_area_id) AS 服务区数量, AVG(c.customer_count) AS 平均车流量 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id LEFT JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE r.delete_ts IS NULL GROUP BY r.route_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "计算2023年Q2各路段日均车流量,筛选出日均车流量>1000的路段",
+    "sql": "SELECT s.section_name AS 路段名称, COUNT(*) AS 天数, AVG(c.customer_count) AS 日均车流量 FROM bss_section_route s JOIN bss_section_route_area_link l ON s.id = l.section_route_id JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE c.count_date BETWEEN '2023-04-01' AND '2023-06-30' AND s.delete_ts IS NULL GROUP BY s.section_name HAVING AVG(c.customer_count) > 1000;"
+  },
+  {
+    "question": "查询2023年车流量TOP5服务区及对应路线信息",
+    "sql": "SELECT a.service_area_name AS 服务区名称, r.route_name AS 路线名称, SUM(c.customer_count) AS 总车流量 FROM bss_service_area a JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_car_day_count c ON a.id = c.service_area_id WHERE c.count_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY a.service_area_name, r.route_name ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计未关联服务区的路段清单及创建时间",
+    "sql": "SELECT r.section_name AS 路段名称, r.create_ts AS 创建时间 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id WHERE l.service_area_id IS NULL AND r.delete_ts IS NULL;"
+  },
+  {
+    "question": "分析春运期间(2023-01-07至2023-02-16)各路线车流变化趋势",
+    "sql": "SELECT r.route_name AS 路线名称, c.count_date AS 日期, SUM(c.customer_count) AS 总车流量 FROM bss_section_route r JOIN bss_section_route_area_link l ON r.id = l.section_route_id JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE c.count_date BETWEEN '2023-01-07' AND '2023-02-16' GROUP BY r.route_name, c.count_date ORDER BY 日期;"
+  },
+  {
+    "question": "计算各服务区车流覆盖率(关联路段车流/总车流)TOP10",
+    "sql": "SELECT a.service_area_name AS 服务区名称, SUM(c.customer_count) AS 关联车流, (SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = a.id) AS 总车流, ROUND((SUM(c.customer_count)/(SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = a.id)) * 100)::numeric(5,2) AS 覆盖率 FROM bss_service_area a JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_car_day_count c ON a.id = c.service_area_id GROUP BY a.id, a.service_area_name ORDER BY 覆盖率 DESC LIMIT 10;"
+  },
+  {
+    "question": "分析不同分公司管辖路段的服务区密度(服务区数/路段长度)",
+    "sql": "SELECT c.company_name AS 分公司名称, COUNT(a.id) AS 服务区数量, SUM(LENGTH(s.code)) AS 路段总长度, ROUND((COUNT(a.id)/SUM(LENGTH(s.code))) * 1000)::numeric(5,2) AS 密度_每千米 FROM bss_company c JOIN bss_service_area a ON c.id = a.company_id JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_section_route s ON l.section_route_id = s.id GROUP BY c.company_name;"
+  },
+  {
+    "question": "分析2023年国庆节期间各服务区营收总额及环比增长率",
+    "sql": "WITH holiday_revenue AS (SELECT service_name, SUM(pay_sum) AS holiday_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_name), pre_holiday_revenue AS (SELECT service_name, SUM(pay_sum) AS pre_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-09-30' AND delete_ts IS NULL GROUP BY service_name) SELECT h.service_name, h.holiday_amount, ROUND((h.holiday_amount - p.pre_amount)/p.pre_amount*100, 2) AS growth_rate FROM holiday_revenue h JOIN pre_holiday_revenue p ON h.service_name = p.service_name ORDER BY growth_rate DESC;"
+  },
+  {
+    "question": "统计2023年春节期间各服务区节假日营收占Q1季度总营收比例",
+    "sql": "WITH q1_revenue AS (SELECT service_name, SUM(pay_sum) AS q1_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY service_name), lunar_revenue AS (SELECT service_name, SUM(pay_sum) AS lunar_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-20' AND '2023-01-27' AND delete_ts IS NULL GROUP BY service_name) SELECT q.service_name, ROUND(l.lunar_amount/q.q1_amount*100, 2) AS ratio FROM q1_revenue q JOIN lunar_revenue l ON q.service_name = l.service_name ORDER BY ratio DESC;"
+  },
+  {
+    "question": "对比2023年国庆节期间不同支付方式金额占比",
+    "sql": "SELECT '微信' AS pay_type, ROUND(SUM(wx)/SUM(pay_sum)*100, 2) AS ratio FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL UNION ALL SELECT '支付宝', ROUND(SUM(zfb)/SUM(pay_sum)*100, 2) FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL UNION ALL SELECT '现金', ROUND(SUM(rmb)/SUM(pay_sum)*100, 2) FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "分析节假日与非节假日各服务区日均车流量增长率",
+    "sql": "WITH holiday_avg AS (SELECT service_area_id, AVG(customer_count) AS holiday_avg FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_area_id), non_holiday_avg AS (SELECT service_area_id, AVG(customer_count) AS non_holiday_avg FROM bss_car_day_count WHERE count_date NOT BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_area_id) SELECT h.service_area_id, ROUND((h.holiday_avg - n.non_holiday_avg)/n.non_holiday_avg*100, 2) AS growth_rate FROM holiday_avg h JOIN non_holiday_avg n ON h.service_area_id = n.service_area_id ORDER BY growth_rate DESC LIMIT 10;"
+  },
+  {
+    "question": "统计节假日车流最高峰时段的车辆类型分布",
+    "sql": "SELECT car_type, SUM(customer_count) AS total_cars FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND EXTRACT(HOUR FROM create_ts) BETWEEN 8 AND 10 AND delete_ts IS NULL GROUP BY car_type ORDER BY total_cars DESC;"
+  },
+  {
+    "question": "对比2023年五一假期与清明假期营收增幅排名TOP5服务区",
+    "sql": "WITH may_revenue AS (SELECT service_name, SUM(pay_sum) AS may_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-29' AND '2023-05-03' AND delete_ts IS NULL GROUP BY service_name), qingming_revenue AS (SELECT service_name, SUM(pay_sum) AS qingming_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-05' AND '2023-04-07' AND delete_ts IS NULL GROUP BY service_name) SELECT m.service_name, ROUND((m.may_amount - q.qingming_amount)/q.qingming_amount*100, 2) AS growth_rate FROM may_revenue m JOIN qingming_revenue q ON m.service_name = q.service_name ORDER BY growth_rate DESC LIMIT 5;"
+  },
+  {
+    "question": "分析节假日现金支付比例变化趋势",
+    "sql": "SELECT oper_date, ROUND(SUM(rmb)/SUM(pay_sum)*100, 2) AS cash_ratio FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-10-07' AND delete_ts IS NULL GROUP BY oper_date ORDER BY oper_date;"
+  },
+  {
+    "question": "统计危化品车辆节假日期间通行量同比增幅",
+    "sql": "WITH holiday_2022 AS (SELECT COUNT(*) AS cnt_2022 FROM bss_car_day_count WHERE count_date BETWEEN '2022-10-01' AND '2022-10-07' AND car_type = '危化品' AND delete_ts IS NULL), holiday_2023 AS (SELECT COUNT(*) AS cnt_2023 FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND car_type = '危化品' AND delete_ts IS NULL) SELECT ROUND((cnt_2023 - cnt_2022)/cnt_2022*100, 2) AS growth_rate FROM holiday_2022, holiday_2023;"
+  },
+  {
+    "question": "查询2023年国庆节期间营收增幅超过50%的服务区清单",
+    "sql": "WITH pre_data AS (SELECT service_name, SUM(pay_sum) AS pre_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-09-30' AND delete_ts IS NULL GROUP BY service_name), holiday_data AS (SELECT service_name, SUM(pay_sum) AS holiday_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_name) SELECT h.service_name, ROUND((h.holiday_amount - p.pre_amount)/p.pre_amount*100, 2) AS growth_rate FROM holiday_data h JOIN pre_data p ON h.service_name = p.service_name WHERE (h.holiday_amount - p.pre_amount)/p.pre_amount > 0.5 ORDER BY growth_rate DESC;"
+  },
+  {
+    "question": "分析节假日期间城际车辆流量与服务区地理位置的关系",
+    "sql": "SELECT s.service_area_name, s.service_position, AVG(c.customer_count) AS avg_traffic FROM bss_car_day_count c JOIN bss_service_area s ON c.service_area_id = s.id WHERE c.car_type = '城际' AND c.count_date BETWEEN '2023-10-01' AND '2023-10-07' AND c.delete_ts IS NULL GROUP BY s.service_area_name, s.service_position ORDER BY avg_traffic DESC;"
+  }
+]

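(示例说明,非本次提交文件内容)上面的 Question-SQL 训练对文件可以在训练加载步骤之前先做一次简单校验,下面是一个读取与校验的草图,文件路径按本次提交的任务目录填写:

    import json
    from pathlib import Path

    pair_file = Path("data_pipeline/training_data/task_20250701_131627/"
                     "qs_highway_db_20250701_134736_pair.json")

    pairs = json.loads(pair_file.read_text(encoding="utf-8"))
    for item in pairs:
        # 每个条目必须同时包含 question 与 sql 字段
        assert {"question", "sql"} <= item.keys(), f"缺少字段: {item}"

    print(f"共加载 {len(pairs)} 组问答训练对")
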
+ 202 - 0
data_pipeline/training_data/task_20250701_131627/qs_highway_db_20250701_134736_pair.json.backup

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "统计2023年4月1日各服务区的总营收及现金支付金额占比",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, SUM(rmb)/SUM(pay_sum)*100 AS 现金支付占比 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "分析2023年第一季度各支付方式在总营收中的占比变化趋势",
+    "sql": "SELECT oper_date AS 统计日期, SUM(wx)/SUM(pay_sum)*100 AS 微信占比, SUM(zfb)/SUM(pay_sum)*100 AS 支付宝占比, SUM(rmb)/SUM(pay_sum)*100 AS 现金占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY oper_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "查询最近7天总营收最高的前5个服务区及其移动支付比例",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, (SUM(wx)+SUM(zfb))/SUM(pay_sum)*100 AS 移动支付比例 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND oper_date < CURRENT_DATE AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "对比不同档口的现金支付订单占比并按占比排序",
+    "sql": "SELECT branch_name AS 档口名称, SUM(rmb_order)/SUM(order_sum)*100 AS 现金订单占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 现金订单占比 DESC;"
+  },
+  {
+    "question": "计算宜春服务区2023年各季度月均营收及最大单日营收",
+    "sql": "SELECT EXTRACT(QUARTER FROM oper_date) AS 季度, AVG(pay_sum) AS 月均营收, MAX(pay_sum) AS 最大单日营收 FROM bss_business_day_data WHERE service_name = '宜春服务区' AND EXTRACT(YEAR FROM oper_date) = 2023 AND delete_ts IS NULL GROUP BY 季度 ORDER BY 季度;"
+  },
+  {
+    "question": "统计2023年4月各服务区订单总数及总营收并按营收排名",
+    "sql": "SELECT service_name AS 服务区名称, SUM(order_sum) AS 订单总数, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "查询最近一天移动支付占比超过80%的服务区信息",
+    "sql": "SELECT service_name AS 服务区名称, (wx+zfb)/pay_sum*100 AS 移动支付比例 FROM bss_business_day_data WHERE oper_date = (SELECT MAX(oper_date) FROM bss_business_day_data WHERE delete_ts IS NULL) AND (wx+zfb)/pay_sum > 0.8 AND delete_ts IS NULL ORDER BY 移动支付比例 DESC;"
+  },
+  {
+    "question": "分析庐山服务区2023年各星期的营收分布情况",
+    "sql": "SELECT EXTRACT(ISODOW FROM oper_date) AS 星期, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE service_name = '庐山服务区' AND EXTRACT(YEAR FROM oper_date) = 2023 AND delete_ts IS NULL GROUP BY 星期 ORDER BY 星期;"
+  },
+  {
+    "question": "统计最近一天总营收超过1万元且现金占比低于10%的服务区",
+    "sql": "SELECT service_name AS 服务区名称, pay_sum AS 总营收, rmb/pay_sum*100 AS 现金占比 FROM bss_business_day_data WHERE oper_date = (SELECT MAX(oper_date) FROM bss_business_day_data WHERE delete_ts IS NULL) AND pay_sum > 10000 AND rmb/pay_sum < 0.1 AND delete_ts IS NULL ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "对比宜春和南昌南服务区最近30天各支付方式的平均日营收",
+    "sql": "SELECT service_name AS 服务区名称, AVG(wx) AS 日均微信营收, AVG(zfb) AS 日均支付宝营收, AVG(rmb) AS 日均现金营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 30 AND service_name IN ('宜春服务区','南昌南服务区') AND delete_ts IS NULL GROUP BY service_name ORDER BY 服务区名称;"
+  },
+  {
+    "question": "统计各服务区日均车流量并按车流由高到低排序",
+    "sql": "SELECT sa.service_area_name AS 服务区名称, AVG(cc.customer_count) AS 日均车流量 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 日均车流量 DESC;"
+  },
+  {
+    "question": "查询危化品车辆占比超过5%的服务区信息",
+    "sql": "SELECT sa.service_area_name, ROUND((SUM(CASE WHEN cc.car_type='危化品' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count))::numeric,2) AS 危化品占比 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name HAVING SUM(CASE WHEN cc.car_type='危化品' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count) > 5 ORDER BY 危化品占比 DESC;"
+  },
+  {
+    "question": "分析最近30天各车型日均通行量变化趋势",
+    "sql": "SELECT count_date AS 统计日期, car_type AS 车型, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date >= CURRENT_DATE - 30 AND delete_ts IS NULL GROUP BY count_date, car_type ORDER BY count_date;"
+  },
+  {
+    "question": "对比周末与工作日车流量差异",
+    "sql": "SELECT CASE WHEN EXTRACT(DOW FROM count_date) IN (0,6) THEN '周末' ELSE '工作日' END AS 时段类型, AVG(customer_count) AS 平均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY 时段类型;"
+  },
+  {
+    "question": "获取各服务区过境车辆占比TOP5",
+    "sql": "SELECT sa.service_area_name, ROUND((SUM(CASE WHEN cc.car_type='过境' THEN cc.customer_count ELSE 0 END)*100.0/SUM(cc.customer_count))::numeric,2) AS 过境占比 FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.delete_ts IS NULL AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 过境占比 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计最近一周每日总车流量及环比增长率",
+    "sql": "WITH daily_total AS (SELECT count_date, SUM(customer_count) AS total FROM bss_car_day_count WHERE count_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY count_date) SELECT count_date, total, LAG(total) OVER(ORDER BY count_date) AS 前一日流量, ROUND(((total - LAG(total) OVER(ORDER BY count_date))*100.0/LAG(total) OVER(ORDER BY count_date))::numeric,2) AS 环比增长率 FROM daily_total;"
+  },
+  {
+    "question": "查询连续3天车流量增长的服务区",
+    "sql": "WITH daily_growth AS (SELECT service_area_id, count_date, SUM(customer_count) AS daily_count, LAG(SUM(customer_count),1) OVER(PARTITION BY service_area_id ORDER BY count_date) AS prev_count FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id, count_date) SELECT sa.service_area_name FROM (SELECT service_area_id FROM daily_growth WHERE daily_count > prev_count GROUP BY service_area_id, count_date - generate_series(0,2)) t JOIN bss_service_area sa ON t.service_area_id = sa.id;"
+  },
+  {
+    "question": "统计各车辆类型在不同时间段的分布比例",
+    "sql": "SELECT car_type AS 车型, EXTRACT(HOUR FROM create_ts)::integer AS 小时段, ROUND(AVG(customer_count)::numeric,0) AS 平均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type, 小时段 ORDER BY 小时段;"
+  },
+  {
+    "question": "获取昨日车流量最高的3个服务区及对应车型分布",
+    "sql": "SELECT sa.service_area_name, cc.car_type, cc.customer_count FROM bss_car_day_count cc JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date = CURRENT_DATE - 1 AND sa.delete_ts IS NULL ORDER BY cc.customer_count DESC LIMIT 3;"
+  },
+  {
+    "question": "分析各区域城际车辆通行量与服务区开放状态的关系",
+    "sql": "SELECT sa.service_state AS 开放状态, AVG(CASE WHEN cc.car_type='城际' THEN cc.customer_count ELSE 0 END) AS 平均城际车流量 FROM bss_car_day_count cc RIGHT JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE sa.delete_ts IS NULL GROUP BY sa.service_state;"
+  },
+  {
+    "question": "各分公司2023年4月人均营收TOP5(按支付总额/车流量计算)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.pay_sum)/SUM(car.customer_count) AS 人均营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no JOIN bss_car_day_count car ON sa.id = car.service_area_id AND bd.oper_date = car.count_date WHERE bd.oper_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY c.company_name ORDER BY 人均营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "2023年Q2各分公司客单价对比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, AVG(bd.pay_sum/bd.order_sum) AS 客单价 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date BETWEEN '2023-04-01' AND '2023-06-30' GROUP BY c.company_name ORDER BY 客单价 DESC;"
+  },
+  {
+    "question": "最近一周订单密度(订单数/面积)最低的3个分公司",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.order_sum)/COUNT(DISTINCT sa.id) AS 订单密度 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date >= CURRENT_DATE - 7 GROUP BY c.company_name ORDER BY 订单密度 ASC LIMIT 3;"
+  },
+  {
+    "question": "各分公司2023年节假日营收总额环比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 1 THEN bd.pay_sum ELSE 0 END) AS 一月营收, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 2 THEN bd.pay_sum ELSE 0 END) AS 二月营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(YEAR FROM bd.oper_date) = 2023 GROUP BY c.company_name;"
+  },
+  {
+    "question": "2023-04-01当日各分公司运营指标对比(支付总额、订单数、车流量)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.pay_sum) AS 支付总额, SUM(bd.order_sum) AS 订单总数, SUM(car.customer_count) AS 车流量 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no JOIN bss_car_day_count car ON sa.id = car.service_area_id WHERE bd.oper_date = '2023-04-01' GROUP BY c.company_name ORDER BY 支付总额 DESC;"
+  },
+  {
+    "question": "各分公司微信支付占比分析(近30天)",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.wx) / SUM(bd.pay_sum) * 100 AS 微信占比百分比 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date >= CURRENT_DATE - 30 GROUP BY c.company_name ORDER BY 微信占比百分比 DESC;"
+  },
+  {
+    "question": "各分公司服务区数量与营收能力关联分析",
+    "sql": "SELECT c.company_name AS 分公司名称, COUNT(sa.id) AS 服务区数量, SUM(bd.pay_sum) AS 总营收 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no GROUP BY c.company_name ORDER BY 服务区数量 DESC, 总营收 DESC;"
+  },
+  {
+    "question": "2023年各分公司月均订单密度趋势分析",
+    "sql": "SELECT c.company_name AS 分公司名称, EXTRACT(MONTH FROM bd.oper_date) AS 月份, AVG(bd.order_sum) AS 月均订单密度 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(YEAR FROM bd.oper_date) = 2023 GROUP BY c.company_name, 月份 ORDER BY 分公司名称, 月份;"
+  },
+  {
+    "question": "各分公司不同支付方式订单数占比分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(bd.wx_order)/SUM(bd.order_sum)*100 AS 微信占比, SUM(bd.zf_order)/SUM(bd.order_sum)*100 AS 支付宝占比 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no GROUP BY c.company_name ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "2023年Q2各分公司营收增长率分析",
+    "sql": "SELECT c.company_name AS 分公司名称, SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 4 THEN bd.pay_sum ELSE 0 END) / SUM(CASE WHEN EXTRACT(MONTH FROM bd.oper_date) = 5 THEN bd.pay_sum ELSE 0 END) - 1 AS 月增长率 FROM bss_company c JOIN bss_service_area sa ON c.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE EXTRACT(QUARTER FROM bd.oper_date) = 2 GROUP BY c.company_name ORDER BY 月增长率 DESC;"
+  },
+  {
+    "question": "统计各路线关联的服务区数量及平均车流量,按服务区数量降序排列",
+    "sql": "SELECT r.route_name AS 路线名称, COUNT(l.service_area_id) AS 服务区数量, AVG(c.customer_count) AS 平均车流量 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id LEFT JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE r.delete_ts IS NULL GROUP BY r.route_name ORDER BY 服务区数量 DESC;"
+  },
+  {
+    "question": "计算2023年Q2各路段日均车流量,筛选出日均车流量>1000的路段",
+    "sql": "SELECT s.section_name AS 路段名称, COUNT(*) AS 天数, AVG(c.customer_count) AS 日均车流量 FROM bss_section_route s JOIN bss_section_route_area_link l ON s.id = l.section_route_id JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE c.count_date BETWEEN '2023-04-01' AND '2023-06-30' AND s.delete_ts IS NULL GROUP BY s.section_name HAVING AVG(c.customer_count) > 1000;"
+  },
+  {
+    "question": "查询2023年车流量TOP5服务区及对应路线信息",
+    "sql": "SELECT a.service_area_name AS 服务区名称, r.route_name AS 路线名称, SUM(c.customer_count) AS 总车流量 FROM bss_service_area a JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_car_day_count c ON a.id = c.service_area_id WHERE c.count_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY a.service_area_name, r.route_name ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析各路线服务区营收贡献占比,按微信支付金额排序",
+    "sql": "SELECT r.route_name AS 路线名称, SUM(b.wx) AS 微信支付总额, SUM(b.pay_sum) AS 总营收, ROUND((SUM(b.wx)/SUM(b.pay_sum))*100, 2) AS 微信占比 FROM bss_section_route r JOIN bss_section_route_area_link l ON r.id = l.section_route_id JOIN bss_business_day_data b ON l.service_area_id = b.service_area_id WHERE b.oper_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY r.route_name ORDER BY 微信支付总额 DESC;"
+  },
+  {
+    "question": "对比不同车辆类型在各路线的分布比例",
+    "sql": "SELECT r.route_name AS 路线名称, c.car_type AS 车辆类型, COUNT(*) AS 记录数, ROUND((COUNT(*)/(SELECT COUNT(*) FROM bss_car_day_count WHERE service_area_id IN (SELECT service_area_id FROM bss_section_route_area_link WHERE section_route_id = r.id))) * 100)::numeric(5,2) AS 占比百分比 FROM bss_car_day_count c JOIN bss_section_route_area_link l ON c.service_area_id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id GROUP BY r.route_name, c.car_type;"
+  },
+  {
+    "question": "统计未关联服务区的路段清单及创建时间",
+    "sql": "SELECT r.section_name AS 路段名称, r.create_ts AS 创建时间 FROM bss_section_route r LEFT JOIN bss_section_route_area_link l ON r.id = l.section_route_id WHERE l.service_area_id IS NULL AND r.delete_ts IS NULL;"
+  },
+  {
+    "question": "分析春运期间(2023-01-07至2023-02-16)各路线车流变化趋势",
+    "sql": "SELECT r.route_name AS 路线名称, c.count_date AS 日期, SUM(c.customer_count) AS 总车流量 FROM bss_section_route r JOIN bss_section_route_area_link l ON r.id = l.section_route_id JOIN bss_car_day_count c ON l.service_area_id = c.service_area_id WHERE c.count_date BETWEEN '2023-01-07' AND '2023-02-16' GROUP BY r.route_name, c.count_date ORDER BY 日期;"
+  },
+  {
+    "question": "计算各服务区车流覆盖率(关联路段车流/总车流)TOP10",
+    "sql": "SELECT a.service_area_name AS 服务区名称, SUM(c.customer_count) AS 关联车流, (SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = a.id) AS 总车流, ROUND((SUM(c.customer_count)/(SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = a.id)) * 100)::numeric(5,2) AS 覆盖率 FROM bss_service_area a JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_car_day_count c ON a.id = c.service_area_id GROUP BY a.service_area_name ORDER BY 覆盖率 DESC LIMIT 10;"
+  },
+  {
+    "question": "查询节假日(2023-10-01至2023-10-07)营收贡献最高的TOP3服务区及对应路线",
+    "sql": "SELECT a.service_area_name AS 服务区名称, r.route_name AS 路线名称, SUM(b.pay_sum) AS 总营收 FROM bss_service_area a JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_section_route r ON l.section_route_id = r.id JOIN bss_business_day_data b ON a.id = b.service_area_id WHERE b.oper_date BETWEEN '2023-10-01' AND '2023-10-07' GROUP BY a.service_area_name, r.route_name ORDER BY 总营收 DESC LIMIT 3;"
+  },
+  {
+    "question": "分析不同分公司管辖路段的服务区密度(服务区数/路段长度)",
+    "sql": "SELECT c.company_name AS 分公司名称, COUNT(a.id) AS 服务区数量, SUM(LENGTH(s.code)) AS 路段总长度, ROUND((COUNT(a.id)/SUM(LENGTH(s.code))) * 1000)::numeric(5,2) AS 密度_每千米 FROM bss_company c JOIN bss_service_area a ON c.id = a.company_id JOIN bss_section_route_area_link l ON a.id = l.service_area_id JOIN bss_section_route s ON l.section_route_id = s.id GROUP BY c.company_name;"
+  },
+  {
+    "question": "分析2023年国庆节期间各服务区营收总额及环比增长率",
+    "sql": "WITH holiday_revenue AS (SELECT service_name, SUM(pay_sum) AS holiday_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_name), pre_holiday_revenue AS (SELECT service_name, SUM(pay_sum) AS pre_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-09-30' AND delete_ts IS NULL GROUP BY service_name) SELECT h.service_name, h.holiday_amount, ROUND((h.holiday_amount - p.pre_amount)/p.pre_amount*100, 2) AS growth_rate FROM holiday_revenue h JOIN pre_holiday_revenue p ON h.service_name = p.service_name ORDER BY growth_rate DESC;"
+  },
+  {
+    "question": "统计2023年春节期间各服务区节假日营收占Q1季度总营收比例",
+    "sql": "WITH q1_revenue AS (SELECT service_name, SUM(pay_sum) AS q1_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY service_name), lunar_revenue AS (SELECT service_name, SUM(pay_sum) AS lunar_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-20' AND '2023-01-27' AND delete_ts IS NULL GROUP BY service_name) SELECT q.service_name, ROUND(l.lunar_amount/q.q1_amount*100, 2) AS ratio FROM q1_revenue q JOIN lunar_revenue l ON q.service_name = l.service_name ORDER BY ratio DESC;"
+  },
+  {
+    "question": "对比2023年国庆节期间不同支付方式金额占比",
+    "sql": "SELECT '微信' AS pay_type, ROUND(SUM(wx)/SUM(pay_sum)*100, 2) AS ratio FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL UNION ALL SELECT '支付宝', ROUND(SUM(zfb)/SUM(pay_sum)*100, 2) FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL UNION ALL SELECT '现金', ROUND(SUM(rmb)/SUM(pay_sum)*100, 2) FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "分析节假日与非节假日各服务区日均车流量增长率",
+    "sql": "WITH holiday_avg AS (SELECT service_area_id, AVG(customer_count) AS holiday_avg FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_area_id), non_holiday_avg AS (SELECT service_area_id, AVG(customer_count) AS non_holiday_avg FROM bss_car_day_count WHERE count_date NOT BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_area_id) SELECT h.service_area_id, ROUND((h.holiday_avg - n.non_holiday_avg)/n.non_holiday_avg*100, 2) AS growth_rate FROM holiday_avg h JOIN non_holiday_avg n ON h.service_area_id = n.service_area_id ORDER BY growth_rate DESC LIMIT 10;"
+  },
+  {
+    "question": "统计节假日车流最高峰时段的车辆类型分布",
+    "sql": "SELECT car_type, SUM(customer_count) AS total_cars FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND EXTRACT(HOUR FROM create_ts) BETWEEN 8 AND 10 AND delete_ts IS NULL GROUP BY car_type ORDER BY total_cars DESC;"
+  },
+  {
+    "question": "对比2023年五一假期与清明假期营收增幅排名TOP5服务区",
+    "sql": "WITH may_revenue AS (SELECT service_name, SUM(pay_sum) AS may_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-29' AND '2023-05-03' AND delete_ts IS NULL GROUP BY service_name), qingming_revenue AS (SELECT service_name, SUM(pay_sum) AS qingming_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-05' AND '2023-04-07' AND delete_ts IS NULL GROUP BY service_name) SELECT m.service_name, ROUND((m.may_amount - q.qingming_amount)/q.qingming_amount*100, 2) AS growth_rate FROM may_revenue m JOIN qingming_revenue q ON m.service_name = q.service_name ORDER BY growth_rate DESC LIMIT 5;"
+  },
+  {
+    "question": "分析节假日现金支付比例变化趋势",
+    "sql": "SELECT oper_date, ROUND(SUM(rmb)/SUM(pay_sum)*100, 2) AS cash_ratio FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-10-07' AND delete_ts IS NULL GROUP BY oper_date ORDER BY oper_date;"
+  },
+  {
+    "question": "统计危化品车辆节假日期间通行量同比增幅",
+    "sql": "WITH holiday_2022 AS (SELECT COUNT(*) AS cnt_2022 FROM bss_car_day_count WHERE count_date BETWEEN '2022-10-01' AND '2022-10-07' AND car_type = '危化品' AND delete_ts IS NULL), holiday_2023 AS (SELECT COUNT(*) AS cnt_2023 FROM bss_car_day_count WHERE count_date BETWEEN '2023-10-01' AND '2023-10-07' AND car_type = '危化品' AND delete_ts IS NULL) SELECT ROUND((cnt_2023 - cnt_2022)/cnt_2022*100, 2) AS growth_rate FROM holiday_2022, holiday_2023;"
+  },
+  {
+    "question": "查询2023年国庆节期间营收增幅超过50%的服务区清单",
+    "sql": "WITH pre_data AS (SELECT service_name, SUM(pay_sum) AS pre_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-09-24' AND '2023-09-30' AND delete_ts IS NULL GROUP BY service_name), holiday_data AS (SELECT service_name, SUM(pay_sum) AS holiday_amount FROM bss_business_day_data WHERE oper_date BETWEEN '2023-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_name) SELECT h.service_name, ROUND((h.holiday_amount - p.pre_amount)/p.pre_amount*100, 2) AS growth_rate FROM holiday_data h JOIN pre_data p ON h.service_name = p.service_name WHERE (h.holiday_amount - p.pre_amount)/p.pre_amount > 0.5 ORDER BY growth_rate DESC;"
+  },
+  {
+    "question": "分析节假日期间城际车辆流量与服务区地理位置的关系",
+    "sql": "SELECT s.service_area_name, s.service_position, AVG(c.customer_count) AS avg_traffic FROM bss_car_day_count c JOIN bss_service_area s ON c.service_area_id = s.id WHERE c.car_type = '城际' AND c.count_date BETWEEN '2023-10-01' AND '2023-10-07' AND c.delete_ts IS NULL GROUP BY s.service_area_name, s.service_position ORDER BY avg_traffic DESC;"
+  }
+]

+ 14 - 0
data_pipeline/training_data/task_20250701_131627/task_config.json

@@ -0,0 +1,14 @@
+{
+  "task_id": "task_20250701_131627",
+  "created_at": "2025-07-01T05:16:27.671265",
+  "parameters": {
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:6432/highway_db",
+    "table_list_file": "data_pipeline/tables.txt",
+    "business_context": "高速公路服务区管理系统",
+    "enable_llm_repair": true,
+    "modify_original_file": true,
+    "enable_sql_validation": true,
+    "enable_training_data_load": true
+  },
+  "output_directory": "data_pipeline\\training_data\\task_20250701_131627"
+}

+ 88 - 0
data_pipeline/training_data/task_20250701_131627/task_result.json

@@ -0,0 +1,88 @@
+{
+  "success": true,
+  "workflow_summary": {
+    "total_duration": 1283.84,
+    "completed_steps": [
+      "ddl_md_generation",
+      "question_sql_generation",
+      "sql_validation",
+      "training_data_load"
+    ],
+    "failed_steps": [],
+    "total_steps": 4,
+    "workflow_started": "2025-07-01T13:30:53.267230",
+    "workflow_completed": "2025-07-01T13:52:17.112211"
+  },
+  "input_parameters": {
+    "db_connection": "postgresql://postgres:***@192.168.67.1:6432/highway_db",
+    "table_list_file": "data_pipeline/tables.txt",
+    "business_context": "高速公路服务区管理系统",
+    "db_name": "highway_db",
+    "output_directory": "data_pipeline\\training_data\\task_20250701_131627",
+    "enable_sql_validation": true,
+    "enable_llm_repair": true,
+    "modify_original_file": true,
+    "enable_training_data_load": true
+  },
+  "processing_results": {
+    "ddl_md_generation": {
+      "total_tables": 7,
+      "processed_successfully": 7,
+      "failed": 0,
+      "files_generated": 14,
+      "duration": 422.30856490135193
+    },
+    "question_sql_generation": {
+      "output_file": "data_pipeline\\training_data\\task_20250701_131627\\qs_highway_db_20250701_134736_pair.json",
+      "total_questions": 50,
+      "total_themes": 5,
+      "successful_themes": 5,
+      "failed_themes": [],
+      "duration": 607.0530173778534
+    },
+    "sql_validation": {
+      "original_sql_count": 50,
+      "valid_sql_count": 47,
+      "invalid_sql_count": 3,
+      "success_rate": 0.94,
+      "repair_stats": {
+        "attempted": 4,
+        "successful": 1,
+        "failed": 3
+      },
+      "file_modification_stats": {
+        "modified": 1,
+        "deleted": 3,
+        "failed_modifications": 0
+      },
+      "average_execution_time": 0.02947342872619629,
+      "total_retries": 0,
+      "duration": 236.6604528427124
+    },
+    "training_data_load": {
+      "training_data_dir": "data_pipeline\\training_data\\task_20250701_131627",
+      "load_successful": true,
+      "total_records": 288,
+      "data_type_counts": {
+        "sql": 254,
+        "documentation": 17,
+        "ddl": 16,
+        "error_sql": 1
+      },
+      "duration": 17.167370080947876
+    }
+  },
+  "final_outputs": {
+    "primary_output_file": "data_pipeline\\training_data\\task_20250701_131627\\qs_highway_db_20250701_134736_pair.json",
+    "output_directory": "data_pipeline\\training_data\\task_20250701_131627",
+    "final_question_count": 47,
+    "backup_files_created": true
+  },
+  "performance_metrics": {
+    "step1_duration": 422.31,
+    "step2_duration": 607.05,
+    "step3_duration": 236.66,
+    "step4_duration": 17.17,
+    "total_duration": 1283.84
+  }
+}

+ 179 - 0
docs/data_pipeline_api_config_changes.md

@@ -0,0 +1,179 @@
+# Data Pipeline API 配置变更说明
+
+## 变更概述
+
+基于用户需求,Data Pipeline API 进行了重要的配置变更,主要目的是:
+
+1. **简化API调用**:移除 `db_connection` 必填参数
+2. **统一配置管理**:使用 `app_config.py` 中的配置
+3. **明确数据库职责**:任务管理表存储在向量数据库中
+
+## 主要变更内容
+
+### 1. API参数变更
+
+#### 变更前
+```json
+{
+  "db_connection": "postgresql://user:pass@host:5432/dbname",  // 必填
+  "table_list_file": "tables.txt",
+  "business_context": "业务描述"
+}
+```
+
+#### 变更后
+```json
+{
+  "table_list_file": "tables.txt",                            // 必填
+  "business_context": "业务描述",                            // 必填
+  "db_name": "highway_db"                                     // 可选
+}
+```
+
+### 2. 数据库连接配置
+
+#### 业务数据库连接
+- **配置来源**: `app_config.py` 中的 `APP_DB_CONFIG`
+- **用途**: Schema分析和训练数据生成的源数据库
+- **自动构建**: 系统自动构建连接字符串用于 `schema_workflow` 执行
+
+#### 任务管理数据库连接
+- **配置来源**: `app_config.py` 中的 `PGVECTOR_CONFIG`
+- **用途**: 存储任务状态、执行记录、日志等管理信息
+- **表结构**: 4个管理表都创建在向量数据库中
+
+### 3. 代码变更清单
+
+#### 修改的文件:
+
+1. **`data_pipeline/api/simple_db_manager.py`**
+   - 修改 `create_task()` 方法签名
+   - 移除 `db_connection` 必填参数
+   - 添加 `_build_db_connection_string()` 方法
+   - 从 `APP_DB_CONFIG` 自动获取业务数据库配置
+
+2. **`data_pipeline/api/simple_workflow.py`**
+   - 修改 `SimpleWorkflowManager.create_task()` 方法
+   - 更新参数传递逻辑
+
+3. **`citu_app.py`**
+   - 更新 `/api/v0/data_pipeline/tasks` POST 接口
+   - 移除 `db_connection` 参数验证
+   - 添加可选的 `db_name` 参数支持
+
+4. **文档更新**
+   - `docs/data_pipeline_api_usage_guide.md`
+   - `docs/data_pipeline_api_design.md`
+   - 更新API调用示例和参数说明
+
+## 数据库架构
+
+### 双数据库设计
+
+```
+┌─────────────────────┐       ┌─────────────────────┐
+│   业务数据库        │       │   向量数据库        │
+│  (APP_DB_CONFIG)    │       │  (PGVECTOR_CONFIG)  │
+├─────────────────────┤       ├─────────────────────┤
+│ • 业务表数据        │       │ • 任务管理表        │
+│ • Schema信息        │  ───→ │ • 执行记录表        │
+│ • 训练数据源        │       │ • 日志表            │
+│                     │       │ • 文件输出表        │
+└─────────────────────┘       └─────────────────────┘
+      ↑                              ↑
+      │                              │
+ schema_workflow              SimpleTaskManager
+  数据处理执行                    任务状态管理
+```
+
+## 兼容性说明
+
+### API兼容性
+- **破坏性变更**: 是的,移除了 `db_connection` 必填参数
+- **迁移方案**: 
+  1. 更新API调用代码,移除 `db_connection` 参数
+  2. 确保 `app_config.py` 中正确配置了 `APP_DB_CONFIG`
+  3. 可选择性添加 `db_name` 参数指定特定数据库
+
+### 数据库兼容性
+- **表结构**: 无变更,继续使用现有的4个管理表
+- **存储位置**: 确保表创建在向量数据库中
+- **初始化**: 使用 `data_pipeline/sql/init_tables.sql` 在向量数据库中创建
+
+## 配置示例
+
+### app_config.py 示例配置
+
+```python
+# 业务数据库配置(用于数据处理)
+APP_DB_CONFIG = {
+    'host': '192.168.67.1',
+    'port': 6432,
+    'dbname': 'highway_db',
+    'user': 'postgres',
+    'password': 'password'
+}
+
+# 向量数据库配置(用于任务管理)
+PGVECTOR_CONFIG = {
+    'host': '192.168.67.1',
+    'port': 5432,
+    'dbname': 'highway_pgvector_db',
+    'user': 'postgres',
+    'password': 'password'
+}
+```
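+
+下面给出一个根据上述 `APP_DB_CONFIG` 拼接业务数据库连接字符串的示意片段(函数名与细节为说明用途的假设,实际实现以 `simple_db_manager.py` 中的 `_build_db_connection_string()` 为准):
+
+```python
+from urllib.parse import quote_plus
+
+def build_db_connection_string(db_config: dict, db_name: str = None) -> str:
+    """按 APP_DB_CONFIG 的字段拼接 PostgreSQL 连接字符串(示意)"""
+    user = quote_plus(db_config['user'])
+    password = quote_plus(db_config['password'])
+    dbname = db_name or db_config['dbname']  # 可选的 db_name 参数可覆盖默认库名
+    return f"postgresql://{user}:{password}@{db_config['host']}:{db_config['port']}/{dbname}"
+
+# 例如上述配置会得到:postgresql://postgres:password@192.168.67.1:6432/highway_db
+```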
+
+## 测试方法
+
+### 1. 使用新API格式
+```bash
+curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks \
+  -H "Content-Type: application/json" \
+  -d '{
+    "table_list_file": "data_pipeline/tables.txt",
+    "business_context": "高速公路服务区管理系统",
+    "db_name": "highway_db"
+  }'
+```
+
+### 2. 运行测试脚本
+```bash
+python test_api_changes.py
+```
+
+## 注意事项
+
+1. **配置检查**: 确保 `app_config.py` 中的数据库配置正确
+2. **权限验证**: 确保应用有权限访问两个数据库
+3. **表初始化**: 在向量数据库中执行 `init_tables.sql`
+4. **监控日志**: 关注任务创建和执行过程中的日志信息
+
+## 常见问题
+
+### Q: 为什么要移除 db_connection 参数?
+A: 
+- 简化API调用,避免敏感信息在请求中传递
+- 统一配置管理,便于维护
+- 与现有系统架构保持一致
+
+### Q: 如何指定不同的业务数据库?
+A: 
+- 使用可选的 `db_name` 参数
+- 或在 `app_config.py` 中修改 `APP_DB_CONFIG`
+
+### Q: 旧的API调用会怎样?
+A: 
+- 请求中包含的 `db_connection` 参数会被忽略
+- 必须提供 `table_list_file` 和 `business_context`
+- 建议更新到新的API格式
+
+### Q: 任务管理表为什么放在向量数据库中?
+A: 
+- 向量数据库用于存储系统元数据
+- 避免污染业务数据库
+- 便于系统数据的统一管理
+
+## 总结
+
+这次变更使Data Pipeline API更加简洁和易用,同时保持了系统的功能完整性。通过将配置管理集中到 `app_config.py`,提高了系统的可维护性和安全性。

+ 1204 - 0
docs/data_pipeline_api_design.md

@@ -0,0 +1,1204 @@
+# Data Pipeline API 概要设计
+
+## 项目背景
+
+为了让Web UI能够调用Data Pipeline生成训练数据的功能,并实现任务的后台执行、进度追踪和日志查看,我们需要设计一套API系统来支持这些需求。
+
+## 设计目标
+
+1. **后台执行**:支持长时间运行的训练数据生成任务,不阻塞HTTP请求
+2. **进度追踪**:提供实时的任务执行进度和状态查询
+3. **日志管理**:集中管理任务日志,支持详细日志查看
+4. **文件管理**:统一管理生成的训练数据文件
+5. **并发控制**:确保同时只有一个任务在执行
+6. **持久化**:任务状态持久化存储,支持服务重启后的状态恢复
+
+## 核心设计原则
+
+### 1. 任务与API解耦
+- **API服务器**:仅负责任务调度和状态查询
+- **独立脚本**:实际执行数据处理工作,完全独立运行
+- **数据库桥梁**:作为两者之间的通信媒介
+
+### 2. 任务ID即时间戳约定
+- **任务ID生成规则**:`task_YYYYMMDD_HHMMSS` 格式
+  - 示例:`task_20250627_143052` 表示 2025年6月27日 14:30:52 创建的任务
+  - 使用本地时间,确保在同一秒内不会创建多个任务
+  - 任务ID同时作为:
+    - 数据库主键
+    - 文件系统目录名
+    - API查询参数
+- **优势**:
+  - 自然排序,方便查找最新任务
+  - 无需额外的ID生成器
+  - 时间信息直观可见
+
+### 3. 时间戳目录管理
+每个任务在`./data_pipeline/training_data/`下创建独立的时间戳目录:
+```
+./data_pipeline/training_data/
+├── task_20250627_143052/                   # 时间戳作为任务ID
+│   ├── data_pipeline.log                   # 所有data_pipeline模块的统一日志
+│   ├── task_config.json                    # 任务配置参数
+│   ├── task_result.json                    # 最终执行结果
+│   ├── bss_*.ddl                          # 生成的DDL文件
+│   ├── bss_*_detail.md                    # 生成的MD文档
+│   ├── qs_*.json                          # Question-SQL对
+│   ├── metadata.txt                        # 元数据文件
+│   ├── sql_validation_*_summary.log       # SQL验证摘要报告
+│   ├── sql_validation_*_report.json       # SQL验证详细报告(可选)
+│   └── file_modifications_*.log           # 文件修改日志(如果启用修改功能)
+└── task_20250627_150123/
+    └── ...
+```
+
+**目录创建细节**:
+- **创建时机**:在API返回之前创建,确保任务开始执行时目录已存在
+- **创建位置**:相对于项目根目录的`./data_pipeline/training_data/`
+- **权限设置**:确保当前用户和子进程都有读写权限(755)
+- **失败处理**:如果目录创建失败,取消任务创建,返回错误信息
+- **文件组织**:
+  - 所有SchemaWorkflowOrchestrator的输出都重定向到此目录
+  - 日志文件使用独立的FileHandler写入此目录
+  - 配置文件在任务创建时立即写入
+
+### 4. 粗粒度进度追踪
+采用步骤级进度追踪,不追踪表级别的细节:
+- DDL/MD生成:0% → 40%
+- Question-SQL生成:40% → 70%
+- SQL验证:70% → 90%
+- 训练数据加载:90% → 100%
+
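+下面用一小段示意代码表示步骤与进度区间的映射(步骤键名采用本文后续定义的步骤标识,数值即上文的进度区间;字典与函数名为说明用途的假设):
+
+```python
+# 步骤 → (起始进度, 完成进度),与上文的粗粒度约定一致
+STEP_PROGRESS_RANGES = {
+    "ddl_generation": (0, 40),
+    "qa_generation": (40, 70),
+    "sql_validation": (70, 90),
+    "training_load": (90, 100),
+}
+
+def progress_after_step(step: str) -> int:
+    """某步骤成功完成后,任务整体进度取该步骤区间的上界"""
+    return STEP_PROGRESS_RANGES[step][1]
+```
+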
+## 数据库设计
+
+### 任务表 (data_pipeline_tasks)
+```sql
+CREATE TABLE data_pipeline_tasks (
+    id VARCHAR(32) PRIMARY KEY,                    -- 任务ID (时间戳格式)
+    task_type VARCHAR(50) NOT NULL,                -- 任务类型
+    status VARCHAR(20) NOT NULL,                   -- 任务状态: pending/in_progress/partial_completed/completed/failed
+    parameters JSONB NOT NULL,                     -- 任务参数
+    result JSONB,                                  -- 任务结果
+    error_message TEXT,                            -- 错误信息
+    step_status JSONB DEFAULT                      -- 各步骤状态跟踪
+        '{"ddl_generation": "pending", "qa_generation": "pending", "sql_validation": "pending", "training_load": "pending"}',
+    output_directory TEXT,                         -- 任务输出目录
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    started_at TIMESTAMP,
+    completed_at TIMESTAMP,
+    created_by VARCHAR(50),
+    db_name VARCHAR(100),                          -- 数据库名称
+    business_context TEXT                          -- 业务上下文
+);
+```
+
+### 任务执行记录表 (data_pipeline_task_executions)
+```sql
+CREATE TABLE data_pipeline_task_executions (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    execution_step VARCHAR(50) NOT NULL,          -- 'ddl_generation', 'qa_generation', 'sql_validation', 'training_load'
+    status VARCHAR(20) NOT NULL,                  -- 'running', 'completed', 'failed'
+    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    completed_at TIMESTAMP,
+    error_message TEXT,
+    execution_result JSONB,                       -- 步骤执行结果
+    execution_id VARCHAR(100) UNIQUE,             -- {task_id}_step_{step_name}_exec_{timestamp}
+    force_executed BOOLEAN DEFAULT FALSE,         -- 是否强制执行
+    files_cleaned BOOLEAN DEFAULT FALSE           -- 是否清理了旧文件
+);
+```
+
+### 任务日志表 (data_pipeline_task_logs)
+```sql
+CREATE TABLE data_pipeline_task_logs (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    execution_id VARCHAR(100) REFERENCES data_pipeline_task_executions(execution_id),
+    log_level VARCHAR(10) NOT NULL,               -- 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
+    message TEXT NOT NULL,
+    step_name VARCHAR(50),                        -- 执行步骤名称
+    module_name VARCHAR(100),                     -- 模块名称
+    function_name VARCHAR(100),                   -- 函数名称
+    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    extra_data JSONB DEFAULT '{}'                 -- 额外的结构化信息
+);
+```
+
+### 任务文件输出表 (data_pipeline_task_outputs)
+```sql
+CREATE TABLE data_pipeline_task_outputs (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    execution_id VARCHAR(100) REFERENCES data_pipeline_task_executions(execution_id),
+    file_type VARCHAR(50) NOT NULL,               -- 'ddl', 'md', 'json', 'log', 'report'
+    file_name VARCHAR(255) NOT NULL,              -- 文件名
+    file_path TEXT NOT NULL,                      -- 相对路径
+    file_size BIGINT DEFAULT 0,                   -- 文件大小(字节)
+    content_hash VARCHAR(64),                     -- 文件内容hash
+    description TEXT,                             -- 文件描述
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    modified_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    is_primary BOOLEAN DEFAULT FALSE,             -- 是否为主要输出文件
+    is_downloadable BOOLEAN DEFAULT TRUE          -- 是否可下载
+);
+```
+
+### 索引设计
+```sql
+-- 任务表索引
+CREATE INDEX idx_tasks_status ON data_pipeline_tasks(status);
+CREATE INDEX idx_tasks_created_at ON data_pipeline_tasks(created_at DESC);
+CREATE INDEX idx_tasks_db_name ON data_pipeline_tasks(db_name);
+CREATE INDEX idx_tasks_created_by ON data_pipeline_tasks(created_by);
+
+-- 执行记录表索引
+CREATE INDEX idx_executions_task_id ON data_pipeline_task_executions(task_id);
+CREATE INDEX idx_executions_step ON data_pipeline_task_executions(execution_step);
+CREATE INDEX idx_executions_status ON data_pipeline_task_executions(status);
+CREATE INDEX idx_executions_started_at ON data_pipeline_task_executions(started_at DESC);
+
+-- 日志表索引
+CREATE INDEX idx_logs_task_id ON data_pipeline_task_logs(task_id);
+CREATE INDEX idx_logs_execution_id ON data_pipeline_task_logs(execution_id);
+CREATE INDEX idx_logs_timestamp ON data_pipeline_task_logs(timestamp DESC);
+CREATE INDEX idx_logs_level ON data_pipeline_task_logs(log_level);
+CREATE INDEX idx_logs_step ON data_pipeline_task_logs(step_name);
+
+-- 文件输出表索引
+CREATE INDEX idx_outputs_task_id ON data_pipeline_task_outputs(task_id);
+CREATE INDEX idx_outputs_execution_id ON data_pipeline_task_outputs(execution_id);
+CREATE INDEX idx_outputs_file_type ON data_pipeline_task_outputs(file_type);
+CREATE INDEX idx_outputs_primary ON data_pipeline_task_outputs(is_primary) WHERE is_primary = TRUE;
+```
+
+## API设计
+
+**实现位置**:所有API端点都在`citu_app.py`中实现,作为现有Flask应用的扩展。
+
+### 1. 创建任务(不执行)
+```
+POST /api/v0/data_pipeline/tasks
+```
+
+**请求参数**:
+```json
+{
+  "task_type": "data_workflow",
+  "table_list_file": "tables.txt",
+  "business_context": "高速公路服务区管理系统",
+  "db_name": "highway_db",
+  "enable_sql_validation": true,
+  "enable_llm_repair": true,
+  "modify_original_file": true,
+  "enable_training_data_load": true
+}
+```
+
+**注意:** 数据库连接信息自动从 `app_config.py` 获取:
+- 业务数据库连接:使用 `APP_DB_CONFIG`
+- 任务管理表存储:使用 `PGVECTOR_CONFIG`(向量数据库)
+
+**响应**:
+```json
+{
+  "success": true,
+  "message": "任务创建成功",
+  "data": {
+    "task_id": "task_20250627_143052",
+    "status": "pending",
+    "output_directory": "./data_pipeline/training_data/task_20250627_143052",
+    "step_status": {
+      "ddl_generation": "pending",
+      "qa_generation": "pending", 
+      "sql_validation": "pending",
+      "training_load": "pending"
+    },
+    "created_at": "2025-06-27T14:30:52"
+  }
+}
+```
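+
+作为参考,下面是用Python调用该接口的一个示例(服务地址沿用本文其他示例中的 `localhost:8084`,端口以实际部署为准):
+
+```python
+import requests
+
+resp = requests.post(
+    "http://localhost:8084/api/v0/data_pipeline/tasks",
+    json={
+        "task_type": "data_workflow",
+        "table_list_file": "tables.txt",
+        "business_context": "高速公路服务区管理系统",
+        "db_name": "highway_db",
+    },
+    timeout=30,
+)
+task_id = resp.json()["data"]["task_id"]  # 例如 task_20250627_143052
+```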
+
+### 1.1. 执行任务步骤
+```
+POST /api/v0/data_pipeline/tasks/{task_id}/execute
+```
+
+**请求参数**:
+```json
+{
+  "step": "ddl_generation",
+  "force_execute": false,
+  "clean_previous": true
+}
+```
+
+**响应**:
+```json
+{
+  "success": true,
+  "message": "步骤执行已启动",
+  "data": {
+    "execution_id": "task_20250627_143052_step_ddl_generation_exec_20250627143055",
+    "task_id": "task_20250627_143052",
+    "step": "ddl_generation",
+    "status": "running",
+    "started_at": "2025-06-27T14:30:55"
+  }
+}
+```
+
+### 1.2. 创建任务并立即执行完整工作流
+```
+POST /api/v0/data_pipeline/tasks/execute-complete
+```
+
+**请求参数**:
+```json
+{
+  "task_type": "complete_workflow",
+  "table_list_file": "tables.txt",
+  "business_context": "高速公路服务区管理系统",
+  "db_name": "highway_db"
+}
+```
+
+**响应**:
+```json
+{
+  "success": true,
+  "message": "完整工作流执行已启动",
+  "data": {
+    "task_id": "task_20250627_143052",
+    "execution_id": "task_20250627_143052_step_complete_exec_20250627143055",
+    "status": "running",
+    "started_at": "2025-06-27T14:30:55"
+  }
+}
+```
+
+### 2. 获取任务列表
+```
+GET /api/v0/data_pipeline/tasks
+```
+
+**响应**:
+```json
+{
+  "success": true,
+  "data": {
+    "tasks": [
+      {
+        "task_id": "task_20250627_143052",
+        "task_type": "complete_workflow",
+        "status": "running",
+        "progress": 45,
+        "created_at": "2025-06-27T14:30:52"
+      }
+    ]
+  }
+}
+```
+
+### 3. 获取任务详情
+```
+GET /api/v0/data_pipeline/tasks/{task_id}
+```
+
+**响应**:
+```json
+{
+  "success": true,
+  "data": {
+    "task_id": "task_20250627_143052",
+    "task_type": "data_workflow",
+    "status": "in_progress",
+    "parameters": { ... },
+    "step_status": {
+      "ddl_generation": "completed",
+      "qa_generation": "running", 
+      "sql_validation": "pending",
+      "training_load": "pending"
+    },
+    "output_directory": "./data_pipeline/training_data/task_20250627_143052",
+    "created_at": "2025-06-27T14:30:52",
+    "started_at": "2025-06-27T14:30:53",
+    "completed_at": null,
+    "current_execution": {
+      "execution_id": "task_20250627_143052_step_qa_generation_exec_20250627143521",
+      "step": "qa_generation",
+      "status": "running",
+      "started_at": "2025-06-27T14:35:21"
+    }
+  }
+}
+```
+
+### 3.1. 获取任务执行历史
+```
+GET /api/v0/data_pipeline/tasks/{task_id}/executions
+```
+
+**响应**:
+```json
+{
+  "success": true,
+  "data": {
+    "executions": [
+      {
+        "execution_id": "task_20250627_143052_step_ddl_generation_exec_20250627143053",
+        "step": "ddl_generation",
+        "status": "completed",
+        "started_at": "2025-06-27T14:30:53",
+        "completed_at": "2025-06-27T14:35:20",
+        "duration": 267,
+        "force_executed": false,
+        "files_cleaned": true
+      },
+      {
+        "execution_id": "task_20250627_143052_step_qa_generation_exec_20250627143521",
+        "step": "qa_generation",
+        "status": "running",
+        "started_at": "2025-06-27T14:35:21",
+        "completed_at": null,
+        "force_executed": false,
+        "files_cleaned": false
+      }
+    ]
+  }
+}
+```
+
+### 4. 获取当前活跃任务
+```
+GET /api/v0/data_pipeline/tasks/active
+```
+
+**响应**:返回最近的运行中任务,如无则返回最近完成的任务
+
+### 5. 获取任务日志
+```
+GET /api/v0/data_pipeline/tasks/{task_id}/logs?limit=100&level=INFO
+```
+
+**响应**:
+```json
+{
+  "success": true,
+  "data": {
+    "logs": [
+      {
+        "timestamp": "2025-06-27T14:30:53",
+        "level": "INFO",
+        "step_name": "ddl_md_generation",
+        "message": "开始处理表: bss_business_day_data"
+      }
+    ]
+  }
+}
+```
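+
+该接口在数据库侧大致对应如下查询逻辑(示意代码,假设使用 psycopg2;列名与上文 `data_pipeline_task_logs` 表结构一致):
+
+```python
+def query_task_logs(conn, task_id: str, limit: int = 100, level: str = None):
+    """按任务ID查询日志,可选按级别过滤(示意)"""
+    sql = (
+        "SELECT timestamp, log_level, step_name, message "
+        "FROM data_pipeline_task_logs WHERE task_id = %s"
+    )
+    params = [task_id]
+    if level:
+        sql += " AND log_level = %s"
+        params.append(level)
+    sql += " ORDER BY timestamp DESC LIMIT %s"
+    params.append(limit)
+    with conn.cursor() as cur:
+        cur.execute(sql, params)
+        return cur.fetchall()
+```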
+
+### 6. 获取任务输出文件
+```
+GET /api/v0/data_pipeline/tasks/{task_id}/files
+```
+
+**响应**:
+```json
+{
+  "success": true,
+  "data": {
+    "files": [
+      {
+        "file_name": "qs_highway_db_20250627_143052_pair.json",
+        "file_type": "json",
+        "file_size": 102400,
+        "download_url": "/api/v0/data_pipeline/tasks/task_20250627_143052/files/download/qs_highway_db_20250627_143052_pair.json"
+      }
+    ]
+  }
+}
+```
+
+### 7. 下载文件
+```
+GET /api/v0/data_pipeline/tasks/{task_id}/files/download/{filename}
+```
+
+## 任务与执行模型设计
+
+### 1. 核心概念
+
+**任务(Task)**:一个完整的数据处理工作单元,包含4个步骤,有唯一的任务ID和输出目录
+**执行(Execution)**:在某个任务中执行特定步骤的一次操作,支持重复执行和分步执行
+
+### 2. 步骤定义
+
+**步骤标识使用描述性名称**:
+- **ddl_generation**:DDL生成 - 生成DDL文件和MD文档
+- **qa_generation**:Q&A生成 - 生成Question-SQL对
+- **sql_validation**:SQL验证 - 验证和修正SQL语句  
+- **training_load**:训练数据加载 - 加载训练数据到Vanna
+
+### 3. 支持的执行模式
+
+**完整工作流模式**:
+- 一次性执行所有4个步骤:ddl_generation → qa_generation → sql_validation → training_load
+- 传统的端到端执行方式
+
+**分步执行模式**:
+- 在同一个任务中分多次执行不同步骤
+- 支持检查每个步骤的结果后再决定是否执行下一步
+- 支持重复执行同一步骤(比如步骤失败后重新执行)
+- 所有步骤的日志和输出文件都在同一个任务目录中
+
+### 4. 步骤依赖关系
+
+- **ddl_generation**:无依赖,可直接执行
+- **qa_generation**:依赖 ddl_generation 成功完成
+- **sql_validation**:依赖 qa_generation 成功完成
+- **training_load**:依赖 sql_validation 成功完成
+
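+上述依赖关系可以用一个简单的映射表达,执行前检查前置步骤是否已完成;下面是一个示意片段(函数与变量名为假设):
+
+```python
+# 每个步骤的前置步骤(None 表示无依赖),与上文的依赖关系一致
+STEP_DEPENDENCIES = {
+    "ddl_generation": None,
+    "qa_generation": "ddl_generation",
+    "sql_validation": "qa_generation",
+    "training_load": "sql_validation",
+}
+
+def check_step_dependency(step: str, step_status: dict, force_execute: bool = False):
+    """step_status 形如 {"ddl_generation": "completed", ...};force_execute=True 时跳过检查"""
+    prerequisite = STEP_DEPENDENCIES[step]
+    if force_execute or prerequisite is None:
+        return
+    if step_status.get(prerequisite) != "completed":
+        raise ValueError(f"步骤 {step} 依赖 {prerequisite},请先完成该步骤")
+```
+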
+### 5. 文件管理策略
+
+**同一任务目录原则**:
+- 所有步骤的输出都在 `./data_pipeline/training_data/{task_id}/` 目录
+- 重复执行步骤时清理该步骤的旧输出文件
+- 保持其他步骤的文件不受影响
+
+**步骤文件映射**:
+- ddl_generation: `*.ddl`, `*_detail.md`, `metadata.txt`
+- qa_generation: `qs_*.json`, `qs_*.json.backup`
+- sql_validation: `sql_validation_*_summary.log`, `sql_validation_*_report.json`
+- training_load: `training_load_*.log`
+
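+重复执行某一步骤时,可按上述映射只清理该步骤的旧文件;下面是一个示意实现(通配符模式取自上文映射,函数名为假设):
+
+```python
+from pathlib import Path
+
+# 步骤 → 需要清理的文件通配符,与上文的步骤文件映射一致
+STEP_FILE_PATTERNS = {
+    "ddl_generation": ["*.ddl", "*_detail.md", "metadata.txt"],
+    "qa_generation": ["qs_*.json", "qs_*.json.backup"],
+    "sql_validation": ["sql_validation_*_summary.log", "sql_validation_*_report.json"],
+    "training_load": ["training_load_*.log"],
+}
+
+def clean_step_files(task_dir: Path, step: str) -> int:
+    """删除指定步骤的旧输出文件并返回删除数量;其他步骤的文件不受影响"""
+    removed = 0
+    for pattern in STEP_FILE_PATTERNS[step]:
+        for file_path in task_dir.glob(pattern):
+            file_path.unlink()
+            removed += 1
+    return removed
+```
+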
+### 6. 并发控制
+
+**单任务内串行执行**:
+- 同一任务内不允许并发执行多个步骤
+- 全局可以有多个不同任务并发执行
+- 执行前检查是否有正在运行的步骤
+
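+单任务内串行执行的检查可以直接查询 `data_pipeline_task_executions` 表实现,下面是一个示意片段(假设使用 psycopg2 连接,函数名为假设):
+
+```python
+def has_running_execution(conn, task_id: str) -> bool:
+    """检查该任务是否存在状态为 running 的步骤执行记录"""
+    with conn.cursor() as cur:
+        cur.execute(
+            "SELECT COUNT(*) FROM data_pipeline_task_executions "
+            "WHERE task_id = %s AND status = 'running'",
+            (task_id,),
+        )
+        return cur.fetchone()[0] > 0
+```
+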
+## 执行流程设计
+
+### 1. 任务创建流程
+```
+1. 前端发送POST请求创建任务
+2. API生成task_id (格式: task_YYYYMMDD_HHMMSS)
+3. 在数据库中创建任务记录,状态为'pending'
+4. 创建对应的时间戳目录
+5. 初始化步骤状态为全部'pending'
+6. 立即返回task_id给前端
+7. 任务创建完成,等待步骤执行请求
+```
+
+### 2. 步骤执行流程  
+```
+1. 前端发送POST请求执行特定步骤
+2. 检查任务是否存在
+3. 检查步骤依赖关系(除非force_execute=true)
+4. 检查是否有正在运行的步骤(并发控制)
+5. 生成execution_id
+6. 创建执行记录,状态为'running'
+7. 如果clean_previous=true,清理该步骤的旧输出文件
+8. 启动独立任务执行器进程:按执行模式构造命令行参数后调用subprocess.Popen
+     cmd = [sys.executable,
+            './data_pipeline/task_executor.py',
+            '--task-id', task_id,
+            '--execution-mode', execution_mode]   # 'complete' 或 'step'
+     if execution_mode == 'step':
+         cmd.extend(['--step-name', step_name])   # 仅单步执行时追加
+     subprocess.Popen(cmd,
+                      stdout=subprocess.PIPE,
+                      stderr=subprocess.PIPE,
+                      text=True,
+                      cwd=project_root)
+9. 立即返回execution_id给前端
+10. API请求结束,task_executor.py脚本继续后台运行
+```
+
+**详细实现步骤**:
+
+#### 2.1 任务ID生成
+```python
+from datetime import datetime
+task_id = f"task_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+```
+
+#### 2.2 并发检查
+```sql
+SELECT COUNT(*) FROM data_pipeline_tasks WHERE status = 'running';
+-- 如果结果 > 0,返回错误:"已有任务正在执行,请稍后再试"
+```
+
+#### 2.3 任务记录创建
+```sql
+INSERT INTO data_pipeline_tasks (id, task_type, status, parameters, created_by)
+VALUES (?, ?, 'pending', ?::jsonb, ?);
+```
+
+#### 2.4 目录创建
+```python
+task_dir = os.path.join('./data_pipeline/training_data/', task_id)
+os.makedirs(task_dir, mode=0o755, exist_ok=False)  # exist_ok=False 确保目录唯一
+```
+
+#### 2.5 配置文件写入
+```python
+config_path = os.path.join(task_dir, 'task_config.json')
+with open(config_path, 'w', encoding='utf-8') as f:
+    json.dump({
+        'task_id': task_id,
+        'task_type': task_type,
+        'parameters': parameters,
+        'created_at': datetime.now().isoformat()
+    }, f, indent=2, ensure_ascii=False)
+```
+
+#### 2.6 启动后台进程
+```python
+# 使用subprocess.Popen启动独立任务执行器进程
+# 仅在单步执行时追加 --step-name,避免向 argv 传入 None
+cmd = [
+    sys.executable,
+    './data_pipeline/task_executor.py',
+    '--task-id', task_id,
+    '--execution-mode', execution_mode,  # 'complete' 或 'step'
+]
+if execution_mode == 'step':
+    cmd.extend(['--step-name', step_name])
+
+process = subprocess.Popen(
+    cmd,
+    stdout=subprocess.PIPE,
+    stderr=subprocess.PIPE,
+    text=True,
+    cwd=project_root  # 项目根目录
+)
+```
+
+### 3. 后台执行流程
+```
+1. task_executor.py启动,接收task_id和执行模式参数
+2. 初始化日志系统,创建SimpleWorkflowExecutor实例
+3. 确保任务目录存在,设置任务目录日志记录器
+4. 更新数据库状态为'running',started_at时间戳
+5. 创建SchemaWorkflowOrchestrator并重定向其日志到任务目录
+6. 执行工作流(完整或单步),记录详细日志到data_pipeline.log
+7. 生成的文件都保存在对应的时间戳目录
+8. 完成后更新数据库状态为'completed'或'failed'
+9. 清理资源,脚本退出
+```
+
+**任务执行架构** (基于独立的task_executor.py):
+
+#### 3.1 任务执行器参数
+```python
+# data_pipeline/task_executor.py 命令行参数
+parser.add_argument('--task-id', required=True, help='任务ID')
+parser.add_argument('--execution-mode', default='complete', 
+                   choices=['complete', 'step'], help='执行模式')
+parser.add_argument('--step-name', help='步骤名称(当execution-mode=step时必需)')
+```
+
+#### 3.2 任务执行主函数
+```python
+async def execute_task(task_id: str, execution_mode: str, step_name: str = None):
+    """执行任务的异步函数"""
+    executor = None
+    try:
+        # 创建SimpleWorkflowExecutor实例
+        executor = SimpleWorkflowExecutor(task_id)
+        
+        if execution_mode == "complete":
+            # 执行完整工作流
+            return await executor.execute_complete_workflow()
+        elif execution_mode == "step":
+            # 执行单个步骤
+            return await executor.execute_single_step(step_name)
+        else:
+            raise ValueError(f"不支持的执行模式: {execution_mode}")
+            
+    finally:
+        if executor:
+            executor.cleanup()
+```
+
+#### 3.3 SimpleWorkflowExecutor核心功能
+```python
+class SimpleWorkflowExecutor:
+    def __init__(self, task_id: str):
+        self.task_id = task_id
+        self.task_manager = SimpleTaskManager()  # 数据库管理
+        self.file_manager = SimpleFileManager()  # 文件管理
+        self.task_dir_logger = None              # 任务目录日志记录器
+        self._load_task_info()                   # 加载任务信息
+    
+    def _setup_task_directory_logger(self):
+        """设置任务目录日志记录器"""
+        task_dir = self.file_manager.get_task_directory(self.task_id)
+        log_file = task_dir / "data_pipeline.log"
+        
+        # 创建专门的任务目录日志记录器
+        self.task_dir_logger = logging.getLogger(f"TaskDir_{self.task_id}")
+        self.task_dir_logger.setLevel(logging.DEBUG)
+        self.task_dir_logger.handlers.clear()
+        self.task_dir_logger.propagate = False
+        
+        # 创建文件处理器
+        file_handler = logging.FileHandler(log_file, encoding='utf-8')
+        formatter = logging.Formatter(
+            '%(asctime)s [%(levelname)s] %(name)s: %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S'
+        )
+        file_handler.setFormatter(formatter)
+        self.task_dir_logger.addHandler(file_handler)
+    
+    def _redirect_orchestrator_logs(self, orchestrator):
+        """重定向SchemaWorkflowOrchestrator的日志到任务目录"""
+        if self.task_dir_logger and hasattr(orchestrator, 'logger'):
+            for handler in self.task_dir_logger.handlers:
+                if isinstance(handler, logging.FileHandler):
+                    orchestrator.logger.addHandler(handler)
+                    break
+```
+
+#### 3.4 双日志系统设计
+
+##### 日志文件位置
+- **任务目录日志**:`./data_pipeline/training_data/{task_id}/data_pipeline.log` - 详细执行日志
+- **数据库日志**:存储在 `data_pipeline_task_logs` 表 - 结构化查询和展示
+- **系统日志**:`./logs/` 目录保留系统级日志(app.log、agent.log、vanna.log)
+
+##### 日志记录机制
+1. **任务目录日志记录器**:
+   - 每个任务创建独立的 `TaskDir_{task_id}` 日志记录器
+   - 直接写入任务目录的 `data_pipeline.log` 文件
+   - 捕获所有详细的执行过程信息
+
+2. **数据库日志记录器**:
+   - 通过 `SimpleTaskManager.record_log()` 记录关键事件
+   - 支持按级别、步骤、时间等条件查询
+   - 用于API返回和Web UI展示
+
+3. **SchemaWorkflowOrchestrator日志重定向**:
+   - 将orchestrator的日志同时输出到任务目录文件
+   - 确保所有子模块的日志都集中记录
+   - 保持现有日志系统不变的同时增强功能
+
+##### 日志内容示例
+```
+# 任务目录日志文件内容示例
+2025-07-01 14:30:52 [INFO] TaskDir_task_20250701_143052: 任务目录日志初始化完成 - 任务ID: task_20250701_143052
+2025-07-01 14:30:52 [INFO] TaskDir_task_20250701_143052: 任务参数: {"db_connection": "...", "business_context": "..."}
+2025-07-01 14:30:53 [INFO] TaskDir_task_20250701_143052: [complete] 开始执行步骤: complete
+2025-07-01 14:30:53 [INFO] DataPipelineOrchestrator: 开始执行完整工作流
+2025-07-01 14:30:54 [INFO] DDLMDGenerator: 开始处理表: bss_business_day_data
+```
+
+#### 3.5 执行示例
+
+```bash
+# 1. API调用(完整工作流)
+python data_pipeline/task_executor.py \
+    --task-id "task_20250627_143052" \
+    --execution-mode complete
+
+# 2. API调用(单步执行DDL生成)
+python data_pipeline/task_executor.py \
+    --task-id "task_20250627_143052" \
+    --execution-mode step \
+    --step-name ddl_generation
+
+# 3. API调用(单步执行Q&A生成)
+python data_pipeline/task_executor.py \
+    --task-id "task_20250627_143052" \
+    --execution-mode step \
+    --step-name qa_generation
+
+# 4. API调用(单步执行SQL验证)
+python data_pipeline/task_executor.py \
+    --task-id "task_20250627_143052" \
+    --execution-mode step \
+    --step-name sql_validation
+
+# 5. API调用(单步执行训练数据加载)
+python data_pipeline/task_executor.py \
+    --task-id "task_20250627_143052" \
+    --execution-mode step \
+    --step-name training_load
+```
+
+### 4. 分步执行使用流程
+
+#### 场景1:分步执行,检查每步结果
+```bash
+# 1. 创建任务
+curl -X POST /api/v0/data_pipeline/tasks \
+  -d '{"task_type": "data_workflow", "parameters": {...}}'
+# 返回: {"task_id": "task_20250627_143052"}
+
+# 2. 执行DDL生成
+curl -X POST /api/v0/data_pipeline/tasks/task_20250627_143052/execute \
+  -d '{"step": "ddl_generation"}'
+# 等待完成,检查结果
+
+# 3. 检查DDL生成结果满意后,执行Q&A生成
+curl -X POST /api/v0/data_pipeline/tasks/task_20250627_143052/execute \
+  -d '{"step": "qa_generation"}'
+
+# 4. 如果Q&A结果不满意,重新执行
+curl -X POST /api/v0/data_pipeline/tasks/task_20250627_143052/execute \
+  -d '{"step": "qa_generation", "clean_previous": true}'
+
+# 5. 继续后续步骤
+curl -X POST /api/v0/data_pipeline/tasks/task_20250627_143052/execute \
+  -d '{"step": "sql_validation"}'
+
+curl -X POST /api/v0/data_pipeline/tasks/task_20250627_143052/execute \
+  -d '{"step": "training_load"}'
+```
+
+#### 场景2:一次性执行完整工作流
+```bash
+# 创建任务并立即执行完整工作流
+curl -X POST /api/v0/data_pipeline/tasks/execute-complete \
+  -d '{"task_type": "complete_workflow", "parameters": {...}}'
+```
+
+### 5. 前端轮询实现
+```javascript
+// 分步执行时的轮询
+async function pollExecutionStatus(taskId, executionId) {
+    const pollInterval = setInterval(async () => {
+        const response = await fetch(`/api/v0/data_pipeline/tasks/${taskId}/executions`);
+        const data = await response.json();
+        
+        const currentExecution = data.data.executions.find(e => e.execution_id === executionId);
+        
+        // 更新UI
+        updateStepStatus(currentExecution.step, currentExecution.status);
+        
+        // 检查是否完成
+        if (currentExecution.status === 'completed' || currentExecution.status === 'failed') {
+            clearInterval(pollInterval);
+            handleStepComplete(currentExecution);
+        }
+    }, 5000);
+}
+
+// 任务整体状态轮询
+async function pollTaskStatus(taskId) {
+    const pollInterval = setInterval(async () => {
+        const response = await fetch(`/api/v0/data_pipeline/tasks/${taskId}`);
+        const data = await response.json();
+        
+        // 更新各步骤状态
+        updateAllStepsStatus(data.data.step_status);
+        
+        // 更新当前执行信息
+        if (data.data.current_execution) {
+            updateCurrentExecution(data.data.current_execution);
+        }
+        
+        // 检查任务是否全部完成
+        if (data.data.status === 'completed' || data.data.status === 'failed') {
+            clearInterval(pollInterval);
+            handleTaskComplete(data.data);
+        }
+    }, 5000);
+}
+```
+
+## 任务配置文件格式
+
+### task_config.json 示例
+```json
+{
+  "task_id": "task_20250627_143052",
+  "task_type": "complete_workflow",
+  "created_at": "2025-06-27T14:30:52",
+  "parameters": {
+    "db_connection": {
+      "host": "localhost",
+      "port": 5432,
+      "database": "highway_db",
+      "user": "postgres",
+      "password": "******"
+    },
+    "table_list": ["bss_business_day_data", "bss_car_day_count", ...],
+    "business_context": "高速公路服务区管理系统",
+    "output_dir": "./data_pipeline/training_data/task_20250627_143052",
+    "execution_mode": "complete",
+    "single_step": null,
+    "llm_config": {
+      "model": "qianwen",
+      "temperature": 0.7
+    }
+  }
+}
+```
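+
+task_executor.py 启动时读取该配置文件的方式大致如下(示意代码,目录约定见前文):
+
+```python
+import json
+from pathlib import Path
+
+def load_task_config(task_id: str) -> dict:
+    """读取任务目录下的 task_config.json"""
+    config_path = Path("./data_pipeline/training_data") / task_id / "task_config.json"
+    with open(config_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+```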
+
+## 错误处理机制
+
+### 1. API层错误处理
+```python
+try:
+    task_id = create_task(request_data)
+    return {"success": True, "task_id": task_id}
+except ConcurrentTaskError:
+    return {"success": False, "error": "已有任务正在执行"}, 409
+except Exception as e:
+    logger.error(f"任务创建失败: {str(e)}")
+    return {"success": False, "error": "任务创建失败"}, 500
+```
+
+### 2. 执行流程中的错误处理
+```python
+try:
+    # 执行任务
+    report = await orchestrator.execute_complete_workflow()
+    if self.db_logger:
+        self.db_logger.update_status('completed')
+except Exception as e:
+    # 记录错误到日志和数据库
+    self.logger.error(f"任务执行失败: {str(e)}", exc_info=True)
+    if self.db_logger:
+        self.db_logger.log('ERROR', str(e))
+        self.db_logger.update_status('failed', error_message=str(e))
+    raise
+```
+
+### 3. 僵尸任务检测
+```python
+# 在API启动时检查
+def check_zombie_tasks():
+    # 查找超过2小时仍在运行的任务
+    query = """
+    UPDATE data_pipeline_tasks 
+    SET status = 'failed', 
+        error_message = '任务超时,可能已停止运行'
+    WHERE status = 'running' 
+    AND started_at < NOW() - INTERVAL '2 hours'
+    """
+```
+
+## 并发控制策略
+
+### 单任务执行原则
+- 同时只允许一个任务处于'running'状态
+- 新任务提交时检查数据库,如有运行中任务则拒绝
+- 前端显示当前运行任务信息,提示用户等待
+
+### 任务锁实现
+```python
+# 使用数据库事务确保原子性
+def acquire_task_lock(task_id):
+    with db.transaction():
+        # 检查是否有运行中的任务
+        running_count = db.query(
+            "SELECT COUNT(*) FROM data_pipeline_tasks WHERE status = 'running'"
+        ).scalar()
+        
+        if running_count > 0:
+            raise ConcurrentTaskError("已有任务正在执行")
+            
+        # 获取锁:更新状态为running
+        db.execute(
+            "UPDATE data_pipeline_tasks SET status = 'running', started_at = NOW() WHERE id = %s",
+            [task_id]
+        )
+```
+
+## Web UI模块设计
+
+### 1. 任务管理页面
+- **任务创建表单**:配置任务参数并提交
+- **任务列表**:显示历史任务和状态
+- **任务筛选**:按状态、时间等筛选任务
+
+### 2. 任务详情页面
+- **实时进度条**:显示当前执行进度
+- **步骤状态**:各步骤的执行状态和耗时
+- **实时日志**:滚动显示任务日志
+- **文件管理**:列出生成的文件并提供下载
+
+### 3. 日志查看器
+- **日志级别筛选**:INFO/WARNING/ERROR
+- **关键词搜索**:在日志中搜索特定内容
+- **自动滚动**:新日志自动滚动到底部
+- **日志导出**:下载完整日志文件
+
+### 4. 文件管理器
+- **文件列表**:显示所有生成的文件
+- **批量下载**:打包下载所有文件
+- **文件预览**:在线查看文本文件内容
+- **文件统计**:显示文件大小和生成时间
+
+## 技术实现要点
+
+### 1. 数据库连接管理
+- 复用现有的PostgreSQL连接配置
+- 在独立脚本中建立独立的数据库连接
+- 确保连接池的正确释放
+
+### 2. 日志系统集成
+- 复用现有的core.logging系统
+- 在SchemaWorkflowOrchestrator中添加数据库日志写入
+- 保持原有的文件日志不变
+
+### 3. 文件路径管理
+- 统一使用绝对路径避免路径混乱
+- 确保时间戳目录的正确创建和权限
+- 提供文件清理机制避免磁盘空间耗尽
+
+### 4. 错误处理
+- 完善的异常捕获和错误信息记录
+- 优雅的错误恢复机制
+- 清晰的错误信息展示给用户
+
+## SchemaWorkflowOrchestrator集成细节
+
+### 1. 主要修改点
+
+由于直接调用schema_workflow.py,不需要额外的worker.py,主要修改集中在:
+
+1. **命令行参数扩展**:添加`--task-id`和`--no-db-tracking`参数
+2. **数据库记录器集成**:在SchemaWorkflowOrchestrator中集成进度记录功能
+3. **各步骤进度更新**:在现有的执行步骤中添加进度更新调用
+
+### 2. 进度更新实现
+
+在每个执行步骤方法中添加进度更新:
+
+```python
+# _execute_step_1_ddl_md_generation
+if self.db_logger:
+    self.db_logger.update_progress(10, 'ddl_md_generation')
+    self.db_logger.log('INFO', 'DDL/MD生成开始', 'ddl_md_generation')
+    # ... 执行实际工作
+    self.db_logger.update_progress(40, 'ddl_md_generation')
+    
+# _execute_step_2_question_sql_generation  
+if self.db_logger:
+    self.db_logger.update_progress(40, 'question_sql_generation')
+    # ... 执行实际工作
+    self.db_logger.update_progress(70, 'question_sql_generation')
+    
+# _execute_step_3_sql_validation
+if self.db_logger:
+    self.db_logger.update_progress(70, 'sql_validation')
+    # ... 执行实际工作
+    self.db_logger.update_progress(90, 'sql_validation')
+    
+# _execute_step_4_training_data_load
+if self.db_logger:
+    self.db_logger.update_progress(90, 'training_data_load')
+    # ... 执行实际工作
+    self.db_logger.update_progress(100, 'training_data_load')
+```
+
+### 3. 任务状态管理
+
+在主执行流程中管理任务状态:
+
+```python
+async def execute_complete_workflow(self):
+    # 开始时更新状态
+    if self.db_logger:
+        self.db_logger.update_status('running')
+    
+    try:
+        # 执行各步骤...
+        report = await self._generate_final_report()
+        
+        # 成功完成
+        if self.db_logger:
+            self.db_logger.update_status('completed')
+            
+    except Exception as e:
+        # 失败处理
+        if self.db_logger:
+            self.db_logger.update_status('failed', str(e))
+        raise
+```
+
+### 4. 输出目录管理
+
+当通过API调用时,output_dir会被设置为任务特定的时间戳目录,确保所有输出文件都集中存储。
+
+## API安全性考虑
+
+### 1. 认证和授权
+- 使用现有的API认证机制(如JWT)
+- 检查用户权限,确保有执行数据生成的权限
+- 记录操作者信息到created_by字段
+
+### 2. 输入验证
+```python
+def validate_task_request(request_data):
+    # 验证必填字段(数据库连接信息由 app_config.py 提供,不在请求中传递)
+    required_fields = ['table_list_file', 'business_context']
+    for field in required_fields:
+        if not request_data.get(field):
+            raise ValueError(f"缺少必填字段: {field}")
+
+    # 可选参数校验
+    db_name = request_data.get('db_name')
+    if db_name is not None and not isinstance(db_name, str):
+        raise ValueError("db_name 必须是字符串")
+```
+
+### 3. 路径安全
+- 禁止路径遍历攻击
+- 确保所有文件操作都在指定的任务目录内
+- 使用os.path.normpath和验证路径前缀
+
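+下面是一个路径校验的示意实现(文件下载接口可复用,目录约定见前文;函数名为假设):
+
+```python
+from pathlib import Path
+
+TRAINING_DATA_ROOT = Path("./data_pipeline/training_data").resolve()
+
+def resolve_safe_download_path(task_id: str, filename: str) -> Path:
+    """将下载请求解析为任务目录内的真实路径,防止路径遍历攻击"""
+    task_dir = (TRAINING_DATA_ROOT / task_id).resolve()
+    target = (task_dir / filename).resolve()
+    try:
+        target.relative_to(task_dir)  # 不在任务目录内时抛出 ValueError
+    except ValueError:
+        raise ValueError(f"非法的文件路径: {filename}")
+    return target
+```
+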
+## 性能优化建议
+
+### 1. 数据库查询优化
+- 使用批量插入日志,而非逐条插入
+- 建立适当的索引加速查询
+- 定期清理旧日志数据
+
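+批量插入日志的一个示意写法(假设使用 psycopg2 的 `execute_values`,表结构见上文 `data_pipeline_task_logs`):
+
+```python
+from psycopg2.extras import execute_values
+
+def batch_insert_logs(conn, task_id: str, records: list) -> None:
+    """records 为 (log_level, step_name, message) 元组列表,一次批量写入而非逐条INSERT"""
+    rows = [(task_id, level, step, msg) for level, step, msg in records]
+    with conn.cursor() as cur:
+        execute_values(
+            cur,
+            "INSERT INTO data_pipeline_task_logs (task_id, log_level, step_name, message) VALUES %s",
+            rows,
+        )
+    conn.commit()
+```
+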
+### 2. 文件处理优化
+- 大文件使用流式读写
+- 压缩旧任务的输出文件
+- 实现文件分片下载
+
+### 3. 内存管理
+- 在worker中及时释放大对象
+- 使用生成器处理大数据集
+- 监控内存使用情况
+
+## 任务清理策略
+
+### 1. 自动清理
+```python
+# 定期任务清理旧数据
+def cleanup_old_tasks():
+    # 清理30天前的任务
+    cutoff_date = datetime.now() - timedelta(days=30)
+    
+    # 查询要清理的任务
+    old_tasks = db.query("""
+        SELECT id FROM data_pipeline_tasks 
+        WHERE created_at < %s AND status IN ('completed', 'failed')
+    """, [cutoff_date])
+    
+    for task in old_tasks:
+        # 删除文件目录
+        task_dir = os.path.join('./data_pipeline/training_data/', task.id)
+        if os.path.exists(task_dir):
+            shutil.rmtree(task_dir)
+            
+        # 删除数据库记录
+        db.execute("DELETE FROM data_pipeline_tasks WHERE id = %s", [task.id])
+```
+
+### 2. 手动清理API
+```
+DELETE /api/v0/data_pipeline/tasks/{task_id}
+```
+
+## 监控指标
+
+### 1. 任务指标
+- 任务执行时间统计
+- 任务成功率
+- 各步骤平均耗时
+
+### 2. 系统指标
+- CPU和内存使用率
+- 磁盘空间占用
+- 数据库连接池状态
+
+### 3. 告警规则
+- 任务执行超时告警
+- 磁盘空间不足告警
+- 连续失败任务告警
+
+## 部署和运维
+
+### 1. 依赖要求
+- 现有的Data Pipeline依赖不变
+- 确保subprocess能够正确启动Python脚本
+- 数据库表的创建和权限配置
+- Windows系统需要注意Python路径和脚本执行权限
+
+### 2. 初始化脚本
+```sql
+-- 创建必要的数据库表
+CREATE TABLE IF NOT EXISTS data_pipeline_tasks (...);
+CREATE TABLE IF NOT EXISTS data_pipeline_task_logs (...);
+CREATE TABLE IF NOT EXISTS data_pipeline_task_outputs (...);
+
+-- 创建索引
+CREATE INDEX IF NOT EXISTS idx_tasks_status ON data_pipeline_tasks(status);
+CREATE INDEX IF NOT EXISTS idx_tasks_created_at ON data_pipeline_tasks(created_at);
+CREATE INDEX IF NOT EXISTS idx_logs_task_id ON data_pipeline_task_logs(task_id);
+
+-- 创建清理函数
+CREATE OR REPLACE FUNCTION cleanup_old_tasks()...
+```
+
+### 3. 运维检查清单
+- [ ] 确保training_data目录有足够的磁盘空间
+- [ ] 定期检查是否有僵尸任务
+- [ ] 监控任务执行时间趋势
+- [ ] 备份重要的训练数据
+- [ ] 定期执行任务清理
+
+### 4. 故障排查指南
+1. **任务卡住**:检查数据库中任务状态(可参考本节末尾的查询示例),查看任务目录下的日志文件
+2. **任务失败**:
+   - 查看数据库中的 error_message 字段
+   - 在 data_pipeline.log 中搜索 [ERROR] 级别日志
+   - 检查数据库连接和LLM服务状态
+3. **磁盘满**:执行清理脚本,调整保留策略
+4. **性能下降**:检查数据库索引,清理历史日志
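+
+针对第1类"任务卡住"问题,可以用类似下面的查询快速定位长时间处于运行状态的任务(示意代码,2小时阈值为假设,字段名以上文表设计为准):
+
+```python
+# 示例:查找疑似卡住的任务(示意代码)
+import psycopg2
+
+def find_stuck_tasks(dsn: str, timeout_hours: int = 2):
+    sql = """
+        SELECT id, status, started_at
+        FROM data_pipeline_tasks
+        WHERE status = 'in_progress'
+          AND started_at < NOW() - INTERVAL '1 hour' * %s
+        ORDER BY started_at
+    """
+    with psycopg2.connect(dsn) as conn:
+        with conn.cursor() as cur:
+            cur.execute(sql, (timeout_hours,))
+            return cur.fetchall()
+```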
+
+## 总结
+
+本设计采用了任务与API解耦的架构,通过数据库作为通信桥梁,实现了长时间任务的后台执行和实时进度追踪。设计简洁实用,充分复用了现有的代码和基础设施,能够满足Web UI调用Data Pipeline的各种需求。
+
+本概要设计文档详细描述了Data Pipeline API的完整实现方案:
+
+1. **核心设计特点**:
+   - 任务ID即时间戳的简洁设计,无需额外的ID生成器
+   - API与执行脚本完全解耦,支持服务重启后任务继续执行
+   - 基于数据库的状态管理和进度追踪,替代复杂的消息队列
+   - 时间戳目录的统一文件管理,所有输出集中存储
+
+2. **技术实现亮点**:
+   - 使用subprocess实现真正的后台执行,不阻塞HTTP请求
+   - 粗粒度进度追踪(步骤级),避免过度复杂
+   - 完善的错误处理和恢复机制,包括僵尸任务检测
+   - 单任务执行保证系统稳定性,避免资源竞争
+
+3. **实用性考虑**:
+   - 充分复用现有的SchemaWorkflowOrchestrator代码
+   - 支持服务重启后的状态恢复,任务不会丢失
+   - 提供完整的文件管理和下载功能
+   - 包含监控、清理和运维策略,便于长期维护
+
+4. **Web UI友好设计**:
+   - 清晰的RESTful API设计,易于前端集成
+   - 实时进度查询,支持轮询机制
+   - 完整的日志查看和文件下载功能
+   - 直观的任务状态展示
+
+5. **关键实现变更**:
+   - 直接调用schema_workflow.py,无需额外的worker.py
+   - 手工执行时自动生成manual_前缀的task_id
+   - 支持--no-db-tracking参数禁用数据库追踪
+   - 只需修改schema_workflow.py一个文件即可实现所有功能
+   - 使用环境变量方案统一管理data_pipeline模块的日志路径
+   - 所有任务日志都写入各自的任务目录,不再使用./logs/data_pipeline.log
+   - 禁用日志轮转(rotation),因为每个任务的日志是独立的
+
+本方案在保持简单实用的同时,提供了完整的功能支持,能够很好地满足Data Pipeline Web UI集成的需求。

+ 1136 - 0
docs/data_pipeline_api_detailed_design.md

@@ -0,0 +1,1136 @@
+# Data Pipeline API 详细设计文档
+
+## 项目概述
+
+本文档是基于概要设计文档和现有代码结构,对Data Pipeline API系统的详细技术实现设计。该系统将为Web UI提供完整的数据管道调度、执行监控和日志管理功能。
+
+## 核心需求分析
+
+### 1. 业务需求
+- **API调度执行**:通过REST API调度执行 `./data_pipeline/schema_workflow.py`
+- **执行监控**:实时查看任务执行状态和进度
+- **日志集中管理**:所有日志写入任务特定的子目录
+- **步骤控制**:支持通过参数控制执行特定步骤
+- **数据库日志记录**:关键步骤信息写入PostgreSQL数据库
+
+### 2. 技术约束
+- 复用现有的 `SchemaWorkflowOrchestrator` 架构
+- 集成现有的日志系统 (`core.logging`)
+- 使用现有的Flask应用 (`citu_app.py`) 作为API承载
+- 保持与现有数据库配置的兼容性
+
+## 系统架构设计
+
+### 1. 整体架构
+
+```
+┌─────────────────────┐    ┌─────────────────────┐    ┌─────────────────────┐
+│   Web Frontend      │    │   Flask API         │    │  Schema Workflow    │
+│                     │ ─→ │   (citu_app.py)     │ ─→ │  (subprocess)       │
+│ - 任务创建表单      │    │ - 任务调度          │    │ - DDL生成           │
+│ - 进度监控界面      │    │ - 状态查询          │    │ - Q&A生成           │
+│ - 日志查看器        │    │ - 日志API           │    │ - SQL验证           │
+│ - 文件管理器        │    │ - 文件管理          │    │ - 训练数据加载      │
+└─────────────────────┘    └─────────────────────┘    └─────────────────────┘
+                                    │                           │
+                                    ▼                           ▼
+                           ┌─────────────────────┐    ┌─────────────────────┐
+                           │  PostgreSQL DB      │    │  File System        │
+                           │ - 任务状态表        │    │ - 任务目录          │
+                           │ - 日志记录表        │    │ - 输出文件          │
+                           │ - 文件输出表        │    │ - 日志文件          │
+                           └─────────────────────┘    └─────────────────────┘
+```
+
+### 2. 进程分离设计
+
+```
+HTTP Request ──┐
+               │
+               ▼
+        ┌─────────────┐    subprocess.Popen    ┌──────────────────┐
+        │ Flask API   │ ──────────────────────→ │ task_executor.py │
+        │ Process     │                        │ Process          │
+        │             │    Database Bridge     │                  │
+        │ - 任务调度  │ ←─────────────────────→ │ - SimpleWorkflow │
+        │ - 状态查询  │                        │ - 进度更新       │
+        │ - 文件管理  │                        │ - 双日志记录     │
+        └─────────────┘                        └──────────────────┘
+               │                                        │
+               ▼                                        ▼
+        立即返回task_id                     独立执行工作流+日志到任务目录
+```
+
+## 数据库设计详细说明
+
+### 1. 表结构设计
+
+#### 任务主表 (data_pipeline_tasks)
+```sql
+CREATE TABLE data_pipeline_tasks (
+    -- 主键:时间戳格式的任务ID
+    id VARCHAR(32) PRIMARY KEY,                    -- 'task_20250627_143052'
+    
+    -- 任务基本信息
+    task_type VARCHAR(50) NOT NULL DEFAULT 'data_workflow',
+    status VARCHAR(20) NOT NULL DEFAULT 'pending', -- pending/in_progress/partial_completed/completed/failed
+    
+    -- 配置和结果(JSON格式)
+    parameters JSONB NOT NULL,                     -- 任务配置参数
+    result JSONB,                                  -- 最终执行结果
+    
+    -- 错误处理
+    error_message TEXT,                            -- 错误详细信息
+    
+    -- 步骤状态跟踪
+    step_status JSONB DEFAULT '{                   -- 各步骤状态
+        "ddl_generation": "pending",
+        "qa_generation": "pending", 
+        "sql_validation": "pending",
+        "training_load": "pending"
+    }',
+    
+    -- 时间戳
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    started_at TIMESTAMP,
+    completed_at TIMESTAMP,
+    
+    -- 创建者信息
+    created_by VARCHAR(50) DEFAULT 'api',          -- 'api', 'manual', 'system'
+    
+    -- 输出目录
+    output_directory TEXT,                         -- 任务输出目录路径
+    
+    -- 索引字段
+    db_name VARCHAR(100),                          -- 数据库名称(便于筛选)
+    business_context TEXT                          -- 业务上下文(便于搜索)
+);
+
+-- 创建索引
+CREATE INDEX idx_tasks_status ON data_pipeline_tasks(status);
+CREATE INDEX idx_tasks_created_at ON data_pipeline_tasks(created_at DESC);
+CREATE INDEX idx_tasks_db_name ON data_pipeline_tasks(db_name);
+CREATE INDEX idx_tasks_created_by ON data_pipeline_tasks(created_by);
+```
+
+#### 任务执行记录表 (data_pipeline_task_executions)
+```sql
+CREATE TABLE data_pipeline_task_executions (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    execution_step VARCHAR(50) NOT NULL,          -- 'ddl_generation', 'qa_generation', 'sql_validation', 'training_load', 'complete'
+    status VARCHAR(20) NOT NULL,                  -- 'running', 'completed', 'failed'
+    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    completed_at TIMESTAMP,
+    error_message TEXT,
+    execution_result JSONB,                       -- 步骤执行结果
+    execution_id VARCHAR(100) UNIQUE,             -- {task_id}_step_{step_name}_exec_{timestamp}
+    force_executed BOOLEAN DEFAULT FALSE,         -- 是否强制执行
+    files_cleaned BOOLEAN DEFAULT FALSE,          -- 是否清理了旧文件
+    duration_seconds INTEGER                      -- 执行时长(秒)
+);
+
+-- 创建索引
+CREATE INDEX idx_executions_task_id ON data_pipeline_task_executions(task_id);
+CREATE INDEX idx_executions_step ON data_pipeline_task_executions(execution_step);
+CREATE INDEX idx_executions_status ON data_pipeline_task_executions(status);
+CREATE INDEX idx_executions_started_at ON data_pipeline_task_executions(started_at DESC);
+```
+
+#### 任务日志表 (data_pipeline_task_logs)
+```sql
+CREATE TABLE data_pipeline_task_logs (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    execution_id VARCHAR(100) REFERENCES data_pipeline_task_executions(execution_id),
+    
+    -- 日志内容
+    log_level VARCHAR(10) NOT NULL,               -- 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
+    message TEXT NOT NULL,                        -- 日志消息内容
+    
+    -- 上下文信息
+    step_name VARCHAR(50),                        -- 执行步骤名称
+    module_name VARCHAR(100),                     -- 模块名称
+    function_name VARCHAR(100),                   -- 函数名称
+    
+    -- 时间戳
+    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    
+    -- 额外信息(JSON格式)
+    extra_data JSONB DEFAULT '{}'                 -- 额外的结构化信息
+);
+
+-- 创建索引
+CREATE INDEX idx_logs_task_id ON data_pipeline_task_logs(task_id);
+CREATE INDEX idx_logs_execution_id ON data_pipeline_task_logs(execution_id);
+CREATE INDEX idx_logs_timestamp ON data_pipeline_task_logs(timestamp DESC);
+CREATE INDEX idx_logs_level ON data_pipeline_task_logs(log_level);
+CREATE INDEX idx_logs_step ON data_pipeline_task_logs(step_name);
+```
+
+#### 任务输出文件表 (data_pipeline_task_outputs)
+```sql
+CREATE TABLE data_pipeline_task_outputs (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    execution_id VARCHAR(100) REFERENCES data_pipeline_task_executions(execution_id),
+    
+    -- 文件信息
+    file_type VARCHAR(50) NOT NULL,               -- 'ddl', 'md', 'json', 'log', 'report'
+    file_name VARCHAR(255) NOT NULL,              -- 文件名
+    file_path TEXT NOT NULL,                      -- 相对路径
+    file_size BIGINT DEFAULT 0,                   -- 文件大小(字节)
+    
+    -- 文件内容摘要
+    content_hash VARCHAR(64),                     -- 文件内容hash
+    description TEXT,                             -- 文件描述
+    
+    -- 时间戳
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    modified_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    
+    -- 状态
+    is_primary BOOLEAN DEFAULT FALSE,             -- 是否为主要输出文件
+    is_downloadable BOOLEAN DEFAULT TRUE          -- 是否可下载
+);
+
+-- 创建索引
+CREATE INDEX idx_outputs_task_id ON data_pipeline_task_outputs(task_id);
+CREATE INDEX idx_outputs_execution_id ON data_pipeline_task_outputs(execution_id);
+CREATE INDEX idx_outputs_file_type ON data_pipeline_task_outputs(file_type);
+CREATE INDEX idx_outputs_primary ON data_pipeline_task_outputs(is_primary) WHERE is_primary = TRUE;
+```
+
+### 2. 数据库操作类设计
+
+```python
+# data_pipeline/api/simple_db_manager.py
+class SimpleTaskManager:
+    """简化的数据管道任务数据库管理器"""
+    
+    def __init__(self):
+        self.logger = get_data_pipeline_logger("SimpleTaskManager")
+        self._connection = None
+        self._connect_to_pgvector()
+    
+    def create_task(self, db_connection: str, table_list_file: str, 
+                   business_context: str, **kwargs) -> str:
+        """创建新任务记录,返回task_id"""
+        
+    def update_task_status(self, task_id: str, status: str, 
+                          error_message: str = None) -> bool:
+        """更新任务状态"""
+        
+    def update_step_status(self, task_id: str, step_name: str, 
+                          status: str) -> bool:
+        """更新步骤状态"""
+        
+    def get_task(self, task_id: str) -> dict:
+        """获取任务详情"""
+        
+    def get_tasks_list(self, limit: int = 50, status: str = None) -> list:
+        """获取任务列表"""
+        
+    def create_execution(self, task_id: str, step_name: str) -> str:
+        """创建执行记录,返回execution_id"""
+        
+    def complete_execution(self, execution_id: str, status: str, 
+                          error_message: str = None) -> bool:
+        """完成执行记录"""
+        
+    def record_log(self, task_id: str, level: str, message: str, 
+                  execution_id: str = None, step_name: str = None) -> bool:
+        """记录任务日志"""
+        
+    def get_task_logs(self, task_id: str, limit: int = 100) -> list:
+        """获取任务日志"""
+        
+    def get_task_outputs(self, task_id: str) -> list:
+        """获取任务输出文件列表"""
+```
+
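+下面给出该管理器在API层的一个典型调用顺序草图(仅为用法示意,方法签名以上文类设计为准):
+
+```python
+# 示例:SimpleTaskManager的典型调用顺序(示意代码)
+manager = SimpleTaskManager()
+
+# 创建任务并启动某个步骤
+task_id = manager.create_task(
+    db_connection="postgresql://...",
+    table_list_file="tables.txt",
+    business_context="高速公路服务区管理系统"
+)
+execution_id = manager.create_execution(task_id, "ddl_generation")
+manager.update_step_status(task_id, "ddl_generation", "running")
+manager.record_log(task_id, "INFO", "DDL生成开始", execution_id, "ddl_generation")
+
+# 步骤完成后更新执行记录和步骤状态
+manager.complete_execution(execution_id, "completed")
+manager.update_step_status(task_id, "ddl_generation", "completed")
+```
+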
+## API接口详细设计
+
+### 1. API路由设计
+
+所有API都在 `citu_app.py` 中实现,路由前缀为 `/api/v0/data_pipeline/`
+
+```python
+# citu_app.py 中添加的路由
+@app.flask_app.route('/api/v0/data_pipeline/tasks', methods=['POST'])
+def create_data_pipeline_task():
+    """创建数据管道任务"""
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks', methods=['GET'])
+def get_data_pipeline_tasks():
+    """获取任务列表"""
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>', methods=['GET'])
+def get_data_pipeline_task(task_id):
+    """获取单个任务详情"""
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/active', methods=['GET'])
+def get_active_data_pipeline_task():
+    """获取当前活跃任务"""
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/logs', methods=['GET'])
+def get_data_pipeline_task_logs(task_id):
+    """获取任务日志"""
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/files', methods=['GET'])
+def get_data_pipeline_task_files(task_id):
+    """获取任务输出文件列表"""
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/files/download/<filename>', methods=['GET'])
+def download_data_pipeline_task_file(task_id, filename):
+    """下载任务输出文件"""
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>', methods=['DELETE'])
+def delete_data_pipeline_task(task_id):
+    """删除任务(清理)"""
+```
+
+### 2. API接口实现详情
+
+#### 2.1 创建任务接口
+
+```python
+@app.flask_app.route('/api/v0/data_pipeline/tasks', methods=['POST'])
+def create_data_pipeline_task():
+    """
+    创建数据管道任务
+    
+    Request Body:
+    {
+        "task_type": "complete_workflow",
+        "parameters": {
+            "db_connection": "postgresql://...",
+            "table_list_file": "tables.txt", 
+            "business_context": "业务描述",
+            "output_dir": "./data_pipeline/training_data/",
+            "execution_mode": "complete",
+            "single_step": null
+        }
+    }
+    """
+    try:
+        # 1. 参数验证
+        req_data = request.get_json()
+        if not req_data:
+            return jsonify(bad_request_response("请求体不能为空")), 400
+            
+        task_type = req_data.get('task_type', 'complete_workflow')
+        parameters = req_data.get('parameters', {})
+        
+        # 验证必需参数
+        required_params = ['db_connection', 'table_list_file', 'business_context']
+        missing_params = [p for p in required_params if not parameters.get(p)]
+        if missing_params:
+            return jsonify(bad_request_response(
+                f"缺少必需参数: {', '.join(missing_params)}",
+                missing_params=missing_params
+            )), 400
+        
+        # 验证执行模式参数
+        execution_mode = parameters.get('execution_mode', 'complete')
+        single_step = parameters.get('single_step')
+        
+        if execution_mode not in ['complete', 'single']:
+            return jsonify(bad_request_response("execution_mode必须是complete或single")), 400
+            
+        if execution_mode == 'single':
+            if not single_step or single_step not in [1, 2, 3, 4]:
+                return jsonify(bad_request_response("单步模式下single_step必须是1、2、3、4中的一个")), 400
+        elif execution_mode == 'complete' and single_step:
+            return jsonify(bad_request_response("完整模式下不应提供single_step参数")), 400
+        
+        # 2. 并发检查 - 简化版本(依赖SimpleWorkflowManager)
+        workflow_manager = SimpleWorkflowManager()
+        
+        # 3. 创建任务记录(返回task_id)
+        task_id = workflow_manager.create_task(
+            db_connection=parameters['db_connection'],
+            table_list_file=parameters['table_list_file'],
+            business_context=parameters['business_context'],
+            **{k: v for k, v in parameters.items() 
+               if k not in ['db_connection', 'table_list_file', 'business_context']}
+        )
+        
+        # 4. 启动后台进程
+        import subprocess
+        import sys
+        from pathlib import Path
+        
+        # 构建任务执行器命令
+        cmd_args = [
+            sys.executable, 
+            str(Path(__file__).parent / "data_pipeline" / "task_executor.py"),
+            '--task-id', task_id,
+            '--execution-mode', execution_mode
+        ]
+        
+        # 如果是单步执行,添加步骤参数
+        if execution_mode == 'single' and single_step:
+            cmd_args.extend(['--step-name', f'step_{single_step}'])
+        
+        # 启动后台进程,并将输出重定向到任务目录下的日志文件
+        try:
+            task_dir = os.path.join('./data_pipeline/training_data/', task_id)
+            os.makedirs(task_dir, exist_ok=True)
+            log_file_path = os.path.join(task_dir, 'data_pipeline.log')
+            process = subprocess.Popen(
+                cmd_args,
+                stdout=open(log_file_path, 'w', encoding='utf-8'),
+                stderr=subprocess.STDOUT,
+                cwd=Path(__file__).parent,
+                start_new_session=True
+            )
+            logger.info(f"启动任务进程: PID={process.pid}, task_id={task_id}")
+        except Exception as e:
+            # 启动失败时清理任务记录
+            workflow_manager.cleanup()
+            return jsonify(internal_error_response(f"启动后台进程失败: {str(e)}")), 500
+        
+        # 5. 返回成功响应
+        return jsonify(success_response(
+            message="任务创建成功",
+            data={
+                "task_id": task_id,
+                "status": "pending",
+                "created_at": datetime.now().isoformat(),
+                "output_directory": task_dir
+            }
+        )), 201
+        
+    except Exception as e:
+        logger.exception(f"创建任务失败: {str(e)}")
+        return jsonify(internal_error_response("创建任务失败")), 500
+```
+
+#### 2.2 获取任务详情接口
+
+```python
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>', methods=['GET'])
+def get_data_pipeline_task(task_id):
+    """
+    获取单个任务详情
+    
+    Response:
+    {
+        "success": true,
+        "data": {
+            "task_id": "task_20250627_143052",
+            "task_type": "complete_workflow",
+            "status": "running",
+            "progress": 45,
+            "current_step": "question_sql_generation",
+            "parameters": {...},
+            "result": {...},
+            "error_message": null,
+            "step_details": [...],
+            "created_at": "2025-06-27T14:30:52",
+            "started_at": "2025-06-27T14:30:53",
+            "completed_at": null,
+            "duration": 125.5,
+            "output_directory": "./data_pipeline/training_data/task_20250627_143052"
+        }
+    }
+    """
+    try:
+        # 参数验证
+        if not task_id or not task_id.startswith('task_'):
+            return jsonify(bad_request_response("无效的任务ID格式")), 400
+        
+        workflow_manager = SimpleWorkflowManager()
+        task_data = workflow_manager.get_task_status(task_id)
+        
+        if not task_data:
+            return jsonify(not_found_response(f"任务不存在: {task_id}")), 404
+        
+        # 计算执行时长
+        duration = None
+        if task_data.get('started_at'):
+            end_time = task_data.get('completed_at') or datetime.now()
+            start_time = task_data['started_at']
+            if isinstance(start_time, str):
+                start_time = datetime.fromisoformat(start_time)
+            if isinstance(end_time, str):
+                end_time = datetime.fromisoformat(end_time)
+            duration = (end_time - start_time).total_seconds()
+        
+        # 获取步骤详情
+        step_details = []
+        step_stats = task_data.get('step_stats', {})
+        
+        for step_name in ['ddl_md_generation', 'question_sql_generation', 'sql_validation', 'training_data_load']:
+            step_info = step_stats.get(step_name, {})
+            step_details.append({
+                "step": step_name,
+                "status": step_info.get('status', 'pending'),
+                "started_at": step_info.get('started_at'),
+                "completed_at": step_info.get('completed_at'),
+                "duration": step_info.get('duration'),
+                "error_message": step_info.get('error_message')
+            })
+        
+        response_data = {
+            **task_data,
+            "duration": duration,
+            "step_details": step_details
+        }
+        
+        return jsonify(success_response("获取任务详情成功", data=response_data))
+        
+    except Exception as e:
+        logger.exception(f"获取任务详情失败: {str(e)}")
+        return jsonify(internal_error_response("获取任务详情失败")), 500
+```
+
+## Schema Workflow 集成设计
+
+### 1. 命令行参数扩展
+
+在现有的 `setup_argument_parser()` 函数中添加新参数:
+
+```python
+def setup_argument_parser():
+    """设置命令行参数解析器"""
+    parser = argparse.ArgumentParser(
+        description="Schema工作流编排器 - 端到端的Schema处理流程",
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    
+    # ... 现有参数 ...
+    
+    # 新增API集成参数
+    parser.add_argument(
+        "--task-id",
+        required=False,
+        help="任务ID(API调用时提供,手动执行时自动生成)"
+    )
+    
+    parser.add_argument(
+        "--no-db-tracking",
+        action="store_true",
+        help="禁用数据库任务追踪(不记录到任务表)"
+    )
+    
+    # 新增执行模式参数
+    parser.add_argument(
+        "--execution-mode",
+        choices=['complete', 'single'],
+        default='complete',
+        help="执行模式:complete=完整工作流,single=单步执行"
+    )
+    
+    parser.add_argument(
+        "--single-step",
+        type=int,
+        choices=[1, 2, 3, 4],
+        help="单步执行时指定步骤号(1=DDL生成,2=Q&A生成,3=SQL验证,4=训练数据加载)"
+    )
+    
+    return parser
+```
+
+### 2. SchemaWorkflowOrchestrator 类修改
+
+```python
+class SchemaWorkflowOrchestrator:
+    """端到端的Schema处理编排器 - 完整工作流程"""
+    
+    def __init__(self, 
+                 db_connection: str,
+                 table_list_file: str, 
+                 business_context: str,
+                 output_dir: str = None,
+                 enable_sql_validation: bool = True,
+                 enable_llm_repair: bool = True,
+                 modify_original_file: bool = True,
+                 enable_training_data_load: bool = True,
+                 # 新增参数
+                 task_id: str = None,
+                 db_logger: 'DatabaseProgressLogger' = None,
+                 execution_mode: str = 'complete',
+                 single_step: int = None):
+        """
+        初始化Schema工作流编排器
+        
+        Args:
+            # ... 现有参数 ...
+            task_id: 任务ID(可选)
+            db_logger: 数据库进度记录器(可选)
+            execution_mode: 执行模式 ('complete' 或 'single')
+            single_step: 单步执行时的步骤号 (1-4)
+        """
+        # ... 现有初始化代码 ...
+        
+        # 新增属性
+        self.task_id = task_id
+        self.db_logger = db_logger
+        self.execution_mode = execution_mode
+        self.single_step = single_step
+        
+        # 如果提供了task_id但没有db_logger,尝试创建一个
+        if self.task_id and not self.db_logger:
+            try:
+                self.db_logger = self._create_db_logger()
+            except Exception as e:
+                self.logger.warning(f"无法创建数据库记录器: {e}")
+    
+    def _create_db_logger(self):
+        """创建数据库进度记录器"""
+        from data_pipeline.api.database_logger import DatabaseProgressLogger
+        return DatabaseProgressLogger(self.task_id, self.db_connection)
+    
+    def _should_execute_step(self, step_number: int) -> bool:
+        """判断是否应该执行指定步骤"""
+        if self.execution_mode == 'complete':
+            # 完整模式:执行所有步骤
+            return True
+        elif self.execution_mode == 'single':
+            # 单步模式:只执行指定的步骤
+            return step_number == self.single_step
+        else:
+            return False
+    
+    async def execute_complete_workflow(self) -> Dict[str, Any]:
+        """执行完整的Schema处理工作流程"""
+        self.workflow_state["start_time"] = time.time()
+        
+        # 更新数据库状态为running
+        if self.db_logger:
+            self.db_logger.update_task_status('running')
+            self.db_logger.add_log('INFO', f'开始执行Schema工作流编排', 'workflow_start')
+        
+        self.logger.info("🚀 开始执行Schema工作流编排")
+        # ... 现有日志 ...
+        
+        try:
+            # 步骤1: 生成DDL和MD文件
+            if self._should_execute_step(1):
+                await self._execute_step_1_ddl_md_generation()
+            
+            # 步骤2: 生成Question-SQL对
+            if self._should_execute_step(2):
+                await self._execute_step_2_question_sql_generation()
+            
+            # 步骤3: 验证和修正SQL
+            if self._should_execute_step(3):
+                await self._execute_step_3_sql_validation()
+            
+            # 步骤4: 训练数据加载
+            if self._should_execute_step(4):
+                await self._execute_step_4_training_data_load()
+            
+            # 设置结束时间
+            self.workflow_state["end_time"] = time.time()
+            
+            # 生成最终报告
+            final_report = await self._generate_final_report()
+            
+            # 更新数据库状态为completed
+            if self.db_logger:
+                self.db_logger.update_task_status('completed', result=final_report)
+                self.db_logger.add_log('INFO', '工作流执行完成', 'workflow_complete')
+            
+            self.logger.info("✅ Schema工作流编排完成")
+            return final_report
+            
+        except Exception as e:
+            self.workflow_state["end_time"] = time.time()
+            
+            # 更新数据库状态为failed
+            if self.db_logger:
+                self.db_logger.update_task_status('failed', error_message=str(e))
+                self.db_logger.add_log('ERROR', f'工作流执行失败: {str(e)}', 'workflow_error')
+            
+            self.logger.exception(f"❌ 工作流程执行失败: {str(e)}")
+            error_report = await self._generate_error_report(e)
+            return error_report
+    
+    async def _execute_step_1_ddl_md_generation(self):
+        """步骤1: 生成DDL和MD文件"""
+        self.workflow_state["current_step"] = "ddl_md_generation"
+        
+        # 更新数据库进度
+        if self.db_logger:
+            self.db_logger.update_progress(10, 'ddl_md_generation')
+            self.db_logger.add_log('INFO', 'DDL/MD生成开始', 'ddl_md_generation')
+        
+        # ... 现有执行代码 ...
+        
+        try:
+            # ... DDL/MD生成逻辑 ...
+            
+            # 更新进度
+            if self.db_logger:
+                self.db_logger.update_progress(40, 'ddl_md_generation')
+                self.db_logger.add_log('INFO', f'DDL/MD生成完成: 成功处理 {processed_tables} 个表', 'ddl_md_generation')
+            
+        except Exception as e:
+            if self.db_logger:
+                self.db_logger.add_log('ERROR', f'DDL/MD生成失败: {str(e)}', 'ddl_md_generation')
+            raise
+    
+    # 类似地修改其他步骤方法...
+```
+
+### 3. 数据库进度记录器
+
+```python
+# data_pipeline/api/database_logger.py
+class DatabaseProgressLogger:
+    """数据库进度记录器"""
+    
+    def __init__(self, task_id: str, db_connection_string: str):
+        self.task_id = task_id
+        self.task_manager = DataPipelineTaskManager(db_connection_string)
+        self.logger = get_data_pipeline_logger("DatabaseLogger")
+    
+    def update_task_status(self, status: str, current_step: str = None, 
+                          error_message: str = None, result: dict = None):
+        """更新任务状态"""
+        try:
+            success = self.task_manager.update_task_status(
+                self.task_id, status, current_step, error_message
+            )
+            if result and status == 'completed':
+                self.task_manager.update_task_result(self.task_id, result)
+            return success
+        except Exception as e:
+            self.logger.warning(f"更新任务状态失败: {e}")
+            return False
+    
+    def update_progress(self, progress: int, current_step: str = None):
+        """更新任务进度"""
+        try:
+            return self.task_manager.update_task_progress(
+                self.task_id, progress, current_step
+            )
+        except Exception as e:
+            self.logger.warning(f"更新任务进度失败: {e}")
+            return False
+    
+    def add_log(self, level: str, message: str, step_name: str = None, 
+               extra_data: dict = None):
+        """添加任务日志"""
+        try:
+            return self.task_manager.add_task_log(
+                self.task_id, level, message, step_name, extra_data
+            )
+        except Exception as e:
+            self.logger.warning(f"添加任务日志失败: {e}")
+            return False
+```
+
+## 日志系统集成设计
+
+### 1. 日志路径管理
+
+修改 `core/logging/log_manager.py` 以支持任务特定的日志目录:
+
+```python
+def _create_file_handler(self, file_config: dict, module: str) -> logging.Handler:
+    """创建文件处理器"""
+    
+    # 对于data_pipeline模块,检查是否有任务特定的日志目录
+    if module == 'data_pipeline' and 'DATA_PIPELINE_LOG_DIR' in os.environ:
+        log_file = Path(os.environ['DATA_PIPELINE_LOG_DIR']) / 'data_pipeline.log'
+        # 禁用轮转,因为每个任务的日志是独立的
+        file_config = file_config.copy()
+        file_config['enable_rotation'] = False
+    else:
+        log_file = self.base_log_dir / file_config.get('filename', f'{module}.log')
+    
+    # 确保日志目录存在
+    log_file.parent.mkdir(parents=True, exist_ok=True)
+    
+    # ... 其余代码保持不变 ...
+```
+
+### 2. 任务日志初始化
+
+在 `schema_workflow.py` 的 `main()` 函数中:
+
+```python
+async def main():
+    """命令行入口点"""
+    parser = setup_argument_parser()
+    args = parser.parse_args()
+    
+    # 初始化变量
+    task_id = None
+    db_logger = None
+    
+    # 如果不禁用数据库追踪
+    if not args.no_db_tracking:
+        # 如果没有task_id,自动生成
+        if not args.task_id:
+            from datetime import datetime
+            args.task_id = f"manual_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+            logger.info(f"📝 自动生成任务ID: {args.task_id}")
+        
+        task_id = args.task_id
+        
+        # 确定任务目录
+        if task_id.startswith('task_'):
+            # API调用的任务,输出目录已经是任务特定的
+            task_dir = args.output_dir
+        else:
+            # 手动执行的任务,创建任务特定目录
+            task_dir = os.path.join(args.output_dir, task_id)
+            os.makedirs(task_dir, exist_ok=True)
+            args.output_dir = task_dir
+        
+        # 设置环境变量,让日志系统知道当前的任务目录
+        os.environ['DATA_PIPELINE_LOG_DIR'] = task_dir
+        
+        # 重新初始化日志系统
+        from core.logging import initialize_logging
+        initialize_logging()
+        
+        try:
+            # 创建任务记录(如果是手动执行)
+            if task_id.startswith('manual_'):
+                task_manager = DataPipelineTaskManager(args.db_connection)
+                task_manager.create_task(
+                    task_id=task_id,
+                    task_type='complete_workflow',
+                    parameters={
+                        'db_connection': args.db_connection,
+                        'table_list': args.table_list,
+                        'business_context': args.business_context,
+                        'output_dir': args.output_dir,
+                        # ... 其他参数
+                    },
+                    created_by='manual'
+                )
+            
+            # 初始化数据库记录器
+            db_logger = DatabaseProgressLogger(task_id, args.db_connection)
+            logger.info(f"✅ 已启用数据库任务追踪: {task_id}")
+            
+        except Exception as e:
+            logger.warning(f"⚠️ 无法初始化任务追踪: {e}")
+            db_logger = None
+    else:
+        logger.info("ℹ️ 已禁用数据库任务追踪")
+    
+    # 参数验证:单步模式必须提供步骤号
+    if args.execution_mode == 'single' and not args.single_step:
+        logger.error("单步模式下必须提供 --single-step 参数")
+        sys.exit(1)
+    
+    # 创建编排器,传入新参数
+    orchestrator = SchemaWorkflowOrchestrator(
+        db_connection=args.db_connection,
+        table_list_file=args.table_list,
+        business_context=args.business_context,
+        output_dir=args.output_dir,
+        enable_sql_validation=not args.skip_validation,
+        enable_llm_repair=not args.disable_llm_repair,
+        modify_original_file=not args.no_modify_file,
+        enable_training_data_load=not args.skip_training_load,
+        task_id=task_id,
+        db_logger=db_logger,
+        execution_mode=args.execution_mode,
+        single_step=args.single_step
+    )
+    
+    # 执行工作流
+    report = await orchestrator.execute_complete_workflow()
+    
+    # ... 其余代码保持不变 ...
+```
+
+## 错误处理和监控
+
+### 1. 僵尸任务检测
+
+```python
+# data_pipeline/api/task_monitor.py
+class TaskMonitor:
+    """任务监控器"""
+    
+    def __init__(self, db_connection_string: str):
+        self.task_manager = DataPipelineTaskManager(db_connection_string)
+        self.logger = get_data_pipeline_logger("TaskMonitor")
+    
+    def check_zombie_tasks(self, timeout_hours: int = 2):
+        """检查僵尸任务"""
+        try:
+            cutoff_time = datetime.now() - timedelta(hours=timeout_hours)
+            
+            # 查找超时的运行中任务
+            zombie_tasks = self.task_manager.get_zombie_tasks(cutoff_time)
+            
+            for task in zombie_tasks:
+                task_id = task['id']
+                self.logger.warning(f"发现僵尸任务: {task_id}")
+                
+                # 标记为失败
+                self.task_manager.update_task_status(
+                    task_id, 
+                    'failed', 
+                    error_message=f"任务超时(超过{timeout_hours}小时),可能已停止运行"
+                )
+                
+                # 记录日志
+                self.task_manager.add_task_log(
+                    task_id, 
+                    'ERROR', 
+                    f"任务被标记为僵尸任务,执行时间超过{timeout_hours}小时", 
+                    'system_check'
+                )
+        
+        except Exception as e:
+            self.logger.error(f"检查僵尸任务失败: {e}")
+
+# 在citu_app.py中添加定期检查
+import threading
+import time
+
+def start_task_monitor():
+    """启动任务监控器"""
+    def monitor_loop():
+        monitor = TaskMonitor(app_config.PGVECTOR_CONFIG)
+        while True:
+            try:
+                monitor.check_zombie_tasks()
+                time.sleep(300)  # 每5分钟检查一次
+            except Exception as e:
+                logger.error(f"任务监控异常: {e}")
+                time.sleep(60)  # 出错时等待1分钟再重试
+    
+    monitor_thread = threading.Thread(target=monitor_loop, daemon=True)
+    monitor_thread.start()
+    logger.info("任务监控器已启动")
+
+# 在应用启动时调用
+if __name__ == '__main__':
+    start_task_monitor()
+    app.run()
+```
+
+### 2. 文件输出管理
+
+```python
+# data_pipeline/api/file_manager.py
+class TaskFileManager:
+    """任务文件管理器"""
+    
+    def __init__(self, task_id: str, output_dir: str, db_connection_string: str):
+        self.task_id = task_id
+        self.output_dir = Path(output_dir)
+        self.task_manager = DataPipelineTaskManager(db_connection_string)
+        self.logger = get_data_pipeline_logger("FileManager")
+    
+    def scan_and_register_files(self):
+        """扫描并注册输出文件"""
+        try:
+            if not self.output_dir.exists():
+                return
+            
+            # 文件类型映射
+            file_type_mapping = {
+                '.ddl': 'ddl',
+                '.md': 'md', 
+                '.json': 'json',
+                '.log': 'log',
+                '.txt': 'txt'
+            }
+            
+            for file_path in self.output_dir.iterdir():
+                if file_path.is_file():
+                    file_ext = file_path.suffix.lower()
+                    file_type = file_type_mapping.get(file_ext, 'other')
+                    file_size = file_path.stat().st_size
+                    
+                    # 判断是否为主要输出文件
+                    is_primary = (
+                        file_path.name.endswith('_pair.json') or
+                        file_path.name == 'metadata.txt' or
+                        file_path.name.endswith('_summary.log')
+                    )
+                    
+                    # 注册文件
+                    self.task_manager.register_output_file(
+                        task_id=self.task_id,
+                        file_type=file_type,
+                        file_name=file_path.name,
+                        file_path=str(file_path.relative_to(self.output_dir)),
+                        file_size=file_size,
+                        is_primary=is_primary
+                    )
+        
+        except Exception as e:
+            self.logger.error(f"扫描文件失败: {e}")
+    
+    def cleanup_task_files(self):
+        """清理任务文件"""
+        try:
+            if self.output_dir.exists():
+                shutil.rmtree(self.output_dir)
+                self.logger.info(f"已清理任务文件: {self.output_dir}")
+        except Exception as e:
+            self.logger.error(f"清理任务文件失败: {e}")
+```
+
+## 部署和初始化
+
+### 1. 数据库初始化脚本
+
+```sql
+-- data_pipeline/sql/init_tables.sql
+
+-- 创建任务表
+CREATE TABLE IF NOT EXISTS data_pipeline_tasks (
+    id VARCHAR(32) PRIMARY KEY,
+    task_type VARCHAR(50) NOT NULL DEFAULT 'complete_workflow',
+    status VARCHAR(20) NOT NULL DEFAULT 'pending',
+    parameters JSONB NOT NULL,
+    result JSONB,
+    error_message TEXT,
+    error_step VARCHAR(100),
+    progress INTEGER DEFAULT 0 CHECK (progress >= 0 AND progress <= 100),
+    current_step VARCHAR(100),
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    started_at TIMESTAMP,
+    completed_at TIMESTAMP,
+    created_by VARCHAR(50) DEFAULT 'api',
+    step_stats JSONB DEFAULT '{}',
+    output_directory TEXT,
+    db_name VARCHAR(100),
+    business_context TEXT
+);
+
+-- 创建日志表
+CREATE TABLE IF NOT EXISTS data_pipeline_task_logs (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    log_level VARCHAR(10) NOT NULL,
+    message TEXT NOT NULL,
+    step_name VARCHAR(100),
+    module_name VARCHAR(100),
+    function_name VARCHAR(100),
+    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    extra_data JSONB DEFAULT '{}'
+);
+
+-- 创建输出文件表
+CREATE TABLE IF NOT EXISTS data_pipeline_task_outputs (
+    id SERIAL PRIMARY KEY,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
+    file_type VARCHAR(50) NOT NULL,
+    file_name VARCHAR(255) NOT NULL,
+    file_path TEXT NOT NULL,
+    file_size BIGINT DEFAULT 0,
+    content_hash VARCHAR(64),
+    description TEXT,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    modified_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    is_primary BOOLEAN DEFAULT FALSE,
+    is_downloadable BOOLEAN DEFAULT TRUE
+);
+
+-- 创建索引
+CREATE INDEX IF NOT EXISTS idx_tasks_status ON data_pipeline_tasks(status);
+CREATE INDEX IF NOT EXISTS idx_tasks_created_at ON data_pipeline_tasks(created_at DESC);
+CREATE INDEX IF NOT EXISTS idx_tasks_db_name ON data_pipeline_tasks(db_name);
+CREATE INDEX IF NOT EXISTS idx_tasks_created_by ON data_pipeline_tasks(created_by);
+
+CREATE INDEX IF NOT EXISTS idx_logs_task_id ON data_pipeline_task_logs(task_id);
+CREATE INDEX IF NOT EXISTS idx_logs_timestamp ON data_pipeline_task_logs(timestamp DESC);
+CREATE INDEX IF NOT EXISTS idx_logs_level ON data_pipeline_task_logs(log_level);
+CREATE INDEX IF NOT EXISTS idx_logs_step ON data_pipeline_task_logs(step_name);
+
+CREATE INDEX IF NOT EXISTS idx_outputs_task_id ON data_pipeline_task_outputs(task_id);
+CREATE INDEX IF NOT EXISTS idx_outputs_file_type ON data_pipeline_task_outputs(file_type);
+CREATE INDEX IF NOT EXISTS idx_outputs_primary ON data_pipeline_task_outputs(is_primary) WHERE is_primary = TRUE;
+
+-- 创建清理函数
+CREATE OR REPLACE FUNCTION cleanup_old_data_pipeline_tasks(days_to_keep INTEGER DEFAULT 30)
+RETURNS INTEGER AS $$
+DECLARE
+    deleted_count INTEGER;
+    cutoff_date TIMESTAMP;
+BEGIN
+    cutoff_date := NOW() - INTERVAL '1 day' * days_to_keep;
+    
+    -- 删除旧任务(级联删除相关日志和文件记录)
+    DELETE FROM data_pipeline_tasks 
+    WHERE created_at < cutoff_date 
+    AND status IN ('completed', 'failed');
+    
+    GET DIAGNOSTICS deleted_count = ROW_COUNT;
+    
+    RETURN deleted_count;
+END;
+$$ LANGUAGE plpgsql;
+```
+
+### 2. 配置文件更新
+
+需要在 `app_config.py` 中添加Data Pipeline相关配置:
+
+```python
+# Data Pipeline API配置
+DATA_PIPELINE_CONFIG = {
+    "max_concurrent_tasks": 1,           # 最大并发任务数
+    "task_timeout_hours": 2,             # 任务超时时间(小时)
+    "log_retention_days": 30,            # 日志保留天数
+    "file_retention_days": 30,           # 文件保留天数
+    "monitor_interval_seconds": 300,     # 监控检查间隔(秒)
+    "enable_file_download": True,        # 是否允许文件下载
+    "max_download_file_size": 100 * 1024 * 1024,  # 最大下载文件大小(字节)
+}
+```
+
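+这些配置项可以在监控与文件管理逻辑中直接引用,例如(示意代码,假设上述字典已加入 app_config.py):
+
+```python
+# 示例:在文件下载前检查DATA_PIPELINE_CONFIG中的限制(示意代码)
+from app_config import DATA_PIPELINE_CONFIG
+
+def should_allow_download(file_size: int) -> bool:
+    """根据配置判断文件是否允许下载"""
+    if not DATA_PIPELINE_CONFIG["enable_file_download"]:
+        return False
+    return file_size <= DATA_PIPELINE_CONFIG["max_download_file_size"]
+```
+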
+## 总结
+
+本详细设计文档提供了Data Pipeline API系统的完整技术实现方案:
+
+### 主要特点
+
+1. **API与执行分离**:使用subprocess实现真正的后台执行,API不阻塞
+2. **数据库驱动的状态管理**:所有任务状态、进度、日志都记录在PostgreSQL中
+3. **灵活的步骤控制**:支持从指定步骤开始、结束,以及跳过特定步骤
+4. **统一的日志管理**:每个任务的日志都写入独立的任务目录
+5. **完整的文件管理**:自动扫描、注册和管理任务输出文件
+6. **健壮的错误处理**:包括僵尸任务检测、超时处理等
+
+### 实现要点
+
+1. **最小化代码修改**:主要修改集中在 `schema_workflow.py` 和 `citu_app.py`
+2. **向后兼容**:手动执行方式仍然完全支持
+3. **扩展性好**:易于添加新的任务类型和执行步骤
+4. **监控友好**:提供完整的任务监控和清理机制
+
+### 关键文件
+
+1. `citu_app.py` - 添加API路由实现
+2. `data_pipeline/schema_workflow.py` - 修改以支持API集成
+3. `data_pipeline/api/database_manager.py` - 数据库操作封装(新建)
+4. `data_pipeline/api/database_logger.py` - 进度记录器(新建)
+5. `data_pipeline/sql/init_tables.sql` - 数据库初始化脚本(新建)
+
+这个设计充分考虑了现有代码结构,提供了完整的API功能,同时保持了系统的简洁性和可维护性。

+ 637 - 0
docs/data_pipeline_api_usage_guide.md

@@ -0,0 +1,637 @@
+# Data Pipeline API 使用指南
+
+## 概述
+
+Data Pipeline API 是一个简化的数据管道调度和管理系统,支持通过 REST API 调度执行数据管道任务,提供任务管理、进度监控、双日志系统和文件管理等功能。
+
+## 系统架构
+
+### 核心组件
+
+1. **简化任务管理器** (`SimpleTaskManager`) - 管理任务生命周期和数据库操作
+2. **简化工作流执行器** (`SimpleWorkflowExecutor`) - 执行具体的数据管道任务
+3. **任务执行器** (`task_executor.py`) - 独立进程执行任务
+4. **文件管理器** (`SimpleFileManager`) - 管理任务输出文件和下载
+5. **双日志系统** - 数据库日志 + 任务目录详细日志
+
+### 数据库结构
+
+系统使用 4 个主要数据库表(部署在 pgvector 数据库中):
+
+- `data_pipeline_tasks` - 任务主表
+- `data_pipeline_task_executions` - 任务执行记录表
+- `data_pipeline_task_logs` - 任务日志表
+- `data_pipeline_task_outputs` - 任务输出文件表
+
+### 执行架构
+
+```
+API请求 → citu_app.py → subprocess → task_executor.py → SimpleWorkflowExecutor → SchemaWorkflowOrchestrator
+```
+
+- **进程隔离**:使用 subprocess 启动独立进程执行任务
+- **双日志记录**:数据库结构化日志 + 任务目录详细文件日志
+- **任务目录管理**:每个任务在 `./data_pipeline/training_data/{task_id}/` 目录中独立存储
+
+## 部署说明
+
+### 1. 数据库初始化
+
+首先运行 SQL 初始化脚本创建必要的数据库表:
+
+```bash
+psql -h host -p port -U username -d database_name -f data_pipeline/sql/init_tables.sql
+```
+
+### 2. 启动应用
+
+启动 Flask 应用(包含 Data Pipeline API):
+
+```bash
+python citu_app.py
+```
+
+应用将在 `http://localhost:8084` 启动,Data Pipeline API 端点前缀为 `/api/v0/data_pipeline/`。
+
+## API 使用指南
+
+### 基础任务管理
+
+#### 1. 创建任务
+
+```bash
+curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks \
+  -H "Content-Type: application/json" \
+  -d '{
+    "table_list_file": "tables.txt",
+    "business_context": "高速公路服务区管理系统",
+    "db_name": "highway_db",
+    "enable_sql_validation": true,
+    "enable_llm_repair": true,
+    "modify_original_file": true,
+    "enable_training_data_load": true
+  }'
+```
+
+响应示例:
+```json
+{
+  "success": true,
+  "code": 201,
+  "message": "任务创建成功",
+  "data": {
+    "task_id": "task_20250627_143052",
+    "status": "pending",
+    "created_at": "2025-06-27T14:30:52"
+  }
+}
+```
+
+#### 2. 执行任务
+
+**完整工作流执行:**
+```bash
+curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/execute \
+  -H "Content-Type: application/json" \
+  -d '{
+    "execution_mode": "complete",
+    "force_execution": false,
+    "clean_existing_files": true
+  }'
+```
+
+**单步执行:**
+```bash
+curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/execute \
+  -H "Content-Type: application/json" \
+  -d '{
+    "execution_mode": "step", 
+    "step_name": "ddl_generation"
+  }'
+```
+
+**可用的步骤名称:**
+- `ddl_generation` - DDL生成和MD文档生成
+- `qa_generation` - Q&A问答对生成
+- `sql_validation` - SQL验证和修复
+- `training_load` - 训练数据加载到Vanna
+
+响应示例:
+```json
+{
+  "success": true,
+  "code": 202,
+  "message": "任务执行已启动",
+  "data": {
+    "task_id": "task_20250627_143052",
+    "execution_mode": "step",
+    "step_name": "ddl_generation",
+    "status": "running"
+  }
+}
+```
+
+#### 3. 查询任务状态
+
+```bash
+curl -X GET http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052
+```
+
+响应示例:
+```json
+{
+  "success": true,
+  "data": {
+    "task_id": "task_20250627_143052",
+    "status": "in_progress",
+    "step_status": {
+      "ddl_generation": "completed",
+      "qa_generation": "running",
+      "sql_validation": "pending",
+      "training_load": "pending"
+    },
+    "current_execution": {
+      "execution_id": "task_20250627_143052_step_qa_generation_exec_20250627_143100",
+      "step": "qa_generation",
+      "status": "running",
+      "started_at": "2025-06-27T14:31:00"
+    }
+  }
+}
+```
+
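+前端通常按固定间隔轮询该接口,直到任务进入终态。下面是一个简单的轮询示例(示意代码,5秒间隔为假设):
+
+```python
+# 示例:轮询任务状态直到任务结束(示意代码)
+import time
+import requests
+
+def wait_for_task(task_id: str, base_url: str = "http://localhost:8084", interval: int = 5):
+    terminal_status = {"completed", "failed", "partial_completed"}
+    while True:
+        resp = requests.get(f"{base_url}/api/v0/data_pipeline/tasks/{task_id}", timeout=10)
+        data = resp.json().get("data", {})
+        status = data.get("status")
+        print(f"任务 {task_id} 当前状态: {status}")
+        if status in terminal_status:
+            return data
+        time.sleep(interval)
+```
+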
+#### 4. 获取任务列表
+
+```bash
+curl -X GET "http://localhost:8084/api/v0/data_pipeline/tasks?limit=10&status=completed"
+```
+
+### 日志管理
+
+#### 查看任务日志(数据库日志)
+
+```bash
+curl -X GET "http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/logs?limit=50&level=ERROR"
+```
+
+#### 查看任务目录详细日志
+
+任务执行过程中的详细日志会写入任务目录的 `data_pipeline.log` 文件:
+
+**文件位置:** `./data_pipeline/training_data/{task_id}/data_pipeline.log`
+
+**日志内容示例:**
+```
+2025-07-01 14:30:52 [INFO] TaskDir_task_20250701_143052: 任务目录日志初始化完成 - 任务ID: task_20250701_143052
+2025-07-01 14:30:53 [INFO] TaskDir_task_20250701_143052: [complete] 开始执行步骤: complete
+2025-07-01 14:30:53 [INFO] DataPipelineOrchestrator: 开始执行完整工作流
+2025-07-01 14:30:54 [INFO] DDLMDGenerator: 开始处理表: bss_business_day_data
+```
+
+### 文件管理
+
+#### 1. 获取输出文件列表
+
+```bash
+curl -X GET http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/files
+```
+
+#### 2. 下载任务文件
+
+```bash
+curl -X GET http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/files/download/qs_highway_db_20250627_143052_pair.json \
+  -o downloaded_file.json
+```
+
+#### 3. 创建任务压缩包
+
+```bash
+curl -X POST http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/files/archive \
+  -H "Content-Type: application/json" \
+  -d '{"archive_format": "zip"}'
+```
+
+#### 4. 验证文件完整性
+
+```bash
+curl -X GET http://localhost:8084/api/v0/data_pipeline/tasks/task_20250627_143052/files/integrity
+```
+
+#### 5. 清理旧文件
+
+```bash
+curl -X POST http://localhost:8084/api/v0/data_pipeline/files/cleanup \
+  -H "Content-Type: application/json" \
+  -d '{"days_to_keep": 30}'
+```
+
+### 监控功能
+
+#### 1. 获取系统状态
+
+```bash
+curl -X GET http://localhost:8084/api/v0/data_pipeline/monitor/status
+```
+
+响应包含:
+- 系统性能指标(CPU、内存、磁盘使用率)
+- 任务统计信息
+- 磁盘使用情况
+- 异常检测结果
+- 系统健康状态
+
+#### 2. 获取任务详细监控
+
+```bash
+curl -X GET http://localhost:8084/api/v0/data_pipeline/monitor/tasks/task_20250627_143052
+```
+
+#### 3. 获取历史性能数据
+
+```bash
+curl -X GET "http://localhost:8084/api/v0/data_pipeline/monitor/metrics/history?minutes=120"
+```
+
+#### 4. 获取异常记录
+
+```bash
+curl -X GET "http://localhost:8084/api/v0/data_pipeline/monitor/anomalies?hours=24"
+```
+
+### 统计信息
+
+#### 获取整体统计
+
+```bash
+curl -X GET http://localhost:8084/api/v0/data_pipeline/stats
+```
+
+## 工作流说明
+
+### 完整工作流步骤
+
+1. **DDL生成** (`ddl_generation`)
+   - 连接数据库分析表结构
+   - 生成 `.ddl` 文件和 `_detail.md` 文档
+   - 生成 `metadata.txt` 和 `filename_mapping.txt`
+
+2. **Question-SQL生成** (`qa_generation`)
+   - 基于DDL和文档生成问答对
+   - 输出 `qs_*_pair.json` 文件
+
+3. **SQL验证** (`sql_validation`) - 可选
+   - 验证生成的SQL语句
+   - 修复无效SQL(如果启用LLM修复)
+   - 生成验证报告
+
+4. **训练数据加载** (`training_load`) - 可选
+   - 将生成的数据加载到 Vanna.ai 训练数据库
+
+### 任务状态说明
+
+- `pending` - 任务已创建,等待执行
+- `in_progress` - 任务正在执行中
+- `partial_completed` - 部分步骤完成
+- `completed` - 任务完全完成
+- `failed` - 任务执行失败
+
+### 步骤状态说明
+
+- `pending` - 步骤等待执行
+- `running` - 步骤正在执行
+- `completed` - 步骤执行完成
+- `failed` - 步骤执行失败
+
+## 文件组织结构
+
+每个任务在 `./data_pipeline/training_data/` 下创建独立目录:
+
+```
+./data_pipeline/training_data/
+├── task_20250627_143052/                   # 任务ID作为目录名
+│   ├── task_config.json                    # 任务配置参数
+│   ├── task_result.json                    # 最终执行结果
+│   ├── ddl_generation_result.json          # DDL生成步骤结果
+│   ├── qa_generation_result.json           # QA生成步骤结果
+│   ├── sql_validation_result.json          # SQL验证步骤结果
+│   ├── training_load_result.json           # 训练加载步骤结果
+│   ├── bss_*.ddl                          # 生成的DDL文件
+│   ├── bss_*_detail.md                    # 生成的MD文档
+│   ├── qs_*.json                          # Question-SQL对
+│   ├── metadata.txt                        # 元数据文件
+│   ├── filename_mapping.txt               # 文件映射
+│   ├── sql_validation_*_summary.log       # SQL验证摘要
+│   └── sql_validation_*_report.json       # SQL验证详细报告
+└── task_20250627_150123/
+    └── ...
+```
+
+## 错误处理
+
+### 常见错误和解决方案
+
+1. **任务创建失败**
+   - 检查数据库连接配置
+   - 确认表清单文件存在
+   - 验证PostgreSQL连接权限
+
+2. **执行超时**
+   - 系统自动检测2小时以上的僵尸任务
+   - 可通过监控API查看系统资源使用情况
+
+3. **文件访问错误**
+   - 检查目录权限
+   - 确认磁盘空间充足
+
+4. **依赖检查失败**
+   - 按顺序执行步骤:ddl_generation → qa_generation → sql_validation → training_load
+   - 或使用 `force_execution: true` 跳过依赖检查
+
+## 最佳实践
+
+### 1. 任务管理
+- 使用描述性的业务上下文
+- 定期清理旧任务文件释放磁盘空间
+- 监控长时间运行的任务
+
+### 2. 性能优化
+- 大型数据库建议分批处理表清单
+- 监控系统资源使用情况
+- 及时处理异常告警
+
+### 3. 安全考虑
+- 不要在日志中记录敏感数据库连接信息
+- 定期备份重要的训练数据
+- 控制API访问权限
+
+## 故障排除
+
+### 查看日志
+```bash
+# 查看任务错误日志
+curl -X GET "http://localhost:8084/api/v0/data_pipeline/tasks/TASK_ID/logs?level=ERROR"
+
+# 查看系统异常
+curl -X GET "http://localhost:8084/api/v0/data_pipeline/monitor/anomalies"
+```
+
+### 检查系统状态
+```bash
+# 获取完整系统状态
+curl -X GET http://localhost:8084/api/v0/data_pipeline/monitor/status
+```
+
+### 手动清理
+```bash
+# 清理僵尸任务(通过数据库管理器)
+# 清理旧文件
+curl -X POST http://localhost:8084/api/v0/data_pipeline/files/cleanup \
+  -H "Content-Type: application/json" \
+  -d '{"days_to_keep": 7}'
+```
+
+## 扩展功能
+
+### 自定义告警
+系统支持异常检测和告警,可以通过修改 `TaskAnomalyDetector` 类添加自定义告警逻辑。
+
+### 性能监控
+系统自动收集性能指标,支持查看历史数据和趋势分析。
+
+### 文件管理
+支持文件完整性验证、压缩包创建、批量下载等功能。
+
+## 完整 API 接口说明
+
+### 1. 任务管理接口
+
+#### 创建任务
+```bash
+POST /api/v0/data_pipeline/tasks
+Content-Type: application/json
+
+{
+  "table_list_file": "tables.txt",
+  "business_context": "业务描述",
+  "db_name": "highway_db",
+  "enable_sql_validation": true,
+  "enable_llm_repair": true,
+  "modify_original_file": true,
+  "enable_training_data_load": true
+}
+```
+
+**参数说明:**
+- `table_list_file` (必填): 表清单文件路径
+- `business_context` (必填): 业务上下文描述
+- `db_name` (可选): 指定业务数据库名称,如不提供则使用app_config中的默认配置
+- 其他参数为可选的功能开关
+
+**注意:** 数据库连接信息自动从 `app_config.py` 的 `APP_DB_CONFIG` 获取,无需在API请求中提供
+
+**预期返回:**
+```json
+{
+  "success": true,
+  "code": 201,
+  "message": "任务创建成功",
+  "data": {
+    "task_id": "task_20250701_143052",
+    "status": "pending",
+    "created_at": "2025-07-01T14:30:52"
+  }
+}
+```
+
+#### 执行任务
+```bash
+POST /api/v0/data_pipeline/tasks/{task_id}/execute
+Content-Type: application/json
+
+# 完整工作流
+{"execution_mode": "complete"}
+
+# 单步执行
+{"execution_mode": "step", "step_name": "ddl_generation"}
+```
+
+**预期返回:**
+```json
+{
+  "success": true,
+  "code": 202,
+  "message": "任务执行已启动",
+  "data": {
+    "task_id": "task_20250701_143052",
+    "execution_mode": "complete",
+    "status": "running"
+  }
+}
+```
+
+#### 查询任务状态
+```bash
+GET /api/v0/data_pipeline/tasks/{task_id}
+```
+
+**预期返回:**
+```json
+{
+  "success": true,
+  "data": {
+    "task_id": "task_20250701_143052",
+    "status": "in_progress",
+    "step_status": {
+      "ddl_generation": "completed",
+      "qa_generation": "running",
+      "sql_validation": "pending",
+      "training_load": "pending"
+    },
+    "created_at": "2025-07-01T14:30:52",
+    "started_at": "2025-07-01T14:30:53"
+  }
+}
+```
+
+#### 获取任务列表
+```bash
+GET /api/v0/data_pipeline/tasks?limit=10&status=completed
+```
+
+**预期返回:**
+```json
+{
+  "success": true,
+  "data": {
+    "tasks": [
+      {
+        "task_id": "task_20250701_143052",
+        "status": "completed",
+        "created_at": "2025-07-01T14:30:52"
+      }
+    ]
+  }
+}
+```
+
+### 2. 日志接口
+
+#### 获取任务日志
+```bash
+GET /api/v0/data_pipeline/tasks/{task_id}/logs?limit=50&level=ERROR
+```
+
+**预期返回:**
+```json
+{
+  "success": true,
+  "data": {
+    "logs": [
+      {
+        "id": 123,
+        "timestamp": "2025-07-01T14:30:54",
+        "level": "INFO",
+        "message": "开始执行步骤: ddl_generation",
+        "step_name": "ddl_generation"
+      }
+    ]
+  }
+}
+```
+
+### 3. 文件管理接口
+
+#### 获取文件列表
+```bash
+GET /api/v0/data_pipeline/tasks/{task_id}/files
+```
+
+**预期返回:**
+```json
+{
+  "success": true,
+  "data": {
+    "files": [
+      {
+        "file_name": "data_pipeline.log",
+        "file_type": "log",
+        "file_size": 1024,
+        "download_url": "/api/v0/data_pipeline/tasks/{task_id}/files/download/data_pipeline.log"
+      },
+      {
+        "file_name": "qs_highway_db_20250701_143052_pair.json",
+        "file_type": "json",
+        "file_size": 10240,
+        "download_url": "/api/v0/data_pipeline/tasks/{task_id}/files/download/qs_highway_db_20250701_143052_pair.json"
+      }
+    ]
+  }
+}
+```
+
+#### 下载文件
+```bash
+GET /api/v0/data_pipeline/tasks/{task_id}/files/download/{filename}
+```
+
+**预期返回:** 文件二进制内容,Content-Type 根据文件类型设置
+
+### 4. 执行记录接口
+
+#### 获取任务执行记录
+```bash
+GET /api/v0/data_pipeline/tasks/{task_id}/executions
+```
+
+**预期返回:**
+```json
+{
+  "success": true,
+  "data": {
+    "executions": [
+      {
+        "execution_id": "task_20250701_143052_step_ddl_generation_exec_20250701143053",
+        "execution_step": "ddl_generation",
+        "status": "completed",
+        "started_at": "2025-07-01T14:30:53",
+        "completed_at": "2025-07-01T14:35:20"
+      }
+    ]
+  }
+}
+```
+
+### 5. 错误响应格式
+
+所有接口在出错时都返回统一的错误格式:
+
+```json
+{
+  "success": false,
+  "code": 400,
+  "message": "错误描述",
+  "error_type": "validation_error",
+  "details": {}
+}
+```
+
+**常见错误码:**
+- `400` - 请求参数错误
+- `404` - 任务不存在
+- `409` - 任务冲突(已有任务在执行)
+- `500` - 服务器内部错误
+- `503` - 服务暂时不可用
+
+## 技术支持
+
+如有问题,请检查:
+1. 系统日志和错误信息
+2. 数据库连接状态
+3. 文件系统权限
+4. 系统资源使用情况
+5. 任务目录详细日志文件 `./data_pipeline/training_data/{task_id}/data_pipeline.log`
+
+通过监控API可以获取详细的系统状态和错误信息,有助于快速定位和解决问题。

+ 106 - 0
test_api_changes.py

@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""
+测试Data Pipeline API的修改
+验证去除db_connection必填参数后的功能
+"""
+
+import requests
+import json
+
+def test_create_task():
+    """测试创建任务(不需要db_connection参数)"""
+    url = "http://localhost:8084/api/v0/data_pipeline/tasks"
+    
+    # 新的请求格式 - 不需要db_connection
+    data = {
+        "table_list_file": "data_pipeline/tables.txt",
+        "business_context": "高速公路服务区管理系统测试",
+        "db_name": "highway_db",  # 可选参数
+        "enable_sql_validation": True,
+        "enable_llm_repair": True,
+        "modify_original_file": True,
+        "enable_training_data_load": True
+    }
+    
+    print("测试创建任务(使用app_config配置的数据库连接)...")
+    print(f"请求数据: {json.dumps(data, ensure_ascii=False, indent=2)}")
+    
+    try:
+        response = requests.post(url, json=data, timeout=10)
+        print(f"响应状态码: {response.status_code}")
+        print(f"响应内容: {json.dumps(response.json(), ensure_ascii=False, indent=2)}")
+        
+        if response.status_code == 201:
+            return response.json().get('data', {}).get('task_id')
+        else:
+            print("任务创建失败")
+            return None
+            
+    except Exception as e:
+        print(f"请求失败: {e}")
+        return None
+
+def test_old_format():
+    """测试旧格式是否还能工作(应该报错)"""
+    url = "http://localhost:8084/api/v0/data_pipeline/tasks"
+    
+    # 旧的请求格式 - 包含db_connection
+    data = {
+        "db_connection": "postgresql://user:pass@host:5432/dbname",
+        "table_list_file": "data_pipeline/tables.txt",
+        "business_context": "测试旧格式"
+    }
+    
+    print("\n测试旧格式(包含db_connection,应该被忽略)...")
+    print(f"请求数据: {json.dumps(data, ensure_ascii=False, indent=2)}")
+    
+    try:
+        response = requests.post(url, json=data, timeout=10)
+        print(f"响应状态码: {response.status_code}")
+        print(f"响应内容: {json.dumps(response.json(), ensure_ascii=False, indent=2)}")
+        
+    except Exception as e:
+        print(f"请求失败: {e}")
+
+def test_missing_params():
+    """测试缺少必需参数的情况"""
+    url = "http://localhost:8084/api/v0/data_pipeline/tasks"
+    
+    # 缺少必需参数
+    data = {
+        "business_context": "只有业务上下文"
+    }
+    
+    print("\n测试缺少必需参数(应该返回400错误)...")
+    print(f"请求数据: {json.dumps(data, ensure_ascii=False, indent=2)}")
+    
+    try:
+        response = requests.post(url, json=data, timeout=10)
+        print(f"响应状态码: {response.status_code}")
+        print(f"响应内容: {json.dumps(response.json(), ensure_ascii=False, indent=2)}")
+        
+    except Exception as e:
+        print(f"请求失败: {e}")
+
+if __name__ == "__main__":
+    print("=" * 60)
+    print("Data Pipeline API 修改测试")
+    print("=" * 60)
+    
+    # 测试新格式
+    task_id = test_create_task()
+    
+    # 测试旧格式
+    test_old_format()
+    
+    # 测试缺少参数
+    test_missing_params()
+    
+    print("\n" + "=" * 60)
+    print("测试完成")
+    print("=" * 60)
+    
+    if task_id:
+        print(f"成功创建的任务ID: {task_id}")
+        print(f"可以通过以下命令查看任务状态:")
+        print(f"curl http://localhost:8084/api/v0/data_pipeline/tasks/{task_id}")