
Tested the data_pipeline logging module; this is the enhanced version of the data_pipeline API.

wangxq 1 week ago
parent
commit
31f1504378
60 files changed with 2570 additions and 1056 deletions
  1. +255 -59    citu_app.py
  2. +116 -91    data_pipeline/api/simple_db_manager.py
  3. +335 -3     data_pipeline/api/simple_file_manager.py
  4. +97 -72     data_pipeline/api/simple_workflow.py
  5. +13 -0      data_pipeline/config.py
  6. +4 -2       data_pipeline/dp_logging/manager.py
  7. +3 -1       data_pipeline/schema_workflow.py
  8. +52 -166    data_pipeline/sql/init_tables.sql
  9. +0 -15      data_pipeline/training_data/bss_company.ddl
  10. +0 -15     data_pipeline/training_data/bss_company_detail.md
  11. +0 -16     data_pipeline/training_data/bss_section_route.ddl
  12. +0 -7      data_pipeline/training_data/bss_section_route_area_link.ddl
  13. +0 -7      data_pipeline/training_data/bss_section_route_area_link_detail.md
  14. +0 -16     data_pipeline/training_data/bss_section_route_detail.md
  15. +0 -19     data_pipeline/training_data/bss_service_area.ddl
  16. +0 -21     data_pipeline/training_data/bss_service_area_detail.md
  17. +0 -18     data_pipeline/training_data/bss_service_area_mapper.ddl
  18. +0 -19     data_pipeline/training_data/bss_service_area_mapper_detail.md
  19. +0 -13     data_pipeline/training_data/db_query_decision_prompt.txt
  20. +0 -10     data_pipeline/training_data/filename_mapping.txt
  21. +0 -62     data_pipeline/training_data/metadata.txt
  22. +0 -202    data_pipeline/training_data/qs_highway_db_20250627_101745_pair.json
  23. +0 -202    data_pipeline/training_data/qs_highway_db_20250627_101745_pair.json.backup
  24. +31 -0     data_pipeline/training_data/task_20250701_212426/bss_business_day_data.ddl
  25. +32 -0     data_pipeline/training_data/task_20250701_212426/bss_business_day_data_detail.md
  26. +17 -0     data_pipeline/training_data/task_20250701_212426/bss_car_day_count.ddl
  27. +18 -0     data_pipeline/training_data/task_20250701_212426/bss_car_day_count_detail.md
  28. +5 -0      data_pipeline/training_data/task_20250701_212426/filename_mapping.txt
  29. +96 -0     data_pipeline/training_data/task_20250701_212426/qs_intermediate_20250701_212921.json
  30. +14 -0     data_pipeline/training_data/task_20250701_212426/task_config.json
  31. +3 -3      data_pipeline/training_data/task_20250701_213434/bss_business_day_data.ddl
  32. +31 -0     data_pipeline/training_data/task_20250701_213434/bss_business_day_data_1.ddl
  33. +3 -3      data_pipeline/training_data/task_20250701_213434/bss_business_day_data_detail.md
  34. +32 -0     data_pipeline/training_data/task_20250701_213434/bss_business_day_data_detail_1.md
  35. +17 -0     data_pipeline/training_data/task_20250701_213434/bss_car_day_count.ddl
  36. +5 -5      data_pipeline/training_data/task_20250701_213434/bss_car_day_count_1.ddl
  37. +18 -0     data_pipeline/training_data/task_20250701_213434/bss_car_day_count_detail.md
  38. +18 -0     data_pipeline/training_data/task_20250701_213434/bss_car_day_count_detail_1.md
  39. +11 -0     data_pipeline/training_data/task_20250701_213434/db_query_decision_prompt.txt
  40. +7 -0      data_pipeline/training_data/task_20250701_213434/ddl_generation_result.json
  41. +5 -0      data_pipeline/training_data/task_20250701_213434/filename_mapping.txt
  42. +62 -0     data_pipeline/training_data/task_20250701_213434/metadata.txt
  43. +3 -3      data_pipeline/training_data/task_20250701_213434/metadata_detail.md
  44. +202 -0    data_pipeline/training_data/task_20250701_213434/qs_highway_db_20250701_214431_pair.json
  45. +202 -0    data_pipeline/training_data/task_20250701_213434/qs_highway_db_20250701_214431_pair.json.backup
  46. +14 -0     data_pipeline/training_data/task_20250701_213434/task_config.json
  47. +117 -0    data_pipeline/training_data/task_20250701_213434/task_result.json
  48. +31 -0     data_pipeline/training_data/task_20250701_231850/bss_business_day_data.ddl
  49. +32 -0     data_pipeline/training_data/task_20250701_231850/bss_business_day_data_detail.md
  50. +17 -0     data_pipeline/training_data/task_20250701_231850/bss_car_day_count.ddl
  51. +6 -6      data_pipeline/training_data/task_20250701_231850/bss_car_day_count_detail.md
  52. +10 -0     data_pipeline/training_data/task_20250701_231850/db_query_decision_prompt.txt
  53. +5 -0      data_pipeline/training_data/task_20250701_231850/filename_mapping.txt
  54. +62 -0     data_pipeline/training_data/task_20250701_231850/metadata.txt
  55. +20 -0     data_pipeline/training_data/task_20250701_231850/metadata_detail.md
  56. +202 -0    data_pipeline/training_data/task_20250701_231850/qs_highway_db_20250701_234811_pair.json
  57. +202 -0    data_pipeline/training_data/task_20250701_231850/qs_highway_db_20250701_234811_pair.json.backup
  58. +13 -0     data_pipeline/training_data/task_20250701_231850/table_list.txt
  59. +15 -0     data_pipeline/training_data/task_20250701_231850/task_config.json
  60. +117 -0    data_pipeline/training_data/task_20250701_231850/task_result.json

+ 255 - 59
citu_app.py

@@ -2792,15 +2792,8 @@ def create_data_pipeline_task():
     try:
         req = request.get_json(force=True)
         
-        # 验证必需参数 - 移除db_connection,改为使用app_config配置
-        required_params = ['table_list_file', 'business_context']
-        missing_params = [param for param in required_params if not req.get(param)]
-        
-        if missing_params:
-            return jsonify(bad_request_response(
-                response_text=f"缺少必需参数: {', '.join(missing_params)}",
-                missing_params=missing_params
-            )), 400
+        # table_list_file和business_context现在都是可选参数
+        # 如果未提供table_list_file,将使用文件上传模式
         
         # 创建任务(自动使用app_config中的数据库配置)
         manager = get_data_pipeline_manager()
@@ -2823,8 +2816,17 @@ def create_data_pipeline_task():
             "created_at": task_info.get('created_at').isoformat() if task_info.get('created_at') else None
         }
         
+        # 检查是否为文件上传模式
+        file_upload_mode = not req.get('table_list_file')
+        response_message = "任务创建成功"
+        
+        if file_upload_mode:
+            response_data["file_upload_mode"] = True
+            response_data["next_step"] = f"POST /api/v0/data_pipeline/tasks/{task_id}/upload-table-list"
+            response_message += ",请上传表清单文件后再执行任务"
+        
         return jsonify(success_response(
-            response_text="任务创建成功",
+            response_text=response_message,
             data=response_data
         )), 201
         
@@ -2967,27 +2969,43 @@ def get_data_pipeline_task_status(task_id):
                 response_text=f"任务不存在: {task_id}"
             )), 404
         
-        # 获取执行记录
-        executions = manager.get_task_executions(task_id)
-        current_execution = executions[0] if executions else None
+        # 获取步骤状态
+        steps = manager.get_task_steps(task_id)
+        current_step = None
+        for step in steps:
+            if step['step_status'] == 'running':
+                current_step = step
+                break
+        
+        # 构建步骤状态摘要
+        step_status_summary = {}
+        for step in steps:
+            step_status_summary[step['step_name']] = step['step_status']
         
         response_data = {
-            "task_id": task_info['id'],
+            "task_id": task_info['task_id'],
             "status": task_info['status'],
-            "step_status": task_info.get('step_status', {}),
+            "step_status": step_status_summary,
             "created_at": task_info['created_at'].isoformat() if task_info.get('created_at') else None,
             "started_at": task_info['started_at'].isoformat() if task_info.get('started_at') else None,
             "completed_at": task_info['completed_at'].isoformat() if task_info.get('completed_at') else None,
             "parameters": task_info.get('parameters', {}),
             "result": task_info.get('result'),
             "error_message": task_info.get('error_message'),
-            "current_execution": {
-                "execution_id": current_execution['execution_id'],
-                "step": current_execution['execution_step'],
-                "status": current_execution['status'],
-                "started_at": current_execution['started_at'].isoformat() if current_execution.get('started_at') else None
-            } if current_execution else None,
-            "total_executions": len(executions)
+            "current_step": {
+                "execution_id": current_step['execution_id'],
+                "step": current_step['step_name'],
+                "status": current_step['step_status'],
+                "started_at": current_step['started_at'].isoformat() if current_step and current_step.get('started_at') else None
+            } if current_step else None,
+            "total_steps": len(steps),
+            "steps": [{
+                "step_name": step['step_name'],
+                "step_status": step['step_status'],
+                "started_at": step['started_at'].isoformat() if step.get('started_at') else None,
+                "completed_at": step['completed_at'].isoformat() if step.get('completed_at') else None,
+                "error_message": step.get('error_message')
+            } for step in steps]
         }
         
         return jsonify(success_response(
@@ -3004,10 +3022,10 @@ def get_data_pipeline_task_status(task_id):
 @app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/logs', methods=['GET'])
 def get_data_pipeline_task_logs(task_id):
     """
-    获取数据管道任务日志
+    获取数据管道任务日志(从任务目录文件读取)
     
     查询参数:
-    - limit: 日志数限制,默认100
+    - limit: 日志数限制,默认100
     - level: 日志级别过滤,可选
     
     响应:
@@ -3019,14 +3037,13 @@ def get_data_pipeline_task_logs(task_id):
             "task_id": "task_20250627_143052",
             "logs": [
                 {
-                    "timestamp": "2025-06-27T14:30:52",
+                    "timestamp": "2025-06-27 14:30:52",
                     "level": "INFO",
-                    "message": "任务开始执行",
-                    "step_name": "ddl_generation",
-                    "execution_id": "task_20250627_143052_step_ddl_generation_exec_20250627_143100"
+                    "message": "任务开始执行"
                 }
             ],
-            "total": 15
+            "total": 15,
+            "source": "file"
         }
     }
     """
@@ -3046,31 +3063,62 @@ def get_data_pipeline_task_logs(task_id):
                 response_text=f"任务不存在: {task_id}"
             )), 404
         
-        # 获取日志
-        logs = manager.get_task_logs(task_id, limit=limit)
-        
-        # 过滤日志级别
-        if level:
-            logs = [log for log in logs if log.get('log_level') == level.upper()]
-        
-        # 格式化日志
-        formatted_logs = []
-        for log in logs:
-            formatted_logs.append({
-                "timestamp": log['timestamp'].isoformat() if log.get('timestamp') else None,
-                "level": log.get('log_level'),
-                "message": log.get('message'),
-                "step_name": log.get('step_name'),
-                "execution_id": log.get('execution_id'),
-                "module_name": log.get('module_name'),
-                "function_name": log.get('function_name'),
-                "extra_data": log.get('extra_data')
-            })
+        # 获取任务目录下的日志文件
+        import os
+        from pathlib import Path
+        
+        # 获取项目根目录的绝对路径
+        project_root = Path(__file__).parent.absolute()
+        task_dir = project_root / "data_pipeline" / "training_data" / task_id
+        log_file = task_dir / "data_pipeline.log"
+        
+        logs = []
+        if log_file.exists():
+            try:
+                # 读取日志文件的最后N行
+                with open(log_file, 'r', encoding='utf-8') as f:
+                    lines = f.readlines()
+                    
+                # 取最后limit行
+                recent_lines = lines[-limit:] if len(lines) > limit else lines
+                
+                # 解析日志行
+                import re
+                log_pattern = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+?): (.+)$'
+                
+                for line in recent_lines:
+                    line = line.strip()
+                    if not line:
+                        continue
+                        
+                    match = re.match(log_pattern, line)
+                    if match:
+                        timestamp, log_level, logger_name, message = match.groups()
+                        
+                        # 级别过滤
+                        if level and log_level != level.upper():
+                            continue
+                            
+                        logs.append({
+                            "timestamp": timestamp,
+                            "level": log_level,
+                            "logger": logger_name,
+                            "message": message
+                        })
+                    else:
+                        # 处理多行日志(如异常堆栈)
+                        if logs:
+                            logs[-1]["message"] += f"\n{line}"
+                        
+            except Exception as e:
+                logger.error(f"读取日志文件失败: {e}")
         
         response_data = {
             "task_id": task_id,
-            "logs": formatted_logs,
-            "total": len(formatted_logs)
+            "logs": logs,
+            "total": len(logs),
+            "source": "file",
+            "log_file": str(log_file) if log_file.exists() else None
         }
         
         return jsonify(success_response(
@@ -3194,28 +3242,50 @@ def get_data_pipeline_task_files(task_id):
 def download_data_pipeline_task_file(task_id, file_name):
     """下载任务文件"""
     try:
-        file_manager = get_data_pipeline_file_manager()
+        logger.info(f"开始下载文件: task_id={task_id}, file_name={file_name}")
+        
+        # 直接构建文件路径,避免依赖数据库
+        from pathlib import Path
+        import os
+        
+        # 获取项目根目录的绝对路径
+        project_root = Path(__file__).parent.absolute()
+        task_dir = project_root / "data_pipeline" / "training_data" / task_id
+        file_path = task_dir / file_name
+        
+        logger.info(f"文件路径: {file_path}")
         
-        # 验证文件存在且安全
-        if not file_manager.file_exists(task_id, file_name):
+        # 检查文件是否存在
+        if not file_path.exists():
+            logger.warning(f"文件不存在: {file_path}")
             return jsonify(not_found_response(
                 response_text=f"文件不存在: {file_name}"
             )), 404
         
-        if not file_manager.is_file_safe(task_id, file_name):
+        # 检查是否为文件(而不是目录)
+        if not file_path.is_file():
+            logger.warning(f"路径不是文件: {file_path}")
             return jsonify(bad_request_response(
-                response_text="非法的文件路径"
+                response_text=f"路径不是有效文件: {file_name}"
             )), 400
         
-        # 获取文件路径
-        file_path = file_manager.get_file_path(task_id, file_name)
+        # 安全检查:确保文件在允许的目录内
+        try:
+            file_path.resolve().relative_to(task_dir.resolve())
+        except ValueError:
+            logger.warning(f"文件路径不安全: {file_path}")
+            return jsonify(bad_request_response(
+                response_text="非法的文件路径"
+            )), 400
         
         # 检查文件是否可读
         if not os.access(file_path, os.R_OK):
+            logger.warning(f"文件不可读: {file_path}")
             return jsonify(bad_request_response(
                 response_text="文件不可读"
             )), 400
         
+        logger.info(f"开始发送文件: {file_path}")
         return send_file(
             file_path,
             as_attachment=True,
@@ -3223,10 +3293,136 @@ def download_data_pipeline_task_file(task_id, file_name):
         )
         
     except Exception as e:
-        logger.error(f"下载任务文件失败: {str(e)}")
+        logger.error(f"下载任务文件失败: task_id={task_id}, file_name={file_name}, 错误: {str(e)}", exc_info=True)
         return jsonify(internal_error_response(
             response_text="下载文件失败,请稍后重试"
         )), 500
 
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/upload-table-list', methods=['POST'])
+def upload_table_list_file(task_id):
+    """
+    上传表清单文件
+    
+    表单参数:
+    - file: 要上传的表清单文件(multipart/form-data)
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "表清单文件上传成功",
+        "data": {
+            "task_id": "task_20250701_123456",
+            "filename": "table_list.txt",
+            "file_size": 1024,
+            "file_size_formatted": "1.0 KB"
+        }
+    }
+    """
+    try:
+        # 验证任务是否存在
+        manager = get_data_pipeline_manager()
+        task_info = manager.get_task_status(task_id)
+        if not task_info:
+            return jsonify(not_found_response(
+                response_text=f"任务不存在: {task_id}"
+            )), 404
+        
+        # 检查是否有文件上传
+        if 'file' not in request.files:
+            return jsonify(bad_request_response(
+                response_text="请选择要上传的表清单文件",
+                missing_params=['file']
+            )), 400
+        
+        file = request.files['file']
+        
+        # 验证文件名
+        if file.filename == '':
+            return jsonify(bad_request_response(
+                response_text="请选择有效的文件"
+            )), 400
+        
+        try:
+            # 使用文件管理器上传文件
+            file_manager = get_data_pipeline_file_manager()
+            result = file_manager.upload_table_list_file(task_id, file)
+            
+            response_data = {
+                "task_id": task_id,
+                "filename": result["filename"],
+                "file_size": result["file_size"],
+                "file_size_formatted": result["file_size_formatted"],
+                "upload_time": result["upload_time"].isoformat() if result.get("upload_time") else None
+            }
+            
+            return jsonify(success_response(
+                response_text="表清单文件上传成功",
+                data=response_data
+            )), 200
+            
+        except ValueError as e:
+            # 文件验证错误(如文件太大、空文件等)
+            return jsonify(bad_request_response(
+                response_text=str(e)
+            )), 400
+        except Exception as e:
+            logger.error(f"上传表清单文件失败: {str(e)}")
+            return jsonify(internal_error_response(
+                response_text="文件上传失败,请稍后重试"
+            )), 500
+        
+    except Exception as e:
+        logger.error(f"处理表清单文件上传请求失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="处理上传请求失败,请稍后重试"
+        )), 500
+
+@app.flask_app.route('/api/v0/data_pipeline/tasks/<task_id>/table-list-info', methods=['GET'])
+def get_table_list_info(task_id):
+    """
+    获取任务的表清单文件信息
+    
+    响应:
+    {
+        "success": true,
+        "code": 200,
+        "message": "获取表清单文件信息成功",
+        "data": {
+            "task_id": "task_20250701_123456",
+            "has_file": true,
+            "filename": "table_list.txt",
+            "file_path": "./data_pipeline/training_data/task_20250701_123456/table_list.txt",
+            "file_size": 1024,
+            "file_size_formatted": "1.0 KB",
+            "uploaded_at": "2025-07-01T12:34:56",
+            "table_count": 5,
+            "is_readable": true
+        }
+    }
+    """
+    try:
+        file_manager = get_data_pipeline_file_manager()
+        
+        # 获取表清单文件信息
+        table_list_info = file_manager.get_table_list_file_info(task_id)
+        
+        response_data = {
+            "task_id": task_id,
+            "has_file": table_list_info.get("exists", False),
+            **table_list_info
+        }
+        
+        return jsonify(success_response(
+            response_text="获取表清单文件信息成功",
+            data=response_data
+        ))
+        
+    except Exception as e:
+        logger.error(f"获取表清单文件信息失败: {str(e)}")
+        return jsonify(internal_error_response(
+            response_text="获取表清单文件信息失败,请稍后重试"
+        )), 500
+
 logger.info("正在启动Flask应用: http://localhost:8084")
 app.run(host="0.0.0.0", port=8084, debug=True)

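The two endpoints added above are easiest to follow from the client side. Below is a minimal sketch, assuming the server is running on port 8084 as configured above and that a task in file-upload mode already exists; the task_id is the illustrative one from the endpoint docstrings, not a real task.

import requests

BASE = "http://localhost:8084/api/v0/data_pipeline"
task_id = "task_20250701_123456"  # illustrative id taken from the docstrings

# 1. Upload the table list as multipart/form-data under the "file" form field
with open("table_list.txt", "rb") as f:
    resp = requests.post(
        f"{BASE}/tasks/{task_id}/upload-table-list",
        files={"file": ("table_list.txt", f, "text/plain")},
    )
resp.raise_for_status()
print(resp.json()["data"]["file_size_formatted"])

# 2. Confirm the upload via the new table-list-info endpoint
info = requests.get(f"{BASE}/tasks/{task_id}/table-list-info").json()
print(info["data"]["has_file"], info["data"].get("table_count"))

After the upload, the task can be executed and its data_pipeline.log followed through the rewritten /logs endpoint, which now reads the log file in the task directory instead of the database.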
+ 116 - 91
data_pipeline/api/simple_db_manager.py

@@ -54,8 +54,8 @@ class SimpleTaskManager:
         return f"task_{timestamp}"
     
     def create_task(self, 
-                   table_list_file: str,
-                   business_context: str,
+                   table_list_file: str = None,
+                   business_context: str = None,
                    db_name: str = None,
                    **kwargs) -> str:
         """创建新任务"""
@@ -71,21 +71,33 @@ class SimpleTaskManager:
         if not db_name:
             db_name = APP_DB_CONFIG.get('dbname', 'business_db')
         
+        # 处理table_list_file参数
+        # 如果未提供,将在执行时检查任务目录中的table_list.txt文件
+        task_table_list_file = table_list_file
+        if not task_table_list_file:
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+            upload_config = SCHEMA_TOOLS_CONFIG.get("file_upload", {})
+            target_filename = upload_config.get("target_filename", "table_list.txt")
+            # 使用相对于任务目录的路径
+            task_table_list_file = f"{{task_directory}}/{target_filename}"
+        
         # 构建参数
         parameters = {
             "db_connection": business_db_connection,  # 业务数据库连接(用于schema_workflow执行)
-            "table_list_file": table_list_file,
-            "business_context": business_context,
+            "table_list_file": task_table_list_file,
+            "business_context": business_context or "数据库管理系统",
+            "file_upload_mode": table_list_file is None,  # 标记是否使用文件上传模式
             **kwargs
         }
         
         try:
             conn = self._get_connection()
             with conn.cursor() as cursor:
+                # 创建任务记录
                 cursor.execute("""
                     INSERT INTO data_pipeline_tasks (
-                        id, task_type, status, parameters, created_by, 
-                        db_name, business_context, output_directory
+                        task_id, task_type, status, parameters, created_type, 
+                        by_user, db_name, output_directory
                     ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                 """, (
                     task_id, 
@@ -93,11 +105,32 @@ class SimpleTaskManager:
                     'pending', 
                     Json(parameters),
                     'api',
+                    'guest',
                     db_name,
-                    business_context,
-                    f"./data_pipeline/training_data/{task_id}"
+                    f"data_pipeline/training_data/{task_id}"
                 ))
                 
+                # 预创建所有步骤记录(策略A)
+                step_names = ['ddl_generation', 'qa_generation', 'sql_validation', 'training_load']
+                for step_name in step_names:
+                    cursor.execute("""
+                        INSERT INTO data_pipeline_task_steps (
+                            task_id, step_name, step_status
+                        ) VALUES (%s, %s, %s)
+                    """, (task_id, step_name, 'pending'))
+            
+            # 创建任务目录
+            try:
+                from data_pipeline.api.simple_file_manager import SimpleFileManager
+                file_manager = SimpleFileManager()
+                success = file_manager.create_task_directory(task_id)
+                if success:
+                    self.logger.info(f"任务目录创建成功: {task_id}")
+                else:
+                    self.logger.warning(f"任务目录创建失败,但任务记录已保存: {task_id}")
+            except Exception as dir_error:
+                self.logger.warning(f"创建任务目录时出错: {dir_error},但任务记录已保存: {task_id}")
+                
             self.logger.info(f"任务创建成功: {task_id}")
             return task_id
             
@@ -110,7 +143,7 @@ class SimpleTaskManager:
         try:
             conn = self._get_connection()
             with conn.cursor(cursor_factory=RealDictCursor) as cursor:
-                cursor.execute("SELECT * FROM data_pipeline_tasks WHERE id = %s", (task_id,))
+                cursor.execute("SELECT * FROM data_pipeline_tasks WHERE task_id = %s", (task_id,))
                 result = cursor.fetchone()
                 return dict(result) if result else None
         except Exception as e:
@@ -140,7 +173,7 @@ class SimpleTaskManager:
                 cursor.execute(f"""
                     UPDATE data_pipeline_tasks 
                     SET {', '.join(update_fields)}
-                    WHERE id = %s
+                    WHERE task_id = %s
                 """, values)
                 
                 self.logger.info(f"任务状态更新: {task_id} -> {status}")
@@ -148,130 +181,122 @@ class SimpleTaskManager:
             self.logger.error(f"任务状态更新失败: {e}")
             raise
     
-    def update_step_status(self, task_id: str, step_name: str, step_status: str):
+    def update_step_status(self, task_id: str, step_name: str, step_status: str, error_message: Optional[str] = None):
         """更新步骤状态"""
         try:
             conn = self._get_connection()
             with conn.cursor() as cursor:
-                cursor.execute("""
-                    UPDATE data_pipeline_tasks 
-                    SET step_status = jsonb_set(step_status, %s, %s)
-                    WHERE id = %s
-                """, ([step_name], json.dumps(step_status), task_id))
+                update_fields = ["step_status = %s"]
+                values = [step_status]
+                
+                # 如果状态是running,记录开始时间
+                if step_status == 'running':
+                    update_fields.append("started_at = CURRENT_TIMESTAMP")
+                
+                # 如果状态是completed或failed,记录完成时间
+                if step_status in ['completed', 'failed']:
+                    update_fields.append("completed_at = CURRENT_TIMESTAMP")
+                
+                # 如果有错误信息,记录错误信息
+                if error_message:
+                    update_fields.append("error_message = %s")
+                    values.append(error_message)
+                
+                values.extend([task_id, step_name])
+                
+                cursor.execute(f"""
+                    UPDATE data_pipeline_task_steps 
+                    SET {', '.join(update_fields)}
+                    WHERE task_id = %s AND step_name = %s
+                """, values)
                 
                 self.logger.debug(f"步骤状态更新: {task_id}.{step_name} -> {step_status}")
         except Exception as e:
             self.logger.error(f"步骤状态更新失败: {e}")
             raise
     
-    def create_execution(self, task_id: str, execution_step: str) -> str:
-        """创建执行记录"""
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        execution_id = f"{task_id}_step_{execution_step}_exec_{timestamp}"
-        
+    def update_step_execution_id(self, task_id: str, step_name: str, execution_id: str):
+        """更新步骤的execution_id"""
         try:
             conn = self._get_connection()
             with conn.cursor() as cursor:
                 cursor.execute("""
-                    INSERT INTO data_pipeline_task_executions (
-                        task_id, execution_step, status, execution_id
-                    ) VALUES (%s, %s, %s, %s)
-                """, (task_id, execution_step, 'running', execution_id))
+                    UPDATE data_pipeline_task_steps 
+                    SET execution_id = %s
+                    WHERE task_id = %s AND step_name = %s
+                """, (execution_id, task_id, step_name))
                 
-                self.logger.info(f"执行记录创建: {execution_id}")
-                return execution_id
+                self.logger.debug(f"步骤execution_id更新: {task_id}.{step_name} -> {execution_id}")
         except Exception as e:
-            self.logger.error(f"执行记录创建失败: {e}")
+            self.logger.error(f"步骤execution_id更新失败: {e}")
             raise
     
-    def complete_execution(self, execution_id: str, status: str, error_message: Optional[str] = None):
-        """完成执行记录"""
+    def start_step(self, task_id: str, step_name: str) -> str:
+        """开始执行步骤"""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        execution_id = f"{task_id}_step_{step_name}_exec_{timestamp}"
+        
         try:
-            conn = self._get_connection()
-            with conn.cursor() as cursor:
-                # 计算执行时长
-                cursor.execute("""
-                    SELECT started_at FROM data_pipeline_task_executions 
-                    WHERE execution_id = %s
-                """, (execution_id,))
-                result = cursor.fetchone()
-                
-                duration_seconds = None
-                if result and result[0]:
-                    duration_seconds = int((datetime.now() - result[0]).total_seconds())
-                
-                # 更新执行记录
-                update_fields = ["status = %s", "completed_at = CURRENT_TIMESTAMP"]
-                values = [status]
-                
-                if duration_seconds is not None:
-                    update_fields.append("duration_seconds = %s")
-                    values.append(duration_seconds)
-                
-                if error_message:
-                    update_fields.append("error_message = %s")
-                    values.append(error_message)
-                
-                values.append(execution_id)
-                
-                cursor.execute(f"""
-                    UPDATE data_pipeline_task_executions 
-                    SET {', '.join(update_fields)}
-                    WHERE execution_id = %s
-                """, values)
+            # 更新步骤状态为running并设置execution_id
+            self.update_step_status(task_id, step_name, 'running')
+            self.update_step_execution_id(task_id, step_name, execution_id)
                 
-                self.logger.info(f"执行记录完成: {execution_id} -> {status}")
+            self.logger.info(f"步骤开始执行: {task_id}.{step_name} -> {execution_id}")
+            return execution_id
         except Exception as e:
-            self.logger.error(f"执行记录完成失败: {e}")
+            self.logger.error(f"步骤开始执行失败: {e}")
             raise
     
-    def record_log(self, task_id: str, log_level: str, message: str, 
-                   execution_id: Optional[str] = None, step_name: Optional[str] = None):
-        """记录日志到数据库"""
+    def complete_step(self, task_id: str, step_name: str, status: str, error_message: Optional[str] = None):
+        """完成步骤执行"""
         try:
-            conn = self._get_connection()
-            with conn.cursor() as cursor:
-                cursor.execute("""
-                    INSERT INTO data_pipeline_task_logs (
-                        task_id, execution_id, log_level, message, step_name
-                    ) VALUES (%s, %s, %s, %s, %s)
-                """, (task_id, execution_id, log_level, message, step_name))
+            self.update_step_status(task_id, step_name, status, error_message)
+            self.logger.info(f"步骤执行完成: {task_id}.{step_name} -> {status}")
         except Exception as e:
-            self.logger.error(f"日志记录失败: {e}")
+            self.logger.error(f"步骤执行完成失败: {e}")
+            raise
     
-    def get_task_logs(self, task_id: str, limit: int = 100) -> List[Dict[str, Any]]:
-        """获取任务日志"""
+    def get_task_steps(self, task_id: str) -> List[Dict[str, Any]]:
+        """获取任务的所有步骤状态"""
         try:
             conn = self._get_connection()
             with conn.cursor(cursor_factory=RealDictCursor) as cursor:
                 cursor.execute("""
-                    SELECT * FROM data_pipeline_task_logs 
+                    SELECT * FROM data_pipeline_task_steps 
                     WHERE task_id = %s 
-                    ORDER BY timestamp DESC 
-                    LIMIT %s
-                """, (task_id, limit))
+                    ORDER BY 
+                        CASE step_name 
+                          WHEN 'ddl_generation' THEN 1
+                          WHEN 'qa_generation' THEN 2
+                          WHEN 'sql_validation' THEN 3
+                          WHEN 'training_load' THEN 4
+                          ELSE 5 
+                        END
+                """, (task_id,))
                 
                 return [dict(row) for row in cursor.fetchall()]
         except Exception as e:
-            self.logger.error(f"获取任务日志失败: {e}")
+            self.logger.error(f"获取任务步骤状态失败: {e}")
             raise
     
-    def get_task_executions(self, task_id: str) -> List[Dict[str, Any]]:
-        """获取任务执行记录"""
+    def get_step_status(self, task_id: str, step_name: str) -> Optional[Dict[str, Any]]:
+        """获取特定步骤的状态"""
         try:
             conn = self._get_connection()
             with conn.cursor(cursor_factory=RealDictCursor) as cursor:
                 cursor.execute("""
-                    SELECT * FROM data_pipeline_task_executions 
-                    WHERE task_id = %s 
-                    ORDER BY started_at DESC
-                """, (task_id,))
+                    SELECT * FROM data_pipeline_task_steps 
+                    WHERE task_id = %s AND step_name = %s
+                """, (task_id, step_name))
                 
-                return [dict(row) for row in cursor.fetchall()]
+                result = cursor.fetchone()
+                return dict(result) if result else None
         except Exception as e:
-            self.logger.error(f"获取执行记录失败: {e}")
+            self.logger.error(f"获取步骤状态失败: {e}")
             raise
     
+
+    
     def get_tasks_list(self, limit: int = 50, offset: int = 0, status_filter: Optional[str] = None) -> List[Dict[str, Any]]:
         """获取任务列表"""
         try:
@@ -303,7 +328,7 @@ class SimpleTaskManager:
         try:
             conn = self._get_connection()
             with conn.cursor() as cursor:
-                cursor.execute("SELECT started_at FROM data_pipeline_tasks WHERE id = %s", (task_id,))
+                cursor.execute("SELECT started_at FROM data_pipeline_tasks WHERE task_id = %s", (task_id,))
                 result = cursor.fetchone()
                 return result[0] if result and result[0] else None
         except Exception:

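The new step-tracking code above assumes a data_pipeline_task_steps table. Its real definition lives in data_pipeline/sql/init_tables.sql (also rewritten in this commit but not shown here); the sketch below only lists the columns referenced by the INSERT/UPDATE/SELECT statements in this file, and the types and constraints are assumptions.

import psycopg2

STEPS_TABLE_SKETCH = """
CREATE TABLE IF NOT EXISTS data_pipeline_task_steps (
    task_id       VARCHAR(100) NOT NULL,   -- matches data_pipeline_tasks.task_id
    step_name     VARCHAR(50)  NOT NULL,   -- ddl_generation / qa_generation / sql_validation / training_load
    step_status   VARCHAR(20)  NOT NULL DEFAULT 'pending',
    execution_id  VARCHAR(200),
    started_at    TIMESTAMP,
    completed_at  TIMESTAMP,
    error_message TEXT,
    PRIMARY KEY (task_id, step_name)
)
"""

def ensure_steps_table(dsn: str) -> None:
    # Create the sketched table so SimpleTaskManager's step queries have a target.
    with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(STEPS_TABLE_SKETCH)

The composite primary key reflects the four rows pre-created per task (strategy A in the diff), which update_step_status then addresses by (task_id, step_name).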
+ 335 - 3
data_pipeline/api/simple_file_manager.py

@@ -1,13 +1,15 @@
 """
 Data Pipeline API 简化文件管理器
 
-提供简单的文件列表和下载功能,无压缩等复杂功能
+提供简单的文件列表、下载和上传功能,无压缩等复杂功能
 """
 
 import os
 from pathlib import Path
-from typing import Dict, Any, List
+from typing import Dict, Any, List, BinaryIO, Union
 from datetime import datetime
+import tempfile
+import shutil
 
 import logging
 
@@ -15,7 +17,12 @@ import logging
 class SimpleFileManager:
     """简化的文件管理器"""
     
-    def __init__(self, base_output_dir: str = "./data_pipeline/training_data/"):
+    def __init__(self, base_output_dir: str = None):
+        if base_output_dir is None:
+            # 获取项目根目录的绝对路径
+            from pathlib import Path
+            project_root = Path(__file__).parent.parent.parent
+            base_output_dir = str(project_root / "data_pipeline" / "training_data")
         """
         初始化文件管理器
         
@@ -181,4 +188,329 @@ class SimpleFileManager:
                 "total_files": 0,
                 "total_size": 0,
                 "total_size_formatted": "0 B"
+            }
+    
+    def upload_table_list_file(self, task_id: str, file_obj: Union[BinaryIO, bytes], filename: str = None) -> Dict[str, Any]:
+        """
+        上传表清单文件到指定任务目录
+        
+        Args:
+            task_id: 任务ID
+            file_obj: 文件对象(Flask的FileStorage)或文件内容(字节流)
+            filename: 原始文件名(可选,仅用于日志记录)
+        
+        Returns:
+            Dict: 上传结果,包含filename、file_size、file_size_formatted、upload_time等
+        
+        Raises:
+            ValueError: 文件验证失败(文件太大、空文件、格式错误等)
+            FileNotFoundError: 任务目录不存在且无法创建
+            IOError: 文件操作失败
+        """
+        try:
+            # 获取配置
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+            upload_config = SCHEMA_TOOLS_CONFIG.get("file_upload", {})
+            max_file_size_mb = upload_config.get("max_file_size_mb", 2)
+            max_size = max_file_size_mb * 1024 * 1024  # 转换为字节
+            target_filename = upload_config.get("target_filename", "table_list.txt")
+            allowed_extensions = upload_config.get("allowed_extensions", ["txt"])
+            
+            # 处理文件对象或字节流
+            if isinstance(file_obj, bytes):
+                file_content = file_obj
+                original_filename = filename or "uploaded_file.txt"
+            else:
+                # Flask FileStorage对象
+                if hasattr(file_obj, 'filename') and file_obj.filename:
+                    original_filename = file_obj.filename
+                else:
+                    original_filename = filename or "uploaded_file.txt"
+                
+                # 验证文件扩展名 - 修复:统一格式进行比较
+                file_ext = Path(original_filename).suffix.lower().lstrip('.')
+                if file_ext not in allowed_extensions:
+                    raise ValueError(f"不支持的文件类型,仅支持: {', '.join(['.' + ext for ext in allowed_extensions])}")
+                
+                # 读取文件内容并验证大小
+                file_content = b''
+                chunk_size = 8192
+                total_size = 0
+                
+                while True:
+                    chunk = file_obj.read(chunk_size)
+                    if not chunk:
+                        break
+                    
+                    total_size += len(chunk)
+                    if total_size > max_size:
+                        raise ValueError(f"文件大小超过限制: {max_file_size_mb}MB")
+                    
+                    file_content += chunk
+            
+            # 验证文件内容为空
+            if len(file_content) == 0:
+                raise ValueError("文件为空,请选择有效的表清单文件")
+            
+            # 验证文件内容(简单检查是否为文本文件)
+            self._validate_table_list_content_simple(file_content)
+            
+            # 确保任务目录存在
+            task_dir = self.get_task_directory(task_id)
+            if not task_dir.exists():
+                task_dir.mkdir(parents=True, exist_ok=True)
+                self.logger.info(f"创建任务目录: {task_dir}")
+            
+            # 确定目标文件路径
+            target_file_path = task_dir / target_filename
+            
+            # 保存文件
+            with open(target_file_path, 'wb') as f:
+                f.write(file_content)
+            
+            # 验证文件是否成功写入
+            if not target_file_path.exists():
+                raise IOError("文件保存失败")
+            
+            # 获取文件信息
+            file_stat = target_file_path.stat()
+            upload_time = datetime.fromtimestamp(file_stat.st_mtime)
+            
+            self.logger.info(f"成功上传表清单文件到任务 {task_id}: {target_file_path}")
+            
+            return {
+                "filename": target_filename,
+                "original_filename": original_filename,
+                "file_size": file_stat.st_size,
+                "file_size_formatted": self._format_file_size(file_stat.st_size),
+                "upload_time": upload_time,
+                "target_path": str(target_file_path)
+            }
+            
+        except Exception as e:
+            self.logger.error(f"上传表清单文件失败: {e}")
+            raise
+    
+    def _validate_table_list_content_simple(self, file_content: bytes) -> None:
+        """
+        简单验证表清单文件内容
+        
+        Args:
+            file_content: 文件内容(字节流)
+            
+        Raises:
+            ValueError: 文件内容验证失败
+        """
+        try:
+            # 尝试解码文件内容
+            try:
+                content = file_content.decode('utf-8')
+            except UnicodeDecodeError:
+                try:
+                    content = file_content.decode('gbk')
+                except UnicodeDecodeError:
+                    raise ValueError("文件编码错误,请确保文件为UTF-8或GBK格式")
+            
+            # 检查文件是否为空
+            if not content.strip():
+                raise ValueError("表清单文件为空")
+            
+            # 简单验证:检查是否包含至少一个非空行
+            lines = [line.strip() for line in content.split('\n') if line.strip()]
+            if not lines:
+                raise ValueError("表清单文件不包含有效的表名")
+            
+            # 可选:验证表名格式(避免SQL注入等安全问题)
+            import re
+            table_name_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*)?$')
+            invalid_tables = []
+            
+            for line in lines[:10]:  # 只检查前10行以避免过度验证
+                # 忽略注释行
+                if line.startswith('#') or line.startswith('--'):
+                    continue
+                
+                # 检查表名格式
+                if not table_name_pattern.match(line):
+                    invalid_tables.append(line)
+            
+            if invalid_tables:
+                raise ValueError(f"表清单文件包含无效的表名格式: {', '.join(invalid_tables[:3])}")
+                
+        except ValueError:
+            raise
+        except Exception as e:
+            raise ValueError(f"文件内容验证失败: {str(e)}")
+    
+    def _validate_table_list_content(self, file_content: bytes, config: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        验证表清单文件内容
+        
+        Args:
+            file_content: 文件内容(字节流)
+            config: 文件上传配置
+        
+        Returns:
+            Dict: 验证结果
+        """
+        try:
+            # 解码文件内容
+            encoding = config.get("encoding", "utf-8")
+            try:
+                content = file_content.decode(encoding)
+            except UnicodeDecodeError:
+                # 尝试其他编码
+                for fallback_encoding in ["gbk", "latin1"]:
+                    try:
+                        content = file_content.decode(fallback_encoding)
+                        self.logger.warning(f"文件编码检测为 {fallback_encoding},建议使用 UTF-8")
+                        break
+                    except UnicodeDecodeError:
+                        continue
+                else:
+                    return {
+                        "valid": False,
+                        "error": f"无法解码文件内容,请确保文件编码为 {encoding}"
+                    }
+            
+            # 分析文件内容
+            lines = content.splitlines()
+            total_lines = len(lines)
+            
+            # 过滤空行和注释行
+            valid_lines = []
+            comment_lines = 0
+            empty_lines = 0
+            
+            for line_num, line in enumerate(lines, 1):
+                stripped = line.strip()
+                if not stripped:
+                    empty_lines += 1
+                elif stripped.startswith('#'):
+                    comment_lines += 1
+                else:
+                    # 简单验证表名格式
+                    if self._is_valid_table_name(stripped):
+                        valid_lines.append(stripped)
+                    else:
+                        return {
+                            "valid": False,
+                            "error": f"第 {line_num} 行包含无效的表名: {stripped}",
+                            "details": {
+                                "line_number": line_num,
+                                "invalid_content": stripped
+                            }
+                        }
+            
+            # 检查有效行数
+            min_lines = config.get("min_lines", 1)
+            max_lines = config.get("max_lines", 1000)
+            
+            if len(valid_lines) < min_lines:
+                return {
+                    "valid": False,
+                    "error": f"文件至少需要包含 {min_lines} 个有效表名,当前只有 {len(valid_lines)} 个",
+                    "details": {
+                        "valid_tables": len(valid_lines),
+                        "min_required": min_lines
+                    }
+                }
+            
+            if len(valid_lines) > max_lines:
+                return {
+                    "valid": False,
+                    "error": f"文件包含的表名数量超过限制,最多允许 {max_lines} 个,当前有 {len(valid_lines)} 个",
+                    "details": {
+                        "valid_tables": len(valid_lines),
+                        "max_allowed": max_lines
+                    }
+                }
+            
+            return {
+                "valid": True,
+                "details": {
+                    "total_lines": total_lines,
+                    "empty_lines": empty_lines,
+                    "comment_lines": comment_lines,
+                    "valid_tables": len(valid_lines),
+                    "table_names": valid_lines[:10]  # 只返回前10个作为预览
+                }
+            }
+            
+        except Exception as e:
+            return {
+                "valid": False,
+                "error": f"文件内容验证失败: {str(e)}"
+            }
+    
+    def _is_valid_table_name(self, table_name: str) -> bool:
+        """
+        验证表名格式是否有效
+        
+        Args:
+            table_name: 表名
+        
+        Returns:
+            bool: 是否有效
+        """
+        import re
+        
+        # 基本的表名格式检查
+        # 支持: table_name, schema.table_name
+        pattern = r'^[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*)?$'
+        return bool(re.match(pattern, table_name))
+    
+    def get_table_list_file_info(self, task_id: str) -> Dict[str, Any]:
+        """
+        获取任务的表清单文件信息
+        
+        Args:
+            task_id: 任务ID
+        
+        Returns:
+            Dict: 文件信息或None
+        """
+        try:
+            from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+            upload_config = SCHEMA_TOOLS_CONFIG.get("file_upload", {})
+            target_filename = upload_config.get("target_filename", "table_list.txt")
+            
+            file_path = self.get_file_path(task_id, target_filename)
+            
+            if not file_path.exists():
+                return {
+                    "exists": False,
+                    "file_name": target_filename,
+                    "expected_path": str(file_path)
+                }
+            
+            file_stat = file_path.stat()
+            
+            # 尝试读取文件内容进行分析
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                    lines = content.splitlines()
+                    valid_tables = [line.strip() for line in lines 
+                                   if line.strip() and not line.strip().startswith('#')]
+            except Exception:
+                valid_tables = []
+            
+            return {
+                "exists": True,
+                "file_name": target_filename,
+                "file_path": str(file_path),
+                "file_size": file_stat.st_size,
+                "file_size_formatted": self._format_file_size(file_stat.st_size),
+                "uploaded_at": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                "created_at": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                "table_count": len(valid_tables),
+                "is_readable": os.access(file_path, os.R_OK)
+            }
+            
+        except Exception as e:
+            self.logger.error(f"获取表清单文件信息失败: {e}")
+            return {
+                "exists": False,
+                "error": str(e)
             }

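Both validators above gate each non-comment line with the same table-name regex. A standalone sketch of that check follows; the pattern is copied verbatim from _is_valid_table_name, while the sample lines are made up.

import re

TABLE_NAME_PATTERN = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*)?$')

samples = [
    "bss_business_day_data",       # plain table name   -> valid
    "public.bss_car_day_count",    # schema-qualified   -> valid
    "# highway service tables",    # comment line       -> skipped
    "bss_company; DROP TABLE x",   # injection attempt  -> rejected
]

for line in samples:
    line = line.strip()
    if line.startswith('#') or line.startswith('--'):
        print(f"skip   : {line}")
    elif TABLE_NAME_PATTERN.match(line):
        print(f"valid  : {line}")
    else:
        print(f"invalid: {line}")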
+ 97 - 72
data_pipeline/api/simple_workflow.py

@@ -135,13 +135,48 @@ class SimpleWorkflowExecutor:
             except Exception as e:
                 self.logger.error(f"记录任务目录日志失败: {e}")
     
+    def _resolve_table_list_file_path(self) -> str:
+        """解析表清单文件路径"""
+        table_list_file = self.task_params['table_list_file']
+        
+        # 检查是否使用文件上传模式
+        if self.task_params.get('file_upload_mode', False) or '{task_directory}' in table_list_file:
+            # 文件上传模式:检查任务目录中的文件
+            task_dir = self.file_manager.get_task_directory(self.task_id)
+            
+            # 替换占位符
+            if '{task_directory}' in table_list_file:
+                resolved_path = table_list_file.replace('{task_directory}', str(task_dir))
+            else:
+                from data_pipeline.config import SCHEMA_TOOLS_CONFIG
+                upload_config = SCHEMA_TOOLS_CONFIG.get("file_upload", {})
+                target_filename = upload_config.get("target_filename", "table_list.txt")
+                resolved_path = str(task_dir / target_filename)
+            
+            # 检查文件是否存在
+            if not Path(resolved_path).exists():
+                raise FileNotFoundError(
+                    f"表清单文件不存在: {resolved_path}。"
+                    f"请先上传表清单文件到任务 {self.task_id},然后再执行工作流。"
+                )
+            
+            return resolved_path
+        else:
+            # 传统模式:使用指定的文件路径
+            if not Path(table_list_file).exists():
+                raise FileNotFoundError(f"表清单文件不存在: {table_list_file}")
+            return table_list_file
+    
     def _create_orchestrator(self) -> SchemaWorkflowOrchestrator:
         """创建工作流编排器"""
         task_dir = self.file_manager.get_task_directory(self.task_id)
         
+        # 解析表清单文件路径
+        table_list_file = self._resolve_table_list_file_path()
+        
         return SchemaWorkflowOrchestrator(
             db_connection=self.task_params['db_connection'],
-            table_list_file=self.task_params['table_list_file'],
+            table_list_file=table_list_file,
             business_context=self.task_params['business_context'],
             output_dir=str(task_dir),
             task_id=self.task_id,  # 传递task_id给编排器
@@ -158,9 +193,7 @@ class SimpleWorkflowExecutor:
         
         try:
             # 开始执行
-            execution_id = self.task_manager.create_execution(self.task_id, step_name)
-            self.task_manager.update_step_status(self.task_id, step_name, "running")
-            self.task_manager.record_log(self.task_id, "INFO", f"开始执行步骤: {step_name}", execution_id, step_name)
+            execution_id = self.task_manager.start_step(self.task_id, step_name)
             
             # 记录到任务目录日志
             self._log_to_task_directory("INFO", f"开始执行步骤: {step_name}", step_name)
@@ -168,9 +201,7 @@ class SimpleWorkflowExecutor:
             yield execution_id
             
             # 成功完成
-            self.task_manager.complete_execution(execution_id, 'completed')
-            self.task_manager.update_step_status(self.task_id, step_name, "completed")
-            self.task_manager.record_log(self.task_id, "INFO", f"步骤执行完成: {step_name}", execution_id, step_name)
+            self.task_manager.complete_step(self.task_id, step_name, 'completed')
             
             # 记录到任务目录日志
             self._log_to_task_directory("INFO", f"步骤执行完成: {step_name}", step_name)
@@ -179,11 +210,7 @@ class SimpleWorkflowExecutor:
             # 执行失败
             error_msg = str(e)
             
-            if execution_id:
-                self.task_manager.complete_execution(execution_id, 'failed', error_msg)
-            
-            self.task_manager.update_step_status(self.task_id, step_name, "failed")
-            self.task_manager.record_log(self.task_id, "ERROR", f"步骤执行失败: {step_name} - {error_msg}", execution_id, step_name)
+            self.task_manager.complete_step(self.task_id, step_name, 'failed', error_msg)
             
             # 记录到任务目录日志
             self._log_to_task_directory("ERROR", f"步骤执行失败: {step_name} - {error_msg}", step_name)
@@ -198,7 +225,6 @@ class SimpleWorkflowExecutor:
             
             # 开始任务
             self.task_manager.update_task_status(self.task_id, 'in_progress')
-            self.task_manager.record_log(self.task_id, "INFO", "任务开始执行")
             
             # 记录到任务目录日志
             self._log_to_task_directory("INFO", "完整工作流任务开始执行")
@@ -206,26 +232,59 @@ class SimpleWorkflowExecutor:
             # 创建工作流编排器
             orchestrator = self._create_orchestrator()
             
-            # 执行完整工作流
-            with self._step_execution("complete") as execution_id:
-                self.task_manager.record_log(self.task_id, "INFO", "开始执行完整工作流", execution_id, "complete")
+            # 重定向SchemaWorkflowOrchestrator的日志到任务目录
+            self._redirect_orchestrator_logs(orchestrator)
+            
+            # 分别执行各个步骤,每个步骤都用_step_execution包装
+            try:
+                # 步骤1: DDL/MD生成
+                with self._step_execution("ddl_generation") as execution_id:
+                    self._log_to_task_directory("INFO", "开始执行DDL/MD生成步骤", "ddl_generation")
+                    await orchestrator._execute_step_1_ddl_md_generation()
+                    self._log_to_task_directory("INFO", "DDL/MD生成步骤完成", "ddl_generation")
+                
+                # 步骤2: Question-SQL生成  
+                with self._step_execution("qa_generation") as execution_id:
+                    self._log_to_task_directory("INFO", "开始执行Question-SQL生成步骤", "qa_generation")
+                    await orchestrator._execute_step_2_question_sql_generation()
+                    self._log_to_task_directory("INFO", "Question-SQL生成步骤完成", "qa_generation")
                 
-                # 重定向SchemaWorkflowOrchestrator的日志到任务目录
-                self._redirect_orchestrator_logs(orchestrator)
+                # 步骤3: SQL验证(如果启用)
+                if orchestrator.enable_sql_validation:
+                    with self._step_execution("sql_validation") as execution_id:
+                        self._log_to_task_directory("INFO", "开始执行SQL验证步骤", "sql_validation")
+                        await orchestrator._execute_step_3_sql_validation()
+                        self._log_to_task_directory("INFO", "SQL验证步骤完成", "sql_validation")
+                else:
+                    self._log_to_task_directory("INFO", "跳过SQL验证步骤(未启用)", "sql_validation")
+                
+                # 步骤4: 训练数据加载(如果启用)
+                if orchestrator.enable_training_data_load:
+                    with self._step_execution("training_load") as execution_id:
+                        self._log_to_task_directory("INFO", "开始执行训练数据加载步骤", "training_load")
+                        await orchestrator._execute_step_4_training_data_load()
+                        self._log_to_task_directory("INFO", "训练数据加载步骤完成", "training_load")
+                else:
+                    self._log_to_task_directory("INFO", "跳过训练数据加载步骤(未启用)", "training_load")
                 
-                result = await orchestrator.execute_complete_workflow()
+                # 获取工作流结果
+                result = {
+                    "success": True,
+                    "workflow_state": orchestrator.workflow_state,
+                    "artifacts": orchestrator.workflow_state.get("artifacts", {})
+                }
                 
                 # 写入结果文件
                 self._write_result_file(result)
                 
-                self.task_manager.record_log(self.task_id, "INFO", "完整工作流执行完成", execution_id, "complete")
-            
-            # 更新所有子步骤状态为完成
-            self._update_all_step_status_for_complete_workflow(result)
+            except Exception as step_error:
+                self.logger.error(f"工作流步骤执行失败: {step_error}")
+                # 记录到任务目录日志
+                self._log_to_task_directory("ERROR", f"工作流步骤执行失败: {step_error}")
+                raise
             
             # 完成任务
             self.task_manager.update_task_status(self.task_id, 'completed')
-            self.task_manager.record_log(self.task_id, "INFO", "任务执行完成")
             
             # 记录到任务目录日志
             self._log_to_task_directory("INFO", "完整工作流任务执行完成")
@@ -240,7 +299,6 @@ class SimpleWorkflowExecutor:
         except Exception as e:
             # 记录错误
             error_msg = str(e)
-            self.task_manager.record_log(self.task_id, "ERROR", f"任务执行失败: {error_msg}")
             self.task_manager.update_task_status(self.task_id, 'failed', error_msg)
             
             # 记录到任务目录日志
@@ -308,7 +366,6 @@ class SimpleWorkflowExecutor:
         except Exception as e:
             # 记录错误
             error_msg = str(e)
-            self.task_manager.record_log(self.task_id, "ERROR", f"步骤执行失败: {step_name} - {error_msg}")
             self.task_manager.update_task_status(self.task_id, 'failed', error_msg)
             
             # 记录到任务目录日志
@@ -350,16 +407,16 @@ class SimpleWorkflowExecutor:
         """更新整体任务状态"""
         try:
             # 检查所有步骤的完成情况
-            executions = self.task_manager.get_task_executions(self.task_id)
+            steps = self.task_manager.get_task_steps(self.task_id)
             
             completed_steps = set()
             failed_steps = set()
             
-            for execution in executions:
-                if execution['status'] == 'completed':
-                    completed_steps.add(execution['execution_step'])
-                elif execution['status'] == 'failed':
-                    failed_steps.add(execution['execution_step'])
+            for step in steps:
+                if step['step_status'] == 'completed':
+                    completed_steps.add(step['step_name'])
+                elif step['step_status'] == 'failed':
+                    failed_steps.add(step['step_name'])
             
             # 检查是否有失败的步骤
             if failed_steps:
@@ -394,35 +451,7 @@ class SimpleWorkflowExecutor:
             except Exception as e:
                 self.logger.error(f"重定向orchestrator日志失败: {e}")
     
-    def _update_all_step_status_for_complete_workflow(self, result: Dict[str, Any]):
-        """完整工作流成功后,更新所有子步骤状态为完成"""
-        try:
-            # 定义完整工作流包含的所有步骤
-            workflow_steps = ["ddl_generation", "qa_generation", "sql_validation", "training_load"]
-            
-            # 记录日志
-            self._log_to_task_directory("INFO", "开始更新完整工作流各步骤状态为完成")
-            
-            # 逐一更新每个步骤的状态为完成
-            for step_name in workflow_steps:
-                try:
-                    self.task_manager.update_step_status(self.task_id, step_name, "completed")
-                    self.task_manager.record_log(
-                        self.task_id, 
-                        "INFO", 
-                        f"完整工作流执行成功,更新步骤状态为完成: {step_name}",
-                        step_name=step_name
-                    )
-                    self._log_to_task_directory("INFO", f"更新步骤状态为完成: {step_name}", step_name)
-                except Exception as step_error:
-                    self.logger.error(f"更新步骤状态失败 {step_name}: {step_error}")
-                    self._log_to_task_directory("ERROR", f"更新步骤状态失败: {step_name} - {step_error}", step_name)
-            
-            self._log_to_task_directory("INFO", "完整工作流各步骤状态更新完成")
-            
-        except Exception as e:
-            self.logger.error(f"更新完整工作流步骤状态失败: {e}")
-            self._log_to_task_directory("ERROR", f"更新完整工作流步骤状态失败: {e}")
+
     
     def cleanup(self):
         """清理资源"""
@@ -450,14 +479,14 @@ class SimpleWorkflowManager:
         self.logger.setLevel(logging.INFO)
     
     def create_task(self, 
-                   table_list_file: str,
-                   business_context: str,
+                   table_list_file: str = None,
+                   business_context: str = None,
                    db_name: str = None,
                    **kwargs) -> str:
         """创建新任务"""
         try:
-            # 验证表清单文件存在
-            if not os.path.exists(table_list_file):
+            # 如果提供了table_list_file,验证文件存在
+            if table_list_file and not os.path.exists(table_list_file):
                 raise FileNotFoundError(f"表清单文件不存在: {table_list_file}")
             
             # 创建任务(使用app_config中的数据库配置)
@@ -500,17 +529,13 @@ class SimpleWorkflowManager:
         """获取任务状态"""
         return self.task_manager.get_task(task_id)
     
-    def get_task_logs(self, task_id: str, limit: int = 100) -> List[Dict[str, Any]]:
-        """获取任务日志"""
-        return self.task_manager.get_task_logs(task_id, limit)
-    
     def get_task_files(self, task_id: str) -> List[Dict[str, Any]]:
         """获取任务文件列表"""
         return self.file_manager.get_task_files(task_id)
     
-    def get_task_executions(self, task_id: str) -> List[Dict[str, Any]]:
-        """获取任务执行记录"""
-        return self.task_manager.get_task_executions(task_id)
+    def get_task_steps(self, task_id: str) -> List[Dict[str, Any]]:
+        """获取任务步骤状态"""
+        return self.task_manager.get_task_steps(task_id)
     
     def get_tasks_list(self, **kwargs) -> List[Dict[str, Any]]:
         """获取任务列表"""

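With the log and execution-record accessors removed, API callers now track progress through get_task_steps(). A hedged usage sketch follows; the import path and the table-list path are illustrative assumptions, not code from this commit.

# Assumed import path, for illustration only.
from data_pipeline.api.simple_workflow import SimpleWorkflowManager

manager = SimpleWorkflowManager()
task_id = manager.create_task(
    table_list_file="data_pipeline/tables.txt",  # hypothetical example path
    business_context="高速公路服务区管理系统",
    db_name="highway_db",
)

# Poll per-step status backed by the data_pipeline_task_steps table.
for step in manager.get_task_steps(task_id):
    print(step["step_name"], step["step_status"], step.get("error_message"))
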
+ 13 - 0
data_pipeline/config.py

@@ -120,6 +120,19 @@ SCHEMA_TOOLS_CONFIG = {
         
         # 文件修改配置
         "modify_original_file": False,       # 是否修改原始JSON文件(默认禁用)
+    },
+    
+    # 文件上传配置
+    "file_upload": {
+        "enabled": True,                     # 是否启用文件上传功能
+        "max_file_size_mb": 2,               # 最大文件大小(MB)
+        "allowed_extensions": ["txt"],       # 允许的文件扩展名(不带点)
+        "target_filename": "table_list.txt", # 上传后的标准文件名
+        "validate_content": True,            # 是否验证文件内容
+        "min_lines": 1,                      # 最少行数(排除空行和注释)
+        "max_lines": 1000,                   # 最大行数限制
+        "encoding": "utf-8",                 # 文件编码
+        "allow_overwrite": True,             # 是否允许覆盖已存在的文件
     }
 }
 

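The new file_upload block is configuration only. As a rough illustration, a validator that consumes it could look like the following; validate_table_list_upload is a hypothetical helper, not code from this commit.

from pathlib import Path

from data_pipeline.config import SCHEMA_TOOLS_CONFIG

def validate_table_list_upload(path: str) -> None:
    """Check an uploaded table-list file against SCHEMA_TOOLS_CONFIG['file_upload']."""
    cfg = SCHEMA_TOOLS_CONFIG["file_upload"]
    p = Path(path)
    if p.suffix.lstrip(".").lower() not in cfg["allowed_extensions"]:
        raise ValueError(f"unsupported extension: {p.suffix}")
    if p.stat().st_size > cfg["max_file_size_mb"] * 1024 * 1024:
        raise ValueError(f"file larger than {cfg['max_file_size_mb']} MB")
    # Count non-empty, non-comment lines (one table name per line).
    lines = [
        line.strip()
        for line in p.read_text(encoding=cfg["encoding"]).splitlines()
        if line.strip() and not line.strip().startswith("#")
    ]
    if not (cfg["min_lines"] <= len(lines) <= cfg["max_lines"]):
        raise ValueError(f"table count {len(lines)} outside allowed range")
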
+ 4 - 2
data_pipeline/dp_logging/manager.py

@@ -81,8 +81,10 @@ class DataPipelineLogManager:
     def _create_file_handler(cls, task_id: str) -> std_logging.FileHandler:
         """创建文件处理器"""
         try:
-            # 确定日志文件路径
-            task_dir = Path("data_pipeline/training_data") / task_id
+            # 获取项目根目录的绝对路径
+            project_root = Path(__file__).parent.parent.parent
+            task_dir = project_root / "data_pipeline" / "training_data" / task_id
+            
             task_dir.mkdir(parents=True, exist_ok=True)
             
             log_file = task_dir / "data_pipeline.log"

+ 3 - 1
data_pipeline/schema_workflow.py

@@ -64,7 +64,9 @@ class SchemaWorkflowOrchestrator:
         # 设置输出目录
         if output_dir is None:
             # 脚本模式或未指定输出目录时,使用任务目录
-            self.output_dir = Path("data_pipeline/training_data") / self.task_id
+            # 获取项目根目录的绝对路径
+            project_root = Path(__file__).parent.parent
+            self.output_dir = project_root / "data_pipeline" / "training_data" / self.task_id
         else:
             # API模式或明确指定输出目录时,使用指定的目录
             self.output_dir = Path(output_dir)

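Both path changes above (dp_logging/manager.py and schema_workflow.py) replace a CWD-relative "data_pipeline/training_data" path with one anchored at the project root, so logs and outputs land in the same task directory regardless of where the process is started. A minimal, self-contained demonstration of the idiom; the number of parent hops depends on where the calling module sits.

from pathlib import Path

def resolve_task_dir(task_id: str, levels_up: int = 2) -> Path:
    """Resolve <project_root>/data_pipeline/training_data/<task_id> from the module location."""
    project_root = Path(__file__).resolve().parents[levels_up - 1]
    return project_root / "data_pipeline" / "training_data" / task_id

if __name__ == "__main__":
    print(resolve_task_dir("task_20250627_143052"))
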
+ 52 - 166
data_pipeline/sql/init_tables.sql

@@ -17,7 +17,7 @@ BEGIN;
 -- ====================================================================
 CREATE TABLE IF NOT EXISTS data_pipeline_tasks (
     -- 主键:时间戳格式的任务ID
-    id VARCHAR(32) PRIMARY KEY,                    -- 'task_20250627_143052'
+    task_id VARCHAR(32) PRIMARY KEY,               -- 'task_20250627_143052'
     
     -- 任务基本信息
     task_type VARCHAR(50) NOT NULL DEFAULT 'data_workflow',
@@ -30,28 +30,20 @@ CREATE TABLE IF NOT EXISTS data_pipeline_tasks (
     -- 错误处理
     error_message TEXT,                            -- 错误详细信息
     
-    -- 步骤状态跟踪
-    step_status JSONB DEFAULT '{
-        "ddl_generation": "pending",
-        "qa_generation": "pending", 
-        "sql_validation": "pending",
-        "training_load": "pending"
-    }'::jsonb,
-    
     -- 时间戳
     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
     started_at TIMESTAMP,
     completed_at TIMESTAMP,
     
     -- 创建者信息
-    created_by VARCHAR(50) DEFAULT 'api',          -- 'api', 'manual', 'system'
+    created_type VARCHAR(50) DEFAULT 'api',        -- 'api', 'manual', 'system'
+    by_user VARCHAR(50),                           -- 'guest'或其它user_id
     
     -- 输出目录
     output_directory TEXT,                         -- 任务输出目录路径
     
     -- 索引字段
-    db_name VARCHAR(100),                          -- 数据库名称(便于筛选)
-    business_context TEXT                          -- 业务上下文(便于搜索)
+    db_name VARCHAR(100)                           -- 数据库名称(便于筛选)
 );
 
 -- 添加约束
@@ -61,98 +53,31 @@ ALTER TABLE data_pipeline_tasks ADD CONSTRAINT chk_task_status
 ALTER TABLE data_pipeline_tasks ADD CONSTRAINT chk_task_type 
     CHECK (task_type IN ('data_workflow', 'complete_workflow'));
 
-ALTER TABLE data_pipeline_tasks ADD CONSTRAINT chk_created_by 
-    CHECK (created_by IN ('api', 'manual', 'system'));
+ALTER TABLE data_pipeline_tasks ADD CONSTRAINT chk_created_type 
+    CHECK (created_type IN ('api', 'manual', 'system'));
 
 -- ====================================================================
--- 任务执行记录表 (data_pipeline_task_executions)
+-- 任务步骤状态表 (data_pipeline_task_steps)
 -- ====================================================================
-CREATE TABLE IF NOT EXISTS data_pipeline_task_executions (
+CREATE TABLE IF NOT EXISTS data_pipeline_task_steps (
     id SERIAL PRIMARY KEY,
-    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
-    execution_step VARCHAR(50) NOT NULL,          -- 'ddl_generation', 'qa_generation', 'sql_validation', 'training_load', 'complete'
-    status VARCHAR(20) NOT NULL,                  -- 'running', 'completed', 'failed'
-    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(task_id) ON DELETE CASCADE,
+    execution_id VARCHAR(100),                    -- 执行批次ID(可为空)
+    step_name VARCHAR(50) NOT NULL,               -- 'ddl_generation', 'qa_generation', 'sql_validation', 'training_load'
+    step_status VARCHAR(50) NOT NULL DEFAULT 'pending', -- 'pending', 'running', 'completed', 'failed'
+    started_at TIMESTAMP,
     completed_at TIMESTAMP,
-    error_message TEXT,
-    execution_result JSONB,                       -- 步骤执行结果
-    execution_id VARCHAR(100) UNIQUE,             -- {task_id}_step_{step_name}_exec_{timestamp}
-    force_executed BOOLEAN DEFAULT FALSE,         -- 是否强制执行
-    files_cleaned BOOLEAN DEFAULT FALSE,          -- 是否清理了旧文件
-    duration_seconds INTEGER                      -- 执行时长(秒)
+    error_message TEXT                            -- 错误详细信息
 );
 
 -- 添加约束
-ALTER TABLE data_pipeline_task_executions ADD CONSTRAINT chk_execution_status 
-    CHECK (status IN ('running', 'completed', 'failed'));
+ALTER TABLE data_pipeline_task_steps ADD CONSTRAINT chk_step_status 
+    CHECK (step_status IN ('pending', 'running', 'completed', 'failed'));
 
-ALTER TABLE data_pipeline_task_executions ADD CONSTRAINT chk_execution_step 
-    CHECK (execution_step IN ('ddl_generation', 'qa_generation', 'sql_validation', 'training_load', 'complete'));
+ALTER TABLE data_pipeline_task_steps ADD CONSTRAINT chk_step_name 
+    CHECK (step_name IN ('ddl_generation', 'qa_generation', 'sql_validation', 'training_load'));
 
-ALTER TABLE data_pipeline_task_executions ADD CONSTRAINT chk_duration_positive 
-    CHECK (duration_seconds IS NULL OR duration_seconds >= 0);
 
--- ====================================================================
--- 任务日志表 (data_pipeline_task_logs)
--- ====================================================================
-CREATE TABLE IF NOT EXISTS data_pipeline_task_logs (
-    id SERIAL PRIMARY KEY,
-    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
-    execution_id VARCHAR(100) REFERENCES data_pipeline_task_executions(execution_id) ON DELETE SET NULL,
-    
-    -- 日志内容
-    log_level VARCHAR(10) NOT NULL,               -- 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
-    message TEXT NOT NULL,                        -- 日志消息内容
-    
-    -- 上下文信息
-    step_name VARCHAR(50),                        -- 执行步骤名称
-    module_name VARCHAR(100),                     -- 模块名称
-    function_name VARCHAR(100),                   -- 函数名称
-    
-    -- 时间戳
-    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    
-    -- 额外信息(JSON格式)
-    extra_data JSONB DEFAULT '{}'::jsonb          -- 额外的结构化信息
-);
-
--- 添加约束
-ALTER TABLE data_pipeline_task_logs ADD CONSTRAINT chk_log_level 
-    CHECK (log_level IN ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'));
-
--- ====================================================================
--- 任务输出文件表 (data_pipeline_task_outputs)
--- ====================================================================
-CREATE TABLE IF NOT EXISTS data_pipeline_task_outputs (
-    id SERIAL PRIMARY KEY,
-    task_id VARCHAR(32) REFERENCES data_pipeline_tasks(id) ON DELETE CASCADE,
-    execution_id VARCHAR(100) REFERENCES data_pipeline_task_executions(execution_id) ON DELETE SET NULL,
-    
-    -- 文件信息
-    file_type VARCHAR(50) NOT NULL,               -- 'ddl', 'md', 'json', 'log', 'report'
-    file_name VARCHAR(255) NOT NULL,              -- 文件名
-    file_path TEXT NOT NULL,                      -- 相对路径
-    file_size BIGINT DEFAULT 0,                   -- 文件大小(字节)
-    
-    -- 文件内容摘要
-    content_hash VARCHAR(64),                     -- 文件内容hash
-    description TEXT,                             -- 文件描述
-    
-    -- 时间戳
-    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    modified_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    
-    -- 状态
-    is_primary BOOLEAN DEFAULT FALSE,             -- 是否为主要输出文件
-    is_downloadable BOOLEAN DEFAULT TRUE          -- 是否可下载
-);
-
--- 添加约束
-ALTER TABLE data_pipeline_task_outputs ADD CONSTRAINT chk_file_type 
-    CHECK (file_type IN ('ddl', 'md', 'json', 'log', 'report', 'txt', 'other'));
-
-ALTER TABLE data_pipeline_task_outputs ADD CONSTRAINT chk_file_size_positive 
-    CHECK (file_size >= 0);
 
 -- ====================================================================
 -- 创建索引
@@ -162,30 +87,15 @@ ALTER TABLE data_pipeline_task_outputs ADD CONSTRAINT chk_file_size_positive
 CREATE INDEX IF NOT EXISTS idx_tasks_status ON data_pipeline_tasks(status);
 CREATE INDEX IF NOT EXISTS idx_tasks_created_at ON data_pipeline_tasks(created_at DESC);
 CREATE INDEX IF NOT EXISTS idx_tasks_db_name ON data_pipeline_tasks(db_name);
-CREATE INDEX IF NOT EXISTS idx_tasks_created_by ON data_pipeline_tasks(created_by);
+CREATE INDEX IF NOT EXISTS idx_tasks_created_type ON data_pipeline_tasks(created_type);
 CREATE INDEX IF NOT EXISTS idx_tasks_task_type ON data_pipeline_tasks(task_type);
 
--- 执行记录表索引
-CREATE INDEX IF NOT EXISTS idx_executions_task_id ON data_pipeline_task_executions(task_id);
-CREATE INDEX IF NOT EXISTS idx_executions_step ON data_pipeline_task_executions(execution_step);
-CREATE INDEX IF NOT EXISTS idx_executions_status ON data_pipeline_task_executions(status);
-CREATE INDEX IF NOT EXISTS idx_executions_started_at ON data_pipeline_task_executions(started_at DESC);
-CREATE INDEX IF NOT EXISTS idx_executions_task_step ON data_pipeline_task_executions(task_id, execution_step);
-
--- 日志表索引
-CREATE INDEX IF NOT EXISTS idx_logs_task_id ON data_pipeline_task_logs(task_id);
-CREATE INDEX IF NOT EXISTS idx_logs_execution_id ON data_pipeline_task_logs(execution_id);
-CREATE INDEX IF NOT EXISTS idx_logs_timestamp ON data_pipeline_task_logs(timestamp DESC);
-CREATE INDEX IF NOT EXISTS idx_logs_level ON data_pipeline_task_logs(log_level);
-CREATE INDEX IF NOT EXISTS idx_logs_step ON data_pipeline_task_logs(step_name);
-CREATE INDEX IF NOT EXISTS idx_logs_task_timestamp ON data_pipeline_task_logs(task_id, timestamp DESC);
-
--- 文件输出表索引
-CREATE INDEX IF NOT EXISTS idx_outputs_task_id ON data_pipeline_task_outputs(task_id);
-CREATE INDEX IF NOT EXISTS idx_outputs_execution_id ON data_pipeline_task_outputs(execution_id);
-CREATE INDEX IF NOT EXISTS idx_outputs_file_type ON data_pipeline_task_outputs(file_type);
-CREATE INDEX IF NOT EXISTS idx_outputs_primary ON data_pipeline_task_outputs(is_primary) WHERE is_primary = TRUE;
-CREATE INDEX IF NOT EXISTS idx_outputs_downloadable ON data_pipeline_task_outputs(is_downloadable) WHERE is_downloadable = TRUE;
+-- 步骤状态表索引
+CREATE INDEX IF NOT EXISTS idx_steps_task_id ON data_pipeline_task_steps(task_id);
+CREATE INDEX IF NOT EXISTS idx_steps_step_name ON data_pipeline_task_steps(step_name);
+CREATE INDEX IF NOT EXISTS idx_steps_step_status ON data_pipeline_task_steps(step_status);
+CREATE INDEX IF NOT EXISTS idx_steps_started_at ON data_pipeline_task_steps(started_at DESC);
+CREATE INDEX IF NOT EXISTS idx_steps_task_step ON data_pipeline_task_steps(task_id, step_name);
 
 -- ====================================================================
 -- 创建清理函数
@@ -200,19 +110,13 @@ DECLARE
 BEGIN
     cutoff_date := NOW() - INTERVAL '1 day' * days_to_keep;
     
-    -- 删除旧任务(级联删除相关日志和文件记录)
+    -- 删除旧任务(级联删除相关步骤记录)
     DELETE FROM data_pipeline_tasks 
     WHERE created_at < cutoff_date 
     AND status IN ('completed', 'failed');
     
     GET DIAGNOSTICS deleted_count = ROW_COUNT;
     
-    -- 记录清理操作
-    INSERT INTO data_pipeline_task_logs (task_id, log_level, message, step_name)
-    VALUES ('system', 'INFO', 
-            FORMAT('清理了 %s 个超过 %s 天的旧任务', deleted_count, days_to_keep),
-            'cleanup');
-    
     RETURN deleted_count;
 END;
 $$ LANGUAGE plpgsql;
@@ -249,12 +153,12 @@ DECLARE
 BEGIN
     cutoff_time := NOW() - INTERVAL '1 hour' * timeout_hours;
     
-    -- 查找超时的运行中执行
-    UPDATE data_pipeline_task_executions 
-    SET status = 'failed',
-        error_message = FORMAT('执行超时(超过%s小时),可能已停止运行', timeout_hours),
+    -- 查找超时的运行中步骤
+    UPDATE data_pipeline_task_steps 
+    SET step_status = 'failed',
+        error_message = FORMAT('步骤执行超时(超过%s小时),可能已停止运行', timeout_hours),
         completed_at = NOW()
-    WHERE status = 'running' 
+    WHERE step_status = 'running' 
     AND started_at < cutoff_time;
     
     GET DIAGNOSTICS zombie_count = ROW_COUNT;
@@ -266,14 +170,6 @@ BEGIN
     WHERE status IN ('in_progress') 
     AND started_at < cutoff_time;
     
-    -- 记录检查操作
-    IF zombie_count > 0 THEN
-        INSERT INTO data_pipeline_task_logs (task_id, log_level, message, step_name)
-        VALUES ('system', 'WARNING', 
-                FORMAT('发现并处理了 %s 个僵尸执行', zombie_count),
-                'zombie_check');
-    END IF;
-    
     RETURN zombie_count;
 END;
 $$ LANGUAGE plpgsql;
@@ -289,41 +185,33 @@ $$ LANGUAGE plpgsql;
 -- 创建视图(便于查询)
 -- ====================================================================
 
--- 任务执行概览视图
-CREATE OR REPLACE VIEW v_task_execution_overview AS
+-- 任务步骤概览视图
+CREATE OR REPLACE VIEW v_task_step_overview AS
 SELECT 
-    t.id as task_id,
+    t.task_id,
     t.task_type,
     t.status as task_status,
-    t.step_status,
     t.created_at,
     t.started_at,
     t.completed_at,
-    t.created_by,
+    t.created_type,
+    t.by_user,
     t.db_name,
-    COALESCE(e.current_execution, '{}') as current_execution,
-    COALESCE(e.execution_count, 0) as total_executions
+    s.step_name,
+    s.step_status,
+    s.started_at as step_started_at,
+    s.completed_at as step_completed_at,
+    s.error_message as step_error_message
 FROM data_pipeline_tasks t
-LEFT JOIN (
-    SELECT 
-        task_id,
-        COUNT(*) as execution_count,
-        json_build_object(
-            'execution_id', e1.execution_id,
-            'step', e1.execution_step,
-            'status', e1.status,
-            'started_at', e1.started_at
-        ) as current_execution
-    FROM data_pipeline_task_executions e1
-    WHERE e1.id = (
-        SELECT e2.id 
-        FROM data_pipeline_task_executions e2 
-        WHERE e2.task_id = e1.task_id 
-        ORDER BY e2.started_at DESC 
-        LIMIT 1
-    )
-    GROUP BY task_id, e1.execution_id, e1.execution_step, e1.status, e1.started_at
-) e ON t.id = e.task_id;
+LEFT JOIN data_pipeline_task_steps s ON t.task_id = s.task_id
+ORDER BY t.created_at DESC, 
+         CASE s.step_name 
+           WHEN 'ddl_generation' THEN 1
+           WHEN 'qa_generation' THEN 2
+           WHEN 'sql_validation' THEN 3
+           WHEN 'training_load' THEN 4
+           ELSE 5 
+         END;
 
 -- 提交事务
 COMMIT;
@@ -333,9 +221,7 @@ COMMIT;
 \echo ''
 \echo '已创建的表:'
 \echo '- data_pipeline_tasks: 任务主表'
-\echo '- data_pipeline_task_executions: 任务执行记录表'
-\echo '- data_pipeline_task_logs: 任务日志表'
-\echo '- data_pipeline_task_outputs: 任务输出文件表'
+\echo '- data_pipeline_task_steps: 任务步骤状态表'
 \echo ''
 \echo '已创建的函数:'
 \echo '- cleanup_old_data_pipeline_tasks(days): 清理旧任务'
@@ -343,4 +229,4 @@ COMMIT;
 \echo '- check_zombie_data_pipeline_tasks(hours): 检查僵尸任务'
 \echo ''
 \echo '已创建的视图:'
-\echo '- v_task_execution_overview: 任务执行概览'
+\echo '- v_task_step_overview: 任务步骤概览'

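With the log and output-file tables removed, operational queries go through data_pipeline_task_steps and the v_task_step_overview view, and the two maintenance functions are called directly. A hedged example of driving this from Python, assuming a psycopg2 connection; the connection parameters are placeholders.

import psycopg2

# Placeholder DSN; substitute the real pipeline database settings.
conn = psycopg2.connect("dbname=your_db user=postgres password=secret host=localhost")

with conn, conn.cursor() as cur:
    # Per-step status for one task.
    cur.execute(
        "SELECT task_id, task_status, step_name, step_status "
        "FROM v_task_step_overview WHERE task_id = %s",
        ("task_20250627_143052",),
    )
    for row in cur.fetchall():
        print(row)

    # Mark steps stuck in 'running' for over 2 hours as failed, then purge old tasks.
    cur.execute("SELECT check_zombie_data_pipeline_tasks(2)")
    print("zombie steps handled:", cur.fetchone()[0])
    cur.execute("SELECT cleanup_old_data_pipeline_tasks(30)")
    print("old tasks removed:", cur.fetchone()[0])
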
+ 0 - 15
data_pipeline/training_data/bss_company.ddl

@@ -1,15 +0,0 @@
--- 中文名: 存储高速公路服务区关联企业基本信息
--- 描述: 存储高速公路服务区关联企业基本信息,包含公司名称、编码及操作审计信息,用于管理入驻服务区的合作企业。
-create table public.bss_company (
-  id varchar(32) not null     -- 主键ID,主键,
-  version integer not null    -- 版本号,
-  create_ts timestamp         -- 创建时间,
-  created_by varchar(50)      -- 创建人,
-  update_ts timestamp         -- 更新时间,
-  updated_by varchar(50)      -- 更新人,
-  delete_ts timestamp         -- 删除时间,
-  deleted_by varchar(50)      -- 删除人,
-  company_name varchar(255)   -- 公司名称,
-  company_no varchar(255)     -- 公司编码,
-  primary key (id)
-);

+ 0 - 15
data_pipeline/training_data/bss_company_detail.md

@@ -1,15 +0,0 @@
-## bss_company(存储高速公路服务区关联企业基本信息)
-bss_company 表存储高速公路服务区关联企业基本信息,包含公司名称、编码及操作审计信息,用于管理入驻服务区的合作企业。
-字段列表:
-- id (varchar(32)) - 主键ID [主键, 非空] [示例: 30675d85ba5044c31acfa243b9d16334, 47ed0bb37f5a85f3d9245e4854959b81]
-- version (integer) - 版本号 [非空] [示例: 1, 2]
-- create_ts (timestamp) - 创建时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
-- created_by (varchar(50)) - 创建人 [示例: admin]
-- update_ts (timestamp) - 更新时间 [示例: 2021-05-20 09:51:58.718000, 2021-05-20 09:42:03.341000]
-- updated_by (varchar(50)) - 更新人 [示例: admin]
-- delete_ts (timestamp) - 删除时间
-- deleted_by (varchar(50)) - 删除人
-- company_name (varchar(255)) - 公司名称 [示例: 上饶分公司, 宜春分公司]
-- company_no (varchar(255)) - 公司编码 [示例: H03, H02]
-字段补充说明:
-- id 为主键

+ 0 - 16
data_pipeline/training_data/bss_section_route.ddl

@@ -1,16 +0,0 @@
--- 中文名: 记录路段与路线关联信息及版本变更
--- 描述: 记录路段与路线关联信息及版本变更,支持服务区运营管理(BSS系统核心配置表)
-create table public.bss_section_route (
-  id varchar(32) not null     -- 主键ID,主键,
-  version integer not null    -- 版本号,
-  create_ts timestamp         -- 创建时间,
-  created_by varchar(50)      -- 创建人,
-  update_ts timestamp         -- 更新时间,
-  updated_by varchar(50)      -- 更新人,
-  delete_ts timestamp         -- 删除时间,
-  deleted_by varchar(50)      -- 删除人,
-  section_name varchar(255)   -- 路段名称,
-  route_name varchar(255)     -- 路线名称,
-  code varchar(255)           -- 路段编号,
-  primary key (id)
-);

+ 0 - 7
data_pipeline/training_data/bss_section_route_area_link.ddl

@@ -1,7 +0,0 @@
--- 中文名: 路段路线与服务区关联关系表
--- 描述: 路段路线与服务区关联关系表
-create table public.bss_section_route_area_link (
-  section_route_id varchar(32) not null -- 路段路线ID,主键,
-  service_area_id varchar(32) not null -- 服务区ID,主键,
-  primary key (section_route_id, service_area_id)
-);

+ 0 - 7
data_pipeline/training_data/bss_section_route_area_link_detail.md

@@ -1,7 +0,0 @@
-## bss_section_route_area_link(路段路线与服务区关联关系表)
-bss_section_route_area_link 表路段路线与服务区关联关系表
-字段列表:
-- section_route_id (varchar(32)) - 路段路线ID [主键, 非空] [示例: v8elrsfs5f7lt7jl8a6p87smfzesn3rz, hxzi2iim238e3s1eajjt1enmh9o4h3wp]
-- service_area_id (varchar(32)) - 服务区ID [主键, 非空] [示例: 08e01d7402abd1d6a4d9fdd5df855ef8, 091662311d2c737029445442ff198c4c]
-字段补充说明:
-- 复合主键:section_route_id, service_area_id

+ 0 - 16
data_pipeline/training_data/bss_section_route_detail.md

@@ -1,16 +0,0 @@
-## bss_section_route(记录路段与路线关联信息及版本变更)
-bss_section_route 表记录路段与路线关联信息及版本变更,支持服务区运营管理(BSS系统核心配置表)
-字段列表:
-- id (varchar(32)) - 主键ID [主键, 非空] [示例: 04ri3j67a806uw2c6o6dwdtz4knexczh, 0g5mnefxxtukql2cq6acul7phgskowy7]
-- version (integer) - 版本号 [非空] [示例: 1, 0]
-- create_ts (timestamp) - 创建时间 [示例: 2021-10-29 19:43:50, 2022-03-04 16:07:16]
-- created_by (varchar(50)) - 创建人 [示例: admin]
-- update_ts (timestamp) - 更新时间
-- updated_by (varchar(50)) - 更新人
-- delete_ts (timestamp) - 删除时间
-- deleted_by (varchar(50)) - 删除人
-- section_name (varchar(255)) - 路段名称 [示例: 昌栗, 昌宁]
-- route_name (varchar(255)) - 路线名称 [示例: 昌栗, 昌韶]
-- code (varchar(255)) - 路段编号 [示例: SR0001, SR0002]
-字段补充说明:
-- id 为主键

+ 0 - 19
data_pipeline/training_data/bss_service_area.ddl

@@ -1,19 +0,0 @@
--- 中文名: 存储高速公路服务区基础信息
--- 描述: 存储高速公路服务区基础信息,包含名称、编码及版本控制,记录创建/更新/删除操作轨迹,用于支撑服务区全生命周期管理。
-create table public.bss_service_area (
-  id varchar(32) not null     -- 主键ID,主键,
-  version integer not null    -- 版本号,
-  create_ts timestamp         -- 创建时间,
-  created_by varchar(50)      -- 创建人,
-  update_ts timestamp         -- 更新时间,
-  updated_by varchar(50)      -- 更新人,
-  delete_ts timestamp         -- 删除时间,
-  deleted_by varchar(50)      -- 删除人,
-  service_area_name varchar(255) -- 服务区名称,
-  service_area_no varchar(255) -- 服务区编码,
-  company_id varchar(32)      -- 公司ID,
-  service_position varchar(255) -- 地理位置坐标,
-  service_area_type varchar(50) -- 服务区类型,
-  service_state varchar(50)   -- 服务区状态,
-  primary key (id)
-);

+ 0 - 21
data_pipeline/training_data/bss_service_area_detail.md

@@ -1,21 +0,0 @@
-## bss_service_area(存储高速公路服务区基础信息)
-bss_service_area 表存储高速公路服务区基础信息,包含名称、编码及版本控制,记录创建/更新/删除操作轨迹,用于支撑服务区全生命周期管理。
-字段列表:
-- id (varchar(32)) - 主键ID [主键, 非空] [示例: 0271d68ef93de9684b7ad8c7aae600b6, 08e01d7402abd1d6a4d9fdd5df855ef8]
-- version (integer) - 版本号 [非空] [示例: 3, 6]
-- create_ts (timestamp) - 创建时间 [示例: 2021-05-21 13:26:40.589000, 2021-05-20 19:51:46.314000]
-- created_by (varchar(50)) - 创建人 [示例: admin]
-- update_ts (timestamp) - 更新时间 [示例: 2021-07-10 15:41:28.795000, 2021-07-11 09:33:08.455000]
-- updated_by (varchar(50)) - 更新人 [示例: admin]
-- delete_ts (timestamp) - 删除时间
-- deleted_by (varchar(50)) - 删除人 [示例: ]
-- service_area_name (varchar(255)) - 服务区名称 [示例: 白鹭湖停车区, 南昌南服务区]
-- service_area_no (varchar(255)) - 服务区编码 [示例: H0814, H0105]
-- company_id (varchar(32)) - 公司ID [示例: b1629f07c8d9ac81494fbc1de61f1ea5, ee9bf1180a2b45003f96e597a4b7f15a]
-- service_position (varchar(255)) - 地理位置坐标 [示例: 114.574721,26.825584, 115.910549,28.396355]
-- service_area_type (varchar(50)) - 服务区类型 [示例: 信息化服务区]
-- service_state (varchar(50)) - 服务区状态 [示例: 开放, 关闭]
-字段补充说明:
-- id 为主键
-- service_area_type 为枚举字段,包含取值:信息化服务区、智能化服务区
-- service_state 为枚举字段,包含取值:开放、关闭、上传数据

+ 0 - 18
data_pipeline/training_data/bss_service_area_mapper.ddl

@@ -1,18 +0,0 @@
--- 中文名: 服务区名称与编码映射表
--- 描述: 服务区名称与编码映射表,记录基础信息及变更记录,支撑服务区业务数据关联
-create table public.bss_service_area_mapper (
-  id varchar(32) not null     -- 主键ID,主键,
-  version integer not null    -- 版本号,
-  create_ts timestamp         -- 创建时间,
-  created_by varchar(50)      -- 创建人,
-  update_ts timestamp         -- 更新时间,
-  updated_by varchar(50)      -- 更新人,
-  delete_ts timestamp         -- 删除时间,
-  deleted_by varchar(50)      -- 删除人,
-  service_name varchar(255)   -- 服务区名称,
-  service_no varchar(255)     -- 服务区编码,
-  service_area_id varchar(32) -- 服务区ID,
-  source_system_type varchar(50) -- 数据来源系统类型,
-  source_type integer         -- 数据来源类别ID,
-  primary key (id)
-);

+ 0 - 19
data_pipeline/training_data/bss_service_area_mapper_detail.md

@@ -1,19 +0,0 @@
-## bss_service_area_mapper(服务区名称与编码映射表)
-bss_service_area_mapper 表服务区名称与编码映射表,记录基础信息及变更记录,支撑服务区业务数据关联
-字段列表:
-- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00e1e893909211ed8ee6fa163eaf653f, 013867f5962211ed8ee6fa163eaf653f]
-- version (integer) - 版本号 [非空] [示例: 1]
-- create_ts (timestamp) - 创建时间 [示例: 2023-01-10 10:54:03, 2023-01-17 12:47:29]
-- created_by (varchar(50)) - 创建人 [示例: admin]
-- update_ts (timestamp) - 更新时间 [示例: 2023-01-10 10:54:07, 2023-01-17 12:47:32]
-- updated_by (varchar(50)) - 更新人
-- delete_ts (timestamp) - 删除时间
-- deleted_by (varchar(50)) - 删除人
-- service_name (varchar(255)) - 服务区名称 [示例: 信丰西服务区, 南康北服务区]
-- service_no (varchar(255)) - 服务区编码 [示例: 1067, 1062]
-- service_area_id (varchar(32)) - 服务区ID [示例: 97cd6cd516a551409a4d453a58f9e170, fdbdd042962011ed8ee6fa163eaf653f]
-- source_system_type (varchar(50)) - 数据来源系统类型 [示例: 驿美, 驿购]
-- source_type (integer) - 数据来源类别ID [示例: 3, 1]
-字段补充说明:
-- id 为主键
-- source_system_type 为枚举字段,包含取值:司乘管理、商业管理、驿购、驿美、手工录入

+ 0 - 13
data_pipeline/training_data/db_query_decision_prompt.txt

@@ -1,13 +0,0 @@
-=== 数据库业务范围 ===
-当前数据库存储的是高速公路服务区运营管理的相关数据,主要涉及服务区营业数据、车流统计、企业合作及路段关联,包含以下业务数据:
-核心业务实体:
-- 服务区:记录服务区基础信息及状态,主要字段:service_area_name、service_area_no、service_state
-- 企业:存储服务区关联企业信息,主要字段:company_name、company_no
-- 路段路线:管理路段与路线关联关系,主要字段:section_name、route_name
-- 车辆类型:统计车辆分类日流量,主要字段:car_type、customer_count
-- 支付方式:记录营业数据的支付类型及金额,主要字段:wx、zfb、rmb、xs、jd
-关键业务指标:
-- 营收统计:包含各支付方式金额(wx/zfb/rmb/xs/jd)及订单总数(order_sum)
-- 车流分析:按车辆类型分类的车流量(customer_count)及分布
-- 支付渗透率:各支付方式订单数(wx_order/zf_order/rmb_order)占比
-- 服务区运营状态:开放/关闭状态的服务区数量统计(service_state)

+ 0 - 10
data_pipeline/training_data/filename_mapping.txt

@@ -1,10 +0,0 @@
-# 文件名映射报告
-# 格式: 原始表名 -> 实际文件名
-
-public.bss_business_day_data -> bss_business_day_data_detail.md
-public.bss_car_day_count -> bss_car_day_count_detail.md
-public.bss_company -> bss_company_detail.md
-public.bss_section_route -> bss_section_route_detail.md
-public.bss_section_route_area_link -> bss_section_route_area_link_detail.md
-public.bss_service_area -> bss_service_area_detail.md
-public.bss_service_area_mapper -> bss_service_area_mapper_detail.md

+ 0 - 62
data_pipeline/training_data/metadata.txt

@@ -1,62 +0,0 @@
--- Schema Tools生成的主题元数据
--- 业务背景: 高速公路服务区管理系统
--- 生成时间: 2025-06-27 10:17:45
--- 数据库: highway_db
-
--- 创建表(如果不存在)
-CREATE TABLE IF NOT EXISTS metadata (
-    id SERIAL PRIMARY KEY,    -- 主键
-    topic_name VARCHAR(100) NOT NULL,  -- 业务主题名称
-    description TEXT,                  -- 业务主体说明
-    related_tables TEXT[],			  -- 相关表名
-    biz_entities TEXT[],               -- 主要业务实体名称
-    biz_metrics TEXT[],                -- 主要业务指标名称
-    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP    -- 插入时间
-);
-
--- 插入主题数据
-INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
-(
-  '日营业数据分析',
-  '分析各服务区/档口每日营收、订单量及支付方式占比,评估经营效率与用户支付偏好',
-  'bss_business_day_data,bss_service_area',
-  '服务区,档口,支付方式,日期',
-  '收入趋势,服务区对比,支付方式分布'
-);
-
-INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
-(
-  '车流类型分析',
-  '统计各服务区不同车辆类型日流量分布,为设施配置与交通疏导提供数据支撑',
-  'bss_car_day_count,bss_service_area',
-  '服务区,车辆类型,日期,路段',
-  '车流趋势,车型占比,高峰时段识别'
-);
-
-INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
-(
-  '企业运营对比',
-  '对比不同企业下属服务区的营收能力与车流规模,评估企业运营管理效能',
-  'bss_company,bss_service_area,bss_business_day_data,bss_car_day_count',
-  '企业,服务区,路段,日期',
-  '单车流收益,企业服务区覆盖率,车流转化率'
-);
-
-INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
-(
-  '路段引流效果',
-  '分析不同路段关联服务区的车流与消费数据,评估路段对服务区业务的带动能力',
-  'bss_section_route,bss_section_route_area_link,bss_car_day_count,bss_business_day_data',
-  '路段,路线,服务区,日期',
-  '路段车流贡献度,单车道收益,路段-服务区关联度'
-);
-
-INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
-(
-  '支付方式演化',
-  '追踪支付方式随时间变化趋势及区域差异,指导支付渠道优化与营销策略调整',
-  'bss_business_day_data,bss_service_area,bss_section_route',
-  '支付类型,服务区,路段,季度',
-  '支付渗透率变化,区域支付偏好,新支付方式增长率'
-);
-

+ 0 - 202
data_pipeline/training_data/qs_highway_db_20250627_101745_pair.json

@@ -1,202 +0,0 @@
-[
-  {
-    "question": "统计最近7天各服务区的每日总收入趋势(按日期排序)",
-    "sql": "SELECT oper_date AS 统计日期, service_name AS 服务区名称, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - INTERVAL '7 days' AND delete_ts IS NULL GROUP BY oper_date, service_name ORDER BY oper_date;"
-  },
-  {
-    "question": "查询昨日各档口订单量TOP10(按订单量降序)",
-    "sql": "SELECT branch_name AS 档口名称, SUM(order_sum) AS 订单总量 FROM bss_business_day_data WHERE oper_date = CURRENT_DATE - INTERVAL '1 day' AND delete_ts IS NULL GROUP BY branch_name ORDER BY 订单总量 DESC LIMIT 10;"
-  },
-  {
-    "question": "分析今日各服务区支付宝支付订单占比超过20%的记录",
-    "sql": "SELECT service_name AS 服务区名称, SUM(zf_order) AS 支付宝订单量, SUM(order_sum) AS 总订单量, ROUND(SUM(zf_order)*100.0/SUM(order_sum), 2) AS 支付宝占比 FROM bss_business_day_data WHERE oper_date = CURRENT_DATE AND delete_ts IS NULL GROUP BY service_name HAVING SUM(zf_order)*100.0/SUM(order_sum) > 20;"
-  },
-  {
-    "question": "对比本月与上月各服务区总营收变化率(双月数据对比)",
-    "sql": "SELECT service_name AS 服务区名称, SUM(CASE WHEN EXTRACT(MONTH FROM oper_date) = EXTRACT(MONTH FROM CURRENT_DATE) THEN pay_sum ELSE 0 END) AS 本月营收, SUM(CASE WHEN EXTRACT(MONTH FROM oper_date) = EXTRACT(MONTH FROM CURRENT_DATE - INTERVAL '1 month') THEN pay_sum ELSE 0 END) AS 上月营收, ROUND((SUM(CASE WHEN EXTRACT(MONTH FROM oper_date) = EXTRACT(MONTH FROM CURRENT_DATE) THEN pay_sum ELSE 0 END)/NULLIF(SUM(CASE WHEN EXTRACT(MONTH FROM oper_date) = EXTRACT(MONTH FROM CURRENT_DATE - INTERVAL '1 month') THEN pay_sum ELSE 0 END), 0)-1)*100, 2) AS 环比增长率 FROM bss_business_day_data WHERE oper_date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '2 months') AND delete_ts IS NULL GROUP BY service_name;"
-  },
-  {
-    "question": "查询特定日期(如2023-04-01)各档口微信支付与现金支付金额对比",
-    "sql": "SELECT branch_name AS 档口名称, SUM(wx) AS 微信支付总额, SUM(rmb) AS 现金支付总额 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL GROUP BY branch_name ORDER BY 微信支付总额 DESC;"
-  },
-  {
-    "question": "统计最近30天各公司下属服务区的平均日营收(关联企业信息)",
-    "sql": "SELECT c.company_name AS 企业名称, bsa.service_area_name AS 服务区名称, ROUND(AVG(bbd.pay_sum), 2) AS 平均日营收 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_name = bsa.service_area_name JOIN bss_company c ON bsa.company_id = c.id WHERE bbd.oper_date >= CURRENT_DATE - INTERVAL '30 days' AND bbd.delete_ts IS NULL AND bsa.delete_ts IS NULL GROUP BY c.company_name, bsa.service_area_name;"
-  },
-  {
-    "question": "分析各服务区不同支付方式(微信/支付宝/现金)的订单占比分布",
-    "sql": "SELECT service_name AS 服务区名称, ROUND(SUM(wx_order)*100.0/SUM(order_sum), 2) AS 微信占比, ROUND(SUM(zf_order)*100.0/SUM(order_sum), 2) AS 支付宝占比, ROUND(SUM(rmb_order)*100.0/SUM(order_sum), 2) AS 现金占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name;"
-  },
-  {
-    "question": "查询当前开放状态的服务区及其最近营业日数据完整率(是否存在空数据)",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, MAX(bbd.oper_date) AS 最后营业日, CASE WHEN MAX(bbd.oper_date) >= CURRENT_DATE - INTERVAL '1 day' THEN '数据完整' ELSE '数据缺失' END AS 数据状态 FROM bss_service_area sa LEFT JOIN bss_business_day_data bbd ON sa.service_area_name = bbd.service_name AND bbd.delete_ts IS NULL WHERE sa.service_state = '开放' AND sa.delete_ts IS NULL GROUP BY sa.service_area_name;"
-  },
-  {
-    "question": "统计本周工作日(周一至周五)各时段(早/中/晚)的营收分布",
-    "sql": "SELECT service_name AS 服务区名称, CASE WHEN EXTRACT(HOUR FROM create_ts) BETWEEN 6 AND 11 THEN '上午' WHEN EXTRACT(HOUR FROM create_ts) BETWEEN 12 AND 17 THEN '下午' ELSE '晚上' END AS 营业时段, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE EXTRACT(ISODOW FROM oper_date) <= 5 AND oper_date >= CURRENT_DATE - INTERVAL '7 days' AND delete_ts IS NULL GROUP BY 服务区名称, 营业时段 ORDER BY 服务区名称, 营业时段;"
-  },
-  {
-    "question": "分析不同类型服务区(信息化/智能化)的平均客单价差异",
-    "sql": "SELECT sa.service_area_type AS 服务区类型, ROUND(AVG(bbd.pay_sum / NULLIF(bbd.order_sum, 0)), 2) AS 平均客单价 FROM bss_business_day_data bbd JOIN bss_service_area sa ON bbd.service_name = sa.service_area_name WHERE sa.delete_ts IS NULL AND bbd.delete_ts IS NULL AND bbd.order_sum > 0 GROUP BY sa.service_area_type;"
-  },
-  {
-    "question": "统计2023年4月1日各服务区各车辆类型的总车流量,并按车流量降序排序",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, cc.car_type AS 车辆类型, SUM(cc.customer_count) AS 总车流量 FROM bss_car_day_count cc INNER JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date = '2023-04-01' AND sa.delete_ts IS NULL GROUP BY sa.service_area_name, cc.car_type ORDER BY 总车流量 DESC;"
-  },
-  {
-    "question": "分析2023年3月各车辆类型的日均车流量,找出日均车流量最高的车型",
-    "sql": "SELECT car_type AS 车辆类型, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-03-01' AND '2023-03-31' GROUP BY car_type ORDER BY 日均车流量 DESC LIMIT 1;"
-  },
-  {
-    "question": "查询最近7天危化品车辆流量最高的前3个服务区及其总流量",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, SUM(cc.customer_count) AS 危化品车流量 FROM bss_car_day_count cc INNER JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date >= CURRENT_DATE - 7 AND cc.car_type = '危化品' AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 危化品车流量 DESC LIMIT 3;"
-  },
-  {
-    "question": "计算南昌南服务区各车辆类型占比,并按占比降序排序",
-    "sql": "SELECT car_type AS 车辆类型, SUM(customer_count) * 100.0 / (SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '南昌南服务区')) AS 占比百分比 FROM bss_car_day_count WHERE service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '南昌南服务区') GROUP BY car_type ORDER BY 占比百分比 DESC;"
-  },
-  {
-    "question": "统计2023年每周总车流量趋势,按周环比增长率排序",
-    "sql": "WITH weekly AS (SELECT date_trunc('week', count_date) AS 周, SUM(customer_count) AS 总流量 FROM bss_car_day_count GROUP BY 周) SELECT 周, 总流量, (总流量 - LAG(总流量,1) OVER(ORDER BY 周)) / LAG(总流量,1) OVER(ORDER BY 周)::numeric * 100 AS 环比增长率 FROM weekly ORDER BY 周;"
-  },
-  {
-    "question": "查询信息化与智能化服务区的平均车流量差异",
-    "sql": "SELECT sa.service_area_type AS 服务区类型, AVG(cc.customer_count) AS 平均车流量 FROM bss_car_day_count cc INNER JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE sa.delete_ts IS NULL GROUP BY sa.service_area_type;"
-  },
-  {
-    "question": "找出2023年车流量增长率最高的Top5服务区(同比2022年)",
-    "sql": "WITH yearly_2022 AS (SELECT service_area_id, SUM(customer_count) AS 流量2022 FROM bss_car_day_count WHERE count_date BETWEEN '2022-01-01' AND '2022-12-31' GROUP BY service_area_id), yearly_2023 AS (SELECT service_area_id, SUM(customer_count) AS 流量2023 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY service_area_id) SELECT sa.service_area_name, (y2023.流量2023 - y2022.流量2022)/y2022.流量2022::numeric * 100 AS 增长率 FROM yearly_2022 y2022 INNER JOIN yearly_2023 y2023 ON y2022.service_area_id = y2023.service_area_id INNER JOIN bss_service_area sa ON y2022.service_area_id = sa.id ORDER BY 增长率 DESC LIMIT 5;"
-  },
-  {
-    "question": "查询存在危化品车辆的所有服务区名称及其首次出现日期",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, MIN(cc.count_date) AS 首次出现日期 FROM bss_car_day_count cc INNER JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.car_type = '危化品' AND sa.delete_ts IS NULL GROUP BY sa.service_area_name;"
-  },
-  {
-    "question": "统计各服务区2023年Q1季度月均车流量并按总量降序排序",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, AVG(月流量) AS 月均车流量 FROM (SELECT service_area_id, date_trunc('month', count_date) AS 月份, SUM(customer_count) AS 月流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-03-31' GROUP BY service_area_id, 月份) AS mq INNER JOIN bss_service_area sa ON mq.service_area_id = sa.id GROUP BY sa.service_area_name ORDER BY 月均车流量 DESC;"
-  },
-  {
-    "question": "分析周末与工作日的车流差异(统计2023年4月工作日/周末日均车流量)",
-    "sql": "SELECT CASE WHEN EXTRACT(ISODOW FROM count_date) IN (6,7) THEN '周末' ELSE '工作日' END AS 日期类型, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY 日期类型;"
-  },
-  {
-    "question": "统计2023年6月各企业下属服务区单车流收益TOP5(单车流收益=总营收/总车流量)",
-    "sql": "SELECT c.company_name AS 企业名称, SUM(b.pay_sum)/SUM(car.customer_count) AS 单车流收益 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id AND s.delete_ts IS NULL JOIN bss_business_day_data b ON s.service_area_no = b.service_no AND b.oper_date BETWEEN '2023-06-01' AND '2023-06-30' JOIN bss_car_day_count car ON s.id = car.service_area_id AND car.count_date BETWEEN '2023-06-01' AND '2023-06-30' GROUP BY c.company_name ORDER BY 单车流收益 DESC LIMIT 5;"
-  },
-  {
-    "question": "计算当前有效服务区中车流转化率(订单数/车流量)最低的10个服务区信息",
-    "sql": "SELECT s.service_area_name AS 服务区名称, SUM(b.order_sum)/SUM(car.customer_count) AS 车流转化率 FROM bss_service_area s JOIN bss_business_day_data b ON s.service_area_no = b.service_no JOIN bss_car_day_count car ON s.id = car.service_area_id WHERE s.service_state = '开放' GROUP BY s.service_area_name ORDER BY 车流转化率 ASC LIMIT 10;"
-  },
-  {
-    "question": "对比2023年Q2各企业服务区覆盖率(服务区数量占全局比例)变化趋势",
-    "sql": "WITH company_count AS (SELECT c.company_name, COUNT(s.id) AS cnt FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id GROUP BY c.company_name), total AS (SELECT COUNT(*) AS total_cnt FROM bss_service_area) SELECT company_name, cnt/total_cnt AS 覆盖率, 'Q2' AS 季度 FROM company_count CROSS JOIN total;"
-  },
-  {
-    "question": "分析宜春分公司2023年7月每日车流中过境车辆占比变化趋势",
-    "sql": "SELECT car.count_date AS 统计日期, SUM(CASE WHEN car.car_type='过境' THEN car.customer_count ELSE 0 END)/SUM(car.customer_count) AS 过境占比 FROM bss_service_area s JOIN bss_car_day_count car ON s.id = car.service_area_id WHERE s.company_id = (SELECT id FROM bss_company WHERE company_name = '宜春分公司') AND car.count_date BETWEEN '2023-07-01' AND '2023-07-31' GROUP BY car.count_date ORDER BY 统计日期;"
-  },
-  {
-    "question": "统计连续3天无营收记录的服务区清单及所属企业",
-    "sql": "SELECT DISTINCT s.service_area_name, c.company_name FROM bss_service_area s JOIN bss_company c ON s.company_id = c.id LEFT JOIN bss_business_day_data b ON s.service_area_no = b.service_no AND b.oper_date >= CURRENT_DATE - INTERVAL '3 days' WHERE b.id IS NULL;"
-  },
-  {
-    "question": "对比昌栗路段和昌韶路段所属企业2023年单车流收益差异",
-    "sql": "SELECT sec.route_name AS 路段名称, c.company_name AS 企业名称, SUM(b.pay_sum)/SUM(car.customer_count) AS 单车流收益 FROM bss_section_route sec JOIN bss_section_route_area_link link ON sec.id = link.section_route_id JOIN bss_service_area s ON link.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id JOIN bss_business_day_data b ON s.service_area_no = b.service_no JOIN bss_car_day_count car ON s.id = car.service_area_id WHERE sec.route_name IN ('昌栗','昌韶') GROUP BY sec.route_name, c.company_name;"
-  },
-  {
-    "question": "计算各企业2023年上半年月均营收额和车流量增长率(与2022年同期对比)",
-    "sql": "WITH current_year AS (SELECT c.company_name, EXTRACT(MONTH FROM b.oper_date) AS 月份, SUM(b.pay_sum) AS 营收, SUM(car.customer_count) AS 车流量 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id JOIN bss_business_day_data b ON s.service_area_no = b.service_no JOIN bss_car_day_count car ON s.id = car.service_area_id WHERE EXTRACT(YEAR FROM b.oper_date) = 2023 GROUP BY c.company_name, 月份), last_year AS (SELECT c.company_name, EXTRACT(MONTH FROM b.oper_date) AS 月份, SUM(b.pay_sum) AS 营收_去年, SUM(car.customer_count) AS 车流量_去年 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id JOIN bss_business_day_data b ON s.service_area_no = b.service_no JOIN bss_car_day_count car ON s.id = car.service_area_id WHERE EXTRACT(YEAR FROM b.oper_date) = 2022 GROUP BY c.company_name, 月份) SELECT cy.company_name, cy.月份, (cy.营收/ly.营收_去年-1)*100 AS 营收增长率, (cy.车流量/ly.车流量_去年-1)*100 AS 车量增长率 FROM current_year cy JOIN last_year ly ON cy.company_name = ly.company_name AND cy.月份 = ly.月份;"
-  },
-  {
-    "question": "统计各企业服务区危化品车辆通行量占比TOP3的服务区",
-    "sql": "SELECT c.company_name, s.service_area_name, SUM(CASE WHEN car.car_type='危化品' THEN car.customer_count ELSE 0 END)/SUM(car.customer_count) AS 危化品占比 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id JOIN bss_car_day_count car ON s.id = car.service_area_id GROUP BY c.company_name, s.service_area_name ORDER BY 危化品占比 DESC LIMIT 3;"
-  },
-  {
-    "question": "分析2023年各季度企业新增服务区数量及运营状态分布",
-    "sql": "SELECT c.company_name, DATE_TRUNC('quarter', s.create_ts) AS 季度, COUNT(s.id) AS 新增数量, SUM(CASE WHEN s.service_state='开放' THEN 1 ELSE 0 END) AS 开放数量 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id WHERE EXTRACT(YEAR FROM s.create_ts) = 2023 GROUP BY c.company_name, DATE_TRUNC('quarter', s.create_ts) ORDER BY 季度;"
-  },
-  {
-    "question": "统计连续两月营收环比下降超过10%的企业名单",
-    "sql": "WITH monthly_revenue AS (SELECT c.company_name, DATE_TRUNC('month', b.oper_date) AS 月份, SUM(b.pay_sum) AS 总营收 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id JOIN bss_business_day_data b ON s.service_area_no = b.service_no GROUP BY c.company_name, 月份) SELECT company_name FROM (SELECT company_name, 月份, 总营收 / LAG(总营收) OVER (PARTITION BY company_name ORDER BY 月份) -1 AS 环比变化 FROM monthly_revenue) t WHERE 环比变化 < -0.1 GROUP BY company_name;"
-  },
-  {
-    "question": "分析2023年Q1各路段关联服务区的总车流与消费金额对比,按车流量排序",
-    "sql": "SELECT sr.section_name AS 路段名称, SUM(cd.customer_count) AS 总车流量, SUM(bd.pay_sum) AS 总消费金额 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_car_day_count cd ON link.service_area_id = cd.service_area_id JOIN bss_business_day_data bd ON cd.service_area_id = bd.service_no::varchar AND cd.count_date = bd.oper_date WHERE sr.delete_ts IS NULL AND cd.count_date BETWEEN '2023-01-01' AND '2023-03-31' GROUP BY sr.section_name ORDER BY 总车流量 DESC;"
-  },
-  {
-    "question": "统计最近30天单车道收益最高的前5个服务区,包含路段信息",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, sr.section_name AS 关联路段, ROUND(SUM(bd.pay_sum)/COUNT(DISTINCT sr.id), 2) AS 单车道收益 FROM bss_service_area sa JOIN bss_section_route_area_link link ON sa.id = link.service_area_id JOIN bss_section_route sr ON link.section_route_id = sr.id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE sa.delete_ts IS NULL AND bd.oper_date >= CURRENT_DATE - 30 GROUP BY sa.service_area_name, sr.section_name ORDER BY 单车道收益 DESC LIMIT 5;"
-  },
-  {
-    "question": "计算2023年每月不同车辆类型的平均单车消费金额变化趋势",
-    "sql": "SELECT EXTRACT(MONTH FROM cd.count_date) AS 月份, cd.car_type AS 车辆类型, ROUND(AVG(bd.pay_sum/cd.customer_count), 2) AS 平均单车消费 FROM bss_car_day_count cd JOIN bss_business_day_data bd ON cd.service_area_id = bd.service_no::varchar AND cd.count_date = bd.oper_date WHERE cd.count_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY 月份, cd.car_type ORDER BY 月份;"
-  },
-  {
-    "question": "对比昌栗路段与昌宁路段关联服务区的周末(周六日)车流量差异",
-    "sql": "SELECT sr.section_name AS 路段, CASE WHEN EXTRACT(DOW FROM cd.count_date) IN (0,6) THEN '周末' ELSE '工作日' END AS 日期类型, SUM(cd.customer_count) AS 总车流量 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_car_day_count cd ON link.service_area_id = cd.service_area_id WHERE sr.section_name IN ('昌栗', '昌宁') AND cd.count_date >= CURRENT_DATE - 90 GROUP BY sr.section_name, 日期类型 ORDER BY sr.section_name, 日期类型;"
-  },
-  {
-    "question": "统计各路段关联服务区的微信支付占比,筛选占比超过40%的服务区",
-    "sql": "SELECT sr.section_name AS 路段, sa.service_area_name AS 服务区, ROUND(SUM(bd.wx)/SUM(bd.pay_sum)*100, 2) AS 微信支付占比 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_service_area sa ON link.service_area_id = sa.id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE sr.delete_ts IS NULL GROUP BY sr.section_name, sa.service_area_name HAVING SUM(bd.pay_sum) > 0 AND ROUND(SUM(bd.wx)/SUM(bd.pay_sum)*100, 2) > 40;"
-  },
-  {
-    "question": "分析2023年Q2各路段车流贡献度(车流量/路段长度),需关联路段里程数据(假设code对应SR0001=10km)",
-    "sql": "SELECT section_name AS 路段, code AS 路段编号, SUM(customer_count) AS 总车流量, CASE WHEN code = 'SR0001' THEN 10 WHEN code = 'SR0002' THEN 15 END AS 路段长度, ROUND(SUM(customer_count)::numeric / CASE WHEN code = 'SR0001' THEN 10 WHEN code = 'SR0002' THEN 15 END, 2) AS 车流密度 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_car_day_count cd ON link.service_area_id = cd.service_area_id WHERE cd.count_date BETWEEN '2023-04-01' AND '2023-06-30' GROUP BY section_name, code;"
-  },
-  {
-    "question": "统计各路段-服务区关联度(车流+消费金额的综合评分),按权重5:5计算",
-    "sql": "SELECT sr.section_name AS 路段, sa.service_area_name AS 服务区, ROUND((SUM(cd.customer_count)/MAX(cd.customer_count)*0.5 + SUM(bd.pay_sum)/MAX(bd.pay_sum)*0.5)*100, 2) AS 关联度评分 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_service_area sa ON link.service_area_id = sa.id JOIN bss_car_day_count cd ON sa.id = cd.service_area_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no GROUP BY sr.section_name, sa.service_area_name ORDER BY 关联度评分 DESC;"
-  },
-  {
-    "question": "分析2023年每月不同路段的档口订单密度(订单总数/档口数量)",
-    "sql": "SELECT EXTRACT(MONTH FROM oper_date) AS 月份, sr.section_name AS 路段, SUM(bd.order_sum) AS 总订单数, COUNT(DISTINCT bd.branch_no) AS 档口数量, ROUND(SUM(bd.order_sum)/COUNT(DISTINCT bd.branch_no), 2) AS 订单密度 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_service_area sa ON link.service_area_id = sa.id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY 月份, sr.section_name ORDER BY 月份;"
-  },
-  {
-    "question": "找出2023年车流增长率最高的服务区(对比2022年同期数据)",
-    "sql": "WITH prev_year AS (SELECT service_area_id, SUM(customer_count) AS 流量2022 FROM bss_car_day_count WHERE count_date BETWEEN '2022-01-01' AND '2022-12-31' GROUP BY service_area_id), curr_year AS (SELECT service_area_id, SUM(customer_count) AS 流量2023 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY service_area_id) SELECT sa.service_area_name, ROUND((curr_year.流量2023/prev_year.流量2022-1)*100, 2) AS 增长率 FROM prev_year JOIN curr_year ON prev_year.service_area_id = curr_year.service_area_id JOIN bss_service_area sa ON sa.id = prev_year.service_area_id ORDER BY 增长率 DESC LIMIT 1;"
-  },
-  {
-    "question": "分析不同公司管辖路段的单车收益对比(按公司统计)",
-    "sql": "SELECT co.company_name AS 管辖公司, ROUND(SUM(bd.pay_sum)/SUM(cd.customer_count), 2) AS 单车收益 FROM bss_company co JOIN bss_service_area sa ON co.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no JOIN bss_section_route_area_link link ON sa.id = link.service_area_id JOIN bss_section_route sr ON link.section_route_id = sr.id JOIN bss_car_day_count cd ON sa.id = cd.service_area_id WHERE co.delete_ts IS NULL GROUP BY co.company_name ORDER BY 单车收益 DESC;"
-  },
-  {
-    "question": "各季度不同支付方式的渗透率变化趋势如何?",
-    "sql": "SELECT DATE_TRUNC('quarter', oper_date) AS 季度, '微信支付' AS 支付方式, SUM(wx_order)/SUM(order_sum) AS 渗透率 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 季度 UNION ALL SELECT DATE_TRUNC('quarter', oper_date) AS 季度, '支付宝' AS 支付方式, SUM(zf_order)/SUM(order_sum) AS 渗透率 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 季度;"
-  },
-  {
-    "question": "各路段微信支付金额占比前五名是哪些?",
-    "sql": "SELECT s.section_name AS 路段, SUM(bd.wx)/SUM(bd.pay_sum) AS 微信占比 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no JOIN bss_section_route_area_link link ON sa.id = link.service_area_id JOIN bss_section_route s ON link.section_route_id = s.id WHERE bd.delete_ts IS NULL GROUP BY s.section_name ORDER BY 微信占比 DESC LIMIT 5;"
-  },
-  {
-    "question": "2023年Q2新开通服务区的现金支付占比分布情况?",
-    "sql": "SELECT sa.service_area_name AS 服务区, bd.rmb_order/bd.order_sum AS 现金占比 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE bd.oper_date BETWEEN '2023-04-01' AND '2023-06-30' AND sa.create_ts BETWEEN '2023-04-01' AND '2023-06-30' AND bd.delete_ts IS NULL;"
-  },
-  {
-    "question": "行吧支付近三个月订单量增长率最高的三个服务区?",
-    "sql": "WITH cur AS (SELECT service_no, SUM(xs_order) AS cnt FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 90 GROUP BY service_no), pre AS (SELECT service_no, SUM(xs_order) AS cnt FROM bss_business_day_data WHERE oper_date BETWEEN CURRENT_DATE - 180 AND CURRENT_DATE - 91 GROUP BY service_no) SELECT cur.service_no AS 服务区, (cur.cnt - pre.cnt)/pre.cnt AS 增长率 FROM cur JOIN pre ON cur.service_no = pre.service_no ORDER BY 增长率 DESC LIMIT 3;"
-  },
-  {
-    "question": "各支付方式在不同车辆类型的使用偏好对比?",
-    "sql": "SELECT car_type AS 车辆类型, SUM(wx)/SUM(pay_sum) AS 微信占比, SUM(zfb)/SUM(pay_sum) AS 支付宝占比, SUM(rmb)/SUM(pay_sum) AS 现金占比 FROM bss_business_day_data bd JOIN bss_car_day_count c ON bd.oper_date = c.count_date WHERE bd.delete_ts IS NULL GROUP BY car_type;"
-  },
-  {
-    "question": "2023年各月现金支付订单占比变化趋势图?",
-    "sql": "SELECT DATE_TRUNC('month', oper_date) AS 月份, SUM(rmb_order)/SUM(order_sum) AS 现金占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-12-31' AND delete_ts IS NULL GROUP BY 月份 ORDER BY 月份;"
-  },
-  {
-    "question": "各公司管辖服务区微信支付渗透率对比分析?",
-    "sql": "SELECT com.company_name AS 公司, SUM(bd.wx_order)/SUM(bd.order_sum) AS 微信渗透率 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no JOIN bss_company com ON sa.company_id = com.id WHERE bd.delete_ts IS NULL GROUP BY com.company_name;"
-  },
-  {
-    "question": "使用金豆支付的订单数量季度环比增长情况?",
-    "sql": "WITH qtr AS (SELECT DATE_TRUNC('quarter', oper_date) AS q, SUM(jd_order) AS cnt FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY q) SELECT q, cnt, (cnt - LAG(cnt,1,cnt) OVER(ORDER BY q))/LAG(cnt,1,cnt) OVER(ORDER BY q) AS 环比增长率 FROM qtr WHERE q <= CURRENT_DATE ORDER BY q;"
-  },
-  {
-    "question": "行吧支付订单占比超过10%的服务区清单?",
-    "sql": "SELECT service_name AS 服务区 FROM (SELECT service_no, service_name, SUM(xs_order)/SUM(order_sum) AS xs_ratio FROM bss_business_day_data GROUP BY service_no, service_name) t WHERE xs_ratio > 0.1 AND service_name IS NOT NULL;"
-  },
-  {
-    "question": "各路段支付宝支付金额的季度同比变化率?",
-    "sql": "WITH qtr_sum AS (SELECT DATE_TRUNC('quarter', oper_date) AS q, s.section_name AS 路段, SUM(zfb) AS amt FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no JOIN bss_section_route_area_link link ON sa.id = link.service_area_id JOIN bss_section_route s ON link.section_route_id = s.id WHERE bd.delete_ts IS NULL GROUP BY q, 路段) SELECT q, 路段, amt/(LAG(amt,4,amt) OVER(PARTITION BY 路段 ORDER BY q)) -1 AS 同比增长率 FROM qtr_sum ORDER BY q, 路段;"
-  }
-]

+ 0 - 202
data_pipeline/training_data/qs_highway_db_20250627_101745_pair.json.backup

@@ -1,202 +0,0 @@
-[
-  {
-    "question": "统计最近7天各服务区的每日总收入趋势(按日期排序)",
-    "sql": "SELECT oper_date AS 统计日期, service_name AS 服务区名称, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - INTERVAL '7 days' AND delete_ts IS NULL GROUP BY oper_date, service_name ORDER BY oper_date;"
-  },
-  {
-    "question": "查询昨日各档口订单量TOP10(按订单量降序)",
-    "sql": "SELECT branch_name AS 档口名称, SUM(order_sum) AS 订单总量 FROM bss_business_day_data WHERE oper_date = CURRENT_DATE - INTERVAL '1 day' AND delete_ts IS NULL GROUP BY branch_name ORDER BY 订单总量 DESC LIMIT 10;"
-  },
-  {
-    "question": "分析今日各服务区支付宝支付订单占比超过20%的记录",
-    "sql": "SELECT service_name AS 服务区名称, SUM(zf_order) AS 支付宝订单量, SUM(order_sum) AS 总订单量, ROUND(SUM(zf_order)*100.0/SUM(order_sum), 2) AS 支付宝占比 FROM bss_business_day_data WHERE oper_date = CURRENT_DATE AND delete_ts IS NULL GROUP BY service_name HAVING SUM(zf_order)*100.0/SUM(order_sum) > 20;"
-  },
-  {
-    "question": "对比本月与上月各服务区总营收变化率(双月数据对比)",
-    "sql": "SELECT service_name AS 服务区名称, SUM(CASE WHEN EXTRACT(MONTH FROM oper_date) = EXTRACT(MONTH FROM CURRENT_DATE) THEN pay_sum ELSE 0 END) AS 本月营收, SUM(CASE WHEN EXTRACT(MONTH FROM oper_date) = EXTRACT(MONTH FROM CURRENT_DATE - INTERVAL '1 month') THEN pay_sum ELSE 0 END) AS 上月营收, ROUND((SUM(CASE WHEN EXTRACT(MONTH FROM oper_date) = EXTRACT(MONTH FROM CURRENT_DATE) THEN pay_sum ELSE 0 END)/NULLIF(SUM(CASE WHEN EXTRACT(MONTH FROM oper_date) = EXTRACT(MONTH FROM CURRENT_DATE - INTERVAL '1 month') THEN pay_sum ELSE 0 END), 0)-1)*100, 2) AS 环比增长率 FROM bss_business_day_data WHERE oper_date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '2 months') AND delete_ts IS NULL GROUP BY service_name;"
-  },
-  {
-    "question": "查询特定日期(如2023-04-01)各档口微信支付与现金支付金额对比",
-    "sql": "SELECT branch_name AS 档口名称, SUM(wx) AS 微信支付总额, SUM(rmb) AS 现金支付总额 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL GROUP BY branch_name ORDER BY 微信支付总额 DESC;"
-  },
-  {
-    "question": "统计最近30天各公司下属服务区的平均日营收(关联企业信息)",
-    "sql": "SELECT c.company_name AS 企业名称, bsa.service_area_name AS 服务区名称, ROUND(AVG(bbd.pay_sum), 2) AS 平均日营收 FROM bss_business_day_data bbd JOIN bss_service_area bsa ON bbd.service_name = bsa.service_area_name JOIN bss_company c ON bsa.company_id = c.id WHERE bbd.oper_date >= CURRENT_DATE - INTERVAL '30 days' AND bbd.delete_ts IS NULL AND bsa.delete_ts IS NULL GROUP BY c.company_name, bsa.service_area_name;"
-  },
-  {
-    "question": "分析各服务区不同支付方式(微信/支付宝/现金)的订单占比分布",
-    "sql": "SELECT service_name AS 服务区名称, ROUND(SUM(wx_order)*100.0/SUM(order_sum), 2) AS 微信占比, ROUND(SUM(zf_order)*100.0/SUM(order_sum), 2) AS 支付宝占比, ROUND(SUM(rmb_order)*100.0/SUM(order_sum), 2) AS 现金占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name;"
-  },
-  {
-    "question": "查询当前开放状态的服务区及其最近营业日数据完整率(是否存在空数据)",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, MAX(bbd.oper_date) AS 最后营业日, CASE WHEN MAX(bbd.oper_date) >= CURRENT_DATE - INTERVAL '1 day' THEN '数据完整' ELSE '数据缺失' END AS 数据状态 FROM bss_service_area sa LEFT JOIN bss_business_day_data bbd ON sa.service_area_name = bbd.service_name AND bbd.delete_ts IS NULL WHERE sa.service_state = '开放' AND sa.delete_ts IS NULL GROUP BY sa.service_area_name;"
-  },
-  {
-    "question": "统计本周工作日(周一至周五)各时段(早/中/晚)的营收分布",
-    "sql": "SELECT service_name AS 服务区名称, CASE WHEN EXTRACT(HOUR FROM create_ts) BETWEEN 6 AND 11 THEN '上午' WHEN EXTRACT(HOUR FROM create_ts) BETWEEN 12 AND 17 THEN '下午' ELSE '晚上' END AS 营业时段, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE EXTRACT(ISODOW FROM oper_date) <= 5 AND oper_date >= CURRENT_DATE - INTERVAL '7 days' AND delete_ts IS NULL GROUP BY 服务区名称, 营业时段 ORDER BY 服务区名称, 营业时段;"
-  },
-  {
-    "question": "分析不同类型服务区(信息化/智能化)的平均客单价差异",
-    "sql": "SELECT sa.service_area_type AS 服务区类型, ROUND(AVG(bbd.pay_sum / NULLIF(bbd.order_sum, 0)), 2) AS 平均客单价 FROM bss_business_day_data bbd JOIN bss_service_area sa ON bbd.service_name = sa.service_area_name WHERE sa.delete_ts IS NULL AND bbd.delete_ts IS NULL AND bbd.order_sum > 0 GROUP BY sa.service_area_type;"
-  },
-  {
-    "question": "统计2023年4月1日各服务区各车辆类型的总车流量,并按车流量降序排序",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, cc.car_type AS 车辆类型, SUM(cc.customer_count) AS 总车流量 FROM bss_car_day_count cc INNER JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date = '2023-04-01' AND sa.delete_ts IS NULL GROUP BY sa.service_area_name, cc.car_type ORDER BY 总车流量 DESC;"
-  },
-  {
-    "question": "分析2023年3月各车辆类型的日均车流量,找出日均车流量最高的车型",
-    "sql": "SELECT car_type AS 车辆类型, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-03-01' AND '2023-03-31' GROUP BY car_type ORDER BY 日均车流量 DESC LIMIT 1;"
-  },
-  {
-    "question": "查询最近7天危化品车辆流量最高的前3个服务区及其总流量",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, SUM(cc.customer_count) AS 危化品车流量 FROM bss_car_day_count cc INNER JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.count_date >= CURRENT_DATE - 7 AND cc.car_type = '危化品' AND sa.delete_ts IS NULL GROUP BY sa.service_area_name ORDER BY 危化品车流量 DESC LIMIT 3;"
-  },
-  {
-    "question": "计算南昌南服务区各车辆类型占比,并按占比降序排序",
-    "sql": "SELECT car_type AS 车辆类型, SUM(customer_count) * 100.0 / (SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '南昌南服务区')) AS 占比百分比 FROM bss_car_day_count WHERE service_area_id = (SELECT id FROM bss_service_area WHERE service_area_name = '南昌南服务区') GROUP BY car_type ORDER BY 占比百分比 DESC;"
-  },
-  {
-    "question": "统计2023年每周总车流量趋势,按周环比增长率排序",
-    "sql": "WITH weekly AS (SELECT date_trunc('week', count_date) AS 周, SUM(customer_count) AS 总流量 FROM bss_car_day_count GROUP BY 周) SELECT 周, 总流量, (总流量 - LAG(总流量,1) OVER(ORDER BY 周)) / LAG(总流量,1) OVER(ORDER BY 周)::numeric * 100 AS 环比增长率 FROM weekly ORDER BY 周;"
-  },
-  {
-    "question": "查询信息化与智能化服务区的平均车流量差异",
-    "sql": "SELECT sa.service_area_type AS 服务区类型, AVG(cc.customer_count) AS 平均车流量 FROM bss_car_day_count cc INNER JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE sa.delete_ts IS NULL GROUP BY sa.service_area_type;"
-  },
-  {
-    "question": "找出2023年车流量增长率最高的Top5服务区(同比2022年)",
-    "sql": "WITH yearly_2022 AS (SELECT service_area_id, SUM(customer_count) AS 流量2022 FROM bss_car_day_count WHERE count_date BETWEEN '2022-01-01' AND '2022-12-31' GROUP BY service_area_id), yearly_2023 AS (SELECT service_area_id, SUM(customer_count) AS 流量2023 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY service_area_id) SELECT sa.service_area_name, (y2023.流量2023 - y2022.流量2022)/y2022.流量2022::numeric * 100 AS 增长率 FROM yearly_2022 y2022 INNER JOIN yearly_2023 y2023 ON y2022.service_area_id = y2023.service_area_id INNER JOIN bss_service_area sa ON y2022.service_area_id = sa.id ORDER BY 增长率 DESC LIMIT 5;"
-  },
-  {
-    "question": "查询存在危化品车辆的所有服务区名称及其首次出现日期",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, MIN(cc.count_date) AS 首次出现日期 FROM bss_car_day_count cc INNER JOIN bss_service_area sa ON cc.service_area_id = sa.id WHERE cc.car_type = '危化品' AND sa.delete_ts IS NULL GROUP BY sa.service_area_name;"
-  },
-  {
-    "question": "统计各服务区2023年Q1季度月均车流量并按总量降序排序",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, AVG(月流量) AS 月均车流量 FROM (SELECT service_area_id, date_trunc('month', count_date) AS 月份, SUM(customer_count) AS 月流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-03-31' GROUP BY service_area_id, 月份) AS mq INNER JOIN bss_service_area sa ON mq.service_area_id = sa.id GROUP BY sa.service_area_name ORDER BY 月均车流量 DESC;"
-  },
-  {
-    "question": "分析周末与工作日的车流差异(统计2023年4月工作日/周末日均车流量)",
-    "sql": "SELECT CASE WHEN EXTRACT(ISODOW FROM count_date) IN (6,7) THEN '周末' ELSE '工作日' END AS 日期类型, AVG(customer_count) AS 日均车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY 日期类型;"
-  },
-  {
-    "question": "统计2023年6月各企业下属服务区单车流收益TOP5(单车流收益=总营收/总车流量)",
-    "sql": "SELECT c.company_name AS 企业名称, SUM(b.pay_sum)/SUM(car.customer_count) AS 单车流收益 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id AND s.delete_ts IS NULL JOIN bss_business_day_data b ON s.service_area_no = b.service_no AND b.oper_date BETWEEN '2023-06-01' AND '2023-06-30' JOIN bss_car_day_count car ON s.id = car.service_area_id AND car.count_date BETWEEN '2023-06-01' AND '2023-06-30' GROUP BY c.company_name ORDER BY 单车流收益 DESC LIMIT 5;"
-  },
-  {
-    "question": "计算当前有效服务区中车流转化率(订单数/车流量)最低的10个服务区信息",
-    "sql": "SELECT s.service_area_name AS 服务区名称, SUM(b.order_sum)/SUM(car.customer_count) AS 车流转化率 FROM bss_service_area s JOIN bss_business_day_data b ON s.service_area_no = b.service_no JOIN bss_car_day_count car ON s.id = car.service_area_id WHERE s.service_state = '开放' GROUP BY s.service_area_name ORDER BY 车流转化率 ASC LIMIT 10;"
-  },
-  {
-    "question": "对比2023年Q2各企业服务区覆盖率(服务区数量占全局比例)变化趋势",
-    "sql": "WITH company_count AS (SELECT c.company_name, COUNT(s.id) AS cnt FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id GROUP BY c.company_name), total AS (SELECT COUNT(*) AS total_cnt FROM bss_service_area) SELECT company_name, cnt/total_cnt AS 覆盖率, 'Q2' AS 季度 FROM company_count CROSS JOIN total;"
-  },
-  {
-    "question": "分析宜春分公司2023年7月每日车流中过境车辆占比变化趋势",
-    "sql": "SELECT car.count_date AS 统计日期, SUM(CASE WHEN car.car_type='过境' THEN car.customer_count ELSE 0 END)/SUM(car.customer_count) AS 过境占比 FROM bss_service_area s JOIN bss_car_day_count car ON s.id = car.service_area_id WHERE s.company_id = (SELECT id FROM bss_company WHERE company_name = '宜春分公司') AND car.count_date BETWEEN '2023-07-01' AND '2023-07-31' GROUP BY car.count_date ORDER BY 统计日期;"
-  },
-  {
-    "question": "统计连续3天无营收记录的服务区清单及所属企业",
-    "sql": "SELECT DISTINCT s.service_area_name, c.company_name FROM bss_service_area s JOIN bss_company c ON s.company_id = c.id LEFT JOIN bss_business_day_data b ON s.service_area_no = b.service_no AND b.oper_date >= CURRENT_DATE - INTERVAL '3 days' WHERE b.id IS NULL;"
-  },
-  {
-    "question": "对比昌栗路段和昌韶路段所属企业2023年单车流收益差异",
-    "sql": "SELECT sec.route_name AS 路段名称, c.company_name AS 企业名称, SUM(b.pay_sum)/SUM(car.customer_count) AS 单车流收益 FROM bss_section_route sec JOIN bss_section_route_area_link link ON sec.id = link.section_route_id JOIN bss_service_area s ON link.service_area_id = s.id JOIN bss_company c ON s.company_id = c.id JOIN bss_business_day_data b ON s.service_area_no = b.service_no JOIN bss_car_day_count car ON s.id = car.service_area_id WHERE sec.route_name IN ('昌栗','昌韶') GROUP BY sec.route_name, c.company_name;"
-  },
-  {
-    "question": "计算各企业2023年上半年月均营收额和车流量增长率(与2022年同期对比)",
-    "sql": "WITH current_year AS (SELECT c.company_name, EXTRACT(MONTH FROM b.oper_date) AS 月份, SUM(b.pay_sum) AS 营收, SUM(car.customer_count) AS 车流量 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id JOIN bss_business_day_data b ON s.service_area_no = b.service_no JOIN bss_car_day_count car ON s.id = car.service_area_id WHERE EXTRACT(YEAR FROM b.oper_date) = 2023 GROUP BY c.company_name, 月份), last_year AS (SELECT c.company_name, EXTRACT(MONTH FROM b.oper_date) AS 月份, SUM(b.pay_sum) AS 营收_去年, SUM(car.customer_count) AS 车流量_去年 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id JOIN bss_business_day_data b ON s.service_area_no = b.service_no JOIN bss_car_day_count car ON s.id = car.service_area_id WHERE EXTRACT(YEAR FROM b.oper_date) = 2022 GROUP BY c.company_name, 月份) SELECT cy.company_name, cy.月份, (cy.营收/ly.营收_去年-1)*100 AS 营收增长率, (cy.车流量/ly.车流量_去年-1)*100 AS 车量增长率 FROM current_year cy JOIN last_year ly ON cy.company_name = ly.company_name AND cy.月份 = ly.月份;"
-  },
-  {
-    "question": "统计各企业服务区危化品车辆通行量占比TOP3的服务区",
-    "sql": "SELECT c.company_name, s.service_area_name, SUM(CASE WHEN car.car_type='危化品' THEN car.customer_count ELSE 0 END)/SUM(car.customer_count) AS 危化品占比 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id JOIN bss_car_day_count car ON s.id = car.service_area_id GROUP BY c.company_name, s.service_area_name ORDER BY 危化品占比 DESC LIMIT 3;"
-  },
-  {
-    "question": "分析2023年各季度企业新增服务区数量及运营状态分布",
-    "sql": "SELECT c.company_name, DATE_TRUNC('quarter', s.create_ts) AS 季度, COUNT(s.id) AS 新增数量, SUM(CASE WHEN s.service_state='开放' THEN 1 ELSE 0 END) AS 开放数量 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id WHERE EXTRACT(YEAR FROM s.create_ts) = 2023 GROUP BY c.company_name, DATE_TRUNC('quarter', s.create_ts) ORDER BY 季度;"
-  },
-  {
-    "question": "统计连续两月营收环比下降超过10%的企业名单",
-    "sql": "WITH monthly_revenue AS (SELECT c.company_name, DATE_TRUNC('month', b.oper_date) AS 月份, SUM(b.pay_sum) AS 总营收 FROM bss_company c JOIN bss_service_area s ON c.id = s.company_id JOIN bss_business_day_data b ON s.service_area_no = b.service_no GROUP BY c.company_name, 月份) SELECT company_name FROM (SELECT company_name, 月份, 总营收 / LAG(总营收) OVER (PARTITION BY company_name ORDER BY 月份) -1 AS 环比变化 FROM monthly_revenue) t WHERE 环比变化 < -0.1 GROUP BY company_name;"
-  },
-  {
-    "question": "分析2023年Q1各路段关联服务区的总车流与消费金额对比,按车流量排序",
-    "sql": "SELECT sr.section_name AS 路段名称, SUM(cd.customer_count) AS 总车流量, SUM(bd.pay_sum) AS 总消费金额 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_car_day_count cd ON link.service_area_id = cd.service_area_id JOIN bss_business_day_data bd ON cd.service_area_id = bd.service_no::varchar AND cd.count_date = bd.oper_date WHERE sr.delete_ts IS NULL AND cd.count_date BETWEEN '2023-01-01' AND '2023-03-31' GROUP BY sr.section_name ORDER BY 总车流量 DESC;"
-  },
-  {
-    "question": "统计最近30天单车道收益最高的前5个服务区,包含路段信息",
-    "sql": "SELECT sa.service_area_name AS 服务区名称, sr.section_name AS 关联路段, ROUND(SUM(bd.pay_sum)/COUNT(DISTINCT sr.id), 2) AS 单车道收益 FROM bss_service_area sa JOIN bss_section_route_area_link link ON sa.id = link.service_area_id JOIN bss_section_route sr ON link.section_route_id = sr.id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE sa.delete_ts IS NULL AND bd.oper_date >= CURRENT_DATE - 30 GROUP BY sa.service_area_name, sr.section_name ORDER BY 单车道收益 DESC LIMIT 5;"
-  },
-  {
-    "question": "计算2023年每月不同车辆类型的平均单车消费金额变化趋势",
-    "sql": "SELECT EXTRACT(MONTH FROM cd.count_date) AS 月份, cd.car_type AS 车辆类型, ROUND(AVG(bd.pay_sum/cd.customer_count), 2) AS 平均单车消费 FROM bss_car_day_count cd JOIN bss_business_day_data bd ON cd.service_area_id = bd.service_no::varchar AND cd.count_date = bd.oper_date WHERE cd.count_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY 月份, cd.car_type ORDER BY 月份;"
-  },
-  {
-    "question": "对比昌栗路段与昌宁路段关联服务区的周末(周六日)车流量差异",
-    "sql": "SELECT sr.section_name AS 路段, CASE WHEN EXTRACT(DOW FROM cd.count_date) IN (0,6) THEN '周末' ELSE '工作日' END AS 日期类型, SUM(cd.customer_count) AS 总车流量 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_car_day_count cd ON link.service_area_id = cd.service_area_id WHERE sr.section_name IN ('昌栗', '昌宁') AND cd.count_date >= CURRENT_DATE - 90 GROUP BY sr.section_name, 日期类型 ORDER BY sr.section_name, 日期类型;"
-  },
-  {
-    "question": "统计各路段关联服务区的微信支付占比,筛选占比超过40%的服务区",
-    "sql": "SELECT sr.section_name AS 路段, sa.service_area_name AS 服务区, ROUND(SUM(bd.wx)/SUM(bd.pay_sum)*100, 2) AS 微信支付占比 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_service_area sa ON link.service_area_id = sa.id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE sr.delete_ts IS NULL GROUP BY sr.section_name, sa.service_area_name HAVING SUM(bd.pay_sum) > 0 AND ROUND(SUM(bd.wx)/SUM(bd.pay_sum)*100, 2) > 40;"
-  },
-  {
-    "question": "分析2023年Q2各路段车流贡献度(车流量/路段长度),需关联路段里程数据(假设code对应SR0001=10km)",
-    "sql": "SELECT section_name AS 路段, code AS 路段编号, SUM(customer_count) AS 总车流量, CASE WHEN code = 'SR0001' THEN 10 WHEN code = 'SR0002' THEN 15 END AS 路段长度, ROUND(SUM(customer_count)::numeric / CASE WHEN code = 'SR0001' THEN 10 WHEN code = 'SR0002' THEN 15 END, 2) AS 车流密度 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_car_day_count cd ON link.service_area_id = cd.service_area_id WHERE cd.count_date BETWEEN '2023-04-01' AND '2023-06-30' GROUP BY section_name, code;"
-  },
-  {
-    "question": "统计各路段-服务区关联度(车流+消费金额的综合评分),按权重5:5计算",
-    "sql": "SELECT sr.section_name AS 路段, sa.service_area_name AS 服务区, ROUND((SUM(cd.customer_count)/MAX(cd.customer_count)*0.5 + SUM(bd.pay_sum)/MAX(bd.pay_sum)*0.5)*100, 2) AS 关联度评分 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_service_area sa ON link.service_area_id = sa.id JOIN bss_car_day_count cd ON sa.id = cd.service_area_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no GROUP BY sr.section_name, sa.service_area_name ORDER BY 关联度评分 DESC;"
-  },
-  {
-    "question": "分析2023年每月不同路段的档口订单密度(订单总数/档口数量)",
-    "sql": "SELECT EXTRACT(MONTH FROM oper_date) AS 月份, sr.section_name AS 路段, SUM(bd.order_sum) AS 总订单数, COUNT(DISTINCT bd.branch_no) AS 档口数量, ROUND(SUM(bd.order_sum)/COUNT(DISTINCT bd.branch_no), 2) AS 订单密度 FROM bss_section_route sr JOIN bss_section_route_area_link link ON sr.id = link.section_route_id JOIN bss_service_area sa ON link.service_area_id = sa.id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no WHERE bd.oper_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY 月份, sr.section_name ORDER BY 月份;"
-  },
-  {
-    "question": "找出2023年车流增长率最高的服务区(对比2022年同期数据)",
-    "sql": "WITH prev_year AS (SELECT service_area_id, SUM(customer_count) AS 流量2022 FROM bss_car_day_count WHERE count_date BETWEEN '2022-01-01' AND '2022-12-31' GROUP BY service_area_id), curr_year AS (SELECT service_area_id, SUM(customer_count) AS 流量2023 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-12-31' GROUP BY service_area_id) SELECT sa.service_area_name, ROUND((curr_year.流量2023/prev_year.流量2022-1)*100, 2) AS 增长率 FROM prev_year JOIN curr_year ON prev_year.service_area_id = curr_year.service_area_id JOIN bss_service_area sa ON sa.id = prev_year.service_area_id ORDER BY 增长率 DESC LIMIT 1;"
-  },
-  {
-    "question": "分析不同公司管辖路段的单车收益对比(按公司统计)",
-    "sql": "SELECT co.company_name AS 管辖公司, ROUND(SUM(bd.pay_sum)/SUM(cd.customer_count), 2) AS 单车收益 FROM bss_company co JOIN bss_service_area sa ON co.id = sa.company_id JOIN bss_business_day_data bd ON sa.service_area_no = bd.service_no JOIN bss_section_route_area_link link ON sa.id = link.service_area_id JOIN bss_section_route sr ON link.section_route_id = sr.id JOIN bss_car_day_count cd ON sa.id = cd.service_area_id WHERE co.delete_ts IS NULL GROUP BY co.company_name ORDER BY 单车收益 DESC;"
-  },
-  {
-    "question": "各季度不同支付方式的渗透率变化趋势如何?",
-    "sql": "SELECT DATE_TRUNC('quarter', oper_date) AS 季度, '微信支付' AS 支付方式, SUM(wx_order)/SUM(order_sum) AS 渗透率 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 季度 UNION ALL SELECT DATE_TRUNC('quarter', oper_date), '支付宝', SUM(zf_order)/SUM(order_sum) FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 季度;"
-  },
-  {
-    "question": "各路段微信支付金额占比前五名是哪些?",
-    "sql": "SELECT s.section_name AS 路段, SUM(bd.wx)/SUM(bd.pay_sum) AS 微信占比 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no JOIN bss_section_route_area_link link ON sa.id = link.service_area_id JOIN bss_section_route s ON link.section_route_id = s.id WHERE bd.delete_ts IS NULL GROUP BY s.section_name ORDER BY 微信占比 DESC LIMIT 5;"
-  },
-  {
-    "question": "2023年Q2新开通服务区的现金支付占比分布情况?",
-    "sql": "SELECT sa.service_area_name AS 服务区, bd.rmb_order/bd.order_sum AS 现金占比 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no WHERE bd.oper_date BETWEEN '2023-04-01' AND '2023-06-30' AND sa.create_ts BETWEEN '2023-04-01' AND '2023-06-30' AND bd.delete_ts IS NULL;"
-  },
-  {
-    "question": "行吧支付近三个月订单量增长率最高的三个服务区?",
-    "sql": "WITH cur AS (SELECT service_no, SUM(xs_order) AS cnt FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 90 GROUP BY service_no), pre AS (SELECT service_no, SUM(xs_order) AS cnt FROM bss_business_day_data WHERE oper_date BETWEEN CURRENT_DATE - 180 AND CURRENT_DATE - 91 GROUP BY service_no) SELECT cur.service_no AS 服务区, (cur.cnt - pre.cnt)/pre.cnt AS 增长率 FROM cur JOIN pre ON cur.service_no = pre.service_no ORDER BY 增长率 DESC LIMIT 3;"
-  },
-  {
-    "question": "各支付方式在不同车辆类型的使用偏好对比?",
-    "sql": "SELECT car_type AS 车辆类型, SUM(wx)/SUM(pay_sum) AS 微信占比, SUM(zf)/SUM(pay_sum) AS 支付宝占比, SUM(rmb)/SUM(pay_sum) AS 现金占比 FROM bss_business_day_data bd JOIN bss_car_day_count c ON bd.oper_date = c.count_date WHERE bd.delete_ts IS NULL GROUP BY car_type;"
-  },
-  {
-    "question": "2023年各月现金支付订单占比变化趋势图?",
-    "sql": "SELECT DATE_TRUNC('month', oper_date) AS 月份, SUM(rmb_order)/SUM(order_sum) AS 现金占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-12-31' AND delete_ts IS NULL GROUP BY 月份 ORDER BY 月份;"
-  },
-  {
-    "question": "各公司管辖服务区微信支付渗透率对比分析?",
-    "sql": "SELECT com.company_name AS 公司, SUM(bd.wx_order)/SUM(bd.order_sum) AS 微信渗透率 FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no JOIN bss_company com ON sa.company_id = com.id WHERE bd.delete_ts IS NULL GROUP BY com.company_name;"
-  },
-  {
-    "question": "使用金豆支付的订单数量季度环比增长情况?",
-    "sql": "WITH qtr AS (SELECT DATE_TRUNC('quarter', oper_date) AS q, SUM(jd_order) AS cnt FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY q) SELECT q, cnt, (cnt - LAG(cnt,1,cnt) OVER(ORDER BY q))/LAG(cnt,1,cnt) OVER(ORDER BY q) AS 环比增长率 FROM qtr WHERE q <= CURRENT_DATE ORDER BY q;"
-  },
-  {
-    "question": "行吧支付订单占比超过10%的服务区清单?",
-    "sql": "SELECT service_name AS 服务区 FROM (SELECT service_no, service_name, SUM(xs_order)/SUM(order_sum) AS xs_ratio FROM bss_business_day_data GROUP BY service_no, service_name) t WHERE xs_ratio > 0.1 AND service_name IS NOT NULL;"
-  },
-  {
-    "question": "各路段支付宝支付金额的季度同比变化率?",
-    "sql": "WITH qtr_sum AS (SELECT DATE_TRUNC('quarter', oper_date) AS q, s.section_name AS 路段, SUM(zfb) AS amt FROM bss_business_day_data bd JOIN bss_service_area sa ON bd.service_no = sa.service_area_no JOIN bss_section_route_area_link link ON sa.id = link.service_area_id JOIN bss_section_route s ON link.section_route_id = s.id WHERE bd.delete_ts IS NULL GROUP BY q, 路段) SELECT q, 路段, amt/(LAG(amt,4,amt) OVER(PARTITION BY 路段 ORDER BY q)) -1 AS 同比增长率 FROM qtr_sum ORDER BY q, 路段;"
-  }
-]

+ 31 - 0
data_pipeline/training_data/task_20250701_212426/bss_business_day_data.ddl

@@ -0,0 +1,31 @@
+-- 中文名: 高速公路服务区每日经营数据记录表
+-- 描述: 高速公路服务区每日经营数据记录表,存储交易流水、运营统计及状态变更信息,支撑业务分析与运营管理。
+create table public.bss_business_day_data (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  oper_date date              -- 统计日期,
+  service_no varchar(255)     -- 服务区编码,
+  service_name varchar(255)   -- 服务区名称,
+  branch_no varchar(255)      -- 档口编码,
+  branch_name varchar(255)    -- 档口名称,
+  wx numeric(19,4)            -- 微信支付金额,
+  wx_order integer            -- 微信订单数量,
+  zfb numeric(19,4)           -- 支付宝支付金额,
+  zf_order integer            -- 支付宝订单数量,
+  rmb numeric(19,4)           -- 现金支付金额,
+  rmb_order integer           -- 现金订单数量,
+  xs numeric(19,4)            -- 行吧支付金额,
+  xs_order integer            -- 行吧订单数量,
+  jd numeric(19,4)            -- 金豆支付金额,
+  jd_order integer            -- 金豆订单数量,
+  order_sum integer           -- 订单总数,
+  pay_sum numeric(19,4)       -- 支付总金额,
+  source_type integer         -- 数据来源类别,
+  primary key (id)
+);

+ 32 - 0
data_pipeline/training_data/task_20250701_212426/bss_business_day_data_detail.md

@@ -0,0 +1,32 @@
+## bss_business_day_data(高速公路服务区每日经营数据记录表)
+bss_business_day_data 表高速公路服务区每日经营数据记录表,存储交易流水、运营统计及状态变更信息,支撑业务分析与运营管理。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- created_by (varchar(50)) - 创建人 [示例: xingba]
+- update_ts (timestamp) - 更新时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- oper_date (date) - 统计日期 [示例: 2023-04-01]
+- service_no (varchar(255)) - 服务区编码 [示例: 1028, H0501]
+- service_name (varchar(255)) - 服务区名称 [示例: 宜春服务区, 庐山服务区]
+- branch_no (varchar(255)) - 档口编码 [示例: 1, H05016]
+- branch_name (varchar(255)) - 档口名称 [示例: 宜春南区, 庐山鲜徕客东区]
+- wx (numeric(19,4)) - 微信支付金额 [示例: 4790.0000, 2523.0000]
+- wx_order (integer) - 微信订单数量 [示例: 253, 133]
+- zfb (numeric(19,4)) - 支付宝支付金额 [示例: 229.0000, 0.0000]
+- zf_order (integer) - 支付宝订单数量 [示例: 15, 0]
+- rmb (numeric(19,4)) - 现金支付金额 [示例: 1058.5000, 124.0000]
+- rmb_order (integer) - 现金订单数量 [示例: 56, 12]
+- xs (numeric(19,4)) - 行吧支付金额 [示例: 0.0000, 40.0000]
+- xs_order (integer) - 行吧订单数量 [示例: 0, 1]
+- jd (numeric(19,4)) - 金豆支付金额 [示例: 0.0000]
+- jd_order (integer) - 金豆订单数量 [示例: 0]
+- order_sum (integer) - 订单总数 [示例: 324, 146]
+- pay_sum (numeric(19,4)) - 支付总金额 [示例: 6077.5000, 2687.0000]
+- source_type (integer) - 数据来源类别 [示例: 1, 0, 4]
+字段补充说明:
+- id 为主键
+- source_type 为枚举字段,包含取值:0、4、1、2、3

+ 17 - 0
data_pipeline/training_data/task_20250701_212426/bss_car_day_count.ddl

@@ -0,0 +1,17 @@
+-- 中文名: 记录高速公路服务区每日车辆类型及数量统计
+-- 描述: 记录高速公路服务区每日车辆类型及数量统计,用于车流分析与资源调配
+create table public.bss_car_day_count (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人ID,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人ID,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人ID,
+  customer_count bigint       -- 车辆数量,
+  car_type varchar(100)       -- 车辆类别,
+  count_date date             -- 统计日期,
+  service_area_id varchar(32) -- 服务区ID,
+  primary key (id)
+);

+ 18 - 0
data_pipeline/training_data/task_20250701_212426/bss_car_day_count_detail.md

@@ -0,0 +1,18 @@
+## bss_car_day_count(记录高速公路服务区每日车辆类型及数量统计)
+bss_car_day_count 表记录高速公路服务区每日车辆类型及数量统计,用于车流分析与资源调配
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- created_by (varchar(50)) - 创建人ID
+- update_ts (timestamp) - 更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- updated_by (varchar(50)) - 更新人ID
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人ID
+- customer_count (bigint) - 车辆数量 [示例: 1114, 295]
+- car_type (varchar(100)) - 车辆类别 [示例: 其他]
+- count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
+- service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
+字段补充说明:
+- id 为主键
+- car_type 为枚举字段,包含取值:其他、危化品、城际、过境

+ 5 - 0
data_pipeline/training_data/task_20250701_212426/filename_mapping.txt

@@ -0,0 +1,5 @@
+# 文件名映射报告
+# 格式: 原始表名 -> 实际文件名
+
+public.bss_business_day_data -> bss_business_day_data_detail.md
+public.bss_car_day_count -> bss_car_day_count_detail.md

+ 96 - 0
data_pipeline/training_data/task_20250701_212426/qs_intermediate_20250701_212921.json

@@ -0,0 +1,96 @@
+[
+  {
+    "theme": "支付方式分析",
+    "timestamp": "2025-07-01T21:30:17.494692",
+    "questions_count": 10,
+    "questions": [
+      {
+        "question": "各支付方式的总使用频率及金额占比是多少?",
+        "sql": "SELECT '微信' AS 支付方式, SUM(wx_order) AS 总订单数, SUM(wx) AS 总金额, SUM(wx)/SUM(pay_sum) AS 金额占比 FROM bss_business_day_data WHERE delete_ts IS NULL UNION ALL SELECT '支付宝', SUM(zf_order), SUM(zfb), SUM(zfb)/SUM(pay_sum) FROM bss_business_day_data WHERE delete_ts IS NULL UNION ALL SELECT '现金', SUM(rmb_order), SUM(rmb), SUM(rmb)/SUM(pay_sum) FROM bss_business_day_data WHERE delete_ts IS NULL;"
+      },
+      {
+        "question": "近7天各服务区微信支付订单数对比(按日期排序取前5)?",
+        "sql": "SELECT service_name AS 服务区名称, oper_date AS 统计日期, SUM(wx_order) AS 微信订单数 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - 7 GROUP BY service_name, oper_date ORDER BY oper_date DESC, 微信订单数 DESC LIMIT 5;"
+      },
+      {
+        "question": "现金支付占比超过30%的档口TOP10有哪些?",
+        "sql": "SELECT branch_name AS 档口名称, SUM(rmb)/SUM(pay_sum) AS 现金占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name HAVING SUM(rmb)/SUM(pay_sum) > 0.3 ORDER BY 现金占比 DESC LIMIT 10;"
+      },
+      {
+        "question": "2023年Q2各支付方式订单数量月度趋势如何?",
+        "sql": "SELECT EXTRACT(MONTH FROM oper_date) AS 月份, SUM(wx_order) AS 微信订单数, SUM(zf_order) AS 支付宝订单数, SUM(rmb_order) AS 现金订单数 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date BETWEEN '2023-04-01' AND '2023-06-30' GROUP BY 月份 ORDER BY 月份;"
+      },
+      {
+        "question": "支付宝支付金额超过5000元的服务区有哪些?",
+        "sql": "SELECT service_name AS 服务区名称, SUM(zfb) AS 支付宝总金额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name HAVING SUM(zfb) > 5000 ORDER BY 支付宝总金额 DESC;"
+      },
+      {
+        "question": "各档口平均订单金额最高的支付方式是什么?",
+        "sql": "SELECT branch_name AS 档口名称, '微信' AS 支付方式, AVG(wx/wx_order) AS 平均订单金额 FROM bss_business_day_data WHERE delete_ts IS NULL AND wx_order > 0 UNION ALL SELECT branch_name, '支付宝', AVG(zfb/zf_order) FROM bss_business_day_data WHERE delete_ts IS NULL AND zf_order > 0 UNION ALL SELECT branch_name, '现金', AVG(rmb/rmb_order) FROM bss_business_day_data WHERE delete_ts IS NULL AND rmb_order > 0 ORDER BY 档口名称, 平均订单金额 DESC;"
+      },
+      {
+        "question": "微信支付总金额排名前十的服务区是哪些?",
+        "sql": "SELECT service_name AS 服务区名称, SUM(wx) AS 微信总金额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name ORDER BY 微信总金额 DESC LIMIT 10;"
+      },
+      {
+        "question": "现金支付占比季度环比增长率如何变化?",
+        "sql": "SELECT EXTRACT(QUARTER FROM oper_date) AS 季度, SUM(rmb)/SUM(pay_sum) AS 现金占比, (SUM(rmb)/SUM(pay_sum) - LAG(SUM(rmb)/SUM(pay_sum),1) OVER(ORDER BY EXTRACT(QUARTER FROM oper_date)))/LAG(SUM(rmb)/SUM(pay_sum),1) OVER(ORDER BY EXTRACT(QUARTER FROM oper_date)) AS 环比增长率 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 季度 ORDER BY 季度;"
+      },
+      {
+        "question": "2023年国庆黄金周期间各支付方式使用情况对比?",
+        "sql": "SELECT oper_date AS 日期, SUM(wx_order) AS 微信订单数, SUM(zf_order) AS 支付宝订单数, SUM(rmb_order) AS 现金订单数, SUM(wx) AS 微信金额, SUM(zfb) AS 支付宝金额, SUM(rmb) AS 现金金额 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date BETWEEN '2023-10-01' AND '2023-10-07' GROUP BY 日期 ORDER BY 日期;"
+      },
+      {
+        "question": "各服务区支付效率(订单数/支付金额)TOP5是哪些?",
+        "sql": "SELECT service_name AS 服务区名称, SUM(order_sum)/SUM(pay_sum) AS 支付效率 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name ORDER BY 支付效率 DESC LIMIT 5;"
+      }
+    ]
+  },
+  {
+    "theme": "车流时段分析",
+    "timestamp": "2025-07-01T21:31:43.792772",
+    "questions_count": 10,
+    "questions": [
+      {
+        "question": "统计最近一周每日车流量变化趋势,识别高峰日期",
+        "sql": "SELECT count_date AS 统计日期, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY count_date ORDER BY count_date DESC LIMIT 7;"
+      },
+      {
+        "question": "分析各车辆类型在不同服务区的分布占比",
+        "sql": "SELECT car_type AS 车辆类型, service_area_id AS 服务区ID, SUM(customer_count) AS 总数量, ROUND(SUM(customer_count)*100/(SELECT SUM(customer_count) FROM bss_car_day_count WHERE delete_ts IS NULL),2) AS 占比百分比 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY car_type, service_area_id ORDER BY service_area_id, 占比百分比 DESC;"
+      },
+      {
+        "question": "找出过去一个月车流量最高的前5个服务区",
+        "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE count_date >= CURRENT_DATE - INTERVAL '1 month' AND delete_ts IS NULL GROUP BY service_area_id ORDER BY 总车流量 DESC LIMIT 5;"
+      },
+      {
+        "question": "查询2023-04-01当天各服务区危化品车辆具体数量",
+        "sql": "SELECT service_area_id AS 服务区ID, customer_count AS 危化品车数量 FROM bss_car_day_count WHERE count_date = '2023-04-01' AND car_type = '危化品' AND delete_ts IS NULL ORDER BY 危化品车数量 DESC;"
+      },
+      {
+        "question": "分析城际车辆日环比增长趋势",
+        "sql": "WITH daily AS (SELECT count_date, SUM(customer_count) AS cnt FROM bss_car_day_count WHERE car_type = '城际' AND delete_ts IS NULL GROUP BY count_date) SELECT count_date, cnt, (cnt - LAG(cnt,1) OVER(ORDER BY count_date))/LAG(cnt,1) OVER(ORDER BY count_date)*100 AS 环比增长率 FROM daily ORDER BY count_date;"
+      },
+      {
+        "question": "统计各服务区过境车辆月平均流量",
+        "sql": "SELECT service_area_id AS 服务区ID, AVG(customer_count) AS 日均过境车辆数 FROM bss_car_day_count WHERE car_type = '过境' AND delete_ts IS NULL GROUP BY service_area_id, DATE_TRUNC('month', count_date) ORDER BY 服务区ID;"
+      },
+      {
+        "question": "识别近7天车流最集中的3个时段(按小时粒度)",
+        "sql": "SELECT EXTRACT(HOUR FROM create_ts) AS 小时时段, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE count_date >= CURRENT_DATE - INTERVAL '7 days' AND delete_ts IS NULL GROUP BY 小时时段 ORDER BY 总车流量 DESC LIMIT 3;"
+      },
+      {
+        "question": "分析不同车辆类型季度同比变化",
+        "sql": "SELECT DATE_TRUNC('quarter', count_date) AS 季度, car_type AS 车辆类型, SUM(customer_count) AS 总数量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY 季度, car_type ORDER BY 季度, 总数量 DESC;"
+      },
+      {
+        "question": "查询单日车流量超过5000辆的服务区记录",
+        "sql": "SELECT count_date AS 统计日期, service_area_id AS 服务区ID, customer_count AS 车流量 FROM bss_car_day_count WHERE customer_count > 5000 AND delete_ts IS NULL ORDER BY 车流量 DESC;"
+      },
+      {
+        "question": "分析指定时间段内各服务区车辆类型占比分布",
+        "sql": "SELECT service_area_id AS 服务区ID, car_type AS 车辆类型, SUM(customer_count) AS 总数量, ROUND(SUM(customer_count) * 100 / SUM(SUM(customer_count)) OVER(PARTITION BY service_area_id), 2) AS 类型占比百分比 FROM bss_car_day_count WHERE count_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY 服务区ID, 车辆类型 ORDER BY 服务区ID, 类型占比百分比 DESC;"
+      }
+    ]
+  }
+]

+ 14 - 0
data_pipeline/training_data/task_20250701_212426/task_config.json

@@ -0,0 +1,14 @@
+{
+  "task_id": "task_20250701_212426",
+  "created_at": "2025-07-01T13:24:25.700551",
+  "parameters": {
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:6432/highway_db",
+    "table_list_file": "data_pipeline/tables.txt",
+    "business_context": "高速公路服务区管理系统",
+    "enable_llm_repair": true,
+    "modify_original_file": true,
+    "enable_sql_validation": true,
+    "enable_training_data_load": true
+  },
+  "output_directory": "data_pipeline\\training_data\\task_20250701_212426"
+}

+ 3 - 3
data_pipeline/training_data/bss_business_day_data.ddl → data_pipeline/training_data/task_20250701_213434/bss_business_day_data.ddl

@@ -1,7 +1,7 @@
--- 中文名: 业务支撑系统中的服务区营业日数据表
--- 描述: 业务支撑系统中的服务区营业日数据表,记录各服务区每日营业统计信息(交易/服务等),支持运营分析与管理
+-- 中文名: 存储各服务区每日业务数据统计信息
+-- 描述: 存储各服务区每日业务数据统计信息,支持运营分析与决策
 create table public.bss_business_day_data (
-  id varchar(32) not null     -- 主键标识,主键,
+  id varchar(32) not null     -- 主键ID,主键,
   version integer not null    -- 版本号,
   create_ts timestamp         -- 创建时间,
   created_by varchar(50)      -- 创建人,

+ 31 - 0
data_pipeline/training_data/task_20250701_213434/bss_business_day_data_1.ddl

@@ -0,0 +1,31 @@
+-- 中文名: 高速公路服务区每日业务统计数据表
+-- 描述: 高速公路服务区每日业务统计数据表,支持运营分析与监控
+create table public.bss_business_day_data (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  oper_date date              -- 统计日期,
+  service_no varchar(255)     -- 服务区编码,
+  service_name varchar(255)   -- 服务区名称,
+  branch_no varchar(255)      -- 档口编码,
+  branch_name varchar(255)    -- 档口名称,
+  wx numeric(19,4)            -- 微信支付金额,
+  wx_order integer            -- 微信订单数量,
+  zfb numeric(19,4)           -- 支付宝支付金额,
+  zf_order integer            -- 支付宝订单数量,
+  rmb numeric(19,4)           -- 现金支付金额,
+  rmb_order integer           -- 现金订单数量,
+  xs numeric(19,4)            -- 行吧支付金额,
+  xs_order integer            -- 行吧支付数量,
+  jd numeric(19,4)            -- 金豆支付金额,
+  jd_order integer            -- 金豆支付数量,
+  order_sum integer           -- 订单总数,
+  pay_sum numeric(19,4)       -- 支付总金额,
+  source_type integer         -- 数据来源类别,
+  primary key (id)
+);

+ 3 - 3
data_pipeline/training_data/bss_business_day_data_detail.md → data_pipeline/training_data/task_20250701_213434/bss_business_day_data_detail.md

@@ -1,7 +1,7 @@
-## bss_business_day_data(业务支撑系统中的服务区营业日数据表
-bss_business_day_data 表业务支撑系统中的服务区营业日数据表,记录各服务区每日营业统计信息(交易/服务等),支持运营分析与管理
+## bss_business_day_data(存储各服务区每日业务数据统计信息)
+bss_business_day_data 表存储各服务区每日业务数据统计信息,支持运营分析与决策
 字段列表:
-- id (varchar(32)) - 主键标识 [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
 - version (integer) - 版本号 [非空] [示例: 1]
 - create_ts (timestamp) - 创建时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
 - created_by (varchar(50)) - 创建人 [示例: xingba]

+ 32 - 0
data_pipeline/training_data/task_20250701_213434/bss_business_day_data_detail_1.md

@@ -0,0 +1,32 @@
+## bss_business_day_data(高速公路服务区每日业务统计数据表)
+bss_business_day_data 表高速公路服务区每日业务统计数据表,支持运营分析与监控
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- created_by (varchar(50)) - 创建人 [示例: xingba]
+- update_ts (timestamp) - 更新时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- oper_date (date) - 统计日期 [示例: 2023-04-01]
+- service_no (varchar(255)) - 服务区编码 [示例: 1028, H0501]
+- service_name (varchar(255)) - 服务区名称 [示例: 宜春服务区, 庐山服务区]
+- branch_no (varchar(255)) - 档口编码 [示例: 1, H05016]
+- branch_name (varchar(255)) - 档口名称 [示例: 宜春南区, 庐山鲜徕客东区]
+- wx (numeric(19,4)) - 微信支付金额 [示例: 4790.0000, 2523.0000]
+- wx_order (integer) - 微信订单数量 [示例: 253, 133]
+- zfb (numeric(19,4)) - 支付宝支付金额 [示例: 229.0000, 0.0000]
+- zf_order (integer) - 支付宝订单数量 [示例: 15, 0]
+- rmb (numeric(19,4)) - 现金支付金额 [示例: 1058.5000, 124.0000]
+- rmb_order (integer) - 现金订单数量 [示例: 56, 12]
+- xs (numeric(19,4)) - 行吧支付金额 [示例: 0.0000, 40.0000]
+- xs_order (integer) - 行吧支付数量 [示例: 0, 1]
+- jd (numeric(19,4)) - 金豆支付金额 [示例: 0.0000]
+- jd_order (integer) - 金豆支付数量 [示例: 0]
+- order_sum (integer) - 订单总数 [示例: 324, 146]
+- pay_sum (numeric(19,4)) - 支付总金额 [示例: 6077.5000, 2687.0000]
+- source_type (integer) - 数据来源类别 [示例: 1, 0, 4]
+字段补充说明:
+- id 为主键
+- source_type 为枚举字段,包含取值:0、4、1、2、3

+ 17 - 0
data_pipeline/training_data/task_20250701_213434/bss_car_day_count.ddl

@@ -0,0 +1,17 @@
+-- 中文名: 服务区每日车辆类型数量统计表
+-- 描述: 服务区每日车辆类型数量统计表,用于实时流量分析与资源调度优化。
+create table public.bss_car_day_count (
+  id varchar(32) not null     -- 主键标识符,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人ID,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人ID,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人ID,
+  customer_count bigint       -- 车辆数量,
+  car_type varchar(100)       -- 车辆类别,
+  count_date date             -- 统计日期,
+  service_area_id varchar(32) -- 服务区ID,
+  primary key (id)
+);

+ 5 - 5
data_pipeline/training_data/bss_car_day_count.ddl → data_pipeline/training_data/task_20250701_213434/bss_car_day_count_1.ddl

@@ -1,16 +1,16 @@
--- 中文名: 服务区车辆类型日统计表
--- 描述: 服务区车辆类型日统计表,记录每日车流数量及分类数据,用于交通流量分析与服务资源调度
+-- 中文名: 服务区每日车辆统计记录表
+-- 描述: 服务区每日车辆统计记录表,包含车辆类型及数量统计,用于运营分析与管理
 create table public.bss_car_day_count (
   id varchar(32) not null     -- 主键ID,主键,
   version integer not null    -- 版本号,
   create_ts timestamp         -- 创建时间,
   created_by varchar(50)      -- 创建人,
-  update_ts timestamp         -- 更新时间,
-  updated_by varchar(50)      -- 更新人,
+  update_ts timestamp         -- 最后更新时间,
+  updated_by varchar(50)      -- 最后更新人,
   delete_ts timestamp         -- 删除时间,
   deleted_by varchar(50)      -- 删除人,
   customer_count bigint       -- 车辆数量,
-  car_type varchar(100)       -- 车辆类型,
+  car_type varchar(100)       -- 车辆类别,
   count_date date             -- 统计日期,
   service_area_id varchar(32) -- 服务区ID,
   primary key (id)

+ 18 - 0
data_pipeline/training_data/task_20250701_213434/bss_car_day_count_detail.md

@@ -0,0 +1,18 @@
+## bss_car_day_count(服务区每日车辆类型数量统计表)
+bss_car_day_count 表服务区每日车辆类型数量统计表,用于实时流量分析与资源调度优化。
+字段列表:
+- id (varchar(32)) - 主键标识符 [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- created_by (varchar(50)) - 创建人ID
+- update_ts (timestamp) - 更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- updated_by (varchar(50)) - 更新人ID
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人ID
+- customer_count (bigint) - 车辆数量 [示例: 1114, 295]
+- car_type (varchar(100)) - 车辆类别 [示例: 其他]
+- count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
+- service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
+字段补充说明:
+- id 为主键
+- car_type 为枚举字段,包含取值:其他、危化品、城际、过境

+ 18 - 0
data_pipeline/training_data/task_20250701_213434/bss_car_day_count_detail_1.md

@@ -0,0 +1,18 @@
+## bss_car_day_count(服务区每日车辆统计记录表)
+bss_car_day_count 表服务区每日车辆统计记录表,包含车辆类型及数量统计,用于运营分析与管理。
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- created_by (varchar(50)) - 创建人
+- update_ts (timestamp) - 最后更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
+- updated_by (varchar(50)) - 最后更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- customer_count (bigint) - 车辆数量 [示例: 1114, 295]
+- car_type (varchar(100)) - 车辆类别 [示例: 其他]
+- count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
+- service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
+字段补充说明:
+- id 为主键
+- car_type 为枚举字段,包含取值:其他、危化品、城际、过境

+ 11 - 0
data_pipeline/training_data/task_20250701_213434/db_query_decision_prompt.txt

@@ -0,0 +1,11 @@
+=== 数据库业务范围 ===
+当前数据库存储的是高速公路服务区运营管理的相关数据,主要涉及服务区营业流水、车辆流量统计及支付渠道分析,包含以下业务数据:
+核心业务实体:
+- 服务区:服务区基础信息及地理位置标识,主要字段:service_no、service_name、service_area_id
+- 档口:服务区内部经营单元信息,主要字段:branch_no、branch_name
+- 车辆类型:车辆分类统计维度,主要字段:car_type(枚举值:其他/危化品/城际/过境)
+- 统计日期:业务数据观测时间维度,主要字段:oper_date、count_date
+关键业务指标:
+- 支付渠道分析:各支付方式交易额(微信/支付宝/现金/行吧/金豆)及订单量对比
+- 营业趋势监测:日订单总量、日支付总额、各档口营收分布
+- 车流特征分析:各类型车辆日通行量、服务区车流规模分布

+ 7 - 0
data_pipeline/training_data/task_20250701_213434/ddl_generation_result.json

@@ -0,0 +1,7 @@
+{
+  "total_tables": 2,
+  "processed_successfully": 2,
+  "failed": 0,
+  "files_generated": 4,
+  "duration": 129.3695569038391
+}

+ 5 - 0
data_pipeline/training_data/task_20250701_213434/filename_mapping.txt

@@ -0,0 +1,5 @@
+# 文件名映射报告
+# 格式: 原始表名 -> 实际文件名
+
+public.bss_business_day_data -> bss_business_day_data_detail_1.md
+public.bss_car_day_count -> bss_car_day_count_detail_1.md

+ 62 - 0
data_pipeline/training_data/task_20250701_213434/metadata.txt

@@ -0,0 +1,62 @@
+-- Schema Tools生成的主题元数据
+-- 业务背景: 高速公路服务区管理系统
+-- 生成时间: 2025-07-01 21:44:31
+-- 数据库: highway_db
+
+-- 创建表(如果不存在)
+CREATE TABLE IF NOT EXISTS metadata (
+    id SERIAL PRIMARY KEY,    -- 主键
+    topic_name VARCHAR(100) NOT NULL,  -- 业务主题名称
+    description TEXT,                  -- 业务主体说明
+    related_tables TEXT[],			  -- 相关表名
+    biz_entities TEXT[],               -- 主要业务实体名称
+    biz_metrics TEXT[],                -- 主要业务指标名称
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP    -- 插入时间
+);
+
+-- 插入主题数据
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '支付方式分析',
+  '分析不同支付方式(微信、支付宝、现金等)在各服务区的使用占比及交易金额分布',
+  'bss_business_day_data',
+  '服务区,支付类型,档口',
+  '支付占比,交易金额分布,订单量对比'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '车流高峰分析',
+  '基于车辆类型统计表,识别各服务区在不同日期的车辆高峰时段及车型分布特征',
+  'bss_car_day_count',
+  '服务区,车辆类型,统计日期',
+  '高峰时段识别,车型占比分析,车流量趋势'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '营收对比分析',
+  '对比不同服务区在相同时间段内的日均营收及订单量差异,识别头部和尾部区域',
+  'bss_business_day_data',
+  '服务区,档口,统计周期',
+  '日均营收对比,订单量排名,营收增长率'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '档口效能评估',
+  '分析各档口单位面积营收能力及客单价水平,评估空间资源利用效率',
+  'bss_business_day_data',
+  '档口,服务区,支付渠道',
+  '单位面积收益,客单价分析,坪效排名'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '车型消费关联',
+  '结合车辆类型数据和消费数据,分析不同类型车辆停留期间的消费行为特征',
+  'bss_car_day_count,bss_business_day_data',
+  '车辆类型,服务区,消费时段',
+  '人均消费分析,消费频次关联,车型消费转化率'
+);
+

+ 3 - 3
data_pipeline/training_data/metadata_detail.md → data_pipeline/training_data/task_20250701_213434/metadata_detail.md

@@ -7,9 +7,9 @@
 - `id` (serial) - 主键ID [主键, 非空]
 - `topic_name` (varchar(100)) - 业务主题名称 [非空]
 - `description` (text) - 业务主题说明
-- `related_tables` (text[]) - 涉及的数据表 [示例: bss_section_route_area_link, bss_business_day_data]
-- `biz_entities` (text[]) - 主要业务实体名称 [示例: 路段, 企业, 日期]
-- `biz_metrics` (text[]) - 主要业务指标名称 [示例: 车流转化率, 支付渗透率变化, 单车道收益]
+- `related_tables` (text[]) - 涉及的数据表 [示例: bss_car_day_count, bss_business_day_data]
+- `biz_entities` (text[]) - 主要业务实体名称 [示例: 统计周期, 支付渠道, 支付类型]
+- `biz_metrics` (text[]) - 主要业务指标名称 [示例: 单位面积收益, 消费频次关联, 订单量对比]
 - `created_at` (timestamp) - 插入时间 [默认值: `CURRENT_TIMESTAMP`]
 
 字段补充说明:

+ 202 - 0
data_pipeline/training_data/task_20250701_213434/qs_highway_db_20250701_214431_pair.json

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "统计2023年4月各服务区微信支付占比,并按占比降序排列",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx)/SUM(pay_sum)*100 AS 微信占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "对比2023年第一季度各档口支付宝订单量TOP10",
+    "sql": "SELECT branch_name AS 档口名称, SUM(zf_order) AS 支付宝订单量 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY branch_name ORDER BY 支付宝订单量 DESC LIMIT 10;"
+  },
+  {
+    "question": "筛选现金支付金额占比超过20%的服务区(2023年数据)",
+    "sql": "SELECT service_name AS 服务区名称, SUM(rmb)/SUM(pay_sum)*100 AS 现金占比 FROM bss_business_day_data WHERE oper_date >= '2023-01-01' AND delete_ts IS NULL GROUP BY service_name HAVING SUM(rmb)/SUM(pay_sum) > 0.2;"
+  },
+  {
+    "question": "分析2023年Q1各支付类型月均交易金额分布",
+    "sql": "SELECT DATE_TRUNC('month', oper_date) AS 月份, AVG(wx) AS 平均微信支付, AVG(zfb) AS 平均支付宝支付, AVG(rmb) AS 平均现金支付 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY 月份 ORDER BY 月份;"
+  },
+  {
+    "question": "找出微信支付订单量最低的5个服务区(2023年数据)",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx_order) AS 微信订单量 FROM bss_business_day_data WHERE oper_date >= '2023-01-01' AND delete_ts IS NULL GROUP BY service_name ORDER BY 微信订单量 ASC LIMIT 5;"
+  },
+  {
+    "question": "统计各档口行吧支付使用情况并按金额排序TOP5",
+    "sql": "SELECT branch_name AS 档口名称, SUM(xs) AS 行吧支付总额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 行吧支付总额 DESC LIMIT 5;"
+  },
+  {
+    "question": "计算2023年4月各支付类型的平均订单金额",
+    "sql": "SELECT SUM(wx)/SUM(wx_order) AS 微信单均金额, SUM(zfb)/SUM(zf_order) AS 支付宝单均金额, SUM(rmb)/SUM(rmb_order) AS 现金单均金额 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "分析服务区各支付方式订单量是否超过全量平均值",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx_order) > (SELECT AVG(wx_order) FROM bss_business_day_data WHERE delete_ts IS NULL) AS 微信超均值, SUM(zf_order) > (SELECT AVG(zf_order) FROM bss_business_day_data WHERE delete_ts IS NULL) AS 支付宝超均值 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "统计2023年4月每日微信支付金额趋势",
+    "sql": "SELECT oper_date AS 日期, SUM(wx) AS 微信支付总额 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY oper_date ORDER BY 日期;"
+  },
+  {
+    "question": "分析金豆支付使用情况并找出TOP1服务区",
+    "sql": "SELECT service_name AS 服务区名称, SUM(jd) AS 金豆支付总额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name ORDER BY 金豆支付总额 DESC LIMIT 1;"
+  },
+  {
+    "question": "统计各服务区2023年4月1日当天车辆总数并按数量降序排列",
+    "sql": "SELECT service_area_id AS \"服务区ID\", SUM(customer_count) AS \"总车流量\" FROM bss_car_day_count WHERE count_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_area_id ORDER BY \"总车流量\" DESC;"
+  },
+  {
+    "question": "分析庐山服务区2023年不同车辆类型占比分布",
+    "sql": "SELECT car_type AS \"车辆类型\", SUM(customer_count) AS \"数量\", ROUND(SUM(customer_count)*100/(SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = '庐山服务区ID' AND delete_ts IS NULL),2) AS \"占比(%)\" FROM bss_car_day_count WHERE service_area_id = '庐山服务区ID' AND delete_ts IS NULL GROUP BY car_type;"
+  },
+  {
+    "question": "查询宜春服务区近7天每日车流量趋势(按统计日期)",
+    "sql": "SELECT count_date AS \"统计日期\", SUM(customer_count) AS \"日车流量\" FROM bss_car_day_count WHERE service_area_id = '宜春服务区ID' AND count_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY count_date ORDER BY \"统计日期\";"
+  },
+  {
+    "question": "找出2023年4月车流高峰前5的服务区及对应峰值日期",
+    "sql": "SELECT service_area_id AS \"服务区ID\", count_date AS \"峰值日期\", customer_count AS \"车流量\" FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL ORDER BY \"车流量\" DESC LIMIT 5;"
+  },
+  {
+    "question": "对比城际车辆与过境车辆在各服务区的月均车流量差异",
+    "sql": "SELECT service_area_id AS \"服务区ID\", car_type AS \"车辆类型\", AVG(customer_count) AS \"月均车流量\" FROM bss_car_day_count WHERE car_type IN ('城际','过境') AND delete_ts IS NULL GROUP BY \"服务区ID\", \"车辆类型\" ORDER BY \"服务区ID\", \"车辆类型\";"
+  },
+  {
+    "question": "统计各服务区危化品车辆出现频次TOP3的日期",
+    "sql": "SELECT * FROM (SELECT service_area_id AS \"服务区ID\", count_date AS \"统计日期\", customer_count AS \"车流量\", RANK() OVER(PARTITION BY service_area_id ORDER BY customer_count DESC) AS \"排名\" FROM bss_car_day_count WHERE car_type = '危化品' AND delete_ts IS NULL) t WHERE \"排名\" <=3 ORDER BY \"服务区ID\", \"排名\";"
+  },
+  {
+    "question": "分析2023年各季度不同车辆类型占比变化趋势",
+    "sql": "SELECT DATE_TRUNC('quarter', count_date) AS \"季度\", car_type AS \"车辆类型\", SUM(customer_count) AS \"累计数量\" FROM bss_car_day_count WHERE count_date >= '2023-01-01' AND delete_ts IS NULL GROUP BY \"季度\", \"车辆类型\" ORDER BY \"季度\", \"车辆类型\";"
+  },
+  {
+    "question": "查询单日车流量超过5000的服务区及对应日期",
+    "sql": "SELECT service_area_id AS \"服务区ID\", count_date AS \"统计日期\", customer_count AS \"车流量\" FROM bss_car_day_count WHERE customer_count > 5000 AND delete_ts IS NULL ORDER BY \"统计日期\" DESC;"
+  },
+  {
+    "question": "比较不同车辆类型在工作日与非工作日的平均车流量差异",
+    "sql": "SELECT car_type AS \"车辆类型\", AVG(CASE WHEN EXTRACT(ISODOW FROM count_date) IN (6,7) THEN customer_count ELSE 0 END) AS \"非工作日均值\", AVG(CASE WHEN EXTRACT(ISODOW FROM count_date) NOT IN (6,7) THEN customer_count ELSE 0 END) AS \"工作日均值\" FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY \"车辆类型\";"
+  },
+  {
+    "question": "统计各服务区连续3天及以上日车流量破千的记录",
+    "sql": "SELECT * FROM (SELECT service_area_id AS \"服务区ID\", count_date AS \"统计日期\", customer_count AS \"车流量\", COUNT(*) OVER(PARTITION BY service_area_id ORDER BY count_date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS \"连续达标天数\" FROM bss_car_day_count WHERE customer_count >= 1000 AND delete_ts IS NULL) t WHERE \"连续达标天数\" >=3;"
+  },
+  {
+    "question": "最近7天各服务区日均营收对比,按日均营收从高到低排列前10名",
+    "sql": "SELECT service_name AS 服务区名称, AVG(pay_sum) AS 日均营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY service_name ORDER BY 日均营收 DESC LIMIT 10;"
+  },
+  {
+    "question": "上月各服务区总订单量排名,显示订单量最高的前5个服务区",
+    "sql": "SELECT service_name AS 服务区名称, SUM(order_sum) AS 总订单量 FROM bss_business_day_data WHERE oper_date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month') AND oper_date < DATE_TRUNC('month', CURRENT_DATE) AND delete_ts IS NULL GROUP BY service_name ORDER BY 总订单量 DESC LIMIT 5;"
+  },
+  {
+    "question": "本月已开业天数大于15天的服务区中,日均营收增长率超过20%的服务区有哪些",
+    "sql": "SELECT service_name AS 服务区名称, (AVG(CASE WHEN oper_date >= DATE_TRUNC('month', CURRENT_DATE) THEN pay_sum ELSE 0 END) / NULLIF(AVG(CASE WHEN oper_date < DATE_TRUNC('month', CURRENT_DATE) AND oper_date >= DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month' THEN pay_sum ELSE 0 END), 0) - 1) * 100 AS 营收增长率百分比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name HAVING COUNT(DISTINCT CASE WHEN oper_date >= DATE_TRUNC('month', CURRENT_DATE) THEN oper_date END) > 15 AND (AVG(CASE WHEN oper_date >= DATE_TRUNC('month', CURRENT_DATE) THEN pay_sum ELSE 0 END) / NULLIF(AVG(CASE WHEN oper_date < DATE_TRUNC('month', CURRENT_DATE) AND oper_date >= DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month' THEN pay_sum ELSE 0 END), 0) - 1) * 100 > 20;"
+  },
+  {
+    "question": "2023年4月1日单日营收最低的5个服务区及对应营收金额",
+    "sql": "SELECT service_name AS 服务区名称, pay_sum AS 营收金额 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL ORDER BY pay_sum ASC LIMIT 5;"
+  },
+  {
+    "question": "最近30天各档口日均营收排名,显示前10名档口信息",
+    "sql": "SELECT branch_name AS 档口名称, service_name AS 服务区名称, AVG(pay_sum) AS 日均营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 30 AND delete_ts IS NULL GROUP BY branch_name, service_name ORDER BY 日均营收 DESC LIMIT 10;"
+  },
+  {
+    "question": "2023年3月各服务区总营收及订单量统计,按订单量降序排列",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, SUM(order_sum) AS 总订单量 FROM bss_business_day_data WHERE EXTRACT(YEAR FROM oper_date) = 2023 AND EXTRACT(MONTH FROM oper_date) = 3 AND delete_ts IS NULL GROUP BY service_name ORDER BY 总订单量 DESC;"
+  },
+  {
+    "question": "连续7天日均营收超过1万元的服务区有哪些",
+    "sql": "SELECT service_name AS 服务区名称 FROM (SELECT service_name, oper_date, AVG(pay_sum) OVER(PARTITION BY service_name ORDER BY oper_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS \"7日移动平均\" FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - 30) t WHERE \"7日移动平均\" > 10000 GROUP BY service_name;"
+  },
+  {
+    "question": "最近一周各服务区微信支付占比分析,按占比高低排序",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx) / SUM(pay_sum) * 100 AS 微信支付占比百分比 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY service_name ORDER BY 微信支付占比百分比 DESC;"
+  },
+  {
+    "question": "本周(截至昨日)与上周相同时段营收环比增长率分析",
+    "sql": "SELECT service_name AS 服务区名称, (SUM(CASE WHEN oper_date >= CURRENT_DATE - 7 AND oper_date < CURRENT_DATE THEN pay_sum ELSE 0 END) / NULLIF(SUM(CASE WHEN oper_date >= CURRENT_DATE - 14 AND oper_date < CURRENT_DATE - 7 THEN pay_sum ELSE 0 END), 0) - 1) * 100 AS 环比增长率百分比 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 14 AND delete_ts IS NULL GROUP BY service_name ORDER BY 环比增长率百分比 DESC;"
+  },
+  {
+    "question": "宜春服务区各档口2023年Q1季度总营收分布,按档口营收降序排列",
+    "sql": "SELECT branch_name AS 档口名称, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE service_name = '宜春服务区' AND oper_date >= '2023-01-01' AND oper_date <= '2023-03-31' AND delete_ts IS NULL GROUP BY branch_name ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "统计各档口单位面积收益排名(按总支付金额降序排列前10)",
+    "sql": "SELECT branch_name AS 档口名称, SUM(pay_sum) AS 总支付金额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 总支付金额 DESC LIMIT 10;"
+  },
+  {
+    "question": "分析各档口客单价(总支付金额除以订单总数)",
+    "sql": "SELECT branch_name AS 档口名称, SUM(pay_sum) / SUM(order_sum) AS 客单价 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name;"
+  },
+  {
+    "question": "近一周每日总支付金额趋势分析",
+    "sql": "SELECT oper_date AS 日期, SUM(pay_sum) AS 日总支付金额 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - INTERVAL '7 days' GROUP BY oper_date ORDER BY 日期;"
+  },
+  {
+    "question": "对比各服务区总支付金额及档口平均收益",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总支付金额, AVG(pay_sum) AS 平均支付金额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name ORDER BY 总支付金额 DESC;"
+  },
+  {
+    "question": "各档口微信支付金额占比分析",
+    "sql": "SELECT branch_name AS 档口名称, SUM(wx) / SUM(pay_sum) * 100 AS 微信占比百分比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name;"
+  },
+  {
+    "question": "订单数量最多的Top 5档口排名",
+    "sql": "SELECT branch_name AS 档口名称, SUM(order_sum) AS 总订单数 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 总订单数 DESC LIMIT 5;"
+  },
+  {
+    "question": "2023年第一季度(1-3月)各月总支付金额趋势",
+    "sql": "SELECT EXTRACT(MONTH FROM oper_date) AS 月份, SUM(pay_sum) AS 总支付金额 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date BETWEEN '2023-01-01' AND '2023-03-31' GROUP BY 月份 ORDER BY 月份;"
+  },
+  {
+    "question": "客单价最低的5个服务区明细",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) / SUM(order_sum) AS 客单价 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name ORDER BY 客单价 ASC LIMIT 5;"
+  },
+  {
+    "question": "各服务区支付宝支付金额占比超过20%的记录",
+    "sql": "SELECT service_name AS 服务区名称, SUM(zfb) / SUM(pay_sum) * 100 AS 支付宝占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name HAVING SUM(zfb) / SUM(pay_sum) > 0.2 ORDER BY 支付宝占比 DESC;"
+  },
+  {
+    "question": "2023-04-01当天各档口支付金额及订单数明细",
+    "sql": "SELECT branch_name AS 档口名称, pay_sum AS 当日支付金额, order_sum AS 当日订单数 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date = '2023-04-01' ORDER BY 当日支付金额 DESC;"
+  },
+  {
+    "question": "不同车辆类型在消费金额上的差异如何?",
+    "sql": "SELECT car.car_type AS 车辆类型, SUM(bus.pay_sum) AS 总消费金额, SUM(car.customer_count) AS 总停留车辆数, SUM(bus.pay_sum)/SUM(car.customer_count) AS 人均消费 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY car.car_type ORDER BY 总消费金额 DESC;"
+  },
+  {
+    "question": "各车型消费频次(订单数/车辆数)排名如何?",
+    "sql": "SELECT car.car_type AS 车辆类型, SUM(bus.order_sum) AS 总订单数, SUM(car.customer_count) AS 总停留车辆数, SUM(bus.order_sum)::numeric/SUM(car.customer_count) AS 消费频次 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY car.car_type ORDER BY 消费频次 DESC;"
+  },
+  {
+    "question": "危化品车辆停留期间每日人均消费趋势如何?",
+    "sql": "SELECT car.count_date AS 统计日期, SUM(bus.pay_sum)/SUM(car.customer_count) AS 人均消费 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.car_type = '危化品' AND car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY car.count_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "城际车辆在各服务区的平均消费金额TOP10是哪些?",
+    "sql": "SELECT bus.service_name AS 服务区名称, AVG(bus.pay_sum) AS 平均消费金额 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date AND car.service_area_id = bus.id WHERE car.car_type = '城际' AND car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY bus.service_name ORDER BY 平均消费金额 DESC LIMIT 10;"
+  },
+  {
+    "question": "过境车辆消费中微信支付占比超过50%的日期有哪些?",
+    "sql": "SELECT car.count_date AS 统计日期, SUM(bus.wx)/SUM(bus.pay_sum) AS 微信支付占比 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.car_type = '过境' AND car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY car.count_date HAVING SUM(bus.wx)/SUM(bus.pay_sum) > 0.5 ORDER BY 统计日期;"
+  },
+  {
+    "question": "各车型消费转化率(订单数/车辆数)对比情况如何?",
+    "sql": "SELECT car.car_type AS 车辆类型, SUM(bus.order_sum) AS 总订单数, SUM(car.customer_count) AS 总停留车辆数, SUM(bus.order_sum)::numeric/SUM(car.customer_count) AS 消费转化率 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY car.car_type ORDER BY 消费转化率 DESC;"
+  },
+  {
+    "question": "2023年春节期间各车型总消费金额是多少?",
+    "sql": "SELECT car.car_type AS 车辆类型, SUM(bus.pay_sum) AS 总消费金额 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL AND bus.oper_date BETWEEN '2023-01-20' AND '2023-01-30' GROUP BY car.car_type ORDER BY 总消费金额 DESC;"
+  },
+  {
+    "question": "各服务区中哪种车型消费金额占比最高?",
+    "sql": "WITH ranked_data AS (SELECT bus.service_name AS 服务区名称, car.car_type AS 车辆类型, SUM(bus.pay_sum) AS 总消费金额, RANK() OVER (PARTITION BY bus.service_name ORDER BY SUM(bus.pay_sum) DESC) AS 排名 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date AND car.service_area_id = bus.id WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY bus.service_name, car.car_type) SELECT 服务区名称, 车辆类型, 总消费金额 FROM ranked_data WHERE 排名 = 1 ORDER BY 总消费金额 DESC;"
+  },
+  {
+    "question": "每日车辆数与当日总消费金额的相关性如何?",
+    "sql": "SELECT CORR(total_cars, total_pay) AS 相关性系数 FROM (SELECT count_date, SUM(customer_count) AS total_cars, SUM(pay_sum) AS total_pay FROM bss_car_day_count car JOIN bss_business_day_data bus ON count_date = oper_date WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY count_date) AS daily_data;"
+  },
+  {
+    "question": "最近一周其他类型车辆消费订单数量变化趋势如何?",
+    "sql": "SELECT car.count_date AS 统计日期, SUM(bus.order_sum) AS 总订单数 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.car_type = '其他' AND car.delete_ts IS NULL AND bus.delete_ts IS NULL AND car.count_date >= CURRENT_DATE - 7 GROUP BY car.count_date ORDER BY 统计日期;"
+  }
+]

+ 202 - 0
data_pipeline/training_data/task_20250701_213434/qs_highway_db_20250701_214431_pair.json.backup

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "统计2023年4月各服务区微信支付占比,并按占比降序排列",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx)/SUM(pay_sum)*100 AS 微信占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY service_name ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "对比2023年第一季度各档口支付宝订单量TOP10",
+    "sql": "SELECT branch_name AS 档口名称, SUM(zf_order) AS 支付宝订单量 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY branch_name ORDER BY 支付宝订单量 DESC LIMIT 10;"
+  },
+  {
+    "question": "筛选现金支付金额占比超过20%的服务区(2023年数据)",
+    "sql": "SELECT service_name AS 服务区名称, SUM(rmb)/SUM(pay_sum)*100 AS 现金占比 FROM bss_business_day_data WHERE oper_date >= '2023-01-01' AND delete_ts IS NULL GROUP BY service_name HAVING SUM(rmb)/SUM(pay_sum) > 0.2;"
+  },
+  {
+    "question": "分析2023年Q1各支付类型月均交易金额分布",
+    "sql": "SELECT DATE_TRUNC('month', oper_date) AS 月份, AVG(wx) AS 平均微信支付, AVG(zfb) AS 平均支付宝支付, AVG(rmb) AS 平均现金支付 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-01' AND '2023-03-31' AND delete_ts IS NULL GROUP BY 月份 ORDER BY 月份;"
+  },
+  {
+    "question": "找出微信支付订单量最低的5个服务区(2023年数据)",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx_order) AS 微信订单量 FROM bss_business_day_data WHERE oper_date >= '2023-01-01' AND delete_ts IS NULL GROUP BY service_name ORDER BY 微信订单量 ASC LIMIT 5;"
+  },
+  {
+    "question": "统计各档口行吧支付使用情况并按金额排序TOP5",
+    "sql": "SELECT branch_name AS 档口名称, SUM(xs) AS 行吧支付总额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 行吧支付总额 DESC LIMIT 5;"
+  },
+  {
+    "question": "计算2023年4月各支付类型的平均订单金额",
+    "sql": "SELECT SUM(wx)/SUM(wx_order) AS 微信单均金额, SUM(zfb)/SUM(zf_order) AS 支付宝单均金额, SUM(rmb)/SUM(rmb_order) AS 现金单均金额 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "分析服务区各支付方式订单量是否超过全量平均值",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx_order) > (SELECT AVG(wx_order) FROM bss_business_day_data WHERE delete_ts IS NULL) AS 微信超均值, SUM(zf_order) > (SELECT AVG(zf_order) FROM bss_business_day_data WHERE delete_ts IS NULL) AS 支付宝超均值 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "统计2023年4月每日微信支付金额趋势",
+    "sql": "SELECT oper_date AS 日期, SUM(wx) AS 微信支付总额 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY oper_date ORDER BY 日期;"
+  },
+  {
+    "question": "分析金豆支付使用情况并找出TOP1服务区",
+    "sql": "SELECT service_name AS 服务区名称, SUM(jd) AS 金豆支付总额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name ORDER BY 金豆支付总额 DESC LIMIT 1;"
+  },
+  {
+    "question": "统计各服务区2023年4月1日当天车辆总数并按数量降序排列",
+    "sql": "SELECT service_area_id AS \"服务区ID\", SUM(customer_count) AS \"总车流量\" FROM bss_car_day_count WHERE count_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_area_id ORDER BY \"总车流量\" DESC;"
+  },
+  {
+    "question": "分析庐山服务区2023年不同车辆类型占比分布",
+    "sql": "SELECT car_type AS \"车辆类型\", SUM(customer_count) AS \"数量\", ROUND(SUM(customer_count)*100/(SELECT SUM(customer_count) FROM bss_car_day_count WHERE service_area_id = '庐山服务区ID' AND delete_ts IS NULL),2) AS \"占比(%)\" FROM bss_car_day_count WHERE service_area_id = '庐山服务区ID' AND delete_ts IS NULL GROUP BY car_type;"
+  },
+  {
+    "question": "查询宜春服务区近7天每日车流量趋势(按统计日期)",
+    "sql": "SELECT count_date AS \"统计日期\", SUM(customer_count) AS \"日车流量\" FROM bss_car_day_count WHERE service_area_id = '宜春服务区ID' AND count_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY count_date ORDER BY \"统计日期\";"
+  },
+  {
+    "question": "找出2023年4月车流高峰前5的服务区及对应峰值日期",
+    "sql": "SELECT service_area_id AS \"服务区ID\", count_date AS \"峰值日期\", customer_count AS \"车流量\" FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL ORDER BY \"车流量\" DESC LIMIT 5;"
+  },
+  {
+    "question": "对比城际车辆与过境车辆在各服务区的月均车流量差异",
+    "sql": "SELECT service_area_id AS \"服务区ID\", car_type AS \"车辆类型\", AVG(customer_count) AS \"月均车流量\" FROM bss_car_day_count WHERE car_type IN ('城际','过境') AND delete_ts IS NULL GROUP BY \"服务区ID\", \"车辆类型\" ORDER BY \"服务区ID\", \"车辆类型\";"
+  },
+  {
+    "question": "统计各服务区危化品车辆出现频次TOP3的日期",
+    "sql": "SELECT * FROM (SELECT service_area_id AS \"服务区ID\", count_date AS \"统计日期\", customer_count AS \"车流量\", RANK() OVER(PARTITION BY service_area_id ORDER BY customer_count DESC) AS \"排名\" FROM bss_car_day_count WHERE car_type = '危化品' AND delete_ts IS NULL) t WHERE \"排名\" <=3 ORDER BY \"服务区ID\", \"排名\";"
+  },
+  {
+    "question": "分析2023年各季度不同车辆类型占比变化趋势",
+    "sql": "SELECT DATE_TRUNC('quarter', count_date) AS \"季度\", car_type AS \"车辆类型\", SUM(customer_count) AS \"累计数量\" FROM bss_car_day_count WHERE count_date >= '2023-01-01' AND delete_ts IS NULL GROUP BY \"季度\", \"车辆类型\" ORDER BY \"季度\", \"车辆类型\";"
+  },
+  {
+    "question": "查询单日车流量超过5000的服务区及对应日期",
+    "sql": "SELECT service_area_id AS \"服务区ID\", count_date AS \"统计日期\", customer_count AS \"车流量\" FROM bss_car_day_count WHERE customer_count > 5000 AND delete_ts IS NULL ORDER BY \"统计日期\" DESC;"
+  },
+  {
+    "question": "比较不同车辆类型在工作日与非工作日的平均车流量差异",
+    "sql": "SELECT car_type AS \"车辆类型\", AVG(CASE WHEN EXTRACT(ISODOW FROM count_date) IN (6,7) THEN customer_count ELSE 0 END) AS \"非工作日均值\", AVG(CASE WHEN EXTRACT(ISODOW FROM count_date) NOT IN (6,7) THEN customer_count ELSE 0 END) AS \"工作日均值\" FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY \"车辆类型\";"
+  },
+  {
+    "question": "统计各服务区连续3天及以上日车流量破千的记录",
+    "sql": "SELECT * FROM (SELECT service_area_id AS \"服务区ID\", count_date AS \"统计日期\", customer_count AS \"车流量\", COUNT(*) OVER(PARTITION BY service_area_id ORDER BY count_date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS \"连续达标天数\" FROM bss_car_day_count WHERE customer_count >= 1000 AND delete_ts IS NULL) t WHERE \"连续达标天数\" >=3;"
+  },
+  {
+    "question": "最近7天各服务区日均营收对比,按日均营收从高到低排列前10名",
+    "sql": "SELECT service_name AS 服务区名称, AVG(pay_sum) AS 日均营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY service_name ORDER BY 日均营收 DESC LIMIT 10;"
+  },
+  {
+    "question": "上月各服务区总订单量排名,显示订单量最高的前5个服务区",
+    "sql": "SELECT service_name AS 服务区名称, SUM(order_sum) AS 总订单量 FROM bss_business_day_data WHERE oper_date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month') AND oper_date < DATE_TRUNC('month', CURRENT_DATE) AND delete_ts IS NULL GROUP BY service_name ORDER BY 总订单量 DESC LIMIT 5;"
+  },
+  {
+    "question": "本月已开业天数大于15天的服务区中,日均营收增长率超过20%的服务区有哪些",
+    "sql": "SELECT service_name AS 服务区名称, (AVG(CASE WHEN oper_date >= DATE_TRUNC('month', CURRENT_DATE) THEN pay_sum ELSE 0 END) / NULLIF(AVG(CASE WHEN oper_date < DATE_TRUNC('month', CURRENT_DATE) AND oper_date >= DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month' THEN pay_sum ELSE 0 END), 0) - 1) * 100 AS 营收增长率百分比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name HAVING COUNT(DISTINCT CASE WHEN oper_date >= DATE_TRUNC('month', CURRENT_DATE) THEN oper_date END) > 15 AND (AVG(CASE WHEN oper_date >= DATE_TRUNC('month', CURRENT_DATE) THEN pay_sum ELSE 0 END) / NULLIF(AVG(CASE WHEN oper_date < DATE_TRUNC('month', CURRENT_DATE) AND oper_date >= DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month' THEN pay_sum ELSE 0 END), 0) - 1) * 100 > 20;"
+  },
+  {
+    "question": "2023年4月1日单日营收最低的5个服务区及对应营收金额",
+    "sql": "SELECT service_name AS 服务区名称, pay_sum AS 营收金额 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL ORDER BY pay_sum ASC LIMIT 5;"
+  },
+  {
+    "question": "最近30天各档口日均营收排名,显示前10名档口信息",
+    "sql": "SELECT branch_name AS 档口名称, service_name AS 服务区名称, AVG(pay_sum) AS 日均营收 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 30 AND delete_ts IS NULL GROUP BY branch_name, service_name ORDER BY 日均营收 DESC LIMIT 10;"
+  },
+  {
+    "question": "2023年3月各服务区总营收及订单量统计,按订单量降序排列",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总营收, SUM(order_sum) AS 总订单量 FROM bss_business_day_data WHERE EXTRACT(YEAR FROM oper_date) = 2023 AND EXTRACT(MONTH FROM oper_date) = 3 AND delete_ts IS NULL GROUP BY service_name ORDER BY 总订单量 DESC;"
+  },
+  {
+    "question": "连续7天日均营收超过1万元的服务区有哪些",
+    "sql": "SELECT service_name AS 服务区名称 FROM (SELECT service_name, oper_date, AVG(pay_sum) OVER(PARTITION BY service_name ORDER BY oper_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS 7日移动平均 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - 30) t WHERE 7日移动平均 > 10000 GROUP BY service_name;"
+  },
+  {
+    "question": "最近一周各服务区微信支付占比分析,按占比高低排序",
+    "sql": "SELECT service_name AS 服务区名称, SUM(wx) / SUM(pay_sum) * 100 AS 微信支付占比百分比 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 7 AND delete_ts IS NULL GROUP BY service_name ORDER BY 微信支付占比百分比 DESC;"
+  },
+  {
+    "question": "本周(截至昨日)与上周相同时段营收环比增长率分析",
+    "sql": "SELECT service_name AS 服务区名称, (SUM(CASE WHEN oper_date >= CURRENT_DATE - 7 AND oper_date < CURRENT_DATE THEN pay_sum ELSE 0 END) / NULLIF(SUM(CASE WHEN oper_date >= CURRENT_DATE - 14 AND oper_date < CURRENT_DATE - 7 THEN pay_sum ELSE 0 END), 0) - 1) * 100 AS 环比增长率百分比 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 14 AND delete_ts IS NULL GROUP BY service_name ORDER BY 环比增长率百分比 DESC;"
+  },
+  {
+    "question": "宜春服务区各档口2023年Q1季度总营收分布,按档口营收降序排列",
+    "sql": "SELECT branch_name AS 档口名称, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE service_name = '宜春服务区' AND oper_date >= '2023-01-01' AND oper_date <= '2023-03-31' AND delete_ts IS NULL GROUP BY branch_name ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "统计各档口单位面积收益排名(按总支付金额降序排列前10)",
+    "sql": "SELECT branch_name AS 档口名称, SUM(pay_sum) AS 总支付金额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 总支付金额 DESC LIMIT 10;"
+  },
+  {
+    "question": "分析各档口客单价(总支付金额除以订单总数)",
+    "sql": "SELECT branch_name AS 档口名称, SUM(pay_sum) / SUM(order_sum) AS 客单价 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name;"
+  },
+  {
+    "question": "近一周每日总支付金额趋势分析",
+    "sql": "SELECT oper_date AS 日期, SUM(pay_sum) AS 日总支付金额 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - INTERVAL '7 days' GROUP BY oper_date ORDER BY 日期;"
+  },
+  {
+    "question": "对比各服务区总支付金额及档口平均收益",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 总支付金额, AVG(pay_sum) AS 平均支付金额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name ORDER BY 总支付金额 DESC;"
+  },
+  {
+    "question": "各档口微信支付金额占比分析",
+    "sql": "SELECT branch_name AS 档口名称, SUM(wx) / SUM(pay_sum) * 100 AS 微信占比百分比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name;"
+  },
+  {
+    "question": "订单数量最多的Top 5档口排名",
+    "sql": "SELECT branch_name AS 档口名称, SUM(order_sum) AS 总订单数 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 总订单数 DESC LIMIT 5;"
+  },
+  {
+    "question": "2023年第一季度(1-3月)各月总支付金额趋势",
+    "sql": "SELECT EXTRACT(MONTH FROM oper_date) AS 月份, SUM(pay_sum) AS 总支付金额 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date BETWEEN '2023-01-01' AND '2023-03-31' GROUP BY 月份 ORDER BY 月份;"
+  },
+  {
+    "question": "客单价最低的5个服务区明细",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) / SUM(order_sum) AS 客单价 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name ORDER BY 客单价 ASC LIMIT 5;"
+  },
+  {
+    "question": "各服务区支付宝支付金额占比超过20%的记录",
+    "sql": "SELECT service_name AS 服务区名称, SUM(zfb) / SUM(pay_sum) * 100 AS 支付宝占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name HAVING SUM(zfb) / SUM(pay_sum) > 0.2 ORDER BY 支付宝占比 DESC;"
+  },
+  {
+    "question": "2023-04-01当天各档口支付金额及订单数明细",
+    "sql": "SELECT branch_name AS 档口名称, pay_sum AS 当日支付金额, order_sum AS 当日订单数 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date = '2023-04-01' ORDER BY 当日支付金额 DESC;"
+  },
+  {
+    "question": "不同车辆类型在消费金额上的差异如何?",
+    "sql": "SELECT car.car_type AS 车辆类型, SUM(bus.pay_sum) AS 总消费金额, SUM(car.customer_count) AS 总停留车辆数, SUM(bus.pay_sum)/SUM(car.customer_count) AS 人均消费 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY car.car_type ORDER BY 总消费金额 DESC;"
+  },
+  {
+    "question": "各车型消费频次(订单数/车辆数)排名如何?",
+    "sql": "SELECT car.car_type AS 车辆类型, SUM(bus.order_sum) AS 总订单数, SUM(car.customer_count) AS 总停留车辆数, SUM(bus.order_sum)::numeric/SUM(car.customer_count) AS 消费频次 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY car.car_type ORDER BY 消费频次 DESC;"
+  },
+  {
+    "question": "危化品车辆停留期间每日人均消费趋势如何?",
+    "sql": "SELECT car.count_date AS 统计日期, SUM(bus.pay_sum)/SUM(car.customer_count) AS 人均消费 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.car_type = '危化品' AND car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY car.count_date ORDER BY 统计日期;"
+  },
+  {
+    "question": "城际车辆在各服务区的平均消费金额TOP10是哪些?",
+    "sql": "SELECT bus.service_name AS 服务区名称, AVG(bus.pay_sum) AS 平均消费金额 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date AND car.service_area_id = bus.id WHERE car.car_type = '城际' AND car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY bus.service_name ORDER BY 平均消费金额 DESC LIMIT 10;"
+  },
+  {
+    "question": "过境车辆消费中微信支付占比超过50%的日期有哪些?",
+    "sql": "SELECT car.count_date AS 统计日期, SUM(bus.wx)/SUM(bus.pay_sum) AS 微信支付占比 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.car_type = '过境' AND car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY car.count_date HAVING SUM(bus.wx)/SUM(bus.pay_sum) > 0.5 ORDER BY 统计日期;"
+  },
+  {
+    "question": "各车型消费转化率(订单数/车辆数)对比情况如何?",
+    "sql": "SELECT car.car_type AS 车辆类型, SUM(bus.order_sum) AS 总订单数, SUM(car.customer_count) AS 总停留车辆数, SUM(bus.order_sum)::numeric/SUM(car.customer_count) AS 消费转化率 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY car.car_type ORDER BY 消费转化率 DESC;"
+  },
+  {
+    "question": "2023年春节期间各车型总消费金额是多少?",
+    "sql": "SELECT car.car_type AS 车辆类型, SUM(bus.pay_sum) AS 总消费金额 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL AND bus.oper_date BETWEEN '2023-01-20' AND '2023-01-30' GROUP BY car.car_type ORDER BY 总消费金额 DESC;"
+  },
+  {
+    "question": "各服务区中哪种车型消费金额占比最高?",
+    "sql": "WITH ranked_data AS (SELECT bus.service_name AS 服务区名称, car.car_type AS 车辆类型, SUM(bus.pay_sum) AS 总消费金额, RANK() OVER (PARTITION BY bus.service_name ORDER BY SUM(bus.pay_sum) DESC) AS 排名 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date AND car.service_area_id = bus.id WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY bus.service_name, car.car_type) SELECT 服务区名称, 车辆类型, 总消费金额 FROM ranked_data WHERE 排名 = 1 ORDER BY 总消费金额 DESC;"
+  },
+  {
+    "question": "每日车辆数与当日总消费金额的相关性如何?",
+    "sql": "SELECT CORR(total_cars, total_pay) AS 相关性系数 FROM (SELECT count_date, SUM(customer_count) AS total_cars, SUM(pay_sum) AS total_pay FROM bss_car_day_count car JOIN bss_business_day_data bus ON count_date = oper_date WHERE car.delete_ts IS NULL AND bus.delete_ts IS NULL GROUP BY count_date) AS daily_data;"
+  },
+  {
+    "question": "最近一周其他类型车辆消费订单数量变化趋势如何?",
+    "sql": "SELECT car.count_date AS 统计日期, SUM(bus.order_sum) AS 总订单数 FROM bss_car_day_count car JOIN bss_business_day_data bus ON car.count_date = bus.oper_date WHERE car.car_type = '其他' AND car.delete_ts IS NULL AND bus.delete_ts IS NULL AND car.count_date >= CURRENT_DATE - 7 GROUP BY car.count_date ORDER BY 统计日期;"
+  }
+]

+ 14 - 0
data_pipeline/training_data/task_20250701_213434/task_config.json

@@ -0,0 +1,14 @@
+{
+  "task_id": "task_20250701_213434",
+  "created_at": "2025-07-01T13:34:35.478473",
+  "parameters": {
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:6432/highway_db",
+    "table_list_file": "data_pipeline/tables.txt",
+    "business_context": "高速公路服务区管理系统",
+    "enable_llm_repair": true,
+    "modify_original_file": true,
+    "enable_sql_validation": true,
+    "enable_training_data_load": true
+  },
+  "output_directory": "data_pipeline\\training_data\\task_20250701_213434"
+}
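The task_config.json above is simply the recorded input parameters for this run. A minimal sketch, assuming only the Python standard library and the file layout shown here, of reading those parameters back (it does not call the real pipeline API):

    import json
    from pathlib import Path

    # Hypothetical reader for a recorded task_config.json; field names match the file above.
    config_path = Path("data_pipeline/training_data/task_20250701_213434/task_config.json")
    config = json.loads(config_path.read_text(encoding="utf-8"))

    params = config["parameters"]
    print(config["task_id"], "->", config["output_directory"])
    print("db:", params["db_connection"])
    print("business context:", params["business_context"])
    print("SQL validation enabled:", params["enable_sql_validation"])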

+ 117 - 0
data_pipeline/training_data/task_20250701_213434/task_result.json

@@ -0,0 +1,117 @@
+{
+  "success": true,
+  "workflow_state": {
+    "start_time": null,
+    "end_time": null,
+    "current_step": "training_data_load",
+    "completed_steps": [
+      "ddl_md_generation",
+      "question_sql_generation",
+      "sql_validation",
+      "training_data_load"
+    ],
+    "failed_steps": [],
+    "artifacts": {
+      "ddl_md_generation": {
+        "total_tables": 2,
+        "processed_successfully": 2,
+        "failed": 0,
+        "files_generated": 4,
+        "duration": 134.5416886806488
+      },
+      "question_sql_generation": {
+        "output_file": "data_pipeline\\training_data\\task_20250701_213434\\qs_highway_db_20250701_214431_pair.json",
+        "total_questions": 50,
+        "total_themes": 5,
+        "successful_themes": 5,
+        "failed_themes": [],
+        "duration": 464.0704131126404
+      },
+      "sql_validation": {
+        "original_sql_count": 50,
+        "valid_sql_count": 50,
+        "invalid_sql_count": 0,
+        "success_rate": 1.0,
+        "repair_stats": {
+          "attempted": 1,
+          "successful": 1,
+          "failed": 0
+        },
+        "file_modification_stats": {
+          "modified": 1,
+          "deleted": 0,
+          "failed_modifications": 0
+        },
+        "average_execution_time": 0.030688700675964357,
+        "total_retries": 0,
+        "duration": 24.97702646255493
+      },
+      "training_data_load": {
+        "training_data_dir": "data_pipeline\\training_data\\task_20250701_213434",
+        "load_successful": true,
+        "total_records": 393,
+        "data_type_counts": {
+          "sql": 349,
+          "documentation": 23,
+          "ddl": 20,
+          "error_sql": 1
+        },
+        "duration": 68.5514280796051
+      }
+    },
+    "statistics": {
+      "step1_duration": 134.5416886806488,
+      "step2_duration": 464.0704131126404,
+      "step3_duration": 24.97702646255493,
+      "step4_duration": 68.5514280796051
+    }
+  },
+  "artifacts": {
+    "ddl_md_generation": {
+      "total_tables": 2,
+      "processed_successfully": 2,
+      "failed": 0,
+      "files_generated": 4,
+      "duration": 134.5416886806488
+    },
+    "question_sql_generation": {
+      "output_file": "data_pipeline\\training_data\\task_20250701_213434\\qs_highway_db_20250701_214431_pair.json",
+      "total_questions": 50,
+      "total_themes": 5,
+      "successful_themes": 5,
+      "failed_themes": [],
+      "duration": 464.0704131126404
+    },
+    "sql_validation": {
+      "original_sql_count": 50,
+      "valid_sql_count": 50,
+      "invalid_sql_count": 0,
+      "success_rate": 1.0,
+      "repair_stats": {
+        "attempted": 1,
+        "successful": 1,
+        "failed": 0
+      },
+      "file_modification_stats": {
+        "modified": 1,
+        "deleted": 0,
+        "failed_modifications": 0
+      },
+      "average_execution_time": 0.030688700675964357,
+      "total_retries": 0,
+      "duration": 24.97702646255493
+    },
+    "training_data_load": {
+      "training_data_dir": "data_pipeline\\training_data\\task_20250701_213434",
+      "load_successful": true,
+      "total_records": 393,
+      "data_type_counts": {
+        "sql": 349,
+        "documentation": 23,
+        "ddl": 20,
+        "error_sql": 1
+      },
+      "duration": 68.5514280796051
+    }
+  }
+}
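The workflow_state.statistics block repeats the per-step durations already listed under artifacts; summed, this run took roughly 134.5 + 464.1 + 25.0 + 68.6 ≈ 692 s (about 11.5 minutes), dominated by question_sql_generation. A minimal summarizing sketch, assuming only the standard library and the field names shown above:

    import json

    def summarize_task_result(path: str) -> None:
        """Print per-step durations and the total from workflow_state.statistics."""
        with open(path, encoding="utf-8") as f:
            result = json.load(f)
        stats = result["workflow_state"]["statistics"]
        total = sum(stats.values())
        for step, seconds in sorted(stats.items()):
            print(f"{step}: {seconds:.1f}s")
        print(f"total: {total:.1f}s ({total / 60:.1f} min)")

    summarize_task_result("data_pipeline/training_data/task_20250701_213434/task_result.json")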

+ 31 - 0
data_pipeline/training_data/task_20250701_231850/bss_business_day_data.ddl

@@ -0,0 +1,31 @@
+-- 中文名: 记录各服务区每日业务统计数据
+-- 描述: 记录各服务区每日业务统计数据,支持运营分析与决策
+create table public.bss_business_day_data (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建人,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新人,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除人,
+  oper_date date              -- 统计日期,
+  service_no varchar(255)     -- 服务区编码,
+  service_name varchar(255)   -- 服务区名称,
+  branch_no varchar(255)      -- 档口编码,
+  branch_name varchar(255)    -- 档口名称,
+  wx numeric(19,4)            -- 微信支付金额,
+  wx_order integer            -- 微信订单数量,
+  zfb numeric(19,4)           -- 支付宝支付金额,
+  zf_order integer            -- 支付宝订单数量,
+  rmb numeric(19,4)           -- 现金支付金额,
+  rmb_order integer           -- 现金订单数量,
+  xs numeric(19,4)            -- 行吧支付金额,
+  xs_order integer            -- 行吧订单数量,
+  jd numeric(19,4)            -- 金豆支付金额,
+  jd_order integer            -- 金豆订单数量,
+  order_sum integer           -- 订单总数,
+  pay_sum numeric(19,4)       -- 总支付金额,
+  source_type integer         -- 数据来源类别,
+  primary key (id)
+);

+ 32 - 0
data_pipeline/training_data/task_20250701_231850/bss_business_day_data_detail.md

@@ -0,0 +1,32 @@
+## bss_business_day_data(记录各服务区每日业务统计数据)
+bss_business_day_data 表记录各服务区每日业务统计数据,支持运营分析与决策
+字段列表:
+- id (varchar(32)) - 主键ID [主键, 非空] [示例: 00827DFF993D415488EA1F07CAE6C440, 00e799048b8cbb8ee758eac9c8b4b820]
+- version (integer) - 版本号 [非空] [示例: 1]
+- create_ts (timestamp) - 创建时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- created_by (varchar(50)) - 创建人 [示例: xingba]
+- update_ts (timestamp) - 更新时间 [示例: 2023-04-02 08:31:51, 2023-04-02 02:30:08]
+- updated_by (varchar(50)) - 更新人
+- delete_ts (timestamp) - 删除时间
+- deleted_by (varchar(50)) - 删除人
+- oper_date (date) - 统计日期 [示例: 2023-04-01]
+- service_no (varchar(255)) - 服务区编码 [示例: 1028, H0501]
+- service_name (varchar(255)) - 服务区名称 [示例: 宜春服务区, 庐山服务区]
+- branch_no (varchar(255)) - 档口编码 [示例: 1, H05016]
+- branch_name (varchar(255)) - 档口名称 [示例: 宜春南区, 庐山鲜徕客东区]
+- wx (numeric(19,4)) - 微信支付金额 [示例: 4790.0000, 2523.0000]
+- wx_order (integer) - 微信订单数量 [示例: 253, 133]
+- zfb (numeric(19,4)) - 支付宝支付金额 [示例: 229.0000, 0.0000]
+- zf_order (integer) - 支付宝订单数量 [示例: 15, 0]
+- rmb (numeric(19,4)) - 现金支付金额 [示例: 1058.5000, 124.0000]
+- rmb_order (integer) - 现金订单数量 [示例: 56, 12]
+- xs (numeric(19,4)) - 行吧支付金额 [示例: 0.0000, 40.0000]
+- xs_order (integer) - 行吧订单数量 [示例: 0, 1]
+- jd (numeric(19,4)) - 金豆支付金额 [示例: 0.0000]
+- jd_order (integer) - 金豆订单数量 [示例: 0]
+- order_sum (integer) - 订单总数 [示例: 324, 146]
+- pay_sum (numeric(19,4)) - 总支付金额 [示例: 6077.5000, 2687.0000]
+- source_type (integer) - 数据来源类别 [示例: 1, 0, 4]
+字段补充说明:
+- id 为主键
+- source_type 为枚举字段,包含取值:0、4、1、2、3

+ 17 - 0
data_pipeline/training_data/task_20250701_231850/bss_car_day_count.ddl

@@ -0,0 +1,17 @@
+-- 中文名: 高速公路服务区每日车辆数量统计表
+-- 描述: 高速公路服务区每日车辆数量统计表,按车型分类,用于车流分析及运营管理。
+create table public.bss_car_day_count (
+  id varchar(32) not null     -- 主键ID,主键,
+  version integer not null    -- 版本号,
+  create_ts timestamp         -- 创建时间,
+  created_by varchar(50)      -- 创建者ID,
+  update_ts timestamp         -- 更新时间,
+  updated_by varchar(50)      -- 更新者ID,
+  delete_ts timestamp         -- 删除时间,
+  deleted_by varchar(50)      -- 删除者ID,
+  customer_count bigint       -- 车辆数量,
+  car_type varchar(100)       -- 车辆类别,
+  count_date date             -- 统计日期,
+  service_area_id varchar(32) -- 服务区ID,
+  primary key (id)
+);

+ 6 - 6
data_pipeline/training_data/bss_car_day_count_detail.md → data_pipeline/training_data/task_20250701_231850/bss_car_day_count_detail.md

@@ -1,16 +1,16 @@
-## bss_car_day_count(服务区车辆类型日统计表)
-bss_car_day_count 表服务区车辆类型日统计表,记录每日车流数量及分类数据,用于交通流量分析与服务资源调度
+## bss_car_day_count(高速公路服务区每日车辆数量统计表)
+bss_car_day_count 表高速公路服务区每日车辆数量统计表,按车型分类,用于车流分析及运营管理
 字段列表:
 - id (varchar(32)) - 主键ID [主键, 非空] [示例: 00022c1c99ff11ec86d4fa163ec0f8fc, 00022caa99ff11ec86d4fa163ec0f8fc]
 - version (integer) - 版本号 [非空] [示例: 1]
 - create_ts (timestamp) - 创建时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
-- created_by (varchar(50)) - 创建
+- created_by (varchar(50)) - 创建者ID
 - update_ts (timestamp) - 更新时间 [示例: 2022-03-02 16:01:43, 2022-02-02 14:18:55]
-- updated_by (varchar(50)) - 更新
+- updated_by (varchar(50)) - 更新者ID
 - delete_ts (timestamp) - 删除时间
-- deleted_by (varchar(50)) - 删除
+- deleted_by (varchar(50)) - 删除者ID
 - customer_count (bigint) - 车辆数量 [示例: 1114, 295]
-- car_type (varchar(100)) - 车辆类 [示例: 其他]
+- car_type (varchar(100)) - 车辆类 [示例: 其他]
 - count_date (date) - 统计日期 [示例: 2022-03-02, 2022-02-02]
 - service_area_id (varchar(32)) - 服务区ID [示例: 17461166e7fa3ecda03534a5795ce985, 81f4eb731fb0728aef17ae61f1f1daef]
 字段补充说明:

+ 10 - 0
data_pipeline/training_data/task_20250701_231850/db_query_decision_prompt.txt

@@ -0,0 +1,10 @@
+=== 数据库业务范围 ===
+当前数据库存储的是高速公路服务区运营管理的相关数据,主要涉及服务区营业收入与车辆流量统计,包含以下业务数据:
+核心业务实体:
+- 服务区:高速公路沿线提供休息、商业服务的区域,主要字段:service_no、service_name
+- 档口:服务区内的具体经营单元,主要字段:branch_no、branch_name
+- 车辆类别:高速公路通行车辆的分类标准,主要字段:car_type
+关键业务指标:
+- 支付方式维度:各支付渠道(微信/支付宝/现金/行吧/金豆)的交易金额与订单数量统计
+- 车流量维度:按车辆类型分类的通行数量统计
+- 营收聚合维度:单日订单总量、支付总金额及人均消费金额(通过支付总金额/订单总量推算)

+ 5 - 0
data_pipeline/training_data/task_20250701_231850/filename_mapping.txt

@@ -0,0 +1,5 @@
+# 文件名映射报告
+# 格式: 原始表名 -> 实际文件名
+
+public.bss_business_day_data -> bss_business_day_data_detail.md
+public.bss_car_day_count -> bss_car_day_count_detail.md
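The mapping report uses one "original table -> generated file" line per table, preceded by "#" comment headers. A minimal parsing sketch, assuming only the standard library (the helper name is illustrative, not part of the project):

    from pathlib import Path

    def load_filename_mapping(path: str) -> dict:
        # Parse a filename_mapping.txt report: "original table -> actual file" per line.
        mapping = {}
        for line in Path(path).read_text(encoding="utf-8").splitlines():
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip the comment header and blank lines
            table, _, filename = line.partition(" -> ")
            mapping[table] = filename
        return mapping

    print(load_filename_mapping(
        "data_pipeline/training_data/task_20250701_231850/filename_mapping.txt"))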

+ 62 - 0
data_pipeline/training_data/task_20250701_231850/metadata.txt

@@ -0,0 +1,62 @@
+-- Schema Tools生成的主题元数据
+-- 业务背景: 高速公路服务区管理系统
+-- 生成时间: 2025-07-01 23:48:11
+-- 数据库: highway_db
+
+-- 创建表(如果不存在)
+CREATE TABLE IF NOT EXISTS metadata (
+    id SERIAL PRIMARY KEY,    -- 主键
+    topic_name VARCHAR(100) NOT NULL,  -- 业务主题名称
+    description TEXT,                  -- 业务主体说明
+    related_tables TEXT[],			  -- 相关表名
+    biz_entities TEXT[],               -- 主要业务实体名称
+    biz_metrics TEXT[],                -- 主要业务指标名称
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP    -- 插入时间
+);
+
+-- 插入主题数据
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '日营收分析',
+  '分析各服务区每日营收结构及支付方式占比,评估经营质量与支付偏好',
+  'bss_business_day_data',
+  '服务区,档口,支付方式',
+  '日营收总额,支付方式占比,档口收益排名'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '车流特征分析',
+  '统计各服务区车辆类型分布及时段规律,辅助基础设施规划与服务资源配置',
+  'bss_car_day_count',
+  '服务区,车辆类型,统计日期',
+  '车流量趋势,车型占比分析,高峰时段识别'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '档口效能评估',
+  '对比不同档口单位车流的营收转化率,发现运营效率差异与改进空间',
+  'bss_business_day_data,bss_car_day_count',
+  '服务区,档口,运营时段',
+  '坪效对比,客单价分析,转化率排名'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '节假日效应分析',
+  '对比节假日与常规时段的营收波动及车流变化,优化促销策略与人力配置',
+  'bss_business_day_data,bss_car_day_count',
+  '服务区,节假日类型,支付方式',
+  '节前/节中/节后对比,车流峰值分析,支付方式迁移趋势'
+);
+
+INSERT INTO metadata(topic_name, description, related_tables, biz_entities, biz_metrics) VALUES
+(
+  '异常数据监测',
+  '识别营收数据与车流数据的匹配异常,发现潜在运营问题或数据采集故障',
+  'bss_business_day_data,bss_car_day_count',
+  '服务区,数据来源,支付方式',
+  '营收-车流偏离度,支付方式异常检测,数据完整性校验'
+);
+

+ 20 - 0
data_pipeline/training_data/task_20250701_231850/metadata_detail.md

@@ -0,0 +1,20 @@
+## metadata(存储分析主题元数据)
+
+`metadata` 主要描述了当前数据库包含了哪些数据内容,哪些分析主题,哪些指标等等。
+
+字段列表:
+
+- `id` (serial) - 主键ID [主键, 非空]
+- `topic_name` (varchar(100)) - 业务主题名称 [非空]
+- `description` (text) - 业务主题说明
+- `related_tables` (text[]) - 涉及的数据表 [示例: bss_car_day_count, bss_business_day_data]
+- `biz_entities` (text[]) - 主要业务实体名称 [示例: 数据来源, 档口, 服务区]
+- `biz_metrics` (text[]) - 主要业务指标名称 [示例: 档口收益排名, 高峰时段识别, 支付方式异常检测]
+- `created_at` (timestamp) - 插入时间 [默认值: `CURRENT_TIMESTAMP`]
+
+字段补充说明:
+
+- `id` 为主键,自增;
+- `related_tables` 用于建立主题与具体明细表的依赖关系;
+- `biz_entities` 表示主题关注的核心对象,例如服务区、车辆、公司;
+- `biz_metrics` 表示该主题关注的业务分析指标,例如营收对比、趋势变化、占比结构等。

+ 202 - 0
data_pipeline/training_data/task_20250701_231850/qs_highway_db_20250701_234811_pair.json

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "统计2023-04-01各服务区总营收并按金额降序排列",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 日营收总额 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_name ORDER BY 日营收总额 DESC;"
+  },
+  {
+    "question": "查询宜春服务区2023年4月各档口营收排名TOP5",
+    "sql": "SELECT branch_name AS 档口名称, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE service_name = '宜春服务区' AND oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY 档口名称 ORDER BY 总营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析2023-04-02各服务区微信支付占比(微信金额/总支付金额)",
+    "sql": "SELECT service_name AS 服务区名称, ROUND(SUM(wx)/SUM(pay_sum)*100, 2) AS 微信占比百分比 FROM bss_business_day_data WHERE oper_date = '2023-04-02' AND delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "对比庐山服务区2023年3月与4月日均营收变化趋势",
+    "sql": "SELECT EXTRACT(MONTH FROM oper_date) AS 月份, AVG(pay_sum) AS 日均营收 FROM bss_business_day_data WHERE service_name = '庐山服务区' AND EXTRACT(MONTH FROM oper_date) IN (3,4) AND delete_ts IS NULL GROUP BY 月份 ORDER BY 月份;"
+  },
+  {
+    "question": "统计各服务区近7天现金支付金额环比增长率(当前周-上一周)",
+    "sql": "SELECT service_name AS 服务区名称, SUM(CASE WHEN oper_date BETWEEN CURRENT_DATE - 7 AND CURRENT_DATE THEN pay_sum ELSE 0 END) - SUM(CASE WHEN oper_date BETWEEN CURRENT_DATE - 14 AND CURRENT_DATE - 8 THEN pay_sum ELSE 0 END) AS 现金增长额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "查询各服务区支付宝订单占比超过10%的记录(按日期筛选)",
+    "sql": "SELECT oper_date AS 统计日期, service_name AS 服务区名称, SUM(zf_order) AS 支付宝订单数, SUM(order_sum) AS 总订单数 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 统计日期, 服务区名称 HAVING SUM(zf_order)/SUM(order_sum) > 0.1 ORDER BY 统计日期 DESC;"
+  },
+  {
+    "question": "分析行吧支付使用情况(订单数前10的服务区及使用率)",
+    "sql": "SELECT service_name AS 服务区名称, SUM(xs_order) AS 行吧订单数, SUM(order_sum) AS 总订单数, ROUND(SUM(xs_order)/SUM(order_sum)*100, 2) AS 使用率百分比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 服务区名称 ORDER BY 行吧订单数 DESC LIMIT 10;"
+  },
+  {
+    "question": "统计各服务区最近一天营收数据并标注数据来源类型",
+    "sql": "SELECT DISTINCT ON (service_name) service_name AS 服务区名称, oper_date AS 统计日期, pay_sum AS 营收金额, source_type AS 数据来源类型 FROM bss_business_day_data WHERE delete_ts IS NULL ORDER BY service_name, oper_date DESC;"
+  },
+  {
+    "question": "查询2023-04-01宜春南区档口各支付方式金额及占比",
+    "sql": "SELECT '微信' AS 支付方式, wx AS 金额, ROUND(wx/pay_sum*100, 2) AS 占比 FROM bss_business_day_data WHERE branch_name = '宜春南区' AND oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '支付宝', zfb, ROUND(zfb/pay_sum*100, 2) FROM bss_business_day_data WHERE branch_name = '宜春南区' AND oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '现金', rmb, ROUND(rmb/pay_sum*100, 2) FROM bss_business_day_data WHERE branch_name = '宜春南区' AND oper_date = '2023-04-01' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "分析各服务区近30天日营收标准差评估经营稳定性",
+    "sql": "SELECT service_name AS 服务区名称, STDDEV_SAMP(pay_sum) AS 日营收标准差 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 30 AND delete_ts IS NULL GROUP BY service_name ORDER BY 日营收标准差 DESC;"
+  },
+  {
+    "question": "统计各服务区不同车型的车流量占比,用于资源配置优化",
+    "sql": "SELECT service_area_id AS 服务区ID, car_type AS 车辆类型, SUM(customer_count) AS 总车流量, ROUND((SUM(customer_count)*100.0/SUM(SUM(customer_count)) OVER(PARTITION BY service_area_id)),2) AS 占比百分比 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id, car_type ORDER BY service_area_id, 总车流量 DESC;"
+  },
+  {
+    "question": "分析2023年4月各服务区每日车流量趋势变化,识别高峰期",
+    "sql": "SELECT count_date AS 统计日期, service_area_id AS 服务区ID, SUM(customer_count) AS 日车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY count_date, service_area_id ORDER BY count_date;"
+  },
+  {
+    "question": "查询2023年4月1日当日车流量最高的前5个服务区",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 当日车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date = '2023-04-01' GROUP BY service_area_id ORDER BY 当日车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计各服务区危化品车辆出现频次,用于安全管理评估",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 危化品车辆总数 FROM bss_car_day_count WHERE delete_ts IS NULL AND car_type = '危化品' GROUP BY service_area_id ORDER BY 危化品车辆总数 DESC;"
+  },
+  {
+    "question": "分析各服务区车流量月环比增长趋势(按2023年Q2数据)",
+    "sql": "SELECT service_area_id AS 服务区ID, month AS 月份, total_count AS 当月车流量, prev_month_count AS 上月车流量, ROUND(((total_count-prev_month_count)*100.0/prev_month_count),2) AS 环比增长率 FROM (SELECT service_area_id, DATE_TRUNC('month', count_date) AS month, SUM(customer_count) AS total_count, LAG(SUM(customer_count)) OVER(PARTITION BY service_area_id ORDER BY DATE_TRUNC('month', count_date)) AS prev_month_count FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-06-30' GROUP BY service_area_id, DATE_TRUNC('month', count_date)) AS monthly_data ORDER BY service_area_id, 月份;"
+  },
+  {
+    "question": "对比周末与工作日各服务区平均车流量差异",
+    "sql": "SELECT service_area_id AS 服务区ID, AVG(CASE WHEN EXTRACT(ISODOW FROM count_date) IN (6,7) THEN customer_count END) AS 周末日均车流量, AVG(CASE WHEN EXTRACT(ISODOW FROM count_date) NOT IN (6,7) THEN customer_count END) AS 工作日均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id;"
+  },
+  {
+    "question": "查询2023年4月城际车辆流量最高的前3个服务区",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 城际车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND car_type = '城际' AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY service_area_id ORDER BY 城际车流量 DESC LIMIT 3;"
+  },
+  {
+    "question": "分析各服务区不同车型的月均车流量分布",
+    "sql": "SELECT service_area_id AS 服务区ID, car_type AS 车辆类型, ROUND(AVG(customer_count), 2) AS 月均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id, car_type ORDER BY service_area_id, 月均车流量 DESC;"
+  },
+  {
+    "question": "识别最近7天各服务区车流量TOP3日期",
+    "sql": "SELECT * FROM (SELECT service_area_id, count_date AS 统计日期, customer_count AS 车流量, ROW_NUMBER() OVER(PARTITION BY service_area_id ORDER BY customer_count DESC) AS rn FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date >= CURRENT_DATE - 7) t WHERE rn <= 3 ORDER BY service_area_id, 车流量 DESC;"
+  },
+  {
+    "question": "统计各服务区过境车辆与城际车辆流量比值,分析交通特性",
+    "sql": "SELECT service_area_id AS 服务区ID, (SUM(CASE WHEN car_type = '过境' THEN customer_count ELSE 0 END) * 1.0 / NULLIF(SUM(CASE WHEN car_type = '城际' THEN customer_count ELSE 0 END), 0)) AS 过境城际比 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id ORDER BY 过境城际比 DESC;"
+  },
+  {
+    "question": "各档口单位车流的坪效对比(总支付金额/车辆数量)排名TOP10",
+    "sql": "SELECT b.branch_name AS 档口名称, ROUND(SUM(b.pay_sum) / SUM(c.customer_count), 2) AS 坪效 FROM bss_business_day_data b JOIN bss_car_day_count c ON b.service_no = c.service_area_id WHERE b.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY b.branch_name ORDER BY 坪效 DESC LIMIT 10;"
+  },
+  {
+    "question": "各档口客单价(总支付金额/订单总数)最高前5名",
+    "sql": "SELECT branch_name AS 档口名称, ROUND(SUM(pay_sum) / SUM(order_sum), 2) AS 客单价 FROM bss_business_day_data WHERE delete_ts IS NULL AND order_sum > 0 GROUP BY branch_name ORDER BY 客单价 DESC LIMIT 5;"
+  },
+  {
+    "question": "不同服务区档口的平均坪效对比(排除0订单数据)",
+    "sql": "SELECT service_name AS 服务区名称, ROUND(AVG(pay_sum / NULLIF(customer_count, 0)), 2) AS 平均坪效 FROM (SELECT b.service_name, SUM(b.pay_sum) AS pay_sum, SUM(c.customer_count) AS customer_count FROM bss_business_day_data b JOIN bss_car_day_count c ON b.service_no = c.service_area_id WHERE b.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY b.service_name, b.oper_date HAVING SUM(b.order_sum) > 0) AS sub GROUP BY service_name ORDER BY 平均坪效 DESC;"
+  },
+  {
+    "question": "最近7天微信支付占比超过50%的档口清单",
+    "sql": "SELECT branch_name AS 档口名称, SUM(wx) / SUM(pay_sum) AS 微信占比 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - INTERVAL '7 days' AND delete_ts IS NULL GROUP BY branch_name HAVING SUM(pay_sum) > 0 AND SUM(wx)/SUM(pay_sum) > 0.5 ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "各时段(早/中/晚)档口客单价分布统计",
+    "sql": "SELECT CASE WHEN EXTRACT(HOUR FROM create_ts) < 12 THEN '上午' WHEN EXTRACT(HOUR FROM create_ts) < 18 THEN '下午' ELSE '晚上' END AS 时段, ROUND(AVG(pay_sum/order_sum), 2) AS 平均客单价 FROM bss_business_day_data WHERE delete_ts IS NULL AND order_sum > 0 GROUP BY 时段 ORDER BY 时段;"
+  },
+  {
+    "question": "车辆数量TOP5但营收低于平均值的档口预警",
+    "sql": "SELECT c.service_area_id AS 服务区ID, b.branch_name AS 档口名称, SUM(c.customer_count) AS 总车流量, SUM(b.pay_sum) AS 总营收 FROM bss_car_day_count c JOIN bss_business_day_data b ON c.service_area_id = b.service_no WHERE c.delete_ts IS NULL AND b.delete_ts IS NULL GROUP BY c.service_area_id, b.branch_name HAVING SUM(c.customer_count) > (SELECT AVG(customer_count) FROM bss_car_day_count WHERE delete_ts IS NULL) AND SUM(b.pay_sum) < (SELECT AVG(pay_sum) FROM bss_business_day_data WHERE delete_ts IS NULL) ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "各档口不同支付方式金额占比分析",
+    "sql": "SELECT branch_name AS 档口名称, ROUND(SUM(wx)/SUM(pay_sum)*100, 2) AS 微信占比, ROUND(SUM(zfb)/SUM(pay_sum)*100, 2) AS 支付宝占比, ROUND(SUM(rmb)/SUM(pay_sum)*100, 2) AS 现金占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "节假日(假设2023-04-01至2023-04-05)期间坪效波动趋势",
+    "sql": "SELECT oper_date AS 日期, ROUND(SUM(pay_sum)/SUM(c.customer_count), 2) AS 日坪效 FROM bss_business_day_data b JOIN bss_car_day_count c ON b.service_no = c.service_area_id WHERE b.oper_date BETWEEN '2023-04-01' AND '2023-04-05' AND b.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY oper_date ORDER BY 日期;"
+  },
+  {
+    "question": "各档口订单转化率(订单数/车辆数量)排名及同比变化",
+    "sql": "WITH current_period AS (SELECT branch_name, SUM(order_sum) AS orders, SUM(c.customer_count) AS customers FROM bss_business_day_data b JOIN bss_car_day_count c ON b.service_no = c.service_area_id WHERE b.oper_date >= '2023-04-01' AND b.oper_date <= '2023-04-07' GROUP BY branch_name), last_period AS (SELECT branch_name, SUM(order_sum) AS orders, SUM(c.customer_count) AS customers FROM bss_business_day_data b JOIN bss_car_day_count c ON b.service_no = c.service_area_id WHERE b.oper_date >= '2023-03-25' AND b.oper_date <= '2023-03-31' GROUP BY branch_name) SELECT c.branch_name AS 档口名称, ROUND(c.orders/c.customers, 4) AS 本期转化率, ROUND(l.orders/l.customers, 4) AS 上期转化率, ROUND((c.orders/c.customers - l.orders/l.customers)/NULLIF(l.orders/l.customers, 0)*100, 2) AS 变化率 FROM current_period c JOIN last_period l ON c.branch_name = l.branch_name ORDER BY 变化率 DESC;"
+  },
+  {
+    "question": "特定档口(如branch_name='庐山鲜徕客东区')近30天每日营收趋势",
+    "sql": "SELECT oper_date AS 日期, pay_sum AS 营收金额 FROM bss_business_day_data WHERE branch_name = '庐山鲜徕客东区' AND delete_ts IS NULL AND oper_date >= CURRENT_DATE - INTERVAL '30 days' ORDER BY 日期 ASC;"
+  },
+  {
+    "question": "节假日与非节假日各服务区平均日营收对比分析?",
+    "sql": "SELECT CASE WHEN oper_date BETWEEN '2023-04-01' AND '2023-04-05' THEN '节假日期间' ELSE '常规时段' END AS 分析时段, service_name AS 服务区名称, ROUND(AVG(pay_sum)::numeric, 2) AS 平均日营收 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 分析时段, 服务区名称 ORDER BY 平均日营收 DESC;"
+  },
+  {
+    "question": "节假日车流量TOP5服务区统计?",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-05' AND delete_ts IS NULL GROUP BY 服务区ID ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "节前/节中/节后各支付方式订单占比趋势分析?",
+    "sql": "SELECT CASE WHEN oper_date < '2023-04-01' THEN '节前' WHEN oper_date BETWEEN '2023-04-01' AND '2023-04-05' THEN '节中' ELSE '节后' END AS 阶段, ROUND(SUM(wx_order)*100/SUM(order_sum), 2) AS 微信占比, ROUND(SUM(zf_order)*100/SUM(order_sum), 2) AS 支付宝占比 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date BETWEEN '2023-03-25' AND '2023-04-10' GROUP BY 阶段 ORDER BY 阶段;"
+  },
+  {
+    "question": "春节前后一周服务区营收增长率对比(2023-01-20至2023-01-26 vs 2023-01-27至2023-02-02)?",
+    "sql": "WITH pre_period AS (SELECT service_name, SUM(pay_sum) AS 营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-20' AND '2023-01-26' GROUP BY service_name), post_period AS (SELECT service_name, SUM(pay_sum) AS 营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-27' AND '2023-02-02' GROUP BY service_name) SELECT a.service_name, ROUND((b.营收-a.营收)/a.营收*100, 2) AS 增长率 FROM pre_period a JOIN post_period b ON a.service_name = b.service_name ORDER BY 增长率 DESC;"
+  },
+  {
+    "question": "节假日不同车型车流分布占比分析?",
+    "sql": "SELECT car_type AS 车型, COUNT(*) AS 记录数, ROUND(CAST(COUNT(*) AS numeric)*100/(SELECT COUNT(*) FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-05'), 2) AS 占比百分比 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-05' AND delete_ts IS NULL GROUP BY car_type ORDER BY 记录数 DESC;"
+  },
+  {
+    "question": "节假日期间现金支付比例最高的3个服务区?",
+    "sql": "SELECT service_name, ROUND(SUM(rmb) * 100 / SUM(pay_sum), 2) AS 现金占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-05' AND delete_ts IS NULL GROUP BY service_name ORDER BY 现金占比 DESC LIMIT 3;"
+  },
+  {
+    "question": "节后三天内订单总量最低的5个服务区?",
+    "sql": "SELECT service_name, SUM(order_sum) AS 总订单量 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-06' AND '2023-04-08' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总订单量 ASC LIMIT 5;"
+  },
+  {
+    "question": "节中期间微信支付金额环比增长TOP3服务区?",
+    "sql": "SELECT service_name, oper_date, wx, LAG(wx,1) OVER (PARTITION BY service_name ORDER BY oper_date) AS 前一日, ROUND((wx - LAG(wx,1) OVER (PARTITION BY service_name ORDER BY oper_date)) / LAG(wx,1) OVER (PARTITION BY service_name ORDER BY oper_date) * 100, 2) AS 环比增长率 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-05' AND delete_ts IS NULL ORDER BY oper_date, 环比增长率 DESC;"
+  },
+  {
+    "question": "国庆黄金周车流量同比去年增长情况分析?",
+    "sql": "SELECT service_area_id, SUM(CASE WHEN count_date BETWEEN '2022-10-01' AND '2022-10-07' THEN customer_count ELSE 0 END) AS 去年车流量, SUM(CASE WHEN count_date BETWEEN '2023-10-01' AND '2023-10-07' THEN customer_count ELSE 0 END) AS 今年车流量, ROUND((SUM(CASE WHEN count_date BETWEEN '2023-10-01' AND '2023-10-07' THEN customer_count ELSE 0 END) - SUM(CASE WHEN count_date BETWEEN '2022-10-01' AND '2022-10-07' THEN customer_count ELSE 0 END)) * 100 / NULLIF(SUM(CASE WHEN count_date BETWEEN '2022-10-01' AND '2022-10-07' THEN customer_count ELSE 0 END), 0), 2) AS 增长率 FROM bss_car_day_count WHERE count_date BETWEEN '2022-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_area_id ORDER BY 增长率 DESC;"
+  },
+  {
+    "question": "节后一周内营收恢复至节前90%水平的服务区统计?",
+    "sql": "WITH pre_holiday AS (SELECT service_name, AVG(pay_sum) AS 节前均值 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-03-25' AND '2023-03-31' GROUP BY service_name), post_holiday AS (SELECT service_name, AVG(pay_sum) AS 节后均值 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-06' AND '2023-04-12' GROUP BY service_name) SELECT a.service_name, ROUND(a.节前均值, 2) AS 节前均值, ROUND(b.节后均值, 2) AS 节后均值, ROUND(b.节后均值/a.节前均值*100, 2) AS 恢复比例 FROM pre_holiday a JOIN post_holiday b ON a.service_name = b.service_name WHERE b.节后均值 >= a.节前均值 * 0.9 ORDER BY 恢复比例 DESC;"
+  },
+  {
+    "question": "统计最近一天营收-车流偏离度TOP10的服务区",
+    "sql": "SELECT b.service_name AS 服务区名称, (b.pay_sum / NULLIF(c.customer_count, 0)) AS 营收车流比 FROM bss_business_day_data b JOIN bss_car_day_count c ON b.oper_date = c.count_date AND b.service_no = c.service_area_id WHERE b.delete_ts IS NULL AND c.delete_ts IS NULL AND b.oper_date = CURRENT_DATE - 1 ORDER BY 营收车流比 DESC LIMIT 10;"
+  },
+  {
+    "question": "查找昨日微信支付订单占比低于5%的异常服务区",
+    "sql": "SELECT service_name AS 服务区名称, wx_order AS 微信订单数, order_sum AS 总订单数, (wx_order::numeric / order_sum) AS 微信占比 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date = CURRENT_DATE - 1 AND order_sum > 0 AND (wx_order::numeric / order_sum) < 0.05;"
+  },
+  {
+    "question": "分析最近一周各数据来源类型的营收分布",
+    "sql": "SELECT source_type AS 数据来源类型, COUNT(*) AS 记录数, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - 7 GROUP BY source_type ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "查询过去24小时车流量为0但存在营收记录的服务区",
+    "sql": "SELECT b.service_name AS 服务区名称, b.oper_date AS 日期, b.pay_sum AS 营收额 FROM bss_business_day_data b LEFT JOIN bss_car_day_count c ON b.oper_date = c.count_date AND b.service_no = c.service_area_id WHERE b.delete_ts IS NULL AND c.delete_ts IS NULL AND b.oper_date >= CURRENT_DATE - 1 AND (c.customer_count IS NULL OR c.customer_count = 0) AND b.pay_sum > 0;"
+  },
+  {
+    "question": "计算各服务区近7天营收偏离度的标准差",
+    "sql": "SELECT service_name AS 服务区名称, STDDEV_SAMP(pay_sum / NULLIF(customer_count, 0)) AS 营收偏离度标准差 FROM (SELECT * FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - 7) b JOIN (SELECT * FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date >= CURRENT_DATE - 7) c ON b.oper_date = c.count_date AND b.service_no = c.service_area_id GROUP BY service_name ORDER BY 营收偏离度标准差 DESC LIMIT 10;"
+  },
+  {
+    "question": "统计今日每小时的数据完整性校验结果",
+    "sql": "SELECT EXTRACT(HOUR FROM create_ts) AS 小时段, COUNT(*) AS 记录数, SUM(CASE WHEN pay_sum > 0 THEN 1 ELSE 0 END) AS 有效记录数 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date = CURRENT_DATE GROUP BY EXTRACT(HOUR FROM create_ts) ORDER BY 小时段;"
+  },
+  {
+    "question": "查找危化品车辆占比超过10%且营收异常的服务区",
+    "sql": "SELECT c.count_date AS 日期, b.service_name AS 服务区名称, c.customer_count AS 车流量, b.pay_sum AS 营收额 FROM bss_business_day_data b JOIN (SELECT * FROM bss_car_day_count WHERE car_type = '危化品' AND customer_count > 0) c ON b.oper_date = c.count_date AND b.service_no = c.service_area_id WHERE b.delete_ts IS NULL AND c.delete_ts IS NULL AND (c.customer_count::numeric / (SELECT SUM(customer_count) FROM bss_car_day_count WHERE count_date = c.count_date AND service_area_id = c.service_area_id)) > 0.1;"
+  },
+  {
+    "question": "分析连续3天营收增长但车流下降的异常服务区",
+    "sql": "WITH revenue_trend AS (SELECT service_no, oper_date, pay_sum, LEAD(pay_sum, 1, 0) OVER (PARTITION BY service_no ORDER BY oper_date) AS next_pay_sum FROM bss_business_day_data WHERE delete_ts IS NULL), car_trend AS (SELECT service_area_id, count_date, customer_count, LEAD(customer_count, 1, 0) OVER (PARTITION BY service_area_id ORDER BY count_date) AS next_count FROM bss_car_day_count WHERE delete_ts IS NULL) SELECT r.service_no AS 服务区编码 FROM revenue_trend r JOIN car_trend c ON r.service_no = c.service_area_id AND r.oper_date = c.count_date WHERE r.pay_sum > r.next_pay_sum AND c.customer_count < c.next_count GROUP BY r.service_no HAVING COUNT(*) >= 3 LIMIT 10;"
+  },
+  {
+    "question": "统计现金支付占比超过50%的异常档口",
+    "sql": "SELECT branch_name AS 档口名称, rmb AS 现金支付额, pay_sum AS 总营收, (rmb::numeric / pay_sum) AS 现金占比 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date = CURRENT_DATE - 1 AND pay_sum > 0 AND (rmb::numeric / pay_sum) > 0.5 ORDER BY 现金占比 DESC;"
+  },
+  {
+    "question": "分析城际车辆占比与营收的相关性",
+    "sql": "SELECT CORR((SELECT SUM(customer_count) FROM bss_car_day_count WHERE car_type = '城际' GROUP BY count_date), (SELECT SUM(pay_sum) FROM bss_business_day_data GROUP BY oper_date)) AS 相关系数 FROM bss_car_day_count LIMIT 1;"
+  }
+]

+ 202 - 0
data_pipeline/training_data/task_20250701_231850/qs_highway_db_20250701_234811_pair.json.backup

@@ -0,0 +1,202 @@
+[
+  {
+    "question": "统计2023-04-01各服务区总营收并按金额降序排列",
+    "sql": "SELECT service_name AS 服务区名称, SUM(pay_sum) AS 日营收总额 FROM bss_business_day_data WHERE oper_date = '2023-04-01' AND delete_ts IS NULL GROUP BY service_name ORDER BY 日营收总额 DESC;"
+  },
+  {
+    "question": "查询宜春服务区2023年4月各档口营收排名TOP5",
+    "sql": "SELECT branch_name AS 档口名称, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE service_name = '宜春服务区' AND oper_date BETWEEN '2023-04-01' AND '2023-04-30' AND delete_ts IS NULL GROUP BY 档口名称 ORDER BY 总营收 DESC LIMIT 5;"
+  },
+  {
+    "question": "分析2023-04-02各服务区微信支付占比(微信金额/总支付金额)",
+    "sql": "SELECT service_name AS 服务区名称, ROUND(SUM(wx)/SUM(pay_sum)*100, 2) AS 微信占比百分比 FROM bss_business_day_data WHERE oper_date = '2023-04-02' AND delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "对比庐山服务区2023年3月与4月日均营收变化趋势",
+    "sql": "SELECT EXTRACT(MONTH FROM oper_date) AS 月份, AVG(pay_sum) AS 日均营收 FROM bss_business_day_data WHERE service_name = '庐山服务区' AND EXTRACT(MONTH FROM oper_date) IN (3,4) AND delete_ts IS NULL GROUP BY 月份 ORDER BY 月份;"
+  },
+  {
+    "question": "统计各服务区近7天现金支付金额环比增长率(当前周-上一周)",
+    "sql": "SELECT service_name AS 服务区名称, SUM(CASE WHEN oper_date BETWEEN CURRENT_DATE - 7 AND CURRENT_DATE THEN pay_sum ELSE 0 END) - SUM(CASE WHEN oper_date BETWEEN CURRENT_DATE - 14 AND CURRENT_DATE - 8 THEN pay_sum ELSE 0 END) AS 现金增长额 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY service_name;"
+  },
+  {
+    "question": "查询各服务区支付宝订单占比超过10%的记录(按日期筛选)",
+    "sql": "SELECT oper_date AS 统计日期, service_name AS 服务区名称, SUM(zf_order) AS 支付宝订单数, SUM(order_sum) AS 总订单数 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 统计日期, 服务区名称 HAVING SUM(zf_order)/SUM(order_sum) > 0.1 ORDER BY 统计日期 DESC;"
+  },
+  {
+    "question": "分析行吧支付使用情况(订单数前10的服务区及使用率)",
+    "sql": "SELECT service_name AS 服务区名称, SUM(xs_order) AS 行吧订单数, SUM(order_sum) AS 总订单数, ROUND(SUM(xs_order)/SUM(order_sum)*100, 2) AS 使用率百分比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 服务区名称 ORDER BY 行吧订单数 DESC LIMIT 10;"
+  },
+  {
+    "question": "统计各服务区最近一天营收数据并标注数据来源类型",
+    "sql": "SELECT DISTINCT ON (service_name) service_name AS 服务区名称, oper_date AS 统计日期, pay_sum AS 营收金额, source_type AS 数据来源类型 FROM bss_business_day_data WHERE delete_ts IS NULL ORDER BY service_name, oper_date DESC;"
+  },
+  {
+    "question": "查询2023-04-01宜春南区档口各支付方式金额及占比",
+    "sql": "SELECT '微信' AS 支付方式, wx AS 金额, ROUND(wx/pay_sum*100, 2) AS 占比 FROM bss_business_day_data WHERE branch_name = '宜春南区' AND oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '支付宝', zfb, ROUND(zfb/pay_sum*100, 2) FROM bss_business_day_data WHERE branch_name = '宜春南区' AND oper_date = '2023-04-01' AND delete_ts IS NULL UNION ALL SELECT '现金', rmb, ROUND(rmb/pay_sum*100, 2) FROM bss_business_day_data WHERE branch_name = '宜春南区' AND oper_date = '2023-04-01' AND delete_ts IS NULL;"
+  },
+  {
+    "question": "分析各服务区近30天日营收标准差评估经营稳定性",
+    "sql": "SELECT service_name AS 服务区名称, STDDEV_SAMP(pay_sum) AS 日营收标准差 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - 30 AND delete_ts IS NULL GROUP BY service_name ORDER BY 标准差 DESC;"
+  },
+  {
+    "question": "统计各服务区不同车型的车流量占比,用于资源配置优化",
+    "sql": "SELECT service_area_id AS 服务区ID, car_type AS 车辆类型, SUM(customer_count) AS 总车流量, ROUND((SUM(customer_count)*100.0/SUM(SUM(customer_count)) OVER(PARTITION BY service_area_id)),2) AS 占比百分比 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id, car_type ORDER BY service_area_id, 总车流量 DESC;"
+  },
+  {
+    "question": "分析2023年4月各服务区每日车流量趋势变化,识别高峰期",
+    "sql": "SELECT count_date AS 统计日期, service_area_id AS 服务区ID, SUM(customer_count) AS 日车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY count_date, service_area_id ORDER BY count_date;"
+  },
+  {
+    "question": "查询2023年4月1日当日车流量最高的前5个服务区",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 当日车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date = '2023-04-01' GROUP BY service_area_id ORDER BY 当日车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "统计各服务区危化品车辆出现频次,用于安全管理评估",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 危化品车辆总数 FROM bss_car_day_count WHERE delete_ts IS NULL AND car_type = '危化品' GROUP BY service_area_id ORDER BY 危化品车辆总数 DESC;"
+  },
+  {
+    "question": "分析各服务区车流量月环比增长趋势(按2023年Q2数据)",
+    "sql": "SELECT service_area_id AS 服务区ID, month AS 月份, total_count AS 当月车流量, prev_month_count AS 上月车流量, ROUND(((total_count-prev_month_count)*100.0/prev_month_count),2) AS 环比增长率 FROM (SELECT service_area_id, DATE_TRUNC('month', count_date) AS month, SUM(customer_count) AS total_count, LAG(SUM(customer_count)) OVER(PARTITION BY service_area_id ORDER BY DATE_TRUNC('month', count_date)) AS prev_month_count FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date BETWEEN '2023-04-01' AND '2023-06-30' GROUP BY service_area_id, DATE_TRUNC('month', count_date)) AS monthly_data ORDER BY service_area_id, 月份;"
+  },
+  {
+    "question": "对比周末与工作日各服务区平均车流量差异",
+    "sql": "SELECT service_area_id AS 服务区ID, AVG(CASE WHEN EXTRACT(ISODOW FROM count_date) IN (6,7) THEN customer_count END) AS 周末日均车流量, AVG(CASE WHEN EXTRACT(ISODOW FROM count_date) NOT IN (6,7) THEN customer_count END) AS 工作日均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id;"
+  },
+  {
+    "question": "查询2023年4月城际车辆流量最高的前3个服务区",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 城际车流量 FROM bss_car_day_count WHERE delete_ts IS NULL AND car_type = '城际' AND count_date BETWEEN '2023-04-01' AND '2023-04-30' GROUP BY service_area_id ORDER BY 城际车流量 DESC LIMIT 3;"
+  },
+  {
+    "question": "分析各服务区不同车型的月均车流量分布",
+    "sql": "SELECT service_area_id AS 服务区ID, car_type AS 车辆类型, ROUND(AVG(customer_count), 2) AS 月均车流量 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id, car_type ORDER BY service_area_id, 月均车流量 DESC;"
+  },
+  {
+    "question": "识别最近7天各服务区车流量TOP3日期",
+    "sql": "SELECT * FROM (SELECT service_area_id, count_date AS 统计日期, customer_count AS 车流量, ROW_NUMBER() OVER(PARTITION BY service_area_id ORDER BY customer_count DESC) AS rn FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date >= CURRENT_DATE - 7) t WHERE rn <= 3 ORDER BY service_area_id, 车流量 DESC;"
+  },
+  {
+    "question": "统计各服务区过境车辆与城际车辆流量比值,分析交通特性",
+    "sql": "SELECT service_area_id AS 服务区ID, (SUM(CASE WHEN car_type = '过境' THEN customer_count ELSE 0 END) * 1.0 / NULLIF(SUM(CASE WHEN car_type = '城际' THEN customer_count ELSE 0 END), 0)) AS 过境城际比 FROM bss_car_day_count WHERE delete_ts IS NULL GROUP BY service_area_id ORDER BY 过境城际比 DESC;"
+  },
+  {
+    "question": "各档口单位车流的坪效对比(总支付金额/车辆数量)排名TOP10",
+    "sql": "SELECT b.branch_name AS 档口名称, ROUND(SUM(b.pay_sum) / SUM(c.customer_count), 2) AS 坪效 FROM bss_business_day_data b JOIN bss_car_day_count c ON b.service_no = c.service_area_id WHERE b.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY b.branch_name ORDER BY 坪效 DESC LIMIT 10;"
+  },
+  {
+    "question": "各档口客单价(总支付金额/订单总数)最高前5名",
+    "sql": "SELECT branch_name AS 档口名称, ROUND(SUM(pay_sum) / SUM(order_sum), 2) AS 客单价 FROM bss_business_day_data WHERE delete_ts IS NULL AND order_sum > 0 GROUP BY branch_name ORDER BY 客单价 DESC LIMIT 5;"
+  },
+  {
+    "question": "不同服务区档口的平均坪效对比(排除0订单数据)",
+    "sql": "SELECT service_name AS 服务区名称, ROUND(AVG(pay_sum / NULLIF(customer_count, 0)), 2) AS 平均坪效 FROM (SELECT b.service_name, SUM(b.pay_sum) AS pay_sum, SUM(c.customer_count) AS customer_count FROM bss_business_day_data b JOIN bss_car_day_count c ON b.service_no = c.service_area_id WHERE b.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY b.service_name, b.oper_date HAVING SUM(b.order_sum) > 0) AS sub GROUP BY service_name ORDER BY 平均坪效 DESC;"
+  },
+  {
+    "question": "最近7天微信支付占比超过50%的档口清单",
+    "sql": "SELECT branch_name AS 档口名称, SUM(wx) / SUM(pay_sum) AS 微信占比 FROM bss_business_day_data WHERE oper_date >= CURRENT_DATE - INTERVAL '7 days' AND delete_ts IS NULL GROUP BY branch_name HAVING SUM(pay_sum) > 0 AND SUM(wx)/SUM(pay_sum) > 0.5 ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "各时段(早/中/晚)档口客单价分布统计",
+    "sql": "SELECT CASE WHEN EXTRACT(HOUR FROM create_ts) < 12 THEN '上午' WHEN EXTRACT(HOUR FROM create_ts) < 18 THEN '下午' ELSE '晚上' END AS 时段, ROUND(AVG(pay_sum/order_sum), 2) AS 平均客单价 FROM bss_business_day_data WHERE delete_ts IS NULL AND order_sum > 0 GROUP BY 时段 ORDER BY 时段;"
+  },
+  {
+    "question": "车辆数量TOP5但营收低于平均值的档口预警",
+    "sql": "SELECT c.service_area_id AS 服务区ID, b.branch_name AS 档口名称, SUM(c.customer_count) AS 总车流量, SUM(b.pay_sum) AS 总营收 FROM bss_car_day_count c JOIN bss_business_day_data b ON c.service_area_id = b.service_no WHERE c.delete_ts IS NULL AND b.delete_ts IS NULL GROUP BY c.service_area_id, b.branch_name HAVING SUM(c.customer_count) > (SELECT AVG(customer_count) FROM bss_car_day_count WHERE delete_ts IS NULL) AND SUM(b.pay_sum) < (SELECT AVG(pay_sum) FROM bss_business_day_data WHERE delete_ts IS NULL) ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "各档口不同支付方式金额占比分析",
+    "sql": "SELECT branch_name AS 档口名称, ROUND(SUM(wx)/SUM(pay_sum)*100, 2) AS 微信占比, ROUND(SUM(zfb)/SUM(pay_sum)*100, 2) AS 支付宝占比, ROUND(SUM(rmb)/SUM(pay_sum)*100, 2) AS 现金占比 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY branch_name ORDER BY 微信占比 DESC;"
+  },
+  {
+    "question": "节假日(假设2023-04-01至2023-04-05)期间坪效波动趋势",
+    "sql": "SELECT oper_date AS 日期, ROUND(SUM(pay_sum)/SUM(c.customer_count), 2) AS 日坪效 FROM bss_business_day_data b JOIN bss_car_day_count c ON b.service_no = c.service_area_id WHERE b.oper_date BETWEEN '2023-04-01' AND '2023-04-05' AND b.delete_ts IS NULL AND c.delete_ts IS NULL GROUP BY oper_date ORDER BY 日期;"
+  },
+  {
+    "question": "各档口订单转化率(订单数/车辆数量)排名及同比变化",
+    "sql": "WITH current_period AS (SELECT branch_name, SUM(order_sum) AS orders, SUM(c.customer_count) AS customers FROM bss_business_day_data b JOIN bss_car_day_count c ON b.service_no = c.service_area_id WHERE b.oper_date >= '2023-04-01' AND b.oper_date <= '2023-04-07' GROUP BY branch_name), last_period AS (SELECT branch_name, SUM(order_sum) AS orders, SUM(c.customer_count) AS customers FROM bss_business_day_data b JOIN bss_car_day_count c ON b.service_no = c.service_area_id WHERE b.oper_date >= '2023-03-25' AND b.oper_date <= '2023-03-31' GROUP BY branch_name) SELECT c.branch_name AS 档口名称, ROUND(c.orders/c.customers, 4) AS 本期转化率, ROUND(l.orders/l.customers, 4) AS 上期转化率, ROUND((c.orders/c.customers - l.orders/l.customers)/NULLIF(l.orders/l.customers, 0)*100, 2) AS 变化率 FROM current_period c JOIN last_period l ON c.branch_name = l.branch_name ORDER BY 变化率 DESC;"
+  },
+  {
+    "question": "特定档口(如branch_name='庐山鲜徕客东区')近30天每日营收趋势",
+    "sql": "SELECT oper_date AS 日期, pay_sum AS 营收金额 FROM bss_business_day_data WHERE branch_name = '庐山鲜徕客东区' AND delete_ts IS NULL AND oper_date >= CURRENT_DATE - INTERVAL '30 days' ORDER BY 日期 ASC;"
+  },
+  {
+    "question": "节假日与非节假日各服务区平均日营收对比分析?",
+    "sql": "SELECT CASE WHEN oper_date BETWEEN '2023-04-01' AND '2023-04-05' THEN '节假日期间' ELSE '常规时段' END AS 分析时段, service_name AS 服务区名称, ROUND(AVG(pay_sum)::numeric, 2) AS 平均日营收 FROM bss_business_day_data WHERE delete_ts IS NULL GROUP BY 分析时段, 服务区名称 ORDER BY 平均日营收 DESC;"
+  },
+  {
+    "question": "节假日车流量TOP5服务区统计?",
+    "sql": "SELECT service_area_id AS 服务区ID, SUM(customer_count) AS 总车流量 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-05' AND delete_ts IS NULL GROUP BY 服务区ID ORDER BY 总车流量 DESC LIMIT 5;"
+  },
+  {
+    "question": "节前/节中/节后各支付方式订单占比趋势分析?",
+    "sql": "SELECT CASE WHEN oper_date < '2023-04-01' THEN '节前' WHEN oper_date BETWEEN '2023-04-01' AND '2023-04-05' THEN '节中' ELSE '节后' END AS 阶段, ROUND(SUM(wx_order)*100/SUM(order_sum), 2) AS 微信占比, ROUND(SUM(zf_order)*100/SUM(order_sum), 2) AS 支付宝占比 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date BETWEEN '2023-03-25' AND '2023-04-10' GROUP BY 阶段 ORDER BY 阶段;"
+  },
+  {
+    "question": "春节前后一周服务区营收增长率对比(2023-01-20至2023-01-26 vs 2023-01-27至2023-02-02)?",
+    "sql": "WITH pre_period AS (SELECT service_name, SUM(pay_sum) AS 营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-20' AND '2023-01-26' GROUP BY service_name), post_period AS (SELECT service_name, SUM(pay_sum) AS 营收 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-01-27' AND '2023-02-02' GROUP BY service_name) SELECT a.service_name, ROUND((b.营收-a.营收)/a.营收*100, 2) AS 增长率 FROM pre_period a JOIN post_period b ON a.service_name = b.service_name ORDER BY 增长率 DESC;"
+  },
+  {
+    "question": "节假日不同车型车流分布占比分析?",
+    "sql": "SELECT car_type AS 车型, COUNT(*) AS 记录数, ROUND(CAST(COUNT(*) AS numeric)*100/(SELECT COUNT(*) FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-05'), 2) AS 占比百分比 FROM bss_car_day_count WHERE count_date BETWEEN '2023-04-01' AND '2023-04-05' AND delete_ts IS NULL GROUP BY car_type ORDER BY 记录数 DESC;"
+  },
+  {
+    "question": "节假日期间现金支付比例最高的3个服务区?",
+    "sql": "SELECT service_name, ROUND(SUM(rmb) * 100 / SUM(pay_sum), 2) AS 现金占比 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-05' AND delete_ts IS NULL GROUP BY service_name ORDER BY 现金占比 DESC LIMIT 3;"
+  },
+  {
+    "question": "节后三天内订单总量最低的5个服务区?",
+    "sql": "SELECT service_name, SUM(order_sum) AS 总订单量 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-06' AND '2023-04-08' AND delete_ts IS NULL GROUP BY service_name ORDER BY 总订单量 ASC LIMIT 5;"
+  },
+  {
+    "question": "节中期间微信支付金额环比增长TOP3服务区?",
+    "sql": "SELECT service_name, oper_date, wx, LAG(wx,1) OVER (PARTITION BY service_name ORDER BY oper_date) AS 前一日, ROUND((wx - LAG(wx,1) OVER (PARTITION BY service_name ORDER BY oper_date)) / LAG(wx,1) OVER (PARTITION BY service_name ORDER BY oper_date) * 100, 2) AS 环比增长率 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-01' AND '2023-04-05' AND delete_ts IS NULL ORDER BY oper_date, 环比增长率 DESC;"
+  },
+  {
+    "question": "国庆黄金周车流量同比去年增长情况分析?",
+    "sql": "SELECT service_area_id, SUM(CASE WHEN count_date BETWEEN '2022-10-01' AND '2022-10-07' THEN customer_count ELSE 0 END) AS 去年车流量, SUM(CASE WHEN count_date BETWEEN '2023-10-01' AND '2023-10-07' THEN customer_count ELSE 0 END) AS 今年车流量, ROUND((SUM(CASE WHEN count_date BETWEEN '2023-10-01' AND '2023-10-07' THEN customer_count ELSE 0 END) - SUM(CASE WHEN count_date BETWEEN '2022-10-01' AND '2022-10-07' THEN customer_count ELSE 0 END)) * 100 / NULLIF(SUM(CASE WHEN count_date BETWEEN '2022-10-01' AND '2022-10-07' THEN customer_count ELSE 0 END), 0), 2) AS 增长率 FROM bss_car_day_count WHERE count_date BETWEEN '2022-10-01' AND '2023-10-07' AND delete_ts IS NULL GROUP BY service_area_id ORDER BY 增长率 DESC;"
+  },
+  {
+    "question": "节后一周内营收恢复至节前90%水平的服务区统计?",
+    "sql": "WITH pre_holiday AS (SELECT service_name, AVG(pay_sum) AS 节前均值 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-03-25' AND '2023-03-31' GROUP BY service_name), post_holiday AS (SELECT service_name, AVG(pay_sum) AS 节后均值 FROM bss_business_day_data WHERE oper_date BETWEEN '2023-04-06' AND '2023-04-12' GROUP BY service_name) SELECT a.service_name, ROUND(a.节前均值, 2) AS 节前均值, ROUND(b.节后均值, 2) AS 节后均值, ROUND(b.节后均值/a.节前均值*100, 2) AS 恢复比例 FROM pre_holiday a JOIN post_holiday b ON a.service_name = b.service_name WHERE b.节后均值 >= a.节前均值 * 0.9 ORDER BY 恢复比例 DESC;"
+  },
+  {
+    "question": "统计最近一天营收-车流偏离度TOP10的服务区",
+    "sql": "SELECT b.service_name AS 服务区名称, (b.pay_sum / NULLIF(c.customer_count, 0)) AS 营收车流比 FROM bss_business_day_data b JOIN bss_car_day_count c ON b.oper_date = c.count_date AND b.service_no = c.service_area_id WHERE b.delete_ts IS NULL AND c.delete_ts IS NULL AND b.oper_date = CURRENT_DATE - 1 ORDER BY 营收车流比 DESC LIMIT 10;"
+  },
+  {
+    "question": "查找昨日微信支付订单占比低于5%的异常服务区",
+    "sql": "SELECT service_name AS 服务区名称, wx_order AS 微信订单数, order_sum AS 总订单数, (wx_order::numeric / order_sum) AS 微信占比 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date = CURRENT_DATE - 1 AND order_sum > 0 AND (wx_order::numeric / order_sum) < 0.05;"
+  },
+  {
+    "question": "分析最近一周各数据来源类型的营收分布",
+    "sql": "SELECT source_type AS 数据来源类型, COUNT(*) AS 记录数, SUM(pay_sum) AS 总营收 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - 7 GROUP BY source_type ORDER BY 总营收 DESC;"
+  },
+  {
+    "question": "查询过去24小时车流量为0但存在营收记录的服务区",
+    "sql": "SELECT b.service_name AS 服务区名称, b.oper_date AS 日期, b.pay_sum AS 营收额 FROM bss_business_day_data b LEFT JOIN bss_car_day_count c ON b.oper_date = c.count_date AND b.service_no = c.service_area_id WHERE b.delete_ts IS NULL AND c.delete_ts IS NULL AND b.oper_date >= CURRENT_DATE - 1 AND (c.customer_count IS NULL OR c.customer_count = 0) AND b.pay_sum > 0;"
+  },
+  {
+    "question": "计算各服务区近7天营收偏离度的标准差",
+    "sql": "SELECT service_name AS 服务区名称, STDDEV_SAMP(pay_sum / NULLIF(customer_count, 0)) AS 营收偏离度标准差 FROM (SELECT * FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date >= CURRENT_DATE - 7) b JOIN (SELECT * FROM bss_car_day_count WHERE delete_ts IS NULL AND count_date >= CURRENT_DATE - 7) c ON b.oper_date = c.count_date AND b.service_no = c.service_area_id GROUP BY service_name ORDER BY 标准差 DESC LIMIT 10;"
+  },
+  {
+    "question": "统计今日每小时的数据完整性校验结果",
+    "sql": "SELECT EXTRACT(HOUR FROM create_ts) AS 小时段, COUNT(*) AS 记录数, SUM(CASE WHEN pay_sum > 0 THEN 1 ELSE 0 END) AS 有效记录数 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date = CURRENT_DATE GROUP BY EXTRACT(HOUR FROM create_ts) ORDER BY 小时段;"
+  },
+  {
+    "question": "查找危化品车辆占比超过10%且营收异常的服务区",
+    "sql": "SELECT c.count_date AS 日期, b.service_name AS 服务区名称, c.customer_count AS 车流量, b.pay_sum AS 营收额 FROM bss_business_day_data b JOIN (SELECT * FROM bss_car_day_count WHERE car_type = '危化品' AND customer_count > 0) c ON b.oper_date = c.count_date AND b.service_no = c.service_area_id WHERE b.delete_ts IS NULL AND c.delete_ts IS NULL AND (c.customer_count::numeric / (SELECT SUM(customer_count) FROM bss_car_day_count WHERE count_date = c.count_date AND service_area_id = c.service_area_id)) > 0.1;"
+  },
+  {
+    "question": "分析连续3天营收增长但车流下降的异常服务区",
+    "sql": "WITH revenue_trend AS (SELECT service_no, oper_date, pay_sum, LEAD(pay_sum, 1, 0) OVER (PARTITION BY service_no ORDER BY oper_date) AS next_pay_sum FROM bss_business_day_data WHERE delete_ts IS NULL), car_trend AS (SELECT service_area_id, count_date, customer_count, LEAD(customer_count, 1, 0) OVER (PARTITION BY service_area_id ORDER BY count_date) AS next_count FROM bss_car_day_count WHERE delete_ts IS NULL) SELECT r.service_no AS 服务区编码 FROM revenue_trend r JOIN car_trend c ON r.service_no = c.service_area_id AND r.oper_date = c.count_date WHERE r.pay_sum > r.next_pay_sum AND c.customer_count < c.next_count GROUP BY r.service_no HAVING COUNT(*) >= 3 LIMIT 10;"
+  },
+  {
+    "question": "统计现金支付占比超过50%的异常档口",
+    "sql": "SELECT branch_name AS 档口名称, rmb AS 现金支付额, pay_sum AS 总营收, (rmb::numeric / pay_sum) AS 现金占比 FROM bss_business_day_data WHERE delete_ts IS NULL AND oper_date = CURRENT_DATE - 1 AND pay_sum > 0 AND (rmb::numeric / pay_sum) > 0.5 ORDER BY 现金占比 DESC;"
+  },
+  {
+    "question": "分析城际车辆占比与营收的相关性",
+    "sql": "SELECT CORR((SELECT SUM(customer_count) FROM bss_car_day_count WHERE car_type = '城际' GROUP BY count_date), (SELECT SUM(pay_sum) FROM bss_business_day_data GROUP BY oper_date)) AS 相关系数 FROM bss_car_day_count LIMIT 1;"
+  }
+]
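Note: the pair file above is the raw output of the question_sql_generation step. As a quick reader-side sanity check (separate from the pipeline's own sql_validation step), each SQL string can be dry-run with EXPLAIN so that syntax and column errors surface without scanning data. The sketch below is illustrative only and assumes psycopg2 is installed; the DSN and file path are placeholders to adapt, not values taken from the pipeline.

# Minimal sketch (not the pipeline's own validator): dry-run every SQL in a
# question/SQL pair file via EXPLAIN. DSN and PAIR_FILE are hypothetical.
import json
import psycopg2

DSN = "postgresql://postgres:postgres@localhost:5432/highway_db"  # hypothetical DSN
PAIR_FILE = "qs_highway_db_20250701_234811_pair.json"

def check_pairs(dsn: str, pair_file: str) -> None:
    with open(pair_file, encoding="utf-8") as f:
        pairs = json.load(f)
    conn = psycopg2.connect(dsn)
    try:
        for item in pairs:
            with conn.cursor() as cur:
                try:
                    cur.execute("EXPLAIN " + item["sql"])  # plan only, no data scan
                    conn.rollback()
                except Exception as exc:
                    conn.rollback()  # clear the aborted transaction before continuing
                    print(f"INVALID: {item['question']}\n  {exc}")
    finally:
        conn.close()

if __name__ == "__main__":
    check_pairs(DSN, PAIR_FILE)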

+ 13 - 0
data_pipeline/training_data/task_20250701_231850/table_list.txt

@@ -0,0 +1,13 @@
+# 示例表清单文件
+# 每行一个表名,支持 schema.table 格式
+# 以 # 开头的行为注释
+
+# 服务区相关表
+bss_car_day_count
+bss_business_day_data
+#bss_company
+#bss_section_route
+#bss_section_route_area_link
+#bss_service_area
+#bss_service_area_mapper
+
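Note: table_list.txt uses a simple line-oriented format: one table per line, optional schema.table qualification, '#' starts a comment, blank lines are ignored. A minimal parsing sketch under those assumptions follows; it is for illustration and is not the pipeline's own loader.

# Minimal sketch: parse a table list in the format shown above.
from pathlib import Path

def parse_table_list(path: str) -> list[str]:
    tables = []
    for raw in Path(path).read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue  # skip blank lines and commented-out tables
        tables.append(line)
    return tables

# e.g. parse_table_list("data_pipeline/training_data/task_20250701_231850/table_list.txt")
# -> ['bss_car_day_count', 'bss_business_day_data']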

+ 15 - 0
data_pipeline/training_data/task_20250701_231850/task_config.json

@@ -0,0 +1,15 @@
+{
+  "task_id": "task_20250701_231850",
+  "created_at": "2025-07-01T23:18:50.085424",
+  "parameters": {
+    "db_connection": "postgresql://postgres:postgres@192.168.67.1:6432/highway_db",
+    "table_list_file": "{task_directory}/table_list.txt",
+    "business_context": "高速公路服务区管理系统",
+    "file_upload_mode": true,
+    "enable_llm_repair": true,
+    "modify_original_file": true,
+    "enable_sql_validation": true,
+    "enable_training_data_load": true
+  },
+  "output_directory": "data_pipeline\\training_data\\task_20250701_231850"
+}
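Note: in task_config.json the table_list_file value keeps the literal {task_directory} placeholder, to be resolved against the task's output directory at run time. Below is a minimal reader-side sketch of loading the config and substituting that placeholder; the API itself may resolve it differently.

# Minimal sketch: load task_config.json and resolve the {task_directory} placeholder.
import json
from pathlib import Path

def load_task_config(task_dir: str) -> dict:
    cfg = json.loads((Path(task_dir) / "task_config.json").read_text(encoding="utf-8"))
    params = cfg["parameters"]
    # substitute the placeholder with the actual task directory
    params["table_list_file"] = params["table_list_file"].replace("{task_directory}", task_dir)
    return cfg

# cfg = load_task_config("data_pipeline/training_data/task_20250701_231850")
# cfg["parameters"]["table_list_file"]
# -> 'data_pipeline/training_data/task_20250701_231850/table_list.txt'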

+ 117 - 0
data_pipeline/training_data/task_20250701_231850/task_result.json

@@ -0,0 +1,117 @@
+{
+  "success": true,
+  "workflow_state": {
+    "start_time": null,
+    "end_time": null,
+    "current_step": "training_data_load",
+    "completed_steps": [
+      "ddl_md_generation",
+      "question_sql_generation",
+      "sql_validation",
+      "training_data_load"
+    ],
+    "failed_steps": [],
+    "artifacts": {
+      "ddl_md_generation": {
+        "total_tables": 2,
+        "processed_successfully": 2,
+        "failed": 0,
+        "files_generated": 4,
+        "duration": 144.94102311134338
+      },
+      "question_sql_generation": {
+        "output_file": "data_pipeline\\training_data\\task_20250701_231850\\qs_highway_db_20250701_234811_pair.json",
+        "total_questions": 50,
+        "total_themes": 5,
+        "successful_themes": 5,
+        "failed_themes": [],
+        "duration": 572.6270577907562
+      },
+      "sql_validation": {
+        "original_sql_count": 50,
+        "valid_sql_count": 50,
+        "invalid_sql_count": 0,
+        "success_rate": 1.0,
+        "repair_stats": {
+          "attempted": 2,
+          "successful": 2,
+          "failed": 0
+        },
+        "file_modification_stats": {
+          "modified": 2,
+          "deleted": 0,
+          "failed_modifications": 0
+        },
+        "average_execution_time": 0.03857877254486084,
+        "total_retries": 0,
+        "duration": 21.42849063873291
+      },
+      "training_data_load": {
+        "training_data_dir": "data_pipeline\\training_data\\task_20250701_231850",
+        "load_successful": true,
+        "total_records": 446,
+        "data_type_counts": {
+          "sql": 397,
+          "documentation": 26,
+          "ddl": 22,
+          "error_sql": 1
+        },
+        "duration": 80.00725603103638
+      }
+    },
+    "statistics": {
+      "step1_duration": 144.94102311134338,
+      "step2_duration": 572.6270577907562,
+      "step3_duration": 21.42849063873291,
+      "step4_duration": 80.00725603103638
+    }
+  },
+  "artifacts": {
+    "ddl_md_generation": {
+      "total_tables": 2,
+      "processed_successfully": 2,
+      "failed": 0,
+      "files_generated": 4,
+      "duration": 144.94102311134338
+    },
+    "question_sql_generation": {
+      "output_file": "data_pipeline\\training_data\\task_20250701_231850\\qs_highway_db_20250701_234811_pair.json",
+      "total_questions": 50,
+      "total_themes": 5,
+      "successful_themes": 5,
+      "failed_themes": [],
+      "duration": 572.6270577907562
+    },
+    "sql_validation": {
+      "original_sql_count": 50,
+      "valid_sql_count": 50,
+      "invalid_sql_count": 0,
+      "success_rate": 1.0,
+      "repair_stats": {
+        "attempted": 2,
+        "successful": 2,
+        "failed": 0
+      },
+      "file_modification_stats": {
+        "modified": 2,
+        "deleted": 0,
+        "failed_modifications": 0
+      },
+      "average_execution_time": 0.03857877254486084,
+      "total_retries": 0,
+      "duration": 21.42849063873291
+    },
+    "training_data_load": {
+      "training_data_dir": "data_pipeline\\training_data\\task_20250701_231850",
+      "load_successful": true,
+      "total_records": 446,
+      "data_type_counts": {
+        "sql": 397,
+        "documentation": 26,
+        "ddl": 22,
+        "error_sql": 1
+      },
+      "duration": 80.00725603103638
+    }
+  }
+}
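Note: task_result.json repeats the per-step artifacts both inside workflow_state and at the top level, and the statistics block carries one duration per step. A small reader-side helper, sketched against the field names shown above (not part of the pipeline itself), can summarize a finished task:

# Minimal sketch: print a per-step duration summary from task_result.json.
import json
from pathlib import Path

def summarize_result(task_dir: str) -> None:
    result = json.loads((Path(task_dir) / "task_result.json").read_text(encoding="utf-8"))
    state = result["workflow_state"]
    print("success:", result["success"])
    print("completed steps:", ", ".join(state["completed_steps"]))
    total = 0.0
    for step, seconds in state["statistics"].items():
        total += seconds
        print(f"  {step}: {seconds:.1f}s")
    print(f"total: {total:.1f}s")

# summarize_result("data_pipeline/training_data/task_20250701_231850")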