@@ -499,21 +499,6 @@ def prepare_dag_schedule(**kwargs):
     kwargs['ti'].xcom_push(key='execution_plan', value=json.dumps(execution_plan, default=json_serial))
     logger.info(f"Prepared the execution plan with {len(resource_tasks)} resource-table tasks and {len(model_tasks)} model-table tasks")

-    # Save the execution plan to a file
-    try:
-        plan_path = os.path.join(os.path.dirname(__file__), 'last_execution_plan.json')
-        with open(plan_path, 'w') as f:
-            json.dump(execution_plan, f, default=json_serial, indent=2)
-        logger.info(f"Saved the execution plan to file: {plan_path}")
-
-        # Create the ready marker file
-        ready_path = f"{plan_path}.ready"
-        with open(ready_path, 'w') as f:
-            f.write(datetime.now().isoformat())
-        logger.info(f"Created ready marker file: {ready_path}")
-    except Exception as file_e:
-        logger.error(f"Error while saving the execution plan to a file: {str(file_e)}")
-
     return inserted_count

 def check_execution_plan_file(**kwargs):
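Both xcom_push calls in this file serialize the plan with default=json_serial, a helper that is not shown in this diff. For orientation, a minimal sketch of what such a serializer typically looks like (an assumption inferred from usage, not the repository's actual definition):

from datetime import date, datetime

def json_serial(obj):
    # Fallback for json.dumps: render date/datetime values as ISO-8601 strings
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} is not JSON serializable")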
@@ -732,43 +717,12 @@ def create_execution_plan(**kwargs):
                 "dependencies": dependencies
             }

-            # Save the execution plan
+            # Save the execution plan to XCom
             kwargs['ti'].xcom_push(key='execution_plan', value=json.dumps(new_execution_plan, default=json_serial))
             logger.info(f"Created a new execution plan with {len(resource_tasks)} resource-table tasks and {len(model_tasks)} model-table tasks")

-            # Save the execution plan to a file
-            try:
-                plan_path = os.path.join(os.path.dirname(__file__), 'last_execution_plan.json')
-                with open(plan_path, 'w') as f:
-                    json.dump(new_execution_plan, f, default=json_serial, indent=2)
-                logger.info(f"Saved the execution plan to file: {plan_path}")
-
-                # Create the ready marker file
-                ready_path = f"{plan_path}.ready"
-                with open(ready_path, 'w') as f:
-                    f.write(datetime.now().isoformat())
-                logger.info(f"Created ready marker file: {ready_path}")
-            except Exception as file_e:
-                logger.error(f"Error while saving the execution plan to a file: {str(file_e)}")
-
             return json.dumps(new_execution_plan, default=json_serial)

-        # If an execution plan was fetched from XCom, save it to a file as well
-        try:
-            plan_json = json.loads(execution_plan) if isinstance(execution_plan, str) else execution_plan
-            plan_path = os.path.join(os.path.dirname(__file__), 'last_execution_plan.json')
-            with open(plan_path, 'w') as f:
-                json.dump(plan_json, f, default=json_serial, indent=2)
-            logger.info(f"Saved the execution plan fetched from XCom to file: {plan_path}")
-
-            # Create the ready marker file
-            ready_path = f"{plan_path}.ready"
-            with open(ready_path, 'w') as f:
-                f.write(datetime.now().isoformat())
-            logger.info(f"Created ready marker file: {ready_path}")
-        except Exception as file_e:
-            logger.error(f"Error while saving the execution plan fetched from XCom to a file: {str(file_e)}")
-
         logger.info(f"Successfully fetched the execution plan")
         return execution_plan
     except Exception as e:
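With the file round-trip removed, XCom is the only hand-off between the planning task and downstream consumers. A minimal sketch of the consumer side, assuming the plan was pushed by a task with id create_execution_plan (that task id and the callable name are illustrative, not taken from this diff):

def consume_execution_plan(**kwargs):
    # Pull the JSON string pushed under the 'execution_plan' key and decode it
    plan_json = kwargs['ti'].xcom_pull(task_ids='create_execution_plan', key='execution_plan')
    return json.loads(plan_json) if isinstance(plan_json, str) else plan_json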
@@ -781,21 +735,6 @@ def create_execution_plan(**kwargs):
             "dependencies": {}
         }

-        # Try to save the empty execution plan to a file
-        try:
-            plan_path = os.path.join(os.path.dirname(__file__), 'last_execution_plan.json')
-            with open(plan_path, 'w') as f:
-                json.dump(empty_plan, f, default=json_serial, indent=2)
-            logger.info(f"Saved the empty execution plan to file: {plan_path}")
-
-            # Create the ready marker file
-            ready_path = f"{plan_path}.ready"
-            with open(ready_path, 'w') as f:
-                f.write(datetime.now().isoformat())
-            logger.info(f"Created ready marker file: {ready_path}")
-        except Exception as file_e:
-            logger.error(f"Error while saving the empty execution plan to a file: {str(file_e)}")
-
         return json.dumps(empty_plan, default=json_serial)

 def process_resource(target_table, script_name, script_exec_mode, exec_date):
@@ -1332,6 +1271,83 @@ def summarize_execution(**kwargs):
         # Return a simple error report so the task itself does not fail
         return f"Error while summarizing execution: {str(e)}"

+# Added helper: fetch the execution plan from the database
+def get_execution_plan_from_db(ds):
+    """
+    Fetch the execution plan from the airflow_exec_plans table in the database.
+
+    Parameters:
+        ds (str): execution date in 'YYYY-MM-DD' format
+
+    Returns:
+        dict: the execution plan dict, or None if no record is found
+    """
+    logger.info(f"Trying to fetch the execution plan for execution date {ds} from the database")
+    conn = get_pg_conn()
+    cursor = conn.cursor()
+    execution_plan = None
+
+    try:
+        # Condition a: rows whose ds equals the current date; if there are several, take the latest insert_time
+        cursor.execute("""
+            SELECT plan, run_id, insert_time
+            FROM airflow_exec_plans
+            WHERE dag_id = 'dag_dataops_pipeline_prepare_scheduler' AND ds = %s
+            ORDER BY insert_time DESC
+            LIMIT 1
+        """, (ds,))
+        result = cursor.fetchone()
+
+        if result:
+            # Unpack the plan, run_id and insert_time
+            plan_json, run_id, insert_time = result
+            logger.info(f"Found an execution plan record for the current date ds={ds}, run_id: {run_id}, insert_time: {insert_time}")
+
+            # plan_json may already be a dict (e.g. when the column is JSONB)
+            if isinstance(plan_json, dict):
+                execution_plan = plan_json
+            else:
+                execution_plan = json.loads(plan_json)
+
+            return execution_plan
+
+        # Condition b: no record for the current date; fall back to the latest record with ds < the current ds
+        logger.info(f"No execution plan record found for the current date ds={ds}; looking for historical records")
+        cursor.execute("""
+            SELECT plan, run_id, insert_time, ds
+            FROM airflow_exec_plans
+            WHERE dag_id = 'dag_dataops_pipeline_prepare_scheduler' AND ds < %s
+            ORDER BY ds DESC, insert_time DESC
+            LIMIT 1
+        """, (ds,))
+        result = cursor.fetchone()
+
+        if result:
+            # Unpack the plan, run_id, insert_time and ds
+            plan_json, run_id, insert_time, plan_ds = result
+            logger.info(f"Found a historical execution plan record, ds: {plan_ds}, run_id: {run_id}, insert_time: {insert_time}")
+
+            # plan_json may already be a dict (e.g. when the column is JSONB)
+            if isinstance(plan_json, dict):
+                execution_plan = plan_json
+            else:
+                execution_plan = json.loads(plan_json)
+
+            return execution_plan
+
+        # No execution plan record found at all
+        logger.error(f"No execution plan record found in the database, current DAG ds={ds}")
+        return None
+
+    except Exception as e:
+        logger.error(f"Error while fetching the execution plan from the database: {str(e)}")
+        import traceback
+        logger.error(traceback.format_exc())
+        return None
+    finally:
+        cursor.close()
+        conn.close()
+
 # Create the DAG
 with DAG(
     "dag_dataops_pipeline_data_scheduler",
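Since the helper only needs a database connection and a logger, it can be exercised outside Airflow for a quick sanity check. A minimal sketch; the keys read below mirror how the DAG consumes the plan and are assumptions, not a schema guarantee:

# Hypothetical ad-hoc check, run in the same module context as the DAG file
plan = get_execution_plan_from_db('2024-01-15')
if plan is None:
    print("no plan found for 2024-01-15 or any earlier ds")
else:
    print(len(plan.get("resource_tasks", [])), "resource tasks")
    print(len(plan.get("model_tasks", [])), "model tasks")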
@@ -1431,210 +1447,206 @@ with DAG(
     # Wire up the dependencies between the three phases
     prepare_group >> data_group >> summary_group

-    # Try to read information from the execution plan file - this part runs at DAG parse time
+    # Try to fetch the execution plan from the database
     try:
-        # Read the latest execution plan from the file, used to build the DAG graph
-        plan_path = os.path.join(os.path.dirname(__file__), 'last_execution_plan.json')
-        ready_path = f"{plan_path}.ready"
-
-        if os.path.exists(plan_path) and os.path.exists(ready_path):
-            try:
-                # Read the timestamp from the ready marker file
-                with open(ready_path, 'r') as f:
-                    ready_timestamp = f.read().strip()
-                    logger.info(f"Execution plan ready-marker timestamp: {ready_timestamp}")
+        # Get the execution date of the current DAG run
+        exec_date = get_today_date()  # Use today's date as the default
+        logger.info(f"Current DAG execution date ds={exec_date}; trying to fetch the execution plan from the database")
+
+        # Fetch the execution plan from the database
+        execution_plan = get_execution_plan_from_db(exec_date)
+
+        # Check whether an execution plan was actually retrieved
+        if execution_plan is None:
+            error_msg = f"Could not fetch a valid execution plan from the database, current DAG ds={exec_date}"
+            logger.error(error_msg)
+            # Raise an exception so that DAG parsing fails loudly
+            raise ValueError(error_msg)
+
+        # An execution plan was retrieved; process it
+        logger.info(f"Successfully fetched the execution plan from the database")
+
+        # Extract the details
+        exec_date = execution_plan.get("exec_date", exec_date)
+        resource_tasks = execution_plan.get("resource_tasks", [])
+        model_tasks = execution_plan.get("model_tasks", [])
+        dependencies = execution_plan.get("dependencies", {})
+
+        logger.info(f"Execution plan: exec_date={exec_date}, resource_tasks count={len(resource_tasks)}, model_tasks count={len(model_tasks)}")
+
+        # An execution plan with no tasks at all should fail as well
+        if not resource_tasks and not model_tasks:
+            error_msg = f"The execution plan contains no tasks, current DAG ds={exec_date}"
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        # Dynamically create the processing tasks
+        task_dict = {}
+
+        # 1. Create the resource-table tasks
+        for task_info in resource_tasks:
+            table_name = task_info["target_table"]
+            script_name = task_info["script_name"]
+            exec_mode = task_info.get("script_exec_mode", "append")
+
+            # Build a safe task id
+            safe_table_name = table_name.replace(".", "_").replace("-", "_")
+
+            # Make sure every task is part of data_processing_phase
+            with data_group:
+                resource_task = PythonOperator(
+                    task_id=f"resource_{safe_table_name}",
+                    python_callable=process_resource,
+                    op_kwargs={
+                        "target_table": table_name,
+                        "script_name": script_name,
+                        "script_exec_mode": exec_mode,
+                        # Pass the execution date as a string rather than a possibly non-string default format,
+                        # so that execute_with_monitoring can update the database correctly
+                        "exec_date": str(exec_date)
+                    },
+                    retries=TASK_RETRY_CONFIG["retries"],
+                    retry_delay=timedelta(minutes=TASK_RETRY_CONFIG["retry_delay_minutes"])
+                )
+
+            # Add the task to the lookup dict
+            task_dict[table_name] = resource_task
+
+            # Wire the dependency on start_processing
+            start_processing >> resource_task
+
+        # Build a directed graph to detect dependencies between model tables
+        G = nx.DiGraph()
+
+        # Add every model table as a node
+        for task_info in model_tasks:
+            table_name = task_info["target_table"]
+            G.add_node(table_name)
+
+        # Add the dependency edges between model tables
+        for source, deps in dependencies.items():
+            for dep in deps:
+                if dep.get("table_type") == "DataModel" and dep.get("table_name") in G.nodes():
+                    G.add_edge(dep.get("table_name"), source)  # Edge direction: dependency -> target
+
+        # Detect and break circular dependencies
+        try:
+            cycles = list(nx.simple_cycles(G))
+            if cycles:
+                logger.warning(f"Detected circular dependencies: {cycles}")
+                for cycle in cycles:
+                    G.remove_edge(cycle[-1], cycle[0])
+                    logger.info(f"Broke circular dependency: removed the edge {cycle[-1]} -> {cycle[0]}")
+        except Exception as e:
+            logger.error(f"Error while detecting circular dependencies: {str(e)}")
+
+        # Produce a topological sort to determine the execution order
+        execution_order = []
+        try:
+            execution_order = list(nx.topological_sort(G))
+        except Exception as e:
+            logger.error(f"Topological sort failed: {str(e)}")
+            execution_order = [task_info["target_table"] for task_info in model_tasks]
+
+        # 2. Create the model-table tasks in topological order
+        for table_name in execution_order:
+            task_info = next((t for t in model_tasks if t["target_table"] == table_name), None)
+            if not task_info:
+                continue

-                # Read the execution plan file
-                with open(plan_path, 'r') as f:
-                    execution_plan_json = f.read()
-                    execution_plan = json.loads(execution_plan_json)
-                    logger.info(f"Loaded the execution plan from file: {plan_path}")
-
-                # Extract the details
-                exec_date = execution_plan.get("exec_date", get_today_date())
-                resource_tasks = execution_plan.get("resource_tasks", [])
-                model_tasks = execution_plan.get("model_tasks", [])
-                dependencies = execution_plan.get("dependencies", {})
-
-                logger.info(f"Execution plan: exec_date={exec_date}, resource_tasks count={len(resource_tasks)}, model_tasks count={len(model_tasks)}")
-
-                # Dynamically create the processing tasks
-                task_dict = {}
-
-                # 1. Create the resource-table tasks
-                for task_info in resource_tasks:
-                    table_name = task_info["target_table"]
-                    script_name = task_info["script_name"]
-                    exec_mode = task_info.get("script_exec_mode", "append")
-
-                    # Build a safe task id
-                    safe_table_name = table_name.replace(".", "_").replace("-", "_")
-
-                    # Make sure every task is part of data_processing_phase
-                    with data_group:
-                        resource_task = PythonOperator(
-                            task_id=f"resource_{safe_table_name}",
-                            python_callable=process_resource,
-                            op_kwargs={
-                                "target_table": table_name,
-                                "script_name": script_name,
-                                "script_exec_mode": exec_mode,
-                                # Pass the execution date as a string rather than a possibly non-string default format,
-                                # so that execute_with_monitoring can update the database correctly
-                                "exec_date": str(exec_date)
-                            },
-                            retries=TASK_RETRY_CONFIG["retries"],
-                            retry_delay=timedelta(minutes=TASK_RETRY_CONFIG["retry_delay_minutes"])
-                        )
-
-                    # Add the task to the lookup dict
-                    task_dict[table_name] = resource_task
-
-                    # Wire the dependency on start_processing
-                    start_processing >> resource_task
-
-                # Build a directed graph to detect dependencies between model tables
-                G = nx.DiGraph()
-
-                # Add every model table as a node
-                for task_info in model_tasks:
-                    table_name = task_info["target_table"]
-                    G.add_node(table_name)
-
-                # Add the dependency edges between model tables
-                for source, deps in dependencies.items():
-                    for dep in deps:
-                        if dep.get("table_type") == "DataModel" and dep.get("table_name") in G.nodes():
-                            G.add_edge(dep.get("table_name"), source)  # Edge direction: dependency -> target
-
-                # Detect and break circular dependencies
-                try:
-                    cycles = list(nx.simple_cycles(G))
-                    if cycles:
-                        logger.warning(f"Detected circular dependencies: {cycles}")
-                        for cycle in cycles:
-                            G.remove_edge(cycle[-1], cycle[0])
-                            logger.info(f"Broke circular dependency: removed the edge {cycle[-1]} -> {cycle[0]}")
-                except Exception as e:
-                    logger.error(f"Error while detecting circular dependencies: {str(e)}")
-
-                # Produce a topological sort to determine the execution order
-                execution_order = []
-                try:
-                    execution_order = list(nx.topological_sort(G))
-                except Exception as e:
-                    logger.error(f"Topological sort failed: {str(e)}")
-                    execution_order = [task_info["target_table"] for task_info in model_tasks]
-
-                # 2. Create the model-table tasks in topological order
-                for table_name in execution_order:
-                    task_info = next((t for t in model_tasks if t["target_table"] == table_name), None)
-                    if not task_info:
-                        continue
-
-                    script_name = task_info["script_name"]
-                    exec_mode = task_info.get("script_exec_mode", "append")
-
-                    # Build a safe task id
-                    safe_table_name = table_name.replace(".", "_").replace("-", "_")
-
-                    # Make sure every task is part of data_processing_phase
-                    with data_group:
-                        model_task = PythonOperator(
-                            task_id=f"model_{safe_table_name}",
-                            python_callable=process_model,
-                            op_kwargs={
-                                "target_table": table_name,
-                                "script_name": script_name,
-                                "script_exec_mode": exec_mode,
-                                # Pass the execution date as a string rather than a possibly non-string default format,
-                                # so that execute_with_monitoring can update the database correctly
-                                "exec_date": str(exec_date)
-                            },
-                            retries=TASK_RETRY_CONFIG["retries"],
-                            retry_delay=timedelta(minutes=TASK_RETRY_CONFIG["retry_delay_minutes"])
-                        )
-
-                    # Add the task to the lookup dict
-                    task_dict[table_name] = model_task
-
-                    # Wire up the dependencies
-                    deps = dependencies.get(table_name, [])
-                    has_dependency = False
-
-                    # Handle dependencies between model tables
-                    for dep in deps:
-                        dep_table = dep.get("table_name")
-                        dep_type = dep.get("table_type")
-
-                        if dep_table in task_dict:
-                            task_dict[dep_table] >> model_task
-                            has_dependency = True
-                            logger.info(f"Set dependency: {dep_table} >> {table_name}")
-
-                    # Without an explicit dependency, depend on start_processing and the resource-table tasks
-                    if not has_dependency:
-                        # Connect directly from the start_processing task
-                        start_processing >> model_task
-
-                        # Also connect from all resource-table tasks
-                        resource_count = 0
-                        for resource_table in resource_tasks:
-                            if resource_count >= 5:  # Wire at most 5 dependencies
-                                break
-
-                            resource_name = resource_table["target_table"]
-                            if resource_name in task_dict:
-                                task_dict[resource_name] >> model_task
-                                resource_count += 1
-
-                # Find all terminal tasks (tasks with no downstream dependency)
-                terminal_tasks = []
-
-                # Check every model-table task
-                for table_name in execution_order:
-                    # Check whether the task has any downstream task
-                    has_downstream = False
-                    for source, deps in dependencies.items():
-                        if source == table_name:  # Skip itself
-                            continue
-                        for dep in deps:
-                            if dep.get("table_name") == table_name:
-                                has_downstream = True
-                                break
-                        if has_downstream:
-                            break
-
-                    # If there is no downstream task, add it to the terminal task list
-                    if not has_downstream and table_name in task_dict:
-                        terminal_tasks.append(table_name)
-
-                # If there are no model-table tasks, treat every resource-table task as a terminal task
-                if not model_tasks and resource_tasks:
-                    terminal_tasks = [task["target_table"] for task in resource_tasks]
-                    logger.info(f"No model-table tasks; treating all resource-table tasks as terminal tasks: {terminal_tasks}")
+            script_name = task_info["script_name"]
+            exec_mode = task_info.get("script_exec_mode", "append")
+
+            # Build a safe task id
+            safe_table_name = table_name.replace(".", "_").replace("-", "_")
+
+            # Make sure every task is part of data_processing_phase
+            with data_group:
+                model_task = PythonOperator(
+                    task_id=f"model_{safe_table_name}",
+                    python_callable=process_model,
+                    op_kwargs={
+                        "target_table": table_name,
+                        "script_name": script_name,
+                        "script_exec_mode": exec_mode,
+                        # Pass the execution date as a string rather than a possibly non-string default format,
+                        # so that execute_with_monitoring can update the database correctly
+                        "exec_date": str(exec_date)
+                    },
+                    retries=TASK_RETRY_CONFIG["retries"],
+                    retry_delay=timedelta(minutes=TASK_RETRY_CONFIG["retry_delay_minutes"])
+                )
+
+            # Add the task to the lookup dict
+            task_dict[table_name] = model_task
+
+            # Wire up the dependencies
+            deps = dependencies.get(table_name, [])
+            has_dependency = False
+
+            # Handle dependencies between model tables
+            for dep in deps:
+                dep_table = dep.get("table_name")
+                dep_type = dep.get("table_type")
+
+                if dep_table in task_dict:
+                    task_dict[dep_table] >> model_task
+                    has_dependency = True
+                    logger.info(f"Set dependency: {dep_table} >> {table_name}")
+
+            # Without an explicit dependency, depend on start_processing and the resource-table tasks
+            if not has_dependency:
+                # Connect directly from the start_processing task
+                start_processing >> model_task
+
+                # Also connect from all resource-table tasks
+                resource_count = 0
+                for resource_table in resource_tasks:
+                    if resource_count >= 5:  # Wire at most 5 dependencies
+                        break

-                # If there are neither model-table tasks nor resource-table tasks, the default dependency chain is already in place
-                if not terminal_tasks:
-                    logger.warning("No tasks found; using the default dependency chain")
-                else:
-                    # Connect every terminal task to the completion marker
-                    for table_name in terminal_tasks:
-                        if table_name in task_dict:
-                            task_dict[table_name] >> processing_completed
-                            logger.info(f"Set terminal task: {table_name} >> processing_completed")
-            except Exception as plan_e:
-                logger.error(f"Error while parsing the execution plan file: {str(plan_e)}")
-                import traceback
-                logger.error(traceback.format_exc())
+                    resource_name = resource_table["target_table"]
+                    if resource_name in task_dict:
+                        task_dict[resource_name] >> model_task
+                        resource_count += 1
+
+        # Find all terminal tasks (tasks with no downstream dependency)
+        terminal_tasks = []
+
+        # Check every model-table task
+        for table_name in execution_order:
+            # Check whether the task has any downstream task
+            has_downstream = False
+            for source, deps in dependencies.items():
+                if source == table_name:  # Skip itself
+                    continue
+                for dep in deps:
+                    if dep.get("table_name") == table_name:
+                        has_downstream = True
+                        break
+                if has_downstream:
+                    break
+
+            # If there is no downstream task, add it to the terminal task list
+            if not has_downstream and table_name in task_dict:
+                terminal_tasks.append(table_name)
+
+        # If there are no model-table tasks, treat every resource-table task as a terminal task
+        if not model_tasks and resource_tasks:
+            terminal_tasks = [task["target_table"] for task in resource_tasks]
+            logger.info(f"No model-table tasks; treating all resource-table tasks as terminal tasks: {terminal_tasks}")
+
+        # If there are neither model-table tasks nor resource-table tasks, the default dependency chain is already in place
+        if not terminal_tasks:
+            logger.warning("No tasks found; using the default dependency chain")
         else:
-            if not os.path.exists(plan_path):
-                logger.warning(f"Execution plan file does not exist: {plan_path}")
-            if not os.path.exists(ready_path):
-                logger.warning(f"Execution plan ready marker file does not exist: {ready_path}")
-            logger.warning("Falling back to the default DAG structure")
+            # Connect every terminal task to the completion marker
+            for table_name in terminal_tasks:
+                if table_name in task_dict:
+                    task_dict[table_name] >> processing_completed
+                    logger.info(f"Set terminal task: {table_name} >> processing_completed")
     except Exception as e:
-        logger.error(f"Error while loading the execution plan file: {str(e)}")
+        logger.error(f"Error while loading the execution plan: {str(e)}")
         import traceback
         logger.error(traceback.format_exc())

 logger.info(f"DAG dag_dataops_pipeline_data_scheduler definition complete")
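The parse-time graph handling above (break every detected cycle, then topologically sort) can be tried in isolation with networkx. A self-contained sketch with made-up table names:

import networkx as nx

# Toy dependency graph; edges point from a dependency to the table that needs it
G = nx.DiGraph()
G.add_edges_from([("dim_a", "fact_x"), ("fact_x", "agg_y"), ("agg_y", "dim_a")])  # contains a cycle

# Break each detected cycle by removing one edge, as the DAG code does
for cycle in list(nx.simple_cycles(G)):
    G.remove_edge(cycle[-1], cycle[0])

print(list(nx.topological_sort(G)))  # a valid build order once the graph is acyclic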