# dag_data_model_scheduler.py
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.empty import EmptyOperator
from airflow.sensors.external_task import ExternalTaskSensor
from datetime import datetime, timedelta
import pendulum
import logging
import networkx as nx
from utils import (
    get_enabled_tables,
    is_data_model_table,
    run_model_script,
    get_model_dependency_graph,
    check_script_exists,
    get_script_name_from_neo4j
)
from config import TASK_RETRY_CONFIG

# Create the module logger
logger = logging.getLogger(__name__)


def get_all_enabled_tables_for_today():
    """
    Collect every table that needs to be processed for the current date.

    Returns:
        list: table configuration dicts to process
    """
    today = pendulum.today()
    # Original logic (commented out for now; note that pendulum numbers Monday
    # as pendulum.MONDAY, not 0):
    # is_monday = today.day_of_week == pendulum.MONDAY
    # is_first_day_of_month = today.day == 1
    # is_first_day_of_year = today.month == 1 and today.day == 1
    # For testing: force all conditions to True
    is_monday = True
    is_first_day_of_month = True
    is_first_day_of_year = True
    logger.info(f"Today's date: {today.to_date_string()}")
    logger.info(
        f"Date flags: is_monday={is_monday}, "
        f"is_first_day_of_month={is_first_day_of_month}, "
        f"is_first_day_of_year={is_first_day_of_year}"
    )
    all_tables = []
    # daily tables are processed every day
    daily_tables = get_enabled_tables("daily")
    all_tables.extend(daily_tables)
    logger.info(f"Added daily tables: {len(daily_tables)}")
    # weekly tables are processed on Mondays
    if is_monday:
        weekly_tables = get_enabled_tables("weekly")
        all_tables.extend(weekly_tables)
        logger.info(f"Today is Monday; added weekly tables: {len(weekly_tables)}")
    # monthly tables are processed on the first day of the month
    if is_first_day_of_month:
        monthly_tables = get_enabled_tables("monthly")
        all_tables.extend(monthly_tables)
        logger.info(f"First day of the month; added monthly tables: {len(monthly_tables)}")
    # yearly tables are processed on the first day of the year
    if is_first_day_of_year:
        yearly_tables = get_enabled_tables("yearly")
        all_tables.extend(yearly_tables)
        logger.info(f"First day of the year; added yearly tables: {len(yearly_tables)}")
    # Deduplicate by table name
    unique_tables = {}
    for item in all_tables:
        table_name = item["table_name"]
        if table_name not in unique_tables:
            unique_tables[table_name] = item
        else:
            # On duplicates, prefer the config whose execution_mode is full_refresh
            if item["execution_mode"] == "full_refresh":
                unique_tables[table_name] = item
    result_tables = list(unique_tables.values())
    logger.info(f"After deduplication, {len(result_tables)} tables need processing")
    # Log every table that will be processed
    for idx, item in enumerate(result_tables, 1):
        logger.info(f"Table [{idx}]: {item['table_name']}, execution mode: {item['execution_mode']}")
    return result_tables
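

# A minimal sketch of the deduplication rule above; the table dicts are
# hypothetical examples, not real config entries. Given two entries for the
# same table_name, the full_refresh config wins:
#
#     tables = [
#         {"table_name": "dm_sales", "execution_mode": "append"},
#         {"table_name": "dm_sales", "execution_mode": "full_refresh"},
#     ]
#     # -> the deduplicated result keeps only the full_refresh entry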


def optimize_execution_plan(tables):
    """
    Optimize the execution plan for a set of tables.

    Args:
        tables (list): table configuration dicts

    Returns:
        tuple: (optimized table execution order, dependency graph dict)
    """
    logger.info("Optimizing the execution plan...")
    # Keep only DataModel tables
    model_tables = []
    for table in tables:
        table_name = table["table_name"]
        if is_data_model_table(table_name):
            model_tables.append(table)
    logger.info(f"Selected {len(model_tables)} DataModel tables")
    if not model_tables:
        logger.warning("No DataModel tables found; nothing to optimize")
        return [], {}
    # Table name list
    table_names = [t["table_name"] for t in model_tables]
    # Build a directed graph
    G = nx.DiGraph()
    # Add all nodes
    for table_name in table_names:
        G.add_node(table_name)
    # Fetch the dependency relationships
    dependency_dict = get_model_dependency_graph(table_names)
    logger.info(f"Fetched dependencies for {len(dependency_dict)} tables")
    # Add dependency edges
    edge_count = 0
    for target, upstreams in dependency_dict.items():
        for upstream in upstreams:
            if upstream in table_names:  # only consider tables in the current batch
                G.add_edge(upstream, target)  # edge points from upstream to downstream
                edge_count += 1
    logger.info(f"Added {edge_count} edges to the dependency graph")
    # Detect cyclic dependencies
    cycles = list(nx.simple_cycles(G))
    if cycles:
        logger.warning(f"Detected {len(cycles)} dependency cycles; attempting to break them")
        # Break cycles (simple strategy: remove the closing edge of each cycle);
        # the has_edge guard avoids removing an edge twice when overlapping
        # cycles share it
        for cycle in cycles:
            if G.has_edge(cycle[-1], cycle[0]):
                G.remove_edge(cycle[-1], cycle[0])
                logger.info(f"Broke dependency cycle: removed edge {cycle[-1]} -> {cycle[0]}")
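    # Illustration with hypothetical table names: a mutual dependency between
    # model_x and model_y surfaces from nx.simple_cycles as a node list such as
    # ['model_x', 'model_y'], in which case the closing edge
    # model_y -> model_x is the one removed above.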
    # Produce a topological ordering
    try:
        # A topological sort starts from nodes with no incoming edges (the
        # upstream nodes), so the result runs from upstream to downstream.
        execution_order = list(nx.topological_sort(G))
        logger.info(f"Generated execution order, {len(execution_order)} tables from upstream to downstream")
        # Build the result dependency dict, including tables with no dependencies
        result_dependency_dict = {name: [] for name in table_names}
        # Fill in the actual dependencies
        for target, upstreams in dependency_dict.items():
            if target in table_names:  # only consider tables in the current batch
                result_dependency_dict[target] = [u for u in upstreams if u in table_names]
        return execution_order, result_dependency_dict
    except Exception as e:
        logger.error(f"Failed to generate the execution order: {str(e)}")
        # If the topological sort fails, fall back to the raw table name list and an empty dependency graph
        return table_names, {name: [] for name in table_names}
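

# A quick sketch of what the graph logic above produces, using toy table names
# (assumed data, not real models). With one unique zero-in-degree node at each
# step, the topological order is deterministic:
#
#     deps = {"model_b": ["model_a"], "model_c": ["model_a", "model_b"]}
#     G = nx.DiGraph()
#     G.add_nodes_from(["model_a", "model_b", "model_c"])
#     G.add_edges_from((u, t) for t, ups in deps.items() for u in ups)
#     list(nx.topological_sort(G))  # -> ['model_a', 'model_b', 'model_c']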


with DAG(
    "dag_data_model_scheduler",
    start_date=datetime(2024, 1, 1),
    schedule_interval="@daily",
    catchup=False
) as dag:
    logger.info("Initializing the dag_data_model_scheduler DAG")
    # Wait for the resource-table DAG to finish
    wait_for_resource = ExternalTaskSensor(
        task_id="wait_for_resource_loading",
        external_dag_id="dag_data_resource_scheduler",
        external_task_id="resource_loading_completed",
        mode="poke",
        timeout=3600,
        poke_interval=30
    )
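    # Note: with no execution_delta or execution_date_fn, ExternalTaskSensor
    # matches the external task at the same logical date, so
    # dag_data_resource_scheduler is assumed to run on the same @daily schedule.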
    logger.info("Created the resource-table wait task - wait_for_resource_loading")
    # Completion marker task
    model_processing_completed = EmptyOperator(
        task_id="model_processing_completed",
        dag=dag
    )
    logger.info("Created the model-processing completion marker - model_processing_completed")
    try:
        # Fetch all tables that need processing today
        all_enabled_tables = get_all_enabled_tables_for_today()
        if not all_enabled_tables:
            logger.info("No tables to process today; wiring the start task directly to the end task")
            wait_for_resource >> model_processing_completed
        else:
            # Optimize the execution plan
            execution_order, dependency_dict = optimize_execution_plan(all_enabled_tables)
            if not execution_order:
                logger.info("Execution plan is empty; wiring the start task directly to the end task")
                wait_for_resource >> model_processing_completed
            else:
                # Task registry
                task_dict = {}
                # Create a processing task for every table
                for table_name in execution_order:
                    # Look up the table's config
                    table_config = next((t for t in all_enabled_tables if t["table_name"] == table_name), None)
                    if table_config:
                        logger.info(f"Creating task for table {table_name}, execution mode: {table_config['execution_mode']}")
                        # Create the task
                        task = PythonOperator(
                            task_id=f"process_{table_name}",
                            python_callable=run_model_script,
                            op_kwargs={
                                "table_name": table_name,
                                "execution_mode": table_config["execution_mode"]
                            },
                            retries=TASK_RETRY_CONFIG["retries"],
                            retry_delay=timedelta(minutes=TASK_RETRY_CONFIG["retry_delay_minutes"]),
                            dag=dag
                        )
                        # Register the task
                        task_dict[table_name] = task
                # Wire up inter-task dependencies
                for table_name, task in task_dict.items():
                    # Upstream dependencies
                    upstream_tables = dependency_dict.get(table_name, [])
                    if not upstream_tables:
                        # No upstream dependencies: connect directly to the resource wait task
                        logger.info(f"Table {table_name} has no upstream dependencies; connecting it to the resource wait task")
                        wait_for_resource >> task
                    else:
                        # Wire the dependencies on upstream tables
                        for upstream_table in upstream_tables:
                            if upstream_table in task_dict:
                                logger.info(f"Setting dependency: {upstream_table} >> {table_name}")
                                task_dict[upstream_table] >> task
                    # Check whether this is a terminal node (no downstream nodes)
                    is_terminal = True
                    for target, upstreams in dependency_dict.items():
                        if table_name in upstreams:
                            is_terminal = False
                            break
                    # Terminal nodes connect to the completion marker
                    if is_terminal:
                        logger.info(f"Table {table_name} is a terminal node; connecting it to the completion marker")
                        task >> model_processing_completed
                # Edge case: check whether any task connects to the completion marker
                has_connection_to_completed = False
                for task in task_dict.values():
                    for downstream in task.downstream_list:
                        if downstream.task_id == model_processing_completed.task_id:
                            has_connection_to_completed = True
                            break
                    if has_connection_to_completed:  # break out of the outer loop too
                        break
                # If nothing connects to the completion marker, connect every task to it
                if not has_connection_to_completed and task_dict:
                    logger.info("No task connects to the completion marker; connecting all tasks to it")
                    for task in task_dict.values():
                        task >> model_processing_completed
        # Edge case: if the resource wait task has no downstream tasks, connect it to the completion marker
        if not wait_for_resource.downstream_list:
            logger.info("The resource wait task has no downstream tasks; connecting it to the completion marker")
            wait_for_resource >> model_processing_completed
    except Exception as e:
        logger.error(f"Error while building the DAG: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        # Ensure the DAG still has a complete execution flow on error
        wait_for_resource >> model_processing_completed
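

# Local debugging sketch (assumption: Airflow 2.5+, where dag.test() runs the
# DAG in-process without a scheduler; an initialized metadata DB is required).
if __name__ == "__main__":
    dag.test()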