
Extract common functions; update the scripts path

wangxq 1 week ago
parent
commit
5761322b50

+ 0 - 445
dags/common.py

@@ -1,445 +0,0 @@
-# common.py
-import psycopg2
-from neo4j import GraphDatabase
-import logging
-import importlib.util
-from pathlib import Path
-import networkx as nx
-import os
-from datetime import datetime, timedelta
-from config import PG_CONFIG, NEO4J_CONFIG, SCRIPTS_BASE_PATH
-import functools
-import time
-
-# 创建统一的日志记录器
-logger = logging.getLogger("airflow.task")
-
-def get_pg_conn():
-    """获取PostgreSQL连接"""
-    return psycopg2.connect(**PG_CONFIG)
-
-def get_neo4j_driver():
-    """获取Neo4j连接驱动"""
-    uri = NEO4J_CONFIG['uri']
-    auth = (NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
-    return GraphDatabase.driver(uri, auth=auth)
-
-def update_task_start_time(exec_date, target_table, script_name, start_time):
-    """更新任务开始时间"""
-    logger.info(f"===== 更新任务开始时间 =====")
-    logger.info(f"参数: exec_date={exec_date} ({type(exec_date).__name__}), target_table={target_table}, script_name={script_name}")
-    
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    try:
-        # 首先检查记录是否存在
-        cursor.execute("""
-            SELECT COUNT(*) 
-            FROM airflow_dag_schedule 
-            WHERE exec_date = %s AND target_table = %s AND script_name = %s
-        """, (exec_date, target_table, script_name))
-        count = cursor.fetchone()[0]
-        logger.info(f"查询到符合条件的记录数: {count}")
-        
-        if count == 0:
-            logger.warning(f"未找到匹配的记录: exec_date={exec_date}, target_table={target_table}, script_name={script_name}")
-            logger.info("尝试记录在airflow_dag_schedule表中找到的记录:")
-            cursor.execute("""
-                SELECT exec_date, target_table, script_name
-                FROM airflow_dag_schedule
-                LIMIT 5
-            """)
-            sample_records = cursor.fetchall()
-            for record in sample_records:
-                logger.info(f"样本记录: exec_date={record[0]} ({type(record[0]).__name__}), target_table={record[1]}, script_name={record[2]}")
-        
-        # 执行更新
-        sql = """
-            UPDATE airflow_dag_schedule 
-            SET exec_start_time = %s
-            WHERE exec_date = %s AND target_table = %s AND script_name = %s
-        """
-        logger.info(f"执行SQL: {sql}")
-        logger.info(f"参数: start_time={start_time}, exec_date={exec_date}, target_table={target_table}, script_name={script_name}")
-        
-        cursor.execute(sql, (start_time, exec_date, target_table, script_name))
-        affected_rows = cursor.rowcount
-        logger.info(f"更新影响的行数: {affected_rows}")
-        
-        conn.commit()
-        logger.info("事务已提交")
-    except Exception as e:
-        logger.error(f"更新任务开始时间失败: {str(e)}")
-        import traceback
-        logger.error(f"错误堆栈: {traceback.format_exc()}")
-        conn.rollback()
-        logger.info("事务已回滚")
-        raise
-    finally:
-        cursor.close()
-        conn.close()
-        logger.info("数据库连接已关闭")
-        logger.info("===== 更新任务开始时间完成 =====")
-
-def update_task_completion(exec_date, target_table, script_name, success, end_time, duration):
-    """更新任务完成信息"""
-    logger.info(f"===== 更新任务完成信息 =====")
-    logger.info(f"参数: exec_date={exec_date} ({type(exec_date).__name__}), target_table={target_table}, script_name={script_name}")
-    logger.info(f"参数: success={success} ({type(success).__name__}), end_time={end_time}, duration={duration}")
-    
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    try:
-        # 首先检查记录是否存在
-        cursor.execute("""
-            SELECT COUNT(*) 
-            FROM airflow_dag_schedule 
-            WHERE exec_date = %s AND target_table = %s AND script_name = %s
-        """, (exec_date, target_table, script_name))
-        count = cursor.fetchone()[0]
-        logger.info(f"查询到符合条件的记录数: {count}")
-        
-        if count == 0:
-            logger.warning(f"未找到匹配的记录: exec_date={exec_date}, target_table={target_table}, script_name={script_name}")
-            # 查询表中前几条记录作为参考
-            cursor.execute("""
-                SELECT exec_date, target_table, script_name
-                FROM airflow_dag_schedule
-                LIMIT 5
-            """)
-            sample_records = cursor.fetchall()
-            logger.info("airflow_dag_schedule表中的样本记录:")
-            for record in sample_records:
-                logger.info(f"样本记录: exec_date={record[0]} ({type(record[0]).__name__}), target_table={record[1]}, script_name={record[2]}")
-        
-        # 确保success是布尔类型
-        if not isinstance(success, bool):
-            original_success = success
-            success = bool(success)
-            logger.warning(f"success参数不是布尔类型,原始值: {original_success},转换为: {success}")
-        
-        # 执行更新
-        sql = """
-            UPDATE airflow_dag_schedule 
-            SET exec_result = %s, exec_end_time = %s, exec_duration = %s
-            WHERE exec_date = %s AND target_table = %s AND script_name = %s
-        """
-        logger.info(f"执行SQL: {sql}")
-        logger.info(f"参数: success={success}, end_time={end_time}, duration={duration}, exec_date={exec_date}, target_table={target_table}, script_name={script_name}")
-        
-        cursor.execute(sql, (success, end_time, duration, exec_date, target_table, script_name))
-        affected_rows = cursor.rowcount
-        logger.info(f"更新影响的行数: {affected_rows}")
-        
-        if affected_rows == 0:
-            logger.warning("更新操作没有影响任何行,可能是因为条件不匹配")
-            # 尝试用不同格式的exec_date查询
-            if isinstance(exec_date, str):
-                try:
-                    # 尝试解析日期字符串
-                    from datetime import datetime
-                    parsed_date = datetime.strptime(exec_date, "%Y-%m-%d").date()
-                    logger.info(f"尝试使用解析后的日期格式: {parsed_date}")
-                    
-                    cursor.execute("""
-                        SELECT COUNT(*) 
-                        FROM airflow_dag_schedule 
-                        WHERE exec_date = %s AND target_table = %s AND script_name = %s
-                    """, (parsed_date, target_table, script_name))
-                    parsed_count = cursor.fetchone()[0]
-                    logger.info(f"使用解析日期后查询到的记录数: {parsed_count}")
-                    
-                    if parsed_count > 0:
-                        # 尝试用解析的日期更新
-                        cursor.execute("""
-                            UPDATE airflow_dag_schedule 
-                            SET exec_result = %s, exec_end_time = %s, exec_duration = %s
-                            WHERE exec_date = %s AND target_table = %s AND script_name = %s
-                        """, (success, end_time, duration, parsed_date, target_table, script_name))
-                        new_affected_rows = cursor.rowcount
-                        logger.info(f"使用解析日期后更新影响的行数: {new_affected_rows}")
-                except Exception as parse_e:
-                    logger.error(f"尝试解析日期格式时出错: {str(parse_e)}")
-        
-        conn.commit()
-        logger.info("事务已提交")
-    except Exception as e:
-        logger.error(f"更新任务完成信息失败: {str(e)}")
-        import traceback
-        logger.error(f"错误堆栈: {traceback.format_exc()}")
-        conn.rollback()
-        logger.info("事务已回滚")
-        raise
-    finally:
-        cursor.close()
-        conn.close()
-        logger.info("数据库连接已关闭")
-        logger.info("===== 更新任务完成信息完成 =====")
-
-def execute_with_monitoring(target_table, script_name, script_exec_mode, exec_date, **kwargs):
-    """执行脚本并监控执行情况"""
-
-    # 添加详细日志
-    logger.info(f"===== 开始监控执行 =====")
-    logger.info(f"target_table: {target_table}, 类型: {type(target_table)}")
-    logger.info(f"script_name: {script_name}, 类型: {type(script_name)}")
-    logger.info(f"script_exec_mode: {script_exec_mode}, 类型: {type(script_exec_mode)}")
-    logger.info(f"exec_date: {exec_date}, 类型: {type(exec_date)}")
-
-    # 检查script_name是否为空
-    if not script_name:
-        logger.error(f"表 {target_table} 的script_name为空,无法执行")
-        # 记录执行失败
-        now = datetime.now()
-        update_task_completion(exec_date, target_table, script_name or "", False, now, 0)
-        return False
-    # 记录执行开始时间
-    start_time = datetime.now()
-    
-    # 尝试更新开始时间并记录结果
-    try:
-        update_task_start_time(exec_date, target_table, script_name, start_time)
-        logger.info(f"成功更新任务开始时间: {start_time}")
-    except Exception as e:
-        logger.error(f"更新任务开始时间失败: {str(e)}")
-    
-    try:
-        # 执行实际脚本
-        logger.info(f"开始执行脚本: {script_name}")
-        result = execute_script(script_name, target_table, script_exec_mode)
-        logger.info(f"脚本执行完成,原始返回值: {result}, 类型: {type(result)}")
-        
-        # 确保result是布尔值
-        if result is None:
-            logger.warning(f"脚本返回值为None,转换为False")
-            result = False
-        elif not isinstance(result, bool):
-            original_result = result
-            result = bool(result)
-            logger.warning(f"脚本返回非布尔值 {original_result},转换为布尔值: {result}")
-        
-        # 记录结束时间和结果
-        end_time = datetime.now()
-        duration = (end_time - start_time).total_seconds()
-        
-        # 尝试更新完成状态并记录结果
-        try:
-            logger.info(f"尝试更新完成状态: result={result}, end_time={end_time}, duration={duration}")
-            update_task_completion(exec_date, target_table, script_name, result, end_time, duration)
-            logger.info(f"成功更新任务完成状态,结果: {result}")
-        except Exception as e:
-            logger.error(f"更新任务完成状态失败: {str(e)}")
-        
-        logger.info(f"===== 监控执行完成 =====")
-        return result
-    except Exception as e:
-        # 处理异常
-        logger.error(f"执行任务出错: {str(e)}")
-        end_time = datetime.now()
-        duration = (end_time - start_time).total_seconds()
-        
-        # 尝试更新失败状态并记录结果
-        try:
-            logger.info(f"尝试更新失败状态: end_time={end_time}, duration={duration}")
-            update_task_completion(exec_date, target_table, script_name, False, end_time, duration)
-            logger.info(f"成功更新任务失败状态")
-        except Exception as update_e:
-            logger.error(f"更新任务失败状态失败: {str(update_e)}")
-        
-        logger.info(f"===== 监控执行异常结束 =====")
-        raise e
-
-def ensure_boolean_result(func):
-    """装饰器:确保函数返回布尔值"""
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        try:
-            result = func(*args, **kwargs)
-            logger.debug(f"脚本原始返回值: {result} (类型: {type(result).__name__})")
-            
-            # 处理None值
-            if result is None:
-                logger.warning(f"脚本函数 {func.__name__} 返回了None,默认设置为False")
-                return False
-                
-            # 处理非布尔值
-            if not isinstance(result, bool):
-                try:
-                    # 尝试转换为布尔值
-                    bool_result = bool(result)
-                    logger.warning(f"脚本函数 {func.__name__} 返回非布尔值 {result},已转换为布尔值 {bool_result}")
-                    return bool_result
-                except Exception as e:
-                    logger.error(f"无法将脚本返回值 {result} 转换为布尔值: {str(e)}")
-                    return False
-            
-            return result
-        except Exception as e:
-            logger.error(f"脚本函数 {func.__name__} 执行出错: {str(e)}")
-            return False
-    return wrapper
-
-def execute_script(script_path=None, script_name=None, script_exec_mode=None, table_name=None, execution_mode=None, args=None):
-    """
-    执行指定的脚本,并返回执行结果
-    支持两种调用方式:
-    1. execute_script(script_path, script_name, script_exec_mode, args={})
-    2. execute_script(script_name, table_name, execution_mode)
-    """
-    # 确定调用方式并统一参数
-    if script_path and script_name and script_exec_mode is not None:
-        # 第一种调用方式
-        if args is None:
-            args = {}
-    elif script_name and table_name and execution_mode is not None:
-        # 第二种调用方式
-        script_path = os.path.join(SCRIPTS_BASE_PATH, f"{script_name}.py")
-        script_exec_mode = execution_mode
-        args = {"table_name": table_name}
-    else:
-        logger.error("参数不正确,无法执行脚本")
-        return False
-
-    try:
-        # 确保脚本路径存在
-        if not os.path.exists(script_path):
-            logger.error(f"脚本路径 {script_path} 不存在")
-            return False
-
-        # 加载脚本模块
-        spec = importlib.util.spec_from_file_location("script_module", script_path)
-        module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(module)
-        
-        # 检查并记录所有可用的函数
-        module_functions = [f for f in dir(module) if callable(getattr(module, f)) and not f.startswith('_')]
-        logger.debug(f"模块 {script_name} 中的可用函数: {module_functions}")
-
-        # 获取脚本的运行函数
-        if not hasattr(module, "run"):
-            logger.error(f"脚本 {script_name} 没有run函数")
-            return False
-
-        # 装饰run函数,确保返回布尔值
-        original_run = module.run
-        module.run = ensure_boolean_result(original_run)
-        
-        logger.info(f"开始执行脚本 {script_name},执行模式: {script_exec_mode}, 参数: {args}")
-        start_time = time.time()
-        
-        # 执行脚本
-        if table_name is not None:
-            # 第二种调用方式的参数格式
-            exec_result = module.run(table_name=table_name, execution_mode=script_exec_mode)
-        else:
-            # 第一种调用方式的参数格式
-            exec_result = module.run(script_exec_mode, args)
-        
-        end_time = time.time()
-        duration = end_time - start_time
-        
-        logger.info(f"脚本 {script_name} 执行完成,结果: {exec_result}, 耗时: {duration:.2f}秒")
-        return exec_result
-    except Exception as e:
-        logger.error(f"执行脚本 {script_name} 时出错: {str(e)}")
-        import traceback
-        logger.error(traceback.format_exc())
-        return False
-
-def generate_optimized_execution_order(table_names, dependency_dict):
-    """
-    生成优化的执行顺序,处理循环依赖
-    
-    参数:
-        table_names: 表名列表
-        dependency_dict: 依赖关系字典 {表名: [依赖表1, 依赖表2, ...]}
-    
-    返回:
-        list: 优化后的执行顺序列表
-    """
-    # 创建有向图
-    G = nx.DiGraph()
-    
-    # 添加所有节点
-    for table_name in table_names:
-        G.add_node(table_name)
-    
-    # 添加依赖边
-    for target, sources in dependency_dict.items():
-        for source in sources:
-            if source in table_names:  # 确保只考虑目标表集合中的表
-                # 从依赖指向目标,表示依赖需要先执行
-                G.add_edge(source, target)
-    
-    # 检测循环依赖
-    cycles = list(nx.simple_cycles(G))
-    if cycles:
-        logger.warning(f"检测到循环依赖,将尝试打破循环: {cycles}")
-        # 打破循环依赖(简单策略:移除每个循环中的一条边)
-        for cycle in cycles:
-            # 移除循环中的最后一条边
-            G.remove_edge(cycle[-1], cycle[0])
-            logger.info(f"打破循环依赖: 移除 {cycle[-1]} -> {cycle[0]} 的依赖")
-    
-    # 生成拓扑排序
-    try:
-        execution_order = list(nx.topological_sort(G))
-        return execution_order
-    except Exception as e:
-        logger.error(f"生成执行顺序失败: {str(e)}")
-        # 返回原始列表作为备选
-        return table_names
-
-def get_datamodel_dependency_from_neo4j(table_names):
-    """
-    从Neo4j获取DataModel表间的依赖关系
-    
-    参数:
-        table_names: 表名列表
-    
-    返回:
-        dict: 依赖关系字典 {目标表: [依赖表1, 依赖表2, ...]}
-    """
-    logger.info(f"开始获取 {len(table_names)} 个表的依赖关系")
-    
-    # 创建Neo4j连接
-    driver = get_neo4j_driver()
-    dependency_dict = {name: [] for name in table_names}
-    
-    try:
-        with driver.session() as session:
-            # 使用一次性查询获取所有表之间的依赖关系
-            query = """
-                MATCH (source:DataModel)-[:DERIVED_FROM]->(target:DataModel)
-                WHERE source.en_name IN $table_names AND target.en_name IN $table_names
-                RETURN source.en_name AS source, target.en_name AS target
-            """
-            result = session.run(query, table_names=table_names)
-            
-            # 处理结果
-            for record in result:
-                source = record.get("source")
-                target = record.get("target")
-                
-                if source and target:
-                    # 目标依赖于源
-                    if source in dependency_dict:
-                        dependency_dict[source].append(target)
-                        logger.debug(f"依赖关系: {source} 依赖于 {target}")
-    except Exception as e:
-        logger.error(f"从Neo4j获取依赖关系时出错: {str(e)}")
-    finally:
-        driver.close()
-    
-    # 记录依赖关系
-    for table, deps in dependency_dict.items():
-        if deps:
-            logger.info(f"表 {table} 依赖于: {deps}")
-        else:
-            logger.info(f"表 {table} 没有依赖")
-    
-    return dependency_dict
-
-def get_today_date():
-    """获取今天的日期,返回YYYY-MM-DD格式字符串"""
-    return datetime.now().strftime("%Y-%m-%d")

+ 1 - 1
dags/config.py

@@ -32,7 +32,7 @@ TASK_RETRY_CONFIG = {
 # 脚本文件基础路径配置
 # 部署到 Airflow 环境时使用此路径
 AIRFLOW_BASE_PATH='/opt/airflow'
-SCRIPTS_BASE_PATH = "/opt/airflow/dataops/scripts"
+SCRIPTS_BASE_PATH = "/opt/airflow/dataops_scripts"
 
 # 上传的CSV/EXCEL文件的基准上传路径
 STRUCTURE_UPLOAD_BASE_PATH ="/data/csv"
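
A quick way to sanity-check the new path constant; the script file name is hypothetical, and the resolution mirrors the os.path.join pattern used by the removed common.execute_script.

import os
from config import SCRIPTS_BASE_PATH

script_name = "example_table_process.py"  # hypothetical script file
script_path = os.path.join(SCRIPTS_BASE_PATH, script_name)
# After this change the file is expected under /opt/airflow/dataops_scripts/
# rather than /opt/airflow/dataops/scripts/.
print(script_path, os.path.exists(script_path))
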

+ 0 - 1359
dags/dag_dataops_pipeline_data_scheduler.py

@@ -1,1359 +0,0 @@
-"""
-统一数据运维调度器 DAG
-
-功能:
-1. 将数据处理与统计汇总整合到一个DAG中
-2. 保留原有的每个处理脚本单独运行的特性,方便通过Web UI查看
-3. 支持执行计划文件的动态解析和执行
-4. 执行完成后自动生成汇总报告
-"""
-from airflow import DAG
-from airflow.operators.python import PythonOperator, ShortCircuitOperator
-from airflow.operators.empty import EmptyOperator
-from airflow.utils.task_group import TaskGroup
-from datetime import datetime, timedelta, date
-import logging
-import networkx as nx
-import json
-import os
-import pendulum
-from decimal import Decimal
-from common import (
-    get_pg_conn, 
-    get_neo4j_driver,
-    get_today_date
-)
-from config import TASK_RETRY_CONFIG, SCRIPTS_BASE_PATH, PG_CONFIG, NEO4J_CONFIG
-import pytz
-
-# 创建日志记录器
-logger = logging.getLogger(__name__)
-
-# 开启详细诊断日志记录
-ENABLE_DEBUG_LOGGING = True
-
-def log_debug(message):
-    """记录调试日志,但只在启用调试模式时"""
-    if ENABLE_DEBUG_LOGGING:
-        logger.info(f"[DEBUG] {message}")
-
-# 在DAG启动时输出诊断信息
-log_debug("======== 诊断信息 ========")
-log_debug(f"当前工作目录: {os.getcwd()}")
-log_debug(f"SCRIPTS_BASE_PATH: {SCRIPTS_BASE_PATH}")
-log_debug(f"导入的common模块路径: {get_pg_conn.__module__}")
-
-# 检查数据库连接
-def validate_database_connection():
-    """验证数据库连接是否正常"""
-    try:
-        conn = get_pg_conn()
-        cursor = conn.cursor()
-        cursor.execute("SELECT version()")
-        version = cursor.fetchone()
-        log_debug(f"数据库连接正常,PostgreSQL版本: {version[0]}")
-        
-        # 检查airflow_exec_plans表是否存在
-        cursor.execute("""
-            SELECT EXISTS (
-               SELECT FROM information_schema.tables 
-               WHERE table_name = 'airflow_exec_plans'
-            )
-        """)
-        table_exists = cursor.fetchone()[0]
-        if table_exists:
-            # 检查表结构
-            cursor.execute("""
-                SELECT column_name, data_type 
-                FROM information_schema.columns 
-                WHERE table_name = 'airflow_exec_plans'
-            """)
-            columns = cursor.fetchall()
-            log_debug(f"airflow_exec_plans表存在,列信息:")
-            for col in columns:
-                log_debug(f"  - {col[0]}: {col[1]}")
-            
-            # 查询最新记录数量
-            cursor.execute("SELECT COUNT(*) FROM airflow_exec_plans")
-            count = cursor.fetchone()[0]
-            log_debug(f"airflow_exec_plans表中有 {count} 条记录")
-            
-            # 检查最近的执行记录
-            cursor.execute("""
-                SELECT exec_date, COUNT(*) as record_count
-                FROM airflow_exec_plans
-                GROUP BY exec_date
-                ORDER BY exec_date DESC
-                LIMIT 3
-            """)
-            recent_dates = cursor.fetchall()
-            log_debug(f"最近的执行日期及记录数:")
-            for date_info in recent_dates:
-                log_debug(f"  - {date_info[0]}: {date_info[1]} 条记录")
-        else:
-            log_debug("airflow_exec_plans表不存在!")
-        
-        cursor.close()
-        conn.close()
-        return True
-    except Exception as e:
-        log_debug(f"数据库连接验证失败: {str(e)}")
-        import traceback
-        log_debug(f"错误堆栈: {traceback.format_exc()}")
-        return False
-
-# 执行数据库连接验证
-try:
-    validate_database_connection()
-except Exception as e:
-    log_debug(f"验证数据库连接时出错: {str(e)}")
-
-log_debug("======== 诊断信息结束 ========")
-
-#############################################
-# 通用工具函数
-#############################################
-
-def json_serial(obj):
-    """将日期对象序列化为ISO格式字符串的JSON序列化器"""
-    if isinstance(obj, (datetime, date)):
-        return obj.isoformat()
-    raise TypeError(f"类型 {type(obj)} 不能被序列化为JSON")
-
-# 添加自定义JSON编码器解决Decimal序列化问题
-class DecimalEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, Decimal):
-            return float(obj)
-        # 处理日期类型
-        elif isinstance(obj, (datetime, date)):
-            return obj.isoformat()
-        # 让父类处理其他类型
-        return super(DecimalEncoder, self).default(obj)
-
-#############################################
-# 新的工具函数
-#############################################
-
-def execute_python_script(target_table, script_name, script_exec_mode, exec_date, **kwargs):
-    """
-    执行Python脚本并返回执行结果
-    
-    参数:
-        target_table: 目标表名
-        script_name: 脚本名称 
-        script_exec_mode: 脚本执行模式
-        exec_date: 执行日期
-        source_tables: (可选) 源表列表
-        
-    返回:
-        bool: 脚本执行结果
-    """
-    # 添加详细日志
-    logger.info(f"===== 开始执行脚本 =====")
-    logger.info(f"target_table: {target_table}, 类型: {type(target_table)}")
-    logger.info(f"script_name: {script_name}, 类型: {type(script_name)}")
-    logger.info(f"script_exec_mode: {script_exec_mode}, 类型: {type(script_exec_mode)}")
-    logger.info(f"exec_date: {exec_date}, 类型: {type(exec_date)}")
-
-    # 记录额外参数
-    for key, value in kwargs.items():
-        logger.info(f"额外参数 - {key}: {value}, 类型: {type(value)}")
-
-    # 检查script_name是否为空
-    if not script_name:
-        logger.error(f"表 {target_table} 的script_name为空,无法执行")
-        return False
-        
-    # 记录执行开始时间
-    start_time = datetime.now()
-    
-    try:
-        # 导入和执行脚本模块
-        import importlib.util
-        import sys
-        # SCRIPTS_BASE_PATH = "/opt/airflow/dataops/scripts"
-        script_path = os.path.join(SCRIPTS_BASE_PATH, script_name)
-        
-        if not os.path.exists(script_path):
-            logger.error(f"脚本文件不存在: {script_path}")
-            return False
-            
-        # 动态导入模块
-        spec = importlib.util.spec_from_file_location("dynamic_module", script_path)
-        module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(module)
-
-
-        
-        # 检查并调用标准入口函数run
-        if hasattr(module, "run"):
-            logger.info(f"调用脚本 {script_name} 的标准入口函数 run()")
-            # 构建完整的参数字典
-            run_params = {
-                "table_name": target_table,
-                "execution_mode": script_exec_mode,
-                "exec_date": exec_date
-            }
-
-            ## 添加可能的额外参数
-            for key in ['target_type', 'storage_location', 'frequency', 'source_tables']:
-                if key in kwargs and kwargs[key] is not None:
-                    run_params[key] = kwargs[key] 
-
-            # 调用脚本的run函数
-            logger.info(f"调用run函数并传递参数: {run_params}")
-            result = module.run(**run_params)
-            logger.info(f"脚本执行完成,原始返回值: {result}, 类型: {type(result)}")
-            
-            # 确保result是布尔值
-            if result is None:
-                logger.warning(f"脚本返回值为None,转换为False")
-                result = False
-            elif not isinstance(result, bool):
-                original_result = result
-                result = bool(result)
-                logger.warning(f"脚本返回非布尔值 {original_result},转换为布尔值: {result}")
-            
-            # 记录结束时间和结果
-            end_time = datetime.now()
-            duration = (end_time - start_time).total_seconds()
-            logger.info(f"脚本 {script_name} 执行完成,结果: {result}, 耗时: {duration:.2f}秒")
-            
-            return result
-        else:
-            logger.error(f"脚本 {script_name} 中未定义标准入口函数 run(),无法执行")
-            return False
-    except Exception as e:
-        # 处理异常
-        logger.error(f"执行任务出错: {str(e)}")
-        end_time = datetime.now()
-        duration = (end_time - start_time).total_seconds()
-        logger.error(f"脚本 {script_name} 执行失败,耗时: {duration:.2f}秒")
-        logger.info(f"===== 脚本执行异常结束 =====")
-        import traceback
-        logger.error(traceback.format_exc())
-        
-        # 确保不会阻塞DAG
-        return False
-
-#############################################
-# 第一阶段: 准备阶段(Prepare Phase)的函数
-#############################################
-
-def get_enabled_tables():
-    """获取所有启用的表"""
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    try:
-        cursor.execute("""
-            SELECT owner_id, table_name 
-            FROM schedule_status 
-            WHERE schedule_is_enabled = TRUE
-        """)
-        result = cursor.fetchall()
-        return [row[1] for row in result]  # 只返回表名
-    except Exception as e:
-        logger.error(f"获取启用表失败: {str(e)}")
-        return []
-    finally:
-        cursor.close()
-        conn.close()
-
-def check_table_directly_subscribed(table_name):
-    """检查表是否在schedule_status表中直接订阅"""
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    try:
-        cursor.execute("""
-            SELECT schedule_is_enabled
-            FROM schedule_status 
-            WHERE table_name = %s
-        """, (table_name,))
-        result = cursor.fetchone()
-        return result and result[0] is True
-    except Exception as e:
-        logger.error(f"检查表订阅状态失败: {str(e)}")
-        return False
-    finally:
-        cursor.close()
-        conn.close()
-
-def get_table_info_from_neo4j(table_name):
-    """从Neo4j获取表的详细信息"""
-    driver = get_neo4j_driver()
-     # 检查表是否直接订阅
-    is_directly_schedule = check_table_directly_subscribed(table_name)
-
-    table_info = {
-        'target_table': table_name,
-        'is_directly_schedule': is_directly_schedule,  # 初始值设为True,从schedule_status表获取
-    }
-    
-    try:
-        with driver.session() as session:
-            # 查询表标签和状态
-            query_table = """
-                MATCH (t {en_name: $table_name})
-                RETURN labels(t) AS labels, t.status AS status, t.frequency AS frequency,
-                       t.type AS type, t.storage_location AS storage_location
-            """
-            result = session.run(query_table, table_name=table_name)
-            record = result.single()
-            
-            if record:
-                labels = record.get("labels", [])
-                table_info['target_table_label'] = [label for label in labels if label in ["DataResource", "DataModel", "DataSource"]][0] if labels else None
-                table_info['target_table_status'] = record.get("status", True)  # 默认为True
-                table_info['default_update_frequency'] = record.get("frequency")
-                table_info['frequency'] = record.get("frequency")
-                table_info['target_type'] = record.get("type")  # 获取type属性
-                table_info['storage_location'] = record.get("storage_location")  # 获取storage_location属性
-                
-                # 根据标签类型查询关系和脚本信息
-                if "DataResource" in labels:
-                    # 检查是否为structure类型
-                    if table_info.get('target_type') == "structure":
-                        # 对于structure类型,设置默认值,不查询关系
-                        table_info['source_tables'] = []  # 使用空数组表示无源表
-                        table_info['script_name'] = "load_file.py"
-                        table_info['script_type'] = "python"
-                        
-                        # csv类型的DataResource没有上游,使用默认的append模式
-                        table_info['script_exec_mode'] = "append"
-                        logger.info(f"表 {table_name} 为structure类型,使用默认执行模式: append")
-
-                        return table_info
-                    else:
-                        query_rel = """
-                            MATCH (target {en_name: $table_name})-[rel:ORIGINATES_FROM]->(source)
-                            WITH source, rel, 
-                                 CASE WHEN rel.script_name IS NULL THEN target.en_name + '_script.py' ELSE rel.script_name END AS script_name,
-                                 CASE WHEN rel.script_type IS NULL THEN 'python' ELSE rel.script_type END AS script_type
-                            RETURN source.en_name AS source_table, script_name AS script_name,
-                                   script_type AS script_type, 'append' AS script_exec_mode
-                        """
-                elif "DataModel" in labels:
-                    query_rel = """
-                        MATCH (target {en_name: $table_name})-[rel:DERIVED_FROM]->(source)
-                        WITH source, rel, 
-                             CASE WHEN rel.script_name IS NULL THEN target.en_name + '_script.py' ELSE rel.script_name END AS script_name,
-                             CASE WHEN rel.script_type IS NULL THEN 'python' ELSE rel.script_type END AS script_type
-                        RETURN source.en_name AS source_table, script_name AS script_name,
-                               script_type AS script_type, 'append' AS script_exec_mode
-                    """
-                else:
-                    logger.warning(f"表 {table_name} 不是DataResource或DataModel类型")
-                    return table_info
-                
-                # 收集所有关系记录
-                result = session.run(query_rel, table_name=table_name)
-                # 检查result对象是否有collect方法,否则使用data方法或list直接转换
-                try:
-                    if hasattr(result, 'collect'):
-                        records = result.collect()  # 使用collect()获取所有记录
-                    else:
-                        # 尝试使用其他方法获取记录
-                        logger.info(f"表 {table_name} 的查询结果不支持collect方法,尝试使用其他方法")
-                        try:
-                            records = list(result)  # 直接转换为列表
-                        except Exception as e1:
-                            logger.warning(f"尝试列表转换失败: {str(e1)},尝试使用data方法")
-                            try:
-                                records = result.data()  # 使用data()方法
-                            except Exception as e2:
-                                logger.warning(f"所有方法都失败,使用空列表: {str(e2)}")
-                                records = []
-                except Exception as e:
-                    logger.warning(f"获取查询结果时出错: {str(e)},使用空列表")
-                    records = []
-                
-                # 记录查询到的原始记录
-                logger.info(f"表 {table_name} 查询到 {len(records)} 条关系记录")
-                for idx, rec in enumerate(records):
-                    logger.info(f"关系记录[{idx}]: source_table={rec.get('source_table')}, script_name={rec.get('script_name')}, " 
-                                f"script_type={rec.get('script_type')}, script_exec_mode={rec.get('script_exec_mode')}")
-                
-                if records:
-                    # 按脚本名称分组源表
-                    scripts_info = {}
-                    for record in records:
-                        script_name = record.get("script_name")
-                        source_table = record.get("source_table")
-                        script_type = record.get("script_type", "python")
-                        script_exec_mode = record.get("script_exec_mode", "append")
-                        
-                        logger.info(f"处理记录: source_table={source_table}, script_name={script_name}")
-
-                        if not script_name:
-                            script_name = f"{table_name}_process.py"
-                            logger.warning(f"表 {table_name} 的关系中没有script_name属性,使用默认值: {script_name}")
-                            
-                        if script_name not in scripts_info:
-                            scripts_info[script_name] = {
-                                "sources": [],
-                                "script_type": script_type,
-                                "script_exec_mode": script_exec_mode
-                            }
-                        
-                        # 确保source_table有值且不为None才添加到sources列表中
-                        if source_table and source_table not in scripts_info[script_name]["sources"]:
-                            scripts_info[script_name]["sources"].append(source_table)
-                            logger.debug(f"为表 {table_name} 的脚本 {script_name} 添加源表: {source_table}")
-                    
-                    # 处理分组信息
-                    if scripts_info:
-                        # 存储完整的脚本信息
-                        table_info['scripts_info'] = scripts_info
-                        
-                        # 如果只有一个脚本,直接使用它
-                        if len(scripts_info) == 1:
-                            script_name = list(scripts_info.keys())[0]
-                            script_info = scripts_info[script_name]
-                            
-                            table_info['source_tables'] = script_info["sources"]  # 使用数组
-                            table_info['script_name'] = script_name
-                            table_info['script_type'] = script_info["script_type"]
-                            table_info['script_exec_mode'] = script_info["script_exec_mode"]
-                            logger.info(f"表 {table_name} 有单个脚本 {script_name},源表: {script_info['sources']}")
-                        else:
-                            # 如果有多个不同脚本,记录多脚本信息
-                            logger.info(f"表 {table_name} 有多个不同脚本: {list(scripts_info.keys())}")
-                            # 暂时使用第一个脚本的信息作为默认值
-                            first_script = list(scripts_info.keys())[0]
-                            table_info['source_tables'] = scripts_info[first_script]["sources"]
-                            table_info['script_name'] = first_script
-                            table_info['script_type'] = scripts_info[first_script]["script_type"]
-                            table_info['script_exec_mode'] = scripts_info[first_script]["script_exec_mode"]
-                    else:
-                        logger.warning(f"表 {table_name} 未找到有效的脚本信息")
-                        table_info['source_tables'] = []  # 使用空数组
-                        # 向下兼容
-                        table_info['source_table'] = None
-                else:
-                    logger.warning(f"未找到表 {table_name} 的关系信息")
-                    table_info['source_tables'] = []  # 使用空数组
-                    # 向下兼容
-                    table_info['source_table'] = None
-            else:
-                logger.warning(f"在Neo4j中找不到表 {table_name} 的信息")
-    except Exception as e:
-        logger.error(f"获取表 {table_name} 的信息时出错: {str(e)}")
-    finally:
-        driver.close()
-    
-    return table_info
-
-def process_dependencies(tables_info):
-    """处理表间依赖关系,添加被动调度的表"""
-    # 存储所有表信息的字典
-    all_tables = {t['target_table']: t for t in tables_info}
-    driver = get_neo4j_driver()
-    
-    try:
-        with driver.session() as session:
-            for table_name, table_info in list(all_tables.items()):
-                if table_info.get('target_table_label') == 'DataModel':
-                    # 查询其依赖表
-                    query = """
-                        MATCH (dm {en_name: $table_name})-[:DERIVED_FROM]->(dep)
-                        RETURN dep.en_name AS dep_name, labels(dep) AS dep_labels, 
-                               dep.status AS dep_status, dep.frequency AS dep_frequency
-                    """
-                    result = session.run(query, table_name=table_name)
-                    
-                    for record in result:
-                        dep_name = record.get("dep_name")
-                        dep_labels = record.get("dep_labels", [])
-                        dep_status = record.get("dep_status", True)
-                        dep_frequency = record.get("dep_frequency")
-                        
-                        # 处理未被直接调度的依赖表
-                        if dep_name and dep_name not in all_tables:
-                            logger.info(f"发现被动依赖表: {dep_name}, 标签: {dep_labels}")
-                            
-                            # 获取依赖表详细信息
-                            dep_info = get_table_info_from_neo4j(dep_name)
-                            dep_info['is_directly_schedule'] = False
-                            
-                            # 处理调度频率继承
-                            if not dep_info.get('frequency'):
-                                dep_info['frequency'] = table_info.get('frequency')
-                            
-                            # 确保向下兼容
-                            if not dep_info.get('default_update_frequency'):
-                                dep_info['default_update_frequency'] = table_info.get('default_update_frequency')
-                            
-                            all_tables[dep_name] = dep_info
-    except Exception as e:
-        logger.error(f"处理依赖关系时出错: {str(e)}")
-    finally:
-        driver.close()
-    
-    return list(all_tables.values())
-
-def filter_invalid_tables(tables_info):
-    """过滤无效表及其依赖,使用NetworkX构建依赖图"""
-    # 构建表名到索引的映射
-    table_dict = {t['target_table']: i for i, t in enumerate(tables_info)}
-    
-    # 找出无效表
-    invalid_tables = set()
-    for table in tables_info:
-        if table.get('target_table_status') is False:
-            invalid_tables.add(table['target_table'])
-            logger.info(f"表 {table['target_table']} 的状态为无效")
-    
-    # 构建依赖图
-    G = nx.DiGraph()
-    
-    # 添加所有节点
-    for table in tables_info:
-        G.add_node(table['target_table'])
-    
-    # 查询并添加依赖边
-    driver = get_neo4j_driver()
-    try:
-        with driver.session() as session:
-            for table in tables_info:
-                if table.get('target_table_label') == 'DataModel':
-                    query = """
-                        MATCH (source {en_name: $table_name})-[:DERIVED_FROM]->(target)
-                        RETURN target.en_name AS target_name
-                    """
-                    result = session.run(query, table_name=table['target_table'])
-                    
-                    for record in result:
-                        target_name = record.get("target_name")
-                        if target_name and target_name in table_dict:
-                            # 添加从目标到源的边,表示目标依赖于源
-                            G.add_edge(table['target_table'], target_name)
-                            logger.debug(f"添加依赖边: {table['target_table']} -> {target_name}")
-    except Exception as e:
-        logger.error(f"构建依赖图时出错: {str(e)}")
-    finally:
-        driver.close()
-    
-    # 找出依赖于无效表的所有表
-    downstream_invalid = set()
-    for invalid_table in invalid_tables:
-        # 获取可从无效表到达的所有节点
-        try:
-            descendants = nx.descendants(G, invalid_table)
-            downstream_invalid.update(descendants)
-            logger.info(f"表 {invalid_table} 的下游无效表: {descendants}")
-        except Exception as e:
-            logger.error(f"处理表 {invalid_table} 的下游依赖时出错: {str(e)}")
-    
-    # 合并所有无效表
-    all_invalid = invalid_tables.union(downstream_invalid)
-    logger.info(f"总共 {len(all_invalid)} 个表被标记为无效: {all_invalid}")
-    
-    # 过滤出有效表
-    valid_tables = [t for t in tables_info if t['target_table'] not in all_invalid]
-    logger.info(f"过滤后保留 {len(valid_tables)} 个有效表")
-    
-    return valid_tables
-
-def prepare_dag_schedule(**kwargs):
-    """准备DAG调度任务的主函数"""
-    dag_run = kwargs.get('dag_run')
-    logical_date = dag_run.logical_date
-    local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-    exec_date = local_logical_date.strftime('%Y-%m-%d')
-    
-    # 检查是否是手动触发
-    is_manual_trigger = dag_run.conf.get('MANUAL_TRIGGER', False) if dag_run.conf else False
-    if is_manual_trigger:
-        logger.info(f"【手动触发】当前DAG是手动触发的,使用传入的logical_date: {logical_date}")
-    
-    # 记录重要的时间参数
-    logger.info(f"【时间参数】prepare_dag_schedule: exec_date={exec_date}, logical_date={logical_date}, local_logical_date={local_logical_date}")
-    logger.info(f"开始准备执行日期 {exec_date} 的统一调度任务")
-    
-    # 1. 获取启用的表
-    enabled_tables = get_enabled_tables()
-    logger.info(f"从schedule_status表获取到 {len(enabled_tables)} 个启用的表")
-    
-    if not enabled_tables:
-        logger.warning("没有找到启用的表,准备工作结束")
-        return 0
-    
-    # 2. 获取表的详细信息
-    tables_info = []
-    for table_name in enabled_tables:
-        table_info = get_table_info_from_neo4j(table_name)
-        if table_info:
-            tables_info.append(table_info)
-    
-    logger.info(f"成功获取 {len(tables_info)} 个表的详细信息")
-    
-    # 3. 处理依赖关系,添加被动调度的表
-    enriched_tables = process_dependencies(tables_info)
-    logger.info(f"处理依赖后,总共有 {len(enriched_tables)} 个表")
-    
-    # 4. 过滤无效表及其依赖
-    valid_tables = filter_invalid_tables(enriched_tables)
-    logger.info(f"过滤无效表后,最终有 {len(valid_tables)} 个有效表")
-    
-    # 已删除对 airflow_dag_schedule 表的写入操作
-    # 只记录准备了多少个表
-    logger.info(f"处理了 {len(valid_tables)} 个有效表")
-    
-    # 7. 生成执行计划数据
-    resource_tasks = []
-    model_tasks = []
-    
-    for table in valid_tables:
-        if table.get('target_table_label') == 'DataResource':
-            task_info = {
-                "source_tables": [table.get('source_table')] if table.get('source_table') else [],
-                "target_table": table['target_table'],
-                "target_table_label": "DataResource",
-                "script_name": table.get('script_name'),
-                "script_exec_mode": table.get('script_exec_mode', 'append'),
-                "frequency": table.get('frequency')
-            }
-            # 为structure类型添加特殊属性
-            if table.get('target_type') == "structure":
-                task_info["target_type"] = "structure"
-                task_info["storage_location"] = table.get('storage_location')
-
-            resource_tasks.append(task_info)
-        elif table.get('target_table_label') == 'DataModel':
-            model_tasks.append({
-                "source_tables": [table.get('source_table')] if table.get('source_table') else [],
-                "target_table": table['target_table'],
-                "target_table_label": "DataModel",
-                "script_name": table.get('script_name'),
-                "script_exec_mode": table.get('script_exec_mode', 'append'),
-                "frequency": table.get('frequency')
-            })    
-    # 获取依赖关系
-    model_table_names = [t['target_table'] for t in model_tasks]
-    dependencies = {}
-    
-    driver = get_neo4j_driver()
-    try:
-        with driver.session() as session:
-            for table_name in model_table_names:
-                query = """
-                    MATCH (source:DataModel {en_name: $table_name})-[:DERIVED_FROM]->(target)
-                    RETURN source.en_name AS source, target.en_name AS target, labels(target) AS target_labels
-                """
-                result = session.run(query, table_name=table_name)
-                
-                deps = []
-                for record in result:
-                    target = record.get("target")
-                    target_labels = record.get("target_labels", [])
-                    
-                    if target:
-                        table_type = next((label for label in target_labels if label in ["DataModel", "DataResource"]), None)
-                        deps.append({
-                            "table_name": target,
-                            "table_type": table_type
-                        })
-                
-                dependencies[table_name] = deps
-    finally:
-        driver.close()
-    
-    # 创建执行计划
-    execution_plan = {
-        "exec_date": exec_date,
-        "logical_date": logical_date,
-        "local_logical_date": local_logical_date,
-        "resource_tasks": resource_tasks,
-        "model_tasks": model_tasks,
-        "dependencies": dependencies
-    }
-    
-    # 将执行计划保存到XCom
-    kwargs['ti'].xcom_push(key='execution_plan', value=execution_plan)
-    logger.info(f"准备了执行计划,包含 {len(resource_tasks)} 个资源表任务和 {len(model_tasks)} 个模型表任务")
-    
-    return len(valid_tables)
-
-def check_execution_plan(**kwargs):
-    """
-    检查执行计划是否存在且有效
-    返回False将阻止所有下游任务执行
-    """
-    dag_run = kwargs.get('dag_run')
-    logical_date = dag_run.logical_date
-    local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-    exec_date = local_logical_date.strftime('%Y-%m-%d')
-    
-    # 检查是否是手动触发
-    is_manual_trigger = dag_run.conf.get('MANUAL_TRIGGER', False) if dag_run.conf else False
-    if is_manual_trigger:
-        logger.info(f"【手动触发】当前DAG是手动触发的,使用传入的logical_date: {logical_date}")
-    
-    # 记录重要的时间参数
-    logger.info(f"【时间参数】check_execution_plan: exec_date={exec_date}, logical_date={logical_date}, local_logical_date={local_logical_date}")
-    logger.info("检查数据库中的执行计划是否存在且有效")
-    
-    # 从数据库获取执行计划
-    execution_plan = get_execution_plan_from_db(exec_date)
-    
-    # 检查是否成功获取到执行计划
-    if not execution_plan:
-        logger.error(f"未找到执行日期 {exec_date} 的执行计划")
-        return False
-    
-    # 检查执行计划是否包含必要字段
-    if "exec_date" not in execution_plan:
-        logger.error("执行计划缺少exec_date字段")
-        return False
-        
-    if not isinstance(execution_plan.get("resource_tasks", []), list):
-        logger.error("执行计划的resource_tasks字段无效")
-        return False
-        
-    if not isinstance(execution_plan.get("model_tasks", []), list):
-        logger.error("执行计划的model_tasks字段无效")
-        return False
-    
-    # 检查是否有任务数据
-    resource_tasks = execution_plan.get("resource_tasks", [])
-    model_tasks = execution_plan.get("model_tasks", [])
-    
-    if not resource_tasks and not model_tasks:
-        logger.warning("执行计划不包含任何任务")
-        # 如果没有任务,则阻止下游任务执行
-        return False
-    
-    logger.info(f"执行计划验证成功: 包含 {len(resource_tasks)} 个资源任务和 {len(model_tasks)} 个模型任务")
-    return True
-
-#############################################
-# 第二阶段: 数据处理阶段(Data Processing Phase)的函数
-#############################################
-
-def get_all_tasks(exec_date):
-    """
-    获取所有需要执行的任务(DataResource和DataModel)
-    直接从执行计划获取任务信息,不再查询数据库
-    """
-    # 从数据库获取执行计划
-    execution_plan = get_execution_plan_from_db(exec_date)
-    
-    if not execution_plan:
-        logger.warning(f"未找到执行日期 {exec_date} 的执行计划")
-        return [], []
-    
-    # 提取资源任务和模型任务
-    resource_tasks = execution_plan.get("resource_tasks", [])
-    model_tasks = execution_plan.get("model_tasks", [])
-    
-    logger.info(f"获取到 {len(resource_tasks)} 个资源任务和 {len(model_tasks)} 个模型任务")
-    return resource_tasks, model_tasks
-
-def get_table_dependencies(table_names):
-    """获取表之间的依赖关系"""
-    driver = get_neo4j_driver()
-    dependency_dict = {name: [] for name in table_names}
-    
-    try:
-        with driver.session() as session:
-            # 获取所有模型表之间的依赖关系
-            query = """
-                MATCH (source:DataModel)-[:DERIVED_FROM]->(target)
-                WHERE source.en_name IN $table_names
-                RETURN source.en_name AS source, target.en_name AS target, labels(target) AS target_labels
-            """
-            result = session.run(query, table_names=table_names)
-            
-            for record in result:
-                source = record.get("source")
-                target = record.get("target")
-                target_labels = record.get("target_labels", [])
-                
-                if source and target:
-                    # 将目标表添加到源表的依赖列表中
-                    dependency_dict[source].append({
-                        "table_name": target,
-                        "table_type": next((label for label in target_labels if label in ["DataModel", "DataResource"]), None)
-                    })
-                    logger.debug(f"依赖关系: {source} 依赖于 {target}")
-    except Exception as e:
-        logger.error(f"从Neo4j获取依赖关系时出错: {str(e)}")
-    finally:
-        driver.close()
-    
-    return dependency_dict
-
-def create_execution_plan(**kwargs):
-    """准备执行计划的函数,使用从准备阶段传递的数据"""
-    try:
-        dag_run = kwargs.get('dag_run')
-        logical_date = dag_run.logical_date
-        local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-        exec_date = local_logical_date.strftime('%Y-%m-%d')
-        
-        # 检查是否是手动触发
-        is_manual_trigger = dag_run.conf.get('MANUAL_TRIGGER', False) if dag_run.conf else False
-        if is_manual_trigger:
-            logger.info(f"【手动触发】当前DAG是手动触发的,使用传入的logical_date: {logical_date}")
-        
-        # 记录重要的时间参数
-        logger.info(f"【时间参数】create_execution_plan: exec_date={exec_date}, logical_date={logical_date}, local_logical_date={local_logical_date}")
-        
-        # 从XCom获取执行计划
-        execution_plan = kwargs['ti'].xcom_pull(task_ids='prepare_phase.prepare_dag_schedule', key='execution_plan')
-        
-        # 如果找不到执行计划,则从数据库获取
-        if not execution_plan:
-            # 获取执行日期
-            logger.info(f"未找到执行计划,从数据库获取。使用执行日期: {exec_date}")
-            
-            # 获取所有任务
-            resource_tasks, model_tasks = get_all_tasks(exec_date)
-            
-            if not resource_tasks and not model_tasks:
-                logger.warning(f"执行日期 {exec_date} 没有找到任务")
-                return 0
-            
-            # 为所有模型表获取依赖关系
-            model_table_names = [task["target_table"] for task in model_tasks]
-            dependencies = get_table_dependencies(model_table_names)
-            
-            # 创建执行计划
-            new_execution_plan = {
-                "exec_date": exec_date,
-                "resource_tasks": resource_tasks,
-                "model_tasks": model_tasks,
-                "dependencies": dependencies
-            }
-            
-            # 保存执行计划到XCom
-            kwargs['ti'].xcom_push(key='execution_plan', value=new_execution_plan)
-            logger.info(f"创建新的执行计划,包含 {len(resource_tasks)} 个资源表任务和 {len(model_tasks)} 个模型表任务")
-            
-            return new_execution_plan
-        
-        logger.info(f"成功获取执行计划")
-        return execution_plan
-    except Exception as e:
-        logger.error(f"创建执行计划时出错: {str(e)}")
-        # 返回空执行计划
-        empty_plan = {
-            "exec_date": get_today_date(),
-            "resource_tasks": [],
-            "model_tasks": [],
-            "dependencies": {}
-        }
-        
-        return empty_plan
-
-def process_resource(target_table, script_name, script_exec_mode, exec_date,**kwargs):
-    """处理单个资源表"""
-    task_id = f"resource_{target_table}"
-    logger.info(f"===== 开始执行 {task_id} =====")
-    logger.info(f"执行资源表 {target_table} 的脚本 {script_name}")
-
-    # 确保exec_date是字符串
-    if not isinstance(exec_date, str):
-        exec_date = str(exec_date)
-        logger.info(f"将exec_date转换为字符串: {exec_date}")
-
-    # 获取额外参数
-    target_type = kwargs.get('target_type')
-    storage_location = kwargs.get('storage_location')
-    frequency = kwargs.get('frequency')
-    source_tables = kwargs.get('source_tables', [])
-    
-    # 记录源表信息(如果有)
-    if source_tables and len(source_tables) > 0:
-        logger.info(f"资源表 {target_table} 有 {len(source_tables)} 个源表: {source_tables}")
-    
-    try:
-        # 使用新的函数执行脚本,传递相应参数
-        logger.info(f"调用execute_python_script: target_table={target_table}, script_name={script_name}")
-        
-        # 构建参数字典
-        script_params = {
-            "target_table": target_table,
-            "script_name": script_name,
-            "script_exec_mode": script_exec_mode,
-            "exec_date": exec_date,
-            "frequency": frequency,
-            "source_tables": source_tables
-        }
-        
-        # 添加特殊参数(如果有)
-        if target_type == "structure":
-            logger.info(f"处理structure类型的资源表,文件路径: {storage_location}")
-            script_params["target_type"] = target_type
-            script_params["storage_location"] = storage_location
-        
-        # logger.debug 打印所有的script_params
-        logger.debug(f"script_params: {script_params}")
-        
-        # 执行脚本
-        result = execute_python_script(**script_params)
-        logger.info(f"资源表 {target_table} 处理完成,结果: {result}")
-        return result
-    except Exception as e:
-        logger.error(f"处理资源表 {target_table} 时出错: {str(e)}")
-        import traceback
-        logger.error(traceback.format_exc())
-        logger.info(f"===== 结束执行 {task_id} (失败) =====")
-        return False
-    finally:
-        logger.info(f"===== 结束执行 {task_id} =====")
-
-
-def process_model(target_table, script_name, script_exec_mode, exec_date, source_tables=None):
-    """处理单个模型表,支持多个源表"""
-    task_id = f"model_{target_table}"
-    logger.info(f"===== 开始执行 {task_id} =====")
-    logger.info(f"执行模型表 {target_table} 的脚本 {script_name}")
-    
-    # 确保exec_date是字符串
-    if not isinstance(exec_date, str):
-        exec_date = str(exec_date)
-        logger.info(f"将exec_date转换为字符串: {exec_date}")
-    
-    # 记录源表信息
-    if source_tables and len(source_tables) > 0:
-        logger.info(f"模型表 {target_table} 有 {len(source_tables)} 个源表: {source_tables}")
-    
-    try:
-        # 使用新的函数执行脚本,不依赖数据库
-        logger.info(f"调用execute_python_script: target_table={target_table}, script_name={script_name}")
-        result = execute_python_script(
-            target_table=target_table,
-            script_name=script_name,
-            script_exec_mode=script_exec_mode,
-            exec_date=exec_date,
-            source_tables=source_tables  # 传递源表列表
-        )
-        logger.info(f"模型表 {target_table} 处理完成,结果: {result}")
-        return result
-    except Exception as e:
-        logger.error(f"处理模型表 {target_table} 时出错: {str(e)}")
-        import traceback
-        logger.error(traceback.format_exc())
-        logger.info(f"===== 结束执行 {task_id} (失败) =====")
-        return False
-    finally:
-        logger.info(f"===== 结束执行 {task_id} =====")
-
-
-# 添加新函数,用于从数据库获取执行计划
-def get_execution_plan_from_db(ds):
-    """
-    从数据库airflow_exec_plans表中获取执行计划
-    
-    参数:
-        ds (str): 执行日期,格式为'YYYY-MM-DD'
-        
-    返回:
-        dict: 执行计划字典,如果找不到则返回None
-    """
-    # 记录输入参数详细信息
-    if isinstance(ds, datetime):
-        if ds.tzinfo:
-            logger.debug(f"【执行日期】get_execution_plan_from_db接收到datetime对象: {ds}, 带时区: {ds.tzinfo}")
-        else:
-            logger.debug(f"【执行日期】get_execution_plan_from_db接收到datetime对象: {ds}, 无时区")
-    else:
-        logger.debug(f"【执行日期】get_execution_plan_from_db接收到: {ds}, 类型: {type(ds)}")
-    
-    logger.info(f"尝试从数据库获取执行日期 {ds} 的执行计划")
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    execution_plan = None
-    
-    try:
-        # 查询条件a: 当前日期=表的exec_date,如果有多条记录,取insert_time最大的一条
-        cursor.execute("""
-            SELECT plan, run_id, insert_time
-            FROM airflow_exec_plans
-            WHERE dag_id = 'dag_dataops_pipeline_prepare_scheduler' AND exec_date = %s
-            ORDER BY insert_time DESC
-            LIMIT 1
-        """, (ds,))
-        result = cursor.fetchone()
-        
-        if result:
-            # 获取计划、run_id和insert_time
-            plan_json, run_id, insert_time = result
-            logger.info(f"找到当前日期 exec_date={ds} 的执行计划记录,run_id: {run_id}, insert_time: {insert_time}")
-            
-            # 处理plan_json可能已经是dict的情况
-            if isinstance(plan_json, dict):
-                execution_plan = plan_json
-            else:
-                execution_plan = json.loads(plan_json)
-                
-            return execution_plan
-        
-        # 查询条件b: 找不到当前日期的记录,查找exec_date<当前ds的最新记录
-        logger.info(f"未找到当前日期 exec_date={ds} 的执行计划记录,尝试查找历史记录")
-        cursor.execute("""
-            SELECT plan, run_id, insert_time, exec_date
-            FROM airflow_exec_plans
-            WHERE dag_id = 'dag_dataops_pipeline_prepare_scheduler' AND exec_date < %s
-            ORDER BY exec_date DESC, insert_time DESC
-            LIMIT 1
-        """, (ds,))
-        result = cursor.fetchone()
-        
-        if result:
-            # 获取计划、run_id、insert_time和exec_date
-            plan_json, run_id, insert_time, plan_ds = result
-            logger.info(f"找到历史执行计划记录,exec_date: {plan_ds}, run_id: {run_id}, insert_time: {insert_time}")
-            
-            # 处理plan_json可能已经是dict的情况
-            if isinstance(plan_json, dict):
-                execution_plan = plan_json
-            else:
-                execution_plan = json.loads(plan_json)
-                
-            return execution_plan
-        
-        # 找不到任何执行计划记录
-        logger.error(f"在数据库中未找到任何执行计划记录,当前DAG exec_date={ds}")
-        return None
-        
-    except Exception as e:
-        logger.error(f"从数据库获取执行计划时出错: {str(e)}")
-        import traceback
-        logger.error(traceback.format_exc())
-        return None
-    finally:
-        cursor.close()
-        conn.close()
-
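A side note on the plan handling above: the jsonb column may come back from psycopg2 already decoded as a dict, or as a JSON string depending on how the value was written. A minimal standalone sketch of the dict-vs-string normalization used in get_execution_plan_from_db (illustrative only, not part of the DAG file):

import json

def normalize_plan(plan_payload):
    # jsonb columns usually arrive as dict; text columns arrive as a JSON string
    if isinstance(plan_payload, dict):
        return plan_payload
    return json.loads(plan_payload)

assert normalize_plan('{"exec_date": "2024-01-01"}') == {"exec_date": "2024-01-01"}
assert normalize_plan({"exec_date": "2024-01-01"}) == {"exec_date": "2024-01-01"}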
-# 创建DAG
-with DAG(
-    "dag_dataops_pipeline_data_scheduler", 
-    start_date=datetime(2024, 1, 1), 
-    schedule_interval="@daily", 
-    catchup=False,
-    default_args={
-        'owner': 'airflow',
-        'depends_on_past': False,
-        'email_on_failure': False,
-        'email_on_retry': False,
-        'retries': 1,
-        'retry_delay': timedelta(minutes=5)
-    },
-    params={
-        'MANUAL_TRIGGER': False, 
-    },
-) as dag:
-    
-    # 记录DAG实例化时的重要信息
-    now = datetime.now()
-    now_with_tz = pytz.timezone('Asia/Shanghai').localize(now)  # 使用localize,避免replace(tzinfo=...)引入pytz的LMT偏移
-    default_exec_date = get_today_date()
-    logger.info(f"【DAG初始化】当前时间: {now} / {now_with_tz}, 默认执行日期: {default_exec_date}")
-    
-    #############################################
-    # 阶段1: 准备阶段(Prepare Phase)
-    #############################################
-    with TaskGroup("prepare_phase") as prepare_group:
-        # 任务开始标记
-        start_preparation = EmptyOperator(
-            task_id="start_preparation"
-        )
-        
-        # 准备调度任务
-        prepare_task = PythonOperator(
-            task_id="prepare_dag_schedule",
-            python_callable=prepare_dag_schedule,
-            provide_context=True
-        )
-        
-        # 验证执行计划有效性
-        check_plan = ShortCircuitOperator(
-            task_id="check_execution_plan",
-            python_callable=check_execution_plan,
-            provide_context=True
-        )
-        
-        # 创建执行计划 
-        create_plan = PythonOperator(
-            task_id="create_execution_plan",
-            python_callable=create_execution_plan,
-            provide_context=True
-        )
-        
-        # 准备完成标记
-        preparation_completed = EmptyOperator(
-            task_id="preparation_completed"
-        )
-        
-        # 设置任务依赖
-        start_preparation >> prepare_task >> check_plan >> create_plan >> preparation_completed
-    
-    #############################################
-    # 阶段2: 数据处理阶段(Data Processing Phase)
-    #############################################
-    with TaskGroup("data_processing_phase") as data_group:
-        # 数据处理开始任务
-        start_processing = EmptyOperator(
-            task_id="start_processing"
-        )
-        
-        # 数据处理完成标记
-        processing_completed = EmptyOperator(
-            task_id="processing_completed",
-            trigger_rule="none_failed_min_one_success"  # 只要有一个任务成功且没有失败的任务就标记为完成
-        )
-        
-        # 设置依赖
-        start_processing >> processing_completed
-    
-    
-    # 设置两个阶段之间的依赖关系
-    prepare_group >> data_group
-
-    # 尝试从数据库获取执行计划
-    try:
-        # 获取当前DAG的执行日期
-        exec_date = get_today_date()  # 使用当天日期作为默认值
-        logger.info(f"当前DAG执行日期 ds={exec_date},尝试从数据库获取执行计划")
-        
-        # 记录实际使用的执行日期的时区信息和原始格式
-        if isinstance(exec_date, datetime):
-            logger.info(f"【执行日期详情】类型: datetime, 时区: {exec_date.tzinfo}, 值: {exec_date}")
-        else:
-            logger.info(f"【执行日期详情】类型: {type(exec_date)}, 值: {exec_date}")
-        
-        # 从数据库获取执行计划
-        execution_plan = get_execution_plan_from_db(exec_date)
-        
-        # 检查是否成功获取到执行计划
-        if execution_plan is None:
-            error_msg = f"无法从数据库获取有效的执行计划,当前DAG exec_date={exec_date}"
-            logger.error(error_msg)
-            # 抛出异常,使DAG解析失败,从而阻止下游任务执行
-            raise ValueError(error_msg)
-        
-        # 如果获取到了执行计划,处理它
-        logger.info(f"成功从数据库获取执行计划")
-        
-        # 提取信息
-        exec_date = execution_plan.get("exec_date", exec_date)
-        resource_tasks = execution_plan.get("resource_tasks", [])
-        model_tasks = execution_plan.get("model_tasks", [])
-        dependencies = execution_plan.get("dependencies", {})
-        
-        logger.info(f"执行计划: exec_date={exec_date}, resource_tasks数量={len(resource_tasks)}, model_tasks数量={len(model_tasks)}")
-        
-        # 如果执行计划为空(没有任务),也应该失败
-        if not resource_tasks and not model_tasks:
-            error_msg = f"执行计划中没有任何任务,当前DAG exec_date={exec_date}"
-            logger.error(error_msg)
-            raise ValueError(error_msg)
-        
-        # 动态创建处理任务
-        task_dict = {}
-        
-        # 1. 创建资源表任务
-        for task_info in resource_tasks:
-            table_name = task_info["target_table"]
-            script_name = task_info["script_name"]
-            exec_mode = task_info.get("script_exec_mode", "append")
-            source_tables = task_info.get("source_tables", [])  # 获取源表数组
-            
-            # 创建安全的任务ID
-            safe_table_name = table_name.replace(".", "_").replace("-", "_")
-
-            # 构建op_kwargs参数
-            op_kwargs = {
-                "target_table": table_name,
-                "script_name": script_name,
-                "script_exec_mode": exec_mode,
-                "exec_date": str(exec_date),
-                "source_tables": source_tables  # 添加源表数组
-            }
-
-            # 添加特殊参数(如果有)
-            if "target_type" in task_info and task_info["target_type"] == "structure":
-                op_kwargs["target_type"] = task_info["target_type"]
-                op_kwargs["storage_location"] = task_info.get("storage_location")
-            
-            # 添加frequency参数(如果有)
-            if "frequency" in task_info:
-                op_kwargs["frequency"] = task_info["frequency"]
-            
-            # 确保所有任务都是data_processing_phase的一部分
-            with data_group:
-                resource_task = PythonOperator(
-                    task_id=f"resource_{safe_table_name}",
-                    python_callable=process_resource,
-                    op_kwargs=op_kwargs,
-                    retries=TASK_RETRY_CONFIG["retries"],
-                    retry_delay=timedelta(minutes=TASK_RETRY_CONFIG["retry_delay_minutes"])
-                )
-            
-            # 将任务添加到字典
-            task_dict[table_name] = resource_task
-            
-            # 设置与start_processing的依赖
-            start_processing >> resource_task
-            
-            # 如果资源表有自己的源表依赖
-            if source_tables and isinstance(source_tables, list):
-                for source_table in source_tables:
-                    if source_table and source_table in task_dict:
-                        task_dict[source_table] >> resource_task
-                        logger.info(f"设置资源表依赖: {source_table} >> {table_name}")
-        
-        # 创建有向图,用于检测模型表之间的依赖关系
-        G = nx.DiGraph()
-        
-        # 将所有模型表添加为节点
-        for task_info in model_tasks:
-            table_name = task_info["target_table"]
-            G.add_node(table_name)
-        
-        # 添加模型表之间的依赖边
-        for source, deps in dependencies.items():
-            for dep in deps:
-                if dep.get("table_type") == "DataModel" and dep.get("table_name") in G.nodes():
-                    G.add_edge(dep.get("table_name"), source)  # 依赖方向:依赖项 -> 目标
-        
-        # 检测循环依赖并处理
-        try:
-            cycles = list(nx.simple_cycles(G))
-            if cycles:
-                logger.warning(f"检测到循环依赖: {cycles}")
-                for cycle in cycles:
-                    G.remove_edge(cycle[-1], cycle[0])
-                    logger.info(f"打破循环依赖: 移除 {cycle[-1]} -> {cycle[0]} 的依赖")
-        except Exception as e:
-            logger.error(f"检测循环依赖时出错: {str(e)}")
-        
-        # 生成拓扑排序,确定执行顺序
-        execution_order = []
-        try:
-            execution_order = list(nx.topological_sort(G))
-        except Exception as e:
-            logger.error(f"生成拓扑排序失败: {str(e)}")
-            execution_order = [task_info["target_table"] for task_info in model_tasks]
-        
-        # 2. 按拓扑排序顺序创建模型表任务
-        for table_name in execution_order:
-            task_info = next((t for t in model_tasks if t["target_table"] == table_name), None)
-            if not task_info:
-                continue
-                
-            script_name = task_info["script_name"]
-            exec_mode = task_info.get("script_exec_mode", "append")
-            source_tables = task_info.get("source_tables", [])  # 获取源表数组
-            
-            # 创建安全的任务ID
-            safe_table_name = table_name.replace(".", "_").replace("-", "_")
-            
-            # 确保所有任务都是data_processing_phase的一部分
-            with data_group:
-                model_task = PythonOperator(
-                    task_id=f"model_{safe_table_name}",
-                    python_callable=process_model,
-                    op_kwargs={
-                        "target_table": table_name,
-                        "script_name": script_name,
-                        "script_exec_mode": exec_mode,
-                        # 确保使用字符串而不是可能是默认(非字符串)格式的执行日期
-                        "exec_date": str(exec_date),
-                        "source_tables": source_tables  # 传递源表数组
-                    },
-                    retries=TASK_RETRY_CONFIG["retries"],
-                    retry_delay=timedelta(minutes=TASK_RETRY_CONFIG["retry_delay_minutes"])
-                )
-            
-            # 将任务添加到字典
-            task_dict[table_name] = model_task
-            
-            # 设置依赖关系,基于source_tables和dependencies
-            has_dependency = False
-            
-            # 先根据source_tables直接设置依赖
-            if isinstance(source_tables, list):
-                for source_table in source_tables:
-                    if source_table and source_table in task_dict:
-                        task_dict[source_table] >> model_task
-                        has_dependency = True
-                        logger.info(f"根据source_tables设置依赖: {source_table} >> {table_name}")
-            
-            # 然后处理dependencies中的依赖
-            deps = dependencies.get(table_name, [])
-            for dep in deps:
-                dep_table = dep.get("table_name")
-                dep_type = dep.get("table_type")
-                
-                if dep_table in task_dict:
-                    # 避免重复设置依赖
-                    if dep_table not in source_tables:
-                        task_dict[dep_table] >> model_task
-                        has_dependency = True
-                        logger.info(f"根据dependencies设置依赖: {dep_table} >> {table_name}")
-            
-            # 如果没有依赖,则依赖于start_processing和资源表任务
-            if not has_dependency:
-                # 从start_processing任务直接连接
-                start_processing >> model_task
-                
-                # 同时从所有资源表任务连接
-                resource_count = 0
-                for resource_table in resource_tasks:
-                    if resource_count >= 5:  # 最多设置5个依赖
-                        break
-                    
-                    resource_name = resource_table["target_table"]
-                    if resource_name in task_dict:
-                        task_dict[resource_name] >> model_task
-                        resource_count += 1
-        
-        # 找出所有终端任务(没有下游依赖的任务)
-        terminal_tasks = []
-        
-        # 检查所有模型表任务
-        for table_name in execution_order:
-            # 检查是否有下游任务
-            has_downstream = False
-            for source, deps in dependencies.items():
-                if source == table_name:  # 跳过自身
-                    continue
-                for dep in deps:
-                    if dep.get("table_name") == table_name:
-                        has_downstream = True
-                        break
-                if has_downstream:
-                    break
-            
-            # 如果没有下游任务,添加到终端任务列表
-            if not has_downstream and table_name in task_dict:
-                terminal_tasks.append(table_name)
-        
-        # 如果没有模型表任务,将所有资源表任务视为终端任务
-        if not model_tasks and resource_tasks:
-            terminal_tasks = [task["target_table"] for task in resource_tasks]
-            logger.info(f"没有模型表任务,将所有资源表任务视为终端任务: {terminal_tasks}")
-        
-        # 如果没有识别出终端任务,保留 start_processing >> processing_completed 的默认依赖链
-        if not terminal_tasks:
-            logger.warning("未找到任何终端任务,使用默认依赖链")
-        else:
-            # 将所有终端任务连接到完成标记
-            for table_name in terminal_tasks:
-                if table_name in task_dict:
-                    task_dict[table_name] >> processing_completed
-                    logger.info(f"设置终端任务: {table_name} >> processing_completed")
-    except Exception as e:
-        logger.error(f"加载执行计划时出错: {str(e)}")
-        import traceback
-        logger.error(traceback.format_exc())
-logger.info(f"DAG dag_dataops_pipeline_data_scheduler 定义完成")

+ 0 - 885
dags/dag_dataops_pipeline_prepare_scheduler.py

@@ -1,885 +0,0 @@
-# dag_dataops_pipeline_prepare_scheduler.py
-from airflow import DAG
-from airflow.operators.python import PythonOperator, ShortCircuitOperator
-from airflow.operators.empty import EmptyOperator
-from datetime import datetime, timedelta
-import logging
-import networkx as nx
-import json
-import os
-import re
-import glob
-from pathlib import Path
-import hashlib
-import pendulum
-from common import (
-    get_pg_conn, 
-    get_neo4j_driver,
-    get_today_date
-)
-from config import PG_CONFIG, NEO4J_CONFIG
-
-# 创建日志记录器
-logger = logging.getLogger(__name__)
-
-def get_enabled_tables():
-    """获取所有启用的表"""
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    try:
-        cursor.execute("""
-            SELECT owner_id, table_name 
-            FROM schedule_status 
-            WHERE schedule_is_enabled = TRUE
-        """)
-        result = cursor.fetchall()
-        return [row[1] for row in result]  # 只返回表名
-    except Exception as e:
-        logger.error(f"获取启用表失败: {str(e)}")
-        return []
-    finally:
-        cursor.close()
-        conn.close()
-
-def check_table_directly_subscribed(table_name):
-    """检查表是否在schedule_status表中直接调度"""
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    try:
-        cursor.execute("""
-            SELECT schedule_is_enabled
-            FROM schedule_status 
-            WHERE table_name = %s
-        """, (table_name,))
-        result = cursor.fetchone()
-        return result and result[0] is True
-    except Exception as e:
-        logger.error(f"检查表订阅状态失败: {str(e)}")
-        return False
-    finally:
-        cursor.close()
-        conn.close()
-
-
-def should_execute_today(table_name, frequency, exec_date):
-    """
-    判断指定频率的表在给定执行日期是否应该执行
-    
-    参数:
-        table_name (str): 表名,用于日志记录
-        frequency (str): 调度频率,如'daily'、'weekly'、'monthly'、'yearly',为None时默认为'daily'
-        exec_date (str): 执行日期,格式为'YYYY-MM-DD'
-    
-    返回:
-        bool: 如果该表应该在执行日期执行,则返回True,否则返回False
-    """
-    # 将执行日期字符串转换为pendulum日期对象
-    try:
-        exec_date_obj = pendulum.parse(exec_date)
-    except Exception as e:
-        logger.error(f"解析执行日期 {exec_date} 出错: {str(e)},使用当前日期")
-        exec_date_obj = pendulum.today()
-    
-    # 计算下一个日期,用于判断是否是月初、周初等
-    next_date = exec_date_obj.add(days=1)
-    
-    # 如果频率为None或空字符串,默认为daily
-    if not frequency:
-        logger.info(f"表 {table_name} 未指定调度频率,默认为daily")
-        return True
-    
-    frequency = frequency.lower() if isinstance(frequency, str) else 'daily'
-    
-    if frequency == 'daily':
-        # 日任务每天都执行
-        return True
-    elif frequency == 'weekly':
-        # 周任务只在周日执行(因为exec_date+1是周一时才执行)
-        is_sunday = next_date.day_of_week == 1  # 1表示周一
-        logger.info(f"表 {table_name} 是weekly任务,exec_date={exec_date},next_date={next_date.to_date_string()},是否周日: {is_sunday}")
-        return is_sunday
-    elif frequency == 'monthly':
-        # 月任务只在每月最后一天执行(因为exec_date+1是月初时才执行)
-        is_month_end = next_date.day == 1
-        logger.info(f"表 {table_name} 是monthly任务,exec_date={exec_date},next_date={next_date.to_date_string()},是否月末: {is_month_end}")
-        return is_month_end
-    elif frequency == 'quarterly':
-        # 季度任务只在每季度最后一天执行(因为exec_date+1是季度初时才执行)
-        is_quarter_end = next_date.day == 1 and next_date.month in [1, 4, 7, 10]
-        logger.info(f"表 {table_name} 是quarterly任务,exec_date={exec_date},next_date={next_date.to_date_string()},是否季末: {is_quarter_end}")
-        return is_quarter_end
-    elif frequency == 'yearly':
-        # 年任务只在每年最后一天执行(因为exec_date+1是年初时才执行)
-        is_year_end = next_date.day == 1 and next_date.month == 1
-        logger.info(f"表 {table_name} 是yearly任务,exec_date={exec_date},next_date={next_date.to_date_string()},是否年末: {is_year_end}")
-        return is_year_end
-    else:
-        # 未知频率,默认执行
-        logger.warning(f"表 {table_name} 使用未知的调度频率: {frequency},默认执行")
-        return True
-
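A quick standalone check of the boundary rule used by should_execute_today: a table runs when exec_date + 1 day crosses into the next week, month, quarter or year (the weekly branch above compares day_of_week against 1, which is Monday under pendulum 2.x numbering). The dates below are only for illustration:

import pendulum

# weekly: exec_date is a Sunday, so exec_date + 1 day is a Monday
assert pendulum.parse("2024-04-07").add(days=1).format("dddd") == "Monday"

# monthly: exec_date is the last day of the month, so exec_date + 1 day is the 1st
assert pendulum.parse("2024-01-31").add(days=1).day == 1

# yearly: exec_date is December 31st, so exec_date + 1 day is January 1st
next_day = pendulum.parse("2024-12-31").add(days=1)
assert next_day.day == 1 and next_day.month == 1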
-def get_table_info_from_neo4j(table_name):
-    """从Neo4j获取表的详细信息"""
-    driver = get_neo4j_driver()
-    # 检查表是否直接订阅
-    is_directly_schedule = check_table_directly_subscribed(table_name)
-
-    table_info = {
-        'target_table': table_name,
-        'is_directly_schedule': is_directly_schedule,  # 从schedule_status表获取的直接调度标志
-    }
-    
-    try:
-        with driver.session() as session:
-            # 查询表标签和状态
-            query_table = """
-                MATCH (t {en_name: $table_name})
-                RETURN labels(t) AS labels, t.status AS status, t.frequency AS frequency,
-                       t.type AS type, t.storage_location AS storage_location
-            """
-            result = session.run(query_table, table_name=table_name)
-            record = result.single()
-            
-            if record:
-                labels = record.get("labels", [])
-                table_info['target_table_label'] = next((label for label in labels if label in ["DataResource", "DataModel", "DataSource"]), None)
-                table_info['target_table_status'] = record.get("status", True)  # 默认为True
-                # table_info['default_update_frequency'] = record.get("frequency")
-                table_info['frequency'] = record.get("frequency")
-                table_info['target_type'] = record.get("type")  # 获取type属性
-                table_info['storage_location'] = record.get("storage_location")  # 获取storage_location属性
-                
-                # 根据标签类型查询关系和脚本信息
-                if "DataResource" in labels:
-                    # 检查是否为structure类型
-                    if table_info.get('target_type') == "structure":
-                        # 对于structure类型,设置默认值,不查询关系
-                        table_info['source_tables'] = []  # 使用空数组表示无源表
-                        table_info['script_name'] = "load_file.py"
-                        table_info['script_type'] = "python"
-                        
-                        # structure类型(如CSV文件)的DataResource没有上游,使用默认的append模式
-                        table_info['script_exec_mode'] = "append"
-                        logger.info(f"表 {table_name} 为structure类型,使用默认执行模式: append")
-
-                        return table_info
-                    else:
-                        query_rel = """
-                            MATCH (target {en_name: $table_name})-[rel:ORIGINATES_FROM]->(source)
-                            WITH source, rel, 
-                                 CASE WHEN rel.script_name IS NULL THEN target.en_name + '_script.py' ELSE rel.script_name END AS script_name,
-                                 CASE WHEN rel.script_type IS NULL THEN 'python' ELSE rel.script_type END AS script_type
-                            RETURN source.en_name AS source_table, script_name AS script_name,
-                                   script_type AS script_type, 'append' AS script_exec_mode
-                        """
-                elif "DataModel" in labels:
-                    query_rel = """
-                        MATCH (target {en_name: $table_name})-[rel:DERIVED_FROM]->(source)
-                        WITH source, rel, 
-                             CASE WHEN rel.script_name IS NULL THEN target.en_name + '_script.py' ELSE rel.script_name END AS script_name,
-                             CASE WHEN rel.script_type IS NULL THEN 'python' ELSE rel.script_type END AS script_type
-                        RETURN source.en_name AS source_table, script_name AS script_name,
-                               script_type AS script_type, 'append' AS script_exec_mode
-                    """
-                else:
-                    logger.warning(f"表 {table_name} 不是DataResource或DataModel类型")
-                    return table_info
-                
-                # 收集所有关系记录
-                result = session.run(query_rel, table_name=table_name)
-                # 检查result对象是否有collect方法,否则使用data方法或list直接转换
-                try:
-                    if hasattr(result, 'collect'):
-                        records = result.collect()  # 使用collect()获取所有记录
-                    else:
-                        # 尝试使用其他方法获取记录
-                        logger.info(f"表 {table_name} 的查询结果不支持collect方法,尝试使用其他方法")
-                        try:
-                            records = list(result)  # 直接转换为列表
-                        except Exception as e1:
-                            logger.warning(f"尝试列表转换失败: {str(e1)},尝试使用data方法")
-                            try:
-                                records = result.data()  # 使用data()方法
-                            except Exception as e2:
-                                logger.warning(f"所有方法都失败,使用空列表: {str(e2)}")
-                                records = []
-                except Exception as e:
-                    logger.warning(f"获取查询结果时出错: {str(e)},使用空列表")
-                    records = []
-                
-                # 记录查询到的原始记录
-                logger.info(f"表 {table_name} 查询到 {len(records)} 条关系记录")
-                for idx, rec in enumerate(records):
-                    logger.info(f"关系记录[{idx}]: source_table={rec.get('source_table')}, script_name={rec.get('script_name')}, " 
-                                f"script_type={rec.get('script_type')}, script_exec_mode={rec.get('script_exec_mode')}")
-                
-                if records:
-                    # 按脚本名称分组源表
-                    scripts_info = {}
-                    for record in records:
-                        script_name = record.get("script_name")
-                        source_table = record.get("source_table")
-                        script_type = record.get("script_type", "python")
-                        script_exec_mode = record.get("script_exec_mode", "append")
-                        
-                        logger.info(f"处理记录: source_table={source_table}, script_name={script_name}")
-                        
-                        # 如果script_name为空,生成默认的脚本名
-                        if not script_name:
-                            script_name = f"{table_name}_process.py"
-                            logger.warning(f"表 {table_name} 的关系中没有script_name属性,使用默认值: {script_name}")
-                            
-                        if script_name not in scripts_info:
-                            scripts_info[script_name] = {
-                                "sources": [],
-                                "script_type": script_type,
-                                "script_exec_mode": script_exec_mode
-                            }
-                        
-                        # 确保source_table有值且不为None才添加到sources列表中
-                        if source_table and source_table not in scripts_info[script_name]["sources"]:
-                            scripts_info[script_name]["sources"].append(source_table)
-                            logger.debug(f"为表 {table_name} 的脚本 {script_name} 添加源表: {source_table}")
-                    
-                    # 处理分组信息
-                    if scripts_info:
-                        # 存储完整的脚本信息
-                        table_info['scripts_info'] = scripts_info
-                        
-                        # 如果只有一个脚本,直接使用它
-                        if len(scripts_info) == 1:
-                            script_name = list(scripts_info.keys())[0]
-                            script_info = scripts_info[script_name]
-                            
-                            table_info['source_tables'] = script_info["sources"]  # 使用数组
-                            table_info['script_name'] = script_name
-                            table_info['script_type'] = script_info["script_type"]
-                            table_info['script_exec_mode'] = script_info["script_exec_mode"]
-                            logger.info(f"表 {table_name} 有单个脚本 {script_name},源表: {script_info['sources']}")
-                        else:
-                            # 如果有多个不同脚本,记录多脚本信息
-                            logger.info(f"表 {table_name} 有多个不同脚本: {list(scripts_info.keys())}")
-                            # 暂时使用第一个脚本的信息作为默认值
-                            first_script = list(scripts_info.keys())[0]
-                            table_info['source_tables'] = scripts_info[first_script]["sources"]
-                            table_info['script_name'] = first_script
-                            table_info['script_type'] = scripts_info[first_script]["script_type"]
-                            table_info['script_exec_mode'] = scripts_info[first_script]["script_exec_mode"]
-                    else:
-                        logger.warning(f"表 {table_name} 未找到有效的脚本信息")
-                        table_info['source_tables'] = []  # 使用空数组
-                else:
-                    logger.warning(f"未找到表 {table_name} 的关系信息")
-                    table_info['source_tables'] = []  # 使用空数组
-            else:
-                logger.warning(f"在Neo4j中找不到表 {table_name} 的信息")
-    except Exception as e:
-        logger.error(f"获取表 {table_name} 的信息时出错: {str(e)}")
-    finally:
-        driver.close()
-    
-    return table_info
-
-def process_dependencies(tables_info):
-    """处理表间依赖关系,添加被动调度的表"""
-    # 存储所有表信息的字典
-    all_tables = {t['target_table']: t for t in tables_info}
-    driver = get_neo4j_driver()
-    
-    try:
-        with driver.session() as session:
-            for table_name, table_info in list(all_tables.items()):
-                if table_info.get('target_table_label') == 'DataModel':
-                    # 查询其依赖表
-                    query = """
-                        MATCH (dm {en_name: $table_name})-[:DERIVED_FROM]->(dep)
-                        RETURN dep.en_name AS dep_name, labels(dep) AS dep_labels, 
-                               dep.status AS dep_status, dep.frequency AS dep_frequency
-                    """
-                    result = session.run(query, table_name=table_name)
-                    
-                    for record in result:
-                        dep_name = record.get("dep_name")
-                        dep_labels = record.get("dep_labels", [])
-                        dep_status = record.get("dep_status", True)
-                        dep_frequency = record.get("dep_frequency")
-                        
-                        # 处理未被直接调度的依赖表
-                        if dep_name and dep_name not in all_tables:
-                            logger.info(f"发现被动依赖表: {dep_name}, 标签: {dep_labels}")
-                            
-                            # 获取依赖表详细信息
-                            dep_info = get_table_info_from_neo4j(dep_name)
-                            dep_info['is_directly_schedule'] = False
-                            
-                            # 处理调度频率继承
-                            if not dep_info.get('frequency'):
-                                dep_info['frequency'] = table_info.get('frequency')
-                            
-                            all_tables[dep_name] = dep_info
-    except Exception as e:
-        logger.error(f"处理依赖关系时出错: {str(e)}")
-    finally:
-        driver.close()
-    
-    return list(all_tables.values())
-
-def filter_invalid_tables(tables_info):
-    """过滤无效表及其依赖,使用NetworkX构建依赖图"""
-    # 构建表名到索引的映射
-    table_dict = {t['target_table']: i for i, t in enumerate(tables_info)}
-    
-    # 找出无效表
-    invalid_tables = set()
-    for table in tables_info:
-        if table.get('target_table_status') is False:
-            invalid_tables.add(table['target_table'])
-            logger.info(f"表 {table['target_table']} 的状态为无效")
-    
-    # 构建依赖图
-    G = nx.DiGraph()
-    
-    # 添加所有节点
-    for table in tables_info:
-        G.add_node(table['target_table'])
-    
-    # 查询并添加依赖边
-    driver = get_neo4j_driver()
-    try:
-        with driver.session() as session:
-            for table in tables_info:
-                if table.get('target_table_label') == 'DataModel':
-                    query = """
-                        MATCH (source {en_name: $table_name})-[:DERIVED_FROM]->(target)
-                        RETURN target.en_name AS target_name
-                    """
-                    result = session.run(query, table_name=table['target_table'])
-                    
-                    for record in result:
-                        target_name = record.get("target_name")
-                        if target_name and target_name in table_dict:
-                            # 添加从目标到源的边,表示目标依赖于源
-                            G.add_edge(table['target_table'], target_name)
-                            logger.debug(f"添加依赖边: {table['target_table']} -> {target_name}")
-    except Exception as e:
-        logger.error(f"构建依赖图时出错: {str(e)}")
-    finally:
-        driver.close()
-    
-    # 找出依赖于无效表的所有表
-    downstream_invalid = set()
-    for invalid_table in invalid_tables:
-        # 获取可从无效表到达的所有节点
-        try:
-            descendants = nx.descendants(G, invalid_table)
-            downstream_invalid.update(descendants)
-            logger.info(f"表 {invalid_table} 的下游无效表: {descendants}")
-        except Exception as e:
-            logger.error(f"处理表 {invalid_table} 的下游依赖时出错: {str(e)}")
-    
-    # 合并所有无效表
-    all_invalid = invalid_tables.union(downstream_invalid)
-    logger.info(f"总共 {len(all_invalid)} 个表被标记为无效: {all_invalid}")
-    
-    # 过滤出有效表
-    valid_tables = [t for t in tables_info if t['target_table'] not in all_invalid]
-    logger.info(f"过滤后保留 {len(valid_tables)} 个有效表")
-    
-    return valid_tables
-
-def touch_data_scheduler_file():
-    """
-    更新数据调度器DAG文件的修改时间,触发重新解析
-    
-    返回:
-        bool: 是否成功更新
-    """
-    data_scheduler_path = os.path.join(os.path.dirname(__file__), 'dag_dataops_pipeline_data_scheduler.py')
-
-    
-    success = False
-    try:
-        if os.path.exists(data_scheduler_path):
-            # 更新文件修改时间,触发Airflow重新解析
-            os.utime(data_scheduler_path, None)
-            logger.info(f"已触发数据调度器DAG重新解析: {data_scheduler_path}")
-            success = True
-        else:
-            logger.warning(f"数据调度器DAG文件不存在: {data_scheduler_path}")
-                
-        return success
-    except Exception as e:
-        logger.error(f"触发DAG重新解析时出错: {str(e)}")
-        return False
-
-def get_subscription_state_hash():
-    """获取订阅表状态的哈希值"""
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    try:
-        cursor.execute("""
-            SELECT table_name, schedule_is_enabled
-            FROM schedule_status
-            ORDER BY table_name
-        """)
-        rows = cursor.fetchall()
-        # 将所有行拼接成一个字符串,然后计算哈希值
-        data_str = '|'.join(f"{row[0]}:{row[1]}" for row in rows)
-        return hashlib.md5(data_str.encode()).hexdigest()
-    except Exception as e:
-        logger.error(f"计算订阅表状态哈希值时出错: {str(e)}")
-        return None
-    finally:
-        cursor.close()
-        conn.close()
-
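The change-detection idea above can be exercised on its own: hash the ordered (table_name, enabled) pairs and compare against the previously stored value; any flipped flag or added/removed row produces a different digest. The rows below are made up:

import hashlib

def state_hash(rows):
    """rows: iterable of (table_name, enabled) tuples, already sorted by name."""
    data_str = "|".join(f"{name}:{enabled}" for name, enabled in rows)
    return hashlib.md5(data_str.encode()).hexdigest()

old = state_hash([("ods_orders", True), ("ods_users", True)])
new = state_hash([("ods_orders", True), ("ods_users", False)])
print(old != new)   # True -> a new execution plan would be built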
-def check_execution_plan_in_db(**kwargs):
-    """
-    检查当天的执行计划是否存在于数据库中
-    返回False将阻止所有下游任务执行
-    """
-    # 获取执行日期
-    dag_run = kwargs.get('dag_run')
-    logical_date = dag_run.logical_date
-    local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-    exec_date = local_logical_date.strftime('%Y-%m-%d')
-    logger.info(f"logical_date: {logical_date} ")
-    logger.info(f"local_logical_date {local_logical_date} ")
-    logger.info(f"检查执行日期 exec_date {exec_date} 的执行计划是否存在于数据库中")
-   
-    
-    # 检查数据库中是否存在执行计划
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    try:
-        cursor.execute("""
-            SELECT plan
-            FROM airflow_exec_plans
-            WHERE exec_date = %s
-            ORDER BY logical_date DESC
-            LIMIT 1
-        """, (exec_date,))
-        
-        result = cursor.fetchone()
-        if not result:
-            logger.error(f"数据库中不存在执行日期 {exec_date} 的执行计划")
-            return False
-        
-        # 检查执行计划内容是否有效
-        try:
-            # PostgreSQL的jsonb类型会被psycopg2自动转换为Python字典,无需再使用json.loads
-            plan_data = result[0]            
-            # 检查必要字段
-            if "exec_date" not in plan_data:
-                logger.error("执行计划缺少exec_date字段")
-                return False
-                
-            if not isinstance(plan_data.get("resource_tasks", []), list):
-                logger.error("执行计划的resource_tasks字段无效")
-                return False
-                
-            if not isinstance(plan_data.get("model_tasks", []), list):
-                logger.error("执行计划的model_tasks字段无效")
-                return False
-            
-            # 检查是否有任务数据
-            resource_tasks = plan_data.get("resource_tasks", [])
-            model_tasks = plan_data.get("model_tasks", [])
-            
-            logger.info(f"执行计划验证成功: 包含 {len(resource_tasks)} 个资源任务和 {len(model_tasks)} 个模型任务")
-            return True
-            
-        except Exception as je:
-            logger.error(f"处理执行计划数据时出错: {str(je)}")
-            return False
-        
-    except Exception as e:
-        logger.error(f"检查数据库中执行计划时出错: {str(e)}")
-        return False
-    finally:
-        cursor.close()
-        conn.close()
-
-def save_execution_plan_to_db(execution_plan, dag_id, run_id, logical_date, ds):
-    """
-    将执行计划保存到airflow_exec_plans表
-    
-    参数:
-        execution_plan (dict): 执行计划字典
-        dag_id (str): DAG的ID
-        run_id (str): DAG运行的ID
-        logical_date (datetime): 逻辑日期
-        ds (str): 日期字符串,格式为YYYY-MM-DD
-    
-    返回:
-        bool: 操作是否成功
-    """
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    
-    try:
-        # 将执行计划转换为JSON字符串
-        plan_json = json.dumps(execution_plan)
-        
-        # 获取本地时间
-        local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-        
-        # 插入记录
-        cursor.execute("""
-            INSERT INTO airflow_exec_plans
-            (dag_id, run_id, logical_date, local_logical_date, exec_date, plan)
-            VALUES (%s, %s, %s, %s, %s, %s)
-        """, (dag_id, run_id, logical_date, local_logical_date, ds, plan_json))
-        
-        conn.commit()
-        logger.info(f"成功将执行计划保存到airflow_exec_plans表,dag_id={dag_id}, run_id={run_id}, exec_date={ds}")
-        return True
-    except Exception as e:
-        logger.error(f"保存执行计划到数据库时出错: {str(e)}")
-        conn.rollback()
-        return False
-    finally:
-        cursor.close()
-        conn.close()
-
-def prepare_pipeline_dag_schedule(**kwargs):
-    """准备Pipeline DAG调度任务的主函数"""
-    # 检查是否是手动触发模式
-    is_manual_trigger = False
-    params = kwargs.get('params', {})
-    if params and 'MANUAL_TRIGGER' in params:
-        is_manual_trigger = params.get('MANUAL_TRIGGER', False)
-        if is_manual_trigger:
-            logger.info(f"接收到手动触发参数: MANUAL_TRIGGER={is_manual_trigger}")
-    
-    # 获取执行日期
-    dag_run = kwargs.get('dag_run')
-    logical_date = dag_run.logical_date
-    local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-    exec_date = local_logical_date.strftime('%Y-%m-%d')
-    logger.info(f"开始准备执行日期 {exec_date} 的Pipeline调度任务")
-    
-    # 检查是否需要创建新的执行计划
-    need_create_plan = False
-    
-    # 条件1: 数据库中不存在当天的执行计划
-    has_plan_in_db = check_execution_plan_in_db(**kwargs)
-    if not has_plan_in_db:
-        logger.info(f"数据库中不存在执行日期exec_date {exec_date} 的执行计划,需要创建新的执行计划")
-        need_create_plan = True
-    
-    # 条件2: schedule_status表中的数据发生了变更
-    if not need_create_plan:
-        # 计算当前哈希值
-        current_hash = get_subscription_state_hash()
-        # 读取上次记录的哈希值
-        hash_file = os.path.join(os.path.dirname(__file__), '.subscription_state')
-        last_hash = None
-        if os.path.exists(hash_file):
-            try:
-                with open(hash_file, 'r') as f:
-                    last_hash = f.read().strip()
-            except Exception as e:
-                logger.warning(f"读取上次订阅状态哈希值失败: {str(e)}")
-        
-        # 如果哈希值不同,表示数据发生了变更
-        if current_hash != last_hash:
-            logger.info(f"检测到schedule_status表数据变更。旧哈希值: {last_hash}, 新哈希值: {current_hash}")
-            need_create_plan = True
-    
-    # 手动触发模式覆盖以上判断
-    if is_manual_trigger:
-        logger.info("手动触发模式,将创建新的执行计划")
-        need_create_plan = True
-    
-    # 如果不需要创建新的执行计划,直接返回
-    if not need_create_plan:
-        logger.info("无需创建新的执行计划")
-        return 0
-    
-    # 继续处理,创建新的执行计划
-    # 1. 获取启用的表
-    enabled_tables = get_enabled_tables()
-    logger.info(f"从schedule_status表获取到 {len(enabled_tables)} 个启用的表")
-    
-    if not enabled_tables:
-        logger.warning("没有找到启用的表,准备工作结束")
-        return 0
-    
-    # 2. 获取表的详细信息
-    tables_info = []
-    for table_name in enabled_tables:
-        table_info = get_table_info_from_neo4j(table_name)
-        if table_info:
-            tables_info.append(table_info)
-    
-    logger.info(f"成功获取 {len(tables_info)} 个表的详细信息")
-    
-    # 2.1 根据调度频率过滤表(新增的步骤)
-    filtered_tables_info = []
-    for table_info in tables_info:
-        table_name = table_info['target_table']
-        frequency = table_info.get('frequency')
-        
-        if should_execute_today(table_name, frequency, exec_date):
-            filtered_tables_info.append(table_info)
-            logger.info(f"表 {table_name} (频率: {frequency}) 将在今天{exec_date}执行")
-        else:
-            logger.info(f"表 {table_name} (频率: {frequency}) 今天{exec_date}不执行,已过滤")
-    
-    logger.info(f"按调度频率过滤后,今天{exec_date}需要执行的表有 {len(filtered_tables_info)} 个")
-
-
-    # 3. 处理依赖关系,添加被动调度的表
-    enriched_tables = process_dependencies(filtered_tables_info)
-    logger.info(f"处理依赖后,总共有 {len(enriched_tables)} 个表")
-    
-    # 4. 过滤无效表及其依赖
-    valid_tables = filter_invalid_tables(enriched_tables)
-    logger.info(f"过滤无效表后,最终有 {len(valid_tables)} 个有效表")
-    
-    # 构建执行计划并保存到数据库
-    try:
-        # 构建执行计划
-        resource_tasks = []
-        model_tasks = []
-        
-        # 遍历所有有效表,创建任务信息
-        for table in valid_tables:
-            # 确保每个表对象都有source_tables字段且是一个列表
-            if 'source_tables' not in table or not isinstance(table.get('source_tables'), list):
-                logger.warning(f"表 {table['target_table']} 没有source_tables或不是列表,初始化为空列表")
-                table['source_tables'] = []
-            
-            # 处理资源表任务
-            if table.get('target_table_label') == 'DataResource':
-                task_info = {
-                    "source_tables": table.get('source_tables', []),  # 使用数组存储源表
-                    "target_table": table['target_table'],
-                    "target_table_label": "DataResource",
-                    "script_name": table.get('script_name'),
-                    "script_exec_mode": table.get('script_exec_mode', 'append'),
-                    "frequency": table.get('frequency')
-                }
-                # 为structure类型添加特殊属性
-                if table.get('target_type') == "structure":
-                    task_info["target_type"] = "structure"
-                    task_info["storage_location"] = table.get('storage_location')  
-                              
-                resource_tasks.append(task_info)
-            # 处理模型表任务
-            elif table.get('target_table_label') == 'DataModel':
-                # 检查是否有多个脚本信息
-                if 'scripts_info' in table and len(table['scripts_info']) > 1:
-                    # 处理多脚本情况,为每个脚本创建单独的任务
-                    logger.info(f"表 {table['target_table']} 有多个脚本,单独处理每个脚本")
-                    
-                    for script_name, script_info in table['scripts_info'].items():
-                        model_tasks.append({
-                            "source_tables": script_info.get("sources", []),  # 使用数组存储源表
-                            "target_table": table['target_table'],
-                            "target_table_label": "DataModel",
-                            "script_name": script_name,
-                            "script_exec_mode": script_info.get("script_exec_mode", 'append'),
-                            "script_type": script_info.get("script_type", 'python'),
-                            "frequency": table.get('frequency')
-                        })
-                else:
-                    # 处理单脚本情况
-                    model_tasks.append({
-                        "source_tables": table.get('source_tables', []),  # 使用数组存储源表
-                        "target_table": table['target_table'],
-                        "target_table_label": "DataModel",
-                        "script_name": table.get('script_name'),
-                        "script_exec_mode": table.get('script_exec_mode', 'append'),
-                        "frequency": table.get('frequency')
-                    })
-        
-        # 获取和处理依赖关系
-        dependencies = {}
-        model_table_names = [t['target_table'] for t in model_tasks]
-        
-        # 初始化依赖关系字典
-        for table_name in model_table_names:
-            dependencies[table_name] = []
-        
-        # 查询Neo4j获取依赖关系
-        driver = get_neo4j_driver()
-        try:
-            with driver.session() as session:
-                # 为每个模型表查询依赖
-                for table_name in model_table_names:
-                    query = """
-                        MATCH (source:DataModel {en_name: $table_name})-[:DERIVED_FROM]->(target)
-                        RETURN source.en_name AS source, target.en_name AS target, labels(target) AS target_labels
-                    """
-                    try:
-                        # 执行查询
-                        result = session.run(query, table_name=table_name)
-                        
-                        # 尝试获取记录
-                        records = []
-                        try:
-                            if hasattr(result, 'collect'):
-                                records = result.collect()
-                            else:
-                                records = list(result)
-                        except Exception as e:
-                            logger.warning(f"获取表 {table_name} 的依赖关系记录失败: {str(e)}")
-                            records = []
-                        
-                        # 源表列表,用于后续更新model_tasks
-                        source_tables_list = []
-                        
-                        # 处理依赖关系记录
-                        for record in records:
-                            target = record.get("target")
-                            target_labels = record.get("target_labels", [])
-                            
-                            if target:
-                                # 确定依赖表类型
-                                table_type = next((label for label in target_labels 
-                                                 if label in ["DataModel", "DataResource"]), None)
-                                
-                                # 添加依赖关系
-                                dependencies[table_name].append({
-                                    "table_name": target,
-                                    "table_type": table_type
-                                })
-                                
-                                # 记录源表
-                                source_tables_list.append(target)
-                                logger.info(f"添加其他依赖: {table_name} -> {target}")
-                        
-                        # 更新model_tasks中的source_tables
-                        for mt in model_tasks:
-                            if mt['target_table'] == table_name:
-                                # 确保source_tables是数组
-                                if not isinstance(mt.get('source_tables'), list):
-                                    mt['source_tables'] = []
-                                
-                                # 添加依赖的源表
-                                for source_table in source_tables_list:
-                                    if source_table and source_table not in mt['source_tables']:
-                                        mt['source_tables'].append(source_table)
-                                        logger.info(f"从依赖关系中添加源表 {source_table} 到 {table_name}")
-                    
-                    except Exception as e:
-                        logger.error(f"处理表 {table_name} 的依赖关系时出错: {str(e)}")
-                        
-        except Exception as e:
-            logger.error(f"查询Neo4j依赖关系时出错: {str(e)}")
-        finally:
-            driver.close()
-        
-        # 创建最终执行计划
-        execution_plan = {
-            "exec_date": exec_date,
-            "resource_tasks": resource_tasks,
-            "model_tasks": model_tasks,
-            "dependencies": dependencies
-        }
-        
-        # 更新订阅表状态哈希值(计算失败时返回None,此时跳过写入)
-        current_hash = get_subscription_state_hash()
-        if current_hash:
-            hash_file = os.path.join(os.path.dirname(__file__), '.subscription_state')
-            with open(hash_file, 'w') as f:
-                f.write(current_hash)
-            logger.info(f"已更新订阅表状态哈希值: {current_hash}")
-        
-        # 触发数据调度器DAG重新解析
-        touch_data_scheduler_file()
-        
-        # 保存执行计划到数据库表
-        try:
-            # 获取DAG运行信息
-            dag_run = kwargs.get('dag_run')
-            if dag_run:
-                dag_id = dag_run.dag_id
-                run_id = dag_run.run_id
-                logical_date = dag_run.logical_date
-                local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-            else:
-                # 如果无法获取dag_run,使用默认值
-                dag_id = kwargs.get('dag').dag_id if 'dag' in kwargs else "dag_dataops_pipeline_prepare_scheduler"
-                run_id = f"manual_{datetime.now().strftime('%Y%m%d%H%M%S')}"
-                logical_date = datetime.now()
-                # 补充本地时间,避免后续引用local_logical_date时出现NameError
-                local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-            
-            # 保存到数据库
-            save_result = save_execution_plan_to_db(
-                execution_plan=execution_plan,
-                dag_id=dag_id,
-                run_id=run_id,
-                logical_date=local_logical_date,
-                ds=exec_date
-            )
-            
-            if save_result:
-                logger.info("执行计划已成功保存到数据库")
-            else:
-                raise Exception("执行计划保存到数据库失败")
-            
-        except Exception as db_e:
-            # 捕获数据库保存错误
-            error_msg = f"保存执行计划到数据库时出错: {str(db_e)}"
-            logger.error(error_msg)
-            raise Exception(error_msg)
-            
-    except Exception as e:
-        error_msg = f"创建或保存执行计划时出错: {str(e)}"
-        logger.error(error_msg)
-        # 强制抛出异常,确保任务失败,阻止下游DAG执行
-        raise Exception(error_msg)
-    
-    return len(valid_tables)  # 返回有效表数量
-
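To summarize the plan-rebuild conditions checked at the start of prepare_pipeline_dag_schedule, here is a condensed sketch; the function and argument names are illustrative and not part of the DAG:

def need_new_plan(has_plan_in_db, current_hash, last_hash, manual_trigger):
    """Rebuild the execution plan if any trigger condition holds."""
    if manual_trigger:          # explicit MANUAL_TRIGGER=True run
        return True
    if not has_plan_in_db:      # no plan stored for this exec_date yet
        return True
    return current_hash != last_hash   # schedule_status rows changed

print(need_new_plan(True, "abc", "abc", False))   # False -> reuse existing plan
print(need_new_plan(True, "abc", "def", False))   # True  -> subscriptions changed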
-# 创建DAG
-with DAG(
-    "dag_dataops_pipeline_prepare_scheduler",
-    start_date=datetime(2024, 1, 1),
-    # 每小时执行一次
-    schedule_interval="0 * * * *",
-    catchup=False,
-    default_args={
-        'owner': 'airflow',
-        'depends_on_past': False,
-        'email_on_failure': False,
-        'email_on_retry': False,
-        'retries': 1,
-        'retry_delay': timedelta(minutes=5)
-    },
-    params={
-        'MANUAL_TRIGGER': False, 
-    },
-) as dag:
-    
-    # 任务开始标记
-    start_preparation = EmptyOperator(
-        task_id="start_preparation",
-        dag=dag
-    )
-    
-    # 准备调度任务
-    prepare_task = PythonOperator(
-        task_id="prepare_pipeline_dag_schedule",
-        python_callable=prepare_pipeline_dag_schedule,
-        provide_context=True,
-        dag=dag
-    )
-    
-    # 检查执行计划是否存在于数据库中
-    check_plan_in_db = ShortCircuitOperator(
-        task_id="check_execution_plan_in_db",
-        python_callable=check_execution_plan_in_db,
-        provide_context=True,
-        dag=dag
-    )
-    
-    # 准备完成标记
-    preparation_completed = EmptyOperator(
-        task_id="preparation_completed",
-        dag=dag
-    )
-    
-    # 设置任务依赖
-    start_preparation >> prepare_task >> check_plan_in_db >> preparation_completed

+ 0 - 394
dags/dag_dataops_pipeline_summary_scheduler.py

@@ -1,394 +0,0 @@
-# dag_dataops_pipeline_summary_scheduler.py
-"""
-数据管道执行统计汇总 DAG
-
-功能:
-1. 依赖主数据处理 DAG (dag_dataops_pipeline_data_scheduler) 的完成
-2. 收集主 DAG 的执行统计信息
-3. 生成执行报告
-4. 无论主 DAG 执行成功与否都会运行
-"""
-from airflow import DAG
-from airflow.operators.python import PythonOperator
-from airflow.operators.empty import EmptyOperator
-from airflow.sensors.external_task import ExternalTaskSensor
-from datetime import datetime, timedelta
-import logging
-import json
-import pendulum
-import pytz
-from airflow.models import DagRun, TaskInstance
-from airflow.utils.state import State
-from sqlalchemy import desc
-from airflow import settings
-from common import get_today_date
-
-# 创建日志记录器
-logger = logging.getLogger(__name__)
-
-# 开启详细日志记录
-ENABLE_DEBUG_LOGGING = True
-
-def log_debug(message):
-    """记录调试日志,但只在启用调试模式时"""
-    if ENABLE_DEBUG_LOGGING:
-        logger.info(f"[DEBUG] {message}")
-
-def print_target_date(dt):
-    """
-    打印并返回执行日期信息,用于 ExternalTaskSensor
-    """
-    # 转换为中国时区
-    local_dt = pendulum.instance(dt).in_timezone('Asia/Shanghai')
-
-    logger.info(f"===== ExternalTaskSensor等待的目标日期信息 =====")
-    logger.info(f"源DAG: dag_dataops_pipeline_summary_scheduler")
-    logger.info(f"目标DAG: dag_dataops_pipeline_data_scheduler")
-    logger.info(f"目标任务: data_processing_phase.processing_completed")
-    logger.info(f"查找的执行日期(UTC): {dt}")
-    logger.info(f"查找的执行日期(北京时间): {local_dt}")
-    logger.info(f"日期字符串格式(UTC): {dt.strftime('%Y-%m-%dT%H:%M:%S')}")
-    logger.info(f"日期字符串格式(北京时间): {local_dt.strftime('%Y-%m-%dT%H:%M:%S')}")
-    logger.info(f"日期UTC时区: {dt.tzinfo}")
-    logger.info(f"日期类型: {type(dt)}")
-    logger.info(f"=======================================")
-    # 必须返回原始日期,不能修改
-    return dt
-
-
-def collect_pipeline_stats(**kwargs):
-    """
-    从 Airflow 元数据收集主 DAG 的执行统计信息
-    """
-    # 获取当前执行的日期和时间信息
-    dag_run = kwargs.get('dag_run')
-    logical_date = dag_run.logical_date
-    local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-    exec_date = local_logical_date.strftime('%Y-%m-%d')
-    
-    # 记录重要的时间参数
-    logger.info(f"【时间参数】collect_pipeline_stats: exec_date={exec_date}, logical_date={logical_date}, local_logical_date={local_logical_date}")
-    logger.info(f"开始收集执行日期 {exec_date} 的管道执行统计信息")
-    
-    # 主 DAG 的 ID
-    target_dag_id = "dag_dataops_pipeline_data_scheduler"
-    
-    try:
-        # 创建数据库会话
-        session = settings.Session()
-        
-        # 查询与当前 logical_date(北京时间)完全匹配的 DAG 运行记录
-        dag_runs = session.query(DagRun).filter(
-            DagRun.dag_id == target_dag_id,
-            DagRun.execution_date == local_logical_date  # 只匹配当前执行日期的运行
-        ).order_by(desc(DagRun.execution_date)).limit(1).all()
-        
-        if not dag_runs:
-            logger.warning(f"未找到 DAG {target_dag_id} 的运行记录")
-            session.close()
-            return {
-                "exec_date": exec_date,
-                "dag_id": target_dag_id,
-                "status": "NOT_FOUND",
-                "total_tasks": 0,
-                "success_count": 0,
-                "fail_count": 0,
-                "skipped_count": 0,
-                "upstream_failed_count": 0,
-                "duration": None,
-                "start_time": None,
-                "end_time": None
-            }
-        
-        # 获取最近的 DAG 运行
-        dag_run = dag_runs[0]
-        dag_run_id = dag_run.run_id
-        dag_start_time = dag_run.start_date
-        dag_end_time = dag_run.end_date
-        dag_state = dag_run.state
-        dag_execution_date = dag_run.execution_date
-        
-        # 计算 DAG 运行时间
-        dag_duration = None
-        if dag_start_time and dag_end_time:
-            dag_duration = (dag_end_time - dag_start_time).total_seconds()
-
-        
-        # 时区转换
-        if dag_start_time:
-            dag_start_time_local = pendulum.instance(dag_start_time).in_timezone('Asia/Shanghai')
-            dag_start_time_str = dag_start_time_local.strftime('%Y-%m-%d %H:%M:%S')
-        else:
-            dag_start_time_str = 'N/A'
-            
-        if dag_end_time:
-            dag_end_time_local = pendulum.instance(dag_end_time).in_timezone('Asia/Shanghai')
-            dag_end_time_str = dag_end_time_local.strftime('%Y-%m-%d %H:%M:%S')
-        else:
-            dag_end_time_str = 'N/A'
-
-            
-        # 获取所有相关的任务实例
-        task_instances = session.query(TaskInstance).filter(
-            TaskInstance.dag_id == target_dag_id,
-            TaskInstance.run_id == dag_run_id
-        ).all()
-        
-        # 关闭会话
-        session.close()
-        
-        # 统计任务状态信息
-        total_tasks = len(task_instances)
-        success_count = sum(1 for ti in task_instances if ti.state == State.SUCCESS)
-        fail_count = sum(1 for ti in task_instances if ti.state == State.FAILED)
-        skipped_count = sum(1 for ti in task_instances if ti.state == State.SKIPPED)
-        upstream_failed_count = sum(1 for ti in task_instances if ti.state == State.UPSTREAM_FAILED)
-        
-        # 统计各任务类型的数量
-        resource_task_count = sum(1 for ti in task_instances if "resource_" in ti.task_id)
-        model_task_count = sum(1 for ti in task_instances if "model_" in ti.task_id)
-        
-        # 获取执行时间最长的几个任务
-        task_durations = []
-        for ti in task_instances:
-            if ti.start_date and ti.end_date:
-                duration = (ti.end_date - ti.start_date).total_seconds()
-                task_durations.append({
-                    "task_id": ti.task_id,
-                    "duration": duration,
-                    "state": ti.state
-                })
-        
-        # 按持续时间降序排序
-        task_durations.sort(key=lambda x: x["duration"] if x["duration"] is not None else 0, reverse=True)
-        top_tasks_by_duration = task_durations[:5]  # 取前5个
-        
-        # 获取失败的任务
-        failed_tasks = []
-        for ti in task_instances:
-            if ti.state in [State.FAILED, State.UPSTREAM_FAILED]:
-                failed_task = {
-                    "task_id": ti.task_id,
-                    "state": ti.state,
-                    "try_number": ti.try_number,
-                }
-                if ti.start_date and ti.end_date:
-                    failed_task["duration"] = (ti.end_date - ti.start_date).total_seconds()
-                failed_tasks.append(failed_task)
-        
-        # 构建统计结果
-        stats = {
-            "exec_date": exec_date,
-            "dag_id": target_dag_id,
-            "dag_execution_date": dag_execution_date.isoformat() if dag_execution_date else None,
-            "dag_run_id": dag_run_id,
-            "status": dag_state,
-            "total_tasks": total_tasks,
-            "success_count": success_count,
-            "fail_count": fail_count,
-            "skipped_count": skipped_count,
-            "upstream_failed_count": upstream_failed_count,
-            "resource_task_count": resource_task_count,
-            "model_task_count": model_task_count,
-            "duration": dag_duration,
-            "start_time": dag_start_time_str,
-            "end_time": dag_end_time_str,
-            "top_tasks_by_duration": top_tasks_by_duration,
-            "failed_tasks": failed_tasks
-        }
-        
-        # 将统计结果保存到 XCom
-        kwargs['ti'].xcom_push(key='pipeline_stats', value=stats)
-        
-        logger.info(f"成功收集管道执行统计信息: 总任务数={total_tasks}, 成功={success_count}, 失败={fail_count}")
-        return stats
-    except Exception as e:
-        logger.error(f"收集管道执行统计信息时出错: {str(e)}")
-        import traceback
-        logger.error(traceback.format_exc())
-        # 返回一个基本的错误信息
-        error_stats = {
-            "exec_date": exec_date,
-            "dag_id": target_dag_id,
-            "status": "ERROR",
-            "error": str(e),
-            "total_tasks": 0,
-            "success_count": 0,
-            "fail_count": 0
-        }
-        kwargs['ti'].xcom_push(key='pipeline_stats', value=error_stats)
-        return error_stats
-
-def generate_execution_report(**kwargs):
-    """
-    基于收集的统计信息生成执行报告
-    """
-    try:
-        # 从 XCom 获取统计信息
-        ti = kwargs['ti']
-        stats = ti.xcom_pull(task_ids='collect_pipeline_stats')
-        
-        if not stats:
-            logger.warning("未找到管道执行统计信息,无法生成报告")
-            report = "未找到管道执行统计信息,无法生成报告。"
-            ti.xcom_push(key='execution_report', value=report)
-            return report
-        
-        # 构建报告
-        report = []
-        report.append(f"\n========== Data pipeline 执行报告 ==========")
-        report.append(f"执行日期: {stats['exec_date']}")
-        report.append(f"DAG ID: {stats['dag_id']}")
-        report.append(f"Runn ID: {stats.get('dag_run_id', 'N/A')}")
-        report.append(f"状态: {stats['status']}")
-        report.append(f"总任务数: {stats['total_tasks']}")
-        
-        # 任务状态统计
-        report.append("\n--- 任务状态统计 ---")
-        report.append(f"成功任务: {stats['success_count']} 个")
-        report.append(f"失败任务: {stats['fail_count']} 个")
-        report.append(f"跳过任务: {stats.get('skipped_count', 0)} 个")
-        report.append(f"上游失败任务: {stats.get('upstream_failed_count', 0)} 个")
-        
-        # 任务类型统计
-        report.append("\n--- 任务类型统计 ---")
-        report.append(f"资源任务: {stats.get('resource_task_count', 0)} 个")
-        report.append(f"模型任务: {stats.get('model_task_count', 0)} 个")
-        
-        # 执行时间统计
-        report.append("\n--- 执行时间统计 ---")
-        if stats.get('duration') is not None:
-            hours, remainder = divmod(stats['duration'], 3600)
-            minutes, seconds = divmod(remainder, 60)
-            report.append(f"总执行时间: {int(hours)}小时 {int(minutes)}分钟 {int(seconds)}秒")
-        else:
-            report.append("总执行时间: N/A")
-            
-        report.append(f"开始时间(北京时间): {stats.get('start_time', 'N/A')}")
-        report.append(f"结束时间(北京时间): {stats.get('end_time', 'N/A')}")
-        
-        # 执行时间最长的任务
-        top_tasks = stats.get('top_tasks_by_duration', [])
-        if top_tasks:
-            report.append("\n--- 执行时间最长的任务 ---")
-            for i, task in enumerate(top_tasks, 1):
-                duration_secs = task.get('duration', 0)
-                minutes, seconds = divmod(duration_secs, 60)
-                report.append(f"{i}. {task['task_id']}: {int(minutes)}分钟 {int(seconds)}秒 ({task['state']})")
-        
-        # 失败任务详情
-        failed_tasks = stats.get('failed_tasks', [])
-        if failed_tasks:
-            report.append("\n--- 失败任务详情 ---")
-            for i, task in enumerate(failed_tasks, 1):
-                report.append(f"{i}. 任务ID: {task['task_id']}")
-                report.append(f"   状态: {task['state']}")
-                report.append(f"   尝试次数: {task.get('try_number', 'N/A')}")
-                
-                if 'duration' in task:
-                    minutes, seconds = divmod(task['duration'], 60)
-                    report.append(f"   执行时间: {int(minutes)}分钟 {int(seconds)}秒")
-                else:
-                    report.append("   执行时间: N/A")
-        
-        # 总结
-        success_rate = 0
-        if stats['total_tasks'] > 0:
-            success_rate = (stats['success_count'] / stats['total_tasks']) * 100
-            
-        report.append("\n--- 总结 ---")
-        report.append(f"任务成功率: {success_rate:.2f}%")
-        
-        if stats['status'] == 'success':
-            report.append("管道执行成功完成!")
-        elif stats['status'] == 'failed':
-            report.append(f"管道执行失败。有 {stats['fail_count']} 个任务失败。")
-        else:
-            report.append(f"管道当前状态: {stats['status']}")
-        
-        report.append("\n========== 报告结束 ==========")
-        
-        # 将报告转换为字符串
-        report_str = "\n".join(report)
-        
-        # 记录到日志
-        logger.info("\n" + report_str)
-        
-        # 保存到 XCom
-        ti.xcom_push(key='execution_report', value=report_str)
-        
-        return report_str
-    except Exception as e:
-        logger.error(f"生成执行报告时出错: {str(e)}")
-        import traceback
-        logger.error(traceback.format_exc())
-        # 返回一个简单的错误报告
-        error_report = f"生成执行报告时出错: {str(e)}"
-        kwargs['ti'].xcom_push(key='execution_report', value=error_report)
-        return error_report
-
-# 创建 DAG
-with DAG(
-    "dag_dataops_pipeline_summary_scheduler", 
-    start_date=datetime(2024, 1, 1), 
-    schedule_interval="@daily", 
-    catchup=False,
-    default_args={
-        'owner': 'airflow',
-        'depends_on_past': False,
-        'email_on_failure': False,
-        'email_on_retry': False,
-        'retries': 1,
-        'retry_delay': timedelta(minutes=5)
-    }
-) as dag:
-    
-    # 记录 DAG 实例化时的信息
-    now = datetime.now()
-    now_with_tz = now.replace(tzinfo=pytz.timezone('Asia/Shanghai'))
-    default_exec_date = get_today_date()
-    logger.info(f"【DAG初始化】当前时间: {now} / {now_with_tz}, 默认执行日期: {default_exec_date}")
-    
-    #############################################
-    # 等待阶段: 等待主 DAG 完成
-    #############################################
-    wait_for_pipeline_completion = ExternalTaskSensor(
-        task_id="wait_for_pipeline_completion",
-        external_dag_id="dag_dataops_pipeline_data_scheduler",
-        external_task_id="data_processing_phase.processing_completed",
-        mode="reschedule",  # 使用 reschedule 模式,不会占用 worker
-        timeout=7200,  # 等待超时时间为 2 小时
-        poke_interval=30,  # 每30秒检查一次
-        allowed_states=["success", "failed", "skipped"],  # 允许的状态包括成功、失败和跳过
-        failed_states=None,  # 不设置失败状态,确保无论主 DAG 状态如何都会继续执行
-        execution_date_fn=print_target_date,  # 用于调试的日期打印函数
-        dag=dag
-    )
-    
-    #############################################
-    # 统计阶段: 收集和生成统计信息
-    #############################################
-    collect_stats = PythonOperator(
-        task_id="collect_pipeline_stats",
-        python_callable=collect_pipeline_stats,
-        provide_context=True,
-        dag=dag
-    )
-    
-    generate_report = PythonOperator(
-        task_id="generate_execution_report",
-        python_callable=generate_execution_report,
-        provide_context=True,
-        dag=dag
-    )
-    
-    #############################################
-    # 完成阶段: 标记汇总完成
-    #############################################
-    summary_completed = EmptyOperator(
-        task_id="summary_completed",
-        dag=dag
-    )
-    
-    # 设置任务依赖
-    wait_for_pipeline_completion >> collect_stats >> generate_report >> summary_completed
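The removed summary DAG above boils down to an ExternalTaskSensor feeding two PythonOperators that hand results over via XCom. A minimal sketch of that wait → collect → report pattern in Airflow 2.x style (DAG id, task ids, upstream ids and the payload below are placeholders, not taken from this repository):

```python
# Hypothetical ids and payload; Airflow 2.x style, no provide_context needed.
from datetime import datetime
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.empty import EmptyOperator
from airflow.sensors.external_task import ExternalTaskSensor

def collect_stats(**kwargs):
    stats = {"total_tasks": 0, "success_count": 0}   # placeholder payload
    kwargs["ti"].xcom_push(key="pipeline_stats", value=stats)

def generate_report(**kwargs):
    stats = kwargs["ti"].xcom_pull(task_ids="collect_stats", key="pipeline_stats")
    print(f"report: {stats}")

with DAG("summary_sketch", start_date=datetime(2024, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:
    wait = ExternalTaskSensor(
        task_id="wait_for_pipeline",
        external_dag_id="main_pipeline",            # placeholder upstream DAG id
        external_task_id="processing_completed",    # placeholder upstream task id
        mode="reschedule", poke_interval=30, timeout=7200,
        allowed_states=["success", "failed", "skipped"],
    )
    collect = PythonOperator(task_id="collect_stats", python_callable=collect_stats)
    report = PythonOperator(task_id="generate_report", python_callable=generate_report)
    done = EmptyOperator(task_id="summary_completed")
    wait >> collect >> report >> done
```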

+ 0 - 667
dags/dag_manual_dependency_trigger.py

@@ -1,667 +0,0 @@
-# dag_manual_dependency_trigger.py
-"""
-手动触发数据表依赖链执行DAG
-
-功能:
-- 根据指定的表名,构建并执行其上游依赖链
-- 支持三种依赖级别:
-  - 'self':只执行当前表,不处理上游依赖
-  - 'resource':查找依赖到Resource层,但只执行DataModel层
-  - 'source':查找并执行完整依赖链到Source层
-
-参数:
-- table_name:目标表名
-- dependency_level:依赖级别
-
-使用示例:
-```
-{
-  "conf": {
-    "table_name": "book_sale_amt_2yearly",
-    "dependency_level": "resource"
-  }
-}
-```
-"""
-from airflow import DAG
-from airflow.operators.python import PythonOperator
-from datetime import datetime, timedelta
-import logging
-import importlib.util
-import os
-from pathlib import Path
-from neo4j import GraphDatabase
-import psycopg2
-import networkx as nx
-from config import NEO4J_CONFIG, SCRIPTS_BASE_PATH, PG_CONFIG
-
-# 设置logger
-logger = logging.getLogger(__name__)
-
-# DAG参数
-default_args = {
-    'owner': 'airflow',
-    'depends_on_past': False,
-    'start_date': datetime(2024, 1, 1),
-    'email_on_failure': False,
-    'email_on_retry': False,
-    'retries': 1,
-    'retry_delay': timedelta(minutes=5),
-}
-
-def get_pg_conn():
-    """获取PostgreSQL连接"""
-    return psycopg2.connect(**PG_CONFIG)
-
-def get_execution_mode(table_name):
-    """
-    从PostgreSQL获取表的执行模式    
-    参数:
-        table_name (str): 表名
-    注意:
-        "AND is_enabled = TRUE" 这个条件在这里不适用,因为这是强制执行的。
-        即使订阅表中没有这个表名,也会强制执行。
-    返回:
-        str: 执行模式,如果未找到则返回"append"作为默认值
-    """
-    try:
-        conn = get_pg_conn()
-        cursor = conn.cursor()
-        cursor.execute("""
-            SELECT execution_mode 
-            FROM table_schedule 
-            WHERE table_name = %s
-        """, (table_name,))
-        result = cursor.fetchone()
-        cursor.close()
-        conn.close()
-        
-        if result:
-            return result[0]
-        else:
-            logger.warning(f"未找到表 {table_name} 的执行模式,使用默认值 'append'")
-            return "append"
-    except Exception as e:
-        logger.error(f"获取表 {table_name} 的执行模式时出错: {str(e)}")
-        return "append"
-
-def get_dag_params(**context):
-    """获取DAG运行参数"""
-    params = context.get('params', {})
-    table_name = params.get('table_name')
-    
-    # 记录原始参数信息
-    logger.info(f"接收到的原始参数: {params}")
-    
-    # 获取依赖级别参数
-    dependency_level = params.get('dependency_level')
-    logger.info(f"获取的依赖级别值: {dependency_level}")
-
-    if not table_name:
-        raise ValueError("必须提供TABLE_NAME参数")
-    
-    # 验证dependency_level参数
-    if dependency_level not in ['self', 'resource', 'source']:
-        logger.warning(f"无效的依赖级别参数: {dependency_level},使用默认值'resource'")
-        dependency_level = 'resource'
-    
-    logger.info(f"最终使用的参数 - 表名: {table_name}, 依赖级别: {dependency_level}")
-    return table_name, dependency_level
-
-def is_data_model_table(table_name):
-    """判断表是否为DataModel类型"""
-    driver = GraphDatabase.driver(
-        NEO4J_CONFIG['uri'], 
-        auth=(NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
-    )
-    query = """
-        MATCH (n:DataModel {en_name: $table_name}) RETURN count(n) > 0 AS exists
-    """
-    try:
-        with driver.session() as session:
-            result = session.run(query, table_name=table_name)
-            record = result.single()
-            return record and record["exists"]
-    finally:
-        driver.close()
-
-def is_data_resource_table(table_name):
-    """判断表是否为DataResource类型"""
-    driver = GraphDatabase.driver(
-        NEO4J_CONFIG['uri'], 
-        auth=(NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
-    )
-    query = """
-        MATCH (n:DataResource {en_name: $table_name}) RETURN count(n) > 0 AS exists
-    """
-    try:
-        with driver.session() as session:
-            result = session.run(query, table_name=table_name)
-            record = result.single()
-            return record and record["exists"]
-    finally:
-        driver.close()
-
-def get_upstream_models(table_name):
-    """获取表的上游DataModel依赖"""
-    driver = GraphDatabase.driver(
-        NEO4J_CONFIG['uri'], 
-        auth=(NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
-    )
-    query = """
-        MATCH (target:DataModel {en_name: $table_name})-[:DERIVED_FROM]->(up:DataModel)
-        RETURN up.en_name AS upstream
-    """
-    try:
-        with driver.session() as session:
-            result = session.run(query, table_name=table_name)
-            upstream_list = [record["upstream"] for record in result]
-            logger.info(f"表 {table_name} 的上游DataModel依赖: {upstream_list}")
-            return upstream_list
-    finally:
-        driver.close()
-
-def get_upstream_resources(table_name):
-    """获取表的上游DataResource依赖"""
-    driver = GraphDatabase.driver(
-        NEO4J_CONFIG['uri'], 
-        auth=(NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
-    )
-    query = """
-        MATCH (target:DataModel {en_name: $table_name})-[:DERIVED_FROM]->(up:DataResource)
-        RETURN up.en_name AS upstream
-    """
-    try:
-        with driver.session() as session:
-            result = session.run(query, table_name=table_name)
-            upstream_list = [record["upstream"] for record in result]
-            logger.info(f"表 {table_name} 的上游DataResource依赖: {upstream_list}")
-            return upstream_list
-    finally:
-        driver.close()
-
-def get_data_sources(resource_table_name):
-    """获取DataResource表的上游DataSource"""
-    driver = GraphDatabase.driver(
-        NEO4J_CONFIG['uri'], 
-        auth=(NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
-    )
-    query = """
-        MATCH (dr:DataResource {en_name: $table_name})-[:ORIGINATES_FROM]->(ds:DataSource)
-        RETURN ds.en_name AS source_name
-    """
-    try:
-        with driver.session() as session:
-            result = session.run(query, table_name=resource_table_name)
-            return [record["source_name"] for record in result]
-    finally:
-        driver.close()
-
-def get_script_name_for_model(table_name):
-    """获取DataModel表对应的脚本名称"""
-    driver = GraphDatabase.driver(
-        NEO4J_CONFIG['uri'], 
-        auth=(NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
-    )
-    query = """
-        MATCH (target:DataModel {en_name: $table_name})-[r:DERIVED_FROM]->(n)
-        WHERE n:DataModel OR n:DataResource
-        RETURN r.script_name AS script_name
-    """
-    try:
-        with driver.session() as session:
-            result = session.run(query, table_name=table_name)
-            record = result.single()
-            if record:
-                return record["script_name"]
-            else:
-                logger.warning(f"未找到DataModel表 {table_name} 的脚本名称")
-                return None
-    except Exception as e:
-        logger.error(f"查询表 {table_name} 的脚本名称时出错: {str(e)}")
-        return None
-    finally:
-        driver.close()
-
-def get_script_name_for_resource(table_name):
-    """获取DataResource表对应的脚本名称"""
-    driver = GraphDatabase.driver(
-        NEO4J_CONFIG['uri'], 
-        auth=(NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
-    )
-    query = """
-        MATCH (dr:DataResource {en_name: $table_name})-[rel:ORIGINATES_FROM]->(ds:DataSource)
-        RETURN rel.script_name AS script_name
-    """
-    try:
-        with driver.session() as session:
-            result = session.run(query, table_name=table_name)
-            record = result.single()
-            if record:
-                return record["script_name"]
-            else:
-                logger.warning(f"未找到DataResource表 {table_name} 的脚本名称")
-                return None
-    except Exception as e:
-        logger.error(f"查询表 {table_name} 的脚本名称时出错: {str(e)}")
-        return None
-    finally:
-        driver.close()
-
-def build_dependency_chain_nx(start_table, dependency_level='resource'):
-    """
-    使用networkx构建依赖链
-    
-    参数:
-        start_table (str): 起始表名
-        dependency_level (str): 依赖级别
-            - 'self': 只执行自己
-            - 'resource': 到Resource层 (默认)
-            - 'source': 到Source层
-        
-    返回:
-        list: 依赖链列表,按执行顺序排序(从上游到下游)
-    """
-    # 记录依赖级别
-    logger.info(f"构建依赖链 - 起始表: {start_table}, 依赖级别: {dependency_level}")
-    
-    # 创建有向图
-    G = nx.DiGraph()
-    
-    # 设置起始节点属性
-    if is_data_model_table(start_table):
-        G.add_node(start_table, type='DataModel')
-        table_type = 'DataModel'
-    elif is_data_resource_table(start_table):
-        G.add_node(start_table, type='DataResource')
-        table_type = 'DataResource'
-    else:
-        logger.warning(f"表 {start_table} 不是DataModel或DataResource类型")
-        return []
-    
-    # 如果只执行自己,直接返回
-    if dependency_level == 'self':
-        logger.info(f"依赖级别为'self',只包含起始表: {start_table}")
-        script_name = get_script_name_for_model(start_table) if table_type == 'DataModel' else get_script_name_for_resource(start_table)
-        execution_mode = get_execution_mode(start_table)
-        return [{
-            'table_name': start_table,
-            'script_name': script_name,
-            'table_type': table_type,
-            'execution_mode': execution_mode
-        }]
-    
-    # 判断resource级别还是source级别
-    need_source = (dependency_level == 'source')
-    logger.info(f"是否需要查找到Source层: {need_source}")
-    
-    # BFS构建依赖图
-    visited = set([start_table])
-    queue = [start_table]
-    
-    while queue:
-        current = queue.pop(0)
-        current_type = G.nodes[current].get('type')
-        logger.info(f"处理节点: {current}, 类型: {current_type}")
-        
-        # 处理当前节点的上游依赖
-        if current_type == 'DataModel':
-            # 获取DataModel的上游依赖
-            upstream_models = get_upstream_models(current)
-            for upstream in upstream_models:
-                if upstream not in visited:
-                    G.add_node(upstream, type='DataModel')
-                    visited.add(upstream)
-                    queue.append(upstream)
-                G.add_edge(current, upstream, type='model_to_model')
-            
-            # 获取上游DataResource - 对于resource和source级别都需要查找DataResource
-            upstream_resources = get_upstream_resources(current)
-            for upstream in upstream_resources:
-                if upstream not in visited:
-                    G.add_node(upstream, type='DataResource')
-                    visited.add(upstream)
-                    # 只有在source级别时才继续向上查找DataSource
-                    if need_source:
-                        queue.append(upstream)
-                G.add_edge(current, upstream, type='model_to_resource')
-        
-        # 如果当前节点是DataResource,只有在source级别才查找上游DataSource
-        elif current_type == 'DataResource' and need_source:
-            data_sources = get_data_sources(current)
-            for source in data_sources:
-                if source not in visited:
-                    G.add_node(source, type='DataSource')
-                    visited.add(source)
-                G.add_edge(current, source, type='resource_to_source')
-    
-    # 记录依赖图节点和边信息
-    logger.info(f"依赖图节点数: {len(G.nodes)}, 边数: {len(G.edges)}")
-    
-    # 在resource级别,确保不处理DataSource节点的脚本
-    if dependency_level == 'resource':
-        # 查找所有DataSource节点
-        source_nodes = [node for node, attrs in G.nodes(data=True) if attrs.get('type') == 'DataSource']
-        logger.info(f"依赖级别为'resource',将移除 {len(source_nodes)} 个DataSource节点")
-        
-        # 移除所有DataSource节点
-        for node in source_nodes:
-            G.remove_node(node)
-        
-        # 重新记录依赖图信息
-        logger.info(f"清理后依赖图节点数: {len(G.nodes)}, 边数: {len(G.edges)}")
-    
-    logger.info(f"依赖图节点: {list(G.nodes)}")
-    
-    # 检测循环依赖
-    cycles = list(nx.simple_cycles(G))
-    if cycles:
-        logger.warning(f"检测到循环依赖,将尝试打破循环: {cycles}")
-        # 打破循环依赖(简单策略:移除每个循环中的一条边)
-        for cycle in cycles:
-            G.remove_edge(cycle[-1], cycle[0])
-            logger.info(f"打破循环依赖: 移除 {cycle[-1]} -> {cycle[0]} 的依赖")
-    
-    # 生成拓扑排序(从上游到下游的顺序)
-    try:
-        # 注意:拓扑排序给出的是从上游到下游的顺序
-        # 我们需要的是执行顺序,所以要反转图然后进行拓扑排序
-        reverse_G = G.reverse()
-        execution_order = list(nx.topological_sort(reverse_G))
-        logger.info(f"计算出的执行顺序: {execution_order}")
-        
-        # 构建最终依赖链
-        dependency_chain = []
-        for table_name in execution_order:
-            node_type = G.nodes[table_name].get('type')
-            
-            # 跳过DataSource节点,它们没有脚本需要执行
-            if node_type == 'DataSource':
-                logger.info(f"跳过DataSource节点: {table_name}")
-                continue
-            
-            # 获取脚本和执行模式
-            if node_type == 'DataModel':
-                script_name = get_script_name_for_model(table_name)
-            else:  # DataResource
-                script_name = get_script_name_for_resource(table_name)
-            
-            execution_mode = get_execution_mode(table_name)
-            
-            dependency_chain.append({
-                'table_name': table_name,
-                'script_name': script_name,
-                'table_type': node_type,
-                'execution_mode': execution_mode
-            })
-            logger.info(f"添加到依赖链: {table_name}, 类型: {node_type}")
-        
-        logger.info(f"最终依赖链长度: {len(dependency_chain)}")
-        return dependency_chain
-    
-    except Exception as e:
-        logger.error(f"生成拓扑排序时出错: {str(e)}")
-        return []
-
-def execute_scripts(scripts_list):
-    """
-    执行指定的脚本列表    
-    参数:
-        scripts_list (list): 要执行的脚本信息列表,每项包含table_name, script_name, execution_mode        
-    返回:
-        bool: 全部执行成功返回True,任一失败返回False
-    """
-    if not scripts_list:
-        logger.info("没有脚本需要执行")
-        return True
-    
-    success = True
-    for item in scripts_list:
-        script_name = item['script_name']
-        table_name = item['table_name']
-        execution_mode = item['execution_mode']
-        
-        if not script_name:
-            logger.warning(f"表 {table_name} 没有对应的脚本,跳过执行")
-            continue
-        
-        logger.info(f"执行脚本: {script_name}, 表: {table_name}, 模式: {execution_mode}")
-        
-        try:
-            script_path = Path(SCRIPTS_BASE_PATH) / script_name
-            
-            if not os.path.exists(script_path):
-                logger.error(f"脚本文件不存在: {script_path}")
-                success = False
-                break
-            
-            # 动态导入模块
-            spec = importlib.util.spec_from_file_location("dynamic_module", script_path)
-            module = importlib.util.module_from_spec(spec)
-            spec.loader.exec_module(module)
-            
-            # 使用标准入口函数run
-            if hasattr(module, "run"):
-                logger.info(f"执行脚本 {script_name} 的标准入口函数 run()")
-                result = module.run(table_name=table_name, execution_mode=execution_mode)
-                if result:
-                    logger.info(f"脚本 {script_name} 执行成功")
-                else:
-                    logger.error(f"脚本 {script_name} 执行失败")
-                    success = False
-                    break
-            else:
-                logger.warning(f"脚本 {script_name} 未定义标准入口函数 run(),无法执行")
-                success = False
-                break
-        except Exception as e:
-            logger.error(f"执行脚本 {script_name} 时出错: {str(e)}")
-            success = False
-            break
-    
-    return success
-
-def prepare_dependency_chain(**context):
-    """
-    准备依赖链并保存到XCom
-    
-    不同依赖级别的行为:
-    - self: 只执行当前表,不查找上游依赖
-    - resource: 仅查找数据模型依赖到Resource层,但不执行Resource层的脚本
-    - source: 完整查找所有依赖到Source层,并执行所有相关脚本
-    """
-    # 获取参数
-    table_name, dependency_level = get_dag_params(**context)
-    
-    # 记录依赖级别信息
-    logger.info(f"依赖级别说明:")
-    logger.info(f"- self: 只执行当前表,不查找上游依赖")
-    logger.info(f"- resource: 仅查找数据模型依赖到Resource层,但不执行Resource层的脚本")
-    logger.info(f"- source: 完整查找所有依赖到Source层,并执行所有相关脚本")
-    logger.info(f"当前依赖级别: {dependency_level}")
-    
-    # 获取依赖链
-    dependency_chain = build_dependency_chain_nx(table_name, dependency_level)
-    
-    if not dependency_chain:
-        logger.warning(f"没有找到表 {table_name} 的依赖链")
-        return False
-    
-    # 记录完整依赖链
-    logger.info(f"依赖链完整列表: {[item['table_name'] for item in dependency_chain]}")
-    
-    # 保存依赖链到XCom以便后续任务使用
-    ti = context['ti']
-    ti.xcom_push(key='dependency_chain', value=dependency_chain)
-    
-    # 保存依赖级别,便于后续任务使用
-    ti.xcom_push(key='dependency_level', value=dependency_level)
-    
-    # 检查是否有各类型的脚本需要执行
-    resource_tables = [item for item in dependency_chain if item['table_type'] == 'DataResource']
-    model_tables = [item for item in dependency_chain if item['table_type'] == 'DataModel']
-    
-    has_resource = len(resource_tables) > 0
-    has_model = len(model_tables) > 0
-    
-    # 处理特殊情况:如果是self级别,且起始表是DataResource
-    if dependency_level == 'self' and not has_model and has_resource:
-        # 确保只有一个DataResource表,而且是起始表
-        is_start_resource = any(item['table_name'] == table_name for item in resource_tables)
-        logger.info(f"依赖级别为'self',起始表是DataResource: {is_start_resource}")
-        
-        # 额外保存标志,标记这是特殊情况
-        ti.xcom_push(key='is_start_resource_only', value=is_start_resource)
-    
-    logger.info(f"是否有DataResource脚本: {has_resource}({len(resource_tables)}个), 是否有DataModel脚本: {has_model}({len(model_tables)}个)")
-    
-    return True
-
-def process_resources(**context):
-    """
-    处理所有DataResource层的脚本
-    
-    依赖级别处理策略:
-    - self: 只有当起始表是DataResource类型时才执行
-    - resource: 不执行任何DataResource脚本
-    - source: 执行所有依赖链中的DataResource脚本
-    """
-    # 获取任务间共享变量
-    ti = context['ti']
-    dependency_chain = ti.xcom_pull(task_ids='prepare_dependency_chain', key='dependency_chain')
-    
-    # 直接从XCom获取依赖级别,避免重复解析
-    dependency_level = ti.xcom_pull(task_ids='prepare_dependency_chain', key='dependency_level')
-    
-    # 记录当前任务的依赖级别
-    logger.info(f"process_resources任务 - 当前依赖级别: {dependency_level}")
-    
-    # 检查特殊标志
-    is_start_resource_only = ti.xcom_pull(task_ids='prepare_dependency_chain', key='is_start_resource_only', default=False)
-    
-    # 依赖级别处理逻辑
-    if dependency_level == 'self' and not is_start_resource_only:
-        logger.info("依赖级别为'self'且起始表不是DataResource,跳过process_resources任务")
-        return True
-    elif dependency_level == 'resource':
-        logger.info("依赖级别为'resource',根据设计不执行DataResource表脚本")
-        return True
-    
-    # 获取表名(仅在self级别需要)
-    table_name = None
-    if dependency_level == 'self':
-        params = context.get('params', {})
-        table_name = params.get('table_name')
-        logger.info(f"依赖级别为'self',目标表: {table_name}")
-    
-    # 根据依赖级别过滤要执行的脚本
-    if dependency_level == 'self' and is_start_resource_only:
-        # 特殊情况:只处理与起始表名匹配的Resource表
-        resource_scripts = [item for item in dependency_chain if item['table_type'] == 'DataResource' and item['table_name'] == table_name]
-        logger.info(f"依赖级别为'self'且起始表是DataResource,只处理表: {table_name}")
-    elif dependency_level == 'source':
-        # source级别:处理所有Resource表
-        resource_scripts = [item for item in dependency_chain if item['table_type'] == 'DataResource']
-        logger.info(f"依赖级别为'source',处理所有DataResource表")
-    else:
-        # 其他情况,返回空列表
-        resource_scripts = []
-    
-    if not resource_scripts:
-        logger.info("没有找到DataResource类型的表需要处理")
-        return True
-    
-    # 详细记录要执行的脚本信息
-    logger.info(f"要执行的DataResource脚本数量: {len(resource_scripts)}")
-    for idx, item in enumerate(resource_scripts, 1):
-        logger.info(f"Resource脚本[{idx}]: 表={item['table_name']}, 脚本={item['script_name']}, 模式={item['execution_mode']}")
-    
-    # 执行所有DataResource脚本
-    return execute_scripts(resource_scripts)
-
-def process_models(**context):
-    """
-    处理所有DataModel层的脚本
-    
-    依赖级别处理策略:
-    - self: 只执行起始表(如果是DataModel类型)
-    - resource/source: 执行所有依赖链中的DataModel脚本
-    """
-    # 获取任务间共享变量
-    ti = context['ti']
-    dependency_chain = ti.xcom_pull(task_ids='prepare_dependency_chain', key='dependency_chain')
-    
-    # 直接从XCom获取依赖级别,避免重复解析
-    dependency_level = ti.xcom_pull(task_ids='prepare_dependency_chain', key='dependency_level')
-    
-    # 记录当前任务的依赖级别
-    logger.info(f"process_models任务 - 当前依赖级别: {dependency_level}")
-    
-    # 获取表名(在所有级别都需要)
-    params = context.get('params', {})
-    table_name = params.get('table_name')
-    logger.info(f"目标表: {table_name}")
-    
-    # 如果依赖级别是'self',只处理起始表
-    if dependency_level == 'self':
-        logger.info(f"依赖级别为'self',只处理起始表: {table_name}")
-        model_scripts = [item for item in dependency_chain if item['table_name'] == table_name and item['table_type'] == 'DataModel']
-    else:
-        # 否则处理所有DataModel表
-        logger.info(f"依赖级别为'{dependency_level}',处理所有DataModel表")
-        model_scripts = [item for item in dependency_chain if item['table_type'] == 'DataModel']
-    
-    if not model_scripts:
-        logger.info("没有找到DataModel类型的表需要处理")
-        return True
-    
-    # 详细记录要执行的脚本信息
-    logger.info(f"要执行的DataModel脚本数量: {len(model_scripts)}")
-    for idx, item in enumerate(model_scripts, 1):
-        logger.info(f"Model脚本[{idx}]: 表={item['table_name']}, 脚本={item['script_name']}, 模式={item['execution_mode']}")
-    
-    # 执行所有DataModel脚本
-    return execute_scripts(model_scripts)
-
-# 创建DAG
-with DAG(
-    'dag_manual_dependency_trigger',
-    default_args=default_args,
-    description='手动触发指定表的依赖链执行,支持三种依赖级别:self(仅本表)、resource(到Resource层但不执行Resource)、source(完整依赖到Source层)',
-    schedule_interval=None,  # 设置为None表示只能手动触发
-    catchup=False,
-    is_paused_upon_creation=True,  # 添加这一行,使DAG创建时处于暂停状态
-    params={
-        'table_name': '',
-        'dependency_level': {
-            'type': 'string',
-            'enum': ['self', 'resource', 'source'],
-            'default': 'resource',
-            'description': '依赖级别: self-仅本表, resource-到Resource层(不执行Resource脚本), source-到Source层'
-        }
-    },
-) as dag:
-    
-    # 第一个任务:准备依赖链
-    prepare_task = PythonOperator(
-        task_id='prepare_dependency_chain',
-        python_callable=prepare_dependency_chain,
-        provide_context=True,
-    )
-    
-    # 第二个任务:执行DataResource脚本
-    resource_task = PythonOperator(
-        task_id='process_resources',
-        python_callable=process_resources,
-        provide_context=True,
-    )
-    
-    # 第三个任务:执行DataModel脚本
-    model_task = PythonOperator(
-        task_id='process_models',
-        python_callable=process_models,
-        provide_context=True,
-    )
-    
-    # 设置任务依赖关系
-    prepare_task >> resource_task >> model_task 
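The ordering logic in build_dependency_chain_nx above is easy to miss: edges are added from a table to its upstream dependency, so the graph must be reversed before the topological sort to obtain an upstream-first execution order. A minimal sketch of just that trick (table names are placeholders):

```python
import networkx as nx

G = nx.DiGraph()
G.add_edge("model_b", "model_a")       # model_b is DERIVED_FROM model_a
G.add_edge("model_a", "resource_x")    # model_a is DERIVED_FROM resource_x

# Reverse so edges point upstream -> downstream, then topologically sort.
execution_order = list(nx.topological_sort(G.reverse()))
print(execution_order)                 # ['resource_x', 'model_a', 'model_b']
```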

+ 3 - 19
dags/dataops_productline_execute_dag.py

@@ -18,10 +18,11 @@ import json
 import os
 import pendulum
 from decimal import Decimal
-from common import (
+from utils import (
     get_pg_conn, 
     get_neo4j_driver,
-    get_today_date
+    get_today_date,
+    get_cn_exec_date
 )
 from config import TASK_RETRY_CONFIG, SCRIPTS_BASE_PATH, PG_CONFIG, NEO4J_CONFIG
 import pytz
@@ -67,23 +68,6 @@ class DecimalEncoder(json.JSONEncoder):
         return super(DecimalEncoder, self).default(obj)
     
 
-def get_cn_exec_date(logical_date):
-    """
-    获取逻辑执行日期
-    
-    参数:
-        logical_date: 逻辑执行日期,UTC时间
-
-    返回:
-        logical_exec_date: 逻辑执行日期,北京时间
-        local_logical_date: 北京时区的logical_date
-    """
-    # 获取逻辑执行日期
-    local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-    exec_date = local_logical_date.strftime('%Y-%m-%d')
-    return exec_date, local_logical_date
-
-
 #############################################
 # 脚本执行函数
 #############################################

+ 1 - 1
dags/dataops_productline_finalize_dag.py

@@ -19,7 +19,7 @@ from airflow.models import DagRun, TaskInstance
 from airflow.utils.state import State
 from sqlalchemy import desc
 from airflow import settings
-from common import get_today_date
+from utils import get_today_date
 from decimal import Decimal
 
 # 创建日志记录器

+ 1 - 30
dags/dataops_productline_manual_trigger_dag.py

@@ -66,6 +66,7 @@ from config import NEO4J_CONFIG, SCRIPTS_BASE_PATH, PG_CONFIG
 import traceback
 import pendulum
 import pytz
+from utils import get_pg_conn, get_cn_exec_date, check_script_exists
 
 # 设置logger
 logger = logging.getLogger(__name__)
@@ -81,26 +82,6 @@ default_args = {
     'retry_delay': timedelta(minutes=1),
 }
 
-def get_cn_exec_date(logical_date):
-    """
-    获取逻辑执行日期
-    
-    参数:
-        logical_date: 逻辑执行日期,UTC时间
-
-    返回:
-        logical_exec_date: 逻辑执行日期,北京时间
-        local_logical_date: 北京时区的logical_date
-    """
-    # 获取逻辑执行日期
-    local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
-    exec_date = local_logical_date.strftime('%Y-%m-%d')
-    return exec_date, local_logical_date
-
-def get_pg_conn():
-    """获取PostgreSQL连接"""
-    return psycopg2.connect(**PG_CONFIG)
-
 def get_execution_mode(table_name):
     """
     从Neo4j获取表的执行模式
@@ -722,16 +703,6 @@ def get_upstream_script_dependencies(script_info, dependency_level='resource'):
         # 出错时,至少返回当前脚本
         return [script_info]
 
-def check_script_exists(script_name):
-    """检查脚本文件是否存在"""
-    script_path = os.path.join(SCRIPTS_BASE_PATH, script_name)
-    if os.path.exists(script_path):
-        logger.info(f"脚本文件存在: {script_path}")
-        return True, script_path
-    else:
-        logger.error(f"脚本文件不存在: {script_path}")
-        return False, script_path
-
 def execute_python_script(script_info):
     """
     执行Python脚本文件
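After this change the manual-trigger DAG relies on the shared helpers in utils.py. A hedged usage sketch, assuming the utils version of check_script_exists keeps the (exists, script_path) return shape of the local helper removed above (the script name is a placeholder):

```python
from utils import check_script_exists

exists, script_path = check_script_exists("load_file.py")   # placeholder script name
if exists:
    print(f"ready to execute {script_path}")
else:
    print(f"missing script: {script_path}")
```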

+ 1 - 1
dags/dataops_productline_prepare_dag.py

@@ -12,7 +12,7 @@ import glob
 from pathlib import Path
 import hashlib
 import pendulum
-from common import (
+from utils import (
     get_pg_conn, 
     get_neo4j_driver,
     get_today_date

+ 385 - 239
dags/utils.py

@@ -8,6 +8,10 @@ from pathlib import Path
 import networkx as nx
 import os
 from airflow.exceptions import AirflowFailException
+from datetime import datetime, timedelta, date
+import functools
+import time
+import pendulum
 
 # 创建统一的日志记录器
 logger = logging.getLogger("airflow.task")
@@ -15,138 +19,109 @@ logger = logging.getLogger("airflow.task")
 def get_pg_conn():
     return psycopg2.connect(**PG_CONFIG)
 
-def get_subscribed_tables(freq: str) -> list[dict]:
-    """
-    根据调度频率获取启用的订阅表列表,附带 execution_mode 参数
-    返回结果示例:
-    [
-        {'table_name': 'region_sales', 'execution_mode': 'append'},
-        {'table_name': 'catalog_sales', 'execution_mode': 'full_refresh'}
-    ]
-    """
-    conn = get_pg_conn()
-    cursor = conn.cursor()
-    cursor.execute("""
-        SELECT table_name, execution_mode 
-        FROM table_schedule 
-        WHERE is_enabled = TRUE AND schedule_frequency = %s
-    """, (freq,))
-    result = cursor.fetchall()
-    cursor.close()
-    conn.close()
-    return [{"table_name": r[0], "execution_mode": r[1]} for r in result]
 
 
-def get_neo4j_dependencies(table_name: str) -> list:
+def execute_script(script_name=None, table_name=None, execution_mode=None, script_path=None, script_exec_mode=None, args=None):
     """
-    查询 Neo4j 中某个模型的 DERIVED_FROM 依赖(上游表名)
-    """
-    uri = NEO4J_CONFIG['uri']
-    auth = (NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
-    driver = GraphDatabase.driver(uri, auth=auth)
-    query = """
-        MATCH (a:Table {name: $name})<-[:DERIVED_FROM]-(b:Table)
-        RETURN b.name
-    """
-    with driver.session() as session:
-        records = session.run(query, name=table_name)
-        return [record["b.name"] for record in records]
-
-# def get_script_name_from_neo4j(table_name: str) -> str:
-#     """
-#     从Neo4j数据库中查询表对应的脚本名称
-#     查询的是 DataResource 和 DataSource 之间的 ORIGINATES_FROM 关系中的 script_name 属性
-    
-#     参数:
-#         table_name (str): 数据资源表名
+    根据脚本名称动态导入并执行对应的脚本
+    支持两种调用方式:
+    1. execute_script(script_name, table_name, execution_mode) - 原始实现
+    2. execute_script(script_path, script_name, script_exec_mode, args={}) - 来自common.py的实现
         
-#     返回:
-#         str: 脚本名称,如果未找到则返回None
-#     """
-#     logger = logging.getLogger("airflow.task")
-    
-#     driver = GraphDatabase.driver(**NEO4J_CONFIG)
-#     query = """
-#         MATCH (dr:DataResource {en_name: $table_name})-[rel:ORIGINATES_FROM]->(ds:DataSource)
-#         RETURN rel.script_name AS script_name
-#     """
-#     try:
-#         with driver.session() as session:
-#             result = session.run(query, table_name=table_name)
-#             record = result.single()
-#             if record and 'script_name' in record:
-#                 return record['script_name']
-#             else:
-#                 logger.warning(f"没有找到表 {table_name} 对应的脚本名称")
-#                 return None
-#     except Exception as e:
-#         logger.error(f"从Neo4j查询脚本名称时出错: {str(e)}")
-#         return None
-#     finally:
-#         driver.close()
-
-def execute_script(script_name: str, table_name: str, execution_mode: str) -> bool:
-    """
-    根据脚本名称动态导入并执行对应的脚本        
     返回:
         bool: 执行成功返回True,否则返回False
     """
-    if not script_name:
-        logger.error("未提供脚本名称,无法执行")
-        return False
-    
-    try:
-        # 直接使用配置的部署路径,不考虑本地开发路径
-        script_path = Path(SCRIPTS_BASE_PATH) / script_name
-        logger.info(f"使用配置的Airflow部署路径: {script_path}")
-        
-        # 动态导入模块
-        spec = importlib.util.spec_from_file_location("dynamic_module", script_path)
-        module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(module)
+    # 第一种调用方式 - 原始函数实现
+    if script_name and table_name and execution_mode is not None and script_path is None and script_exec_mode is None:
+        if not script_name:
+            logger.error("未提供脚本名称,无法执行")
+            return False
         
-        # 使用标准入口函数run
-        if hasattr(module, "run"):
-            logger.info(f"执行脚本 {script_name} 的标准入口函数 run()")
-            module.run(table_name=table_name, execution_mode=execution_mode)
-            return True
+        try:
+            # 直接使用配置的部署路径,不考虑本地开发路径
+            script_path = Path(SCRIPTS_BASE_PATH) / script_name
+            logger.info(f"使用配置的Airflow部署路径: {script_path}")
+            
+            # 动态导入模块
+            spec = importlib.util.spec_from_file_location("dynamic_module", script_path)
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+            
+            # 使用标准入口函数run
+            if hasattr(module, "run"):
+                logger.info(f"执行脚本 {script_name} 的标准入口函数 run()")
+                module.run(table_name=table_name, execution_mode=execution_mode)
+                return True
+            else:
+                logger.warning(f"脚本 {script_name} 未定义标准入口函数 run(),无法执行")
+                return False
+        except Exception as e:
+            logger.error(f"执行脚本 {script_name} 时出错: {str(e)}")
+            return False
+    
+    # 第二种调用方式 - 从common.py迁移的实现
+    else:
+        # 确定调用方式并统一参数
+        if script_path and script_name and script_exec_mode is not None:
+            # 第二种调用方式 - 显式提供所有参数
+            if args is None:
+                args = {}
+        elif script_name and table_name and execution_mode is not None:
+            # 第二种调用方式 - 但使用第一种调用方式的参数名
+            script_path = os.path.join(SCRIPTS_BASE_PATH, f"{script_name}.py")
+            script_exec_mode = execution_mode
+            args = {"table_name": table_name}
         else:
-            logger.warning(f"脚本 {script_name} 未定义标准入口函数 run(),无法执行")
+            logger.error("参数不正确,无法执行脚本")
+            return False
+
+        try:
+            # 确保脚本路径存在
+            if not os.path.exists(script_path):
+                logger.error(f"脚本路径 {script_path} 不存在")
+                return False
+
+            # 加载脚本模块
+            spec = importlib.util.spec_from_file_location("script_module", script_path)
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+            
+            # 检查并记录所有可用的函数
+            module_functions = [f for f in dir(module) if callable(getattr(module, f)) and not f.startswith('_')]
+            logger.debug(f"模块 {script_name} 中的可用函数: {module_functions}")
+
+            # 获取脚本的运行函数
+            if not hasattr(module, "run"):
+                logger.error(f"脚本 {script_name} 没有run函数")
+                return False
+
+            # 装饰run函数,确保返回布尔值
+            original_run = module.run
+            module.run = ensure_boolean_result(original_run)
+            
+            logger.info(f"开始执行脚本 {script_name},执行模式: {script_exec_mode}, 参数: {args}")
+            start_time = time.time()
+            
+            # 执行脚本
+            if table_name is not None:
+                # 使用table_name参数调用
+                exec_result = module.run(table_name=table_name, execution_mode=script_exec_mode)
+            else:
+                # 使用script_exec_mode和args调用
+                exec_result = module.run(script_exec_mode, args)
+            
+            end_time = time.time()
+            duration = end_time - start_time
+            
+            logger.info(f"脚本 {script_name} 执行完成,结果: {exec_result}, 耗时: {duration:.2f}秒")
+            return exec_result
+        except Exception as e:
+            logger.error(f"执行脚本 {script_name} 时出错: {str(e)}")
+            import traceback
+            logger.error(traceback.format_exc())
             return False
-    except Exception as e:
-        logger.error(f"执行脚本 {script_name} 时出错: {str(e)}")
-        return False
 
 
-# def get_enabled_tables(frequency: str) -> list:
-#     conn = get_pg_conn()
-#     cursor = conn.cursor()
-#     cursor.execute("""
-#         SELECT table_name, execution_mode
-#         FROM table_schedule
-#         WHERE is_enabled = TRUE AND schedule_frequency = %s
-#     """, (frequency,))
-#     result = cursor.fetchall()
-#     cursor.close()
-#     conn.close()
-
-#     output = []
-#     for r in result:
-#         output.append({"table_name": r[0], "execution_mode": r[1]})
-#     return output
-
-# def is_data_resource_table(table_name: str) -> bool:
-#     driver = GraphDatabase.driver(NEO4J_CONFIG['uri'], auth=(NEO4J_CONFIG['user'], NEO4J_CONFIG['password']))
-#     query = """
-#         MATCH (n:DataResource {en_name: $table_name}) RETURN count(n) > 0 AS exists
-#     """
-#     try:
-#         with driver.session() as session:
-#             result = session.run(query, table_name=table_name)
-#             record = result.single()
-#             return record and record["exists"]
-#     finally:
-#         driver.close()
 
 def get_resource_subscribed_tables(enabled_tables: list) -> list:
     result = []
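The merged execute_script above keeps backward compatibility with both call styles. A minimal sketch of the two invocations (script, table and path names are placeholders; the deployment path is an assumption, not taken from config.py):

```python
from utils import execute_script

# 1) Original utils.py style: script file name, target table, execution mode.
ok = execute_script("book_sale_amt_daily_clean.py", "book_sale_amt_daily", "append")

# 2) common.py style: explicit path plus an args dict; run() is then called
#    as run(script_exec_mode, args) instead of run(table_name=..., execution_mode=...).
ok = execute_script(
    script_name="load_file",
    script_path="/opt/airflow/dataops_scripts/load_file.py",   # placeholder path
    script_exec_mode="append",
    args={"table_name": "book_sale_amt_daily"},
)
```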
@@ -379,26 +354,7 @@ def run_model_script(table_name, execution_mode):
         logger.error(traceback.format_exc())
         raise AirflowFailException(error_msg)
 
-# 从 Neo4j 获取指定 DataModel 表之间的依赖关系图
-# 返回值为 dict:{目标表: [上游依赖表1, 上游依赖表2, ...]}
-# def get_model_dependency_graph(table_names: list) -> dict:
-#     graph = {}
-#     uri = NEO4J_CONFIG['uri']
-#     auth = (NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
-#     driver = GraphDatabase.driver(uri, auth=auth)
-#     try:
-#         with driver.session() as session:
-#             for table_name in table_names:
-#                 query = """
-#                     MATCH (t:DataModel {en_name: $table_name})<-[:DERIVED_FROM]-(up:DataModel)
-#                     RETURN up.en_name AS upstream
-#                 """
-#                 result = session.run(query, table_name=table_name)
-#                 deps = [record['upstream'] for record in result if 'upstream' in record]
-#                 graph[table_name] = deps
-#     finally:
-#         driver.close()
-#     return graph
+
 def get_model_dependency_graph(table_names: list) -> dict:
     """
     使用networkx从Neo4j获取指定DataModel表之间的依赖关系图    
@@ -516,28 +472,39 @@ def get_model_dependency_graph(table_names: list) -> dict:
     return final_dependency_dict
 
 
-def generate_optimized_execution_order(table_names: list) -> list:
+def generate_optimized_execution_order(table_names, dependency_dict=None):
     """
-    生成优化的执行顺序,处理循环依赖
+    生成优化的执行顺序,处理循环依赖
     
     参数:
         table_names: 表名列表
+        dependency_dict: 依赖关系字典 {表名: [依赖表1, 依赖表2, ...]}
+                        如果为None,则通过get_model_dependency_graph获取
     
     返回:
         list: 优化后的执行顺序列表
     """
-    # 创建依赖图
+    # 创建有向图
     G = nx.DiGraph()
     
     # 添加所有节点
     for table_name in table_names:
         G.add_node(table_name)
     
-    # 添加依赖边
-    dependency_dict = get_model_dependency_graph(table_names)
-    for target, upstreams in dependency_dict.items():
-        for upstream in upstreams:
-            G.add_edge(upstream, target)
+    # 获取依赖关系
+    if dependency_dict is None:
+        # 使用原始utils.py的get_model_dependency_graph获取依赖
+        dependency_dict = get_model_dependency_graph(table_names)
+        # 添加依赖边 - 从上游指向目标
+        for target, upstreams in dependency_dict.items():
+            for upstream in upstreams:
+                G.add_edge(upstream, target)
+    else:
+        # 使用提供的dependency_dict - 从依赖指向目标
+        for target, sources in dependency_dict.items():
+            for source in sources:
+                if source in table_names:  # 确保只考虑目标表集合中的表
+                    G.add_edge(source, target)
     
     # 检测循环依赖
     cycles = list(nx.simple_cycles(G))
@@ -559,64 +526,6 @@ def generate_optimized_execution_order(table_names: list) -> list:
         return table_names
 
 
-
-def identify_common_paths(table_names: list) -> dict:
-    """
-    识别多个表之间的公共执行路径
-    
-    参数:
-        table_names: 表名列表
-    
-    返回:
-        dict: 公共路径信息 {(path_tuple): 使用次数}
-    """
-    # 创建依赖图
-    G = nx.DiGraph()
-    
-    # 添加所有节点和直接依赖边
-    dependency_dict = get_model_dependency_graph(table_names)
-    for target, upstreams in dependency_dict.items():
-        G.add_node(target)
-        for upstream in upstreams:
-            G.add_node(upstream)
-            G.add_edge(upstream, target)
-    
-    # 找出所有路径
-    all_paths = []
-    # 找出所有源节点(没有入边的节点)和终节点(没有出边的节点)
-    sources = [n for n in G.nodes() if G.in_degree(n) == 0]
-    targets = [n for n in G.nodes() if G.out_degree(n) == 0]
-    
-    # 获取所有源到目标的路径
-    for source in sources:
-        for target in targets:
-            try:
-                # 限制路径长度,避免组合爆炸
-                paths = list(nx.all_simple_paths(G, source, target, cutoff=10))
-                all_paths.extend(paths)
-            except nx.NetworkXNoPath:
-                continue
-    
-    # 统计路径段使用频率
-    path_segments = {}
-    for path in all_paths:
-        # 只考虑长度>=2的路径段(至少有一条边)
-        for i in range(len(path)-1):
-            for j in range(i+2, min(i+6, len(path)+1)):  # 限制段长,避免组合爆炸
-                segment = tuple(path[i:j])
-                if segment not in path_segments:
-                    path_segments[segment] = 0
-                path_segments[segment] += 1
-    
-    # 过滤出重复使用的路径段
-    common_paths = {seg: count for seg, count in path_segments.items() 
-                    if count > 1 and len(seg) >= 3}  # 至少3个节点,2条边
-    
-    # 按使用次数排序
-    common_paths = dict(sorted(common_paths.items(), key=lambda x: x[1], reverse=True))
-    
-    return common_paths
-
 def check_table_relationship(table1, table2):
     """
     直接检查Neo4j中两个表之间的关系
@@ -844,46 +753,283 @@ def connect_start_and_end_tasks(task_dict, tasks_with_upstream, tasks_with_downs
     return start_tasks, end_tasks
 
 
-def process_model_tables(enabled_tables, dag_type, wait_task, completed_task, dag, **task_options):
-    """
-    处理模型表并构建DAG
+def get_neo4j_driver():
+    """获取Neo4j连接驱动"""
+    uri = NEO4J_CONFIG['uri']
+    auth = (NEO4J_CONFIG['user'], NEO4J_CONFIG['password'])
+    return GraphDatabase.driver(uri, auth=auth)
+
+def update_task_start_time(exec_date, target_table, script_name, start_time):
+    """更新任务开始时间"""
+    logger.info(f"===== 更新任务开始时间 =====")
+    logger.info(f"参数: exec_date={exec_date} ({type(exec_date).__name__}), target_table={target_table}, script_name={script_name}")
     
-    参数:
-        enabled_tables: 已启用的表列表
-        dag_type: DAG类型 (daily, monthly等)
-        wait_task: 等待任务
-        completed_task: 完成标记任务
-        dag: Airflow DAG对象
-        task_options: 创建任务的额外选项
-    """
-    model_tables = [t for t in enabled_tables if is_data_model_table(t['table_name'])]
-    logger.info(f"获取到 {len(model_tables)} 个启用的 {dag_type} 模型表")
+    conn = get_pg_conn()
+    cursor = conn.cursor()
+    try:
+        # 首先检查记录是否存在
+        cursor.execute("""
+            SELECT COUNT(*) 
+            FROM airflow_dag_schedule 
+            WHERE exec_date = %s AND target_table = %s AND script_name = %s
+        """, (exec_date, target_table, script_name))
+        count = cursor.fetchone()[0]
+        logger.info(f"查询到符合条件的记录数: {count}")
+        
+        if count == 0:
+            logger.warning(f"未找到匹配的记录: exec_date={exec_date}, target_table={target_table}, script_name={script_name}")
+            logger.info("尝试记录在airflow_dag_schedule表中找到的记录:")
+            cursor.execute("""
+                SELECT exec_date, target_table, script_name
+                FROM airflow_dag_schedule
+                LIMIT 5
+            """)
+            sample_records = cursor.fetchall()
+            for record in sample_records:
+                logger.info(f"样本记录: exec_date={record[0]} ({type(record[0]).__name__}), target_table={record[1]}, script_name={record[2]}")
+        
+        # 执行更新
+        sql = """
+            UPDATE airflow_dag_schedule 
+            SET exec_start_time = %s
+            WHERE exec_date = %s AND target_table = %s AND script_name = %s
+        """
+        logger.info(f"执行SQL: {sql}")
+        logger.info(f"参数: start_time={start_time}, exec_date={exec_date}, target_table={target_table}, script_name={script_name}")
+        
+        cursor.execute(sql, (start_time, exec_date, target_table, script_name))
+        affected_rows = cursor.rowcount
+        logger.info(f"更新影响的行数: {affected_rows}")
+        
+        conn.commit()
+        logger.info("事务已提交")
+    except Exception as e:
+        logger.error(f"更新任务开始时间失败: {str(e)}")
+        import traceback
+        logger.error(f"错误堆栈: {traceback.format_exc()}")
+        conn.rollback()
+        logger.info("事务已回滚")
+        raise
+    finally:
+        cursor.close()
+        conn.close()
+        logger.info("数据库连接已关闭")
+        logger.info("===== 更新任务开始时间完成 =====")
+
+def update_task_completion(exec_date, target_table, script_name, success, end_time, duration):
+    """更新任务完成信息"""
+    logger.info(f"===== 更新任务完成信息 =====")
+    logger.info(f"参数: exec_date={exec_date} ({type(exec_date).__name__}), target_table={target_table}, script_name={script_name}")
+    logger.info(f"参数: success={success} ({type(success).__name__}), end_time={end_time}, duration={duration}")
     
-    if not model_tables:
-        # 如果没有模型表需要处理,直接将等待任务与完成标记相连接
-        logger.info(f"没有找到需要处理的{dag_type}模型表,DAG将直接标记为完成")
-        wait_task >> completed_task
-        return
+    conn = get_pg_conn()
+    cursor = conn.cursor()
+    try:
+        # 首先检查记录是否存在
+        cursor.execute("""
+            SELECT COUNT(*) 
+            FROM airflow_dag_schedule 
+            WHERE exec_date = %s AND target_table = %s AND script_name = %s
+        """, (exec_date, target_table, script_name))
+        count = cursor.fetchone()[0]
+        logger.info(f"查询到符合条件的记录数: {count}")
+        
+        if count == 0:
+            logger.warning(f"未找到匹配的记录: exec_date={exec_date}, target_table={target_table}, script_name={script_name}")
+            # 查询表中前几条记录作为参考
+            cursor.execute("""
+                SELECT exec_date, target_table, script_name
+                FROM airflow_dag_schedule
+                LIMIT 5
+            """)
+            sample_records = cursor.fetchall()
+            logger.info("airflow_dag_schedule表中的样本记录:")
+            for record in sample_records:
+                logger.info(f"样本记录: exec_date={record[0]} ({type(record[0]).__name__}), target_table={record[1]}, script_name={record[2]}")
+        
+        # 确保success是布尔类型
+        if not isinstance(success, bool):
+            original_success = success
+            success = bool(success)
+            logger.warning(f"success参数不是布尔类型,原始值: {original_success},转换为: {success}")
+        
+        # 执行更新
+        sql = """
+            UPDATE airflow_dag_schedule 
+            SET exec_result = %s, exec_end_time = %s, exec_duration = %s
+            WHERE exec_date = %s AND target_table = %s AND script_name = %s
+        """
+        logger.info(f"执行SQL: {sql}")
+        logger.info(f"参数: success={success}, end_time={end_time}, duration={duration}, exec_date={exec_date}, target_table={target_table}, script_name={script_name}")
+        
+        cursor.execute(sql, (success, end_time, duration, exec_date, target_table, script_name))
+        affected_rows = cursor.rowcount
+        logger.info(f"更新影响的行数: {affected_rows}")
+        
+        if affected_rows == 0:
+            logger.warning("更新操作没有影响任何行,可能是因为条件不匹配")
+            # 尝试用不同格式的exec_date查询
+            if isinstance(exec_date, str):
+                try:
+                    # 尝试解析日期字符串
+                    from datetime import datetime
+                    parsed_date = datetime.strptime(exec_date, "%Y-%m-%d").date()
+                    logger.info(f"尝试使用解析后的日期格式: {parsed_date}")
+                    
+                    cursor.execute("""
+                        SELECT COUNT(*) 
+                        FROM airflow_dag_schedule 
+                        WHERE exec_date = %s AND target_table = %s AND script_name = %s
+                    """, (parsed_date, target_table, script_name))
+                    parsed_count = cursor.fetchone()[0]
+                    logger.info(f"使用解析日期后查询到的记录数: {parsed_count}")
+                    
+                    if parsed_count > 0:
+                        # 尝试用解析的日期更新
+                        cursor.execute("""
+                            UPDATE airflow_dag_schedule 
+                            SET exec_result = %s, exec_end_time = %s, exec_duration = %s
+                            WHERE exec_date = %s AND target_table = %s AND script_name = %s
+                        """, (success, end_time, duration, parsed_date, target_table, script_name))
+                        new_affected_rows = cursor.rowcount
+                        logger.info(f"使用解析日期后更新影响的行数: {new_affected_rows}")
+                except Exception as parse_e:
+                    logger.error(f"尝试解析日期格式时出错: {str(parse_e)}")
+        
+        conn.commit()
+        logger.info("事务已提交")
+    except Exception as e:
+        logger.error(f"更新任务完成信息失败: {str(e)}")
+        import traceback
+        logger.error(f"错误堆栈: {traceback.format_exc()}")
+        conn.rollback()
+        logger.info("事务已回滚")
+        raise
+    finally:
+        cursor.close()
+        conn.close()
+        logger.info("数据库连接已关闭")
+        logger.info("===== 更新任务完成信息完成 =====")
+
+def execute_with_monitoring(target_table, script_name, script_exec_mode, exec_date, **kwargs):
+    """执行脚本并监控执行情况"""
+
+    # 添加详细日志
+    logger.info(f"===== 开始监控执行 =====")
+    logger.info(f"target_table: {target_table}, 类型: {type(target_table)}")
+    logger.info(f"script_name: {script_name}, 类型: {type(script_name)}")
+    logger.info(f"script_exec_mode: {script_exec_mode}, 类型: {type(script_exec_mode)}")
+    logger.info(f"exec_date: {exec_date}, 类型: {type(exec_date)}")
+
+    # 检查script_name是否为空
+    if not script_name:
+        logger.error(f"表 {target_table} 的script_name为空,无法执行")
+        # 记录执行失败
+        now = datetime.now()
+        update_task_completion(exec_date, target_table, script_name or "", False, now, 0)
+        return False
+    # 记录执行开始时间
+    start_time = datetime.now()
     
-    # 获取表名列表
-    table_names = [t['table_name'] for t in model_tables]
+    # 尝试更新开始时间并记录结果
+    try:
+        update_task_start_time(exec_date, target_table, script_name, start_time)
+        logger.info(f"成功更新任务开始时间: {start_time}")
+    except Exception as e:
+        logger.error(f"更新任务开始时间失败: {str(e)}")
     
     try:
-        # 构建模型依赖DAG
-        optimized_table_order, dependency_graph = build_model_dependency_dag(table_names, model_tables)
+        # 执行实际脚本
+        logger.info(f"开始执行脚本: {script_name}")
+        result = execute_script(script_name, target_table, script_exec_mode)
+        logger.info(f"脚本执行完成,原始返回值: {result}, 类型: {type(result)}")
         
-        # 创建任务字典
-        task_dict = create_task_dict(optimized_table_order, model_tables, dag, dag_type, **task_options)
+        # 确保result是布尔值
+        if result is None:
+            logger.warning(f"脚本返回值为None,转换为False")
+            result = False
+        elif not isinstance(result, bool):
+            original_result = result
+            result = bool(result)
+            logger.warning(f"脚本返回非布尔值 {original_result},转换为布尔值: {result}")
         
-        # 建立任务依赖关系
-        tasks_with_upstream, tasks_with_downstream, _ = build_task_dependencies(task_dict, dependency_graph)
+        # 记录结束时间和结果
+        end_time = datetime.now()
+        duration = (end_time - start_time).total_seconds()
         
-        # 连接开始节点和末端节点
-        connect_start_and_end_tasks(task_dict, tasks_with_upstream, tasks_with_downstream, 
-                                  wait_task, completed_task, dag_type)
+        # 尝试更新完成状态并记录结果
+        try:
+            logger.info(f"尝试更新完成状态: result={result}, end_time={end_time}, duration={duration}")
+            update_task_completion(exec_date, target_table, script_name, result, end_time, duration)
+            logger.info(f"成功更新任务完成状态,结果: {result}")
+        except Exception as e:
+            logger.error(f"更新任务完成状态失败: {str(e)}")
         
+        logger.info(f"===== 监控执行完成 =====")
+        return result
     except Exception as e:
-        logger.error(f"处理{dag_type}模型表时出错: {str(e)}")
-        # 出错时也要确保完成标记被触发
-        wait_task >> completed_task
-        raise
+        # 处理异常
+        logger.error(f"执行任务出错: {str(e)}")
+        end_time = datetime.now()
+        duration = (end_time - start_time).total_seconds()
+        
+        # 尝试更新失败状态并记录结果
+        try:
+            logger.info(f"尝试更新失败状态: end_time={end_time}, duration={duration}")
+            update_task_completion(exec_date, target_table, script_name, False, end_time, duration)
+            logger.info(f"成功更新任务失败状态")
+        except Exception as update_e:
+            logger.error(f"更新任务失败状态失败: {str(update_e)}")
+        
+        logger.info(f"===== 监控执行异常结束 =====")
+        raise e
+
+def ensure_boolean_result(func):
+    """装饰器:确保函数返回布尔值"""
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            result = func(*args, **kwargs)
+            logger.debug(f"脚本原始返回值: {result} (类型: {type(result).__name__})")
+            
+            # 处理None值
+            if result is None:
+                logger.warning(f"脚本函数 {func.__name__} 返回了None,默认设置为False")
+                return False
+                
+            # 处理非布尔值
+            if not isinstance(result, bool):
+                try:
+                    # 尝试转换为布尔值
+                    bool_result = bool(result)
+                    logger.warning(f"脚本函数 {func.__name__} 返回非布尔值 {result},已转换为布尔值 {bool_result}")
+                    return bool_result
+                except Exception as e:
+                    logger.error(f"无法将脚本返回值 {result} 转换为布尔值: {str(e)}")
+                    return False
+            
+            return result
+        except Exception as e:
+            logger.error(f"脚本函数 {func.__name__} 执行出错: {str(e)}")
+            return False
+    return wrapper
+
+def get_today_date():
+    """获取今天的日期,返回YYYY-MM-DD格式字符串"""
+    return datetime.now().strftime("%Y-%m-%d")
+
+def get_cn_exec_date(logical_date):
+    """
+    获取逻辑执行日期
+    
+    参数:
+        logical_date: 逻辑执行日期,UTC时间
+
+    返回:
+        logical_exec_date: 逻辑执行日期,北京时间
+        local_logical_date: 北京时区的logical_date
+    """
+    # 获取逻辑执行日期
+    local_logical_date = pendulum.instance(logical_date).in_timezone('Asia/Shanghai')
+    exec_date = local_logical_date.strftime('%Y-%m-%d')
+    return exec_date, local_logical_date
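A quick check of the two date helpers added at the end of utils.py (pendulum is already a dependency of the DAG files above):

```python
import pendulum
from utils import get_today_date, get_cn_exec_date

exec_date, local_dt = get_cn_exec_date(pendulum.datetime(2024, 1, 1, 18, 0, tz="UTC"))
print(exec_date)         # '2024-01-02' -- 18:00 UTC is already the next day in Asia/Shanghai
print(local_dt.hour)     # 2
print(get_today_date())  # current date as 'YYYY-MM-DD'
```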

+ 0 - 0
dataops/scripts/book_sale_amt_2weekly_process.py → dataops_scripts/book_sale_amt_2weekly_process.py


+ 0 - 0
dataops/scripts/book_sale_amt_2yearly_process.py → dataops_scripts/book_sale_amt_2yearly_process.py


+ 0 - 0
dataops/scripts/book_sale_amt_daily_clean.py → dataops_scripts/book_sale_amt_daily_clean.py


+ 0 - 0
dataops/scripts/book_sale_amt_half_yearly_process.py → dataops_scripts/book_sale_amt_half_yearly_process.py


+ 0 - 0
dataops/scripts/book_sale_amt_monthly_process.py → dataops_scripts/book_sale_amt_monthly_process.py


+ 0 - 0
dataops/scripts/book_sale_amt_weekly_process.py → dataops_scripts/book_sale_amt_weekly_process.py


+ 0 - 0
dataops/scripts/book_sale_amt_yearly_process.py → dataops_scripts/book_sale_amt_yearly_process.py


+ 0 - 0
dataops/scripts/books_total_process.py → dataops_scripts/books_total_process.py


+ 0 - 0
dataops/scripts/emp_training_stats_table.py → dataops_scripts/emp_training_stats_table.py


+ 0 - 0
dataops/scripts/execution_python.py → dataops_scripts/execution_python.py


+ 0 - 0
dataops/scripts/execution_sql.py → dataops_scripts/execution_sql.py


+ 0 - 0
dataops/scripts/load_data.py → dataops_scripts/load_data.py


+ 0 - 0
dataops/scripts/load_file.py → dataops_scripts/load_file.py


+ 0 - 0
dataops/scripts/load_file_test.py → dataops_scripts/load_file_test.py


+ 1 - 0
dataops/scripts/script_utils.py → dataops_scripts/script_utils.py

@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+# 这是dataops_scripts目录下的文件 - 用于验证路径修改成功
 import logging
 import sys
 from datetime import datetime, timedelta