script_utils.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # 这是dataops_scripts目录下的文件 - 用于验证路径修改成功
  4. import logging
  5. import sys
  6. from datetime import datetime, timedelta
  7. import pytz
  8. import re # 添加re模块以支持正则表达式
  9. # 配置日志记录器
  10. logging.basicConfig(
  11. level=logging.INFO,
  12. format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
  13. handlers=[
  14. logging.StreamHandler(sys.stdout)
  15. ]
  16. )
  17. logger = logging.getLogger("script_utils")
  18. def get_date_range(exec_date, frequency):
  19. """
  20. 根据执行日期和频率,计算开始日期和结束日期
  21. 参数:
  22. exec_date (str): 执行日期,格式为 YYYY-MM-DD
  23. frequency (str): 频率,可选值为 daily, weekly, monthly, quarterly, yearly
  24. 返回:
  25. tuple: (start_date, end_date) 格式为 YYYY-MM-DD 的字符串
  26. """
  27. logger.info(f"计算日期范围 - 执行日期: {exec_date}, 频率: {frequency}")
  28. # 将输入的日期转换为上海时区的datetime对象
  29. shanghai_tz = pytz.timezone('Asia/Shanghai')
  30. try:
  31. # 解析输入的exec_date
  32. if isinstance(exec_date, str):
  33. date_obj = datetime.strptime(exec_date, '%Y-%m-%d')
  34. elif isinstance(exec_date, datetime):
  35. date_obj = exec_date
  36. else:
  37. raise ValueError(f"不支持的exec_date类型: {type(exec_date)}")
  38. # 转换为上海时区
  39. date_obj = shanghai_tz.localize(date_obj)
  40. logger.info(f"上海时区的执行日期: {date_obj}")
  41. # 根据不同频率计算日期范围
  42. if frequency.lower() == 'daily':
  43. # 每日: start_date = exec_date, end_date = exec_date + 1 day
  44. start_date = date_obj.strftime('%Y-%m-%d')
  45. end_date = (date_obj + timedelta(days=1)).strftime('%Y-%m-%d')
  46. elif frequency.lower() == 'weekly':
  47. # 每周: start_date = 本周一, end_date = 下周一
  48. days_since_monday = date_obj.weekday() # 0=周一, 6=周日
  49. monday = date_obj - timedelta(days=days_since_monday)
  50. next_monday = monday + timedelta(days=7)
  51. start_date = monday.strftime('%Y-%m-%d')
  52. end_date = next_monday.strftime('%Y-%m-%d')
  53. elif frequency.lower() == 'monthly':
  54. # 每月: start_date = 本月第一天, end_date = 下月第一天
  55. first_day = date_obj.replace(day=1)
  56. # 计算下个月的第一天
  57. if first_day.month == 12:
  58. next_month_first_day = first_day.replace(year=first_day.year + 1, month=1)
  59. else:
  60. next_month_first_day = first_day.replace(month=first_day.month + 1)
  61. start_date = first_day.strftime('%Y-%m-%d')
  62. end_date = next_month_first_day.strftime('%Y-%m-%d')
  63. elif frequency.lower() == 'quarterly':
  64. # 每季度: start_date = 本季度第一天, end_date = 下季度第一天
  65. quarter = (date_obj.month - 1) // 3 + 1 # 1-4季度
  66. first_month_of_quarter = (quarter - 1) * 3 + 1 # 季度的第一个月
  67. quarter_first_day = date_obj.replace(month=first_month_of_quarter, day=1)
  68. # 计算下个季度的第一天
  69. if quarter == 4:
  70. next_quarter_first_day = quarter_first_day.replace(year=quarter_first_day.year + 1, month=1)
  71. else:
  72. next_quarter_first_day = quarter_first_day.replace(month=first_month_of_quarter + 3)
  73. start_date = quarter_first_day.strftime('%Y-%m-%d')
  74. end_date = next_quarter_first_day.strftime('%Y-%m-%d')
  75. elif frequency.lower() == 'yearly':
  76. # 每年: start_date = 本年第一天, end_date = 下年第一天
  77. year_first_day = date_obj.replace(month=1, day=1)
  78. next_year_first_day = date_obj.replace(year=date_obj.year + 1, month=1, day=1)
  79. start_date = year_first_day.strftime('%Y-%m-%d')
  80. end_date = next_year_first_day.strftime('%Y-%m-%d')
  81. else:
  82. logger.error(f"不支持的频率: {frequency}")
  83. raise ValueError(f"不支持的频率: {frequency}")
  84. logger.info(f"计算结果 - 开始日期: {start_date}, 结束日期: {end_date}")
  85. return start_date, end_date
  86. except Exception as e:
  87. logger.error(f"计算日期范围时出错: {str(e)}", exc_info=True)
  88. raise
  89. import re
  90. from typing import Dict, List, Optional, Set
  91. def extract_source_fields_linked_to_template(sql: str, jinja_vars: List[str]) -> Set[str]:
  92. """
  93. 从 SQL 中提取和 jinja 模板变量绑定的源字段(支持各种形式)
  94. """
  95. fields = set()
  96. sql = re.sub(r"\s+", " ", sql)
  97. for var in jinja_vars:
  98. # 普通比较、函数包裹
  99. pattern = re.compile(
  100. r"""
  101. (?P<field>
  102. (?:\w+\s*\(\s*)? # 可选函数开始(如 DATE(
  103. [\w\.]+ # 字段名
  104. (?:\s+AS\s+\w+)? # 可选 CAST 形式
  105. \)? # 可选右括号
  106. )
  107. \s*(=|<|>|<=|>=)\s*['"]?\{\{\s*""" + var + r"""\s*\}\}['"]?
  108. """, re.IGNORECASE | re.VERBOSE
  109. )
  110. fields.update(match.group("field").strip() for match in pattern.finditer(sql))
  111. # BETWEEN '{{ start_date }}' AND '{{ end_date }}'
  112. if var == "start_date":
  113. pattern_between = re.compile(
  114. r"""(?P<field>
  115. (?:\w+\s*\(\s*)?[\w\.]+(?:\s+AS\s+\w+)?\)? # 字段(函数包裹可选)
  116. )
  117. \s+BETWEEN\s+['"]?\{\{\s*start_date\s*\}\}['"]?\s+AND\s+['"]?\{\{\s*end_date\s*\}\}
  118. """, re.IGNORECASE | re.VERBOSE
  119. )
  120. fields.update(match.group("field").strip() for match in pattern_between.finditer(sql))
  121. return {extract_core_field(f) for f in fields}
  122. def extract_core_field(expr: str) -> str:
  123. """
  124. 清洗函数包裹的字段表达式:DATE(sd.sale_date) -> sd.sale_date, CAST(...) -> ...
  125. """
  126. expr = re.sub(r"CAST\s*\(\s*([\w\.]+)\s+AS\s+\w+\s*\)", r"\1", expr, flags=re.IGNORECASE)
  127. expr = re.sub(r"\b\w+\s*\(\s*([\w\.]+)\s*\)", r"\1", expr)
  128. return expr.strip()
  129. def parse_select_aliases(sql: str) -> Dict[str, str]:
  130. """
  131. 提取 SELECT 中的字段别名映射:原字段 -> 目标别名
  132. """
  133. sql = re.sub(r"\s+", " ", sql)
  134. select_clause_match = re.search(r"SELECT\s+(.*?)\s+FROM", sql, re.IGNORECASE)
  135. if not select_clause_match:
  136. return {}
  137. select_clause = select_clause_match.group(1)
  138. mappings = {}
  139. for expr in select_clause.split(","):
  140. expr = expr.strip()
  141. alias_match = re.match(r"([\w\.]+)\s+AS\s+([\w]+)", expr, re.IGNORECASE)
  142. if alias_match:
  143. source, alias = alias_match.groups()
  144. mappings[source.strip()] = alias.strip()
  145. return mappings
  146. def find_target_date_field(sql: str, jinja_vars: List[str] = ["start_date", "end_date"]) -> Optional[str]:
  147. """
  148. 从 SQL 中找出与模板时间变量绑定的目标表字段(只返回一个)
  149. """
  150. source_fields = extract_source_fields_linked_to_template(sql, jinja_vars)
  151. alias_map = parse_select_aliases(sql)
  152. # 匹配 SELECT 中的映射字段
  153. for src_field in source_fields:
  154. if src_field in alias_map:
  155. return alias_map[src_field] # 源字段映射的目标字段
  156. # 若未通过 AS 映射,可能直接 SELECT sd.sale_date(裸字段)
  157. for src_field in source_fields:
  158. if '.' not in src_field:
  159. return src_field # 裸字段直接作为目标字段名
  160. return None
  161. def generate_delete_sql(sql_content, target_table=None):
  162. """
  163. 根据SQL脚本内容生成用于清理数据的DELETE语句
  164. 参数:
  165. sql_content (str): 原始SQL脚本内容
  166. target_table (str, optional): 目标表名,如果SQL脚本中无法解析出表名时使用
  167. 返回:
  168. str: DELETE语句,用于清理数据
  169. """
  170. logger.info("生成清理SQL语句,实现ETL作业幂等性")
  171. # 如果提供了目标表名,直接使用
  172. if target_table:
  173. logger.info(f"使用提供的目标表名: {target_table}")
  174. delete_stmt = f"""DELETE FROM {target_table}
  175. WHERE summary_date >= '{{{{ start_date }}}}'
  176. AND summary_date < '{{{{ end_date }}}}';"""
  177. logger.info(f"生成的清理SQL: {delete_stmt}")
  178. return delete_stmt
  179. # 尝试从SQL内容中解析出目标表名
  180. try:
  181. # 简单解析,尝试找出INSERT语句的目标表
  182. # 匹配 INSERT INTO xxx 或 INSERT INTO "xxx" 或 INSERT INTO `xxx` 或 INSERT INTO [xxx]
  183. insert_match = re.search(r'INSERT\s+INTO\s+(?:["\[`])?([a-zA-Z0-9_\.]+)(?:["\]`])?', sql_content, re.IGNORECASE)
  184. if insert_match:
  185. table_name = insert_match.group(1)
  186. logger.info(f"从SQL中解析出目标表名: {table_name}")
  187. delete_stmt = f"""DELETE FROM {table_name}
  188. WHERE summary_date >= '{{{{ start_date }}}}'
  189. AND summary_date < '{{{{ end_date }}}}';"""
  190. logger.info(f"生成的清理SQL: {delete_stmt}")
  191. return delete_stmt
  192. else:
  193. logger.warning("无法从SQL中解析出目标表名,无法生成清理SQL")
  194. return None
  195. except Exception as e:
  196. logger.error(f"解析SQL生成清理语句时出错: {str(e)}", exc_info=True)
  197. return None