|
|
@@ -0,0 +1,431 @@
|
|
|
+"""
|
|
|
+数据验证工具模块
|
|
|
+
|
|
|
+提供数据验证功能,用于验证数据的完整性和格式正确性
|
|
|
+"""
|
|
|
+
|
|
|
+import pandas as pd
|
|
|
+import re
|
|
|
+from typing import List, Dict, Any, Callable, Optional
|
|
|
+import logging
|
|
|
+
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
+
|
|
|
class ValidationRule:
    """Base class for validation rules that target a single DataFrame column."""

    def __init__(self, column: str, error_message: str = None):
        """
        Store the target column and the message reported when the rule fails.

        Args:
            column: Name of the column this rule validates.
            error_message: Custom failure message. Any falsy value (``None``
                or an empty string) selects a generic default that names the
                column.
        """
        if error_message:
            self.error_message = error_message
        else:
            self.error_message = f"Validation failed for column '{column}'"
        self.column = column

    def validate(self, df: pd.DataFrame) -> pd.Series:
        """
        Evaluate the rule against a DataFrame.

        Args:
            df: The DataFrame to validate.

        Returns:
            A boolean Series in which ``True`` marks a row that passes.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError("Subclasses must implement validate method")
|
|
|
+
|
|
|
+
|
|
|
class RequiredFieldRule(ValidationRule):
    """Rule requiring every row to hold a non-null value in the column."""

    def validate(self, df: pd.DataFrame) -> pd.Series:
        """
        Flag rows whose value in the target column is present.

        Args:
            df: The DataFrame to validate.

        Returns:
            A boolean Series, ``True`` where the value is not null
            (as decided by ``Series.notna``).

        Raises:
            ValueError: If the target column is missing from ``df``.
        """
        column_present = self.column in df.columns
        if column_present:
            values = df[self.column]
            return values.notna()

        raise ValueError(f"Column '{self.column}' not found in DataFrame")
|
|
|
+
|
|
|
+
|
|
|
class DataTypeRule(ValidationRule):
    """Rule checking that every non-null value in a column has an expected type."""

    def __init__(self, column: str, expected_type: type, error_message: Optional[str] = None):
        """
        Configure the expected Python type for a column.

        Args:
            column: Name of the column this rule validates.
            expected_type: Type each non-null value must have. ``int`` and
                ``float`` get numeric-aware handling (see ``validate``); any
                other type is checked with a plain ``isinstance``.
            error_message: Custom failure message; a falsy value selects a
                default that names the column and the expected type.
        """
        super().__init__(
            column,
            error_message or f"Column '{column}' must be of type {expected_type.__name__}",
        )
        self.expected_type = expected_type

    def _matches(self, value: Any) -> bool:
        """Return True when a single value satisfies the expected type."""
        expected = self.expected_type
        if expected is int:
            # Whole-valued floats (e.g. 3.0) count as integers. Python ints are
            # accepted directly instead of going through float(), which raises
            # OverflowError for very large integers.
            if isinstance(value, float):
                return value.is_integer()
            return isinstance(value, int)
        if expected is float:
            # Integers are accepted wherever a float is expected.
            return isinstance(value, (int, float))
        return isinstance(value, expected)

    def validate(self, df: pd.DataFrame) -> pd.Series:
        """
        Check the type of every value in the target column.

        Null values always pass, so this rule can be combined with
        ``RequiredFieldRule`` when nulls must be rejected as well. Because
        ``bool`` is a subclass of ``int``, boolean values satisfy both the
        ``int`` and the ``float`` checks.

        Args:
            df: The DataFrame to validate.

        Returns:
            A boolean Series indexed like ``df``; ``True`` marks a row that
            is null or has the expected type.

        Raises:
            ValueError: If the target column is missing from ``df``.
        """
        if self.column not in df.columns:
            raise ValueError(f"Column '{self.column}' not found in DataFrame")

        values = df[self.column]
        # Evaluate the whole column in one pass and OR with the null mask.
        # This avoids a label-aligned masked assignment, which can fail when
        # the index contains duplicate labels.
        type_ok = values.apply(self._matches).astype(bool)
        return values.isna() | type_ok
|
|
|
+
|
|
|
+
|
|
|
class RegexRule(ValidationRule):
    """Rule checking string values in a column against a regular expression."""

    def __init__(self, column: str, pattern: str, error_message: Optional[str] = None):
        """
        Compile the pattern used to validate the column.

        Args:
            column: Name of the column this rule validates.
            pattern: Regular expression source. It is applied with
                ``re.Pattern.match``, i.e. anchored at the start of the value
                only; the pattern itself must anchor the end if required.
            error_message: Custom failure message; a falsy value selects a
                default that names the column and the pattern.

        Raises:
            re.error: If ``pattern`` is not a valid regular expression.
        """
        super().__init__(
            column,
            error_message or f"Column '{column}' does not match pattern '{pattern}'",
        )
        self.pattern = re.compile(pattern)

    def validate(self, df: pd.DataFrame) -> pd.Series:
        """
        Match every string value in the target column against the pattern.

        Null values and non-string values always pass; only ``str`` values
        are tested.

        Args:
            df: The DataFrame to validate.

        Returns:
            A boolean Series indexed like ``df``; ``True`` marks a row that
            is not a string or whose string matches the pattern.

        Raises:
            ValueError: If the target column is missing from ``df``.
        """
        if self.column not in df.columns:
            raise ValueError(f"Column '{self.column}' not found in DataFrame")

        match = self.pattern.match

        def passes(value: Any) -> bool:
            # Anything that is not a string (including nulls) is not this
            # rule's concern and is treated as valid.
            return not isinstance(value, str) or match(value) is not None

        # A single pass over the column; no label-aligned masked assignment,
        # so duplicate index labels are handled correctly.
        return df[self.column].apply(passes).astype(bool)
|
|
|
+
|
|
|
+
|
|
|
class EmailRule(RegexRule):
    """Rule checking that string values in a column look like email addresses."""

    def __init__(self, column: str, error_message: Optional[str] = None):
        """
        Configure the email pattern for a column.

        The pattern requires a local part, an ``@``, a domain and a final
        alphabetic label of at least two letters. It is a format check only.

        Args:
            column: Name of the column this rule validates.
            error_message: Custom failure message; a falsy value selects a
                default that names the column.
        """
        # ``\Z`` anchors at the true end of the string. ``$`` would also match
        # just before a trailing newline and let "user@example.com\n" through.
        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\Z'
        super().__init__(
            column,
            email_pattern,
            error_message or f"Column '{column}' contains invalid email addresses",
        )
|
|
|
+
|
|
|
+
|
|
|
class PhoneRule(RegexRule):
    """Rule checking that string values in a column look like phone numbers."""

    def __init__(self, column: str, country_code: str = 'CN', error_message: Optional[str] = None):
        """
        Select the phone-number pattern for a column.

        Args:
            column: Name of the column this rule validates.
            country_code: ``'CN'`` selects the mainland-China mobile format
                (11 digits, leading ``1``, second digit 3-9). Any other value
                selects a generic international format: an optional ``+``, a
                non-zero first digit, 2 to 15 digits in total.
            error_message: Custom failure message; a falsy value selects a
                default that names the column.
        """
        # ``[0-9]`` is used instead of ``\d`` because ``\d`` in a str pattern
        # also matches non-ASCII Unicode digits (e.g. full-width digits).
        # ``\Z`` is used instead of ``$`` because ``$`` also matches just
        # before a trailing newline.
        if country_code == 'CN':
            phone_pattern = r'^1[3-9][0-9]{9}\Z'
        else:
            phone_pattern = r'^\+?[1-9][0-9]{1,14}\Z'

        super().__init__(
            column,
            phone_pattern,
            error_message or f"Column '{column}' contains invalid phone numbers",
        )
|
|
|
+
|
|
|
+
|
|
|
class CustomRule(ValidationRule):
    """Rule that delegates the per-value check to a user-supplied function."""

    def __init__(self, column: str, validator_func: Callable, error_message: Optional[str] = None):
        """
        Store the user-supplied validation function.

        Args:
            column: Name of the column this rule validates.
            validator_func: Callable that receives a single value and returns
                a truthy result when the value is valid.
            error_message: Custom failure message; a falsy value selects the
                generic default of the base class.
        """
        super().__init__(column, error_message)
        self.validator_func = validator_func

    def validate(self, df: pd.DataFrame) -> pd.Series:
        """
        Apply the validation function to every value in the target column.

        Args:
            df: The DataFrame to validate.

        Returns:
            A boolean Series indexed like ``df``; ``True`` marks a row for
            which the function returned a truthy value.

        Raises:
            ValueError: If the target column is missing from ``df``.
        """
        if self.column not in df.columns:
            raise ValueError(f"Column '{self.column}' not found in DataFrame")

        check = self.validator_func
        # Coerce each outcome to a real bool. Callers negate and AND the
        # result; on an integer or object Series (e.g. a validator returning
        # 0/1 or None) ``~`` would be a bitwise inversion rather than a
        # logical NOT.
        outcome = df[self.column].apply(lambda value: bool(check(value)))
        # astype keeps the dtype boolean even when the column is empty.
        return outcome.astype(bool)
|
|
|
+
|
|
|
+
|
|
|
class DataValidator:
    """
    Collects validation rules and applies them to DataFrames.

    Rules are evaluated in the order they were added. A row is valid only if
    it passes every rule. All ``add_*`` methods return the validator itself so
    calls can be chained.
    """

    def __init__(self):
        """Create a validator with no rules."""
        self.rules: List['ValidationRule'] = []
        logger.info("DataValidator initialized")

    def add_rule(self, rule: 'ValidationRule') -> 'DataValidator':
        """
        Register a validation rule.

        Args:
            rule: Rule object exposing ``column``, ``error_message`` and
                ``validate(df)``.

        Returns:
            This validator, to allow chaining.
        """
        self.rules.append(rule)
        logger.info("Added validation rule for column '%s'", rule.column)
        return self

    def add_required_field(self, column: str, error_message: Optional[str] = None) -> 'DataValidator':
        """
        Require non-null values in a column.

        Args:
            column: Column name.
            error_message: Custom failure message.

        Returns:
            This validator, to allow chaining.
        """
        return self.add_rule(RequiredFieldRule(column, error_message))

    def add_data_type(self, column: str, expected_type: type, error_message: Optional[str] = None) -> 'DataValidator':
        """
        Require values in a column to be of a given type.

        Args:
            column: Column name.
            expected_type: Expected Python type of the values.
            error_message: Custom failure message.

        Returns:
            This validator, to allow chaining.
        """
        return self.add_rule(DataTypeRule(column, expected_type, error_message))

    def add_email_format(self, column: str, error_message: Optional[str] = None) -> 'DataValidator':
        """
        Require values in a column to be formatted as email addresses.

        Args:
            column: Column name.
            error_message: Custom failure message.

        Returns:
            This validator, to allow chaining.
        """
        return self.add_rule(EmailRule(column, error_message))

    def add_phone_format(self, column: str, country_code: str = 'CN', error_message: Optional[str] = None) -> 'DataValidator':
        """
        Require values in a column to be formatted as phone numbers.

        Args:
            column: Column name.
            country_code: Country code forwarded to ``PhoneRule``.
            error_message: Custom failure message.

        Returns:
            This validator, to allow chaining.
        """
        return self.add_rule(PhoneRule(column, country_code, error_message))

    def add_regex(self, column: str, pattern: str, error_message: Optional[str] = None) -> 'DataValidator':
        """
        Require values in a column to match a regular expression.

        Args:
            column: Column name.
            pattern: Regular expression source.
            error_message: Custom failure message.

        Returns:
            This validator, to allow chaining.
        """
        return self.add_rule(RegexRule(column, pattern, error_message))

    def add_custom(self, column: str, validator_func: Callable, error_message: Optional[str] = None) -> 'DataValidator':
        """
        Validate a column with a user-supplied function.

        Args:
            column: Column name.
            validator_func: Callable receiving one value and returning a
                truthy result when the value is valid.
            error_message: Custom failure message.

        Returns:
            This validator, to allow chaining.
        """
        return self.add_rule(CustomRule(column, validator_func, error_message))

    def _evaluate(self, df: pd.DataFrame) -> tuple:
        """
        Run every rule and return ``(valid_mask, errors)``.

        ``valid_mask`` is a boolean Series indexed like ``df`` and ``errors``
        is the list of per-rule error dictionaries described in ``validate``.
        """
        row_count = len(df)
        logger.info("Starting validation on DataFrame with %d rows", row_count)

        # Built from a scalar so the dtype is bool even for an empty frame.
        valid_mask = pd.Series(True, index=df.index, dtype=bool)
        errors: List[Dict[str, Any]] = []

        if not self.rules:
            logger.warning("No validation rules defined")
            return valid_mask, errors

        for rule in self.rules:
            rule_name = type(rule).__name__
            try:
                # Coerce to bool so that ``~`` below is a logical NOT even if
                # a rule returns an integer or object Series.
                passed = rule.validate(df).astype(bool)
                failed = ~passed
                failed_count = int(failed.sum())

                if failed_count:
                    # Positional selection keeps this correct when the index
                    # contains duplicate labels.
                    failed_indices = df.index[failed.to_numpy()].tolist()
                    errors.append({
                        'rule': rule_name,
                        'column': rule.column,
                        'message': rule.error_message,
                        'failed_count': failed_count,
                        'failed_indices': failed_indices[:10],  # first 10 only
                    })
                    logger.warning(
                        "Validation failed for column '%s': %d rows",
                        rule.column, failed_count,
                    )

                valid_mask = valid_mask & passed

            except Exception as exc:
                # Deliberate boundary: a rule that cannot be applied (e.g. a
                # missing column) is reported as an error and invalidates
                # every row instead of aborting the whole validation run.
                logger.error(
                    "Error applying rule %s on column '%s': %s",
                    rule_name, rule.column, exc,
                )
                errors.append({
                    'rule': rule_name,
                    'column': rule.column,
                    'message': f"Validation error: {exc}",
                    'failed_count': row_count,
                    'failed_indices': [],
                })
                valid_mask = pd.Series(False, index=df.index, dtype=bool)

        logger.info(
            "Validation completed: %d/%d rows valid",
            int(valid_mask.sum()), row_count,
        )
        return valid_mask, errors

    def validate(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Apply all registered rules to a DataFrame.

        Args:
            df: The DataFrame to validate.

        Returns:
            A dictionary of plain Python values with the keys:
                - is_valid: ``True`` when every row passes every rule.
                - total_rows: Number of rows in ``df``.
                - valid_rows: Number of rows passing every rule.
                - invalid_rows: Number of rows failing at least one rule.
                - errors: One dictionary per failing rule with ``rule``,
                  ``column``, ``message``, ``failed_count`` and
                  ``failed_indices`` (at most the first 10 index labels).
                - invalid_indices: Index labels of all invalid rows.
        """
        valid_mask, errors = self._evaluate(df)
        invalid_positions = (~valid_mask).to_numpy()

        # int()/bool() turn numpy scalars into plain Python values so the
        # result can be serialized and compared without surprises.
        return {
            'is_valid': bool(valid_mask.all()),
            'total_rows': len(df),
            'valid_rows': int(valid_mask.sum()),
            'invalid_rows': int(invalid_positions.sum()),
            'errors': errors,
            'invalid_indices': df.index[invalid_positions].tolist(),
        }

    def get_valid_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return the rows that pass every rule.

        Args:
            df: The DataFrame to validate.

        Returns:
            A copy of ``df`` restricted to valid rows.
        """
        valid_mask, _ = self._evaluate(df)
        # Boolean (positional) selection instead of dropping labels: dropping
        # by label would also remove valid rows that share an index label
        # with an invalid row.
        return df.loc[valid_mask.to_numpy()].copy()

    def get_invalid_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return the rows that fail at least one rule.

        Args:
            df: The DataFrame to validate.

        Returns:
            A copy of ``df`` restricted to invalid rows. When every row is
            valid the result is empty and keeps the columns and dtypes of
            ``df``.
        """
        valid_mask, _ = self._evaluate(df)
        # Boolean (positional) selection instead of ``.loc[labels]``, which
        # would repeat rows when the index contains duplicate labels.
        return df.loc[(~valid_mask).to_numpy()].copy()
|
|
|
+
|
|
|
+
|
|
|
+# 便捷函数
|
|
|
def validate_data(df: pd.DataFrame, rules: List[ValidationRule]) -> Dict[str, Any]:
    """
    Validate a DataFrame against a list of rules in one call.

    Args:
        df: The DataFrame to validate.
        rules: Validation rules, applied in the given order.

    Returns:
        The result dictionary produced by ``DataValidator.validate``.
    """
    checker = DataValidator()
    pending = iter(rules)
    while True:
        try:
            current = next(pending)
        except StopIteration:
            break
        checker.add_rule(current)
    report = checker.validate(df)
    return report
|
|
|
+
|