
Parameter format adjustments
Adjusted the data-parsing logic for the 招聘 (recruitment) task type
Adjusted the data-parsing logic for the 新任命 (new appointment) task type

maxiaolong, 1 month ago
Commit: 82fdfefd15

app/api/data_parse/routes.py  (+57 -24)

@@ -1463,6 +1463,8 @@ def add_parse_task_route():
                    可选值:'名片', '简历', '新任命', '招聘', '杂项'
        - files: 文件数组 (multipart/form-data,对于招聘类型可选)
        - created_by: 创建者 (可选,form-data字段)
+        - data: 数据内容 (form-data字段,招聘类型必填)
+        - publish_time: 发布时间 (form-data字段,新任命类型必填)
         
         
    返回:
        - JSON: 包含任务创建结果和上传摘要
@@ -1505,6 +1507,10 @@ def add_parse_task_route():
        # 获取创建者信息(可选参数)
        created_by = request.form.get('created_by', 'api_user')
        
+        # 获取数据内容和发布时间参数
+        data = request.form.get('data')
+        publish_time = request.form.get('publish_time')
+        
        # 对于招聘类型,不需要文件上传
        if task_type == '招聘':
            # 检查是否误传了文件
@@ -1515,11 +1521,19 @@ def add_parse_task_route():
                    'data': None
                }), 400
            
+            # 检查data参数是否有内容
+            if not data:
+                return jsonify({
+                    'success': False,
+                    'message': '招聘类型任务需要提供data参数',
+                    'data': None
+                }), 400
+            
            # 记录请求日志
-            logger.info(f"新增招聘任务请求: 创建者={created_by}")
+            logger.info(f"新增招聘任务请求: 创建者={created_by}, data长度={len(data) if data else 0}")
             
             
            # 调用核心业务逻辑
-            result = add_parse_task(None, task_type, created_by)
+            result = add_parse_task(None, task_type, created_by, data, publish_time)
        else:
            # 其他类型需要文件上传
            if 'files' not in request.files:
@@ -1553,11 +1567,20 @@ def add_parse_task_route():
                 
                 
                valid_files.append(file)
            
+            # 对于新任命类型,检查publish_time参数
+            if task_type == '新任命':
+                if not publish_time:
+                    return jsonify({
+                        'success': False,
+                        'message': '新任命类型任务需要提供publish_time参数',
+                        'data': None
+                    }), 400
+            
            # 记录请求日志
            logger.info(f"新增{task_type}任务请求: 文件数量={len(valid_files)}, 创建者={created_by}")
            
            # 调用核心业务逻辑
-            result = add_parse_task(valid_files, task_type, created_by)
+            result = add_parse_task(valid_files, task_type, created_by, data, publish_time)
         
         
        # 根据处理结果设置HTTP状态码
        if result['success']:
@@ -1746,20 +1769,15 @@ def add_parsed_talents_route():
    处理解析任务响应数据并写入人才信息接口
    
    请求参数:
-        - api_response_data: execute-parse-task API的完整返回数据 (JSON格式)
+        - 请求体: 包含任务ID和人才数据的JSON对象 (JSON格式)
+          - task_id: 任务ID,用于更新任务状态(可选)
+          - data: 包含人才解析结果的数据对象
         
         
    请求体示例:
        {
-            "success": true,
-            "message": "处理完成",
-            "data": {
-                "summary": {
-                    "total_files": 5,
-                    "success_count": 4,
-                    "failed_count": 1,
-                    "success_rate": 80.0
-                },
-                "results": [
+           "task_id": 123,
+           "data": {
+               "results": [
                    {
                        "index": 0,
                        "success": true,
@@ -1769,8 +1787,7 @@ def add_parsed_talents_route():
                            "hotel_zh": "某酒店"
                        }
                    }
-                ],
-                "processed_time": "2025-01-18T10:30:00"
+                ]
            }
        }
        
@@ -1778,12 +1795,11 @@ def add_parsed_talents_route():
        - JSON: 包含批量处理结果和状态信息
        
    功能说明:
-        - 接收 execute-parse-task API 的完整返回数据
-        - 自动识别和处理不同格式的人才数据(单人/批量)
+        - 接收包含人才数据的请求体
+        - 处理 results 数组中的人才数据
        - 调用 add_single_talent 函数将人才信息写入 business_cards 表
-        - 支持新任命等包含多个人员信息的批量数据
+        - 成功处理后,更新对应任务记录状态为"已入库"
        - 提供详细的处理统计和结果追踪
-        - 保留原始API响应数据用于调试
         
         
    状态码:
        - 200: 全部处理成功
@@ -1824,7 +1840,7 @@ def add_parsed_talents_route():
        if api_response_data.get('data') and api_response_data['data'].get('results'):
            total_results = len(api_response_data['data']['results'])
        
-        logger.info(f"收到处理解析任务响应数据请求,包含 {total_results} 条结果记录")
+        logger.info(f"收到处理人才数据请求,包含 {total_results} 条结果记录")
         
         
        # 调用核心业务逻辑
        result = add_parsed_talents(api_response_data)
@@ -1848,9 +1864,26 @@ def add_parsed_talents_route():
            data_summary = result.get('data', {}).get('summary', {})
            success_count = data_summary.get('success_count', 0)
            failed_count = data_summary.get('failed_count', 0)
-            logger.info(f"处理解析任务响应数据完成: 成功 {success_count} 条,失败 {failed_count} 条")
+            logger.info(f"处理人才数据完成: 成功 {success_count} 条,失败 {failed_count} 条")
+            
+            # 更新任务状态为"已入库"
+            task_id = api_response_data.get('task_id')
+            if task_id:
+                try:
+                    from app.core.data_parse.parse_system import db, ParseTaskRepository
+                    task_obj = ParseTaskRepository.query.filter_by(id=task_id).first()
+                    if task_obj:
+                        task_obj.task_status = '已入库'
+                        db.session.commit()
+                        logger.info(f"已更新解析任务记录: id={task_id}, 状态=已入库")
+                    else:
+                        logger.warning(f"未找到 ID 为 {task_id} 的任务记录")
+                except Exception as update_error:
+                    logger.error(f"更新任务状态失败: {str(update_error)}", exc_info=True)
+            else:
+                logger.info("请求中未包含 task_id,跳过任务状态更新")
        else:
-            logger.error(f"处理解析任务响应数据失败: {result.get('message', '未知错误')}")
+            logger.error(f"处理人才数据失败: {result.get('message', '未知错误')}")
         
         
        # 返回结果
        return jsonify({
@@ -1861,7 +1894,7 @@ def add_parsed_talents_route():
         
         
    except Exception as e:
        # 记录错误日志
-        error_msg = f"处理解析任务响应数据接口失败: {str(e)}"
+        error_msg = f"处理人才数据接口失败: {str(e)}"
        logger.error(error_msg, exc_info=True)
        
        # 返回错误响应
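For reference, a minimal client-side sketch of the adjusted add-parse-task call. The base URL, route path, and the task_type field name are assumptions for illustration; only the data, publish_time, created_by, and files form fields come from the docstring above.

# Hypothetical client calls against the updated route (host and path assumed).
import requests

BASE = "http://localhost:5000/api/data-parse"  # assumed host and prefix

# 招聘 task: no files allowed, but the new 'data' form field is now required.
resp = requests.post(
    f"{BASE}/add-parse-task",        # assumed route path
    data={
        "task_type": "招聘",          # field name assumed from context
        "created_by": "api_user",
        "data": "岗位: 前厅经理;酒店: 某酒店",  # recruitment content text
    },
)
print(resp.status_code, resp.json())

# 新任命 task: files are still required, plus the new 'publish_time' form field.
with open("appointment.jpg", "rb") as f:
    resp = requests.post(
        f"{BASE}/add-parse-task",
        data={
            "task_type": "新任命",
            "created_by": "api_user",
            "publish_time": "2025-01-18",
        },
        files=[("files", ("appointment.jpg", f, "image/jpeg"))],
    )
print(resp.status_code, resp.json())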

app/core/data_parse/parse_menduner.py  (+148 -46)

@@ -150,6 +150,72 @@ def _normalize_talent_profile(raw_profile: Dict[str, Any]) -> Dict[str, Any]:
    return normalized


+def _normalize_talent_to_card_format(raw_profile: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    将门墩儿人才数据标准化为名片格式,与任务解析结果.txt中的data字段格式一致
+    
+    Args:
+        raw_profile (Dict[str, Any]): 原始门墩儿档案数据
+        
+    Returns:
+        Dict[str, Any]: 标准化后的名片格式数据
+    """
+    # 提取基本信息
+    name_zh = raw_profile.get('name', raw_profile.get('name_zh', ''))
+    company = raw_profile.get('company', raw_profile.get('hotel_zh', ''))
+    position = raw_profile.get('position', raw_profile.get('title_zh', ''))
+    mobile = raw_profile.get('phone', raw_profile.get('mobile', ''))
+    email = raw_profile.get('email', '')
+    location = raw_profile.get('location', raw_profile.get('address_zh', ''))
+    
+    # 构建隶属关系
+    affiliation = []
+    if company:
+        affiliation.append({
+            "company": company,
+            "group": raw_profile.get('group', '')
+        })
+    
+    # 构建职业轨迹
+    career_path = []
+    if position and company:
+        career_path.append({
+            "date": datetime.now().strftime('%Y-%m-%d'),
+            "hotel_en": raw_profile.get('hotel_en', ''),
+            "hotel_zh": company,
+            "image_path": raw_profile.get('image_path', ''),
+            "source": "menduner_data_creation",
+            "title_en": raw_profile.get('title_en', ''),
+            "title_zh": position
+        })
+    
+    # 按照任务解析结果.txt的data字段格式组装数据
+    normalized = {
+        "address_en": raw_profile.get('address_en', ''),
+        "address_zh": location,
+        "affiliation": affiliation,
+        "age": raw_profile.get('age', 0),
+        "birthday": raw_profile.get('birthday', ''),
+        "brand_group": raw_profile.get('brand_group', ''),
+        "career_path": career_path,
+        "email": _normalize_email(email),
+        "hotel_en": raw_profile.get('hotel_en', ''),
+        "hotel_zh": company,
+        "mobile": _normalize_phone(mobile),
+        "name_en": raw_profile.get('name_en', ''),
+        "name_zh": name_zh,
+        "native_place": raw_profile.get('native_place', ''),
+        "phone": raw_profile.get('phone', ''),
+        "postal_code_en": raw_profile.get('postal_code_en', ''),
+        "postal_code_zh": raw_profile.get('postal_code_zh', ''),
+        "residence": raw_profile.get('residence', ''),
+        "title_en": raw_profile.get('title_en', ''),
+        "title_zh": position
+    }
+    
+    return normalized
+
+
def _parse_talent_line(line: str) -> Optional[Dict[str, Any]]:
    """
    解析单行人才信息
@@ -233,10 +299,10 @@ def _normalize_email(email: str) -> str:
 
 
def validate_menduner_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """
-    验证门墩儿人才数据的完整性和有效性
+    验证门墩儿人才数据的完整性和有效性(名片格式)
     
     
    Args:
-        data (Dict[str, Any]): 待验证的人才数据
+        data (Dict[str, Any]): 待验证的名片格式人才数据
         
         
    Returns:
        Dict[str, Any]: 验证结果
@@ -245,29 +311,44 @@ def validate_menduner_data(data: Dict[str, Any]) -> Dict[str, Any]:
        errors = []
        warnings = []
        
-        # 必填字段检查
-        required_fields = ['name']
+        # 必填字段检查(按名片格式)
+        required_fields = ['name_zh']
        for field in required_fields:
            if not data.get(field):
                errors.append(f"缺少必填字段: {field}")
        
        # 可选但建议填写的字段
-        recommended_fields = ['phone', 'position', 'company']
+        recommended_fields = ['mobile', 'title_zh', 'hotel_zh']
        for field in recommended_fields:
            if not data.get(field):
                warnings.append(f"建议填写字段: {field}")
        
        # 格式验证
-        if data.get('phone'):
-            phone = data['phone']
-            if not re.match(r'^1[3-9]\d{9}$', re.sub(r'\D', '', phone)):
-                warnings.append("电话号码格式可能不正确")
+        if data.get('mobile'):
+            mobile = data['mobile']
+            # 移除所有非数字字符进行验证
+            digits_only = re.sub(r'\D', '', mobile)
+            if digits_only and not re.match(r'^1[3-9]\d{9}$', digits_only):
+                warnings.append("手机号码格式可能不正确")
         
         
        if data.get('email'):
            email = data['email']
            if not re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', email):
                errors.append("邮箱格式不正确")
        
+        # 验证数组字段
+        if data.get('affiliation') is not None and not isinstance(data['affiliation'], list):
+            errors.append("affiliation字段必须是数组格式")
+        
+        if data.get('career_path') is not None and not isinstance(data['career_path'], list):
+            errors.append("career_path字段必须是数组格式")
+        
+        # 验证年龄字段
+        if data.get('age') is not None:
+            age = data['age']
+            if not isinstance(age, int) or age < 0 or age > 150:
+                warnings.append("年龄值可能不合理")
+        
        return {
            'is_valid': len(errors) == 0,
            'errors': errors,
@@ -293,24 +374,32 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]]) -> Dict[str, An
        data_list (List[Dict[str, Any]]): 待处理的人才数据列表
        
    Returns:
-        Dict[str, Any]: 批量处理结果,格式与batch_process_business_card_images保持一致
+        Dict[str, Any]: 批量处理结果,格式与parse_result保持一致
    """
    try:
        # 验证参数
        if not data_list or not isinstance(data_list, list):
            return {
-                'code': 400,
-                'success': False,
-                'message': 'data_list参数必须是非空数组',
-                'data': None
+                "processed_time": datetime.now().isoformat(),
+                "results": [],
+                "summary": {
+                    "failed_count": len(data_list) if data_list else 0,
+                    "success_count": 0,
+                    "success_rate": 0,
+                    "total_files": len(data_list) if data_list else 0
+                }
            }
        
        if len(data_list) == 0:
            return {
-                'code': 400,
-                'success': False,
-                'message': '门墩儿数据数组不能为空',
-                'data': None
+                "processed_time": datetime.now().isoformat(),
+                "results": [],
+                "summary": {
+                    "failed_count": 0,
+                    "success_count": 0,
+                    "success_rate": 0,
+                    "total_files": 0
+                }
            }
        
        results = []
@@ -324,8 +413,8 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]]) -> Dict[str, An
            try:
                logging.debug(f"处理第 {i+1}/{len(data_list)} 条数据")
                
-                # 标准化数据
-                normalized = _normalize_talent_profile(data)
+                # 标准化数据为名片格式
+                normalized = _normalize_talent_to_card_format(data)
                 
                 
                # 验证数据
                validation = validate_menduner_data(normalized)
@@ -333,29 +422,28 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]]) -> Dict[str, An
                if validation.get('is_valid', False):
                    success_count += 1
                    results.append({
-                        'index': i,
-                        'data_id': data.get('id', f'record_{i}'),
-                        'success': True,
-                        'error': None,
-                        'data': {
-                            'normalized_data': normalized,
-                            'validation': validation
-                        },
-                        'message': f'处理成功,验证得分: {validation.get("score", 0)}'
+                        "data": normalized,
+                        "error": None,
+                        "filename": data.get('filename', f'menduner_record_{i}.json'),
+                        "index": i,
+                        "message": "门墩儿数据解析成功",
+                        "minio_path": data.get('minio_path', ''),
+                        "object_key": data.get('object_key', f'menduner_data/record_{i}.json'),
+                        "success": True
                    })
                    logging.debug(f"成功处理第 {i+1} 条数据")
                else:
                    failed_count += 1
                    error_messages = validation.get('errors', ['验证失败'])
                    results.append({
-                        'index': i,
-                        'data_id': data.get('id', f'record_{i}'),
-                        'success': False,
-                        'error': '; '.join(error_messages),
-                        'data': {
-                            'normalized_data': normalized,
-                            'validation': validation
-                        }
+                        "data": None,
+                        "error": '; '.join(error_messages),
+                        "filename": data.get('filename', f'menduner_record_{i}.json'),
+                        "index": i,
+                        "message": "门墩儿数据解析失败",
+                        "minio_path": data.get('minio_path', ''),
+                        "object_key": data.get('object_key', f'menduner_data/record_{i}.json'),
+                        "success": False
                    })
                    logging.warning(f"处理第 {i+1} 条数据失败: {'; '.join(error_messages)}")
                    
@@ -364,11 +452,14 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]]) -> Dict[str, An
                error_msg = f"处理门墩儿数据失败: {str(item_error)}"
                logging.error(error_msg, exc_info=True)
                results.append({
-                    'index': i,
-                    'data_id': data.get('id', f'record_{i}') if isinstance(data, dict) else f'record_{i}',
-                    'success': False,
-                    'error': error_msg,
-                    'data': None
+                    "data": None,
+                    "error": error_msg,
+                    "filename": data.get('filename', f'menduner_record_{i}.json') if isinstance(data, dict) else f'menduner_record_{i}.json',
+                    "index": i,
+                    "message": "门墩儿数据解析失败",
+                    "minio_path": data.get('minio_path', '') if isinstance(data, dict) else '',
+                    "object_key": data.get('object_key', f'menduner_data/record_{i}.json') if isinstance(data, dict) else f'menduner_data/record_{i}.json',
+                    "success": False
                })
        
        # 组装最终结果
@@ -387,21 +478,21 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]]) -> Dict[str, An
            return {
                'code': 200,
                'success': True,
-                'message': f'批量处理完成,全部 {success_count} 条数据处理成功',
+                'message': f'批量处理完成,全部 {success_count} 个文件处理成功',
                'data': batch_result
            }
        elif success_count == 0:
            return {
                'code': 500,
                'success': False,
-                'message': f'批量处理失败,全部 {failed_count} 条数据处理失败',
+                'message': f'批量处理失败,全部 {failed_count} 个文件处理失败',
                'data': batch_result
            }
        else:
            return {
                'code': 206,  # Partial Content
                'success': True,
-                'message': f'批量处理部分成功,成功 {success_count} 条,失败 {failed_count} 条',
+                'message': f'批量处理部分成功,成功 {success_count} 个,失败 {failed_count} 个',
                'data': batch_result
            }
            
@@ -409,9 +500,20 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]]) -> Dict[str, An
        error_msg = f"批量处理门墩儿数据失败: {str(e)}"
        logging.error(error_msg, exc_info=True)
        
+        batch_result = {
+            'summary': {
+                'total_files': len(data_list) if data_list else 1,
+                'success_count': 0,
+                'failed_count': len(data_list) if data_list else 1,
+                'success_rate': 0
+            },
+            'results': [],
+            'processed_time': datetime.now().isoformat()
+        }
+        
        return {
            'code': 500,
            'success': False,
            'message': error_msg,
-            'data': None
+            'data': batch_result
        } 
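To make the new mapping concrete, here is an illustrative before/after pair for _normalize_talent_to_card_format, based only on the field fallbacks shown above. The values are invented; mobile and email additionally pass through _normalize_phone and _normalize_email.

# Illustrative input record in the original menduner shape (values invented).
raw = {
    "name": "张三",
    "company": "某酒店",
    "position": "总经理",
    "phone": "13800000000",
    "email": "zhangsan@example.com",
}

# Expected card-format output (abridged); unmapped fields fall back to '' / 0 / [].
expected = {
    "name_zh": "张三",
    "hotel_zh": "某酒店",
    "title_zh": "总经理",
    "mobile": "13800000000",          # after _normalize_phone
    "email": "zhangsan@example.com",  # after _normalize_email
    "phone": "13800000000",
    "affiliation": [{"company": "某酒店", "group": ""}],
    "career_path": [{
        "date": "2025-01-18",         # datetime.now() at normalization time
        "hotel_zh": "某酒店",
        "title_zh": "总经理",
        "hotel_en": "",
        "title_en": "",
        "image_path": "",
        "source": "menduner_data_creation",
    }],
    # remaining keys (address_zh, age, birthday, brand_group, ...) default to '' or 0
}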

app/core/data_parse/parse_pic.py  (+123 -77)

@@ -617,37 +617,46 @@ def parse_table_with_qwen(base64_image: str) -> List[Dict[str, Any]]:
             
             
            # 创建职业轨迹记录
            career_entry = {
-                'date': datetime.now().strftime('%Y-%m-%d'),
-                'hotel_en': '',
-                'hotel_zh': person_data.get('work_unit', ''),
-                'image_path': '',
-                'source': 'table_extraction',
-                'title_en': '',
-                'title_zh': person_data.get('position', '')
+                "date": datetime.now().strftime('%Y-%m-%d'),
+                "hotel_en": '',
+                "hotel_zh": person_data.get('work_unit', ''),
+                "image_path": '',
+                "source": 'table_extraction',
+                "title_en": '',
+                "title_zh": person_data.get('position', '')
            }
            
-            # 将字段映射到标准格式
+            # 创建隶属关系记录
+            affiliation = []
+            work_unit = person_data.get('work_unit', '')
+            if work_unit:
+                affiliation.append({
+                    "company": work_unit,
+                    "group": ""
+                })
+            
+            # 将字段映射到标准格式,与任务解析结果.txt完全一致
            standardized_person = {
-                'name_zh': person_data.get('name', ''),
-                'name_en': '',
-                'title_zh': person_data.get('position', ''),
-                'title_en': '',
-                'hotel_zh': person_data.get('work_unit', ''),
-                'hotel_en': '',
-                'mobile': person_data.get('mobile', ''),
-                'phone': '',
-                'email': person_data.get('email', ''),
-                'address_zh': '',
-                'address_en': '',
-                'postal_code_zh': '',
-                'postal_code_en': '',
-                'birthday': '',
-                'age': 0,
-                'native_place': '',
-                'residence': '',
-                'brand_group': '',
-                'career_path': [career_entry],
-                'affiliation': []
+                "address_en": '',
+                "address_zh": '',
+                "affiliation": affiliation,
+                "age": 0,
+                "birthday": '',
+                "brand_group": '',
+                "career_path": [career_entry],
+                "email": person_data.get('email', ''),
+                "hotel_en": '',
+                "hotel_zh": person_data.get('work_unit', ''),
+                "mobile": person_data.get('mobile', ''),
+                "name_en": '',
+                "name_zh": person_data.get('name', ''),
+                "native_place": '',
+                "phone": '',
+                "postal_code_en": '',
+                "postal_code_zh": '',
+                "residence": '',
+                "title_en": '',
+                "title_zh": person_data.get('position', '')
            }
            
            processed_data.append(standardized_person)
@@ -670,43 +679,45 @@ def batch_process_images(image_paths: List[str], process_type: str = 'table') ->
        process_type (str): 处理类型,只支持 'table'
        
    Returns:
-        Dict[str, Any]: 批量处理结果,格式与batch_process_business_card_images保持一致
+        Dict[str, Any]: 批量处理结果,格式与parse_result保持一致
    """
    try:
        # 验证处理类型
        if process_type != 'table':
            return {
-                'code': 400,
-                'success': False,
-                'message': f'不支持的处理类型: {process_type},只支持 "table" 类型',
-                'data': {
-                    'summary': {
-                        'total_files': len(image_paths),
-                        'success_count': 0,
-                        'failed_count': len(image_paths),
-                        'success_rate': 0.0,
-                        'process_type': process_type
-                    },
-                    'results': [],
-                    'processed_time': datetime.now().isoformat()
+                "processed_time": datetime.now().isoformat(),
+                "results": [],
+                "summary": {
+                    "failed_count": len(image_paths),
+                    "success_count": 0,
+                    "success_rate": 0,
+                    "total_files": len(image_paths)
                }
            }
        
        # 验证参数
        if not image_paths or not isinstance(image_paths, list):
            return {
-                'code': 400,
-                'success': False,
-                'message': 'image_paths参数必须是非空数组',
-                'data': None
+                "processed_time": datetime.now().isoformat(),
+                "results": [],
+                "summary": {
+                    "failed_count": len(image_paths) if image_paths else 0,
+                    "success_count": 0,
+                    "success_rate": 0,
+                    "total_files": len(image_paths) if image_paths else 0
+                }
            }
        
        if len(image_paths) == 0:
            return {
-                'code': 400,
-                'success': False,
-                'message': '图片路径数组不能为空',
-                'data': None
+                "processed_time": datetime.now().isoformat(),
+                "results": [],
+                "summary": {
+                    "failed_count": 0,
+                    "success_count": 0,
+                    "success_rate": 0,
+                    "total_files": 0
+                }
            }
        
        results = []
@@ -720,30 +731,53 @@ def batch_process_images(image_paths: List[str], process_type: str = 'table') ->
            try:
                logging.info(f"处理第 {i+1}/{len(image_paths)} 个文件: {image_path}")
                
-                # 只支持表格处理
+                # 调用表格处理函数
                result = parse_table_image(image_path)
                
                if result.get('success', False):
-                    success_count += 1
-                    results.append({
-                        'index': i,
-                        'image_path': image_path,
-                        'filename': os.path.basename(image_path) if image_path else f'file_{i}',
-                        'success': True,
-                        'error': None,
-                        'data': result.get('data'),
-                        'message': result.get('message', '处理成功')
-                    })
-                    logging.info(f"成功处理第 {i+1} 个文件: {os.path.basename(image_path)}")
+                    # 提取表格数据并转换为多个人员记录
+                    extracted_data = result.get('data', {}).get('extracted_data', [])
+                    
+                    if extracted_data and isinstance(extracted_data, list):
+                        # 为每个人员创建一个结果记录
+                        for person_idx, person_data in enumerate(extracted_data):
+                            success_count += 1
+                            results.append({
+                                "data": person_data,
+                                "error": None,
+                                "filename": os.path.basename(image_path) if image_path else f'table_file_{i}.jpg',
+                                "index": len(results),  # 使用连续的索引
+                                "message": "表格图片解析成功",
+                                "minio_path": f"table_images/{os.path.basename(image_path)}" if image_path else '',
+                                "object_key": f"table_images/{os.path.basename(image_path)}" if image_path else f'table_images/file_{i}.jpg',
+                                "success": True
+                            })
+                            logging.info(f"成功提取人员 {person_idx+1}: {person_data.get('name_zh', 'Unknown')}")
+                    else:
+                        # 没有提取到有效数据
+                        failed_count += 1
+                        results.append({
+                            "data": None,
+                            "error": "未从表格图片中提取到人员信息",
+                            "filename": os.path.basename(image_path) if image_path else f'table_file_{i}.jpg',
+                            "index": i,
+                            "message": "表格图片解析失败",
+                            "minio_path": f"table_images/{os.path.basename(image_path)}" if image_path else '',
+                            "object_key": f"table_images/{os.path.basename(image_path)}" if image_path else f'table_images/file_{i}.jpg',
+                            "success": False
+                        })
+                        logging.warning(f"第 {i+1} 个文件未提取到人员信息")
                else:
                    failed_count += 1
                    results.append({
-                        'index': i,
-                        'image_path': image_path,
-                        'filename': os.path.basename(image_path) if image_path else f'file_{i}',
-                        'success': False,
-                        'error': result.get('error', '处理失败'),
-                        'data': None
+                        "data": None,
+                        "error": result.get('error', '处理失败'),
+                        "filename": os.path.basename(image_path) if image_path else f'table_file_{i}.jpg',
+                        "index": i,
+                        "message": "表格图片解析失败",
+                        "minio_path": f"table_images/{os.path.basename(image_path)}" if image_path else '',
+                        "object_key": f"table_images/{os.path.basename(image_path)}" if image_path else f'table_images/file_{i}.jpg',
+                        "success": False
                    })
                    logging.error(f"处理第 {i+1} 个文件失败: {result.get('error', '未知错误')}")
                    
@@ -752,12 +786,14 @@ def batch_process_images(image_paths: List[str], process_type: str = 'table') ->
                error_msg = f"处理图片失败: {str(item_error)}"
                logging.error(error_msg, exc_info=True)
                results.append({
-                    'index': i,
-                    'image_path': image_path,
-                    'filename': os.path.basename(image_path) if image_path else f'file_{i}',
-                    'success': False,
-                    'error': error_msg,
-                    'data': None
+                    "data": None,
+                    "error": error_msg,
+                    "filename": os.path.basename(image_path) if image_path else f'table_file_{i}.jpg',
+                    "index": i,
+                    "message": "表格图片解析失败",
+                    "minio_path": f"table_images/{os.path.basename(image_path)}" if image_path else '',
+                    "object_key": f"table_images/{os.path.basename(image_path)}" if image_path else f'table_images/file_{i}.jpg',
+                    "success": False
                })
        
        # 组装最终结果
@@ -766,8 +802,7 @@ def batch_process_images(image_paths: List[str], process_type: str = 'table') ->
                'total_files': len(image_paths),
                'success_count': success_count,
                'failed_count': failed_count,
-                'success_rate': round((success_count / len(image_paths)) * 100, 2) if len(image_paths) > 0 else 0,
-                'process_type': process_type
+                'success_rate': round((success_count / len(image_paths)) * 100, 2) if len(image_paths) > 0 else 0
            },
            'results': results,
            'processed_time': datetime.now().isoformat()
@@ -799,9 +834,20 @@ def batch_process_images(image_paths: List[str], process_type: str = 'table') ->
        error_msg = f"批量处理图片失败: {str(e)}"
        logging.error(error_msg, exc_info=True)
        
+        batch_result = {
+            'summary': {
+                'total_files': len(image_paths) if image_paths else 1,
+                'success_count': 0,
+                'failed_count': len(image_paths) if image_paths else 1,
+                'success_rate': 0
+            },
+            'results': [],
+            'processed_time': datetime.now().isoformat()
+        }
+        
        return {
            'code': 500,
            'success': False,
            'message': error_msg,
-            'data': None
+            'data': batch_result
        } 
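For orientation, an illustrative sketch of the batch_result payload (returned under the 'data' key) after this change, for a single table image from which two people were extracted. Values are invented; the per-entry data dicts are abridged. Note the summary counts successes per extracted person while total_files counts input images, exactly as the summary code above computes.

# Invented example of the flattened, per-person result list.
example_batch_result = {
    "summary": {
        "total_files": 1,        # number of input images
        "success_count": 2,      # one entry per extracted person
        "failed_count": 0,
        "success_rate": 200.0,   # round(success_count / total_files * 100, 2)
    },
    "results": [
        {
            "index": 0,          # continuous index across all extracted persons
            "filename": "table_01.jpg",
            "minio_path": "table_images/table_01.jpg",
            "object_key": "table_images/table_01.jpg",
            "success": True,
            "error": None,
            "message": "表格图片解析成功",
            # abridged; the real dict carries all card-format fields
            "data": {"name_zh": "张三", "title_zh": "总经理", "hotel_zh": "某酒店"},
        },
        {
            "index": 1,
            "filename": "table_01.jpg",
            "minio_path": "table_images/table_01.jpg",
            "object_key": "table_images/table_01.jpg",
            "success": True,
            "error": None,
            "message": "表格图片解析成功",
            "data": {"name_zh": "李四", "title_zh": "财务总监", "hotel_zh": "某酒店"},
        },
    ],
    "processed_time": "2025-01-18T10:30:00",
}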

app/core/data_parse/parse_resume.py  (+110 -36)

@@ -151,13 +151,13 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
        # 为career_path增加一条记录(如果提取到相关信息)
        if parsed_resume.get('hotel_zh') or parsed_resume.get('hotel_en') or parsed_resume.get('title_zh') or parsed_resume.get('title_en'):
            career_entry = {
-                'date': datetime.now().strftime('%Y-%m-%d'),
-                'hotel_en': parsed_resume.get('hotel_en', ''),
-                'hotel_zh': parsed_resume.get('hotel_zh', ''),
-                'image_path': '',
-                'source': 'resume_extraction',
-                'title_en': parsed_resume.get('title_en', ''),
-                'title_zh': parsed_resume.get('title_zh', '')
+                "date": datetime.now().strftime('%Y-%m-%d'),
+                "hotel_en": parsed_resume.get('hotel_en', ''),
+                "hotel_zh": parsed_resume.get('hotel_zh', ''),
+                "image_path": '',
+                "source": 'resume_extraction',
+                "title_en": parsed_resume.get('title_en', ''),
+                "title_zh": parsed_resume.get('title_zh', '')
            }
            
            # 如果原有career_path为空或不是数组,则重新设置
@@ -169,6 +169,29 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
                parsed_resume['career_path'].insert(0, career_entry)
                logging.info(f"为简历解析结果添加了career_path记录: {career_entry}")
        
+        # 为affiliation增加记录(如果提取到公司信息)
+        if parsed_resume.get('hotel_zh') or parsed_resume.get('hotel_en'):
+            company_name = parsed_resume.get('hotel_zh') or parsed_resume.get('hotel_en')
+            affiliation_entry = {
+                "company": company_name,
+                "group": ""
+            }
+            
+            # 如果原有affiliation为空或不是数组,则重新设置
+            if not isinstance(parsed_resume.get('affiliation'), list) or not parsed_resume['affiliation']:
+                parsed_resume['affiliation'] = [affiliation_entry]
+                logging.info(f"为简历解析结果设置了affiliation记录: {affiliation_entry}")
+            else:
+                # 检查是否已存在相同公司的记录
+                company_exists = any(
+                    aff.get('company') == company_name 
+                    for aff in parsed_resume['affiliation'] 
+                    if isinstance(aff, dict)
+                )
+                if not company_exists:
+                    parsed_resume['affiliation'].append(affiliation_entry)
+                    logging.info(f"为简历解析结果添加了affiliation记录: {affiliation_entry}")
+        
        return parsed_resume
        
    except Exception as e:
@@ -374,24 +397,32 @@ def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
        file_paths (List[str]): 简历文件路径列表
        
    Returns:
-        Dict[str, Any]: 批量解析结果,格式与batch_process_business_card_images保持一致
+        Dict[str, Any]: 批量解析结果,格式与parse_result保持一致
    """
    try:
        # 验证参数
        if not file_paths or not isinstance(file_paths, list):
            return {
-                'code': 400,
-                'success': False,
-                'message': 'file_paths参数必须是非空数组',
-                'data': None
+                "processed_time": datetime.now().isoformat(),
+                "results": [],
+                "summary": {
+                    "failed_count": len(file_paths) if file_paths else 0,
+                    "success_count": 0,
+                    "success_rate": 0,
+                    "total_files": len(file_paths) if file_paths else 0
+                }
            }
        
        if len(file_paths) == 0:
            return {
-                'code': 400,
-                'success': False,
-                'message': '简历文件路径数组不能为空',
-                'data': None
+                "processed_time": datetime.now().isoformat(),
+                "results": [],
+                "summary": {
+                    "failed_count": 0,
+                    "success_count": 0,
+                    "success_rate": 0,
+                    "total_files": 0
+                }
            }
        
        results = []
@@ -408,26 +439,56 @@ def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
                result = parse_resume_file(file_path)
                
                if result.get('success', False):
+                    # 提取并转换为标准名片格式
+                    resume_data = result.get('data', {})
+                    
+                    # 构建符合规范的名片格式数据
+                    standardized_data = {
+                        "address_en": resume_data.get('address_en', ''),
+                        "address_zh": resume_data.get('address_zh', ''),
+                        "affiliation": resume_data.get('affiliation', []),
+                        "age": resume_data.get('age', 0),
+                        "birthday": resume_data.get('birthday', ''),
+                        "brand_group": resume_data.get('brand_group', ''),
+                        "career_path": resume_data.get('career_path', []),
+                        "email": resume_data.get('email', ''),
+                        "hotel_en": resume_data.get('hotel_en', ''),
+                        "hotel_zh": resume_data.get('hotel_zh', ''),
+                        "mobile": resume_data.get('mobile', ''),
+                        "name_en": resume_data.get('name_en', ''),
+                        "name_zh": resume_data.get('name_zh', ''),
+                        "native_place": resume_data.get('native_place', ''),
+                        "phone": resume_data.get('phone', ''),
+                        "postal_code_en": resume_data.get('postal_code_en', ''),
+                        "postal_code_zh": resume_data.get('postal_code_zh', ''),
+                        "residence": resume_data.get('residence', ''),
+                        "title_en": resume_data.get('title_en', ''),
+                        "title_zh": resume_data.get('title_zh', '')
+                    }
+                    
                    success_count += 1
                    results.append({
-                        'index': i,
-                        'file_path': file_path,
-                        'filename': os.path.basename(file_path) if file_path else f'file_{i}',
-                        'success': True,
-                        'error': None,
-                        'data': result.get('data'),
-                        'message': result.get('message', '处理成功')
+                        "data": standardized_data,
+                        "error": None,
+                        "filename": os.path.basename(file_path) if file_path else f'resume_{i}.pdf',
+                        "index": i,
+                        "message": "简历文件解析成功",
+                        "minio_path": f"resume_files/{os.path.basename(file_path)}" if file_path else '',
+                        "object_key": f"resume_files/{os.path.basename(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
+                        "success": True
                    })
                    logging.info(f"成功处理第 {i+1} 个文件: {os.path.basename(file_path)}")
                else:
                    failed_count += 1
                    results.append({
-                        'index': i,
-                        'file_path': file_path,
-                        'filename': os.path.basename(file_path) if file_path else f'file_{i}',
-                        'success': False,
-                        'error': result.get('error', '处理失败'),
-                        'data': None
+                        "data": None,
+                        "error": result.get('error', '处理失败'),
+                        "filename": os.path.basename(file_path) if file_path else f'resume_{i}.pdf',
+                        "index": i,
+                        "message": "简历文件解析失败",
+                        "minio_path": f"resume_files/{os.path.basename(file_path)}" if file_path else '',
+                        "object_key": f"resume_files/{os.path.basename(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
+                        "success": False
                    })
                    logging.error(f"处理第 {i+1} 个文件失败: {result.get('error', '未知错误')}")
                    
@@ -436,12 +497,14 @@ def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
                error_msg = f"处理简历文件失败: {str(item_error)}"
                logging.error(error_msg, exc_info=True)
                results.append({
-                    'index': i,
-                    'file_path': file_path,
-                    'filename': os.path.basename(file_path) if file_path else f'file_{i}',
-                    'success': False,
-                    'error': error_msg,
-                    'data': None
+                    "data": None,
+                    "error": error_msg,
+                    "filename": os.path.basename(file_path) if file_path else f'resume_{i}.pdf',
+                    "index": i,
+                    "message": "简历文件解析失败",
+                    "minio_path": f"resume_files/{os.path.basename(file_path)}" if file_path else '',
+                    "object_key": f"resume_files/{os.path.basename(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
+                    "success": False
                })
        
        # 组装最终结果
@@ -482,9 +545,20 @@ def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
        error_msg = f"批量解析简历失败: {str(e)}"
        logging.error(error_msg, exc_info=True)
        
+        batch_result = {
+            'summary': {
+                'total_files': len(file_paths) if file_paths else 1,
+                'success_count': 0,
+                'failed_count': len(file_paths) if file_paths else 1,
+                'success_rate': 0
+            },
+            'results': [],
+            'processed_time': datetime.now().isoformat()
+        }
+        
        return {
            'code': 500,
            'success': False,
            'message': error_msg,
-            'data': None
+            'data': batch_result
        } 
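The affiliation handling added to parse_resume_with_qwen boils down to an append-if-absent rule; below is a small standalone sketch of that rule (hypothetical helper name, the real code runs inline in the function above).

# Minimal standalone sketch of the affiliation de-duplication added above.
from typing import Any, Dict, List

def append_affiliation(affiliation: List[Dict[str, Any]], company: str) -> List[Dict[str, Any]]:
    """Append {'company': company, 'group': ''} unless that company is already present."""
    entry = {"company": company, "group": ""}
    if not isinstance(affiliation, list) or not affiliation:
        return [entry]
    if not any(a.get("company") == company for a in affiliation if isinstance(a, dict)):
        affiliation.append(entry)
    return affiliation

# Example: appending the same hotel twice only creates one record.
aff = append_affiliation([], "某酒店")
aff = append_affiliation(aff, "某酒店")
assert aff == [{"company": "某酒店", "group": ""}]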

app/core/data_parse/parse_task.py  (+28 -93)

@@ -214,12 +214,13 @@ def _validate_files_by_task_type(files, task_type):
    return {'success': True}


-def _handle_recruitment_task(created_by):
+def _handle_recruitment_task(created_by, data=None):
    """
    处理招聘类型任务(数据库记录,不需要文件上传)
    
    Args:
        created_by (str): 创建者
+        data (str): 招聘数据内容
         
         
    Returns:
        dict: 处理结果
@@ -236,10 +237,14 @@ def _handle_recruitment_task(created_by):
            'upload_time': datetime.now().isoformat()
        }
        
+        # 将传入的data参数写入task_source字段
+        if data:
+            task_source['data'] = data
+        
        # 创建解析任务记录
        parse_task = ParseTaskRepository(
            task_name=task_name,
-            task_status='待解析',
+            task_status='成功',  # 招聘任务不需要实际解析操作,直接设置为成功
            task_type='招聘',
            task_source=task_source,
            collection_count=0,  # 招聘任务不涉及文件收集
@@ -252,7 +257,7 @@ def _handle_recruitment_task(created_by):
        db.session.add(parse_task)
        db.session.commit()
        
-        logging.info(f"成功创建招聘任务记录: {task_name}")
+        logging.info(f"成功创建招聘任务记录: {task_name}, 包含data参数: {'是' if data else '否'}")
         
         
        return {
            'code': 200,
@@ -350,7 +355,7 @@ def _get_content_type_by_extension(filename):
    return content_type_mapping.get(file_ext, 'application/octet-stream')


-def add_parse_task(files, task_type, created_by='system'):
+def add_parse_task(files, task_type, created_by='system', data=None, publish_time=None):
    """
    新增解析任务,根据任务类型处理不同类型的文件
    
@@ -358,6 +363,8 @@ def add_parse_task(files, task_type, created_by='system'):
        files (list): 前端上传的文件数组,每个元素是FileStorage对象
        task_type (str): 任务类型,可选值:'名片', '简历', '新任命', '招聘', '杂项'
        created_by (str): 创建者,默认为'system'
+        data (str): 数据内容,招聘类型必需
+        publish_time (str): 发布时间,新任命类型必需
         
         
    Returns:
        dict: 包含操作结果的字典
@@ -382,7 +389,7 @@ def add_parse_task(files, task_type, created_by='system'):
                    'data': None
                }
            # 招聘任务处理逻辑
-            return _handle_recruitment_task(created_by)
+            return _handle_recruitment_task(created_by, data)
         
         
        # 其他类型需要验证文件
        if not files or not isinstance(files, list):
@@ -507,6 +514,10 @@ def add_parse_task(files, task_type, created_by='system'):
            'upload_time': datetime.now().isoformat()
        }
        
+        # 对于新任命类型,在task_source中添加publish_time
+        if task_type == '新任命' and publish_time:
+            task_source['publish_time'] = publish_time
+        
        # 创建解析任务记录
        try:
            parse_task = ParseTaskRepository(
@@ -821,10 +832,10 @@ def add_single_talent(talent_data):
 
 
def add_parsed_talents(api_response_data):
    """
-    处理execute-parse-task API响应数据,提取人才信息并写入business_cards表
+    处理解析任务响应数据,提取人才信息并写入business_cards表
     
     
    Args:
-        api_response_data (dict): execute-parse-task API的返回数据
+        api_response_data (dict): 请求数据,格式为 {"data": {"results": [...]}}
         
         
    Returns:
        dict: 批量处理结果,格式与其他batch函数保持一致
@@ -839,22 +850,13 @@ def add_parsed_talents(api_response_data):
                'data': None
            }
        
-        # 检查API响应是否成功
-        if not api_response_data.get('success', False):
-            return {
-                'code': 400,
-                'success': False,
-                'message': f"API响应表示处理失败: {api_response_data.get('message', '未知错误')}",
-                'data': None
-            }
-        
        # 获取data字段
        response_data = api_response_data.get('data')
        if not response_data or not isinstance(response_data, dict):
            return {
                'code': 400,
                'success': False,
-                'message': 'API响应中缺少有效的data字段',
+                'message': '请求中缺少有效的data字段',
                'data': None
            }
        
@@ -864,7 +866,7 @@ def add_parsed_talents(api_response_data):
            return {
                'code': 400,
                'success': False,
-                'message': 'API响应中的results字段必须是数组',
+                'message': '请求中的results字段必须是数组',
                'data': None
            }
        
@@ -872,11 +874,11 @@ def add_parsed_talents(api_response_data):
            return {
                'code': 400,
                'success': False,
-                'message': 'API响应中的results数组为空,没有人才数据需要处理',
+                'message': '请求中的results数组为空,没有人才数据需要处理',
                'data': None
            }
        
-        logging.info(f"开始处理API响应中的人才数据,共 {len(results)} 条记录")
+        logging.info(f"开始处理人才数据,共 {len(results)} 条记录")
         
         
        processed_results = []
        success_count = 0
@@ -914,76 +916,11 @@ def add_parsed_talents(api_response_data):
                    logging.warning(f"第 {i+1} 条记录缺少data字段")
                    continue
                
-                # 处理不同的数据格式
-                talent_data = None
-                
-                # 检查是否是批量解析结果(如新任命等,包含多个人员)
-                if isinstance(item_data, dict):
-                    if 'all_results' in item_data and isinstance(item_data['all_results'], list):
-                        # 新任命等批量数据格式,包含多个人员
-                        all_talents = item_data['all_results']
-                        logging.info(f"第 {i+1} 条记录包含 {len(all_talents)} 个人员信息")
-                        
-                        # 处理每个人员
-                        for j, single_talent in enumerate(all_talents):
-                            try:
-                                talent_result = add_single_talent(single_talent)
-                                if talent_result.get('success', False):
-                                    success_count += 1
-                                    processed_results.append({
-                                        'index': i,
-                                        'original_index': result_item.get('index', i),
-                                        'sub_index': j,
-                                        'success': True,
-                                        'error': None,
-                                        'data': talent_result.get('data'),
-                                        'message': f'成功处理人员: {single_talent.get("name_zh", "未知")}'
-                                    })
-                                    logging.debug(f"成功处理第 {i+1} 条记录中的第 {j+1} 个人员")
-                                else:
-                                    failed_count += 1
-                                    processed_results.append({
-                                        'index': i,
-                                        'original_index': result_item.get('index', i),
-                                        'sub_index': j,
-                                        'success': False,
-                                        'error': talent_result.get('message', '处理失败'),
-                                        'data': None
-                                    })
-                                    logging.error(f"处理第 {i+1} 条记录中的第 {j+1} 个人员失败")
-                            except Exception as talent_error:
-                                failed_count += 1
-                                error_msg = f"处理人员数据异常: {str(talent_error)}"
-                                processed_results.append({
-                                    'index': i,
-                                    'original_index': result_item.get('index', i),
-                                    'sub_index': j,
-                                    'success': False,
-                                    'error': error_msg,
-                                    'data': None
-                                })
-                                logging.error(error_msg, exc_info=True)
-                        continue
-                    else:
-                        # 单个人员数据格式
-                        talent_data = item_data
-                elif isinstance(item_data, list) and len(item_data) > 0:
-                    # 如果是数组,取第一个元素
-                    talent_data = item_data[0]
-                else:
-                    failed_count += 1
-                    processed_results.append({
-                        'index': i,
-                        'original_index': result_item.get('index', i),
-                        'success': False,
-                        'error': 'data字段格式不正确,无法识别人才数据',
-                        'data': None
-                    })
-                    logging.warning(f"第 {i+1} 条记录data字段格式不正确")
-                    continue
+                # 处理人才数据 - 新格式直接使用 item_data
+                talent_data = item_data
                 
                 
                 # 处理单个人才数据
                 # 处理单个人才数据
-                if talent_data:
+                if talent_data and isinstance(talent_data, dict):
                     try:
                     try:
                         talent_result = add_single_talent(talent_data)
                         talent_result = add_single_talent(talent_data)
                         if talent_result.get('success', False):
                         if talent_result.get('success', False):
@@ -1037,12 +974,10 @@ def add_parsed_talents(api_response_data):
                 'total_files': len(results),
                 'total_files': len(results),
                 'success_count': success_count,
                 'success_count': success_count,
                 'failed_count': failed_count,
                 'failed_count': failed_count,
-                'success_rate': round((success_count / len(results)) * 100, 2) if len(results) > 0 else 0,
-                'original_summary': response_data.get('summary', {})
+                'success_rate': round((success_count / len(results)) * 100, 2) if len(results) > 0 else 0
             },
             },
             'results': processed_results,
             'results': processed_results,
-            'processed_time': datetime.now().isoformat(),
-            'original_api_response': api_response_data  # 保留原始API响应用于调试
+            'processed_time': datetime.now().isoformat()
         }
         }
         
         
         if failed_count == 0:
         if failed_count == 0:
@@ -1068,7 +1003,7 @@ def add_parsed_talents(api_response_data):
             }
             }
             
             
     except Exception as e:
     except Exception as e:
-        error_msg = f"处理API响应数据失败: {str(e)}"
+        error_msg = f"处理人才数据失败: {str(e)}"
         logging.error(error_msg, exc_info=True)
         logging.error(error_msg, exc_info=True)
         
         
         return {
         return {

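For reference, a minimal Python sketch of the simplified payload that the reworked add_parsed_talents now accepts. Only the {"data": {"results": [...]}} nesting comes from the docstring above; every field value is an illustrative placeholder, not data from a real task.

# Minimal sketch -- illustrative values only, structure per the docstring above.
request_payload = {
    "data": {
        "results": [
            {
                "index": 0,
                "success": True,
                "error": None,
                # one talent record; same shape as the data field in 任务解析结果.txt below
                "data": {
                    "name_zh": "张三",
                    "hotel_zh": "示例酒店",
                    "title_zh": "总经理",
                    "career_path": [],
                    "affiliation": []
                }
            }
        ]
    }
}

# add_parsed_talents(request_payload) then iterates results, passes each item's
# "data" dict to add_single_talent(), and reports success/failed counts in its summary.
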
+ 201 - 42
app/core/data_parse/parse_web.py

@@ -9,6 +9,7 @@ from botocore.config import Config
 from io import BytesIO
 from io import BytesIO
 from datetime import datetime
 from datetime import datetime
 from openai import OpenAI
 from openai import OpenAI
+from typing import Dict, Any
 
 
 # 导入配置和业务逻辑模块
 # 导入配置和业务逻辑模块
 from app.config.config import DevelopmentConfig, ProductionConfig
 from app.config.config import DevelopmentConfig, ProductionConfig
@@ -748,6 +749,67 @@ def process_webpage_with_QWen(markdown_text, publish_time):
         raise Exception(error_msg) 
         raise Exception(error_msg) 
 
 
 
 
+def _convert_webpage_to_card_format(webpage_data: Dict[str, Any], publish_time: str) -> Dict[str, Any]:
+    """
+    将网页解析的数据转换为标准名片格式,与任务解析结果.txt中的data字段格式一致
+    
+    Args:
+        webpage_data (Dict[str, Any]): 网页解析的原始数据
+        publish_time (str): 发布时间
+        
+    Returns:
+        Dict[str, Any]: 标准化后的名片格式数据
+    """
+    # 构建隶属关系
+    affiliation = []
+    company = webpage_data.get('hotel_zh', '')
+    if company:
+        affiliation.append({
+            "company": company,
+            "group": webpage_data.get('brand_group', '')
+        })
+    
+    # 构建职业轨迹
+    career_path = []
+    position = webpage_data.get('title_zh', '')
+    if position and company:
+        career_path.append({
+            "date": publish_time if publish_time else datetime.now().strftime('%Y-%m-%d'),
+            "hotel_en": webpage_data.get('hotel_en', ''),
+            "hotel_zh": company,
+            "image_path": webpage_data.get('pic_url', ''),
+            "source": "webpage_talent_extraction",
+            "title_en": webpage_data.get('title_en', ''),
+            "title_zh": position
+        })
+    
+    # 按照任务解析结果.txt的data字段格式组装数据
+    standardized = {
+        "address_en": webpage_data.get('address_en', ''),
+        "address_zh": webpage_data.get('address_zh', ''),
+        "affiliation": affiliation,
+        "age": webpage_data.get('age', 0),
+        "birthday": webpage_data.get('birthday', ''),
+        "brand_group": webpage_data.get('brand_group', ''),
+        "career_path": career_path,
+        "email": webpage_data.get('email', ''),
+        "hotel_en": webpage_data.get('hotel_en', ''),
+        "hotel_zh": company,
+        "mobile": webpage_data.get('mobile', ''),
+        "name_en": webpage_data.get('name_en', ''),
+        "name_zh": webpage_data.get('name_zh', ''),
+        "native_place": webpage_data.get('native_place', ''),
+        "phone": webpage_data.get('phone', ''),
+        "postal_code_en": webpage_data.get('postal_code_en', ''),
+        "postal_code_zh": webpage_data.get('postal_code_zh', ''),
+        "residence": webpage_data.get('residence', ''),
+        "title_en": webpage_data.get('title_en', ''),
+        "title_zh": position
+    }
+    
+    return standardized
+
+
 def batch_process_md(markdown_file_list, publish_time):
 def batch_process_md(markdown_file_list, publish_time):
     """
     """
     批量处理包含多个人员信息的markdown文件
     批量处理包含多个人员信息的markdown文件
@@ -757,24 +819,32 @@ def batch_process_md(markdown_file_list, publish_time):
         publish_time (str): 发布时间,用于career_path中的date字段
         publish_time (str): 发布时间,用于career_path中的date字段
         
         
     Returns:
     Returns:
-        dict: 批量处理结果,格式与batch_process_business_card_images保持一致
+        dict: 批量处理结果,格式与parse_result保持一致
     """
     """
     try:
     try:
         # 参数验证
         # 参数验证
         if not markdown_file_list or not isinstance(markdown_file_list, list):
         if not markdown_file_list or not isinstance(markdown_file_list, list):
             return {
             return {
-                'code': 400,
-                'success': False,
-                'message': 'markdown_file_list参数必须是非空数组',
-                'data': None
+                "processed_time": datetime.now().isoformat(),
+                "results": [],
+                "summary": {
+                    "failed_count": len(markdown_file_list) if markdown_file_list else 0,
+                    "success_count": 0,
+                    "success_rate": 0,
+                    "total_files": len(markdown_file_list) if markdown_file_list else 0
+                }
             }
             }
         
         
         if not publish_time or not isinstance(publish_time, str):
         if not publish_time or not isinstance(publish_time, str):
             return {
             return {
-                'code': 400,
-                'success': False,
-                'message': 'publish_time参数必须是非空字符串',
-                'data': None
+                "processed_time": datetime.now().isoformat(),
+                "results": [],
+                "summary": {
+                    "failed_count": len(markdown_file_list),
+                    "success_count": 0,
+                    "success_rate": 0,
+                    "total_files": len(markdown_file_list)
+                }
             }
             }
         
         
         logging.info(f"开始批量处理 {len(markdown_file_list)} 个markdown文件")
         logging.info(f"开始批量处理 {len(markdown_file_list)} 个markdown文件")
@@ -782,7 +852,7 @@ def batch_process_md(markdown_file_list, publish_time):
         results = []
         results = []
         success_count = 0
         success_count = 0
         failed_count = 0
         failed_count = 0
-        total_persons = 0
+        total_records = 0  # 总记录数(人员数)
         
         
         # 逐个处理每个markdown文件
         # 逐个处理每个markdown文件
         for i, minio_path in enumerate(markdown_file_list):
         for i, minio_path in enumerate(markdown_file_list):
@@ -793,44 +863,74 @@ def batch_process_md(markdown_file_list, publish_time):
                 file_result = process_single_markdown_file(minio_path, publish_time)
                 file_result = process_single_markdown_file(minio_path, publish_time)
                 
                 
                 if file_result.get('success', False):
                 if file_result.get('success', False):
-                    success_count += 1
-                    persons_count = file_result.get('data', {}).get('total_persons', 0)
-                    total_persons += persons_count
+                    # 提取处理结果中的人员信息
+                    persons_data = file_result.get('data', {}).get('all_results', [])
                     
                     
-                    results.append({
-                        'index': i,
-                        'minio_path': minio_path,
-                        'filename': minio_path.split('/')[-1] if '/' in minio_path else minio_path,
-                        'success': True,
-                        'error': None,
-                        'data': file_result.get('data'),
-                        'message': f'处理成功,提取 {persons_count} 个人员信息'
-                    })
-                    logging.info(f"成功处理第 {i+1} 个文件: {minio_path},提取 {persons_count} 个人员信息")
+                    if persons_data and isinstance(persons_data, list):
+                        # 为每个人员创建一个结果记录
+                        for person_idx, person_data in enumerate(persons_data):
+                            total_records += 1
+                            # 转换为标准名片格式
+                            standardized_data = _convert_webpage_to_card_format(person_data, publish_time)
+                            
+                            success_count += 1
+                            results.append({
+                                "data": standardized_data,
+                                "error": None,
+                                "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
+                                "index": len(results),  # 使用连续的索引
+                                "message": "网页人才信息解析成功",
+                                "minio_path": minio_path,
+                                "object_key": minio_path,
+                                "success": True
+                            })
+                            logging.info(f"成功提取人员 {person_idx+1}: {person_data.get('name_zh', 'Unknown')}")
+                    else:
+                        # 没有提取到有效数据,这算作一个失败记录
+                        total_records += 1
+                        failed_count += 1
+                        results.append({
+                            "data": None,
+                            "error": "未从markdown文件中提取到人员信息",
+                            "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
+                            "index": len(results),
+                            "message": "网页人才信息解析失败",
+                            "minio_path": minio_path,
+                            "object_key": minio_path,
+                            "success": False
+                        })
+                        logging.warning(f"第 {i+1} 个文件未提取到人员信息")
                 else:
                 else:
+                    # 文件处理失败,算作一个失败记录
+                    total_records += 1
                     failed_count += 1
                     failed_count += 1
                     error_msg = file_result.get('message', '处理失败')
                     error_msg = file_result.get('message', '处理失败')
                     results.append({
                     results.append({
-                        'index': i,
-                        'minio_path': minio_path,
-                        'filename': minio_path.split('/')[-1] if '/' in minio_path else minio_path,
-                        'success': False,
-                        'error': error_msg,
-                        'data': None
+                        "data": None,
+                        "error": error_msg,
+                        "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
+                        "index": len(results),
+                        "message": "网页人才信息解析失败",
+                        "minio_path": minio_path,
+                        "object_key": minio_path,
+                        "success": False
                     })
                     })
                     logging.error(f"处理第 {i+1} 个文件失败: {error_msg}")
                     logging.error(f"处理第 {i+1} 个文件失败: {error_msg}")
                     
                     
             except Exception as item_error:
             except Exception as item_error:
+                total_records += 1
                 failed_count += 1
                 failed_count += 1
                 error_msg = f"处理markdown文件失败: {str(item_error)}"
                 error_msg = f"处理markdown文件失败: {str(item_error)}"
                 logging.error(error_msg, exc_info=True)
                 logging.error(error_msg, exc_info=True)
                 results.append({
                 results.append({
-                    'index': i,
-                    'minio_path': minio_path,
-                    'filename': minio_path.split('/')[-1] if '/' in minio_path else minio_path,
-                    'success': False,
-                    'error': error_msg,
-                    'data': None
+                    "data": None,
+                    "error": error_msg,
+                    "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
+                    "index": len(results),
+                    "message": "网页人才信息解析失败",
+                    "minio_path": minio_path,
+                    "object_key": minio_path,
+                    "success": False
                 })
                 })
         
         
         # 组装最终结果
         # 组装最终结果
@@ -839,8 +939,7 @@ def batch_process_md(markdown_file_list, publish_time):
                 'total_files': len(markdown_file_list),
                 'total_files': len(markdown_file_list),
                 'success_count': success_count,
                 'success_count': success_count,
                 'failed_count': failed_count,
                 'failed_count': failed_count,
-                'success_rate': round((success_count / len(markdown_file_list)) * 100, 2) if len(markdown_file_list) > 0 else 0,
-                'total_persons': total_persons
+                'success_rate': round((success_count / total_records) * 100, 2) if total_records > 0 else 0
             },
             },
             'results': results,
             'results': results,
             'processed_time': datetime.now().isoformat()
             'processed_time': datetime.now().isoformat()
@@ -850,7 +949,7 @@ def batch_process_md(markdown_file_list, publish_time):
             return {
             return {
                 'code': 200,
                 'code': 200,
                 'success': True,
                 'success': True,
-                'message': f'批量处理完成,全部 {success_count} 个文件处理成功,共提取 {total_persons} 个人员信息',
+                'message': f'批量处理完成,全部 {success_count} 个文件处理成功',
                 'data': batch_result
                 'data': batch_result
             }
             }
         elif success_count == 0:
         elif success_count == 0:
@@ -864,18 +963,30 @@ def batch_process_md(markdown_file_list, publish_time):
             return {
             return {
                 'code': 206,  # Partial Content
                 'code': 206,  # Partial Content
                 'success': True,
                 'success': True,
-                'message': f'批量处理部分成功,成功 {success_count} 个,失败 {failed_count} 个,共提取 {total_persons} 个人员信息',
+                'message': f'批量处理部分成功,成功 {success_count} 个,失败 {failed_count} 个',
                 'data': batch_result
                 'data': batch_result
             }
             }
             
             
     except Exception as e:
     except Exception as e:
         error_msg = f"batch_process_md函数执行失败: {str(e)}"
         error_msg = f"batch_process_md函数执行失败: {str(e)}"
         logging.error(error_msg, exc_info=True)
         logging.error(error_msg, exc_info=True)
+        
+        batch_result = {
+            'summary': {
+                'total_files': len(markdown_file_list) if markdown_file_list else 1,
+                'success_count': 0,
+                'failed_count': len(markdown_file_list) if markdown_file_list else 1,
+                'success_rate': 0
+            },
+            'results': [],
+            'processed_time': datetime.now().isoformat()
+        }
+        
         return {
         return {
             'code': 500,
             'code': 500,
             'success': False,
             'success': False,
             'message': error_msg,
             'message': error_msg,
-            'data': None
+            'data': batch_result
         }
         }
 
 
 
 
@@ -885,7 +996,7 @@ def get_markdown_from_minio(minio_client, minio_path):
     
     
     Args:
     Args:
         minio_client: MinIO客户端
         minio_client: MinIO客户端
-        minio_path (str): MinIO中的文件路径
+        minio_path (str): MinIO中的文件路径或完整URL
         
         
     Returns:
     Returns:
         str: 文件内容,如果失败返回None
         str: 文件内容,如果失败返回None
@@ -893,8 +1004,16 @@ def get_markdown_from_minio(minio_client, minio_path):
     try:
     try:
         logging.info(f"从MinIO获取文件: {minio_path}")
         logging.info(f"从MinIO获取文件: {minio_path}")
         
         
+        # 如果是完整的URL,提取对象键
+        object_key = _extract_object_key_from_url(minio_path)
+        if object_key is None:
+            logging.error(f"无法从URL中提取有效的对象键: {minio_path}")
+            return None
+        if object_key != minio_path:
+            logging.info(f"从URL提取的对象键: {object_key}")
+        
         # 从MinIO下载文件
         # 从MinIO下载文件
-        response = minio_client.get_object(Bucket=minio_bucket, Key=minio_path)
+        response = minio_client.get_object(Bucket=minio_bucket, Key=object_key)
         
         
         # 读取文件内容
         # 读取文件内容
         content = response['Body'].read()
         content = response['Body'].read()
@@ -921,6 +1040,46 @@ def get_markdown_from_minio(minio_client, minio_path):
         return None
         return None
 
 
 
 
+def _extract_object_key_from_url(minio_url):
+    """
+    从MinIO完整URL中提取对象键名
+    
+    Args:
+        minio_url (str): 完整的MinIO URL,如 "http://host:port/bucket/path/to/file.md"
+        
+    Returns:
+        str: 对象键名,如 "path/to/file.md";无协议前缀时原样返回输入,解析失败或格式无效时返回None
+    """
+    try:
+        if not minio_url or not isinstance(minio_url, str):
+            return None
+            
+        # 移除协议部分 (http:// 或 https://)
+        if minio_url.startswith('https://'):
+            url_without_protocol = minio_url[8:]
+        elif minio_url.startswith('http://'):
+            url_without_protocol = minio_url[7:]
+        else:
+            # 如果没有协议前缀,假设是相对路径,直接返回
+            return minio_url
+        
+        # 分割路径部分
+        parts = url_without_protocol.split('/')
+        
+        # 至少需要包含 host:port/bucket/object
+        if len(parts) < 3:
+            return None
+        
+        # 跳过host:port和bucket,获取对象路径
+        object_key = '/'.join(parts[2:])
+        
+        return object_key if object_key else None
+        
+    except Exception as e:
+        logging.error(f"解析MinIO URL失败: {str(e)}")
+        return None
+
+
 def save_section_to_minio(minio_client, section_content, original_minio_path, section_number):
 def save_section_to_minio(minio_client, section_content, original_minio_path, section_number):
     """
     """
     将分割后的markdown内容保存到MinIO
     将分割后的markdown内容保存到MinIO

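A short sketch of how _extract_object_key_from_url resolves the two input forms that get_markdown_from_minio now handles. The host, bucket and file names below are made up for illustration; the function is assumed to be in scope in the same module.

# Illustrative inputs only; host/bucket/paths are made-up examples.
full_url = "http://192.168.3.143:9000/dataops-bucket/webpages/appointment_20250721.md"
# protocol stripped -> "192.168.3.143:9000/dataops-bucket/webpages/appointment_20250721.md"
# parts[0] = host:port, parts[1] = bucket, '/'.join(parts[2:]) -> object key
assert _extract_object_key_from_url(full_url) == "webpages/appointment_20250721.md"

bare_key = "webpages/appointment_20250721.md"
# no protocol prefix: treated as an object key already and returned unchanged
assert _extract_object_key_from_url(bare_key) == bare_key

# get_markdown_from_minio() therefore works whether the caller stores the full
# MinIO URL or only the object key.
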
+ 57 - 0
任务解析结果.txt

@@ -0,0 +1,57 @@
+parse_result:{
+  "processed_time": "2025-07-21T14:59:08.416455",
+  "results": [
+    {
+      "data": {
+        "address_en": "",
+        "address_zh": "苏州市吴中区木渎镇花苑东路726号3幢",
+        "affiliation": [
+          {
+            "company": "苏州木渎古镇ROSSO酒店",
+            "group": ""
+          }
+        ],
+        "age": 0,
+        "birthday": "",
+        "brand_group": "",
+        "career_path": [
+          {
+            "date": "2025-07-21",
+            "hotel_en": "",
+            "hotel_zh": "苏州木渎古镇ROSSO酒店",
+            "image_path": "",
+            "source": "business_card_creation",
+            "title_en": "Sales Director",
+            "title_zh": "销售总监"
+          }
+        ],
+        "email": "cw928383712@163.com",
+        "hotel_en": "",
+        "hotel_zh": "苏州木渎古镇ROSSO酒店",
+        "mobile": "13073381364",
+        "name_en": "",
+        "name_zh": "陈玮",
+        "native_place": "",
+        "phone": "0512-66563999",
+        "postal_code_en": "",
+        "postal_code_zh": "",
+        "residence": "",
+        "title_en": "Sales Director",
+        "title_zh": "销售总监"
+      },
+      "error": null,
+      "filename": "talent_photo_20250721_145849_b47a70cc.jpg",
+      "index": 0,
+      "message": "名片图片解析成功",
+      "minio_path": "http://192.168.3.143:9000/dataops-bucket/talent_photos/talent_photo_20250721_145849_b47a70cc.jpg",
+      "object_key": "talent_photos/talent_photo_20250721_145849_b47a70cc.jpg",
+      "success": true
+    }
+  ],
+  "summary": {
+    "failed_count": 0,
+    "success_count": 1,
+    "success_rate": 100,
+    "total_files": 1
+  }
+}
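
To illustrate how a single webpage person record ends up in the parse_result shape shown above, a small sketch of _convert_webpage_to_card_format with made-up input. Only the field names come from the function in parse_web.py; the values are invented.

# Made-up webpage extraction result; values are invented for illustration.
webpage_person = {
    "name_zh": "李四",
    "hotel_zh": "示例大酒店",
    "hotel_en": "Sample Grand Hotel",
    "title_zh": "总经理",
    "title_en": "General Manager",
    "brand_group": "示例集团",
}

card = _convert_webpage_to_card_format(webpage_person, publish_time="2025-07-21")

# career_path takes publish_time as its date and tags the source, so the record
# slots directly into the results[].data format shown above:
# card["career_path"][0]["date"]    == "2025-07-21"
# card["career_path"][0]["source"]  == "webpage_talent_extraction"
# card["affiliation"][0]["company"] == "示例大酒店"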