浏览代码

修改minio_path
修改origin_source
修改创建Talent节点逻辑
修改杂项minio路径赋值

maxiaolong 4 周之前
父节点
当前提交
94fe6052a9

+ 24 - 8
app/core/data_parse/parse_pic.py

@@ -930,41 +930,53 @@ def batch_process_images(image_paths: List[str], process_type: str = 'table') ->
                         # 为每个人员创建一个结果记录
                         for person_idx, person_data in enumerate(extracted_data):
                             success_count += 1
+                            # 构建完整的MinIO URL路径
+                            relative_path = f"misc_files/{os.path.basename(image_path)}" if image_path else f'misc_files/file_{i}.jpg'
+                            complete_minio_path = f"{minio_url}/{minio_bucket}/{relative_path}"
+                            
                             results.append({
                                 "data": person_data,
                                 "error": None,
                                 "filename": os.path.basename(image_path) if image_path else f'table_file_{i}.jpg',
                                 "index": len(results),  # 使用连续的索引
                                 "message": "表格图片解析成功",
-                                "minio_path": f"table_images/{os.path.basename(image_path)}" if image_path else '',
-                                "object_key": f"table_images/{os.path.basename(image_path)}" if image_path else f'table_images/file_{i}.jpg',
+                                "minio_path": complete_minio_path,
+                                "object_key": relative_path,
                                 "success": True
                             })
                             logging.info(f"成功提取人员 {person_idx+1}: {person_data.get('name_zh', 'Unknown')}")
                     else:
                         # 没有提取到有效数据
                         failed_count += 1
+                        # 构建完整的MinIO URL路径
+                        relative_path = f"misc_files/{os.path.basename(image_path)}" if image_path else f'misc_files/file_{i}.jpg'
+                        complete_minio_path = f"{minio_url}/{minio_bucket}/{relative_path}"
+                        
                         results.append({
                             "data": None,
                             "error": "未从表格图片中提取到人员信息",
                             "filename": os.path.basename(image_path) if image_path else f'table_file_{i}.jpg',
                             "index": i,
                             "message": "表格图片解析失败",
-                            "minio_path": f"table_images/{os.path.basename(image_path)}" if image_path else '',
-                            "object_key": f"table_images/{os.path.basename(image_path)}" if image_path else f'table_images/file_{i}.jpg',
+                            "minio_path": complete_minio_path,
+                            "object_key": relative_path,
                             "success": False
                         })
                         logging.warning(f"第 {i+1} 个文件未提取到人员信息")
                 else:
                     failed_count += 1
+                    # 构建完整的MinIO URL路径
+                    relative_path = f"misc_files/{os.path.basename(image_path)}" if image_path else f'misc_files/file_{i}.jpg'
+                    complete_minio_path = f"{minio_url}/{minio_bucket}/{relative_path}"
+                    
                     results.append({
                         "data": None,
                         "error": result.get('error', '处理失败'),
                         "filename": os.path.basename(image_path) if image_path else f'table_file_{i}.jpg',
                         "index": i,
                         "message": "表格图片解析失败",
-                        "minio_path": f"table_images/{os.path.basename(image_path)}" if image_path else '',
-                        "object_key": f"table_images/{os.path.basename(image_path)}" if image_path else f'table_images/file_{i}.jpg',
+                        "minio_path": complete_minio_path,
+                        "object_key": relative_path,
                         "success": False
                     })
                     logging.error(f"处理第 {i+1} 个文件失败: {result.get('error', '未知错误')}")
@@ -973,14 +985,18 @@ def batch_process_images(image_paths: List[str], process_type: str = 'table') ->
                 failed_count += 1
                 error_msg = f"处理图片失败: {str(item_error)}"
                 logging.error(error_msg, exc_info=True)
+                # 构建完整的MinIO URL路径
+                relative_path = f"misc_files/{os.path.basename(image_path)}" if image_path else f'misc_files/file_{i}.jpg'
+                complete_minio_path = f"{minio_url}/{minio_bucket}/{relative_path}"
+                
                 results.append({
                     "data": None,
                     "error": error_msg,
                     "filename": os.path.basename(image_path) if image_path else f'table_file_{i}.jpg',
                     "index": i,
                     "message": "表格图片解析失败",
-                    "minio_path": f"table_images/{os.path.basename(image_path)}" if image_path else '',
-                    "object_key": f"table_images/{os.path.basename(image_path)}" if image_path else f'table_images/file_{i}.jpg',
+                    "minio_path": complete_minio_path,
+                    "object_key": relative_path,
                     "success": False
                 })
         

+ 75 - 25
app/core/data_parse/parse_resume.py

@@ -60,6 +60,30 @@ def get_minio_client():
         return None
 
 
+def standardize_career_entry(entry):
+    """
+    Normalize one career_path entry to the standard seven-key layout.
+
+    Args:
+        entry: Raw entry data. Anything that is not a dict is treated as
+            an empty entry.
+
+    Returns:
+        dict: A new dict holding exactly the keys date, hotel_en, hotel_zh,
+            image_path, source, title_en and title_zh. A key that is missing
+            or whose value is None gets its default: 'resume_extraction'
+            for source, '' for everything else. Keys outside this set are
+            dropped.
+    """
+    if not isinstance(entry, dict):
+        entry = {}
+
+    # Standard key -> fallback value. dict.get(key, default) alone only
+    # covers a missing key, so an explicit None (e.g. a JSON null from
+    # upstream parsing) would otherwise leak into the standardized entry.
+    defaults = {
+        "date": '',
+        "hotel_en": '',
+        "hotel_zh": '',
+        "image_path": '',
+        "source": 'resume_extraction',
+        "title_en": '',
+        "title_zh": '',
+    }
+
+    return {
+        key: entry[key] if entry.get(key) is not None else default
+        for key, default in defaults.items()
+    }
+
+
 def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
     """
     使用阿里云千问大模型解析简历文本
@@ -108,7 +132,7 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
 16. 籍贯 (native_place) - 出生地或户籍所在地信息
 17. 居住地 (residence) - 个人居住地址信息
 18. 品牌组合 (brand_group) - 如有多个品牌,使用逗号分隔
-19. 职业轨迹 (career_path) - 如能从简历中推断,以JSON数组格式返回,包含当前日期,公司名称和职位。自动生成当前日期
+19. 职业轨迹 (career_path) - 从简历中推断,以JSON数组格式返回,包含日期,公司名称和担任职务
 20. 隶属关系 (affiliation) - 如能从简历中推断,以JSON数组格式返回,包含公司名称和隶属集团名称
 
 ## 输出格式
@@ -189,26 +213,40 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
                 else:
                     parsed_resume[field] = ""
         
-        # 为career_path增加一条记录(如果提取到相关信息)
-        if parsed_resume.get('hotel_zh') or parsed_resume.get('hotel_en') or parsed_resume.get('title_zh') or parsed_resume.get('title_en'):
-            career_entry = {
-                "date": datetime.now().strftime('%Y-%m-%d'),
-                "hotel_en": parsed_resume.get('hotel_en', ''),
-                "hotel_zh": parsed_resume.get('hotel_zh', ''),
-                "image_path": '',
-                "source": 'resume_extraction',
-                "title_en": parsed_resume.get('title_en', ''),
-                "title_zh": parsed_resume.get('title_zh', '')
-            }
-            
-            # 如果原有career_path为空或不是数组,则重新设置
-            if not isinstance(parsed_resume.get('career_path'), list) or not parsed_resume['career_path']:
+        # 处理career_path字段,统一格式化处理
+        
+        # 处理career_path字段
+        career_path = parsed_resume.get('career_path')
+        
+        # 如果career_path为空值或不是数组,用提取信息组合一条记录
+        if not career_path or not isinstance(career_path, list):
+            if parsed_resume.get('hotel_zh') or parsed_resume.get('hotel_en') or parsed_resume.get('title_zh') or parsed_resume.get('title_en'):
+                # 用提取到的信息创建一条记录
+                new_entry = {
+                    "date": datetime.now().strftime('%Y-%m-%d'),
+                    "hotel_en": parsed_resume.get('hotel_en', ''),
+                    "hotel_zh": parsed_resume.get('hotel_zh', ''),
+                    "image_path": '',
+                    "source": 'resume_extraction',
+                    "title_en": parsed_resume.get('title_en', ''),
+                    "title_zh": parsed_resume.get('title_zh', '')
+                }
+                career_entry = standardize_career_entry(new_entry)
                 parsed_resume['career_path'] = [career_entry]
-                logging.info(f"为简历解析结果设置了career_path记录: {career_entry}")
+                logging.info(f"为简历解析结果创建了career_path记录: {career_entry}")
             else:
-                # 如果已有记录,添加到开头
-                parsed_resume['career_path'].insert(0, career_entry)
-                logging.info(f"为简历解析结果添加了career_path记录: {career_entry}")
+                parsed_resume['career_path'] = []
+                logging.info("简历中未提取到职业信息,career_path设为空数组")
+        else:
+            # 如果career_path是数组,对数组中的元素依次处理,统一为标准格式
+            standardized_entries = []
+            for i, entry in enumerate(career_path):
+                standardized_entry = standardize_career_entry(entry)
+                standardized_entries.append(standardized_entry)
+                logging.debug(f"标准化第 {i+1} 个career_path条目: {standardized_entry}")
+            
+            parsed_resume['career_path'] = standardized_entries
+            logging.info(f"标准化了 {len(standardized_entries)} 个career_path条目")
         
         # 为affiliation增加记录(如果提取到公司信息)
         if parsed_resume.get('hotel_zh') or parsed_resume.get('hotel_en'):
@@ -663,27 +701,35 @@ def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
                     }
                     
                     success_count += 1
+                    # 构建完整的MinIO URL路径
+                    relative_path = f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf'
+                    complete_minio_path = f"{minio_url}/{minio_bucket}/{relative_path}"
+                    
                     results.append({
                         "data": standardized_data,
                         "error": None,
                         "filename": _get_filename_from_path(file_path) if file_path else f'resume_{i}.pdf',
                         "index": i,
                         "message": "简历文件解析成功",
-                        "minio_path": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else '',
-                        "object_key": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
+                        "minio_path": complete_minio_path,
+                        "object_key": relative_path,
                         "success": True
                     })
                     logging.info(f"成功处理第 {i+1} 个文件: {_get_filename_from_path(file_path)}")
                 else:
                     failed_count += 1
+                    # 构建完整的MinIO URL路径
+                    relative_path = f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf'
+                    complete_minio_path = f"{minio_url}/{minio_bucket}/{relative_path}"
+                    
                     results.append({
                         "data": None,
                         "error": result.get('error', '处理失败'),
                         "filename": _get_filename_from_path(file_path) if file_path else f'resume_{i}.pdf',
                         "index": i,
                         "message": "简历文件解析失败",
-                        "minio_path": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else '',
-                        "object_key": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
+                        "minio_path": complete_minio_path,
+                        "object_key": relative_path,
                         "success": False
                     })
                     logging.error(f"处理第 {i+1} 个文件失败: {result.get('error', '未知错误')}")
@@ -692,14 +738,18 @@ def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
                 failed_count += 1
                 error_msg = f"处理简历文件失败: {str(item_error)}"
                 logging.error(error_msg, exc_info=True)
+                # 构建完整的MinIO URL路径
+                relative_path = f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf'
+                complete_minio_path = f"{minio_url}/{minio_bucket}/{relative_path}"
+                
                 results.append({
                     "data": None,
                     "error": error_msg,
                     "filename": _get_filename_from_path(file_path) if file_path else f'resume_{i}.pdf',
                     "index": i,
                     "message": "简历文件解析失败",
-                    "minio_path": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else '',
-                    "object_key": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
+                    "minio_path": complete_minio_path,
+                    "object_key": relative_path,
                     "success": False
                 })
         

+ 2 - 2
app/core/data_parse/parse_system.py

@@ -429,7 +429,7 @@ def create_main_card_with_duplicates(extracted_data, minio_path, suspected_dupli
         reason (str): 重复原因描述
         
     Returns:
-        BusinessCard: 创建的主名片记录
+        tuple: (BusinessCard, DuplicateBusinessCard) 创建的主名片记录和重复记录标记
     """
     try:
         # 标准化手机号码
@@ -505,7 +505,7 @@ def create_main_card_with_duplicates(extracted_data, minio_path, suspected_dupli
         
         logging.info(f"成功创建主名片记录 ID: {main_card.id},并标记 {len(suspected_duplicates)} 个疑似重复记录")
         
-        return main_card
+        return main_card, duplicate_record
         
     except Exception as e:
         db.session.rollback()

+ 159 - 6
app/core/data_parse/parse_task.py

@@ -220,7 +220,7 @@ def _handle_recruitment_task(created_by, data=None):
     
     Args:
         created_by (str): 创建者
-        data (str): 招聘数据内容
+        data (str or list): 招聘数据内容,可以是JSON字符串或已解析的列表
         
     Returns:
         dict: 处理结果
@@ -239,7 +239,40 @@ def _handle_recruitment_task(created_by, data=None):
         
         # 将传入的data参数写入task_source字段
         if data:
-            task_source['data'] = data
+            # 如果data是字符串,尝试解析为JSON
+            if isinstance(data, str):
+                try:
+                    data_list = json.loads(data)
+                except json.JSONDecodeError:
+                    # 如果不是有效的JSON,将其作为单个元素处理
+                    data_list = [data]
+            elif isinstance(data, list):
+                data_list = data
+            else:
+                # 其他类型转换为列表
+                data_list = [data]
+            
+            # 为每个数组元素添加指定字段
+            processed_data = []
+            for index, item in enumerate(data_list):
+                # 确保item是字典类型
+                if not isinstance(item, dict):
+                    item = {"original_data": item}
+                
+                # 添加指定字段
+                item.update({
+                    "error": None,
+                    "filename": "",
+                    "index": index,
+                    "message": "",
+                    "minio_path": "",
+                    "object_key": "",
+                    "success": True
+                })
+                
+                processed_data.append(item)
+            
+            task_source['data'] = processed_data
         
         # 创建解析任务记录
         parse_task = ParseTaskRepository(
@@ -257,7 +290,7 @@ def _handle_recruitment_task(created_by, data=None):
         db.session.add(parse_task)
         db.session.commit()
         
-        logging.info(f"成功创建招聘任务记录: {task_name}, 包含data参数: {'是' if data else '否'}")
+        logging.info(f"成功创建招聘任务记录: {task_name}, 处理了 {len(task_source.get('data', []))} 个数据项")
         
         return {
             'code': 200,
@@ -268,7 +301,8 @@ def _handle_recruitment_task(created_by, data=None):
                 'task_summary': {
                     'task_type': '招聘',
                     'description': '数据库记录处理任务',
-                    'requires_files': False
+                    'requires_files': False,
+                    'processed_items': len(task_source.get('data', []))
                 }
             }
         }
@@ -592,6 +626,49 @@ def add_parse_task(files, task_type, created_by='system', data=None, publish_tim
         }
 
 
+def _update_origin_source_with_minio_path(existing_origin_source, minio_path):
+    """
+    Append minio_path to the JSON array stored in origin_source.
+
+    Args:
+        existing_origin_source: Current origin_source content. Handled
+            shapes: None/empty, a JSON string, a non-JSON string (kept as
+            a single element), or an already-decoded list/tuple.
+        minio_path: The path to add. When empty, nothing is changed.
+
+    Returns:
+        str: The updated origin_source as a JSON array string. If
+            minio_path is empty, or the update fails for any reason,
+            existing_origin_source is returned unchanged.
+    """
+    import json
+
+    # Nothing to add: hand the stored value back untouched.
+    if not minio_path:
+        return existing_origin_source
+
+    try:
+        if isinstance(existing_origin_source, (list, tuple)):
+            # Already a sequence: copy it instead of wrapping it, which
+            # would produce a nested array like [[...], minio_path].
+            origin_list = list(existing_origin_source)
+        elif not existing_origin_source:
+            origin_list = []
+        else:
+            try:
+                parsed = json.loads(existing_origin_source)
+            except (json.JSONDecodeError, TypeError):
+                # Not valid JSON (or not a string): keep the existing
+                # content as a single element.
+                parsed = existing_origin_source
+
+            if isinstance(parsed, list):
+                origin_list = parsed
+            elif parsed is None:
+                # A stored JSON null carries no source; do not keep it
+                # as an element of the array.
+                origin_list = []
+            else:
+                origin_list = [parsed]
+
+        # Add the new minio_path only if it is not already present.
+        if minio_path not in origin_list:
+            origin_list.append(minio_path)
+
+        return json.dumps(origin_list, ensure_ascii=False)
+
+    except Exception as e:
+        # Best effort: a failure here must not lose the stored value.
+        logging.error(f"更新origin_source失败: {str(e)}")
+        return existing_origin_source
+
+
 def add_single_talent(talent_data):
     """
     添加单个人才记录(基于add_business_card逻辑,去除MinIO图片上传)
@@ -690,7 +767,9 @@ def add_single_talent(talent_data):
                 existing_card.brand_group = talent_data.get('brand_group', existing_card.brand_group)
                 # 更新image_path字段,从talent_data中获取
                 existing_card.image_path = talent_data.get('image_path', existing_card.image_path)
-                existing_card.origin_source = talent_data.get('origin_source', existing_card.origin_source)
+                # 更新origin_source字段,将minio_path添加到JSON数组中
+                minio_path = talent_data.get('minio_path', '')
+                existing_card.origin_source = _update_origin_source_with_minio_path(existing_card.origin_source, minio_path)
                 existing_card.talent_profile = talent_data.get('talent_profile', existing_card.talent_profile)
                 existing_card.updated_by = 'talent_system'
                 
@@ -703,6 +782,29 @@ def add_single_talent(talent_data):
                 
                 logging.info(f"已更新现有人才记录,ID: {existing_card.id}")
                 
+                # 在Neo4j图数据库中更新Talent节点
+                try:
+                    from app.core.graph.graph_operations import create_or_get_node
+                    from datetime import datetime
+                    
+                    # 创建Talent节点属性
+                    talent_properties = {
+                        'name_zh': existing_card.name_zh,
+                        'name_en': existing_card.name_en,
+                        'mobile': existing_card.mobile,
+                        'email': existing_card.email,
+                        'pg_id': existing_card.id,  # PostgreSQL主记录的ID
+                        'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                    }
+                    
+                    # 在Neo4j中更新或创建Talent节点
+                    neo4j_node_id = create_or_get_node('Talent', **talent_properties)
+                    logging.info(f"成功在Neo4j中更新Talent节点,Neo4j ID: {neo4j_node_id}, PostgreSQL ID: {existing_card.id}")
+                    
+                except Exception as neo4j_error:
+                    logging.error(f"在Neo4j中更新Talent节点失败: {str(neo4j_error)}")
+                    # Neo4j操作失败不影响主流程,继续返回成功结果
+                
                 return {
                     'code': 200,
                     'success': True,
@@ -720,6 +822,34 @@ def add_single_talent(talent_data):
                     duplicate_check['reason']
                 )
                 
+                # 更新origin_source字段,将minio_path添加到JSON数组中
+                minio_path = talent_data.get('minio_path', '')
+                main_card.origin_source = _update_origin_source_with_minio_path(main_card.origin_source, minio_path)
+                db.session.commit()  # 提交origin_source的更新
+                
+                # 在Neo4j图数据库中创建Talent节点
+                try:
+                    from app.core.graph.graph_operations import create_or_get_node
+                    from datetime import datetime
+                    
+                    # 创建Talent节点属性
+                    talent_properties = {
+                        'name_zh': main_card.name_zh,
+                        'name_en': main_card.name_en,
+                        'mobile': main_card.mobile,
+                        'email': main_card.email,
+                        'pg_id': main_card.id,  # PostgreSQL主记录的ID
+                        'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                    }
+                    
+                    # 在Neo4j中创建Talent节点
+                    neo4j_node_id = create_or_get_node('Talent', **talent_properties)
+                    logging.info(f"成功在Neo4j中创建Talent节点,Neo4j ID: {neo4j_node_id}, PostgreSQL ID: {main_card.id}")
+                    
+                except Exception as neo4j_error:
+                    logging.error(f"在Neo4j中创建Talent节点失败: {str(neo4j_error)}")
+                    # Neo4j操作失败不影响主流程,继续返回成功结果
+                
                 return {
                     'code': 202,  # Accepted,表示已接受但需要进一步处理
                     'success': True,
@@ -787,7 +917,7 @@ def add_single_talent(talent_data):
                     image_path=image_path,  # 从talent_data获取图片路径
                     career_path=initial_career_path,
                     brand_group=talent_data.get('brand_group', ''),
-                    origin_source=talent_data.get('origin_source'),
+                    origin_source=json.dumps([talent_data.get('minio_path', '')], ensure_ascii=False) if talent_data.get('minio_path') else None,
                     talent_profile=talent_data.get('talent_profile', ''),
                     status='active',
                     updated_by='talent_system'
@@ -798,6 +928,29 @@ def add_single_talent(talent_data):
                 
                 logging.info(f"人才信息已保存到数据库,ID: {business_card.id}")
                 
+                # 在Neo4j图数据库中创建Talent节点
+                try:
+                    from app.core.graph.graph_operations import create_or_get_node
+                    from datetime import datetime
+                    
+                    # 创建Talent节点属性
+                    talent_properties = {
+                        'name_zh': business_card.name_zh,
+                        'name_en': business_card.name_en,
+                        'mobile': business_card.mobile,
+                        'email': business_card.email,
+                        'pg_id': business_card.id,  # PostgreSQL主记录的ID
+                        'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                    }
+                    
+                    # 在Neo4j中创建Talent节点
+                    neo4j_node_id = create_or_get_node('Talent', **talent_properties)
+                    logging.info(f"成功在Neo4j中创建Talent节点,Neo4j ID: {neo4j_node_id}, PostgreSQL ID: {business_card.id}")
+                    
+                except Exception as neo4j_error:
+                    logging.error(f"在Neo4j中创建Talent节点失败: {str(neo4j_error)}")
+                    # Neo4j操作失败不影响主流程,继续返回成功结果
+                
                 return {
                     'code': 200,
                     'success': True,

+ 30 - 6
app/core/data_parse/parse_web.py

@@ -874,14 +874,22 @@ def batch_process_md(markdown_file_list, publish_time):
                             standardized_data = _convert_webpage_to_card_format(person_data, publish_time)
                             
                             success_count += 1
+                            # 构建完整的MinIO URL路径
+                            if minio_path.startswith('http'):
+                                complete_minio_path = minio_path
+                                object_key = _extract_object_key_from_url(minio_path)
+                            else:
+                                complete_minio_path = f"{minio_url}/{minio_bucket}/{minio_path}"
+                                object_key = minio_path
+                            
                             results.append({
                                 "data": standardized_data,
                                 "error": None,
                                 "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
                                 "index": len(results),  # 使用连续的索引
                                 "message": "网页人才信息解析成功",
-                                "minio_path": minio_path,
-                                "object_key": minio_path,
+                                "minio_path": complete_minio_path,
+                                "object_key": object_key,
                                 "success": True
                             })
                             logging.info(f"成功提取人员 {person_idx+1}: {person_data.get('name_zh', 'Unknown')}")
@@ -889,14 +897,22 @@ def batch_process_md(markdown_file_list, publish_time):
                         # 没有提取到有效数据,这算作一个失败记录
                         total_records += 1
                         failed_count += 1
+                        # 构建完整的MinIO URL路径
+                        if minio_path.startswith('http'):
+                            complete_minio_path = minio_path
+                            object_key = _extract_object_key_from_url(minio_path)
+                        else:
+                            complete_minio_path = f"{minio_url}/{minio_bucket}/{minio_path}"
+                            object_key = minio_path
+                            
                         results.append({
                             "data": None,
                             "error": "未从markdown文件中提取到人员信息",
                             "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
                             "index": len(results),
                             "message": "网页人才信息解析失败",
-                            "minio_path": minio_path,
-                            "object_key": minio_path,
+                            "minio_path": complete_minio_path,
+                            "object_key": object_key,
                             "success": False
                         })
                         logging.warning(f"第 {i+1} 个文件未提取到人员信息")
@@ -905,14 +921,22 @@ def batch_process_md(markdown_file_list, publish_time):
                     total_records += 1
                     failed_count += 1
                     error_msg = file_result.get('message', '处理失败')
+                    # 构建完整的MinIO URL路径
+                    if minio_path.startswith('http'):
+                        complete_minio_path = minio_path
+                        object_key = _extract_object_key_from_url(minio_path)
+                    else:
+                        complete_minio_path = f"{minio_url}/{minio_bucket}/{minio_path}"
+                        object_key = minio_path
+                        
                     results.append({
                         "data": None,
                         "error": error_msg,
                         "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
                         "index": len(results),
                         "message": "网页人才信息解析失败",
-                        "minio_path": minio_path,
-                        "object_key": minio_path,
+                        "minio_path": complete_minio_path,
+                        "object_key": object_key,
                         "success": False
                     })
                     logging.error(f"处理第 {i+1} 个文件失败: {error_msg}")

+ 82 - 0
解析数据格式.txt

@@ -0,0 +1,82 @@
+results:[
+  {
+    "data": {
+      "address_en": "12F, Tower C, The PLACE, No. 150 Zun Yi Road, Shanghai",
+      "address_zh": "上海市遵义路150号虹桥南丰城C座12楼",
+      "affiliation": [
+        {
+          "company": "雅高集团",
+          "group": ""
+        }
+      ],
+      "age": 0,
+      "birthday": "",
+      "brand_group": "",
+      "career_path": [
+        {
+          "date": "2025-07-23",
+          "hotel_en": "ACCOR",
+          "hotel_zh": "雅高集团",
+          "image_path": "http://192.168.3.143:9000/dataops-bucket/talent_photos/talent_photo_20250723_161352_2afa52d0.jpg",
+          "source": "business_card_creation",
+          "title_en": "Director of Development, Luxury & Lifestyle",
+          "title_zh": "奢华及生活时尚品牌发展总监"
+        }
+      ],
+      "email": "Shawn.zhang@accor.com",
+      "hotel_en": "ACCOR",
+      "hotel_zh": "雅高集团",
+      "mobile": "+86(0)138 1140 5768",
+      "name_en": "Shawn Zhang",
+      "name_zh": "张祥胜",
+      "native_place": "",
+      "phone": "+86(0)21 6119 7739",
+      "postal_code_en": "",
+      "postal_code_zh": "",
+      "residence": "",
+      "title_en": "Director of Development, Luxury & Lifestyle",
+      "title_zh": "奢华及生活时尚品牌发展总监"
+    },
+    "minio_path": "http://192.168.3.143:9000/dataops-bucket/talent_photos/talent_photo_20250723_161352_2afa52d0.jpg"
+  },
+  {
+    "data": {
+      "address_en": "No.168 Zhendong Street, Fuli Town, Yangshuo County, Guilin City, Guangxi Province, P. R. China 541905",
+      "address_zh": "中国广西壮族自治区桂林市阳朔县福利镇镇东街168号 541905",
+      "affiliation": [
+        {
+          "company": "Banyan Tree Yangshuo",
+          "group": "Banyan Tree"
+        }
+      ],
+      "age": 0,
+      "birthday": "",
+      "brand_group": "Banyan Tree",
+      "career_path": [
+        {
+          "date": "2025-07-23",
+          "hotel_en": "Banyan Tree Yangshuo",
+          "hotel_zh": "阳朔悦榕庄",
+          "image_path": "http://192.168.3.143:9000/dataops-bucket/talent_photos/talent_photo_20250723_161352_c99f5743.jpg",
+          "source": "business_card_creation",
+          "title_en": "General Manager",
+          "title_zh": "总经理"
+        }
+      ],
+      "email": "James.Zhou@banyantree.com",
+      "hotel_en": "Banyan Tree Yangshuo",
+      "hotel_zh": "阳朔悦榕庄",
+      "mobile": "+86 186 6196 1937",
+      "name_en": "James Zhou",
+      "name_zh": "周猛",
+      "native_place": "",
+      "phone": "+86 773 322 8888 ext.7000",
+      "postal_code_en": "541905",
+      "postal_code_zh": "541905",
+      "residence": "",
+      "title_en": "General Manager",
+      "title_zh": "总经理"
+    },
+    "minio_path": "http://192.168.3.143:9000/dataops-bucket/talent_photos/talent_photo_20250723_161352_c99f5743.jpg"
+  }
+]