
Modify the status update logic for already-stored records
Modify the career path saving logic
Modify the age field saving logic
Modify the MD file splitting logic
Modify the logic for saving id and userId in recruitment records

maxiaolong, 2 weeks ago
Parent
commit bdf8e5e50b

+ 24 - 1
app/api/data_parse/routes.py

@@ -1,5 +1,6 @@
 from flask import jsonify, request, make_response, Blueprint, current_app, send_file
 from datetime import datetime
+import json
 from app.api.data_parse import bp
 from app.core.data_parse.parse_system import (
     update_business_card, 
@@ -1535,7 +1536,12 @@ def add_parse_task_route():
                 # 获取任务ID和任务源数据
                 task_data = result['data']
                 task_id = task_data.get('id')
-                task_source = task_data.get('task_source', [])
+                
+                # 将task_data中的data赋值给task_source
+                task_source = []
+                if data:
+                    # data是JSON数组格式,直接解析
+                    task_source = json.loads(data)
                 
                 if task_id and task_source:
                     logger.info(f"招聘任务创建成功,开始执行批量处理: task_id={task_id}")
@@ -1731,6 +1737,23 @@ def execute_parse_task():
         # 获取任务ID
         task_id = task_data.get('id')
         
+        # 更新parse_task_repository数据库表中的task_source
+        if task_id:
+            try:
+                from app.core.data_parse.parse_system import ParseTaskRepository, db
+                task_record = ParseTaskRepository.query.get(task_id)
+                if task_record:
+                    task_record.task_source = task_source
+                    task_record.updated_at = datetime.now()
+                    task_record.updated_by = 'admin'
+                    db.session.commit()
+                    logger.info(f"已更新task_id为{task_id}的任务记录的task_source")
+                else:
+                    logger.warning(f"未找到task_id为{task_id}的任务记录")
+            except Exception as update_error:
+                logger.error(f"更新任务记录失败: {str(update_error)}")
+                db.session.rollback()
+        
         # 根据任务类型执行相应的处理函数
         try:
             if task_type == '名片':
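One note on the new branch that assigns task_source = json.loads(data): json.loads raises on malformed input, so a defensive wrapper along the following lines may be useful. This is only a sketch; the helper name load_task_source and its fallback-to-empty-list behaviour are assumptions, not part of the commit.

import json
import logging

def load_task_source(data):
    # Hypothetical helper: returns [] instead of raising when the payload is
    # missing or is not valid JSON; wraps a single object in a list so that
    # downstream code always receives a list.
    if not data:
        return []
    try:
        parsed = json.loads(data)
    except (TypeError, ValueError) as exc:
        logging.warning(f"task_source payload is not valid JSON: {exc}")
        return []
    return parsed if isinstance(parsed, list) else [parsed]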

+ 84 - 33
app/core/data_parse/parse_card.py

@@ -532,33 +532,46 @@ def delete_business_card(card_id):
 
 def batch_process_business_card_images(minio_paths_json, task_id=None, task_type=None):
     """
-    批量处理名片图片,从MinIO下载图片并进行解析
+    批量处理名片图片,从parse_task_repository表读取任务记录进行处理
     
     Args:
-        minio_paths_json (list): 包含MinIO对象访问地址的JSON数组,可以是字符串数组或字典数组
-        task_id (str, optional): 任务ID
+        minio_paths_json (list): 包含MinIO对象访问地址的JSON数组(已废弃,现在从数据库读取)
+        task_id (str, optional): 任务ID,用于从数据库读取task_source
         task_type (str, optional): 任务类型
         
     Returns:
         dict: 批量处理结果,包含所有解析结果的数组
     """
     try:
-        logging.info(f"开始批量处理名片图片,共 {len(minio_paths_json)} 个文件")
-        
-        # 参数验证
-        if not minio_paths_json or not isinstance(minio_paths_json, list):
+        # 根据task_id从parse_task_repository表读取记录
+        if not task_id:
             return {
                 'code': 400,
                 'success': False,
-                'message': 'minio_paths_json参数必须是非空数组',
+                'message': '缺少task_id参数',
+                'data': None
+            }
+        
+        # 导入数据库模型
+        from app.core.data_parse.parse_system import ParseTaskRepository, db
+        
+        # 查询对应的任务记录
+        task_record = ParseTaskRepository.query.get(task_id)
+        if not task_record:
+            return {
+                'code': 404,
+                'success': False,
+                'message': f'未找到task_id为{task_id}的任务记录',
                 'data': None
             }
         
-        if len(minio_paths_json) == 0:
+        # 获取task_source作为需要处理的数据列表
+        task_source = task_record.task_source
+        if not task_source or not isinstance(task_source, list):
             return {
                 'code': 400,
                 'success': False,
-                'message': 'MinIO路径数组不能为空',
+                'message': 'task_source为空或格式不正确',
                 'data': None
             }
         
@@ -577,35 +590,28 @@ def batch_process_business_card_images(minio_paths_json, task_id=None, task_type
         failed_count = 0
         parsed_record_ids = []  # 收集成功解析的记录ID
         
-        # 逐一处理每个MinIO路径
-        for i, item in enumerate(minio_paths_json):
+        logging.info(f"开始批量处理名片图片,共 {len(task_source)} 个文件")
+        
+        # 逐一处理每个task_source元素
+        for i, item in enumerate(task_source):
             try:
-                # 处理输入格式:支持字符串或字典格式
-                if isinstance(item, dict):
-                    minio_path = item.get('minio_path')
-                    original_filename = item.get('original_filename', '')
-                    status = item.get('status', '')
-                    if not minio_path:
-                        failed_count += 1
-                        results.append({
-                            'index': i,
-                            'minio_path': str(item),
-                            'success': False,
-                            'error': f'字典中缺少minio_path字段: {item}',
-                            'data': None
-                        })
-                        continue
-                elif isinstance(item, str):
-                    minio_path = item
-                    original_filename = ''
-                    status = ''
-                else:
+                # 检查parse_flag,只有值为1的才需要处理
+                if not isinstance(item, dict) or item.get('parse_flag') != 1:
+                    continue
+                
+                minio_path = item.get('minio_path')
+                original_filename = item.get('original_filename', '')
+                
+                if not minio_path:
                     failed_count += 1
+                    # 更新task_source中对应记录的状态
+                    item['parse_flag'] = 1
+                    item['status'] = '解析失败'
                     results.append({
                         'index': i,
                         'minio_path': str(item),
                         'success': False,
-                        'error': f'不支持的数据格式: {type(item)}',
+                        'error': f'字典中缺少minio_path字段: {item}',
                         'data': None
                     })
                     continue
@@ -688,10 +694,20 @@ def batch_process_business_card_images(minio_paths_json, task_id=None, task_type
                                     if parsed_record and 'id' in parsed_record:
                                         parsed_record_ids.append(str(parsed_record['id']))
                                     logging.info(f"成功记录人才信息到parsed_talents表: {talent_data.get('name_zh', '')}")
+                                    
+                                    # 更新task_source中对应记录的状态
+                                    item['parse_flag'] = 0
+                                    item['status'] = '解析成功'
                                 else:
                                     logging.warning(f"记录人才信息失败: {record_result.get('message', '')}")
+                                    # 更新task_source中对应记录的状态
+                                    item['parse_flag'] = 1
+                                    item['status'] = '解析失败'
                         except Exception as record_error:
                             logging.error(f"调用record_parsed_talent函数失败: {str(record_error)}")
+                            # 更新task_source中对应记录的状态
+                            item['parse_flag'] = 1
+                            item['status'] = '解析失败'
                         
                         success_count += 1
                         results.append({
@@ -707,6 +723,9 @@ def batch_process_business_card_images(minio_paths_json, task_id=None, task_type
                         logging.info(f"成功处理第 {i+1} 个文件: {filename}")
                     else:
                         failed_count += 1
+                        # 更新task_source中对应记录的状态
+                        item['parse_flag'] = 1
+                        item['status'] = '解析失败'
                         results.append({
                             'index': i,
                             'minio_path': minio_path,
@@ -722,6 +741,9 @@ def batch_process_business_card_images(minio_paths_json, task_id=None, task_type
                     failed_count += 1
                     error_msg = f"下载MinIO文件失败: {str(download_error)}"
                     logging.error(error_msg, exc_info=True)
+                    # 更新task_source中对应记录的状态
+                    item['parse_flag'] = 1
+                    item['status'] = '解析失败'
                     results.append({
                         'index': i,
                         'minio_path': minio_path,
@@ -735,6 +757,10 @@ def batch_process_business_card_images(minio_paths_json, task_id=None, task_type
                 failed_count += 1
                 error_msg = f"处理数组元素失败: {str(item_error)}"
                 logging.error(error_msg, exc_info=True)
+                # 更新task_source中对应记录的状态
+                if isinstance(item, dict):
+                    item['parse_flag'] = 1
+                    item['status'] = '解析失败'
                 results.append({
                     'index': i,
                     'minio_path': str(item) if isinstance(item, (str, dict)) else 'unknown',
@@ -743,6 +769,31 @@ def batch_process_business_card_images(minio_paths_json, task_id=None, task_type
                     'data': None
                 })
         
+        # 根据处理结果更新task_status
+        if failed_count == 0:
+            task_status = '解析成功'
+        elif success_count == 0:
+            task_status = '解析失败'
+        else:
+            task_status = '部分解析成功'
+        
+        # 所有task_source记录处理完成后,将更新后的task_source和task_status保存到数据库
+        try:
+            task_record.task_source = task_source
+            task_record.task_status = task_status
+            task_record.parse_count = success_count
+            task_record.parse_result = {
+                'success_count': success_count,
+                'failed_count': failed_count,
+                'parsed_record_ids': parsed_record_ids,
+                'processed_time': datetime.now().isoformat()
+            }
+            db.session.commit()
+            logging.info(f"成功更新task_id为{task_id}的任务记录,task_status={task_status},处理成功{success_count}条,失败{failed_count}条")
+        except Exception as update_error:
+            logging.error(f"更新任务记录失败: {str(update_error)}")
+            db.session.rollback()
+        
         # 组装最终结果
         if failed_count == 0:
             return {
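All four batch processors touched by this commit (parse_card, parse_menduner, parse_pic, parse_resume) now follow the same read-process-persist cycle against the parse_task_repository table. The condensed sketch below shows that cycle; run_task and process_item are illustrative names only, and the per-file parsing details are abstracted behind process_item.

from datetime import datetime
from app.core.data_parse.parse_system import ParseTaskRepository, db

def run_task(task_id, process_item):
    # process_item(item) is assumed to return True on success, False on failure
    task_record = ParseTaskRepository.query.get(task_id)
    if not task_record or not isinstance(task_record.task_source, list):
        return None

    task_source = task_record.task_source
    success_count = failed_count = 0
    for item in task_source:
        # only entries flagged for parsing (parse_flag == 1) are processed
        if not isinstance(item, dict) or item.get('parse_flag') != 1:
            continue
        ok = process_item(item)
        item['parse_flag'] = 0 if ok else 1
        item['status'] = '解析成功' if ok else '解析失败'
        if ok:
            success_count += 1
        else:
            failed_count += 1

    # aggregate task status: all succeeded, all failed, or a mix
    if failed_count == 0:
        task_status = '解析成功'
    elif success_count == 0:
        task_status = '解析失败'
    else:
        task_status = '部分解析成功'

    task_record.task_source = task_source
    task_record.task_status = task_status
    task_record.parse_count = success_count
    task_record.parse_result = {
        'success_count': success_count,
        'failed_count': failed_count,
        'processed_time': datetime.now().isoformat(),
    }
    db.session.commit()
    return task_status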

+ 114 - 45
app/core/data_parse/parse_menduner.py

@@ -160,47 +160,44 @@ def _normalize_talent_to_card_format(raw_profile: Dict[str, Any]) -> Dict[str, A
     Returns:
         Dict[str, Any]: 标准化后的名片格式数据
     """
-    # 提取基本信息
-    name_zh = raw_profile.get('name', raw_profile.get('name_zh', ''))
-    company = raw_profile.get('company', raw_profile.get('hotel_zh', ''))
-    position = raw_profile.get('position', raw_profile.get('title_zh', ''))
-    mobile = raw_profile.get('phone', raw_profile.get('mobile', ''))
+    import json
+    
+    # 从raw_profile中提取基本信息
+    name_zh = raw_profile.get('name_zh', '')
     email = raw_profile.get('email', '')
-    location = raw_profile.get('location', raw_profile.get('address_zh', ''))
+    mobile = raw_profile.get('mobile', '')
+    birthday = raw_profile.get('birthday', '')
+    age = raw_profile.get('age', '')
+    career_path = raw_profile.get('career_path', [])
+    
+    # 从career_path中找到最后一个数组元素,提取hotel_zh和title_zh
+    hotel_zh = ''
+    title_zh = ''
+    if career_path and isinstance(career_path, list) and len(career_path) > 0:
+        last_career = career_path[-1]
+        if isinstance(last_career, dict):
+            hotel_zh = last_career.get('hotel_zh', '')
+            title_zh = last_career.get('title_zh', '')
     
-    # 构建隶属关系
-    affiliation = []
-    if company:
-        affiliation.append({
-            "company": company,
-            "group": raw_profile.get('group', '')
-        })
+    # 从id和userId组合成JSON字符串
+    id_value = raw_profile.get('id', '')
+    userId_value = raw_profile.get('userId', '')
+    id_json = json.dumps({"id": id_value, "userId": userId_value}, ensure_ascii=False)
     
-    # 构建职业轨迹
-    career_path = []
-    if position and company:
-        career_path.append({
-            "date": datetime.now().strftime('%Y-%m-%d'),
-            "hotel_en": raw_profile.get('hotel_en', ''),
-            "hotel_zh": company,
-            "image_path": raw_profile.get('image_path', ''),
-            "source": "menduner_data_creation",
-            "title_en": raw_profile.get('title_en', ''),
-            "title_zh": position
-        })
+    # 直接使用原始career_path
     
     # 按照任务解析结果.txt的data字段格式组装数据
     normalized = {
         "address_en": raw_profile.get('address_en', ''),
-        "address_zh": location,
-        "affiliation": affiliation,
-        "age": raw_profile.get('age', 0),
-        "birthday": raw_profile.get('birthday', ''),
+        "address_zh": raw_profile.get('address_zh', ''),
+        "affiliation": raw_profile.get('affiliation', []),
+        "age": age,
+        "birthday": birthday,
         "brand_group": raw_profile.get('brand_group', ''),
         "career_path": career_path,
         "email": _normalize_email(email),
         "hotel_en": raw_profile.get('hotel_en', ''),
-        "hotel_zh": company,
+        "hotel_zh": hotel_zh,
         "mobile": _normalize_phone(mobile),
         "name_en": raw_profile.get('name_en', ''),
         "name_zh": name_zh,
@@ -210,11 +207,11 @@ def _normalize_talent_to_card_format(raw_profile: Dict[str, Any]) -> Dict[str, A
         "postal_code_zh": raw_profile.get('postal_code_zh', ''),
         "residence": raw_profile.get('residence', ''),
         "title_en": raw_profile.get('title_en', ''),
-        "title_zh": position,
-        "image_path": raw_profile.get('id', ''),
+        "title_zh": title_zh,
+        "image_path": id_json,
         "origin_source": [{
             "task_type": "招聘",
-            "minio_path": raw_profile.get('id', ''),
+            "minio_path": id_json,
             "source_date": datetime.now().strftime('%Y-%m-%d')
         }]
     }
@@ -377,28 +374,34 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]], task_id=None, t
     批量处理门墩儿人才数据
     
     Args:
-        data_list (List[Dict[str, Any]]): 待处理的人才数据列表
-        task_id (str, optional): 任务ID
+        data_list (List[Dict[str, Any]]): 待处理的人才数据列表(已废弃,现在从数据库读取)
+        task_id (str, optional): 任务ID,用于从数据库读取task_source
         task_type (str, optional): 任务类型
         
     Returns:
         Dict[str, Any]: 批量处理结果,格式与parse_result保持一致
     """
     try:
-        # 验证参数
-        if not data_list or not isinstance(data_list, list):
+        # 根据task_id从parse_task_repository表读取记录
+        if not task_id:
             return {
                 "processed_time": datetime.now().isoformat(),
                 "results": [],
                 "summary": {
-                    "failed_count": len(data_list) if data_list else 0,
+                    "failed_count": 0,
                     "success_count": 0,
                     "success_rate": 0,
-                    "total_files": len(data_list) if data_list else 0
-                }
+                    "total_files": 0
+                },
+                "error": "缺少task_id参数"
             }
         
-        if len(data_list) == 0:
+        # 导入数据库模型
+        from app.core.data_parse.parse_system import ParseTaskRepository, db
+        
+        # 查询对应的任务记录
+        task_record = ParseTaskRepository.query.get(task_id)
+        if not task_record:
             return {
                 "processed_time": datetime.now().isoformat(),
                 "results": [],
@@ -407,7 +410,23 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]], task_id=None, t
                     "success_count": 0,
                     "success_rate": 0,
                     "total_files": 0
-                }
+                },
+                "error": f"未找到task_id为{task_id}的任务记录"
+            }
+        
+        # 获取task_source作为需要处理的数据列表
+        task_source = task_record.task_source
+        if not task_source or not isinstance(task_source, list):
+            return {
+                "processed_time": datetime.now().isoformat(),
+                "results": [],
+                "summary": {
+                    "failed_count": 0,
+                    "success_count": 0,
+                    "success_rate": 0,
+                    "total_files": 0
+                },
+                "error": "task_source为空或格式不正确"
             }
         
         results = []
@@ -415,12 +434,12 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]], task_id=None, t
         failed_count = 0
         parsed_record_ids = []  # 收集成功解析的记录ID
         
-        logging.info(f"开始批量处理门墩儿人才数据,共 {len(data_list)} 条记录")
+        logging.info(f"开始批量处理门墩儿人才数据,共 {len(task_source)} 条记录")
         
         # 逐一处理每条数据
-        for i, data in enumerate(data_list):
+        for i, data in enumerate(task_source):
             try:
-                logging.debug(f"处理第 {i+1}/{len(data_list)} 条数据")
+                logging.debug(f"处理第 {i+1}/{len(task_source)} 条数据")
                 
                 # 标准化数据为名片格式
                 normalized = _normalize_talent_to_card_format(data)
@@ -439,10 +458,23 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]], task_id=None, t
                             if parsed_record and 'id' in parsed_record:
                                 parsed_record_ids.append(str(parsed_record['id']))
                             logging.info(f"成功记录人才信息到parsed_talents表: {normalized.get('name_zh', '')}")
+                            
+                            # 更新task_source中对应记录的parse_flag和status
+                            if isinstance(data, dict):
+                                data['parse_flag'] = 0
+                                data['status'] = '解析成功'
                         else:
                             logging.warning(f"记录人才信息失败: {record_result.get('message', '')}")
+                            # 更新task_source中对应记录的parse_flag和status
+                            if isinstance(data, dict):
+                                data['parse_flag'] = 1
+                                data['status'] = '解析失败'
                     except Exception as record_error:
                         logging.error(f"调用record_parsed_talent函数失败: {str(record_error)}")
+                        # 更新task_source中对应记录的parse_flag和status
+                        if isinstance(data, dict):
+                            data['parse_flag'] = 1
+                            data['status'] = '解析失败'
                     
                     success_count += 1
                     results.append({
@@ -459,6 +491,12 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]], task_id=None, t
                 else:
                     failed_count += 1
                     error_messages = validation.get('errors', ['验证失败'])
+                    
+                    # 更新task_source中对应记录的parse_flag和status
+                    if isinstance(data, dict):
+                        data['parse_flag'] = 1
+                        data['status'] = '解析失败'
+                    
                     results.append({
                         "data": None,
                         "error": '; '.join(error_messages),
@@ -475,6 +513,12 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]], task_id=None, t
                 failed_count += 1
                 error_msg = f"处理门墩儿数据失败: {str(item_error)}"
                 logging.error(error_msg, exc_info=True)
+                
+                # 更新task_source中对应记录的parse_flag和status
+                if isinstance(data, dict):
+                    data['parse_flag'] = 1
+                    data['status'] = '解析失败'
+                
                 results.append({
                     "data": None,
                     "error": error_msg,
@@ -486,6 +530,31 @@ def batch_process_menduner_data(data_list: List[Dict[str, Any]], task_id=None, t
                     "success": False
                 })
         
+        # 根据处理结果更新task_status
+        if failed_count == 0:
+            task_status = '解析成功'
+        elif success_count == 0:
+            task_status = '解析失败'
+        else:
+            task_status = '部分解析成功'
+        
+        # 所有task_source记录处理完成后,将更新后的task_source和task_status保存到数据库
+        try:
+            task_record.task_source = task_source
+            task_record.task_status = task_status
+            task_record.parse_count = success_count
+            task_record.parse_result = {
+                'success_count': success_count,
+                'failed_count': failed_count,
+                'parsed_record_ids': parsed_record_ids,
+                'processed_time': datetime.now().isoformat()
+            }
+            db.session.commit()
+            logging.info(f"成功更新task_id为{task_id}的任务记录,task_status={task_status},处理成功{success_count}条,失败{failed_count}条")
+        except Exception as update_error:
+            logging.error(f"更新任务记录失败: {str(update_error)}")
+            db.session.rollback()
+        
         # 组装最终结果
         if failed_count == 0:
             return {
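For illustration, with made-up sample values, the reworked _normalize_talent_to_card_format derives hotel_zh and title_zh from the last career_path entry and packs id and userId into a JSON string that is reused as image_path and as origin_source.minio_path:

import json

raw_profile = {
    "id": "talent_001",        # sample identifiers for illustration
    "userId": "u_42",
    "name_zh": "张三",
    "career_path": [
        {"hotel_zh": "北京万豪酒店", "title_zh": "前台主管"},
        {"hotel_zh": "上海希尔顿酒店", "title_zh": "前厅经理"},
    ],
}

id_json = json.dumps(
    {"id": raw_profile["id"], "userId": raw_profile["userId"]},
    ensure_ascii=False,
)
last_career = raw_profile["career_path"][-1]

print(id_json)                   # {"id": "talent_001", "userId": "u_42"}
print(last_career["hotel_zh"])   # 上海希尔顿酒店 -> hotel_zh
print(last_career["title_zh"])   # 前厅经理 -> title_zh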

+ 110 - 79
app/core/data_parse/parse_pic.py

@@ -828,7 +828,7 @@ def parse_table_with_qwen(base64_image: str) -> List[Dict[str, Any]]:
                 "address_en": '',
                 "address_zh": '',
                 "affiliation": affiliation,
-                "age": 0,
+                "age": '',
                 "birthday": '',
                 "brand_group": '',
                 "career_path": [career_entry],
@@ -860,54 +860,57 @@ def parse_table_with_qwen(base64_image: str) -> List[Dict[str, Any]]:
 
 def batch_process_images(image_paths: List[Any], process_type: str = 'table', task_id=None, task_type=None) -> Dict[str, Any]:
     """
-    批量处理图片
+    批量处理图片,从parse_task_repository表读取任务记录进行处理
     
     Args:
-        image_paths (List[Any]): 图片路径列表,可以是字符串数组或字典数组
+        image_paths (List[Any]): 图片路径列表(已废弃,现在从数据库读取)
         process_type (str): 处理类型,只支持 'table'
-        task_id (str, optional): 任务ID
+        task_id (str, optional): 任务ID,用于从数据库读取task_source
         task_type (str, optional): 任务类型
         
     Returns:
         Dict[str, Any]: 批量处理结果,格式与parse_result保持一致
     """
     try:
-        # 验证处理类型
-        if process_type != 'table':
+        # 根据task_id从parse_task_repository表读取记录
+        if not task_id:
             return {
-                "processed_time": datetime.now().isoformat(),
-                "results": [],
-                "summary": {
-                    "failed_count": len(image_paths),
-                    "success_count": 0,
-                    "success_rate": 0,
-                    "total_files": len(image_paths)
-                }
+                'code': 400,
+                'success': False,
+                'message': '缺少task_id参数',
+                'data': None
             }
         
-        # 验证参数
-        if not image_paths or not isinstance(image_paths, list):
+        # 导入数据库模型
+        from app.core.data_parse.parse_system import ParseTaskRepository, db
+        
+        # 查询对应的任务记录
+        task_record = ParseTaskRepository.query.get(task_id)
+        if not task_record:
             return {
-                "processed_time": datetime.now().isoformat(),
-                "results": [],
-                "summary": {
-                    "failed_count": len(image_paths) if image_paths else 0,
-                    "success_count": 0,
-                    "success_rate": 0,
-                    "total_files": len(image_paths) if image_paths else 0
-                }
+                'code': 404,
+                'success': False,
+                'message': f'未找到task_id为{task_id}的任务记录',
+                'data': None
             }
         
-        if len(image_paths) == 0:
+        # 获取task_source作为需要处理的数据列表
+        task_source = task_record.task_source
+        if not task_source or not isinstance(task_source, list):
             return {
-                "processed_time": datetime.now().isoformat(),
-                "results": [],
-                "summary": {
-                    "failed_count": 0,
-                    "success_count": 0,
-                    "success_rate": 0,
-                    "total_files": 0
-                }
+                'code': 400,
+                'success': False,
+                'message': 'task_source为空或格式不正确',
+                'data': None
+            }
+        
+        # 验证处理类型
+        if process_type != 'table':
+            return {
+                'code': 400,
+                'success': False,
+                'message': 'process_type只支持table类型',
+                'data': None
             }
         
         results = []
@@ -915,55 +918,28 @@ def batch_process_images(image_paths: List[Any], process_type: str = 'table', ta
         failed_count = 0
         parsed_record_ids = []  # 收集成功解析的记录ID
         
-        logging.info(f"开始批量处理图片,共 {len(image_paths)} 个文件")
+        logging.info(f"开始批量处理图片,共 {len(task_source)} 个文件")
         
-        # 逐一处理每个图片路径
-        for i, item in enumerate(image_paths):
+        # 逐一处理每个task_source元素
+        for i, item in enumerate(task_source):
             try:
+                # 检查parse_flag,只有值为1的才需要处理
+                if not isinstance(item, dict) or item.get('parse_flag') != 1:
+                    continue
+                
                 # 处理输入格式:支持字符串或字典格式
-                if isinstance(item, dict):
-                    image_path = item.get('minio_path')
-                    original_filename = item.get('original_filename', '')
-                    status = item.get('status', '')
-                    
-                    # 确保image_path是字符串类型
-                    if not image_path:
-                        failed_count += 1
-                        results.append({
-                            "data": None,
-                            "error": f"字典中缺少minio_path字段: {item}",
-                            "filename": str(item),
-                            "index": i,
-                            "message": "图片路径格式无效",
-                            "minio_path": "",
-                            "object_key": "",
-                            "success": False
-                        })
-                        logging.warning(f"第 {i+1} 个文件缺少minio_path字段")
-                        continue
-                    elif not isinstance(image_path, str):
-                        failed_count += 1
-                        results.append({
-                            "data": None,
-                            "error": f"minio_path字段不是字符串类型: {type(image_path)}",
-                            "filename": str(item),
-                            "index": i,
-                            "message": "图片路径格式无效",
-                            "minio_path": "",
-                            "object_key": "",
-                            "success": False
-                        })
-                        logging.warning(f"第 {i+1} 个文件minio_path字段类型错误")
-                        continue
-                elif isinstance(item, str):
-                    image_path = item
-                    original_filename = ''
-                    status = ''
-                else:
+                image_path = item.get('minio_path')
+                original_filename = item.get('original_filename', '')
+                
+                # 确保image_path是字符串类型
+                if not image_path:
                     failed_count += 1
+                    # 更新task_source中对应记录的状态
+                    item['parse_flag'] = 1
+                    item['status'] = '解析失败'
                     results.append({
                         "data": None,
-                        "error": f"不支持的数据格式: {type(item)}",
+                        "error": f"字典中缺少minio_path字段: {item}",
                         "filename": str(item),
                         "index": i,
                         "message": "图片路径格式无效",
@@ -971,13 +947,30 @@ def batch_process_images(image_paths: List[Any], process_type: str = 'table', ta
                         "object_key": "",
                         "success": False
                     })
-                    logging.warning(f"第 {i+1} 个文件格式无效")
+                    logging.warning(f"第 {i+1} 个文件缺少minio_path字段")
                     continue
-                
+                elif not isinstance(image_path, str):
+                    failed_count += 1
+                    # 更新task_source中对应记录的状态
+                    item['parse_flag'] = 1
+                    item['status'] = '解析失败'
+                    results.append({
+                        "data": None,
+                        "error": f"minio_path字段不是字符串类型: {type(image_path)}",
+                        "filename": str(item),
+                        "index": i,
+                        "message": "图片路径格式无效",
+                        "minio_path": "",
+                        "object_key": "",
+                        "success": False
+                    })
+                    logging.warning(f"第 {i+1} 个文件minio_path字段类型错误")
+                    continue
+                               
                 logging.info(f"处理第 {i+1}/{len(image_paths)} 个文件: {image_path}")
                 
                 # 调用表格处理函数
-                result = parse_table_image(image_path)
+                result = parse_table_image(image_path, task_id)
                 
                 if result.get('success', False):
                     # 提取表格数据并转换为多个人员记录
@@ -1042,9 +1035,16 @@ def batch_process_images(image_paths: List[Any], process_type: str = 'table', ta
                                 "success": True
                             })
                             logging.info(f"成功提取人员 {person_idx+1}: {person_data.get('name_zh', 'Unknown')}")
+                        
+                        # 更新task_source中对应记录的状态(成功处理)
+                        item['parse_flag'] = 0
+                        item['status'] = '解析成功'
                     else:
                         # 没有提取到有效数据
                         failed_count += 1
+                        # 更新task_source中对应记录的状态
+                        item['parse_flag'] = 1
+                        item['status'] = '解析失败'
                         # 构建完整的MinIO URL路径
                         if original_filename:
                             filename = original_filename
@@ -1068,6 +1068,9 @@ def batch_process_images(image_paths: List[Any], process_type: str = 'table', ta
                         logging.warning(f"第 {i+1} 个文件未提取到人员信息")
                 else:
                     failed_count += 1
+                    # 更新task_source中对应记录的状态
+                    item['parse_flag'] = 1
+                    item['status'] = '解析失败'
                     # 构建完整的MinIO URL路径
                     if original_filename:
                         filename = original_filename
@@ -1094,6 +1097,9 @@ def batch_process_images(image_paths: List[Any], process_type: str = 'table', ta
                 failed_count += 1
                 error_msg = f"处理图片失败: {str(item_error)}"
                 logging.error(error_msg, exc_info=True)
+                # 更新task_source中对应记录的状态
+                item['parse_flag'] = 1
+                item['status'] = '解析失败'
                 # 构建完整的MinIO URL路径
                 if original_filename:
                     filename = original_filename
@@ -1115,6 +1121,31 @@ def batch_process_images(image_paths: List[Any], process_type: str = 'table', ta
                     "success": False
                 })
         
+        # 根据处理结果更新task_status
+        if failed_count == 0:
+            task_status = '解析成功'
+        elif success_count == 0:
+            task_status = '解析失败'
+        else:
+            task_status = '部分解析成功'
+        
+        # 所有task_source记录处理完成后,将更新后的task_source和task_status保存到数据库
+        try:
+            task_record.task_source = task_source
+            task_record.task_status = task_status
+            task_record.parse_count = success_count
+            task_record.parse_result = {
+                'success_count': success_count,
+                'failed_count': failed_count,
+                'parsed_record_ids': parsed_record_ids,
+                'processed_time': datetime.now().isoformat()
+            }
+            db.session.commit()
+            logging.info(f"成功更新task_id为{task_id}的任务记录,task_status={task_status},处理成功{success_count}条,失败{failed_count}条")
+        except Exception as update_error:
+            logging.error(f"更新任务记录失败: {str(update_error)}")
+            db.session.rollback()
+        
         # 组装最终结果
         if failed_count == 0:
             return {

+ 96 - 36
app/core/data_parse/parse_resume.py

@@ -153,7 +153,7 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
   "postal_code_zh": "",
   "postal_code_en": "",
   "birthday": "",
-  "age": 0,
+  "age": "",
   "native_place": "",
   "residence": "",
   "brand_group": "",
@@ -209,7 +209,7 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
                 if field in ['career_path', 'affiliation']:
                     parsed_resume[field] = []
                 elif field == 'age':
-                    parsed_resume[field] = 0
+                    parsed_resume[field] = ''
                 else:
                     parsed_resume[field] = ""
         
@@ -625,40 +625,47 @@ def validate_resume_format(file_path: str) -> bool:
 
 def batch_parse_resumes(file_paths: List[str], task_id=None, task_type=None) -> Dict[str, Any]:
     """
-    批量解析简历文件
+    批量解析简历文件,从parse_task_repository表读取任务记录进行处理
     
     Args:
-        file_paths (List[str]): 简历文件路径列表
-        task_id (str, optional): 任务ID
+        file_paths (List[str]): 简历文件路径列表(已废弃,现在从数据库读取)
+        task_id (str, optional): 任务ID,用于从数据库读取task_source
         task_type (str, optional): 任务类型
         
     Returns:
         Dict[str, Any]: 批量解析结果,格式与parse_result保持一致
     """
     try:
-        # 验证参数
-        if not file_paths or not isinstance(file_paths, list):
+        # 根据task_id从parse_task_repository表读取记录
+        if not task_id:
             return {
-                "processed_time": datetime.now().isoformat(),
-                "results": [],
-                "summary": {
-                    "failed_count": len(file_paths) if file_paths else 0,
-                    "success_count": 0,
-                    "success_rate": 0,
-                    "total_files": len(file_paths) if file_paths else 0
-                }
+                'code': 400,
+                'success': False,
+                'message': '缺少task_id参数',
+                'data': None
             }
         
-        if len(file_paths) == 0:
+        # 导入数据库模型
+        from app.core.data_parse.parse_system import ParseTaskRepository, db
+        
+        # 查询对应的任务记录
+        task_record = ParseTaskRepository.query.get(task_id)
+        if not task_record:
             return {
-                "processed_time": datetime.now().isoformat(),
-                "results": [],
-                "summary": {
-                    "failed_count": 0,
-                    "success_count": 0,
-                    "success_rate": 0,
-                    "total_files": 0
-                }
+                'code': 404,
+                'success': False,
+                'message': f'未找到task_id为{task_id}的任务记录',
+                'data': None
+            }
+        
+        # 获取task_source作为需要处理的数据列表
+        task_source = task_record.task_source
+        if not task_source or not isinstance(task_source, list):
+            return {
+                'code': 400,
+                'success': False,
+                'message': 'task_source为空或格式不正确',
+                'data': None
             }
         
         results = []
@@ -666,22 +673,34 @@ def batch_parse_resumes(file_paths: List[str], task_id=None, task_type=None) ->
         failed_count = 0
         parsed_record_ids = []  # 收集成功解析的记录ID
         
-        logging.info(f"开始批量解析简历文件,共 {len(file_paths)} 个文件")
+        logging.info(f"开始批量解析简历文件,共 {len(task_source)} 个文件")
         
-        # 逐一处理每个简历文件
-        for i, file_item in enumerate(file_paths):
+        # 逐一处理每个task_source元素
+        for i, item in enumerate(task_source):
             try:
+                # 检查parse_flag,只有值为1的才需要处理
+                if not isinstance(item, dict) or item.get('parse_flag') != 1:
+                    continue
+                
                 # 从文件项中获取minio_path和original_filename
-                if isinstance(file_item, dict):
-                    minio_path = file_item.get('minio_path', '')
-                    original_filename = file_item.get('original_filename', f'resume_{i}.pdf')
-                    file_status = file_item.get('status', '正常')
-                else:
-                    minio_path = file_item
-                    original_filename = _get_filename_from_path(file_item) if file_item else f'resume_{i}.pdf'
-                    file_status = '正常'
+                minio_path = item.get('minio_path', '')
+                original_filename = item.get('original_filename', f'resume_{i}.pdf')
                 
-                logging.info(f"处理第 {i+1}/{len(file_paths)} 个文件: {file_item}")
+                if not minio_path:
+                    failed_count += 1
+                    # 更新task_source中对应记录的状态
+                    item['parse_flag'] = 1
+                    item['status'] = '解析失败'
+                    results.append({
+                        'index': i,
+                        'minio_path': str(item),
+                        'success': False,
+                        'error': f'字典中缺少minio_path字段: {item}',
+                        'data': None
+                    })
+                    continue
+                
+                logging.info(f"处理第 {i+1}/{len(task_source)} 个文件: {minio_path}")
                 
                 result = parse_resume_file(minio_path)
                 
@@ -741,10 +760,20 @@ def batch_parse_resumes(file_paths: List[str], task_id=None, task_type=None) ->
                             if parsed_record and 'id' in parsed_record:
                                 parsed_record_ids.append(str(parsed_record['id']))
                             logging.info(f"成功记录人才信息到parsed_talents表: {standardized_data.get('name_zh', '')}")
+                            
+                            # 更新task_source中对应记录的状态
+                            item['parse_flag'] = 0
+                            item['status'] = '解析成功'
                         else:
                             logging.warning(f"记录人才信息失败: {record_result.get('message', '')}")
+                            # 更新task_source中对应记录的状态
+                            item['parse_flag'] = 1
+                            item['status'] = '解析失败'
                     except Exception as record_error:
                         logging.error(f"调用record_parsed_talent函数失败: {str(record_error)}")
+                        # 更新task_source中对应记录的状态
+                        item['parse_flag'] = 1
+                        item['status'] = '解析失败'
                     
                     success_count += 1
                     # 构建完整的MinIO URL路径
@@ -764,6 +793,9 @@ def batch_parse_resumes(file_paths: List[str], task_id=None, task_type=None) ->
                     logging.info(f"成功处理第 {i+1} 个文件: {original_filename}")
                 else:
                     failed_count += 1
+                    # 更新task_source中对应记录的状态
+                    item['parse_flag'] = 1
+                    item['status'] = '解析失败'
                     # 构建完整的MinIO URL路径
                     relative_path = f"resume_files/{original_filename}"
                     complete_minio_path = minio_path if minio_path.startswith('http') else f"{minio_url}/{minio_bucket}/{relative_path}"
@@ -784,6 +816,9 @@ def batch_parse_resumes(file_paths: List[str], task_id=None, task_type=None) ->
                 failed_count += 1
                 error_msg = f"处理简历文件失败: {str(item_error)}"
                 logging.error(error_msg, exc_info=True)
+                # 更新task_source中对应记录的状态
+                item['parse_flag'] = 1
+                item['status'] = '解析失败'
                 # 构建完整的MinIO URL路径
                 relative_path = f"resume_files/{original_filename}"
                 complete_minio_path = minio_path if minio_path.startswith('http') else f"{minio_url}/{minio_bucket}/{relative_path}"
@@ -799,6 +834,31 @@ def batch_parse_resumes(file_paths: List[str], task_id=None, task_type=None) ->
                     "success": False
                 })
         
+        # 根据处理结果更新task_status
+        if failed_count == 0:
+            task_status = '解析成功'
+        elif success_count == 0:
+            task_status = '解析失败'
+        else:
+            task_status = '部分解析成功'
+        
+        # 所有task_source记录处理完成后,将更新后的task_source和task_status保存到数据库
+        try:
+            task_record.task_source = task_source
+            task_record.task_status = task_status
+            task_record.parse_count = success_count
+            task_record.parse_result = {
+                'success_count': success_count,
+                'failed_count': failed_count,
+                'parsed_record_ids': parsed_record_ids,
+                'processed_time': datetime.now().isoformat()
+            }
+            db.session.commit()
+            logging.info(f"成功更新task_id为{task_id}的任务记录,task_status={task_status},处理成功{success_count}条,失败{failed_count}条")
+        except Exception as update_error:
+            logging.error(f"更新任务记录失败: {str(update_error)}")
+            db.session.rollback()
+        
         # 组装最终结果
         if failed_count == 0:
             return {

+ 5 - 4
app/core/data_parse/parse_system.py

@@ -632,7 +632,7 @@ def get_business_cards():
     """
     try:
         # 查询所有名片记录,按创建时间倒序排列
-        cards = BusinessCard.query.filter_by(status='active').order_by(BusinessCard.created_at.desc()).all()
+        cards = BusinessCard.query.order_by(BusinessCard.created_at.desc()).all()
         
         # 转换为字典格式
         cards_data = [card.to_dict() for card in cards]
@@ -723,7 +723,8 @@ def update_business_card(card_id, data):
         # 更新字段
         updatable_fields = ['name_zh', 'name_en', 'title_zh', 'title_en', 'mobile', 'phone', 'email',
                            'hotel_zh', 'hotel_en', 'address_zh', 'address_en', 'postal_code_zh', 'postal_code_en',
-                           'brand_zh', 'brand_en', 'affiliation_zh', 'affiliation_en', 'brand_group', 'talent_profile']
+                           'brand_zh', 'brand_en', 'affiliation_zh', 'affiliation_en', 'career_path', 'brand_group', 
+                           'birthday', 'residence', 'age', 'native_place', 'talent_profile']
         
         for field in updatable_fields:
             if field in data and data[field] is not None:
@@ -2018,7 +2019,7 @@ def parse_text_with_qwen25VLplus(image_data):
   "postal_code_zh": "",
   "postal_code_en": "",
   "birthday": "",
-  "age": 0,
+  "age": 25,
   "native_place": "",
   "residence": "",
   "brand_group": "",
@@ -2089,7 +2090,7 @@ def parse_text_with_qwen25VLplus(image_data):
                 if field == 'career_path':
                     extracted_data[field] = []
                 elif field == 'age':
-                    extracted_data[field] = 0
+                    extracted_data[field] = ''
                 else:
                     extracted_data[field] = ""
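A brief usage illustration of the extended updatable_fields list in update_business_card. The payload below is made up, and the setattr-based loop body is an assumption for illustration; the actual assignment code sits outside this hunk.

data = {
    "name_zh": "李四",
    "age": "35",             # age is now handled as a string
    "birthday": "1990-01-01",
    "residence": "上海",
    "career_path": [{"hotel_zh": "广州四季酒店", "title_zh": "销售总监"}],
}

for field in updatable_fields:
    if field in data and data[field] is not None:
        setattr(card, field, data[field])   # card: a BusinessCard instance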
         

+ 320 - 9
app/core/data_parse/parse_task.py

@@ -249,8 +249,15 @@ def _handle_recruitment_task(created_by, data=None):
                 # 其他类型转换为列表
                 data_list = [data]
             
-            # 直接使用原始数据,不添加额外字段
-            task_source = data_list
+            # 对每个元素添加parse_flag和status字段
+            for item in data_list:
+                if isinstance(item, dict):
+                    item['parse_flag'] = 1
+                    item['status'] = '待解析'
+                else:
+                    # 如果不是字典,转换为字典并添加parse_flag和status
+                    item = {'data': item, 'parse_flag': 1, 'status': '待解析'}
+                task_source.append(item)
         
         # 创建解析任务记录
         parse_task = ParseTaskRepository(
@@ -514,13 +521,12 @@ def add_parse_task(files, task_type, created_by='system', data=None, publish_tim
             file_obj = {
                 'original_filename': file_info['original_filename'],
                 'minio_path': file_info['minio_path'],
-                'status': '正常'
+                'status': '待解析',
+                'parse_flag': 1
             }
-            
             # 对于新任命类型,添加publish_time字段
             if task_type == '新任命' and publish_time:
                 file_obj['publish_time'] = publish_time
-            
             task_source.append(file_obj)
         
         # 添加失败的文件信息
@@ -528,13 +534,12 @@ def add_parse_task(files, task_type, created_by='system', data=None, publish_tim
             file_obj = {
                 'original_filename': failed_file['filename'],
                 'minio_path': '',
-                'status': '出错'
+                'status': '上传失败',
+                'parse_flag': 0
             }
-            
             # 对于新任命类型,添加publish_time字段
             if task_type == '新任命' and publish_time:
                 file_obj['publish_time'] = publish_time
-            
             task_source.append(file_obj)
         
         # 创建解析任务记录
@@ -559,6 +564,18 @@ def add_parse_task(files, task_type, created_by='system', data=None, publish_tim
             # 返回成功结果,简化结构
             result_data = parse_task.to_dict()
             
+            # 如果是新任命类型,执行MD文件切分
+            if task_type == '新任命':
+                try:
+                    logging.info(f"开始对新任命任务进行MD文件切分,task_id: {parse_task.id}")
+                    split_result = split_markdown_files(parse_task.id)
+                    if split_result.get('success'):
+                        logging.info(f"MD文件切分成功: {split_result.get('message', '')}")
+                    else:
+                        logging.warning(f"MD文件切分失败: {split_result.get('message', '')}")
+                except Exception as split_error:
+                    logging.error(f"执行MD文件切分时发生错误: {str(split_error)}")
+            
             if len(failed_uploads) > 0:
                 return {
                     'code': 206,  # Partial Content
@@ -1327,4 +1344,298 @@ def record_parsed_talent(talent_data, task_id=None, task_type=None):
             'success': False,
             'message': error_msg,
             'data': None
-        } 
+        } 
+
+
+def split_markdown_files(task_id):
+    """
+    根据task_id在parse_task_repository数据表里查找对应任务记录,
+    遍历task_source中的每个元素,对包含**数字**格式的MD文件进行切分
+    
+    Args:
+        task_id (str): 任务ID
+        
+    Returns:
+        dict: 包含操作结果的字典
+    """
+    try:
+        # 根据task_id查找任务记录
+        task_record = ParseTaskRepository.query.get(task_id)
+        if not task_record:
+            return {
+                'success': False,
+                'message': f'未找到task_id为{task_id}的任务记录',
+                'data': None
+            }
+        
+        task_source = task_record.task_source
+        if not task_source or not isinstance(task_source, list):
+            return {
+                'success': False,
+                'message': 'task_source为空或格式不正确',
+                'data': None
+            }
+        
+        # 获取MinIO客户端
+        minio_client = get_minio_client()
+        if not minio_client:
+            return {
+                'success': False,
+                'message': '无法连接到MinIO服务器',
+                'data': None
+            }
+        
+        success_count = 0
+        failed_count = 0
+        
+        # 遍历task_source中的每个元素
+        for i, item in enumerate(task_source):
+            try:
+                if not isinstance(item, dict):
+                    failed_count += 1
+                    continue
+                
+                minio_path = item.get('minio_path')
+                if not minio_path:
+                    failed_count += 1
+                    continue
+                
+                # 从MinIO下载MD文件
+                try:
+                    # 解析MinIO URL获取对象路径
+                    object_key = _extract_object_key_from_url(minio_path)
+                    if not object_key:
+                        failed_count += 1
+                        continue
+                    
+                    # 下载文件
+                    response = minio_client.get_object(Bucket=minio_bucket, Key=object_key)
+                    file_content = response['Body'].read().decode('utf-8')
+                    
+                    # 检查是否包含**数字**格式
+                    import re
+                    pattern = r'\*\*\d+\*\*'
+                    matches = re.findall(pattern, file_content)
+                    
+                    if not matches:
+                        # 没有找到**数字**格式,跳过处理
+                        continue
+                    
+                    # 获取原始文件名
+                    original_filename = item.get('original_filename', '')
+                    if not original_filename:
+                        # 从minio_path提取文件名
+                        original_filename = object_key.split('/')[-1]
+                    
+                    # 去掉.md扩展名
+                    base_name = original_filename
+                    if base_name.endswith('.md'):
+                        base_name = base_name[:-3]
+                    
+                    # 切分文件
+                    split_parts = _split_markdown_content(file_content)
+                    
+                    if len(split_parts) <= 1:
+                        # 没有成功切分,跳过
+                        continue
+                    
+                    # 为每个切分部分创建新的MD文件
+                    new_items = []
+                    for j, part_content in enumerate(split_parts, 1):
+                        try:
+                            # 生成新文件名
+                            new_filename = f"{base_name}_{j}.md"
+                            
+                            # 上传到MinIO
+                            new_minio_path = _upload_markdown_to_minio(
+                                minio_client, part_content, new_filename
+                            )
+                            
+                            if new_minio_path:
+                                # 创建新的task_source元素
+                                new_item = {
+                                    'status': '待解析',
+                                    'publish_time': item.get('publish_time', ''),
+                                    'parse_flag': 1,
+                                    'minio_path': new_minio_path,
+                                    'original_filename': new_filename
+                                }
+                                new_items.append(new_item)
+                                success_count += 1
+                            else:
+                                failed_count += 1
+                                
+                        except Exception as split_error:
+                            logging.error(f"处理切分文件失败: {str(split_error)}")
+                            failed_count += 1
+                    
+                    # 将新创建的元素添加到task_source
+                    task_source.extend(new_items)
+                    
+                    # 将原始记录的parse_flag设置为0
+                    item['parse_flag'] = 0
+                    
+                except Exception as download_error:
+                    logging.error(f"下载文件失败: {str(download_error)}")
+                    failed_count += 1
+                    
+            except Exception as item_error:
+                logging.error(f"处理task_source元素失败: {str(item_error)}")
+                failed_count += 1
+        
+        # 更新数据库记录
+        try:
+            task_record.task_source = task_source
+            db.session.commit()
+            logging.info(f"成功更新task_id为{task_id}的任务记录,切分成功{success_count}个文件,失败{failed_count}个")
+        except Exception as update_error:
+            logging.error(f"更新任务记录失败: {str(update_error)}")
+            db.session.rollback()
+            return {
+                'success': False,
+                'message': f'更新数据库失败: {str(update_error)}',
+                'data': None
+            }
+        
+        return {
+            'success': True,
+            'message': f'MD文件切分完成,成功{success_count}个,失败{failed_count}个',
+            'data': {
+                'success_count': success_count,
+                'failed_count': failed_count
+            }
+        }
+        
+    except Exception as e:
+        error_msg = f"MD文件切分失败: {str(e)}"
+        logging.error(error_msg, exc_info=True)
+        
+        return {
+            'success': False,
+            'message': error_msg,
+            'data': None
+        }
+
+
+def _split_markdown_content(content):
+    """
+    根据**数字**格式切分MD文件内容
+    
+    Args:
+        content (str): MD文件内容
+        
+    Returns:
+        list: 切分后的内容列表
+    """
+    import re
+    
+    # 查找所有**数字**的位置
+    pattern = r'\*\*\d+\*\*'
+    matches = re.finditer(pattern, content)
+    
+    positions = []
+    for match in matches:
+        positions.append(match.start())
+    
+    if len(positions) < 2:
+        # 至少需要两个标记才能切分
+        return [content]
+    
+    # 切分内容
+    parts = []
+    
+    # 第一个部分:从文件开始到第一个标记
+    first_part = content[:positions[0]].strip()
+    if first_part:
+        parts.append(first_part)
+    
+    # 其余部分:从每个标记到下一个标记,最后一个标记到文件末尾
+    for i, start_pos in enumerate(positions):
+        end_pos = positions[i + 1] if i < len(positions) - 1 else len(content)
+        part_content = content[start_pos:end_pos].strip()
+        if part_content:
+            parts.append(part_content)
+    
+    return parts
+
+
+def _upload_markdown_to_minio(minio_client, content, filename):
+    """
+    将MD文件内容上传到MinIO
+    
+    Args:
+        minio_client: MinIO客户端
+        content (str): 文件内容
+        filename (str): 文件名
+        
+    Returns:
+        str: 上传后的minio_path,失败返回None
+    """
+    try:
+        # 生成唯一的文件名
+        unique_filename = f"{uuid.uuid4().hex}_{filename}"
+        minio_path = f"appointment_files/{unique_filename}"
+        
+        # 上传文件
+        minio_client.put_object(
+            Bucket=minio_bucket,
+            Key=minio_path,
+            Body=content.encode('utf-8'),
+            ContentType='text/markdown'
+        )
+        
+        # 构建完整的minio_path
+        full_minio_path = f"{minio_url}/{minio_bucket}/{minio_path}"
+        
+        logging.info(f"成功上传MD文件到MinIO: {full_minio_path}")
+        return full_minio_path
+        
+    except Exception as e:
+        logging.error(f"上传MD文件到MinIO失败: {str(e)}")
+        return None
+
+
+def _extract_object_key_from_url(minio_url):
+    """
+    从MinIO完整URL中提取对象键名
+    
+    Args:
+        minio_url (str): 完整的MinIO URL
+        
+    Returns:
+        str: 对象键名,失败时返回None
+    """
+    try:
+        if not minio_url or not isinstance(minio_url, str):
+            return None
+            
+        # 移除协议部分 (http:// 或 https://)
+        if minio_url.startswith('https://'):
+            url_without_protocol = minio_url[8:]
+        elif minio_url.startswith('http://'):
+            url_without_protocol = minio_url[7:]
+        else:
+            # 如果没有协议前缀,假设是相对路径
+            url_without_protocol = minio_url
+        
+        # 分割路径部分
+        parts = url_without_protocol.split('/')
+        
+        # 至少需要包含 host:port/bucket/object
+        if len(parts) < 3:
+            return None
+        
+        # 跳过host:port和bucket,获取对象路径
+        object_key = '/'.join(parts[2:])
+        
+        return object_key if object_key else None
+        
+    except Exception as e:
+        logging.error(f"解析MinIO URL失败: {str(e)}")
+        return None 
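A small usage sketch of the new helpers; the sample markdown and MinIO URL are made up:

import re

sample_md = "**1**\n第一条任命内容……\n\n**2**\n第二条任命内容……\n"

# same marker check as split_markdown_files: documents without **数字**
# markers are skipped entirely
markers = re.findall(r'\*\*\d+\*\*', sample_md)
print(markers)  # ['**1**', '**2**']

object_key = _extract_object_key_from_url(
    "http://minio.example.com:9000/dataops/appointment_files/abc_report.md"
)
print(object_key)  # appointment_files/abc_report.md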

+ 1 - 1
app/core/data_parse/parse_web.py

@@ -773,7 +773,7 @@ def _convert_webpage_to_card_format(webpage_data: Dict[str, Any], publish_time:
         "address_en": webpage_data.get('address_en', ''),
         "address_zh": webpage_data.get('address_zh', ''),
         "affiliation": affiliation,
-        "age": webpage_data.get('age', 0),
+        "age": webpage_data.get('age', ''),
         "birthday": webpage_data.get('birthday', ''),
         "brand_group": webpage_data.get('brand_group', ''),
         "career_path": career_path,