il y a 1 jour · dc1f7daf00
--- a/api_documentation_parse_task.md
+++ b/api_documentation_parse_task.md
@@ -0,0 +1,590 @@
 
				+# 解析任务API接口文档
			
 
				+
			
 
				+本文档提供了解析任务相关API接口的详细使用说明，包括创建解析任务、查询任务列表和获取任务详情的完整接口文档。
			
 
				+
			
 
				+## 基础信息
			
 
				+
			
 
				+- **服务器地址**: 
			
 
				+  - 开发环境: `http://localhost:5500`
			
 
				+  - 生产环境: `http://192.168.3.143`
			
 
				+- **API基础路径**: `/api/parse`
			
 
				+- **内容类型**: `application/json`
			
 
				+- **字符编码**: `UTF-8`
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 1. 新增解析任务接口
			
 
				+
			
 
				+### 接口概述
			
 
				+创建新的解析任务，支持多种文件类型上传到MinIO存储。
			
 
				+
			
 
				+### 基本信息
			
 
				+- **URL**: `/api/parse/add-parse-task`
			
 
				+- **HTTP方法**: `POST`
			
 
				+- **内容类型**: `multipart/form-data`
			
 
				+
			
 
				+### 请求参数
			
 
				+
			
 
				+| 参数名 | 类型 | 必填 | 说明 |
			
 
				+|--------|------|------|------|
			
 
				+| `task_type` | String | 是 | 任务类型，可选值：`名片`、`简历`、`新任命`、`招聘`、`杂项` |
			
 
				+| `files` | File[] | 否* | 文件数组（招聘类型不需要文件） |
			
 
				+| `created_by` | String | 否 | 创建者名称，默认为`api_user` |
			
 
				+
			
 
				+*注：除招聘类型外，其他类型必须上传文件
			
 
				+
			
 
				+### 任务类型说明
			
 
				+
			
 
				+| 任务类型 | 支持文件格式 | 存储目录 | 说明 |
			
 
				+|---------|-------------|----------|------|
			
 
				+| 名片 | JPG, PNG | `talent_photos/` | 名片图片解析 |
			
 
				+| 简历 | PDF | `resume_files/` | 简历文档解析 |
			
 
				+| 新任命 | MD | `appointment_files/` | 任命文档解析 |
			
 
				+| 招聘 | 无需文件 | 无 | 数据库记录处理 |
			
 
				+| 杂项 | 任意格式 | `misc_files/` | 其他类型文件 |
			
 
				+
			
 
				+### 请求示例
			
 
				+
			
 
				+#### JavaScript/AJAX示例
			
 
				+```javascript
			
 
				+// 创建FormData对象
			
 
				+const formData = new FormData();
			
 
				+
			
 
				+// 添加任务类型
			
 
				+formData.append('task_type', '名片');
			
 
				+
			
 
				+// 添加文件（多文件上传）
			
 
				+const fileInput = document.getElementById('fileInput');
			
 
				+for (let i = 0; i < fileInput.files.length; i++) {
			
 
				+    formData.append('files', fileInput.files[i]);
			
 
				+}
			
 
				+
			
 
				+// 添加创建者（可选）
			
 
				+formData.append('created_by', 'frontend_user');
			
 
				+
			
 
				+// 发送请求
			
 
				+fetch('/api/parse/add-parse-task', {
			
 
				+    method: 'POST',
			
 
				+    body: formData
			
 
				+})
			
 
				+.then(response => response.json())
			
 
				+.then(data => {
			
 
				+    console.log('上传成功:', data);
			
 
				+})
			
 
				+.catch(error => {
			
 
				+    console.error('上传失败:', error);
			
 
				+});
			
 
				+```
			
 
				+
			
 
				+#### jQuery示例
			
 
				+```javascript
			
 
				+$('#uploadForm').on('submit', function(e) {
			
 
				+    e.preventDefault();
			
 
				+    
			
 
				+    const formData = new FormData();
			
 
				+    formData.append('task_type', $('#taskType').val());
			
 
				+    
			
 
				+    // 添加多个文件
			
 
				+    const files = $('#fileInput')[0].files;
			
 
				+    for (let i = 0; i < files.length; i++) {
			
 
				+        formData.append('files', files[i]);
			
 
				+    }
			
 
				+    
			
 
				+    formData.append('created_by', 'jquery_user');
			
 
				+    
			
 
				+    $.ajax({
			
 
				+        url: '/api/parse/add-parse-task',
			
 
				+        type: 'POST',
			
 
				+        data: formData,
			
 
				+        processData: false,
			
 
				+        contentType: false,
			
 
				+        success: function(response) {
			
 
				+            console.log('任务创建成功:', response);
			
 
				+        },
			
 
				+        error: function(xhr, status, error) {
			
 
				+            console.error('任务创建失败:', error);
			
 
				+        }
			
 
				+    });
			
 
				+});
			
 
				+```
			
 
				+
			
 
				+#### cURL示例
			
 
				+```bash
			
 
				+# 上传名片文件
			
 
				+curl -X POST "http://localhost:5500/api/parse/add-parse-task" \
			
 
				+  -F "task_type=名片" \
			
 
				+  -F "files=@/path/to/business_card1.jpg" \
			
 
				+  -F "files=@/path/to/business_card2.png" \
			
 
				+  -F "created_by=test_user"
			
 
				+
			
 
				+# 创建招聘任务（无需文件）
			
 
				+curl -X POST "http://localhost:5500/api/parse/add-parse-task" \
			
 
				+  -F "task_type=招聘" \
			
 
				+  -F "created_by=hr_user"
			
 
				+```
			
 
				+
			
 
				+### 响应格式
			
 
				+
			
 
				+#### 成功响应 (HTTP 200)
			
 
				+```json
			
 
				+{
			
 
				+    "success": true,
			
 
				+    "message": "解析任务创建成功，所有文件上传完成",
			
 
				+    "data": {
			
 
				+        "task_info": {
			
 
				+            "id": 123,
			
 
				+            "task_name": "parse_task_20250115_a1b2c3d4",
			
 
				+            "task_status": "待解析",
			
 
				+            "task_type": "名片",
			
 
				+            "task_source": "{\"minio_paths_json\":[\"http://192.168.3.143:9000/dataops-bucket/talent_photos/talent_photo_20250115_143012_a1b2c3d4.jpg\"],\"upload_time\":\"2025-01-15T14:30:25.123456\"}",
			
 
				+            "collection_count": 2,
			
 
				+            "parse_count": 0,
			
 
				+            "parse_result": null,
			
 
				+            "created_by": "api_user",
			
 
				+            "updated_by": "api_user",
			
 
				+            "created_at": "2025-01-15T14:30:25.123456",
			
 
				+            "updated_at": "2025-01-15T14:30:25.123456"
			
 
				+        },
			
 
				+        "upload_summary": {
			
 
				+            "task_type": "名片",
			
 
				+            "total_files": 2,
			
 
				+            "uploaded_count": 2,
			
 
				+            "failed_count": 0,
			
 
				+            "uploaded_files": [
			
 
				+                {
			
 
				+                    "original_filename": "business_card1.jpg",
			
 
				+                    "minio_path": "http://192.168.3.143:9000/dataops-bucket/talent_photos/talent_photo_20250115_143012_a1b2c3d4.jpg",
			
 
				+                    "relative_path": "talent_photos/talent_photo_20250115_143012_a1b2c3d4.jpg",
			
 
				+                    "file_size": 256000
			
 
				+                }
			
 
				+            ],
			
 
				+            "failed_uploads": []
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+#### 部分成功响应 (HTTP 206)
			
 
				+```json
			
 
				+{
			
 
				+    "success": true,
			
 
				+    "message": "解析任务创建成功，但有1个文件上传失败",
			
 
				+    "data": {
			
 
				+        "task_info": { /* 任务信息 */ },
			
 
				+        "upload_summary": {
			
 
				+            "task_type": "名片",
			
 
				+            "total_files": 2,
			
 
				+            "uploaded_count": 1,
			
 
				+            "failed_count": 1,
			
 
				+            "uploaded_files": [ /* 成功上传的文件 */ ],
			
 
				+            "failed_uploads": [
			
 
				+                {
			
 
				+                    "filename": "broken_file.jpg",
			
 
				+                    "error": "文件损坏无法上传"
			
 
				+                }
			
 
				+            ]
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+#### 错误响应 (HTTP 400)
			
 
				+```json
			
 
				+{
			
 
				+    "success": false,
			
 
				+    "message": "task_type参数必须是以下值之一：名片、简历、新任命、招聘、杂项",
			
 
				+    "data": null
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+### 状态码说明
			
 
				+
			
 
				+| 状态码 | 说明 |
			
 
				+|--------|------|
			
 
				+| 200 | 所有文件上传成功，任务创建成功 |
			
 
				+| 206 | 部分文件上传成功，任务创建成功 |
			
 
				+| 400 | 请求参数错误（缺少必填参数、文件格式不支持等） |
			
 
				+| 500 | 服务器内部错误 |
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 2. 获取解析任务列表接口
			
 
				+
			
 
				+### 接口概述
			
 
				+分页查询解析任务列表，支持按任务类型和状态过滤。
			
 
				+
			
 
				+### 基本信息
			
 
				+- **URL**: `/api/parse/get-parse-tasks`
			
 
				+- **HTTP方法**: `GET`
			
 
				+- **内容类型**: `application/json`
			
 
				+
			
 
				+### 请求参数
			
 
				+
			
 
				+| 参数名 | 类型 | 必填 | 默认值 | 说明 |
			
 
				+|--------|------|------|--------|------|
			
 
				+| `page` | Integer | 否 | 1 | 页码，从1开始 |
			
 
				+| `per_page` | Integer | 否 | 10 | 每页记录数，最大100 |
			
 
				+| `task_type` | String | 否 | 无 | 任务类型过滤 |
			
 
				+| `task_status` | String | 否 | 无 | 任务状态过滤 |
			
 
				+
			
 
				+### 请求示例
			
 
				+
			
 
				+#### JavaScript/Fetch示例
			
 
				+```javascript
			
 
				+// 基础查询
			
 
				+fetch('/api/parse/get-parse-tasks?page=1&per_page=20')
			
 
				+    .then(response => response.json())
			
 
				+    .then(data => {
			
 
				+        console.log('任务列表:', data);
			
 
				+    });
			
 
				+
			
 
				+// 带过滤条件的查询
			
 
				+const params = new URLSearchParams({
			
 
				+    page: 1,
			
 
				+    per_page: 10,
			
 
				+    task_type: '名片',
			
 
				+    task_status: '待解析'
			
 
				+});
			
 
				+
			
 
				+fetch(`/api/parse/get-parse-tasks?${params}`)
			
 
				+    .then(response => response.json())
			
 
				+    .then(data => {
			
 
				+        console.log('过滤后的任务列表:', data);
			
 
				+    });
			
 
				+```
			
 
				+
			
 
				+#### jQuery示例
			
 
				+```javascript
			
 
				+$.ajax({
			
 
				+    url: '/api/parse/get-parse-tasks',
			
 
				+    type: 'GET',
			
 
				+    data: {
			
 
				+        page: 1,
			
 
				+        per_page: 15,
			
 
				+        task_type: '简历',
			
 
				+        task_status: '解析完成'
			
 
				+    },
			
 
				+    success: function(response) {
			
 
				+        console.log('查询成功:', response);
			
 
				+        // 处理任务列表数据
			
 
				+        if (response.success && response.data.tasks) {
			
 
				+            response.data.tasks.forEach(task => {
			
 
				+                console.log(`任务: ${task.task_name}, 状态: ${task.task_status}`);
			
 
				+            });
			
 
				+        }
			
 
				+    },
			
 
				+    error: function(xhr, status, error) {
			
 
				+        console.error('查询失败:', error);
			
 
				+    }
			
 
				+});
			
 
				+```
			
 
				+
			
 
				+#### cURL示例
			
 
				+```bash
			
 
				+# 基础查询
			
 
				+curl "http://localhost:5500/api/parse/get-parse-tasks?page=1&per_page=10"
			
 
				+
			
 
				+# 带过滤条件查询
			
 
				+curl -G "http://localhost:5500/api/parse/get-parse-tasks" --data-urlencode "page=1" --data-urlencode "per_page=20" --data-urlencode "task_type=名片" --data-urlencode "task_status=待解析"
			
 
				+```
			
 
				+
			
 
				+### 响应格式
			
 
				+
			
 
				+#### 成功响应 (HTTP 200)
			
 
				+```json
			
 
				+{
			
 
				+    "success": true,
			
 
				+    "message": "获取解析任务列表成功",
			
 
				+    "data": {
			
 
				+        "tasks": [
			
 
				+            {
			
 
				+                "id": 123,
			
 
				+                "task_name": "parse_task_20250115_a1b2c3d4",
			
 
				+                "task_status": "待解析",
			
 
				+                "task_type": "名片",
			
 
				+                "task_source": "{\"minio_paths_json\":[\"http://192.168.3.143:9000/dataops-bucket/talent_photos/file1.jpg\"],\"upload_time\":\"2025-01-15T14:30:25.123456\"}",
			
 
				+                "collection_count": 2,
			
 
				+                "parse_count": 0,
			
 
				+                "parse_result": null,
			
 
				+                "created_by": "api_user",
			
 
				+                "updated_by": "api_user",
			
 
				+                "created_at": "2025-01-15T14:30:25.123456",
			
 
				+                "updated_at": "2025-01-15T14:30:25.123456"
			
 
				+            }
			
 
				+        ],
			
 
				+        "pagination": {
			
 
				+            "page": 1,
			
 
				+            "per_page": 10,
			
 
				+            "total": 50,
			
 
				+            "pages": 5,
			
 
				+            "has_next": true,
			
 
				+            "has_prev": false
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+#### 错误响应 (HTTP 400)
			
 
				+```json
			
 
				+{
			
 
				+    "success": false,
			
 
				+    "message": "分页参数错误",
			
 
				+    "data": null
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+### 状态码说明
			
 
				+
			
 
				+| 状态码 | 说明 |
			
 
				+|--------|------|
			
 
				+| 200 | 查询成功 |
			
 
				+| 400 | 请求参数错误 |
			
 
				+| 500 | 服务器内部错误 |
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 3. 获取解析任务详情接口
			
 
				+
			
 
				+### 接口概述
			
 
				+根据任务名称获取指定解析任务的详细信息。
			
 
				+
			
 
				+### 基本信息
			
 
				+- **URL**: `/api/parse/get-parse-task-detail`
			
 
				+- **HTTP方法**: `GET`
			
 
				+- **内容类型**: `application/json`
			
 
				+
			
 
				+### 请求参数
			
 
				+
			
 
				+| 参数名 | 类型 | 必填 | 说明 |
			
 
				+|--------|------|------|------|
			
 
				+| `task_name` | String | 是 | 任务名称 |
			
 
				+
			
 
				+### 请求示例
			
 
				+
			
 
				+#### JavaScript/Fetch示例
			
 
				+```javascript
			
 
				+// 获取任务详情
			
 
				+const taskName = 'parse_task_20250115_a1b2c3d4';
			
 
				+fetch(`/api/parse/get-parse-task-detail?task_name=${encodeURIComponent(taskName)}`)
			
 
				+    .then(response => response.json())
			
 
				+    .then(data => {
			
 
				+        if (data.success) {
			
 
				+            console.log('任务详情:', data.data);
			
 
				+            // 解析任务来源信息
			
 
				+            const taskSource = JSON.parse(data.data.task_source);
			
 
				+            console.log('MinIO文件路径:', taskSource.minio_paths_json);
			
 
				+        }
			
 
				+    });
			
 
				+```
			
 
				+
			
 
				+#### jQuery示例
			
 
				+```javascript
			
 
				+function getTaskDetail(taskName) {
			
 
				+    $.ajax({
			
 
				+        url: '/api/parse/get-parse-task-detail',
			
 
				+        type: 'GET',
			
 
				+        data: { task_name: taskName },
			
 
				+        success: function(response) {
			
 
				+            if (response.success) {
			
 
				+                const task = response.data;
			
 
				+                console.log('任务详情:', task);
			
 
				+                
			
 
				+                // 解析文件路径
			
 
				+                const taskSource = JSON.parse(task.task_source);
			
 
				+                const filePaths = taskSource.minio_paths_json;
			
 
				+                
			
 
				+                // 显示文件列表
			
 
				+                filePaths.forEach((path, index) => {
			
 
				+                    console.log(`文件${index + 1}: ${path}`);
			
 
				+                });
			
 
				+            }
			
 
				+        },
			
 
				+        error: function(xhr, status, error) {
			
 
				+            console.error('获取任务详情失败:', error);
			
 
				+        }
			
 
				+    });
			
 
				+}
			
 
				+
			
 
				+// 使用示例
			
 
				+getTaskDetail('parse_task_20250115_a1b2c3d4');
			
 
				+```
			
 
				+
			
 
				+#### cURL示例
			
 
				+```bash
			
 
				+# 获取任务详情
			
 
				+curl "http://localhost:5500/api/parse/get-parse-task-detail?task_name=parse_task_20250115_a1b2c3d4"
			
 
				+```
			
 
				+
			
 
				+### 响应格式
			
 
				+
			
 
				+#### 成功响应 (HTTP 200)
			
 
				+```json
			
 
				+{
			
 
				+    "success": true,
			
 
				+    "message": "成功获取任务 parse_task_20250115_a1b2c3d4 的详细信息",
			
 
				+    "data": {
			
 
				+        "id": 123,
			
 
				+        "task_name": "parse_task_20250115_a1b2c3d4",
			
 
				+        "task_status": "解析完成",
			
 
				+        "task_type": "名片",
			
 
				+        "task_source": "{\"minio_paths_json\":[\"http://192.168.3.143:9000/dataops-bucket/talent_photos/talent_photo_20250115_143012_a1b2c3d4.jpg\",\"http://192.168.3.143:9000/dataops-bucket/talent_photos/talent_photo_20250115_143015_b2c3d4e5.jpg\"],\"upload_time\":\"2025-01-15T14:30:25.123456\"}",
			
 
				+        "collection_count": 2,
			
 
				+        "parse_count": 2,
			
 
				+        "parse_result": "{\"parsed_cards\":[{\"name\":\"张三\",\"company\":\"ABC公司\",\"position\":\"技术总监\"}]}",
			
 
				+        "created_by": "api_user",
			
 
				+        "updated_by": "api_user",
			
 
				+        "created_at": "2025-01-15T14:30:25.123456",
			
 
				+        "updated_at": "2025-01-15T15:45:30.789012"
			
 
				+    }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+#### 错误响应 (HTTP 400)
			
 
				+```json
			
 
				+{
			
 
				+    "success": false,
			
 
				+    "message": "任务名称参数不能为空",
			
 
				+    "data": null
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+#### 错误响应 (HTTP 404)
			
 
				+```json
			
 
				+{
			
 
				+    "success": false,
			
 
				+    "message": "未找到任务名称为 invalid_task_name 的记录",
			
 
				+    "data": null
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+### 状态码说明
			
 
				+
			
 
				+| 状态码 | 说明 |
			
 
				+|--------|------|
			
 
				+| 200 | 查询成功 |
			
 
				+| 400 | 请求参数错误 |
			
 
				+| 404 | 任务不存在 |
			
 
				+| 500 | 服务器内部错误 |
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 测试数据示例
			
 
				+
			
 
				+### 测试用文件准备
			
 
				+
			
 
				+```bash
			
 
				+# 创建测试文件目录
			
 
				+mkdir -p test_files
			
 
				+
			
 
				+# 准备名片图片文件
			
 
				+cp sample_business_card.jpg test_files/
			
 
				+cp sample_business_card.png test_files/
			
 
				+
			
 
				+# 准备简历PDF文件
			
 
				+cp sample_resume.pdf test_files/
			
 
				+
			
 
				+# 准备任命MD文件
			
 
				+printf '# 新任命通知\n\n## 任命信息\n- 姓名：张三\n- 职位：技术总监\n' > test_files/appointment.md
			
 
				+```
			
 
				+
			
 
				+### 完整测试流程
			
 
				+
			
 
				+```javascript
			
 
				+// 1. 创建名片解析任务
			
 
				+async function testCreateTask() {
			
 
				+    const formData = new FormData();
			
 
				+    formData.append('task_type', '名片');
			
 
				+    formData.append('files', document.querySelector('#fileInput').files[0]);
			
 
				+    formData.append('created_by', 'test_user');
			
 
				+    
			
 
				+    const response = await fetch('/api/parse/add-parse-task', {
			
 
				+        method: 'POST',
			
 
				+        body: formData
			
 
				+    });
			
 
				+    
			
 
				+    const result = await response.json();
			
 
				+    console.log('任务创建结果:', result);
			
 
				+    
			
 
				+    if (result.success) {
			
 
				+        const taskName = result.data.task_info.task_name;
			
 
				+        return taskName;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// 2. 查询任务列表
			
 
				+async function testGetTasks() {
			
 
				+    const response = await fetch('/api/parse/get-parse-tasks?page=1&per_page=10&task_type=名片');
			
 
				+    const result = await response.json();
			
 
				+    console.log('任务列表:', result);
			
 
				+}
			
 
				+
			
 
				+// 3. 获取任务详情
			
 
				+async function testGetTaskDetail(taskName) {
			
 
				+    const response = await fetch(`/api/parse/get-parse-task-detail?task_name=${encodeURIComponent(taskName)}`);
			
 
				+    const result = await response.json();
			
 
				+    console.log('任务详情:', result);
			
 
				+}
			
 
				+
			
 
				+// 完整测试
			
 
				+async function runFullTest() {
			
 
				+    try {
			
 
				+        const taskName = await testCreateTask();
			
 
				+        if (taskName) {
			
 
				+            await testGetTasks();
			
 
				+            await testGetTaskDetail(taskName);
			
 
				+        }
			
 
				+    } catch (error) {
			
 
				+        console.error('测试失败:', error);
			
 
				+    }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 常见问题与解决方案
			
 
				+
			
 
				+### 1. 文件上传失败
			
 
				+**问题**: 上传文件时返回400错误
			
 
				+**解决方案**: 
			
 
				+- 检查文件格式是否符合任务类型要求
			
 
				+- 确认文件大小不超过限制
			
 
				+- 验证`task_type`参数值是否正确
			
 
				+
			
 
				+### 2. 任务查询为空
			
 
				+**问题**: 查询任务列表返回空数组
			
 
				+**解决方案**:
			
 
				+- 确认数据库中有对应的任务记录
			
 
				+- 检查过滤条件是否正确
			
 
				+- 验证分页参数是否合理
			
 
				+
			
 
				+### 3. MinIO路径无法访问
			
 
				+**问题**: 返回的MinIO路径无法直接访问
			
 
				+**解决方案**:
			
 
				+- 确认MinIO服务器配置正确
			
 
				+- 检查网络连接和防火墙设置
			
 
				+- 验证MinIO访问权限配置
			
 
				+
			
 
				+### 4. 任务状态更新
			
 
				+**问题**: 如何更新任务状态
			
 
				+**解决方案**:
			
 
				+- 使用其他API接口更新任务状态
			
 
				+- 通过后台程序自动更新
			
 
				+- 检查解析进度和结果
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 版本信息
			
 
				+
			
 
				+- **文档版本**: v1.0
			
 
				+- **API版本**: v1.0
			
 
				+- **最后更新**: 2025-07-15
			
 
				+- **维护者**: DataOps团队
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 联系方式
			
 
				+
			
 
				+如有疑问或需要技术支持，请联系：
			
 
				+- **开发团队**: dataops-dev@company.com
			
 
				+- **技术文档**: [内部文档链接]
			
 
				+- **问题反馈**: [GitHub Issues] 
			
--- a/app/core/data_parse/parse_card.py
+++ b/app/core/data_parse/parse_card.py
@@ -6,6 +6,9 @@ import boto3
 
				 from botocore.config import Config
			
 
				 import logging
			
 
				 import uuid
			
 
				+import json
			
 
				+from io import BytesIO
			
 
				+from werkzeug.datastructures import FileStorage
			
 
				 from app.config.config import DevelopmentConfig, ProductionConfig
			
 
				 
			
 
				 # 导入原有的函数和模型
			
@@ -510,3 +513,261 @@ def delete_business_card(card_id):
 
				             'message': error_msg,
			
 
				             'data': None
			
 
				         }
			
 
				+
			
 
				+
			
 
				+def batch_process_business_card_images(minio_paths_json):
			
 
				+    """
			
 
				+    批量处理名片图片，从MinIO下载图片并进行解析
			
 
				+    
			
 
				+    Args:
			
 
				+        minio_paths_json (list): 包含MinIO对象访问地址的JSON数组
			
 
				+        
			
 
				+    Returns:
			
 
				+        dict: 批量处理结果，包含所有解析结果的数组
			
 
				+    """
			
 
				+    try:
			
 
				+        logging.info(f"开始批量处理名片图片，共 {len(minio_paths_json)} 个文件")
			
 
				+        
			
 
				+        # 参数验证
			
 
				+        if not minio_paths_json or not isinstance(minio_paths_json, list):
			
 
				+            return {
			
 
				+                'code': 400,
			
 
				+                'success': False,
			
 
				+                'message': 'minio_paths_json参数必须是非空数组',
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        if len(minio_paths_json) == 0:
			
 
				+            return {
			
 
				+                'code': 400,
			
 
				+                'success': False,
			
 
				+                'message': 'MinIO路径数组不能为空',
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        # 获取MinIO客户端
			
 
				+        minio_client = get_minio_client()
			
 
				+        if not minio_client:
			
 
				+            return {
			
 
				+                'code': 500,
			
 
				+                'success': False,
			
 
				+                'message': '无法连接到MinIO服务器',
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        results = []
			
 
				+        success_count = 0
			
 
				+        failed_count = 0
			
 
				+        
			
 
				+        # 逐一处理每个MinIO路径
			
 
				+        for i, minio_path in enumerate(minio_paths_json):
			
 
				+            try:
			
 
				+                logging.info(f"处理第 {i+1}/{len(minio_paths_json)} 个文件: {minio_path}")
			
 
				+                
			
 
				+                # 解析MinIO URL获取对象路径
			
 
				+                object_key = _extract_object_key_from_url(minio_path)
			
 
				+                if not object_key:
			
 
				+                    failed_count += 1
			
 
				+                    results.append({
			
 
				+                        'index': i,
			
 
				+                        'minio_path': minio_path,
			
 
				+                        'success': False,
			
 
				+                        'error': f'无效的MinIO URL格式: {minio_path}',
			
 
				+                        'data': None
			
 
				+                    })
			
 
				+                    continue
			
 
				+                
			
 
				+                # 从MinIO下载图片文件
			
 
				+                try:
			
 
				+                    logging.info(f"从MinIO下载文件: {object_key}")
			
 
				+                    response = minio_client.get_object(Bucket=minio_bucket, Key=object_key)
			
 
				+                    image_data = response['Body'].read()
			
 
				+                    
			
 
				+                    if len(image_data) == 0:
			
 
				+                        failed_count += 1
			
 
				+                        results.append({
			
 
				+                            'index': i,
			
 
				+                            'minio_path': minio_path,
			
 
				+                            'success': False,
			
 
				+                            'error': '下载的图片数据为空',
			
 
				+                            'data': None
			
 
				+                        })
			
 
				+                        continue
			
 
				+                    
			
 
				+                    # 获取文件名和内容类型
			
 
				+                    filename = object_key.split('/')[-1]
			
 
				+                    content_type = _get_content_type_by_filename(filename)
			
 
				+                    
			
 
				+                    # 创建FileStorage对象模拟上传的文件
			
 
				+                    image_stream = BytesIO(image_data)
			
 
				+                    file_storage = FileStorage(
			
 
				+                        stream=image_stream,
			
 
				+                        filename=filename,
			
 
				+                        content_type=content_type
			
 
				+                    )
			
 
				+                    
			
 
				+                    # 调用process_business_card_image函数处理图片
			
 
				+                    process_result = process_business_card_image(file_storage)
			
 
				+                    
			
 
				+                    if process_result.get('success', False):
			
 
				+                        success_count += 1
			
 
				+                        results.append({
			
 
				+                            'index': i,
			
 
				+                            'minio_path': minio_path,
			
 
				+                            'object_key': object_key,
			
 
				+                            'filename': filename,
			
 
				+                            'success': True,
			
 
				+                            'error': None,
			
 
				+                            'data': process_result.get('data'),
			
 
				+                            'message': process_result.get('message', '处理成功')
			
 
				+                        })
			
 
				+                        logging.info(f"成功处理第 {i+1} 个文件: {filename}")
			
 
				+                    else:
			
 
				+                        failed_count += 1
			
 
				+                        results.append({
			
 
				+                            'index': i,
			
 
				+                            'minio_path': minio_path,
			
 
				+                            'object_key': object_key,
			
 
				+                            'filename': filename,
			
 
				+                            'success': False,
			
 
				+                            'error': process_result.get('message', '处理失败'),
			
 
				+                            'data': None
			
 
				+                        })
			
 
				+                        logging.error(f"处理第 {i+1} 个文件失败: {process_result.get('message', '未知错误')}")
			
 
				+                    
			
 
				+                except Exception as download_error:
			
 
				+                    failed_count += 1
			
 
				+                    error_msg = f"下载MinIO文件失败: {str(download_error)}"
			
 
				+                    logging.error(error_msg, exc_info=True)
			
 
				+                    results.append({
			
 
				+                        'index': i,
			
 
				+                        'minio_path': minio_path,
			
 
				+                        'object_key': object_key,
			
 
				+                        'success': False,
			
 
				+                        'error': error_msg,
			
 
				+                        'data': None
			
 
				+                    })
			
 
				+                    
			
 
				+            except Exception as item_error:
			
 
				+                failed_count += 1
			
 
				+                error_msg = f"处理数组元素失败: {str(item_error)}"
			
 
				+                logging.error(error_msg, exc_info=True)
			
 
				+                results.append({
			
 
				+                    'index': i,
			
 
				+                    'minio_path': minio_path,
			
 
				+                    'success': False,
			
 
				+                    'error': error_msg,
			
 
				+                    'data': None
			
 
				+                })
			
 
				+        
			
 
				+        # 组装最终结果
			
 
				+        batch_result = {
			
 
				+            'summary': {
			
 
				+                'total_files': len(minio_paths_json),
			
 
				+                'success_count': success_count,
			
 
				+                'failed_count': failed_count,
			
 
				+                'success_rate': round((success_count / len(minio_paths_json)) * 100, 2) if len(minio_paths_json) > 0 else 0
			
 
				+            },
			
 
				+            'results': results,
			
 
				+            'processed_time': datetime.now().isoformat()
			
 
				+        }
			
 
				+        
			
 
				+        if failed_count == 0:
			
 
				+            return {
			
 
				+                'code': 200,
			
 
				+                'success': True,
			
 
				+                'message': f'批量处理完成，全部 {success_count} 个文件处理成功',
			
 
				+                'data': batch_result
			
 
				+            }
			
 
				+        elif success_count == 0:
			
 
				+            return {
			
 
				+                'code': 500,
			
 
				+                'success': False,
			
 
				+                'message': f'批量处理失败，全部 {failed_count} 个文件处理失败',
			
 
				+                'data': batch_result
			
 
				+            }
			
 
				+        else:
			
 
				+            return {
			
 
				+                'code': 206,  # Partial Content
			
 
				+                'success': True,
			
 
				+                'message': f'批量处理部分成功，成功 {success_count} 个，失败 {failed_count} 个',
			
 
				+                'data': batch_result
			
 
				+            }
			
 
				+            
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"批量处理名片图片失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        
			
 
				+        return {
			
 
				+            'code': 500,
			
 
				+            'success': False,
			
 
				+            'message': error_msg,
			
 
				+            'data': None
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def _extract_object_key_from_url(minio_url):
			
 
				+    """
			
 
				+    从MinIO完整URL中提取对象键名
			
 
				+    
			
 
				+    Args:
			
 
				+        minio_url (str): 完整的MinIO URL，如 "http://host:port/bucket/path/to/file.jpg"
			
 
				+        
			
 
				+    Returns:
			
 
				+        str: 对象键名，如 "path/to/file.jpg"，失败时返回None
			
 
				+    """
			
 
				+    try:
			
 
				+        if not minio_url or not isinstance(minio_url, str):
			
 
				+            return None
			
 
				+            
			
 
				+        # 移除协议部分 (http:// 或 https://)
			
 
				+        if minio_url.startswith('https://'):
			
 
				+            url_without_protocol = minio_url[8:]
			
 
				+        elif minio_url.startswith('http://'):
			
 
				+            url_without_protocol = minio_url[7:]
			
 
				+        else:
			
 
				+            # 如果没有协议前缀，假设是相对路径
			
 
				+            url_without_protocol = minio_url
			
 
				+        
			
 
				+        # 分割路径部分
			
 
				+        parts = url_without_protocol.split('/')
			
 
				+        
			
 
				+        # 至少需要包含 host:port/bucket/object
			
 
				+        if len(parts) < 3:
			
 
				+            return None
			
 
				+        
			
 
				+        # 跳过host:port和bucket，获取对象路径
			
 
				+        object_key = '/'.join(parts[2:])
			
 
				+        
			
 
				+        return object_key if object_key else None
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"解析MinIO URL失败: {str(e)}")
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def _get_content_type_by_filename(filename):
			
 
				+    """
			
 
				+    根据文件名获取内容类型
			
 
				+    
			
 
				+    Args:
			
 
				+        filename (str): 文件名
			
 
				+        
			
 
				+    Returns:
			
 
				+        str: 内容类型
			
 
				+    """
			
 
				+    if not filename:
			
 
				+        return 'application/octet-stream'
			
 
				+    
			
 
				+    file_ext = filename.lower().split('.')[-1] if '.' in filename else ''
			
 
				+    
			
 
				+    content_type_mapping = {
			
 
				+        'jpg': 'image/jpeg',
			
 
				+        'jpeg': 'image/jpeg',
			
 
				+        'png': 'image/png',
			
 
				+        'gif': 'image/gif',
			
 
				+        'bmp': 'image/bmp',
			
 
				+        'webp': 'image/webp'
			
 
				+    }
			
 
				+    
			
 
				+    return content_type_mapping.get(file_ext, 'image/jpeg')  # 默认为JPEG图片
			
--- a/app/core/data_parse/parse_menduner.py
+++ b/app/core/data_parse/parse_menduner.py
@@ -0,0 +1,357 @@
 
				+"""
			
 
				+门墩儿数据解析模块
			
 
				+
			
 
				+该模块提供门墩儿人才数据的解析和处理功能。
			
 
				+"""
			
 
				+
			
 
				+import logging
			
 
				+from datetime import datetime
			
 
				+import json
			
 
				+import os
			
 
				+from typing import Dict, Any, Optional, List
			
 
				+import re
			
 
				+
			
 
				+
			
 
				+def parse_menduner_data(data_source: str, data_type: str = 'json') -> Dict[str, Any]:
			
 
				+    """
			
 
				+    解析门墩儿人才数据
			
 
				+    
			
 
				+    Args:
			
 
				+        data_source (str): 数据源（文件路径或JSON字符串）
			
 
				+        data_type (str): 数据类型，可选值：'json', 'file', 'api'
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 解析结果
			
 
				+    """
			
 
				+    try:
			
 
				+        logging.info(f"开始解析门墩儿数据，类型: {data_type}")
			
 
				+        
			
 
				+        raw_data = None
			
 
				+        
			
 
				+        if data_type == 'file':
			
 
				+            # 从文件读取数据
			
 
				+            if not os.path.exists(data_source):
			
 
				+                return {
			
 
				+                    'success': False,
			
 
				+                    'error': f'数据文件不存在: {data_source}',
			
 
				+                    'data': None
			
 
				+                }
			
 
				+            
			
 
				+            with open(data_source, 'r', encoding='utf-8') as f:
			
 
				+                raw_data = f.read()
			
 
				+                
			
 
				+        elif data_type == 'json':
			
 
				+            # 直接处理JSON字符串
			
 
				+            raw_data = data_source
			
 
				+            
			
 
				+        elif data_type == 'api':
			
 
				+            # TODO: 实现API数据获取逻辑
			
 
				+            raw_data = data_source
			
 
				+        
			
 
				+        # 解析数据
			
 
				+        if raw_data:
			
 
				+            parsed_data = _process_menduner_content(raw_data)
			
 
				+            
			
 
				+            result = {
			
 
				+                'success': True,
			
 
				+                'error': None,
			
 
				+                'data': {
			
 
				+                    'talent_profiles': parsed_data,
			
 
				+                    'parse_time': datetime.now().isoformat(),
			
 
				+                    'source_type': data_type,
			
 
				+                    'total_count': len(parsed_data) if isinstance(parsed_data, list) else 1
			
 
				+                }
			
 
				+            }
			
 
				+            
			
 
				+            logging.info(f"门墩儿数据解析完成，共解析 {result['data']['total_count']} 条记录")
			
 
				+            return result
			
 
				+        
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'error': '无法获取有效数据',
			
 
				+            'data': None
			
 
				+        }
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"解析门墩儿数据失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'error': error_msg,
			
 
				+            'data': None
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def _process_menduner_content(raw_content: str) -> List[Dict[str, Any]]:
			
 
				+    """
			
 
				+    处理门墩儿原始数据内容
			
 
				+    
			
 
				+    Args:
			
 
				+        raw_content (str): 原始数据内容
			
 
				+        
			
 
				+    Returns:
			
 
				+        List[Dict[str, Any]]: 处理后的人才档案列表
			
 
				+    """
			
 
				+    try:
			
 
				+        # 尝试解析JSON格式
			
 
				+        try:
			
 
				+            json_data = json.loads(raw_content)
			
 
				+            if isinstance(json_data, list):
			
 
				+                return [_normalize_talent_profile(item) for item in json_data]
			
 
				+            elif isinstance(json_data, dict):
			
 
				+                return [_normalize_talent_profile(json_data)]
			
 
				+        except json.JSONDecodeError:
			
 
				+            pass
			
 
				+        
			
 
				+        # 如果不是JSON，尝试按行解析
			
 
				+        lines = raw_content.strip().split('\n')
			
 
				+        profiles = []
			
 
				+        
			
 
				+        for line in lines:
			
 
				+            line = line.strip()
			
 
				+            if line:
			
 
				+                profile = _parse_talent_line(line)
			
 
				+                if profile:
			
 
				+                    profiles.append(profile)
			
 
				+        
			
 
				+        return profiles
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"处理门墩儿数据内容失败: {str(e)}")
			
 
				+        return []
			
 
				+
			
 
				+
			
 
				+def _normalize_talent_profile(raw_profile: Dict[str, Any]) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    标准化人才档案数据
			
 
				+    
			
 
				+    Args:
			
 
				+        raw_profile (Dict[str, Any]): 原始档案数据
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 标准化后的档案数据
			
 
				+    """
			
 
				+    normalized = {
			
 
				+        'name': raw_profile.get('name', ''),
			
 
				+        'phone': _normalize_phone(raw_profile.get('phone', '')),
			
 
				+        'email': _normalize_email(raw_profile.get('email', '')),
			
 
				+        'position': raw_profile.get('position', ''),
			
 
				+        'company': raw_profile.get('company', ''),
			
 
				+        'location': raw_profile.get('location', ''),
			
 
				+        'experience_years': raw_profile.get('experience_years', 0),
			
 
				+        'skills': raw_profile.get('skills', []),
			
 
				+        'education': raw_profile.get('education', ''),
			
 
				+        'source': 'menduner',
			
 
				+        'processed_time': datetime.now().isoformat(),
			
 
				+        'raw_data': raw_profile
			
 
				+    }
			
 
				+    
			
 
				+    return normalized
			
 
				+
			
 
				+
			
 
				+def _parse_talent_line(line: str) -> Optional[Dict[str, Any]]:
			
 
				+    """
			
 
				+    解析单行人才信息
			
 
				+    
			
 
				+    Args:
			
 
				+        line (str): 包含人才信息的文本行
			
 
				+        
			
 
				+    Returns:
			
 
				+        Optional[Dict[str, Any]]: 解析后的人才信息，如果解析失败返回None
			
 
				+    """
			
 
				+    try:
			
 
				+        # TODO: 根据门墩儿数据的具体格式实现解析逻辑
			
 
				+        # 这里提供一个基础的解析示例
			
 
				+        
			
 
				+        # 简单的分隔符解析（假设用制表符或逗号分隔）
			
 
				+        if '\t' in line:
			
 
				+            parts = line.split('\t')
			
 
				+        elif ',' in line:
			
 
				+            parts = line.split(',')
			
 
				+        else:
			
 
				+            parts = [line]
			
 
				+        
			
 
				+        if len(parts) >= 3:
			
 
				+            return {
			
 
				+                'name': parts[0].strip(),
			
 
				+                'phone': parts[1].strip() if len(parts) > 1 else '',
			
 
				+                'position': parts[2].strip() if len(parts) > 2 else '',
			
 
				+                'company': parts[3].strip() if len(parts) > 3 else ''
			
 
				+            }
			
 
				+        
			
 
				+        return None
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"解析人才信息行失败: {str(e)}")
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def _normalize_phone(phone: str) -> str:
			
 
				+    """
			
 
				+    标准化电话号码格式
			
 
				+    
			
 
				+    Args:
			
 
				+        phone (str): 原始电话号码
			
 
				+        
			
 
				+    Returns:
			
 
				+        str: 标准化后的电话号码
			
 
				+    """
			
 
				+    if not phone:
			
 
				+        return ''
			
 
				+    
			
 
				+    # 移除所有非数字字符
			
 
				+    digits = re.sub(r'\D', '', phone)
			
 
				+    
			
 
				+    # 中国手机号码格式化
			
 
				+    if len(digits) == 11 and digits.startswith('1'):
			
 
				+        return f"{digits[:3]}-{digits[3:7]}-{digits[7:]}"
			
 
				+    
			
 
				+    return phone
			
 
				+
			
 
				+
			
 
				+def _normalize_email(email: str) -> str:
			
 
				+    """
			
 
				+    标准化邮箱地址
			
 
				+    
			
 
				+    Args:
			
 
				+        email (str): 原始邮箱地址
			
 
				+        
			
 
				+    Returns:
			
 
				+        str: 标准化后的邮箱地址
			
 
				+    """
			
 
				+    if not email:
			
 
				+        return ''
			
 
				+    
			
 
				+    # 基础邮箱格式验证
			
 
				+    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
			
 
				+    if re.match(email_pattern, email.strip()):
			
 
				+        return email.strip().lower()
			
 
				+    
			
 
				+    return email
			
 
				+
			
 
				+
			
 
				+def validate_menduner_data(data: Dict[str, Any]) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    验证门墩儿人才数据的完整性和有效性
			
 
				+    
			
 
				+    Args:
			
 
				+        data (Dict[str, Any]): 待验证的人才数据
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 验证结果
			
 
				+    """
			
 
				+    try:
			
 
				+        errors = []
			
 
				+        warnings = []
			
 
				+        
			
 
				+        # 必填字段检查
			
 
				+        required_fields = ['name']
			
 
				+        for field in required_fields:
			
 
				+            if not data.get(field):
			
 
				+                errors.append(f"缺少必填字段: {field}")
			
 
				+        
			
 
				+        # 可选但建议填写的字段
			
 
				+        recommended_fields = ['phone', 'position', 'company']
			
 
				+        for field in recommended_fields:
			
 
				+            if not data.get(field):
			
 
				+                warnings.append(f"建议填写字段: {field}")
			
 
				+        
			
 
				+        # 格式验证
			
 
				+        if data.get('phone'):
			
 
				+            phone = data['phone']
			
 
				+            if not re.match(r'^1[3-9]\d{9}$', re.sub(r'\D', '', phone)):
			
 
				+                warnings.append("电话号码格式可能不正确")
			
 
				+        
			
 
				+        if data.get('email'):
			
 
				+            email = data['email']
			
 
				+            if not re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', email):
			
 
				+                errors.append("邮箱格式不正确")
			
 
				+        
			
 
				+        return {
			
 
				+            'is_valid': len(errors) == 0,
			
 
				+            'errors': errors,
			
 
				+            'warnings': warnings,
			
 
				+            'score': max(0, 100 - len(errors) * 20 - len(warnings) * 5)
			
 
				+        }
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"验证门墩儿数据失败: {str(e)}")
			
 
				+        return {
			
 
				+            'is_valid': False,
			
 
				+            'errors': [f"验证过程出错: {str(e)}"],
			
 
				+            'warnings': [],
			
 
				+            'score': 0
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def batch_process_menduner_data(data_list: List[Dict[str, Any]]) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    批量处理门墩儿人才数据
			
 
				+    
			
 
				+    Args:
			
 
				+        data_list (List[Dict[str, Any]]): 待处理的人才数据列表
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 批量处理结果
			
 
				+    """
			
 
				+    try:
			
 
				+        processed_data = []
			
 
				+        validation_results = []
			
 
				+        success_count = 0
			
 
				+        error_count = 0
			
 
				+        
			
 
				+        for i, data in enumerate(data_list):
			
 
				+            try:
			
 
				+                # 标准化数据
			
 
				+                normalized = _normalize_talent_profile(data)
			
 
				+                
			
 
				+                # 验证数据
			
 
				+                validation = validate_menduner_data(normalized)
			
 
				+                
			
 
				+                processed_data.append(normalized)
			
 
				+                validation_results.append({
			
 
				+                    'index': i,
			
 
				+                    'validation': validation
			
 
				+                })
			
 
				+                
			
 
				+                if validation['is_valid']:
			
 
				+                    success_count += 1
			
 
				+                else:
			
 
				+                    error_count += 1
			
 
				+                    
			
 
				+            except Exception as e:
			
 
				+                error_count += 1
			
 
				+                validation_results.append({
			
 
				+                    'index': i,
			
 
				+                    'validation': {
			
 
				+                        'is_valid': False,
			
 
				+                        'errors': [f"处理失败: {str(e)}"],
			
 
				+                        'warnings': [],
			
 
				+                        'score': 0
			
 
				+                    }
			
 
				+                })
			
 
				+        
			
 
				+        return {
			
 
				+            'success': True,
			
 
				+            'summary': {
			
 
				+                'total_count': len(data_list),
			
 
				+                'success_count': success_count,
			
 
				+                'error_count': error_count,
			
 
				+                'success_rate': (success_count / len(data_list)) * 100 if data_list else 0
			
 
				+            },
			
 
				+            'processed_data': processed_data,
			
 
				+            'validation_results': validation_results
			
 
				+        }
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"批量处理门墩儿数据失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'error': error_msg,
			
 
				+            'summary': None,
			
 
				+            'processed_data': [],
			
 
				+            'validation_results': []
			
 
				+        } 
			
--- a/app/core/data_parse/parse_pic.py
+++ b/app/core/data_parse/parse_pic.py
@@ -0,0 +1,470 @@
 
				+"""
			
 
				+图片解析模块
			
 
				+
			
 
				+该模块提供图片文件的解析功能，包括名片识别、证件照处理等。
			
 
				+"""
			
 
				+
			
 
				+import logging
			
 
				+from datetime import datetime
			
 
				+import json
			
 
				+import os
			
 
				+from typing import Dict, Any, Optional, List, Tuple
			
 
				+import base64
			
 
				+from PIL import Image
			
 
				+import io
			
 
				+
			
 
				+
			
 
				+def parse_business_card_image(image_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    解析名片图片
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path (str): 名片图片路径
			
 
				+        task_id (str, optional): 关联的任务ID
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 解析结果
			
 
				+    """
			
 
				+    try:
			
 
				+        logging.info(f"开始解析名片图片: {image_path}")
			
 
				+        
			
 
				+        # 验证文件存在性和格式
			
 
				+        validation_result = validate_image_file(image_path)
			
 
				+        if not validation_result['is_valid']:
			
 
				+            return {
			
 
				+                'success': False,
			
 
				+                'error': validation_result['error'],
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        # 获取图片信息
			
 
				+        image_info = get_image_info(image_path)
			
 
				+        
			
 
				+        # TODO: 集成OCR引擎进行名片文字识别
			
 
				+        # 这里应该调用OCR服务（如百度OCR、腾讯OCR等）来识别名片上的文字
			
 
				+        
			
 
				+        # 模拟名片识别结果
			
 
				+        card_data = _extract_business_card_info(image_path)
			
 
				+        
			
 
				+        result = {
			
 
				+            'success': True,
			
 
				+            'error': None,
			
 
				+            'data': {
			
 
				+                'personal_info': card_data,
			
 
				+                'image_info': image_info,
			
 
				+                'parse_time': datetime.now().isoformat(),
			
 
				+                'task_id': task_id,
			
 
				+                'confidence_score': 0.85  # 模拟置信度分数
			
 
				+            }
			
 
				+        }
			
 
				+        
			
 
				+        logging.info(f"名片图片解析完成: {image_path}")
			
 
				+        return result
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"解析名片图片失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'error': error_msg,
			
 
				+            'data': None
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def parse_portrait_image(image_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    解析证件照图片
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path (str): 证件照图片路径
			
 
				+        task_id (str, optional): 关联的任务ID
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 解析结果
			
 
				+    """
			
 
				+    try:
			
 
				+        logging.info(f"开始解析证件照图片: {image_path}")
			
 
				+        
			
 
				+        # 验证文件
			
 
				+        validation_result = validate_image_file(image_path)
			
 
				+        if not validation_result['is_valid']:
			
 
				+            return {
			
 
				+                'success': False,
			
 
				+                'error': validation_result['error'],
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        # 获取图片信息
			
 
				+        image_info = get_image_info(image_path)
			
 
				+        
			
 
				+        # 检查是否为合格的证件照
			
 
				+        portrait_analysis = _analyze_portrait_quality(image_path)
			
 
				+        
			
 
				+        result = {
			
 
				+            'success': True,
			
 
				+            'error': None,
			
 
				+            'data': {
			
 
				+                'image_info': image_info,
			
 
				+                'portrait_analysis': portrait_analysis,
			
 
				+                'parse_time': datetime.now().isoformat(),
			
 
				+                'task_id': task_id
			
 
				+            }
			
 
				+        }
			
 
				+        
			
 
				+        logging.info(f"证件照图片解析完成: {image_path}")
			
 
				+        return result
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"解析证件照图片失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'error': error_msg,
			
 
				+            'data': None
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def validate_image_file(image_path: str) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    验证图片文件的有效性
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path (str): 图片文件路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 验证结果
			
 
				+    """
			
 
				+    try:
			
 
				+        # 检查文件是否存在
			
 
				+        if not os.path.exists(image_path):
			
 
				+            return {
			
 
				+                'is_valid': False,
			
 
				+                'error': f'图片文件不存在: {image_path}'
			
 
				+            }
			
 
				+        
			
 
				+        # 检查文件扩展名
			
 
				+        allowed_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif'}
			
 
				+        file_ext = os.path.splitext(image_path)[1].lower()
			
 
				+        
			
 
				+        if file_ext not in allowed_extensions:
			
 
				+            return {
			
 
				+                'is_valid': False,
			
 
				+                'error': f'不支持的图片格式: {file_ext}，支持的格式: {", ".join(allowed_extensions)}'
			
 
				+            }
			
 
				+        
			
 
				+        # 尝试打开图片验证完整性
			
 
				+        try:
			
 
				+            with Image.open(image_path) as img:
			
 
				+                img.verify()
			
 
				+        except Exception as e:
			
 
				+            return {
			
 
				+                'is_valid': False,
			
 
				+                'error': f'图片文件损坏或格式错误: {str(e)}'
			
 
				+            }
			
 
				+        
			
 
				+        return {
			
 
				+            'is_valid': True,
			
 
				+            'error': None
			
 
				+        }
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        return {
			
 
				+            'is_valid': False,
			
 
				+            'error': f'验证图片文件时出错: {str(e)}'
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def get_image_info(image_path: str) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    获取图片基础信息
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path (str): 图片文件路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 图片信息
			
 
				+    """
			
 
				+    try:
			
 
				+        with Image.open(image_path) as img:
			
 
				+            file_size = os.path.getsize(image_path)
			
 
				+            
			
 
				+            return {
			
 
				+                'filename': os.path.basename(image_path),
			
 
				+                'file_path': image_path,
			
 
				+                'file_size': file_size,
			
 
				+                'file_size_mb': round(file_size / (1024 * 1024), 2),
			
 
				+                'dimensions': {
			
 
				+                    'width': img.width,
			
 
				+                    'height': img.height
			
 
				+                },
			
 
				+                'format': img.format,
			
 
				+                'mode': img.mode,
			
 
				+                'has_transparency': img.mode in ('RGBA', 'LA') or 'transparency' in img.info
			
 
				+            }
			
 
				+            
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"获取图片信息失败: {str(e)}")
			
 
				+        return {
			
 
				+            'filename': os.path.basename(image_path),
			
 
				+            'file_path': image_path,
			
 
				+            'error': str(e)
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def _extract_business_card_info(image_path: str) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    从名片图片中提取信息（模拟实现）
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path (str): 名片图片路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 提取的名片信息
			
 
				+    """
			
 
				+    # TODO: 这里应该集成真实的OCR服务来识别名片信息
			
 
				+    # 目前返回模拟数据
			
 
				+    
			
 
				+    return {
			
 
				+        'name': '',  # 姓名
			
 
				+        'title': '',  # 职位
			
 
				+        'company': '',  # 公司
			
 
				+        'department': '',  # 部门
			
 
				+        'phone': '',  # 电话
			
 
				+        'mobile': '',  # 手机
			
 
				+        'email': '',  # 邮箱
			
 
				+        'address': '',  # 地址
			
 
				+        'website': '',  # 网站
			
 
				+        'fax': '',  # 传真
			
 
				+        'extracted_text': '',  # 原始识别文本
			
 
				+        'confidence_details': {
			
 
				+            'name': 0.0,
			
 
				+            'phone': 0.0,
			
 
				+            'email': 0.0,
			
 
				+            'company': 0.0
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+def _analyze_portrait_quality(image_path: str) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    分析证件照质量（模拟实现）
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path (str): 证件照图片路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 质量分析结果
			
 
				+    """
			
 
				+    try:
			
 
				+        with Image.open(image_path) as img:
			
 
				+            width, height = img.size
			
 
				+            
			
 
				+            # 基础质量检查
			
 
				+            quality_checks = {
			
 
				+                'resolution_check': {
			
 
				+                    'passed': width >= 300 and height >= 400,
			
 
				+                    'message': f'分辨率 {width}x{height}，建议至少300x400像素'
			
 
				+                },
			
 
				+                'aspect_ratio_check': {
			
 
				+                    'passed': 0.7 <= (height / width) <= 1.5,
			
 
				+                    'message': f'宽高比 {round(height/width, 2)}，建议在0.7-1.5之间'
			
 
				+                },
			
 
				+                'format_check': {
			
 
				+                    'passed': img.format in ['JPEG', 'PNG'],
			
 
				+                    'message': f'格式 {img.format}，建议使用JPEG或PNG格式'
			
 
				+                }
			
 
				+            }
			
 
				+            
			
 
				+            # 计算总体质量分数
			
 
				+            passed_checks = sum(1 for check in quality_checks.values() if check['passed'])
			
 
				+            quality_score = (passed_checks / len(quality_checks)) * 100
			
 
				+            
			
 
				+            return {
			
 
				+                'quality_score': quality_score,
			
 
				+                'quality_level': 'excellent' if quality_score >= 90 else 
			
 
				+                               'good' if quality_score >= 70 else 
			
 
				+                               'fair' if quality_score >= 50 else 'poor',
			
 
				+                'checks': quality_checks,
			
 
				+                'recommendations': _get_portrait_recommendations(quality_checks)
			
 
				+            }
			
 
				+            
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"分析证件照质量失败: {str(e)}")
			
 
				+        return {
			
 
				+            'quality_score': 0,
			
 
				+            'quality_level': 'unknown',
			
 
				+            'error': str(e)
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def _get_portrait_recommendations(quality_checks: Dict[str, Dict]) -> List[str]:
			
 
				+    """
			
 
				+    根据质量检查结果生成改进建议
			
 
				+    
			
 
				+    Args:
			
 
				+        quality_checks (Dict[str, Dict]): 质量检查结果
			
 
				+        
			
 
				+    Returns:
			
 
				+        List[str]: 改进建议列表
			
 
				+    """
			
 
				+    recommendations = []
			
 
				+    
			
 
				+    if not quality_checks['resolution_check']['passed']:
			
 
				+        recommendations.append('建议使用更高分辨率的图片，至少300x400像素')
			
 
				+    
			
 
				+    if not quality_checks['aspect_ratio_check']['passed']:
			
 
				+        recommendations.append('建议调整图片宽高比，使其更符合证件照标准')
			
 
				+    
			
 
				+    if not quality_checks['format_check']['passed']:
			
 
				+        recommendations.append('建议使用JPEG或PNG格式')
			
 
				+    
			
 
				+    if not recommendations:
			
 
				+        recommendations.append('图片质量良好，符合基本要求')
			
 
				+    
			
 
				+    return recommendations
			
 
				+
			
 
				+
			
 
				+def convert_image_to_base64(image_path: str) -> Optional[str]:
			
 
				+    """
			
 
				+    将图片转换为Base64编码
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path (str): 图片文件路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        Optional[str]: Base64编码字符串，失败时返回None
			
 
				+    """
			
 
				+    try:
			
 
				+        with open(image_path, 'rb') as image_file:
			
 
				+            encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
			
 
				+            return encoded_string
			
 
				+            
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"转换图片到Base64失败: {str(e)}")
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def resize_image(image_path: str, max_width: int = 800, max_height: int = 600, 
			
 
				+                output_path: Optional[str] = None) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    调整图片尺寸
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path (str): 原始图片路径
			
 
				+        max_width (int): 最大宽度
			
 
				+        max_height (int): 最大高度
			
 
				+        output_path (str, optional): 输出路径，如果不指定则覆盖原文件
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 处理结果
			
 
				+    """
			
 
				+    try:
			
 
				+        with Image.open(image_path) as img:
			
 
				+            # 计算新尺寸，保持宽高比
			
 
				+            width, height = img.size
			
 
				+            ratio = min(max_width / width, max_height / height)
			
 
				+            
			
 
				+            if ratio < 1:  # 只有当图片超过最大尺寸时才调整
			
 
				+                new_width = int(width * ratio)
			
 
				+                new_height = int(height * ratio)
			
 
				+                
			
 
				+                resized_img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
			
 
				+                
			
 
				+                save_path = output_path or image_path
			
 
				+                resized_img.save(save_path, quality=95)
			
 
				+                
			
 
				+                return {
			
 
				+                    'success': True,
			
 
				+                    'original_size': (width, height),
			
 
				+                    'new_size': (new_width, new_height),
			
 
				+                    'compression_ratio': ratio,
			
 
				+                    'output_path': save_path
			
 
				+                }
			
 
				+            else:
			
 
				+                return {
			
 
				+                    'success': True,
			
 
				+                    'message': '图片尺寸已符合要求，无需调整',
			
 
				+                    'original_size': (width, height),
			
 
				+                    'output_path': image_path
			
 
				+                }
			
 
				+                
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"调整图片尺寸失败: {str(e)}"
			
 
				+        logging.error(error_msg)
			
 
				+        
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'error': error_msg
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def batch_process_images(image_paths: List[str], process_type: str = 'business_card') -> Dict[str, Any]:
			
 
				+    """
			
 
				+    批量处理图片
			
 
				+    
			
 
				+    Args:
			
 
				+        image_paths (List[str]): 图片路径列表
			
 
				+        process_type (str): 处理类型，可选值：'business_card', 'portrait'
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 批量处理结果
			
 
				+    """
			
 
				+    try:
			
 
				+        results = []
			
 
				+        success_count = 0
			
 
				+        failed_count = 0
			
 
				+        
			
 
				+        for image_path in image_paths:
			
 
				+            try:
			
 
				+                if process_type == 'business_card':
			
 
				+                    result = parse_business_card_image(image_path)
			
 
				+                elif process_type == 'portrait':
			
 
				+                    result = parse_portrait_image(image_path)
			
 
				+                else:
			
 
				+                    result = {
			
 
				+                        'success': False,
			
 
				+                        'error': f'不支持的处理类型: {process_type}'
			
 
				+                    }
			
 
				+                
			
 
				+                results.append({
			
 
				+                    'image_path': image_path,
			
 
				+                    'result': result
			
 
				+                })
			
 
				+                
			
 
				+                if result['success']:
			
 
				+                    success_count += 1
			
 
				+                else:
			
 
				+                    failed_count += 1
			
 
				+                    
			
 
				+            except Exception as e:
			
 
				+                failed_count += 1
			
 
				+                results.append({
			
 
				+                    'image_path': image_path,
			
 
				+                    'result': {
			
 
				+                        'success': False,
			
 
				+                        'error': str(e)
			
 
				+                    }
			
 
				+                })
			
 
				+        
			
 
				+        return {
			
 
				+            'success': True,
			
 
				+            'summary': {
			
 
				+                'total_images': len(image_paths),
			
 
				+                'success_count': success_count,
			
 
				+                'failed_count': failed_count,
			
 
				+                'success_rate': (success_count / len(image_paths)) * 100 if image_paths else 0
			
 
				+            },
			
 
				+            'results': results
			
 
				+        }
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"批量处理图片失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'error': error_msg,
			
 
				+            'results': []
			
 
				+        } 
			
--- a/app/core/data_parse/parse_resume.py
+++ b/app/core/data_parse/parse_resume.py
@@ -0,0 +1,181 @@
 
				+"""
			
 
				+简历解析模块
			
 
				+
			
 
				+该模块提供简历文件的解析功能，支持PDF格式的简历文档解析。
			
 
				+"""
			
 
				+
			
 
				+import logging
			
 
				+from datetime import datetime
			
 
				+import json
			
 
				+import os
			
 
				+from typing import Dict, Any, Optional, List
			
 
				+
			
 
				+
			
 
				+def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    解析简历文件
			
 
				+    
			
 
				+    Args:
			
 
				+        file_path (str): 简历文件路径
			
 
				+        task_id (str, optional): 关联的任务ID
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 解析结果
			
 
				+    """
			
 
				+    try:
			
 
				+        logging.info(f"开始解析简历文件: {file_path}")
			
 
				+        
			
 
				+        # 检查文件是否存在
			
 
				+        if not os.path.exists(file_path):
			
 
				+            return {
			
 
				+                'success': False,
			
 
				+                'error': f'文件不存在: {file_path}',
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        # 检查文件格式
			
 
				+        file_ext = os.path.splitext(file_path)[1].lower()
			
 
				+        if file_ext != '.pdf':
			
 
				+            return {
			
 
				+                'success': False,
			
 
				+                'error': f'不支持的文件格式: {file_ext}，仅支持PDF格式',
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        # TODO: 实现具体的简历解析逻辑
			
 
				+        # 这里应该集成PDF解析库和NLP模型来提取简历信息
			
 
				+        
			
 
				+        # 模拟解析结果
			
 
				+        parse_result = {
			
 
				+            'personal_info': {
			
 
				+                'name': '',
			
 
				+                'phone': '',
			
 
				+                'email': '',
			
 
				+                'address': ''
			
 
				+            },
			
 
				+            'education': [],
			
 
				+            'work_experience': [],
			
 
				+            'skills': [],
			
 
				+            'parse_time': datetime.now().isoformat(),
			
 
				+            'file_info': {
			
 
				+                'original_path': file_path,
			
 
				+                'file_size': os.path.getsize(file_path),
			
 
				+                'file_type': 'pdf'
			
 
				+            }
			
 
				+        }
			
 
				+        
			
 
				+        logging.info(f"简历文件解析完成: {file_path}")
			
 
				+        
			
 
				+        return {
			
 
				+            'success': True,
			
 
				+            'error': None,
			
 
				+            'data': parse_result
			
 
				+        }
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"解析简历文件失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'error': error_msg,
			
 
				+            'data': None
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def extract_resume_text(file_path: str) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    提取简历文本内容
			
 
				+    
			
 
				+    Args:
			
 
				+        file_path (str): 简历文件路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 提取结果
			
 
				+    """
			
 
				+    try:
			
 
				+        # TODO: 实现PDF文本提取逻辑
			
 
				+        
			
 
				+        return {
			
 
				+            'success': True,
			
 
				+            'text_content': '',
			
 
				+            'page_count': 0
			
 
				+        }
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"提取简历文本失败: {str(e)}", exc_info=True)
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'error': str(e),
			
 
				+            'text_content': None
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def validate_resume_format(file_path: str) -> bool:
			
 
				+    """
			
 
				+    验证简历文件格式
			
 
				+    
			
 
				+    Args:
			
 
				+        file_path (str): 文件路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        bool: 是否为有效的简历格式
			
 
				+    """
			
 
				+    try:
			
 
				+        if not os.path.exists(file_path):
			
 
				+            return False
			
 
				+            
			
 
				+        file_ext = os.path.splitext(file_path)[1].lower()
			
 
				+        return file_ext == '.pdf'
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"验证简历格式失败: {str(e)}")
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
			
 
				+    """
			
 
				+    批量解析简历文件
			
 
				+    
			
 
				+    Args:
			
 
				+        file_paths (List[str]): 简历文件路径列表
			
 
				+        
			
 
				+    Returns:
			
 
				+        Dict[str, Any]: 批量解析结果
			
 
				+    """
			
 
				+    try:
			
 
				+        results = []
			
 
				+        success_count = 0
			
 
				+        failed_count = 0
			
 
				+        
			
 
				+        for file_path in file_paths:
			
 
				+            result = parse_resume_file(file_path)
			
 
				+            results.append({
			
 
				+                'file_path': file_path,
			
 
				+                'result': result
			
 
				+            })
			
 
				+            
			
 
				+            if result['success']:
			
 
				+                success_count += 1
			
 
				+            else:
			
 
				+                failed_count += 1
			
 
				+        
			
 
				+        return {
			
 
				+            'success': True,
			
 
				+            'summary': {
			
 
				+                'total_files': len(file_paths),
			
 
				+                'success_count': success_count,
			
 
				+                'failed_count': failed_count
			
 
				+            },
			
 
				+            'results': results
			
 
				+        }
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"批量解析简历失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'error': error_msg,
			
 
				+            'results': []
			
 
				+        } 
			
--- a/app/core/data_parse/parse_task.py
+++ b/app/core/data_parse/parse_task.py
@@ -439,16 +439,22 @@ def add_parse_task(files, task_type, created_by='system'):
 
				                 # 根据文件类型设置ContentType
			
 
				                 content_type = file_obj.content_type or _get_content_type_by_extension(file_obj.filename)
			
 
				                 
			
 
				+                # 对包含非ASCII字符的文件名和任务类型进行URL编码处理
			
 
				+                import urllib.parse
			
 
				+                safe_filename = urllib.parse.quote(file_obj.filename, safe='')
			
 
				+                safe_task_type = urllib.parse.quote(task_type, safe='')
			
 
				+                safe_content_type = urllib.parse.quote(f'{task_type}_parse_task', safe='')
			
 
				+                
			
 
				                 minio_client.put_object(
			
 
				                     Bucket=minio_bucket,
			
 
				                     Key=minio_path,
			
 
				                     Body=file_obj,
			
 
				                     ContentType=content_type,
			
 
				                     Metadata={
			
 
				-                        'original_filename': file_obj.filename,
			
 
				+                        'original_filename': safe_filename,
			
 
				                         'upload_time': datetime.now().isoformat(),
			
 
				-                        'task_type': task_type,
			
 
				-                        'content_type': f'{task_type}_parse_task'
			
 
				+                        'task_type': safe_task_type,
			
 
				+                        'content_type': safe_content_type
			
 
				                     }
			
 
				                 )
			
 
				                 
			
--- a/app/core/data_parse/parse_web.py
+++ b/app/core/data_parse/parse_web.py
@@ -594,14 +594,14 @@ def process_single_talent_card(talent_data, minio_md_path):
 
				 
			
 
				 def process_webpage_with_QWen(markdown_text, publish_time):
			
 
				     """
			
 
				-    使用阿里云的 Qwen VL Max 模型解析网页 markdown 文本中的名片信息
			
 
				+    使用阿里云的 Qwen VL Max 模型解析单个人员的 markdown 文本中的名片信息
			
 
				     
			
 
				     Args:
			
 
				-        markdown_text (str): 网页的 markdown 格式文本内容
			
 
				+        markdown_text (str): 单个人员的 markdown 格式文本内容
			
 
				         publish_time (str): 发布时间，用于career_path中的date字段
			
 
				         
			
 
				     Returns:
			
 
				-        dict: 解析的名片信息
			
 
				+        list: 解析的名片信息列表（通常包含1个人员信息）
			
 
				     """
			
 
				     # 阿里云 Qwen API 配置
			
 
				     QWEN_API_KEY = os.environ.get('QWEN_API_KEY', 'sk-8f2320dafc9e4076968accdd8eebd8e9')
			
@@ -613,14 +613,14 @@ def process_webpage_with_QWen(markdown_text, publish_time):
 
				             base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
			
 
				         )
			
 
				         
			
 
				-        # 构建针对网页文本的优化提示语
			
 
				-        prompt = """你是酒店行业人事任命信息提取专家。请仔细分析提供的网页Markdown文本内容，精确提取其中的人员任命信息。
			
 
				+        # 构建针对单个人员网页文本的优化提示语
			
 
				+        prompt = """你是酒店行业人事任命信息提取专家。请仔细分析提供的网页Markdown文本内容，精确提取其中的单个人员任命信息。
			
 
				 
			
 
				 ## 重要说明
			
 
				-1. **多人员处理**: 文本中可能包含多个人的任命信息，通常以数字编号（如**1**、**2**、**3**等）分隔不同人员。请将所有人员信息都提取出来，以JSON数组格式返回。请仔细查看整个文档，不要遗漏任何人员信息。
			
 
				+1. **单人员处理**: 文本内容通常包含一个人的任命信息，可能包含数字编号（如**1**）作为标识。
			
 
				 2. **照片链接识别**: 人物照片链接通常出现在人物姓名的前面，通过空行分隔。请优先关联距离最近的照片链接。
			
 
				-3. **字段限制**: 只需要提取指定的9个字段，其他信息忽略。
			
 
				-4. **完整性要求**: 请确保提取文档中所有被任命的人员信息，包括总经理、副总裁、总监等各级管理人员。每个数字编号下的人员都需要提取。
			
 
				+3. **字段限制**: 只需要提取指定的8个字段，其他信息忽略。
			
 
				+4. **准确性要求**: 请确保提取的信息准确无误，包括总经理、副总裁、总监等各级管理人员的信息。
			
 
				 
			
 
				 ## 提取要求
			
 
				 - 区分中英文内容，分别提取
			
@@ -632,10 +632,9 @@ def process_webpage_with_QWen(markdown_text, publish_time):
 
				 ## 照片链接识别规则
			
 
				 - 照片通常以 ![描述](URL) 格式出现
			
 
				 - 照片链接通常位于人物姓名的前面，通过空行分隔
			
 
				-- 每个人物对应其前面最近的一张照片
			
 
				-- 如果照片无法明确对应某个人物，则该人物的pic_url为空字符串
			
 
				+- 如果照片无法明确对应人物，则pic_url为空字符串
			
 
				 
			
 
				-## 需提取的字段（仅这9个字段）
			
 
				+## 需提取的字段（仅这8个字段）
			
 
				 1. 中文姓名 (name_zh) - 人物的中文姓名
			
 
				 2. 英文姓名 (name_en) - 人物的英文姓名，如果没有则为空字符串
			
 
				 3. 中文职位/头衔 (title_zh) - 人物的中文职位或头衔
			
@@ -646,58 +645,28 @@ def process_webpage_with_QWen(markdown_text, publish_time):
 
				 8. 照片链接 (pic_url) - 人物的照片URL链接，根据上述识别规则提取
			
 
				 
			
 
				 ## 输出格式
			
 
				-请以严格的JSON格式返回结果，不要添加任何额外解释文字。如果文本中包含多个人员信息，返回JSON数组，每个人员一个JSON对象。如果只有一个人员，也要返回数组格式。
			
 
				+请以严格的JSON格式返回结果，不要添加任何额外解释文字。返回JSON对象格式（不是数组），包含单个人员的信息。
			
 
				 
			
 
				-单人员示例：
			
 
				+示例：
			
 
				 ```json
			
 
				-[
			
 
				-  {
			
 
				-    "name_zh": "张三",
			
 
				-    "name_en": "Zhang San",
			
 
				-    "title_zh": "总经理",
			
 
				-    "title_en": "General Manager",
			
 
				-    "hotel_zh": "北京万豪酒店",
			
 
				-    "hotel_en": "Beijing Marriott Hotel",
			
 
				-    "brand_group": "万豪",
			
 
				-    "pic_url": "https://example.com/photo1.jpg"
			
 
				-  }
			
 
				-]
			
 
				+{
			
 
				+  "name_zh": "张三",
			
 
				+  "name_en": "Zhang San",
			
 
				+  "title_zh": "总经理",
			
 
				+  "title_en": "General Manager",
			
 
				+  "hotel_zh": "北京万豪酒店",
			
 
				+  "hotel_en": "Beijing Marriott Hotel",
			
 
				+  "brand_group": "万豪",
			
 
				+  "pic_url": "https://example.com/photo1.jpg"
			
 
				+}
			
 
				 ```
			
 
				 
			
 
				-多人员示例：
			
 
				-```json
			
 
				-[
			
 
				-  {
			
 
				-    "name_zh": "张三",
			
 
				-    "name_en": "Zhang San",
			
 
				-    "title_zh": "总经理",
			
 
				-    "title_en": "General Manager",
			
 
				-    "hotel_zh": "北京万豪酒店",
			
 
				-    "hotel_en": "Beijing Marriott Hotel",
			
 
				-    "brand_group": "万豪",
			
 
				-    "pic_url": "https://example.com/photo1.jpg"
			
 
				-  },
			
 
				-  {
			
 
				-    "name_zh": "李四",
			
 
				-    "name_en": "Li Si",
			
 
				-    "title_zh": "市场总监",
			
 
				-    "title_en": "Marketing Director",
			
 
				-    "hotel_zh": "上海希尔顿酒店",
			
 
				-    "hotel_en": "Shanghai Hilton Hotel",
			
 
				-    "brand_group": "希尔顿",
			
 
				-    "pic_url": "https://example.com/photo2.jpg"
			
 
				-  }
			
 
				-]
			
 
				-```
			
 
				-
			
 
				-发现有数字编号（**1**、**2**、**3**）分隔，应该提取三个人员的信息，返回包含3个对象的数组。
			
 
				-
			
 
				 ## 特别提醒
			
 
				-- 请务必扫描整个文档，查找所有数字编号标记的人员信息
			
 
				-- 每个人员信息都要单独提取成一个JSON对象
			
 
				-- 最终返回的数组长度应该等于文档中的人员数量
			
 
				+- 专注于提取单个人员的完整信息
			
 
				+- 确保提取的信息准确且完整
			
 
				+- 如果某个字段在文本中不存在，请返回空字符串
			
 
				 
			
 
				-以下是需要分析的网页Markdown文本内容：
			
 
				+以下是需要分析的单个人员网页Markdown文本内容：
			
 
				 
			
 
				 """ + markdown_text
			
 
				         
			
@@ -719,81 +688,482 @@ def process_webpage_with_QWen(markdown_text, publish_time):
 
				         
			
 
				         # 解析响应
			
 
				         response_content = completion.choices[0].message.content
			
 
				-        logging.info(f"成功从 Qwen 模型获取网页文本响应: {response_content}")
			
 
				+        logging.info(f"成功从 Qwen 模型获取单个人员文本响应: {response_content}")
			
 
				         
			
 
				         # 直接解析 QWen 返回的 JSON 响应
			
 
				         try:
			
 
				             extracted_data = json.loads(response_content)
			
 
				-            logging.info("成功解析 Qwen 网页文本响应中的 JSON")
			
 
				+            logging.info("成功解析 Qwen 单个人员文本响应中的 JSON")
			
 
				         except json.JSONDecodeError as e:
			
 
				             error_msg = f"JSON 解析失败: {str(e)}"
			
 
				             logging.error(error_msg)
			
 
				             raise Exception(error_msg)
			
 
				 
			
 
				-        # 确保返回的是数组格式
			
 
				-        if not isinstance(extracted_data, list):
			
 
				-            # 如果返回的不是数组，包装成数组
			
 
				-            extracted_data = [extracted_data] if extracted_data else []
			
 
				+        # 确保返回的是单个人员对象，转换为列表格式以保持一致性
			
 
				+        if isinstance(extracted_data, list):
			
 
				+            # 如果意外返回数组，取第一个元素
			
 
				+            if len(extracted_data) > 0:
			
 
				+                person_data = extracted_data[0]
			
 
				+                logging.warning("Qwen返回了数组格式，取第一个人员信息")
			
 
				+            else:
			
 
				+                logging.error("Qwen返回了空数组")
			
 
				+                return []
			
 
				+        elif isinstance(extracted_data, dict):
			
 
				+            # 正常情况，返回的是单个人员对象
			
 
				+            person_data = extracted_data
			
 
				+        else:
			
 
				+            logging.error(f"Qwen返回了不支持的数据格式: {type(extracted_data)}")
			
 
				+            return []
			
 
				         
			
 
				-        # 确保数组中每个人员对象都包含所有必要字段
			
 
				+        # 确保人员对象包含所有必要字段
			
 
				         required_fields = [
			
 
				             'name_zh', 'name_en', 'title_zh', 'title_en', 
			
 
				             'hotel_zh', 'hotel_en', 'brand_group', 'pic_url'
			
 
				         ]
			
 
				         
			
 
				-        for person in extracted_data:
			
 
				-            for field in required_fields:
			
 
				-                if field not in person:
			
 
				-                    person[field] = ""
			
 
				-            
			
 
				-            # 为每个人员添加career_path字段
			
 
				-            career_entry = {
			
 
				-                'date': publish_time,
			
 
				-                'hotel_en': person.get('hotel_en', ''),
			
 
				-                'hotel_zh': person.get('hotel_zh', ''),
			
 
				-                'image_path': '',
			
 
				-                'source': 'webpage_extraction',
			
 
				-                'title_en': person.get('title_en', ''),
			
 
				-                'title_zh': person.get('title_zh', '')
			
 
				-            }
			
 
				-            
			
 
				-            person['career_path'] = [career_entry]
			
 
				-            logging.info(f"为人员 {person.get('name_zh', 'Unknown')} 添加了career_path记录: {career_entry}")
			
 
				+        for field in required_fields:
			
 
				+            if field not in person_data:
			
 
				+                person_data[field] = ""
			
 
				         
			
 
				-        # 创建解析任务记录
			
 
				-        try:
			
 
				-            # 生成唯一的任务名称：当前日期 + UUID
			
 
				-            current_date = datetime.now().strftime('%Y%m%d')
			
 
				-            task_uuid = str(uuid.uuid4())[:8]  # 取UUID的前8位
			
 
				-            task_name = f"{current_date}_{task_uuid}"
			
 
				-            
			
 
				-            # 创建解析任务记录
			
 
				-            parse_task = ParseTaskRepository(
			
 
				-                task_name=task_name,
			
 
				-                task_status='completed',  # 解析完成
			
 
				-                task_type='门墩儿新任命',
			
 
				-                task_source='webpage_extraction',
			
 
				-                collection_count=len(extracted_data),  # 采集人数
			
 
				-                parse_count=len(extracted_data),  # 解析人数
			
 
				-                parse_result=extracted_data,  # 解析结果
			
 
				-                created_by='system',
			
 
				-                updated_by='system'
			
 
				-            )
			
 
				-            
			
 
				-            db.session.add(parse_task)
			
 
				-            db.session.commit()
			
 
				-            
			
 
				-            logging.info(f"成功创建解析任务记录: {task_name}, 解析人数: {len(extracted_data)}")
			
 
				-            
			
 
				-        except Exception as db_error:
			
 
				-            db.session.rollback()
			
 
				-            logging.error(f"创建解析任务记录失败: {str(db_error)}", exc_info=True)
			
 
				-            # 不影响主要功能，只记录错误日志
			
 
				+        # 为人员添加career_path字段
			
 
				+        career_entry = {
			
 
				+            'date': publish_time,
			
 
				+            'hotel_en': person_data.get('hotel_en', ''),
			
 
				+            'hotel_zh': person_data.get('hotel_zh', ''),
			
 
				+            'image_path': '',
			
 
				+            'source': 'webpage_extraction',
			
 
				+            'title_en': person_data.get('title_en', ''),
			
 
				+            'title_zh': person_data.get('title_zh', '')
			
 
				+        }
			
 
				         
			
 
				-        return extracted_data
			
 
				+        person_data['career_path'] = [career_entry]
			
 
				+        logging.info(f"为人员 {person_data.get('name_zh', 'Unknown')} 添加了career_path记录: {career_entry}")
			
 
				+        
			
 
				+        # 返回列表格式以保持与其他函数的一致性
			
 
				+        return [person_data]
			
 
				         
			
 
				     except Exception as e:
			
 
				         error_msg = f"Qwen VL Max 模型网页文本解析失败: {str(e)}"
			
 
				         logging.error(error_msg, exc_info=True)
			
 
				         raise Exception(error_msg) 
			
 
				 
			
 
				+
			
 
				+def batch_process_md(markdown_file_list, publish_time):
			
 
				+    """
			
 
				+    批量处理包含多个人员信息的markdown文件
			
 
				+    
			
 
				+    Args:
			
 
				+        markdown_file_list (list): MinIO对象保存地址组成的数组
			
 
				+        publish_time (str): 发布时间，用于career_path中的date字段
			
 
				+        
			
 
				+    Returns:
			
 
				+        dict: 批量处理结果，包含所有人员的解析结果
			
 
				+    """
			
 
				+    try:
			
 
				+        # 参数验证
			
 
				+        if not markdown_file_list or not isinstance(markdown_file_list, list):
			
 
				+            return {
			
 
				+                'code': 400,
			
 
				+                'success': False,
			
 
				+                'message': 'markdown_file_list参数必须是非空数组',
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        if not publish_time or not isinstance(publish_time, str):
			
 
				+            return {
			
 
				+                'code': 400,
			
 
				+                'success': False,
			
 
				+                'message': 'publish_time参数必须是非空字符串',
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        logging.info(f"开始批量处理 {len(markdown_file_list)} 个markdown文件")
			
 
				+        
			
 
				+        # 批量处理结果统计
			
 
				+        batch_results = {
			
 
				+            'total_files': len(markdown_file_list),
			
 
				+            'processed_files': 0,
			
 
				+            'failed_files': 0,
			
 
				+            'total_sections': 0,
			
 
				+            'total_persons': 0,
			
 
				+            'all_results': [],
			
 
				+            'file_results': [],
			
 
				+            'failed_files_info': []
			
 
				+        }
			
 
				+        
			
 
				+        # 逐个处理每个markdown文件
			
 
				+        for file_index, minio_path in enumerate(markdown_file_list):
			
 
				+            try:
			
 
				+                logging.info(f"开始处理第 {file_index + 1} 个文件: {minio_path}")
			
 
				+                
			
 
				+                # 处理单个文件
			
 
				+                file_result = process_single_markdown_file(minio_path, publish_time)
			
 
				+                
			
 
				+                if file_result['success']:
			
 
				+                    batch_results['processed_files'] += 1
			
 
				+                    batch_results['total_sections'] += file_result['data']['total_sections']
			
 
				+                    batch_results['total_persons'] += file_result['data']['total_persons']
			
 
				+                    batch_results['all_results'].extend(file_result['data']['all_results'])
			
 
				+                    batch_results['file_results'].append({
			
 
				+                        'file_index': file_index + 1,
			
 
				+                        'minio_path': minio_path,
			
 
				+                        'result': file_result['data']
			
 
				+                    })
			
 
				+                    logging.info(f"文件 {minio_path} 处理成功，提取 {file_result['data']['total_persons']} 个人员信息")
			
 
				+                else:
			
 
				+                    batch_results['failed_files'] += 1
			
 
				+                    batch_results['failed_files_info'].append({
			
 
				+                        'file_index': file_index + 1,
			
 
				+                        'minio_path': minio_path,
			
 
				+                        'error': file_result['message']
			
 
				+                    })
			
 
				+                    logging.error(f"文件 {minio_path} 处理失败: {file_result['message']}")
			
 
				+                    
			
 
				+            except Exception as e:
			
 
				+                error_msg = f"处理文件 {minio_path} 时发生异常: {str(e)}"
			
 
				+                logging.error(error_msg, exc_info=True)
			
 
				+                batch_results['failed_files'] += 1
			
 
				+                batch_results['failed_files_info'].append({
			
 
				+                    'file_index': file_index + 1,
			
 
				+                    'minio_path': minio_path,
			
 
				+                    'error': error_msg
			
 
				+                })
			
 
				+        
			
 
				+        # 生成最终结果
			
 
				+        if batch_results['processed_files'] == batch_results['total_files']:
			
 
				+            # 全部处理成功
			
 
				+            return {
			
 
				+                'code': 200,
			
 
				+                'success': True,
			
 
				+                'message': f'所有 {batch_results["total_files"]} 个文件处理成功，共提取 {batch_results["total_persons"]} 个人员信息',
			
 
				+                'data': batch_results
			
 
				+            }
			
 
				+        elif batch_results['processed_files'] > 0:
			
 
				+            # 部分处理成功
			
 
				+            return {
			
 
				+                'code': 206,  # Partial Content
			
 
				+                'success': True,
			
 
				+                'message': f'部分处理成功：{batch_results["processed_files"]}/{batch_results["total_files"]} 个文件成功，共提取 {batch_results["total_persons"]} 个人员信息',
			
 
				+                'data': batch_results
			
 
				+            }
			
 
				+        else:
			
 
				+            # 全部处理失败
			
 
				+            return {
			
 
				+                'code': 500,
			
 
				+                'success': False,
			
 
				+                'message': f'所有 {batch_results["total_files"]} 个文件处理失败',
			
 
				+                'data': batch_results
			
 
				+            }
			
 
				+            
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"batch_process_md函数执行失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        return {
			
 
				+            'code': 500,
			
 
				+            'success': False,
			
 
				+            'message': error_msg,
			
 
				+            'data': None
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def get_markdown_from_minio(minio_client, minio_path):
			
 
				+    """
			
 
				+    从MinIO获取markdown文件内容
			
 
				+    
			
 
				+    Args:
			
 
				+        minio_client: MinIO客户端
			
 
				+        minio_path (str): MinIO中的文件路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        str: 文件内容，如果失败返回None
			
 
				+    """
			
 
				+    try:
			
 
				+        logging.info(f"从MinIO获取文件: {minio_path}")
			
 
				+        
			
 
				+        # 从MinIO下载文件
			
 
				+        response = minio_client.get_object(Bucket=minio_bucket, Key=minio_path)
			
 
				+        
			
 
				+        # 读取文件内容
			
 
				+        content = response['Body'].read()
			
 
				+        
			
 
				+        # 解码为字符串
			
 
				+        if isinstance(content, bytes):
			
 
				+            # 尝试不同的编码方式
			
 
				+            try:
			
 
				+                markdown_content = content.decode('utf-8')
			
 
				+            except UnicodeDecodeError:
			
 
				+                try:
			
 
				+                    markdown_content = content.decode('gbk')
			
 
				+                except UnicodeDecodeError:
			
 
				+                    markdown_content = content.decode('utf-8', errors='ignore')
			
 
				+                    logging.warning(f"文件 {minio_path} 编码检测失败，使用UTF-8忽略错误模式")
			
 
				+        else:
			
 
				+            markdown_content = str(content)
			
 
				+        
			
 
				+        logging.info(f"成功获取文件内容，长度: {len(markdown_content)} 字符")
			
 
				+        return markdown_content
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"从MinIO获取文件 {minio_path} 失败: {str(e)}", exc_info=True)
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def save_section_to_minio(minio_client, section_content, original_minio_path, section_number):
			
 
				+    """
			
 
				+    将分割后的markdown内容保存到MinIO
			
 
				+    
			
 
				+    Args:
			
 
				+        minio_client: MinIO客户端
			
 
				+        section_content (str): 分割后的markdown内容
			
 
				+        original_minio_path (str): 原始文件的MinIO路径
			
 
				+        section_number (str): 分隔符编号
			
 
				+        
			
 
				+    Returns:
			
 
				+        str: 新保存文件的MinIO路径，如果失败返回None
			
 
				+    """
			
 
				+    try:
			
 
				+        # 生成新的文件名
			
 
				+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
			
 
				+        unique_id = uuid.uuid4().hex[:8]
			
 
				+        
			
 
				+        # 从原始路径提取基础信息
			
 
				+        path_parts = original_minio_path.split('/')
			
 
				+        if len(path_parts) > 1:
			
 
				+            directory = '/'.join(path_parts[:-1])
			
 
				+            original_filename = path_parts[-1]
			
 
				+            # 移除扩展名
			
 
				+            base_name = original_filename.rsplit('.', 1)[0] if '.' in original_filename else original_filename
			
 
				+        else:
			
 
				+            directory = 'webpage_talent_sections'
			
 
				+            base_name = 'section'
			
 
				+        
			
 
				+        # 构建新的文件名
			
 
				+        new_filename = f"{base_name}_section_{section_number}_{timestamp}_{unique_id}.md"
			
 
				+        new_minio_path = f"{directory}/{new_filename}"
			
 
				+        
			
 
				+        logging.info(f"开始保存分割内容到MinIO: {new_minio_path}")
			
 
				+        
			
 
				+        # 将内容转换为字节流
			
 
				+        content_bytes = section_content.encode('utf-8')
			
 
				+        content_stream = BytesIO(content_bytes)
			
 
				+        
			
 
				+        # 上传到MinIO
			
 
				+        minio_client.put_object(
			
 
				+            Bucket=minio_bucket,
			
 
				+            Key=new_minio_path,
			
 
				+            Body=content_stream,
			
 
				+            ContentType='text/markdown',
			
 
				+            Metadata={
			
 
				+                'original_file': original_minio_path,
			
 
				+                'section_number': section_number,
			
 
				+                'upload_time': datetime.now().isoformat(),
			
 
				+                'content_type': 'webpage_talent_section'
			
 
				+            }
			
 
				+        )
			
 
				+        
			
 
				+        logging.info(f"分割内容成功保存到MinIO: {new_minio_path}")
			
 
				+        return new_minio_path
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        logging.error(f"保存分割内容到MinIO失败: {str(e)}", exc_info=True)
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def process_single_markdown_file(minio_path, publish_time):
			
 
				+    """
			
 
				+    处理单个markdown文件，从MinIO获取内容并判断是否需要分割
			
 
				+    
			
 
				+    Args:
			
 
				+        minio_path (str): MinIO中的文件路径
			
 
				+        publish_time (str): 发布时间
			
 
				+        
			
 
				+    Returns:
			
 
				+        dict: 处理结果
			
 
				+    """
			
 
				+    try:
			
 
				+        # 获取MinIO客户端
			
 
				+        minio_client = get_minio_client()
			
 
				+        if not minio_client:
			
 
				+            return {
			
 
				+                'success': False,
			
 
				+                'message': '无法连接到MinIO服务器',
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        # 从MinIO获取文件内容
			
 
				+        markdown_content = get_markdown_from_minio(minio_client, minio_path)
			
 
				+        if not markdown_content:
			
 
				+            return {
			
 
				+                'success': False,
			
 
				+                'message': f'无法从MinIO获取文件内容: {minio_path}',
			
 
				+                'data': None
			
 
				+            }
			
 
				+        
			
 
				+        # 检查是否存在 **1**, **2** 等分隔结构
			
 
				+        pattern = r'\*\*(\d+)\*\*'
			
 
				+        matches = re.findall(pattern, markdown_content)
			
 
				+        
			
 
				+        if not matches:
			
 
				+            # 如果没有找到分隔符，直接处理整个文件
			
 
				+            logging.info("未发现数字分隔符，直接处理整个markdown文件")
			
 
				+            try:
			
 
				+                result = process_webpage_with_QWen(markdown_content, publish_time)
			
 
				+                
			
 
				+                # 更新解析结果中的路径信息
			
 
				+                if result:
			
 
				+                    for person in result:
			
 
				+                        person['pic_url'] = minio_path  # 设置原始文件路径
			
 
				+                        if 'career_path' in person and person['career_path']:
			
 
				+                            for career_entry in person['career_path']:
			
 
				+                                career_entry['image_path'] = minio_path  # 设置原始文件路径
			
 
				+                
			
 
				+                return {
			
 
				+                    'success': True,
			
 
				+                    'message': '单个markdown文件处理成功',
			
 
				+                    'data': {
			
 
				+                        'total_sections': 1,
			
 
				+                        'processed_sections': 1,
			
 
				+                        'total_persons': len(result) if result else 0,
			
 
				+                        'all_results': result,
			
 
				+                        'section_results': [],
			
 
				+                        'failed_sections_info': []
			
 
				+                    }
			
 
				+                }
			
 
				+            except Exception as e:
			
 
				+                error_msg = f"处理单个markdown文件失败: {str(e)}"
			
 
				+                logging.error(error_msg, exc_info=True)
			
 
				+                return {
			
 
				+                    'success': False,
			
 
				+                    'message': error_msg,
			
 
				+                    'data': None
			
 
				+                }
			
 
				+        
			
 
				+        # 发现分隔符，按分隔符拆分内容
			
 
				+        logging.info(f"发现 {len(matches)} 个数字分隔符: {matches}")
			
 
				+        
			
 
				+        # 使用正则表达式分割内容
			
 
				+        sections = re.split(r'\*\*\d+\*\*', markdown_content)
			
 
				+        
			
 
				+        # 移除第一个空白部分（分隔符前的内容）
			
 
				+        if sections and not sections[0].strip():
			
 
				+            sections = sections[1:]
			
 
				+        
			
 
				+        # 确保分割后的部分数量与分隔符数量匹配
			
 
				+        if len(sections) != len(matches):
			
 
				+            logging.warning(f"分割部分数量 ({len(sections)}) 与分隔符数量 ({len(matches)}) 不匹配")
			
 
				+            # 取较小的数量以确保安全
			
 
				+            min_count = min(len(sections), len(matches))
			
 
				+            sections = sections[:min_count]
			
 
				+            matches = matches[:min_count]
			
 
				+        
			
 
				+        # 处理结果统计
			
 
				+        results = {
			
 
				+            'total_sections': len(sections),
			
 
				+            'processed_sections': 0,
			
 
				+            'failed_sections': 0,
			
 
				+            'total_persons': 0,
			
 
				+            'all_results': [],
			
 
				+            'section_results': [],
			
 
				+            'failed_sections_info': []
			
 
				+        }
			
 
				+        
			
 
				+        # 逐个处理每个markdown片段
			
 
				+        for i, (section_content, section_number) in enumerate(zip(sections, matches)):
			
 
				+            try:
			
 
				+                logging.info(f"开始处理第 {section_number} 部分 (索引: {i})")
			
 
				+                
			
 
				+                # 清理内容，移除前后空白
			
 
				+                section_content = section_content.strip()
			
 
				+                
			
 
				+                if not section_content:
			
 
				+                    logging.warning(f"第 {section_number} 部分内容为空，跳过处理")
			
 
				+                    results['failed_sections'] += 1
			
 
				+                    results['failed_sections_info'].append({
			
 
				+                        'section_number': section_number,
			
 
				+                        'index': i,
			
 
				+                        'error': '部分内容为空'
			
 
				+                    })
			
 
				+                    continue
			
 
				+                
			
 
				+                # 重新构建完整的markdown片段，包含分隔符标题
			
 
				+                full_section_content = f"**{section_number}**\n\n{section_content}"
			
 
				+                
			
 
				+                # 将分割后的内容保存到MinIO
			
 
				+                section_minio_path = save_section_to_minio(minio_client, full_section_content, minio_path, section_number)
			
 
				+                if not section_minio_path:
			
 
				+                    logging.warning(f"保存第 {section_number} 部分到MinIO失败，使用原始路径")
			
 
				+                    section_minio_path = minio_path
			
 
				+                
			
 
				+                # 调用process_webpage_with_QWen处理单个片段
			
 
				+                section_result = process_webpage_with_QWen(full_section_content, publish_time)
			
 
				+                
			
 
				+                # 更新解析结果中的路径信息
			
 
				+                if section_result:
			
 
				+                    for person in section_result:
			
 
				+                        person['pic_url'] = section_minio_path  # 设置分割后的文件路径
			
 
				+                        if 'career_path' in person and person['career_path']:
			
 
				+                            for career_entry in person['career_path']:
			
 
				+                                career_entry['image_path'] = section_minio_path  # 设置分割后的文件路径
			
 
				+                
			
 
				+                if section_result:
			
 
				+                    results['processed_sections'] += 1
			
 
				+                    results['total_persons'] += len(section_result)
			
 
				+                    results['all_results'].extend(section_result)
			
 
				+                    results['section_results'].append({
			
 
				+                        'section_number': section_number,
			
 
				+                        'index': i,
			
 
				+                        'persons_count': len(section_result),
			
 
				+                        'persons': section_result
			
 
				+                    })
			
 
				+                    logging.info(f"第 {section_number} 部分处理成功，提取 {len(section_result)} 个人员信息")
			
 
				+                else:
			
 
				+                    results['failed_sections'] += 1
			
 
				+                    results['failed_sections_info'].append({
			
 
				+                        'section_number': section_number,
			
 
				+                        'index': i,
			
 
				+                        'error': '未提取到人员信息'
			
 
				+                    })
			
 
				+                    logging.warning(f"第 {section_number} 部分未提取到人员信息")
			
 
				+                    
			
 
				+            except Exception as e:
			
 
				+                error_msg = f"处理第 {section_number} 部分失败: {str(e)}"
			
 
				+                logging.error(error_msg, exc_info=True)
			
 
				+                results['failed_sections'] += 1
			
 
				+                results['failed_sections_info'].append({
			
 
				+                    'section_number': section_number,
			
 
				+                    'index': i,
			
 
				+                    'error': error_msg
			
 
				+                })
			
 
				+        
			
 
				+        # 生成最终结果
			
 
				+        if results['processed_sections'] == results['total_sections']:
			
 
				+            # 全部处理成功
			
 
				+            return {
			
 
				+                'success': True,
			
 
				+                'message': f'所有 {results["total_sections"]} 个部分处理成功，共提取 {results["total_persons"]} 个人员信息',
			
 
				+                'data': results
			
 
				+            }
			
 
				+        elif results['processed_sections'] > 0:
			
 
				+            # 部分处理成功
			
 
				+            return {
			
 
				+                'success': True,
			
 
				+                'message': f'部分处理成功：{results["processed_sections"]}/{results["total_sections"]} 个部分成功，共提取 {results["total_persons"]} 个人员信息',
			
 
				+                'data': results
			
 
				+            }
			
 
				+        else:
			
 
				+            # 全部处理失败
			
 
				+            return {
			
 
				+                'success': False,
			
 
				+                'message': f'所有 {results["total_sections"]} 个部分处理失败',
			
 
				+                'data': results
			
 
				+            }
			
 
				+            
			
 
				+    except Exception as e:
			
 
				+        error_msg = f"process_single_markdown_file函数执行失败: {str(e)}"
			
 
				+        logging.error(error_msg, exc_info=True)
			
 
				+        return {
			
 
				+            'success': False,
			
 
				+            'message': error_msg,
			
 
				+            'data': None
			
 
				+        } 
			
 
				+
			
--- a/李倩.jpg
+++ b/李倩.jpg