2 months ago · 96354bf3aa
--- a/app/core/data_parse/parse_pic.py
+++ b/app/core/data_parse/parse_pic.py
@@ -14,11 +14,52 @@ import base64
 
															 from PIL import Image
														
 
															 import io
														
 
															 from openai import OpenAI
														
 
															+import boto3
														
 
															+from botocore.config import Config
														
 
															 from app.config.config import DevelopmentConfig, ProductionConfig
														
 
															 # 使用配置变量
														
 
															 config = ProductionConfig()
														
 
															+# MinIO 配置
														
 
															+minio_url = f"{'https' if config.MINIO_SECURE else 'http'}://{config.MINIO_HOST}"
														
 
															+minio_access_key = config.MINIO_USER
														
 
															+minio_secret_key = config.MINIO_PASSWORD
														
 
															+minio_bucket = config.MINIO_BUCKET
														
 
															+
														
 
															+
														
 
															+def get_minio_client():
														
 
															+    """获取MinIO客户端连接"""
														
 
															+    try:
														
 
															+        logging.info(f"尝试连接MinIO服务器: {minio_url}")
														
 
															+        
														
 
															+        minio_client = boto3.client(
														
 
															+            's3',
														
 
															+            endpoint_url=minio_url,
														
 
															+            aws_access_key_id=minio_access_key,
														
 
															+            aws_secret_access_key=minio_secret_key,
														
 
															+            config=Config(
														
 
															+                signature_version='s3v4',
														
 
															+                retries={'max_attempts': 3, 'mode': 'standard'},
														
 
															+                connect_timeout=10,
														
 
															+                read_timeout=30
														
 
															+            )
														
 
															+        )
														
 
															+        
														
 
															+        # 确保存储桶存在
														
 
															+        buckets = minio_client.list_buckets()
														
 
															+        bucket_names = [bucket['Name'] for bucket in buckets.get('Buckets', [])]
														
 
															+        logging.info(f"成功连接到MinIO服务器，现有存储桶: {bucket_names}")
														
 
															+        
														
 
															+        if minio_bucket not in bucket_names:
														
 
															+            logging.info(f"创建存储桶: {minio_bucket}")
														
 
															+            minio_client.create_bucket(Bucket=minio_bucket)
														
 
															+            
														
 
															+        return minio_client
														
 
															+    except Exception as e:
														
 
															+        logging.error(f"MinIO连接错误: {str(e)}")
														
 
															+        return None
														
 
															+
														
 
															 def parse_business_card_image(image_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
														
 
															     """
														
@@ -134,41 +175,103 @@ def parse_portrait_image(image_path: str, task_id: Optional[str] = None) -> Dict
 
															 def validate_image_file(image_path: str) -> Dict[str, Any]:
														
 
															     """
														
 
															-    验证图片文件的有效性
														
 
															+    验证图片文件的有效性，支持本地路径和MinIO URL
														
 
															     Args:
														
 
															-        image_path (str): 图片文件路径
														
 
															+        image_path (str): 图片文件路径或MinIO URL
														
 
															     Returns:
														
 
															         Dict[str, Any]: 验证结果
														
 
															     """
														
 
															     try:
														
 
															-        # 检查文件是否存在
														
 
															-        if not os.path.exists(image_path):
														
 
															-            return {
														
 
															-                'is_valid': False,
														
 
															-                'error': f'图片文件不存在: {image_path}'
														
 
															-            }
														
 
															-        
														
 
															-        # 检查文件扩展名
														
 
															-        allowed_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif'}
														
 
															-        file_ext = os.path.splitext(image_path)[1].lower()
														
 
															-        
														
 
															-        if file_ext not in allowed_extensions:
														
 
															-            return {
														
 
															-                'is_valid': False,
														
 
															-                'error': f'不支持的图片格式: {file_ext}，支持的格式: {", ".join(allowed_extensions)}'
														
 
															-            }
														
 
															-        
														
 
															-        # 尝试打开图片验证完整性
														
 
															-        try:
														
 
															-            with Image.open(image_path) as img:
														
 
															-                img.verify()
														
 
															-        except Exception as e:
														
 
															-            return {
														
 
															-                'is_valid': False,
														
 
															-                'error': f'图片文件损坏或格式错误: {str(e)}'
														
 
															-            }
														
 
															+        # 检查是否是MinIO URL
														
 
															+        if image_path.startswith('http://') or image_path.startswith('https://'):
														
 
															+            # 处理MinIO URL
														
 
															+            try:
														
 
															+                # 从URL提取文件扩展名
														
 
															+                from urllib.parse import urlparse
														
 
															+                parsed_url = urlparse(image_path)
														
 
															+                file_ext = os.path.splitext(parsed_url.path)[1].lower()
														
 
															+                
														
 
															+                # 检查文件扩展名
														
 
															+                allowed_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif'}
														
 
															+                if file_ext not in allowed_extensions:
														
 
															+                    return {
														
 
															+                        'is_valid': False,
														
 
															+                        'error': f'不支持的图片格式: {file_ext}，支持的格式: {", ".join(allowed_extensions)}'
														
 
															+                    }
														
 
															+                
														
 
															+                # 尝试从MinIO获取图片数据进行验证
														
 
															+                minio_client = get_minio_client()
														
 
															+                if not minio_client:
														
 
															+                    return {
														
 
															+                        'is_valid': False,
														
 
															+                        'error': '无法连接到MinIO服务器'
														
 
															+                    }
														
 
															+                
														
 
															+                # 提取对象键
														
 
															+                path_parts = parsed_url.path.strip('/').split('/', 1)
														
 
															+                if len(path_parts) < 2:
														
 
															+                    return {
														
 
															+                        'is_valid': False,
														
 
															+                        'error': f'无效的MinIO URL格式: {image_path}'
														
 
															+                    }
														
 
															+                
														
 
															+                object_key = path_parts[1]  # 跳过bucket名称
														
 
															+                
														
 
															+                # 从MinIO获取图片数据
														
 
															+                try:
														
 
															+                    response = minio_client.get_object(Bucket=minio_bucket, Key=object_key)
														
 
															+                    image_data = response['Body'].read()
														
 
															+                    
														
 
															+                    # 验证图片完整性
														
 
															+                    from io import BytesIO
														
 
															+                    with Image.open(BytesIO(image_data)) as img:
														
 
															+                        img.verify()
														
 
															+                    
														
 
															+                    return {
														
 
															+                        'is_valid': True,
														
 
															+                        'error': None
														
 
															+                    }
														
 
															+                except Exception as minio_error:
														
 
															+                    return {
														
 
															+                        'is_valid': False,
														
 
															+                        'error': f'图片文件不存在: {image_path}'
														
 
															+                    }
														
 
															+                    
														
 
															+            except Exception as url_error:
														
 
															+                return {
														
 
															+                    'is_valid': False,
														
 
															+                    'error': f'处理MinIO URL失败: {str(url_error)}'
														
 
															+                }
														
 
															+        else:
														
 
															+            # 处理本地文件路径
														
 
															+            # 检查文件是否存在
														
 
															+            if not os.path.exists(image_path):
														
 
															+                return {
														
 
															+                    'is_valid': False,
														
 
															+                    'error': f'图片文件不存在: {image_path}'
														
 
															+                }
														
 
															+            
														
 
															+            # 检查文件扩展名
														
 
															+            allowed_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif'}
														
 
															+            file_ext = os.path.splitext(image_path)[1].lower()
														
 
															+            
														
 
															+            if file_ext not in allowed_extensions:
														
 
															+                return {
														
 
															+                    'is_valid': False,
														
 
															+                    'error': f'不支持的图片格式: {file_ext}，支持的格式: {", ".join(allowed_extensions)}'
														
 
															+                }
														
 
															+            
														
 
															+            # 尝试打开图片验证完整性
														
 
															+            try:
														
 
															+                with Image.open(image_path) as img:
														
 
															+                    img.verify()
														
 
															+            except Exception as e:
														
 
															+                return {
														
 
															+                    'is_valid': False,
														
 
															+                    'error': f'图片文件损坏或格式错误: {str(e)}'
														
 
															+                }
														
 
															         return {
														
 
															             'is_valid': True,
														
@@ -184,31 +287,85 @@ def validate_image_file(image_path: str) -> Dict[str, Any]:
 
															 def get_image_info(image_path: str) -> Dict[str, Any]:
														
 
															     """
														
 
															-    获取图片基础信息
														
 
															+    获取图片基础信息，支持本地路径和MinIO URL
														
 
															     Args:
														
 
															-        image_path (str): 图片文件路径
														
 
															+        image_path (str): 图片文件路径或MinIO URL
														
 
															     Returns:
														
 
															         Dict[str, Any]: 图片信息
														
 
															     """
														
 
															     try:
														
 
															-        with Image.open(image_path) as img:
														
 
															-            file_size = os.path.getsize(image_path)
														
 
															+        # 检查是否是MinIO URL
														
 
															+        if image_path.startswith('http://') or image_path.startswith('https://'):
														
 
															+            # 处理MinIO URL
														
 
															+            from urllib.parse import urlparse
														
 
															+            from io import BytesIO
														
 
															-            return {
														
 
															-                'filename': os.path.basename(image_path),
														
 
															-                'file_path': image_path,
														
 
															-                'file_size': file_size,
														
 
															-                'file_size_mb': round(file_size / (1024 * 1024), 2),
														
 
															-                'dimensions': {
														
 
															-                    'width': img.width,
														
 
															-                    'height': img.height
														
 
															-                },
														
 
															-                'format': img.format,
														
 
															-                'mode': img.mode,
														
 
															-                'has_transparency': img.mode in ('RGBA', 'LA') or 'transparency' in img.info
														
 
															-            }
														
 
															+            # 获取MinIO客户端
														
 
															+            minio_client = get_minio_client()
														
 
															+            if not minio_client:
														
 
															+                return {
														
 
															+                    'filename': os.path.basename(image_path),
														
 
															+                    'file_path': image_path,
														
 
															+                    'error': '无法连接到MinIO服务器'
														
 
															+                }
														
 
															+            
														
 
															+            # 提取对象键
														
 
															+            parsed_url = urlparse(image_path)
														
 
															+            path_parts = parsed_url.path.strip('/').split('/', 1)
														
 
															+            if len(path_parts) < 2:
														
 
															+                return {
														
 
															+                    'filename': os.path.basename(image_path),
														
 
															+                    'file_path': image_path,
														
 
															+                    'error': f'无效的MinIO URL格式: {image_path}'
														
 
															+                }
														
 
															+            
														
 
															+            object_key = path_parts[1]  # 跳过bucket名称
														
 
															+            
														
 
															+            # 从MinIO获取图片数据
														
 
															+            try:
														
 
															+                response = minio_client.get_object(Bucket=minio_bucket, Key=object_key)
														
 
															+                image_data = response['Body'].read()
														
 
															+                
														
 
															+                with Image.open(BytesIO(image_data)) as img:
														
 
															+                    return {
														
 
															+                        'filename': os.path.basename(parsed_url.path),
														
 
															+                        'file_path': image_path,
														
 
															+                        'file_size': len(image_data),
														
 
															+                        'file_size_mb': round(len(image_data) / (1024 * 1024), 2),
														
 
															+                        'dimensions': {
														
 
															+                            'width': img.width,
														
 
															+                            'height': img.height
														
 
															+                        },
														
 
															+                        'format': img.format,
														
 
															+                        'mode': img.mode,
														
 
															+                        'has_transparency': img.mode in ('RGBA', 'LA') or 'transparency' in img.info
														
 
															+                    }
														
 
															+            except Exception as minio_error:
														
 
															+                return {
														
 
															+                    'filename': os.path.basename(parsed_url.path),
														
 
															+                    'file_path': image_path,
														
 
															+                    'error': f'从MinIO获取图片失败: {str(minio_error)}'
														
 
															+                }
														
 
															+        else:
														
 
															+            # 处理本地文件路径
														
 
															+            with Image.open(image_path) as img:
														
 
															+                file_size = os.path.getsize(image_path)
														
 
															+                
														
 
															+                return {
														
 
															+                    'filename': os.path.basename(image_path),
														
 
															+                    'file_path': image_path,
														
 
															+                    'file_size': file_size,
														
 
															+                    'file_size_mb': round(file_size / (1024 * 1024), 2),
														
 
															+                    'dimensions': {
														
 
															+                        'width': img.width,
														
 
															+                        'height': img.height
														
 
															+                    },
														
 
															+                    'format': img.format,
														
 
															+                    'mode': img.mode,
														
 
															+                    'has_transparency': img.mode in ('RGBA', 'LA') or 'transparency' in img.info
														
 
															+                }
														
 
															     except Exception as e:
														
 
															         logging.error(f"获取图片信息失败: {str(e)}")
														
@@ -334,18 +491,49 @@ def _get_portrait_recommendations(quality_checks: Dict[str, Dict]) -> List[str]:
 
															 def convert_image_to_base64(image_path: str) -> Optional[str]:
														
 
															     """
														
 
															-    将图片转换为Base64编码
														
 
															+    将图片转换为Base64编码，支持本地路径和MinIO URL
														
 
															     Args:
														
 
															-        image_path (str): 图片文件路径
														
 
															+        image_path (str): 图片文件路径或MinIO URL
														
 
															     Returns:
														
 
															         Optional[str]: Base64编码字符串，失败时返回None
														
 
															     """
														
 
															     try:
														
 
															-        with open(image_path, 'rb') as image_file:
														
 
															-            encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
														
 
															-            return encoded_string
														
 
															+        # 检查是否是MinIO URL
														
 
															+        if image_path.startswith('http://') or image_path.startswith('https://'):
														
 
															+            # 处理MinIO URL
														
 
															+            from urllib.parse import urlparse
														
 
															+            
														
 
															+            # 获取MinIO客户端
														
 
															+            minio_client = get_minio_client()
														
 
															+            if not minio_client:
														
 
															+                logging.error("无法连接到MinIO服务器")
														
 
															+                return None
														
 
															+            
														
 
															+            # 提取对象键
														
 
															+            parsed_url = urlparse(image_path)
														
 
															+            path_parts = parsed_url.path.strip('/').split('/', 1)
														
 
															+            if len(path_parts) < 2:
														
 
															+                logging.error(f"无效的MinIO URL格式: {image_path}")
														
 
															+                return None
														
 
															+            
														
 
															+            object_key = path_parts[1]  # 跳过bucket名称
														
 
															+            
														
 
															+            # 从MinIO获取图片数据
														
 
															+            try:
														
 
															+                response = minio_client.get_object(Bucket=minio_bucket, Key=object_key)
														
 
															+                image_data = response['Body'].read()
														
 
															+                encoded_string = base64.b64encode(image_data).decode('utf-8')
														
 
															+                return encoded_string
														
 
															+            except Exception as minio_error:
														
 
															+                logging.error(f"从MinIO获取图片失败: {str(minio_error)}")
														
 
															+                return None
														
 
															+        else:
														
 
															+            # 处理本地文件路径
														
 
															+            with open(image_path, 'rb') as image_file:
														
 
															+                encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
														
 
															+                return encoded_string
														
 
															     except Exception as e:
														
 
															         logging.error(f"转换图片到Base64失败: {str(e)}")
														
--- a/app/core/data_parse/parse_resume.py
+++ b/app/core/data_parse/parse_resume.py
@@ -13,11 +13,52 @@ import base64
 
															 from typing import Dict, Any, Optional, List
														
 
															 import PyPDF2
														
 
															 from openai import OpenAI
														
 
															+import boto3
														
 
															+from botocore.config import Config
														
 
															 from app.config.config import DevelopmentConfig, ProductionConfig
														
 
															 # 使用配置变量
														
 
															 config = ProductionConfig()
														
 
															+# MinIO 配置
														
 
															+minio_url = f"{'https' if config.MINIO_SECURE else 'http'}://{config.MINIO_HOST}"
														
 
															+minio_access_key = config.MINIO_USER
														
 
															+minio_secret_key = config.MINIO_PASSWORD
														
 
															+minio_bucket = config.MINIO_BUCKET
														
 
															+
														
 
															+
														
 
															+def get_minio_client():
														
 
															+    """获取MinIO客户端连接"""
														
 
															+    try:
														
 
															+        logging.info(f"尝试连接MinIO服务器: {minio_url}")
														
 
															+        
														
 
															+        minio_client = boto3.client(
														
 
															+            's3',
														
 
															+            endpoint_url=minio_url,
														
 
															+            aws_access_key_id=minio_access_key,
														
 
															+            aws_secret_access_key=minio_secret_key,
														
 
															+            config=Config(
														
 
															+                signature_version='s3v4',
														
 
															+                retries={'max_attempts': 3, 'mode': 'standard'},
														
 
															+                connect_timeout=10,
														
 
															+                read_timeout=30
														
 
															+            )
														
 
															+        )
														
 
															+        
														
 
															+        # 确保存储桶存在
														
 
															+        buckets = minio_client.list_buckets()
														
 
															+        bucket_names = [bucket['Name'] for bucket in buckets.get('Buckets', [])]
														
 
															+        logging.info(f"成功连接到MinIO服务器，现有存储桶: {bucket_names}")
														
 
															+        
														
 
															+        if minio_bucket not in bucket_names:
														
 
															+            logging.info(f"创建存储桶: {minio_bucket}")
														
 
															+            minio_client.create_bucket(Bucket=minio_bucket)
														
 
															+            
														
 
															+        return minio_client
														
 
															+    except Exception as e:
														
 
															+        logging.error(f"MinIO连接错误: {str(e)}")
														
 
															+        return None
														
 
															+
														
 
															 def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
														
 
															     """
														
@@ -202,10 +243,10 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
 
															 def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
														
 
															     """
														
 
															-    解析简历文件
														
 
															+    解析简历文件，支持本地路径和MinIO URL
														
 
															     Args:
														
 
															-        file_path (str): 简历文件路径
														
 
															+        file_path (str): 简历文件路径或MinIO URL
														
 
															         task_id (str, optional): 关联的任务ID
														
 
															     Returns:
														
@@ -214,22 +255,41 @@ def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str
 
															     try:
														
 
															         logging.info(f"开始解析简历文件: {file_path}")
														
 
															-        # 检查文件是否存在
														
 
															-        if not os.path.exists(file_path):
														
 
															-            return {
														
 
															-                'success': False,
														
 
															-                'error': f'文件不存在: {file_path}',
														
 
															-                'data': None
														
 
															-            }
														
 
															-        
														
 
															-        # 检查文件格式
														
 
															-        file_ext = os.path.splitext(file_path)[1].lower()
														
 
															-        if file_ext != '.pdf':
														
 
															-            return {
														
 
															-                'success': False,
														
 
															-                'error': f'不支持的文件格式: {file_ext}，仅支持PDF格式',
														
 
															-                'data': None
														
 
															-            }
														
 
															+        # 验证文件格式和存在性
														
 
															+        if not validate_resume_format(file_path):
														
 
															+            # 检查是否是MinIO URL
														
 
															+            if file_path.startswith('http://') or file_path.startswith('https://'):
														
 
															+                from urllib.parse import urlparse
														
 
															+                parsed_url = urlparse(file_path)
														
 
															+                file_ext = os.path.splitext(parsed_url.path)[1].lower()
														
 
															+                if file_ext != '.pdf':
														
 
															+                    return {
														
 
															+                        'success': False,
														
 
															+                        'error': f'不支持的文件格式: {file_ext}，仅支持PDF格式',
														
 
															+                        'data': None
														
 
															+                    }
														
 
															+                else:
														
 
															+                    return {
														
 
															+                        'success': False,
														
 
															+                        'error': f'文件不存在: {file_path}',
														
 
															+                        'data': None
														
 
															+                    }
														
 
															+            else:
														
 
															+                # 本地文件路径
														
 
															+                if not os.path.exists(file_path):
														
 
															+                    return {
														
 
															+                        'success': False,
														
 
															+                        'error': f'文件不存在: {file_path}',
														
 
															+                        'data': None
														
 
															+                    }
														
 
															+                
														
 
															+                file_ext = os.path.splitext(file_path)[1].lower()
														
 
															+                if file_ext != '.pdf':
														
 
															+                    return {
														
 
															+                        'success': False,
														
 
															+                        'error': f'不支持的文件格式: {file_ext}，仅支持PDF格式',
														
 
															+                        'data': None
														
 
															+                    }
														
 
															         # 步骤1: 提取PDF文本内容
														
 
															         logging.info("开始提取PDF文本内容")
														
@@ -267,12 +327,36 @@ def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str
 
															             }
														
 
															         # 步骤3: 构建完整的解析结果
														
 
															+        # 获取文件大小
														
 
															+        file_size = 0
														
 
															+        try:
														
 
															+            if file_path.startswith('http://') or file_path.startswith('https://'):
														
 
															+                # 对于MinIO URL，从extract_resume_text的结果中获取文件大小
														
 
															+                # 或者重新获取（这里我们使用一个简化的方法）
														
 
															+                from urllib.parse import urlparse
														
 
															+                
														
 
															+                minio_client = get_minio_client()
														
 
															+                if minio_client:
														
 
															+                    parsed_url = urlparse(file_path)
														
 
															+                    path_parts = parsed_url.path.strip('/').split('/', 1)
														
 
															+                    if len(path_parts) >= 2:
														
 
															+                        object_key = path_parts[1]
														
 
															+                        try:
														
 
															+                            response = minio_client.head_object(Bucket=minio_bucket, Key=object_key)
														
 
															+                            file_size = response.get('ContentLength', 0)
														
 
															+                        except Exception:
														
 
															+                            file_size = 0
														
 
															+            else:
														
 
															+                file_size = os.path.getsize(file_path)
														
 
															+        except Exception:
														
 
															+            file_size = 0
														
 
															+        
														
 
															         parse_result = {
														
 
															             **parsed_data,  # 包含所有千问解析的结果
														
 
															             'parse_time': datetime.now().isoformat(),
														
 
															             'file_info': {
														
 
															                 'original_path': file_path,
														
 
															-                'file_size': os.path.getsize(file_path),
														
 
															+                'file_size': file_size,
														
 
															                 'file_type': 'pdf',
														
 
															                 'page_count': page_count,
														
 
															                 'text_length': len(resume_text)
														
@@ -306,10 +390,10 @@ def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str
 
															 def extract_resume_text(file_path: str) -> Dict[str, Any]:
														
 
															     """
														
 
															-    提取简历文本内容
														
 
															+    提取简历文本内容，支持本地路径和MinIO URL
														
 
															     Args:
														
 
															-        file_path (str): 简历文件路径
														
 
															+        file_path (str): 简历文件路径或MinIO URL
														
 
															     Returns:
														
 
															         Dict[str, Any]: 提取结果
														
@@ -320,21 +404,79 @@ def extract_resume_text(file_path: str) -> Dict[str, Any]:
 
															         text_content = ""
														
 
															         page_count = 0
														
 
															-        # 使用PyPDF2提取PDF文本
														
 
															-        with open(file_path, 'rb') as file:
														
 
															-            pdf_reader = PyPDF2.PdfReader(file)
														
 
															-            page_count = len(pdf_reader.pages)
														
 
															+        # 检查是否是MinIO URL
														
 
															+        if file_path.startswith('http://') or file_path.startswith('https://'):
														
 
															+            # 处理MinIO URL
														
 
															+            from urllib.parse import urlparse
														
 
															+            from io import BytesIO
														
 
															-            for page_num, page in enumerate(pdf_reader.pages):
														
 
															-                try:
														
 
															-                    page_text = page.extract_text()
														
 
															-                    if page_text:
														
 
															-                        text_content += f"\n=== 第{page_num + 1}页 ===\n{page_text}\n"
														
 
															-                    else:
														
 
															-                        logging.warning(f"第{page_num + 1}页无法提取文本")
														
 
															-                except Exception as e:
														
 
															-                    logging.warning(f"提取第{page_num + 1}页文本失败: {str(e)}")
														
 
															-                    continue
														
 
															+            # 获取MinIO客户端
														
 
															+            minio_client = get_minio_client()
														
 
															+            if not minio_client:
														
 
															+                return {
														
 
															+                    'success': False,
														
 
															+                    'error': '无法连接到MinIO服务器',
														
 
															+                    'text_content': None,
														
 
															+                    'page_count': 0
														
 
															+                }
														
 
															+            
														
 
															+            # 提取对象键
														
 
															+            parsed_url = urlparse(file_path)
														
 
															+            path_parts = parsed_url.path.strip('/').split('/', 1)
														
 
															+            if len(path_parts) < 2:
														
 
															+                return {
														
 
															+                    'success': False,
														
 
															+                    'error': f'无效的MinIO URL格式: {file_path}',
														
 
															+                    'text_content': None,
														
 
															+                    'page_count': 0
														
 
															+                }
														
 
															+            
														
 
															+            object_key = path_parts[1]  # 跳过bucket名称
														
 
															+            
														
 
															+            # 从MinIO获取PDF数据
														
 
															+            try:
														
 
															+                response = minio_client.get_object(Bucket=minio_bucket, Key=object_key)
														
 
															+                pdf_data = response['Body'].read()
														
 
															+                
														
 
															+                # 使用PyPDF2提取PDF文本
														
 
															+                pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_data))
														
 
															+                page_count = len(pdf_reader.pages)
														
 
															+                
														
 
															+                for page_num, page in enumerate(pdf_reader.pages):
														
 
															+                    try:
														
 
															+                        page_text = page.extract_text()
														
 
															+                        if page_text:
														
 
															+                            text_content += f"\n=== 第{page_num + 1}页 ===\n{page_text}\n"
														
 
															+                        else:
														
 
															+                            logging.warning(f"第{page_num + 1}页无法提取文本")
														
 
															+                    except Exception as e:
														
 
															+                        logging.warning(f"提取第{page_num + 1}页文本失败: {str(e)}")
														
 
															+                        continue
														
 
															+                        
														
 
															+            except Exception as minio_error:
														
 
															+                return {
														
 
															+                    'success': False,
														
 
															+                    'error': f'从MinIO获取PDF失败: {str(minio_error)}',
														
 
															+                    'text_content': None,
														
 
															+                    'page_count': 0
														
 
															+                }
														
 
															+        else:
														
 
															+            # 处理本地文件路径
														
 
															+            # 使用PyPDF2提取PDF文本
														
 
															+            with open(file_path, 'rb') as file:
														
 
															+                pdf_reader = PyPDF2.PdfReader(file)
														
 
															+                page_count = len(pdf_reader.pages)
														
 
															+                
														
 
															+                for page_num, page in enumerate(pdf_reader.pages):
														
 
															+                    try:
														
 
															+                        page_text = page.extract_text()
														
 
															+                        if page_text:
														
 
															+                            text_content += f"\n=== 第{page_num + 1}页 ===\n{page_text}\n"
														
 
															+                        else:
														
 
															+                            logging.warning(f"第{page_num + 1}页无法提取文本")
														
 
															+                    except Exception as e:
														
 
															+                        logging.warning(f"提取第{page_num + 1}页文本失败: {str(e)}")
														
 
															+                        continue
														
 
															         # 清理文本内容
														
 
															         text_content = text_content.strip()
														
@@ -367,22 +509,76 @@ def extract_resume_text(file_path: str) -> Dict[str, Any]:
 
															         }
														
 
															+def _get_filename_from_path(file_path: str) -> str:
														
 
															+    """
														
 
															+    从文件路径或MinIO URL中提取文件名
														
 
															+    
														
 
															+    Args:
														
 
															+        file_path (str): 文件路径或MinIO URL
														
 
															+        
														
 
															+    Returns:
														
 
															+        str: 文件名
														
 
															+    """
														
 
															+    try:
														
 
															+        if file_path.startswith('http://') or file_path.startswith('https://'):
														
 
															+            # 从MinIO URL中提取文件名
														
 
															+            from urllib.parse import urlparse
														
 
															+            parsed_url = urlparse(file_path)
														
 
															+            return os.path.basename(parsed_url.path)
														
 
															+        else:
														
 
															+            # 从本地路径中提取文件名
														
 
															+            return os.path.basename(file_path)
														
 
															+    except Exception:
														
 
															+        return 'unknown_file.pdf'
														
 
															+
														
 
															+
														
 
															 def validate_resume_format(file_path: str) -> bool:
														
 
															     """
														
 
															-    验证简历文件格式
														
 
															+    验证简历文件格式，支持本地路径和MinIO URL
														
 
															     Args:
														
 
															-        file_path (str): 文件路径
														
 
															+        file_path (str): 文件路径或MinIO URL
														
 
															     Returns:
														
 
															         bool: 是否为有效的简历格式
														
 
															     """
														
 
															     try:
														
 
															-        if not os.path.exists(file_path):
														
 
															-            return False
														
 
															+        # 检查是否是MinIO URL
														
 
															+        if file_path.startswith('http://') or file_path.startswith('https://'):
														
 
															+            # 处理MinIO URL
														
 
															+            from urllib.parse import urlparse
														
 
															-        file_ext = os.path.splitext(file_path)[1].lower()
														
 
															-        return file_ext == '.pdf'
														
 
															+            # 从URL提取文件扩展名
														
 
															+            parsed_url = urlparse(file_path)
														
 
															+            file_ext = os.path.splitext(parsed_url.path)[1].lower()
														
 
															+            if file_ext != '.pdf':
														
 
															+                return False
														
 
															+            
														
 
															+            # 验证文件是否存在于MinIO中
														
 
															+            try:
														
 
															+                minio_client = get_minio_client()
														
 
															+                if not minio_client:
														
 
															+                    return False
														
 
															+                
														
 
															+                # 提取对象键
														
 
															+                path_parts = parsed_url.path.strip('/').split('/', 1)
														
 
															+                if len(path_parts) < 2:
														
 
															+                    return False
														
 
															+                
														
 
															+                object_key = path_parts[1]  # 跳过bucket名称
														
 
															+                
														
 
															+                # 检查文件是否存在
														
 
															+                response = minio_client.head_object(Bucket=minio_bucket, Key=object_key)
														
 
															+                return True
														
 
															+            except Exception:
														
 
															+                return False
														
 
															+        else:
														
 
															+            # 处理本地文件路径
														
 
															+            if not os.path.exists(file_path):
														
 
															+                return False
														
 
															+                
														
 
															+            file_ext = os.path.splitext(file_path)[1].lower()
														
 
															+            return file_ext == '.pdf'
														
 
															     except Exception as e:
														
 
															         logging.error(f"验证简历格式失败: {str(e)}")
														
@@ -470,24 +666,24 @@ def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
 
															                     results.append({
														
 
															                         "data": standardized_data,
														
 
															                         "error": None,
														
 
															-                        "filename": os.path.basename(file_path) if file_path else f'resume_{i}.pdf',
														
 
															+                        "filename": _get_filename_from_path(file_path) if file_path else f'resume_{i}.pdf',
														
 
															                         "index": i,
														
 
															                         "message": "简历文件解析成功",
														
 
															-                        "minio_path": f"resume_files/{os.path.basename(file_path)}" if file_path else '',
														
 
															-                        "object_key": f"resume_files/{os.path.basename(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
														
 
															+                        "minio_path": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else '',
														
 
															+                        "object_key": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
														
 
															                         "success": True
														
 
															                     })
														
 
															-                    logging.info(f"成功处理第 {i+1} 个文件: {os.path.basename(file_path)}")
														
 
															+                    logging.info(f"成功处理第 {i+1} 个文件: {_get_filename_from_path(file_path)}")
														
 
															                 else:
														
 
															                     failed_count += 1
														
 
															                     results.append({
														
 
															                         "data": None,
														
 
															                         "error": result.get('error', '处理失败'),
														
 
															-                        "filename": os.path.basename(file_path) if file_path else f'resume_{i}.pdf',
														
 
															+                        "filename": _get_filename_from_path(file_path) if file_path else f'resume_{i}.pdf',
														
 
															                         "index": i,
														
 
															                         "message": "简历文件解析失败",
														
 
															-                        "minio_path": f"resume_files/{os.path.basename(file_path)}" if file_path else '',
														
 
															-                        "object_key": f"resume_files/{os.path.basename(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
														
 
															+                        "minio_path": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else '',
														
 
															+                        "object_key": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
														
 
															                         "success": False
														
 
															                     })
														
 
															                     logging.error(f"处理第 {i+1} 个文件失败: {result.get('error', '未知错误')}")
														
@@ -499,11 +695,11 @@ def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
 
															                 results.append({
														
 
															                     "data": None,
														
 
															                     "error": error_msg,
														
 
															-                    "filename": os.path.basename(file_path) if file_path else f'resume_{i}.pdf',
														
 
															+                    "filename": _get_filename_from_path(file_path) if file_path else f'resume_{i}.pdf',
														
 
															                     "index": i,
														
 
															                     "message": "简历文件解析失败",
														
 
															-                    "minio_path": f"resume_files/{os.path.basename(file_path)}" if file_path else '',
														
 
															-                    "object_key": f"resume_files/{os.path.basename(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
														
 
															+                    "minio_path": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else '',
														
 
															+                    "object_key": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
														
 
															                     "success": False
														
 
															                 })