|
@@ -13,11 +13,52 @@ import base64
|
|
from typing import Dict, Any, Optional, List
|
|
from typing import Dict, Any, Optional, List
|
|
import PyPDF2
|
|
import PyPDF2
|
|
from openai import OpenAI
|
|
from openai import OpenAI
|
|
|
|
+import boto3
|
|
|
|
+from botocore.config import Config
|
|
from app.config.config import DevelopmentConfig, ProductionConfig
|
|
from app.config.config import DevelopmentConfig, ProductionConfig
|
|
|
|
|
|
# 使用配置变量
|
|
# 使用配置变量
|
|
config = ProductionConfig()
|
|
config = ProductionConfig()
|
|
|
|
|
|
|
|
# MinIO connection settings, read once at import time from the active config
# object. The endpoint scheme follows the MINIO_SECURE flag.
minio_url = "{}://{}".format('https' if config.MINIO_SECURE else 'http', config.MINIO_HOST)
minio_access_key, minio_secret_key = config.MINIO_USER, config.MINIO_PASSWORD
minio_bucket = config.MINIO_BUCKET
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_minio_client():
    """Create a boto3 S3 client for the configured MinIO endpoint.

    The client is built from the module-level ``minio_url`` and credential
    settings, using SigV4 signing, up to 3 attempts in "standard" retry mode,
    a 10 second connect timeout and a 30 second read timeout. After the
    client is created the existing buckets are listed, and the bucket named
    by ``minio_bucket`` is created when it is not among them.

    Returns:
        The boto3 S3 client, or ``None`` if any step raised an exception
        (the error is logged rather than propagated).
    """
    try:
        logging.info(f"尝试连接MinIO服务器: {minio_url}")

        client_options = Config(
            signature_version='s3v4',
            retries={'max_attempts': 3, 'mode': 'standard'},
            connect_timeout=10,
            read_timeout=30,
        )
        s3 = boto3.client(
            's3',
            endpoint_url=minio_url,
            aws_access_key_id=minio_access_key,
            aws_secret_access_key=minio_secret_key,
            config=client_options,
        )

        # Listing buckets doubles as the connectivity check.
        listing = s3.list_buckets()
        existing = [entry['Name'] for entry in listing.get('Buckets', [])]
        logging.info(f"成功连接到MinIO服务器,现有存储桶: {existing}")

        # Make sure the target bucket exists before handing the client out.
        if minio_bucket not in existing:
            logging.info(f"创建存储桶: {minio_bucket}")
            s3.create_bucket(Bucket=minio_bucket)

        return s3
    except Exception as exc:
        # Best-effort: callers test the return value for None.
        logging.error(f"MinIO连接错误: {str(exc)}")
        return None
|
|
|
|
+
|
|
|
|
|
|
def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
|
|
def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
|
|
"""
|
|
"""
|
|
@@ -202,10 +243,10 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
|
|
|
|
|
|
def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
|
|
def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
|
|
"""
|
|
"""
|
|
- 解析简历文件
|
|
|
|
|
|
+ 解析简历文件,支持本地路径和MinIO URL
|
|
|
|
|
|
Args:
|
|
Args:
|
|
- file_path (str): 简历文件路径
|
|
|
|
|
|
+ file_path (str): 简历文件路径或MinIO URL
|
|
task_id (str, optional): 关联的任务ID
|
|
task_id (str, optional): 关联的任务ID
|
|
|
|
|
|
Returns:
|
|
Returns:
|
|
@@ -214,22 +255,41 @@ def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str
|
|
try:
|
|
try:
|
|
logging.info(f"开始解析简历文件: {file_path}")
|
|
logging.info(f"开始解析简历文件: {file_path}")
|
|
|
|
|
|
- # 检查文件是否存在
|
|
|
|
- if not os.path.exists(file_path):
|
|
|
|
- return {
|
|
|
|
- 'success': False,
|
|
|
|
- 'error': f'文件不存在: {file_path}',
|
|
|
|
- 'data': None
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- # 检查文件格式
|
|
|
|
- file_ext = os.path.splitext(file_path)[1].lower()
|
|
|
|
- if file_ext != '.pdf':
|
|
|
|
- return {
|
|
|
|
- 'success': False,
|
|
|
|
- 'error': f'不支持的文件格式: {file_ext},仅支持PDF格式',
|
|
|
|
- 'data': None
|
|
|
|
- }
|
|
|
|
|
|
+ # 验证文件格式和存在性
|
|
|
|
+ if not validate_resume_format(file_path):
|
|
|
|
+ # 检查是否是MinIO URL
|
|
|
|
+ if file_path.startswith('http://') or file_path.startswith('https://'):
|
|
|
|
+ from urllib.parse import urlparse
|
|
|
|
+ parsed_url = urlparse(file_path)
|
|
|
|
+ file_ext = os.path.splitext(parsed_url.path)[1].lower()
|
|
|
|
+ if file_ext != '.pdf':
|
|
|
|
+ return {
|
|
|
|
+ 'success': False,
|
|
|
|
+ 'error': f'不支持的文件格式: {file_ext},仅支持PDF格式',
|
|
|
|
+ 'data': None
|
|
|
|
+ }
|
|
|
|
+ else:
|
|
|
|
+ return {
|
|
|
|
+ 'success': False,
|
|
|
|
+ 'error': f'文件不存在: {file_path}',
|
|
|
|
+ 'data': None
|
|
|
|
+ }
|
|
|
|
+ else:
|
|
|
|
+ # 本地文件路径
|
|
|
|
+ if not os.path.exists(file_path):
|
|
|
|
+ return {
|
|
|
|
+ 'success': False,
|
|
|
|
+ 'error': f'文件不存在: {file_path}',
|
|
|
|
+ 'data': None
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ file_ext = os.path.splitext(file_path)[1].lower()
|
|
|
|
+ if file_ext != '.pdf':
|
|
|
|
+ return {
|
|
|
|
+ 'success': False,
|
|
|
|
+ 'error': f'不支持的文件格式: {file_ext},仅支持PDF格式',
|
|
|
|
+ 'data': None
|
|
|
|
+ }
|
|
|
|
|
|
# 步骤1: 提取PDF文本内容
|
|
# 步骤1: 提取PDF文本内容
|
|
logging.info("开始提取PDF文本内容")
|
|
logging.info("开始提取PDF文本内容")
|
|
@@ -267,12 +327,36 @@ def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str
|
|
}
|
|
}
|
|
|
|
|
|
# 步骤3: 构建完整的解析结果
|
|
# 步骤3: 构建完整的解析结果
|
|
|
|
+ # 获取文件大小
|
|
|
|
+ file_size = 0
|
|
|
|
+ try:
|
|
|
|
+ if file_path.startswith('http://') or file_path.startswith('https://'):
|
|
|
|
+ # 对于MinIO URL,从extract_resume_text的结果中获取文件大小
|
|
|
|
+ # 或者重新获取(这里我们使用一个简化的方法)
|
|
|
|
+ from urllib.parse import urlparse
|
|
|
|
+
|
|
|
|
+ minio_client = get_minio_client()
|
|
|
|
+ if minio_client:
|
|
|
|
+ parsed_url = urlparse(file_path)
|
|
|
|
+ path_parts = parsed_url.path.strip('/').split('/', 1)
|
|
|
|
+ if len(path_parts) >= 2:
|
|
|
|
+ object_key = path_parts[1]
|
|
|
|
+ try:
|
|
|
|
+ response = minio_client.head_object(Bucket=minio_bucket, Key=object_key)
|
|
|
|
+ file_size = response.get('ContentLength', 0)
|
|
|
|
+ except Exception:
|
|
|
|
+ file_size = 0
|
|
|
|
+ else:
|
|
|
|
+ file_size = os.path.getsize(file_path)
|
|
|
|
+ except Exception:
|
|
|
|
+ file_size = 0
|
|
|
|
+
|
|
parse_result = {
|
|
parse_result = {
|
|
**parsed_data, # 包含所有千问解析的结果
|
|
**parsed_data, # 包含所有千问解析的结果
|
|
'parse_time': datetime.now().isoformat(),
|
|
'parse_time': datetime.now().isoformat(),
|
|
'file_info': {
|
|
'file_info': {
|
|
'original_path': file_path,
|
|
'original_path': file_path,
|
|
- 'file_size': os.path.getsize(file_path),
|
|
|
|
|
|
+ 'file_size': file_size,
|
|
'file_type': 'pdf',
|
|
'file_type': 'pdf',
|
|
'page_count': page_count,
|
|
'page_count': page_count,
|
|
'text_length': len(resume_text)
|
|
'text_length': len(resume_text)
|
|
@@ -306,10 +390,10 @@ def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str
|
|
|
|
|
|
def extract_resume_text(file_path: str) -> Dict[str, Any]:
|
|
def extract_resume_text(file_path: str) -> Dict[str, Any]:
|
|
"""
|
|
"""
|
|
- 提取简历文本内容
|
|
|
|
|
|
+ 提取简历文本内容,支持本地路径和MinIO URL
|
|
|
|
|
|
Args:
|
|
Args:
|
|
- file_path (str): 简历文件路径
|
|
|
|
|
|
+ file_path (str): 简历文件路径或MinIO URL
|
|
|
|
|
|
Returns:
|
|
Returns:
|
|
Dict[str, Any]: 提取结果
|
|
Dict[str, Any]: 提取结果
|
|
@@ -320,21 +404,79 @@ def extract_resume_text(file_path: str) -> Dict[str, Any]:
|
|
text_content = ""
|
|
text_content = ""
|
|
page_count = 0
|
|
page_count = 0
|
|
|
|
|
|
- # 使用PyPDF2提取PDF文本
|
|
|
|
- with open(file_path, 'rb') as file:
|
|
|
|
- pdf_reader = PyPDF2.PdfReader(file)
|
|
|
|
- page_count = len(pdf_reader.pages)
|
|
|
|
|
|
+ # 检查是否是MinIO URL
|
|
|
|
+ if file_path.startswith('http://') or file_path.startswith('https://'):
|
|
|
|
+ # 处理MinIO URL
|
|
|
|
+ from urllib.parse import urlparse
|
|
|
|
+ from io import BytesIO
|
|
|
|
|
|
- for page_num, page in enumerate(pdf_reader.pages):
|
|
|
|
- try:
|
|
|
|
- page_text = page.extract_text()
|
|
|
|
- if page_text:
|
|
|
|
- text_content += f"\n=== 第{page_num + 1}页 ===\n{page_text}\n"
|
|
|
|
- else:
|
|
|
|
- logging.warning(f"第{page_num + 1}页无法提取文本")
|
|
|
|
- except Exception as e:
|
|
|
|
- logging.warning(f"提取第{page_num + 1}页文本失败: {str(e)}")
|
|
|
|
- continue
|
|
|
|
|
|
+ # 获取MinIO客户端
|
|
|
|
+ minio_client = get_minio_client()
|
|
|
|
+ if not minio_client:
|
|
|
|
+ return {
|
|
|
|
+ 'success': False,
|
|
|
|
+ 'error': '无法连接到MinIO服务器',
|
|
|
|
+ 'text_content': None,
|
|
|
|
+ 'page_count': 0
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ # 提取对象键
|
|
|
|
+ parsed_url = urlparse(file_path)
|
|
|
|
+ path_parts = parsed_url.path.strip('/').split('/', 1)
|
|
|
|
+ if len(path_parts) < 2:
|
|
|
|
+ return {
|
|
|
|
+ 'success': False,
|
|
|
|
+ 'error': f'无效的MinIO URL格式: {file_path}',
|
|
|
|
+ 'text_content': None,
|
|
|
|
+ 'page_count': 0
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ object_key = path_parts[1] # 跳过bucket名称
|
|
|
|
+
|
|
|
|
+ # 从MinIO获取PDF数据
|
|
|
|
+ try:
|
|
|
|
+ response = minio_client.get_object(Bucket=minio_bucket, Key=object_key)
|
|
|
|
+ pdf_data = response['Body'].read()
|
|
|
|
+
|
|
|
|
+ # 使用PyPDF2提取PDF文本
|
|
|
|
+ pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_data))
|
|
|
|
+ page_count = len(pdf_reader.pages)
|
|
|
|
+
|
|
|
|
+ for page_num, page in enumerate(pdf_reader.pages):
|
|
|
|
+ try:
|
|
|
|
+ page_text = page.extract_text()
|
|
|
|
+ if page_text:
|
|
|
|
+ text_content += f"\n=== 第{page_num + 1}页 ===\n{page_text}\n"
|
|
|
|
+ else:
|
|
|
|
+ logging.warning(f"第{page_num + 1}页无法提取文本")
|
|
|
|
+ except Exception as e:
|
|
|
|
+ logging.warning(f"提取第{page_num + 1}页文本失败: {str(e)}")
|
|
|
|
+ continue
|
|
|
|
+
|
|
|
|
+ except Exception as minio_error:
|
|
|
|
+ return {
|
|
|
|
+ 'success': False,
|
|
|
|
+ 'error': f'从MinIO获取PDF失败: {str(minio_error)}',
|
|
|
|
+ 'text_content': None,
|
|
|
|
+ 'page_count': 0
|
|
|
|
+ }
|
|
|
|
+ else:
|
|
|
|
+ # 处理本地文件路径
|
|
|
|
+ # 使用PyPDF2提取PDF文本
|
|
|
|
+ with open(file_path, 'rb') as file:
|
|
|
|
+ pdf_reader = PyPDF2.PdfReader(file)
|
|
|
|
+ page_count = len(pdf_reader.pages)
|
|
|
|
+
|
|
|
|
+ for page_num, page in enumerate(pdf_reader.pages):
|
|
|
|
+ try:
|
|
|
|
+ page_text = page.extract_text()
|
|
|
|
+ if page_text:
|
|
|
|
+ text_content += f"\n=== 第{page_num + 1}页 ===\n{page_text}\n"
|
|
|
|
+ else:
|
|
|
|
+ logging.warning(f"第{page_num + 1}页无法提取文本")
|
|
|
|
+ except Exception as e:
|
|
|
|
+ logging.warning(f"提取第{page_num + 1}页文本失败: {str(e)}")
|
|
|
|
+ continue
|
|
|
|
|
|
# 清理文本内容
|
|
# 清理文本内容
|
|
text_content = text_content.strip()
|
|
text_content = text_content.strip()
|
|
@@ -367,22 +509,76 @@ def extract_resume_text(file_path: str) -> Dict[str, Any]:
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
+def _get_filename_from_path(file_path: str) -> str:
|
|
|
|
+ """
|
|
|
|
+ 从文件路径或MinIO URL中提取文件名
|
|
|
|
+
|
|
|
|
+ Args:
|
|
|
|
+ file_path (str): 文件路径或MinIO URL
|
|
|
|
+
|
|
|
|
+ Returns:
|
|
|
|
+ str: 文件名
|
|
|
|
+ """
|
|
|
|
+ try:
|
|
|
|
+ if file_path.startswith('http://') or file_path.startswith('https://'):
|
|
|
|
+ # 从MinIO URL中提取文件名
|
|
|
|
+ from urllib.parse import urlparse
|
|
|
|
+ parsed_url = urlparse(file_path)
|
|
|
|
+ return os.path.basename(parsed_url.path)
|
|
|
|
+ else:
|
|
|
|
+ # 从本地路径中提取文件名
|
|
|
|
+ return os.path.basename(file_path)
|
|
|
|
+ except Exception:
|
|
|
|
+ return 'unknown_file.pdf'
|
|
|
|
+
|
|
|
|
+
|
|
def validate_resume_format(file_path: str) -> bool:
|
|
def validate_resume_format(file_path: str) -> bool:
|
|
"""
|
|
"""
|
|
- 验证简历文件格式
|
|
|
|
|
|
+ 验证简历文件格式,支持本地路径和MinIO URL
|
|
|
|
|
|
Args:
|
|
Args:
|
|
- file_path (str): 文件路径
|
|
|
|
|
|
+ file_path (str): 文件路径或MinIO URL
|
|
|
|
|
|
Returns:
|
|
Returns:
|
|
bool: 是否为有效的简历格式
|
|
bool: 是否为有效的简历格式
|
|
"""
|
|
"""
|
|
try:
|
|
try:
|
|
- if not os.path.exists(file_path):
|
|
|
|
- return False
|
|
|
|
|
|
+ # 检查是否是MinIO URL
|
|
|
|
+ if file_path.startswith('http://') or file_path.startswith('https://'):
|
|
|
|
+ # 处理MinIO URL
|
|
|
|
+ from urllib.parse import urlparse
|
|
|
|
|
|
- file_ext = os.path.splitext(file_path)[1].lower()
|
|
|
|
- return file_ext == '.pdf'
|
|
|
|
|
|
+ # 从URL提取文件扩展名
|
|
|
|
+ parsed_url = urlparse(file_path)
|
|
|
|
+ file_ext = os.path.splitext(parsed_url.path)[1].lower()
|
|
|
|
+ if file_ext != '.pdf':
|
|
|
|
+ return False
|
|
|
|
+
|
|
|
|
+ # 验证文件是否存在于MinIO中
|
|
|
|
+ try:
|
|
|
|
+ minio_client = get_minio_client()
|
|
|
|
+ if not minio_client:
|
|
|
|
+ return False
|
|
|
|
+
|
|
|
|
+ # 提取对象键
|
|
|
|
+ path_parts = parsed_url.path.strip('/').split('/', 1)
|
|
|
|
+ if len(path_parts) < 2:
|
|
|
|
+ return False
|
|
|
|
+
|
|
|
|
+ object_key = path_parts[1] # 跳过bucket名称
|
|
|
|
+
|
|
|
|
+ # 检查文件是否存在
|
|
|
|
+ response = minio_client.head_object(Bucket=minio_bucket, Key=object_key)
|
|
|
|
+ return True
|
|
|
|
+ except Exception:
|
|
|
|
+ return False
|
|
|
|
+ else:
|
|
|
|
+ # 处理本地文件路径
|
|
|
|
+ if not os.path.exists(file_path):
|
|
|
|
+ return False
|
|
|
|
+
|
|
|
|
+ file_ext = os.path.splitext(file_path)[1].lower()
|
|
|
|
+ return file_ext == '.pdf'
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
logging.error(f"验证简历格式失败: {str(e)}")
|
|
logging.error(f"验证简历格式失败: {str(e)}")
|
|
@@ -470,24 +666,24 @@ def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
|
|
results.append({
|
|
results.append({
|
|
"data": standardized_data,
|
|
"data": standardized_data,
|
|
"error": None,
|
|
"error": None,
|
|
- "filename": os.path.basename(file_path) if file_path else f'resume_{i}.pdf',
|
|
|
|
|
|
+ "filename": _get_filename_from_path(file_path) if file_path else f'resume_{i}.pdf',
|
|
"index": i,
|
|
"index": i,
|
|
"message": "简历文件解析成功",
|
|
"message": "简历文件解析成功",
|
|
- "minio_path": f"resume_files/{os.path.basename(file_path)}" if file_path else '',
|
|
|
|
- "object_key": f"resume_files/{os.path.basename(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
|
|
|
|
|
|
+ "minio_path": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else '',
|
|
|
|
+ "object_key": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
|
|
"success": True
|
|
"success": True
|
|
})
|
|
})
|
|
- logging.info(f"成功处理第 {i+1} 个文件: {os.path.basename(file_path)}")
|
|
|
|
|
|
+ logging.info(f"成功处理第 {i+1} 个文件: {_get_filename_from_path(file_path)}")
|
|
else:
|
|
else:
|
|
failed_count += 1
|
|
failed_count += 1
|
|
results.append({
|
|
results.append({
|
|
"data": None,
|
|
"data": None,
|
|
"error": result.get('error', '处理失败'),
|
|
"error": result.get('error', '处理失败'),
|
|
- "filename": os.path.basename(file_path) if file_path else f'resume_{i}.pdf',
|
|
|
|
|
|
+ "filename": _get_filename_from_path(file_path) if file_path else f'resume_{i}.pdf',
|
|
"index": i,
|
|
"index": i,
|
|
"message": "简历文件解析失败",
|
|
"message": "简历文件解析失败",
|
|
- "minio_path": f"resume_files/{os.path.basename(file_path)}" if file_path else '',
|
|
|
|
- "object_key": f"resume_files/{os.path.basename(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
|
|
|
|
|
|
+ "minio_path": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else '',
|
|
|
|
+ "object_key": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
|
|
"success": False
|
|
"success": False
|
|
})
|
|
})
|
|
logging.error(f"处理第 {i+1} 个文件失败: {result.get('error', '未知错误')}")
|
|
logging.error(f"处理第 {i+1} 个文件失败: {result.get('error', '未知错误')}")
|
|
@@ -499,11 +695,11 @@ def batch_parse_resumes(file_paths: List[str]) -> Dict[str, Any]:
|
|
results.append({
|
|
results.append({
|
|
"data": None,
|
|
"data": None,
|
|
"error": error_msg,
|
|
"error": error_msg,
|
|
- "filename": os.path.basename(file_path) if file_path else f'resume_{i}.pdf',
|
|
|
|
|
|
+ "filename": _get_filename_from_path(file_path) if file_path else f'resume_{i}.pdf',
|
|
"index": i,
|
|
"index": i,
|
|
"message": "简历文件解析失败",
|
|
"message": "简历文件解析失败",
|
|
- "minio_path": f"resume_files/{os.path.basename(file_path)}" if file_path else '',
|
|
|
|
- "object_key": f"resume_files/{os.path.basename(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
|
|
|
|
|
|
+ "minio_path": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else '',
|
|
|
|
+ "object_key": f"resume_files/{_get_filename_from_path(file_path)}" if file_path else f'resume_files/file_{i}.pdf',
|
|
"success": False
|
|
"success": False
|
|
})
|
|
})
|
|
|
|
|