@@ -60,6 +60,73 @@ def get_minio_client():
     return None


+def format_date_to_yyyy_mm_dd(raw_date):
+    """
+    Convert assorted date formats to YYYY-MM-DD.
+
+    Args:
+        raw_date (str): raw date string
+
+    Returns:
+        str: formatted date string in YYYY-MM-DD form
+    """
+    if not raw_date:
+        return ''
+
+    import re
+    from datetime import datetime
+
+    # Strip surrounding whitespace
+    raw_date = raw_date.strip()
+
+    # Common date patterns; full year-month-day forms must come first, since re.match() takes the first hit and a year-month prefix would otherwise swallow the day
+    patterns = [
+        # YYYY.MM.DD, e.g. 2023.11.01
+        r'(\d{4})\.(\d{1,2})\.(\d{1,2})',
+        # YYYY-MM-DD, e.g. 2023-11-01
+        r'(\d{4})-(\d{1,2})-(\d{1,2})',
+        # YYYY年MM月DD日, e.g. 2023年11月1日
+        r'(\d{4})年(\d{1,2})月(\d{1,2})日',
+        # YYYY/MM/DD, e.g. 2023/11/01
+        r'(\d{4})/(\d{1,2})/(\d{1,2})',
+        # YYYY.MM, e.g. 2023.11
+        r'(\d{4})\.(\d{1,2})',
+        # YYYY-MM, e.g. 2023-11
+        r'(\d{4})-(\d{1,2})',
+        # YYYY年MM月, e.g. 2023年11月
+        r'(\d{4})年(\d{1,2})月',
+        # YYYY/MM, e.g. 2023/11
+        r'(\d{4})/(\d{1,2})',
+        # YYYYMM, e.g. 202311
+        r'(\d{4})(\d{2})'
+    ]
+
+    for pattern in patterns:
+        match = re.match(pattern, raw_date)
+        if match:
+            groups = match.groups()
+            year = groups[0]
+            month = groups[1].zfill(2)  # pad to two digits
+
+            if len(groups) == 2:
+                # Year and month only; default the day to 01
+                day = '01'
+            else:
+                # Full year, month and day
+                day = groups[2].zfill(2)  # pad to two digits
+
+            # Validate that this is a real calendar date
+            try:
+                datetime(int(year), int(month), int(day))
+                return f"{year}-{month}-{day}"
+            except ValueError:
+                continue
+
+    # No pattern matched; log a warning and return the raw value unchanged
+    logging.warning(f"无法解析日期格式: {raw_date}")
+    return raw_date
+
+
 def standardize_career_entry(entry):
     """
     Standardize the format of a career_path entry
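
A quick sanity check of the new date normalizer (illustrative only; assumes the function above is importable in a REPL):

    >>> format_date_to_yyyy_mm_dd('2023年11月')
    '2023-11-01'
    >>> format_date_to_yyyy_mm_dd('2023.11.05')
    '2023-11-05'
    >>> format_date_to_yyyy_mm_dd('202311')
    '2023-11-01'
    >>> format_date_to_yyyy_mm_dd('Nov 2023')  # no pattern matches: logged, returned as-is
    'Nov 2023'
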
@@ -84,12 +151,12 @@ def standardize_career_entry(entry):
     }


-def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
+def parse_resume_with_qwen(file_path: str) -> Dict[str, Any]:
     """
-    Parse resume text with the Alibaba Cloud Qwen LLM
+    Parse a resume PDF document with the Alibaba Cloud Qwen LLM

     Args:
-        resume_text (str): the resume text content
+        file_path (str): local PDF path or MinIO URL

     Returns:
         Dict[str, Any]: the parse result
@@ -104,78 +171,123 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
         base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
     )

-    # Build the specialized resume-parsing prompt (modeled on the business-card parsing format)
-    prompt = """你是企业简历的信息提取专家。请仔细分析提供的简历文本内容,精确提取名片相关信息。
-
-## 提取要求
-- 区分中英文内容,分别提取
-- 保持提取信息的原始格式(如大小写、标点)
-- 对于无法识别或简历中不存在的信息,返回空字符串
-- 简历中没有的信息,请不要猜测
-
-## 需提取的字段
-1. 中文姓名 (name_zh)
-2. 英文姓名 (name_en)
-3. 中文职位/头衔 (title_zh)
-4. 英文职位/头衔 (title_en)
-5. 中文酒店/公司名称 (hotel_zh)
-6. 英文酒店/公司名称 (hotel_en)
-7. 手机号码 (mobile) - 如有多个手机号码,使用逗号分隔,最多提取3个
-8. 固定电话 (phone) - 如有多个,只提取一个
-9. 电子邮箱 (email)
-10. 中文地址 (address_zh)
-11. 英文地址 (address_en)
-12. 中文邮政编码 (postal_code_zh)
-13. 英文邮政编码 (postal_code_en)
-14. 生日 (birthday) - 格式为YYYY-MM-DD,如1990-01-01
-15. 年龄 (age) - 数字格式,如30
-16. 籍贯 (native_place) - 出生地或户籍所在地信息
-17. 居住地 (residence) - 个人居住地址信息
-18. 品牌组合 (brand_group) - 如有多个品牌,使用逗号分隔
-19. 职业轨迹 (career_path) - 从简历中推断,以JSON数组格式返回,包含日期,公司名称和担任职务。
-20. 隶属关系 (affiliation) - 如能从简历中推断,以JSON数组格式返回,包含公司名称和隶属集团名称
-
-## 输出格式
-请以严格的JSON格式返回结果,不要添加任何额外解释文字。JSON格式如下:
-```json
+    # Build the specialized resume-parsing prompt
+    prompt = """你是企业简历的信息提取专家。请仔细分析提供的简历文本,精确提取关键信息。
+请从上传的简历文本中提取以下结构化信息,严格按JSON格式输出:
 {
-    "name_zh": "",
-    "name_en": "",
-    "title_zh": "",
-    "title_en": "",
-    "hotel_zh": "",
-    "hotel_en": "",
-    "mobile": "",
-    "phone": "",
-    "email": "",
-    "address_zh": "",
-    "address_en": "",
-    "postal_code_zh": "",
-    "postal_code_en": "",
-    "birthday": "",
-    "age": "",
-    "native_place": "",
-    "residence": "",
-    "brand_group": "",
-    "career_path": [],
-    "affiliation": []
+    "basic_info": {
+        "中文姓名": "",
+        "英文姓名": "",
+        "中文头衔": "",
+        "英文头衔": "",
+        "中文酒店": "",
+        "英文酒店": "",
+        "手机号": "",
+        "邮箱": "",
+        "中文工作地址": "",
+        "英文工作地址": "",
+        "生日": "",
+        "年龄": "",
+        "籍贯": "",
+        "居住地": "",
+        "品牌": "",
+        "隶属关系": "",
+        "品牌组合": ""
+    },
+    "work_experience": [
+        {
+            "任职时间": "",
+            "中文酒店": "",
+            "英文酒店": "",
+            "中文职位": "",
+            "英文职位": ""
+        }
+    ]
 }
-```
-
-以下是需要分析的简历文本内容:
-
-""" + resume_text
+提取要求:
+1. 中文优先,双语内容保留中文和英文
+2. 工作经历按倒序排列,只需要提取开始时间作为任职时间
+3. basic_info中的酒店和头衔按工作经历中最近一段经历的酒店和头衔填写。
+4. 其他信息忽略,不需要写入JSON。
+5. 如果简历中没有工作经历,则不提取工作经历。
+"""
+
+    # Prepare the file and extract its text
+    file_name = None
+    resume_text = ""
+
+    # Check whether the path is a MinIO URL
+    if file_path.startswith('http://') or file_path.startswith('https://'):
+        # Handle a MinIO URL
+        from urllib.parse import urlparse
+
+        # Get a MinIO client
+        minio_client = get_minio_client()
+        if not minio_client:
+            raise Exception('无法连接到MinIO服务器')
+
+        # Extract the object key (drop the bucket segment from the URL path)
+        parsed_url = urlparse(file_path)
+        path_parts = parsed_url.path.strip('/').split('/', 1)
+        if len(path_parts) < 2:
+            raise Exception(f'无效的MinIO URL格式: {file_path}')
+
+        object_key = path_parts[1]  # skip the bucket name
+        file_name = os.path.basename(parsed_url.path)
+
+        # Fetch the PDF from MinIO and extract its text
+        try:
+            response = minio_client.get_object(Bucket=minio_bucket, Key=object_key)
+            pdf_data = response['Body'].read()
+
+            # Extract the PDF text with PyPDF2
+            from io import BytesIO
+            pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_data))
+            for page_num, page in enumerate(pdf_reader.pages):
+                try:
+                    page_text = page.extract_text()
+                    if page_text:
+                        resume_text += f"\n=== 第{page_num + 1}页 ===\n{page_text}\n"
+                except Exception as e:
+                    logging.warning(f"提取第{page_num + 1}页文本失败: {str(e)}")
+                    continue
+
+        except Exception as minio_error:
+            raise Exception(f'从MinIO获取PDF失败: {str(minio_error)}')
+    else:
+        # Handle a local file path
+        if not os.path.exists(file_path):
+            raise Exception(f'文件不存在: {file_path}')
+
+        file_name = os.path.basename(file_path)
+
+        # Extract the PDF text with PyPDF2
+        with open(file_path, 'rb') as file:
+            pdf_reader = PyPDF2.PdfReader(file)
+            for page_num, page in enumerate(pdf_reader.pages):
+                try:
+                    page_text = page.extract_text()
+                    if page_text:
+                        resume_text += f"\n=== 第{page_num + 1}页 ===\n{page_text}\n"
+                except Exception as e:
+                    logging.warning(f"提取第{page_num + 1}页文本失败: {str(e)}")
+                    continue

-    # Call the Qwen API
-    logging.info("发送简历文本请求到 Qwen 模型")
+    # Make sure extraction actually produced usable text
+    if not resume_text or len(resume_text.strip()) < 50:
+        raise Exception('PDF文本提取失败,可能是扫描版PDF或文本质量较差')
+
+    # Assemble the full prompt
+    full_prompt = prompt + "\n\n以下是需要分析的简历文本内容:\n\n" + resume_text
+
+    # Call the Qwen API with the text content
+    logging.info(f"发送PDF文本内容到 Qwen 模型进行解析: {file_name}")
     completion = client.chat.completions.create(
-        model="qwen-plus-latest",
+        model="qwen-long-latest",  # long-context model suited to full resume text
         messages=[
             {
                 "role": "user",
-                "content": [
-                    {"type": "text", "text": prompt}
-                ]
+                "content": full_prompt
             }
         ],
         temperature=0.1,  # lower temperature for precision
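
For reference, a minimal instance of the JSON the call above requests (field values invented; a real response carries every key from the schema in the prompt, with empty strings where the resume has nothing):

    sample_qwen_response = {
        "basic_info": {
            "中文姓名": "张三",
            "英文姓名": "San Zhang",
            "中文头衔": "总经理",
            "中文酒店": "上海示例大酒店",
            "手机号": "13800000000",
            "生日": "1985-06-01"
        },
        "work_experience": [
            {"任职时间": "2021年3月", "中文酒店": "上海示例大酒店", "中文职位": "总经理"},
            {"任职时间": "2016.07", "中文酒店": "北京示例饭店", "中文职位": "副总经理"}
        ]
    }
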
@@ -186,16 +298,94 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
     response_content = completion.choices[0].message.content
     logging.info(f"成功从 Qwen 模型获取简历解析响应")

-    # Parse the JSON response returned by QWen directly
+    # Parse the JSON response returned by Qwen directly
     try:
-        parsed_resume = json.loads(response_content)
+        qwen_response = json.loads(response_content)
         logging.info("成功解析 Qwen 简历响应中的 JSON")
     except json.JSONDecodeError as e:
         error_msg = f"JSON 解析失败: {str(e)}"
         logging.error(error_msg)
         raise Exception(error_msg)

-    # Make sure all required fields exist (kept consistent with business-card parsing)
+    # Extract and map the fields from the new response format
+    parsed_resume = {}
+
+    # Pull the fields out of basic_info
+    basic_info = qwen_response.get('basic_info', {})
+
+    # Map the Chinese field names to English ones
+    field_mapping = {
+        '中文姓名': 'name_zh',
+        '英文姓名': 'name_en',
+        '中文头衔': 'title_zh',
+        '英文头衔': 'title_en',
+        '中文酒店': 'hotel_zh',
+        '英文酒店': 'hotel_en',
+        '手机号': 'mobile',
+        '邮箱': 'email',
+        '中文工作地址': 'address_zh',
+        '英文工作地址': 'address_en',
+        '生日': 'birthday',
+        '年龄': 'age',
+        '籍贯': 'native_place',
+        '居住地': 'residence',
+        '品牌': 'brand',
+        '隶属关系': 'affiliation',
+        '品牌组合': 'brand_group'
+    }
+
+    # Apply the field mapping
+    for chinese_field, english_field in field_mapping.items():
+        value = basic_info.get(chinese_field, '')
+        if value:
+            # Special case for age: keep only the digits
+            if english_field == 'age':
+                import re
+                age_match = re.search(r'(\d+)', str(value))
+                if age_match:
+                    parsed_resume[english_field] = age_match.group(1)
+                else:
+                    parsed_resume[english_field] = ''
+            else:
+                parsed_resume[english_field] = value
+        else:
+            # Defaults for missing values
+            if english_field in ['career_path', 'affiliation']:
+                parsed_resume[english_field] = []
+            elif english_field == 'age':
+                parsed_resume[english_field] = ''
+            else:
+                parsed_resume[english_field] = ""
+
+    # Map work_experience onto career_path
+    work_experience = qwen_response.get('work_experience', [])
+    if work_experience and isinstance(work_experience, list):
+        career_path = []
+        for work_item in work_experience:
+            if isinstance(work_item, dict):
+                # Normalize the start date to YYYY-MM-DD
+                raw_date = work_item.get('任职时间', '')
+                formatted_date = format_date_to_yyyy_mm_dd(raw_date)
+
+                career_entry = {
+                    "date": formatted_date,
+                    "hotel_en": work_item.get('英文酒店', ''),
+                    "hotel_zh": work_item.get('中文酒店', ''),
+                    "image_path": '',
+                    "source": 'resume_extraction',
+                    "title_en": work_item.get('英文职位', ''),
+                    "title_zh": work_item.get('中文职位', '')
+                }
+                career_entry = standardize_career_entry(career_entry)
+                career_path.append(career_entry)
+
+        parsed_resume['career_path'] = career_path
+        logging.info(f"成功映射 {len(career_path)} 条工作经历到 career_path")
+    else:
+        parsed_resume['career_path'] = []
+        logging.info("未找到工作经历信息,career_path设为空数组")
+
+    # Make sure every required field exists
     required_fields = [
         'name_zh', 'name_en', 'title_zh', 'title_en',
         'hotel_zh', 'hotel_en', 'mobile', 'phone',
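
Given the sample response sketched earlier, the work_experience mapping above yields entries along these lines (shown before standardize_career_entry applies its normalization, which is defined earlier in this module):

    {
        "date": "2021-03-01",  # format_date_to_yyyy_mm_dd('2021年3月')
        "hotel_en": "",
        "hotel_zh": "上海示例大酒店",
        "image_path": "",
        "source": "resume_extraction",
        "title_en": "",
        "title_zh": "总经理"
    }
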
@@ -213,40 +403,14 @@ def parse_resume_with_qwen(resume_text: str) -> Dict[str, Any]:
         else:
             parsed_resume[field] = ""

-    # Handle the career_path field with uniform formatting
-
-    # Handle the career_path field
-    career_path = parsed_resume.get('career_path')
-
-    # If career_path is empty or not a list, build one record from the extracted info
-    if not career_path or not isinstance(career_path, list):
-        if parsed_resume.get('hotel_zh') or parsed_resume.get('hotel_en') or parsed_resume.get('title_zh') or parsed_resume.get('title_en'):
-            # Build a record from the extracted info
-            new_entry = {
-                "date": datetime.now().strftime('%Y-%m-%d'),
-                "hotel_en": parsed_resume.get('hotel_en', ''),
-                "hotel_zh": parsed_resume.get('hotel_zh', ''),
-                "image_path": '',
-                "source": 'resume_extraction',
-                "title_en": parsed_resume.get('title_en', ''),
-                "title_zh": parsed_resume.get('title_zh', '')
-            }
-            career_entry = standardize_career_entry(new_entry)
-            parsed_resume['career_path'] = [career_entry]
-            logging.info(f"为简历解析结果创建了career_path记录: {career_entry}")
+    # Handle the affiliation field (when basic_info supplied one)
+    if parsed_resume.get('affiliation') and not isinstance(parsed_resume['affiliation'], list):
+        # A plain affiliation string becomes the list-of-dict form
+        affiliation_str = parsed_resume['affiliation']
+        if affiliation_str:
+            parsed_resume['affiliation'] = [{"company": "", "group": affiliation_str}]
         else:
-            parsed_resume['career_path'] = []
-            logging.info("简历中未提取到职业信息,career_path设为空数组")
-    else:
-        # career_path is already a list: standardize each entry in turn
-        standardized_entries = []
-        for i, entry in enumerate(career_path):
-            standardized_entry = standardize_career_entry(entry)
-            standardized_entries.append(standardized_entry)
-            logging.debug(f"标准化第 {i+1} 个career_path条目: {standardized_entry}")
-
-        parsed_resume['career_path'] = standardized_entries
-        logging.info(f"标准化了 {len(standardized_entries)} 个career_path条目")
+            parsed_resume['affiliation'] = []

     # Add an affiliation record (when company info was extracted)
     if parsed_resume.get('hotel_zh') or parsed_resume.get('hotel_en'):
@@ -329,50 +493,15 @@ def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
             'data': None
         }

-    # Step 1: extract the PDF text
-    logging.info("开始提取PDF文本内容")
-    text_extract_result = extract_resume_text(file_path)
-
-    if not text_extract_result['success']:
-        return {
-            'success': False,
-            'error': f"PDF文本提取失败: {text_extract_result.get('error', '未知错误')}",
-            'data': None
-        }
-
-    resume_text = text_extract_result['text_content']
-    page_count = text_extract_result['page_count']
-
-    if not resume_text or len(resume_text.strip()) < 50:
-        return {
-            'success': False,
-            'error': '提取的简历文本内容过少,可能是扫描版PDF或文本质量较差',
-            'data': None
-        }
-
-    logging.info(f"成功提取PDF文本,共{page_count}页,文本长度: {len(resume_text)}字符")
-
-    # Step 2: parse the resume with the Qwen LLM
-    logging.info("开始使用千问大模型解析简历信息")
-    try:
-        parsed_data = parse_resume_with_qwen(resume_text)
-        logging.info("千问大模型解析完成")
-    except Exception as e:
-        return {
-            'success': False,
-            'error': f"大模型解析失败: {str(e)}",
-            'data': None
-        }
-
-    # Step 3: build the full parse result
-    # Get the file size
+    # Step 1: gather basic file information
+    logging.info("开始获取文件基本信息")
+    page_count = 0
     file_size = 0
+
     try:
         if file_path.startswith('http://') or file_path.startswith('https://'):
-            # For a MinIO URL, take the file size from the extract_resume_text result
-            # or fetch it again (a simplified approach is used here)
+            # For a MinIO URL, look up the object metadata
             from urllib.parse import urlparse
-
             minio_client = get_minio_client()
             if minio_client:
                 parsed_url = urlparse(file_path)
@@ -385,10 +514,24 @@ def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
                 except Exception:
                     file_size = 0
         else:
+            # Local file
             file_size = os.path.getsize(file_path)
     except Exception:
         file_size = 0

+    # Step 2: parse the PDF directly with the Qwen LLM
+    logging.info("开始使用千问大模型解析PDF文档")
+    try:
+        parsed_data = parse_resume_with_qwen(file_path)
+        logging.info("千问大模型解析完成")
+    except Exception as e:
+        return {
+            'success': False,
+            'error': f"大模型解析失败: {str(e)}",
+            'data': None
+        }
+
+    # Step 3: build the full parse result
     parse_result = {
         **parsed_data,  # everything returned by the Qwen parse
         'parse_time': datetime.now().isoformat(),
@@ -397,10 +540,10 @@ def parse_resume_file(file_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
             'file_size': file_size,
             'file_type': 'pdf',
             'page_count': page_count,
-            'text_length': len(resume_text)
+            'text_length': 0  # text is extracted inside parse_resume_with_qwen; its length is not surfaced here
         },
         'extraction_info': {
-            'extraction_method': 'PyPDF2 + Qwen-Plus',
+            'extraction_method': 'PyPDF2 + Qwen-Long-Latest',
             'text_extract_success': True,
             'ai_parse_success': True,
             'task_id': task_id
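
Taken together, an end-to-end call might look like the sketch below (illustrative: it assumes the module's MinIO and DashScope configuration is in place, uses a hypothetical object URL, and assumes the elided success path returns {'success': True, 'data': parse_result} as the error paths suggest):

    result = parse_resume_file('http://minio.example.com:9000/resumes/candidate.pdf',
                               task_id='demo-001')
    if result.get('success'):
        data = result['data']
        print(data['name_zh'], data['birthday'])
        for step in data['career_path']:
            print(step['date'], step['hotel_zh'], step['title_zh'])
    else:
        print('parse failed:', result.get('error'))
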