|
@@ -9,6 +9,7 @@ from botocore.config import Config
|
|
|
from io import BytesIO
|
|
|
from datetime import datetime
|
|
|
from openai import OpenAI
|
|
|
+from typing import Dict, Any
|
|
|
|
|
|
# 导入配置和业务逻辑模块
|
|
|
from app.config.config import DevelopmentConfig, ProductionConfig
|
|
@@ -748,6 +749,67 @@ def process_webpage_with_QWen(markdown_text, publish_time):
|
|
|
raise Exception(error_msg)
|
|
|
|
|
|
|
|
|
def _convert_webpage_to_card_format(webpage_data: Dict[str, Any], publish_time: str) -> Dict[str, Any]:
    """
    Convert a person record parsed from a web page into the standard
    business-card layout (the same layout as the ``data`` field described in
    任务解析结果.txt).

    Args:
        webpage_data (Dict[str, Any]): Raw record extracted from the web page.
        publish_time (str): Publish time; used as the ``date`` of the single
            career-path entry. When falsy, today's date (``YYYY-MM-DD``) is
            used instead.

    Returns:
        Dict[str, Any]: The normalized business-card record. Missing text
        fields default to ``''`` and a missing ``age`` defaults to ``0``.
    """
    def field(key: str, default: Any = '') -> Any:
        # Single place for the "missing key -> default" lookup rule.
        return webpage_data.get(key, default)

    hotel_zh = field('hotel_zh')
    title_zh = field('title_zh')

    # Affiliation: only present when the Chinese hotel name is known.
    affiliation = (
        [{"company": hotel_zh, "group": field('brand_group')}]
        if hotel_zh
        else []
    )

    # Career path: needs both a position and a hotel to be meaningful.
    career_path = []
    if title_zh and hotel_zh:
        entry_date = publish_time or datetime.now().strftime('%Y-%m-%d')
        career_path.append({
            "date": entry_date,
            "hotel_en": field('hotel_en'),
            "hotel_zh": hotel_zh,
            "image_path": field('pic_url'),
            "source": "webpage_talent_extraction",
            "title_en": field('title_en'),
            "title_zh": title_zh,
        })

    # Assemble the record; key order follows the reference data layout.
    return {
        "address_en": field('address_en'),
        "address_zh": field('address_zh'),
        "affiliation": affiliation,
        "age": field('age', 0),
        "birthday": field('birthday'),
        "brand_group": field('brand_group'),
        "career_path": career_path,
        "email": field('email'),
        "hotel_en": field('hotel_en'),
        "hotel_zh": hotel_zh,
        "mobile": field('mobile'),
        "name_en": field('name_en'),
        "name_zh": field('name_zh'),
        "native_place": field('native_place'),
        "phone": field('phone'),
        "postal_code_en": field('postal_code_en'),
        "postal_code_zh": field('postal_code_zh'),
        "residence": field('residence'),
        "title_en": field('title_en'),
        "title_zh": title_zh,
    }
|
|
|
+
|
|
|
+
|
|
|
def batch_process_md(markdown_file_list, publish_time):
|
|
|
"""
|
|
|
批量处理包含多个人员信息的markdown文件
|
|
@@ -757,24 +819,32 @@ def batch_process_md(markdown_file_list, publish_time):
|
|
|
publish_time (str): 发布时间,用于career_path中的date字段
|
|
|
|
|
|
Returns:
|
|
|
- dict: 批量处理结果,格式与batch_process_business_card_images保持一致
|
|
|
+ dict: 批量处理结果,格式与parse_result保持一致
|
|
|
"""
|
|
|
try:
|
|
|
# 参数验证
|
|
|
if not markdown_file_list or not isinstance(markdown_file_list, list):
|
|
|
return {
|
|
|
- 'code': 400,
|
|
|
- 'success': False,
|
|
|
- 'message': 'markdown_file_list参数必须是非空数组',
|
|
|
- 'data': None
|
|
|
+ "processed_time": datetime.now().isoformat(),
|
|
|
+ "results": [],
|
|
|
+ "summary": {
|
|
|
+ "failed_count": len(markdown_file_list) if markdown_file_list else 0,
|
|
|
+ "success_count": 0,
|
|
|
+ "success_rate": 0,
|
|
|
+ "total_files": len(markdown_file_list) if markdown_file_list else 0
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
if not publish_time or not isinstance(publish_time, str):
|
|
|
return {
|
|
|
- 'code': 400,
|
|
|
- 'success': False,
|
|
|
- 'message': 'publish_time参数必须是非空字符串',
|
|
|
- 'data': None
|
|
|
+ "processed_time": datetime.now().isoformat(),
|
|
|
+ "results": [],
|
|
|
+ "summary": {
|
|
|
+ "failed_count": len(markdown_file_list),
|
|
|
+ "success_count": 0,
|
|
|
+ "success_rate": 0,
|
|
|
+ "total_files": len(markdown_file_list)
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
logging.info(f"开始批量处理 {len(markdown_file_list)} 个markdown文件")
|
|
@@ -782,7 +852,7 @@ def batch_process_md(markdown_file_list, publish_time):
|
|
|
results = []
|
|
|
success_count = 0
|
|
|
failed_count = 0
|
|
|
- total_persons = 0
|
|
|
+ total_records = 0 # 总记录数(人员数)
|
|
|
|
|
|
# 逐个处理每个markdown文件
|
|
|
for i, minio_path in enumerate(markdown_file_list):
|
|
@@ -793,44 +863,74 @@ def batch_process_md(markdown_file_list, publish_time):
|
|
|
file_result = process_single_markdown_file(minio_path, publish_time)
|
|
|
|
|
|
if file_result.get('success', False):
|
|
|
- success_count += 1
|
|
|
- persons_count = file_result.get('data', {}).get('total_persons', 0)
|
|
|
- total_persons += persons_count
|
|
|
+ # 提取处理结果中的人员信息
|
|
|
+ persons_data = file_result.get('data', {}).get('all_results', [])
|
|
|
|
|
|
- results.append({
|
|
|
- 'index': i,
|
|
|
- 'minio_path': minio_path,
|
|
|
- 'filename': minio_path.split('/')[-1] if '/' in minio_path else minio_path,
|
|
|
- 'success': True,
|
|
|
- 'error': None,
|
|
|
- 'data': file_result.get('data'),
|
|
|
- 'message': f'处理成功,提取 {persons_count} 个人员信息'
|
|
|
- })
|
|
|
- logging.info(f"成功处理第 {i+1} 个文件: {minio_path},提取 {persons_count} 个人员信息")
|
|
|
+ if persons_data and isinstance(persons_data, list):
|
|
|
+ # 为每个人员创建一个结果记录
|
|
|
+ for person_idx, person_data in enumerate(persons_data):
|
|
|
+ total_records += 1
|
|
|
+ # 转换为标准名片格式
|
|
|
+ standardized_data = _convert_webpage_to_card_format(person_data, publish_time)
|
|
|
+
|
|
|
+ success_count += 1
|
|
|
+ results.append({
|
|
|
+ "data": standardized_data,
|
|
|
+ "error": None,
|
|
|
+ "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
|
|
|
+ "index": len(results), # 使用连续的索引
|
|
|
+ "message": "网页人才信息解析成功",
|
|
|
+ "minio_path": minio_path,
|
|
|
+ "object_key": minio_path,
|
|
|
+ "success": True
|
|
|
+ })
|
|
|
+ logging.info(f"成功提取人员 {person_idx+1}: {person_data.get('name_zh', 'Unknown')}")
|
|
|
+ else:
|
|
|
+ # 没有提取到有效数据,这算作一个失败记录
|
|
|
+ total_records += 1
|
|
|
+ failed_count += 1
|
|
|
+ results.append({
|
|
|
+ "data": None,
|
|
|
+ "error": "未从markdown文件中提取到人员信息",
|
|
|
+ "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
|
|
|
+ "index": len(results),
|
|
|
+ "message": "网页人才信息解析失败",
|
|
|
+ "minio_path": minio_path,
|
|
|
+ "object_key": minio_path,
|
|
|
+ "success": False
|
|
|
+ })
|
|
|
+ logging.warning(f"第 {i+1} 个文件未提取到人员信息")
|
|
|
else:
|
|
|
+ # 文件处理失败,算作一个失败记录
|
|
|
+ total_records += 1
|
|
|
failed_count += 1
|
|
|
error_msg = file_result.get('message', '处理失败')
|
|
|
results.append({
|
|
|
- 'index': i,
|
|
|
- 'minio_path': minio_path,
|
|
|
- 'filename': minio_path.split('/')[-1] if '/' in minio_path else minio_path,
|
|
|
- 'success': False,
|
|
|
- 'error': error_msg,
|
|
|
- 'data': None
|
|
|
+ "data": None,
|
|
|
+ "error": error_msg,
|
|
|
+ "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
|
|
|
+ "index": len(results),
|
|
|
+ "message": "网页人才信息解析失败",
|
|
|
+ "minio_path": minio_path,
|
|
|
+ "object_key": minio_path,
|
|
|
+ "success": False
|
|
|
})
|
|
|
logging.error(f"处理第 {i+1} 个文件失败: {error_msg}")
|
|
|
|
|
|
except Exception as item_error:
|
|
|
+ total_records += 1
|
|
|
failed_count += 1
|
|
|
error_msg = f"处理markdown文件失败: {str(item_error)}"
|
|
|
logging.error(error_msg, exc_info=True)
|
|
|
results.append({
|
|
|
- 'index': i,
|
|
|
- 'minio_path': minio_path,
|
|
|
- 'filename': minio_path.split('/')[-1] if '/' in minio_path else minio_path,
|
|
|
- 'success': False,
|
|
|
- 'error': error_msg,
|
|
|
- 'data': None
|
|
|
+ "data": None,
|
|
|
+ "error": error_msg,
|
|
|
+ "filename": minio_path.split('/')[-1] if '/' in minio_path else minio_path,
|
|
|
+ "index": len(results),
|
|
|
+ "message": "网页人才信息解析失败",
|
|
|
+ "minio_path": minio_path,
|
|
|
+ "object_key": minio_path,
|
|
|
+ "success": False
|
|
|
})
|
|
|
|
|
|
# 组装最终结果
|
|
@@ -839,8 +939,7 @@ def batch_process_md(markdown_file_list, publish_time):
|
|
|
'total_files': len(markdown_file_list),
|
|
|
'success_count': success_count,
|
|
|
'failed_count': failed_count,
|
|
|
- 'success_rate': round((success_count / len(markdown_file_list)) * 100, 2) if len(markdown_file_list) > 0 else 0,
|
|
|
- 'total_persons': total_persons
|
|
|
+ 'success_rate': round((success_count / len(markdown_file_list)) * 100, 2) if len(markdown_file_list) > 0 else 0
|
|
|
},
|
|
|
'results': results,
|
|
|
'processed_time': datetime.now().isoformat()
|
|
@@ -850,7 +949,7 @@ def batch_process_md(markdown_file_list, publish_time):
|
|
|
return {
|
|
|
'code': 200,
|
|
|
'success': True,
|
|
|
- 'message': f'批量处理完成,全部 {success_count} 个文件处理成功,共提取 {total_persons} 个人员信息',
|
|
|
+ 'message': f'批量处理完成,全部 {success_count} 个文件处理成功',
|
|
|
'data': batch_result
|
|
|
}
|
|
|
elif success_count == 0:
|
|
@@ -864,18 +963,30 @@ def batch_process_md(markdown_file_list, publish_time):
|
|
|
return {
|
|
|
'code': 206, # Partial Content
|
|
|
'success': True,
|
|
|
- 'message': f'批量处理部分成功,成功 {success_count} 个,失败 {failed_count} 个,共提取 {total_persons} 个人员信息',
|
|
|
+ 'message': f'批量处理部分成功,成功 {success_count} 个,失败 {failed_count} 个',
|
|
|
'data': batch_result
|
|
|
}
|
|
|
|
|
|
except Exception as e:
|
|
|
error_msg = f"batch_process_md函数执行失败: {str(e)}"
|
|
|
logging.error(error_msg, exc_info=True)
|
|
|
+
|
|
|
+ batch_result = {
|
|
|
+ 'summary': {
|
|
|
+ 'total_files': len(markdown_file_list) if markdown_file_list else 1,
|
|
|
+ 'success_count': 0,
|
|
|
+ 'failed_count': len(markdown_file_list) if markdown_file_list else 1,
|
|
|
+ 'success_rate': 0
|
|
|
+ },
|
|
|
+ 'results': [],
|
|
|
+ 'processed_time': datetime.now().isoformat()
|
|
|
+ }
|
|
|
+
|
|
|
return {
|
|
|
'code': 500,
|
|
|
'success': False,
|
|
|
'message': error_msg,
|
|
|
- 'data': None
|
|
|
+ 'data': batch_result
|
|
|
}
|
|
|
|
|
|
|
|
@@ -885,7 +996,7 @@ def get_markdown_from_minio(minio_client, minio_path):
|
|
|
|
|
|
Args:
|
|
|
minio_client: MinIO客户端
|
|
|
- minio_path (str): MinIO中的文件路径
|
|
|
+ minio_path (str): MinIO中的文件路径或完整URL
|
|
|
|
|
|
Returns:
|
|
|
str: 文件内容,如果失败返回None
|
|
@@ -893,8 +1004,16 @@ def get_markdown_from_minio(minio_client, minio_path):
|
|
|
try:
|
|
|
logging.info(f"从MinIO获取文件: {minio_path}")
|
|
|
|
|
|
+ # 如果是完整的URL,提取对象键
|
|
|
+ object_key = _extract_object_key_from_url(minio_path)
|
|
|
+ if object_key is None:
|
|
|
+ logging.error(f"无法从URL中提取有效的对象键: {minio_path}")
|
|
|
+ return None
|
|
|
+ if object_key != minio_path:
|
|
|
+ logging.info(f"从URL提取的对象键: {object_key}")
|
|
|
+
|
|
|
# 从MinIO下载文件
|
|
|
- response = minio_client.get_object(Bucket=minio_bucket, Key=minio_path)
|
|
|
+ response = minio_client.get_object(Bucket=minio_bucket, Key=object_key)
|
|
|
|
|
|
# 读取文件内容
|
|
|
content = response['Body'].read()
|
|
@@ -921,6 +1040,46 @@ def get_markdown_from_minio(minio_client, minio_path):
|
|
|
return None
|
|
|
|
|
|
|
|
|
def _extract_object_key_from_url(minio_url):
    """
    Extract the object key from a full MinIO URL.

    Inputs that do not start with an ``http://`` or ``https://`` scheme
    (matched case-insensitively) are assumed to already be object keys and
    are returned unchanged. For URLs, the first path segment is treated as
    the bucket and everything after it as the object key. Any query string
    or fragment (e.g. presigned-URL parameters) is dropped and percent-escapes
    in the key are decoded, so the result can be passed directly as a key.

    Args:
        minio_url (str): Full MinIO URL such as
            "http://host:port/bucket/path/to/file.md", or a bare object key.

    Returns:
        str | None: The object key, e.g. "path/to/file.md"; the original
        string when it carries no http(s) scheme; ``None`` when the input is
        empty / not a string, or the URL has no object part after the bucket.
    """
    # Function-scope import keeps this helper self-contained.
    from urllib.parse import unquote, urlsplit

    if not minio_url or not isinstance(minio_url, str):
        return None

    # No scheme prefix: treat the value as a relative path / object key.
    if not minio_url.lower().startswith(('http://', 'https://')):
        return minio_url

    try:
        # .path excludes the host, the query string and the fragment.
        url_path = urlsplit(minio_url).path
    except ValueError as e:
        # urlsplit rejects malformed netlocs (e.g. unbalanced IPv6 brackets).
        logging.error(f"解析MinIO URL失败: {str(e)}")
        return None

    # url_path looks like "/bucket/key/parts"; splitting yields
    # ['', 'bucket', 'key', 'parts'], so at least 3 segments are required.
    segments = url_path.split('/')
    if len(segments) < 3:
        return None

    # Skip the leading empty segment and the bucket; decode %XX escapes.
    object_key = unquote('/'.join(segments[2:]))
    return object_key or None
|
|
|
+
|
|
|
+
|
|
|
def save_section_to_minio(minio_client, section_content, original_minio_path, section_number):
|
|
|
"""
|
|
|
将分割后的markdown内容保存到MinIO
|