|
@@ -2,37 +2,480 @@ import os
|
|
import json
|
|
import json
|
|
import logging
|
|
import logging
|
|
import re
|
|
import re
|
|
|
|
+import uuid
|
|
|
|
+import boto3
|
|
|
|
+from botocore.config import Config
|
|
|
|
+from io import BytesIO
|
|
from datetime import datetime
|
|
from datetime import datetime
|
|
from openai import OpenAI
|
|
from openai import OpenAI
|
|
|
|
|
|
|
|
+# 导入配置和业务逻辑模块
|
|
|
|
+from app.config.config import DevelopmentConfig, ProductionConfig
|
|
|
|
+from app.core.data_parse.parse import (
|
|
|
|
+ BusinessCard, check_duplicate_business_card,
|
|
|
|
+ create_main_card_with_duplicates, update_career_path,
|
|
|
|
+ normalize_mobile_numbers
|
|
|
|
+)
|
|
|
|
+from app import db
|
|
|
|
|
|
-def extract_json_from_text(text):
|
|
|
|
|
|
# Settings are read from the production configuration by default.
config = ProductionConfig()

# MinIO connection settings taken from the configuration object.
use_ssl = config.MINIO_SECURE
minio_bucket = config.MINIO_BUCKET
minio_access_key = config.MINIO_USER
minio_secret_key = config.MINIO_PASSWORD
# The endpoint scheme follows the MINIO_SECURE flag.
minio_url = "{}://{}".format('https' if use_ssl else 'http', config.MINIO_HOST)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_minio_client():
    """Build a boto3 S3 client for the configured MinIO endpoint.

    After connecting, the existing buckets are listed and ``minio_bucket`` is
    created when it is not among them.

    Returns:
        The boto3 S3 client, or None when any step raised an exception (the
        error is logged instead of being propagated).
    """
    try:
        logging.info(f"尝试连接MinIO服务器: {minio_url}")

        client_options = Config(
            signature_version='s3v4',
            retries={'max_attempts': 3, 'mode': 'standard'},
            connect_timeout=10,
            read_timeout=30
        )
        client = boto3.client(
            's3',
            endpoint_url=minio_url,
            aws_access_key_id=minio_access_key,
            aws_secret_access_key=minio_secret_key,
            config=client_options
        )

        # Make sure the target bucket exists before handing the client out.
        listing = client.list_buckets()
        bucket_names = [entry['Name'] for entry in listing.get('Buckets', [])]
        logging.info(f"成功连接到MinIO服务器,现有存储桶: {bucket_names}")

        bucket_missing = minio_bucket not in bucket_names
        if bucket_missing:
            logging.info(f"创建存储桶: {minio_bucket}")
            client.create_bucket(Bucket=minio_bucket)

        return client
    except Exception as e:
        logging.error(f"MinIO连接错误: {str(e)}")
        return None
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def upload_md_to_minio(web_md, filename=None):
    """Upload markdown text to MinIO under the ``webpage_talent/`` prefix.

    Args:
        web_md (str): Markdown content to store; it is encoded as UTF-8.
        filename (str, optional): Object file name. When omitted, a name of
            the form ``webpage_talent_<timestamp>_<8 hex chars>.md`` is
            generated; when given without a ``.md`` suffix, the suffix is
            appended.

    Returns:
        str: The object key (``webpage_talent/<filename>``) on success, or
        None when the client could not be obtained or the upload failed.
    """
    # Local import: only needed to make non-ASCII file names metadata-safe.
    from urllib.parse import quote

    try:
        # Generate a unique file name, or normalise the supplied one.
        if not filename:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            unique_id = uuid.uuid4().hex[:8]
            filename = f"webpage_talent_{timestamp}_{unique_id}.md"
        elif not filename.endswith('.md'):
            filename += '.md'

        minio_client = get_minio_client()
        if not minio_client:
            logging.error("无法获取MinIO客户端")
            return None

        # Wrap the UTF-8 encoded text in a stream for the upload body.
        md_stream = BytesIO(web_md.encode('utf-8'))

        minio_path = f"webpage_talent/{filename}"
        logging.info(f"开始上传MD文件到MinIO: {minio_path}")

        # S3 user metadata values must be ASCII; a non-ASCII file name would
        # otherwise make the whole upload fail during parameter validation.
        # ASCII names are passed through unchanged, others are percent-encoded.
        try:
            filename.encode('ascii')
            metadata_filename = filename
        except UnicodeEncodeError:
            metadata_filename = quote(filename)

        minio_client.put_object(
            Bucket=minio_bucket,
            Key=minio_path,
            Body=md_stream,
            ContentType='text/markdown',
            Metadata={
                'original_filename': metadata_filename,
                'upload_time': datetime.now().isoformat(),
                'content_type': 'webpage_talent_md'
            }
        )

        logging.info(f"MD文件成功上传到MinIO: {minio_path}")
        return minio_path

    except Exception as e:
        # Best-effort contract: callers check for None instead of catching.
        logging.error(f"上传MD文件到MinIO失败: {str(e)}", exc_info=True)
        return None
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def add_webpage_talent(talent_list, web_md):
    """Store a web page's markdown and create/update a card per talent entry.

    The markdown is uploaded once via ``upload_md_to_minio``; every entry of
    ``talent_list`` is then tagged with an ``origin_source`` record and handed
    to ``process_single_talent_card``. A failure of one entry does not stop
    the others.

    Args:
        talent_list (list): Non-empty list of dicts in business-card format.
            Each entry needs a non-empty ``name_zh``. The caller's dicts are
            not modified; a shallow copy is enriched and processed.
        web_md (str): Non-empty markdown text of the source web page.

    Returns:
        dict: ``{'code', 'success', 'message', 'data'}``. ``code`` is 400 for
        invalid arguments, 500 when the upload fails, all entries fail or an
        unexpected error occurs, 206 when only some entries succeed and 200
        when all succeed. For 200/206 and the all-failed 500, ``data`` holds
        the counters, the success/failed record lists and ``minio_md_path``.
    """
    try:
        # Argument validation.
        if not talent_list or not isinstance(talent_list, list):
            return {
                'code': 400,
                'success': False,
                'message': 'talent_list参数必须是非空数组',
                'data': None
            }

        if not web_md or not isinstance(web_md, str):
            return {
                'code': 400,
                'success': False,
                'message': 'web_md参数必须是非空字符串',
                'data': None
            }

        # Upload the page content once; every record references this path.
        logging.info("开始上传网页内容到MinIO")
        minio_md_path = upload_md_to_minio(web_md)

        if not minio_md_path:
            return {
                'code': 500,
                'success': False,
                'message': '上传网页内容到MinIO失败',
                'data': None
            }

        results = {
            'total_count': len(talent_list),
            'success_count': 0,
            'failed_count': 0,
            'success_records': [],
            'failed_records': [],
            'minio_md_path': minio_md_path
        }

        def record_failure(position, payload, reason):
            """Append one failed entry to ``results`` and bump the counter."""
            results['failed_records'].append({
                'index': position,
                'data': payload,
                'error': reason
            })
            results['failed_count'] += 1

        for index, item in enumerate(talent_list):
            position = index + 1
            # Reported as-is if the entry fails before it is copied below.
            talent_data = item
            try:
                # Reject non-dict entries explicitly instead of relying on an
                # AttributeError from the first ``.get`` call.
                if not isinstance(item, dict):
                    error_msg = f"第{position}个记录不是有效的字典对象"
                    logging.warning(error_msg)
                    record_failure(position, item, error_msg)
                    continue

                logging.info(f"开始处理第{position}个人才记录: {item.get('name_zh', 'Unknown')}")

                # Required field check.
                if not item.get('name_zh'):
                    error_msg = f"第{position}个记录缺少name_zh字段"
                    logging.warning(error_msg)
                    record_failure(position, item, error_msg)
                    continue

                # Work on a shallow copy so the caller's dict is not mutated.
                talent_data = dict(item)
                talent_data['origin_source'] = {
                    'type': 'webpage_talent',
                    'minio_path': minio_md_path,
                    'source_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    # Keep only the head of the page as a quick reference.
                    'web_md_content': web_md[:1000]
                }

                card_result = process_single_talent_card(talent_data, minio_md_path)

                if card_result['success']:
                    results['success_records'].append({
                        'index': position,
                        'data': card_result['data'],
                        'message': card_result['message']
                    })
                    results['success_count'] += 1
                    logging.info(f"成功处理第{position}个人才记录")
                else:
                    record_failure(position, talent_data, card_result['message'])
                    logging.error(f"处理第{position}个人才记录失败: {card_result['message']}")

            except Exception as e:
                error_msg = f"处理第{position}个人才记录时发生异常: {str(e)}"
                logging.error(error_msg, exc_info=True)
                record_failure(position, talent_data, error_msg)

        # Summarise: all succeeded / partially succeeded / all failed.
        if results['success_count'] == results['total_count']:
            return {
                'code': 200,
                'success': True,
                'message': f'所有{results["total_count"]}条人才记录处理成功',
                'data': results
            }
        elif results['success_count'] > 0:
            return {
                'code': 206,  # Partial Content
                'success': True,
                'message': f'部分处理成功:{results["success_count"]}/{results["total_count"]}条记录成功',
                'data': results
            }
        else:
            return {
                'code': 500,
                'success': False,
                'message': f'所有{results["total_count"]}条人才记录处理失败',
                'data': results
            }

    except Exception as e:
        error_msg = f"add_webpage_talent函数执行失败: {str(e)}"
        logging.error(error_msg, exc_info=True)
        return {
            'code': 500,
            'success': False,
            'message': error_msg,
            'data': None
        }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
# Text attributes copied verbatim from the talent payload onto a card.
# ``name_zh`` is handled separately: it is only written for brand-new cards.
_CARD_TEXT_FIELDS = (
    'name_en', 'title_zh', 'title_en', 'phone', 'email',
    'hotel_zh', 'hotel_en', 'address_zh', 'address_en',
    'postal_code_zh', 'postal_code_en', 'brand_zh', 'brand_en',
    'affiliation_zh', 'affiliation_en', 'native_place', 'residence',
    'brand_group',
)


def _parse_birthday(raw_value):
    """Parse a ``YYYY-MM-DD`` string into a date; None if empty or malformed."""
    if not raw_value:
        return None
    try:
        return datetime.strptime(raw_value, '%Y-%m-%d').date()
    except (ValueError, TypeError):
        return None


def _parse_age(raw_value):
    """Convert a value to an int age within 1..150; None if not convertible or out of range."""
    try:
        age = int(raw_value)
    except (ValueError, TypeError):
        return None
    return age if 0 < age <= 150 else None


def _build_origin_source(talent_data, minio_md_path):
    """Build the ``origin_source`` record linking a card to its web page."""
    return {
        'type': 'webpage_talent',
        'minio_path': minio_md_path,
        'source_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'talent_data': talent_data
    }


def process_single_talent_card(talent_data, minio_md_path):
    """Create or update the business card record for one extracted talent.

    The result of ``check_duplicate_business_card`` selects one of three
    paths: update the matched card (``'update'``), create a main card plus a
    pending duplicate record (``'create_with_duplicates'``), or create a new
    card (any other action). If the duplicate check itself raises, a new card
    is created.

    Args:
        talent_data (dict): Talent fields in business-card format.
        minio_md_path (str): MinIO path of the stored page markdown.

    Returns:
        dict: ``{'success', 'message', 'data'}``. ``data`` is the card's
        ``to_dict()`` for update/create, a summary dict for the
        duplicates path, and None on failure (the session is rolled back).
    """
    try:
        # Duplicate detection; a failing check must not block record creation.
        try:
            duplicate_check = check_duplicate_business_card(talent_data)
            logging.info(f"重复记录检查结果: {duplicate_check['reason']}")
        except Exception as e:
            logging.error(f"重复记录检查失败: {str(e)}", exc_info=True)
            duplicate_check = {
                'is_duplicate': False,
                'action': 'create_new',
                'existing_card': None,
                'reason': f'重复检查失败,创建新记录: {str(e)}'
            }

        action = duplicate_check['action']

        if action == 'update':
            existing_card = duplicate_check['existing_card']

            # Overwrite plain text fields only when the payload carries them.
            for field in _CARD_TEXT_FIELDS:
                setattr(existing_card, field,
                        talent_data.get(field, getattr(existing_card, field)))

            # Mobile numbers are merged with the stored ones; an explicit
            # empty string clears the field.
            if 'mobile' in talent_data:
                new_mobile = normalize_mobile_numbers(talent_data.get('mobile', ''))
                if new_mobile:
                    from app.core.data_parse.parse import merge_mobile_numbers
                    existing_card.mobile = merge_mobile_numbers(existing_card.mobile, new_mobile)
                elif talent_data.get('mobile') == '':
                    existing_card.mobile = ''

            # A malformed birthday keeps the stored value.
            parsed_birthday = _parse_birthday(talent_data.get('birthday'))
            if parsed_birthday is not None:
                existing_card.birthday = parsed_birthday

            # NOTE(review): blank/None age clears the stored value, while an
            # unparsable or out-of-range age keeps it — confirm this matches
            # the intended nesting of the original if/else.
            if 'age' in talent_data:
                raw_age = talent_data['age']
                if raw_age is None or not str(raw_age).strip():
                    existing_card.age = None
                else:
                    parsed_age = _parse_age(raw_age)
                    if parsed_age is not None:
                        existing_card.age = parsed_age

            existing_card.updated_by = 'webpage_talent_system'
            existing_card.origin_source = _build_origin_source(talent_data, minio_md_path)

            # Refresh the career path after the field updates above.
            existing_card.career_path = update_career_path(existing_card, talent_data, image_path='')

            db.session.commit()

            logging.info(f"已更新现有名片记录,ID: {existing_card.id}")

            return {
                'success': True,
                'message': f'名片信息已更新。{duplicate_check["reason"]}',
                'data': existing_card.to_dict()
            }

        elif action == 'create_with_duplicates':
            # New main card plus a record of the suspected duplicates.
            main_card, duplicate_record = create_main_card_with_duplicates(
                talent_data,
                None,  # web extraction has no image path
                duplicate_check['suspected_duplicates'],
                duplicate_check['reason']
            )

            main_card.updated_by = 'webpage_talent_system'
            main_card.origin_source = _build_origin_source(talent_data, minio_md_path)
            db.session.commit()

            return {
                'success': True,
                'message': f'创建新记录成功,发现疑似重复记录待处理。{duplicate_check["reason"]}',
                'data': {
                    'main_card': main_card.to_dict(),
                    'duplicate_record_id': duplicate_record.id,
                    'suspected_duplicates_count': len(duplicate_check['suspected_duplicates']),
                    'processing_status': 'pending',
                    'duplicate_reason': duplicate_record.duplicate_reason,
                    'created_at': duplicate_record.created_at.strftime('%Y-%m-%d %H:%M:%S')
                }
            }

        else:
            # Brand-new card with a single initial career path entry.
            initial_entry = {
                'date': datetime.now().strftime('%Y-%m-%d'),
                'hotel_zh': talent_data.get('hotel_zh', ''),
                'hotel_en': talent_data.get('hotel_en', ''),
                'title_zh': talent_data.get('title_zh', ''),
                'title_en': talent_data.get('title_en', ''),
                'image_path': '',  # web extraction has no image path
                'source': 'webpage_extraction'
            }

            text_values = {
                field: talent_data.get(field, '')
                for field in ('name_zh',) + _CARD_TEXT_FIELDS
            }

            business_card = BusinessCard(
                mobile=normalize_mobile_numbers(talent_data.get('mobile', '')),
                # A malformed birthday or age is stored as None instead of
                # aborting the whole record, matching the update path.
                birthday=_parse_birthday(talent_data.get('birthday')),
                age=_parse_age(talent_data.get('age')),
                image_path=None,  # web extraction has no image path
                career_path=[initial_entry],
                origin_source=_build_origin_source(talent_data, minio_md_path),
                status='active',
                updated_by='webpage_talent_system',
                **text_values
            )

            db.session.add(business_card)
            db.session.commit()

            logging.info(f"名片信息已保存到数据库,ID: {business_card.id}")

            return {
                'success': True,
                'message': f'名片信息保存成功。{duplicate_check["reason"]}',
                'data': business_card.to_dict()
            }

    except Exception as e:
        # Undo any partial changes before reporting the failure.
        db.session.rollback()
        error_msg = f"处理单个人才名片记录失败: {str(e)}"
        logging.error(error_msg, exc_info=True)

        return {
            'success': False,
            'message': error_msg,
            'data': None
        }
|
|
|
|
|
|
|
|
|
|
def process_webpage_with_QWen(markdown_text, publish_time):
|
|
def process_webpage_with_QWen(markdown_text, publish_time):
|
|
@@ -208,4 +651,5 @@ def process_webpage_with_QWen(markdown_text, publish_time):
|
|
except Exception as e:
|
|
except Exception as e:
|
|
error_msg = f"Qwen VL Max 模型网页文本解析失败: {str(e)}"
|
|
error_msg = f"Qwen VL Max 模型网页文本解析失败: {str(e)}"
|
|
logging.error(error_msg, exc_info=True)
|
|
logging.error(error_msg, exc_info=True)
|
|
- raise Exception(error_msg)
|
|
|
|
|
|
+ raise Exception(error_msg)
|
|
|
|
+
|