#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Source repository: mxl_citu/DataOps-platform
"""
测试QWen模型提取人员信息功能
"""

import os
import sys
import json
import logging
from datetime import datetime

# 添加项目根目录到路径
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

# 导入需要测试的函数
from app.core.data_parse.parse_web import process_webpage_with_QWen

# Configure root logging at INFO level with two handlers: a stream handler
# and a UTF-8 encoded log file created in the current working directory.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('test_qwen_extraction.log', encoding='utf-8'),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def read_markdown_file(file_path):
    """Return the whole content of a UTF-8 text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file content as a string, or ``None`` when opening or reading
        raises any exception (the error is logged, not re-raised).
    """
    text = None
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            text = handle.read()
    except Exception as err:
        # Best-effort read: callers test the result for falsiness.
        logging.error(f"读取文件 {file_path} 失败: {str(err)}")
    return text

def save_result_to_file(result, filename):
    """Serialize ``result`` as indented, non-ASCII-escaped JSON into a file.

    Args:
        result: JSON-serializable object to write.
        filename: Destination path; the file is opened for writing in UTF-8.

    Returns:
        None. Any exception raised while opening or dumping is logged and
        swallowed, so a failure never propagates to the caller.
    """
    try:
        with open(filename, mode='w', encoding='utf-8') as out:
            json.dump(result, out, indent=2, ensure_ascii=False)
        logging.info(f"结果已保存到: {filename}")
    except Exception as err:
        logging.error(f"保存结果到文件失败: {str(err)}")

def analyze_extraction_quality(result, expected_count=None, test_name=""):
    """Print a per-person quality report for an extraction result.

    Args:
        result: Value returned by the extraction call. A non-empty list with
            one dict per person is expected.
        expected_count: Number of people the input should yield. ``None``
            (the default) skips the count comparison.
        test_name: Label shown in the report header.

    Returns:
        ``False`` when ``result`` is falsy or is not a list; ``True``
        otherwise, even if individual entries are incomplete or malformed.
    """
    print(f"\n=== {test_name} 提取结果分析 ===")

    if not result:
        print("❌ 提取失败，返回空结果")
        return False

    if not isinstance(result, list):
        print("❌ 返回结果不是数组格式")
        return False

    print(f"✅ 成功提取 {len(result)} 个人员信息")

    # Compare against None so an explicit expectation of 0 is still checked.
    if expected_count is not None and len(result) != expected_count:
        print(f"⚠️  警告：期望提取 {expected_count} 个人员，实际提取 {len(result)} 个")

    # Keys every person record is expected to carry.
    required_fields = [
        'name_zh', 'name_en', 'title_zh', 'title_en',
        'hotel_zh', 'hotel_en', 'brand_group', 'career_path', 'pic_url'
    ]

    # (display label, key) pairs printed verbatim, in report order.
    simple_fields = (
        ('中文姓名', 'name_zh'),
        ('英文姓名', 'name_en'),
        ('中文职位', 'title_zh'),
        ('英文职位', 'title_en'),
        ('中文酒店', 'hotel_zh'),
        ('英文酒店', 'hotel_en'),
        ('品牌组合', 'brand_group'),
    )

    for i, person in enumerate(result, 1):
        print(f"\n第 {i} 个人员:")

        # A malformed entry (e.g. a bare string) must not abort the report.
        if not isinstance(person, dict):
            print(f"  ❌ 人员信息不是对象格式: {type(person).__name__}")
            continue

        for label, key in simple_fields:
            print(f"  {label}: {person.get(key, '未提取')}")

        # Only string URLs are truncated; a null/non-string value is shown
        # as-is instead of raising on len() or slicing.
        pic_url = person.get('pic_url', '未提取')
        if isinstance(pic_url, str) and len(pic_url) > 80:
            pic_url = f"{pic_url[:80]}..."
        print(f"  照片链接: {pic_url}")

        # A null career_path counts as zero records rather than crashing.
        career_path = person.get('career_path') or []
        career_count = len(career_path) if hasattr(career_path, '__len__') else 0
        print(f"  职业轨迹: {career_count} 条记录")

        # Field presence check (a key holding an empty value still counts).
        missing_fields = [field for field in required_fields if field not in person]
        if missing_fields:
            print(f"  ❌ 缺失字段: {missing_fields}")
        else:
            print("  ✅ 所有字段都存在")

        # Count fields that hold an actual value.
        non_empty_fields = [
            field for field in required_fields
            if person.get(field) not in ['', None, []]
        ]
        print(f"  📊 非空字段: {len(non_empty_fields)}/{len(required_fields)}")

    return True

def test_single_person_extraction():
    """Run the extraction on the single-appointment markdown sample.

    Reads the sample file from the current working directory, passes its
    content to ``process_webpage_with_QWen``, saves the returned value to
    ``test_result_single_person.json`` and prints a quality report that
    expects exactly one person. Any exception from those steps is logged
    and printed instead of propagating.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试单人任命信息提取")
    print(separator)

    markdown_text = read_markdown_file('新任命单人-markdown格式.md')
    if markdown_text:
        try:
            # Second argument is a fixed date string; presumably the article's
            # publish date -- confirm against process_webpage_with_QWen.
            extracted = process_webpage_with_QWen(markdown_text, "2025-01-15")
            save_result_to_file(extracted, 'test_result_single_person.json')
            analyze_extraction_quality(extracted, expected_count=1, test_name="单人任命")
        except Exception as err:
            logging.error(f"单人提取测试失败: {str(err)}")
            print(f"❌ 单人提取测试失败: {str(err)}")
    else:
        # Missing, unreadable or empty file: nothing to extract from.
        print("❌ 无法读取单人任命文件")

def test_multiple_person_extraction():
    """Run the extraction on the multi-appointment markdown sample.

    Reads the sample file from the current working directory, passes its
    content to ``process_webpage_with_QWen``, prints the type and length of
    the returned value, saves it to ``test_result_multiple_person.json`` and
    prints a quality report that expects 16 people. Any exception from those
    steps is logged and printed instead of propagating.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试多人任命信息提取")
    print(separator)

    markdown_text = read_markdown_file('新任命多人-markdown格式.md')
    if markdown_text:
        try:
            # Second argument is a fixed date string; presumably the article's
            # publish date -- confirm against process_webpage_with_QWen.
            extracted = process_webpage_with_QWen(markdown_text, "2025-01-15")

            # Debug output describing the shape of the returned value.
            print(f"🔍 调试信息: 函数返回结果类型: {type(extracted)}")
            if isinstance(extracted, list):
                length_text = len(extracted)
            else:
                length_text = 'N/A'
            print(f"🔍 调试信息: 函数返回结果长度: {length_text}")

            save_result_to_file(extracted, 'test_result_multiple_person.json')

            # The sample document is expected to contain 16 people.
            analyze_extraction_quality(extracted, expected_count=16, test_name="多人任命")
        except Exception as err:
            logging.error(f"多人提取测试失败: {str(err)}")
            print(f"❌ 多人提取测试失败: {str(err)}")
    else:
        # Missing, unreadable or empty file: nothing to extract from.
        print("❌ 无法读取多人任命文件")

def main():
    """Run both extraction tests and list the files they produce.

    Prints a start banner with the current local time, warns when the
    ``QWEN_API_KEY`` environment variable is unset or empty, runs the
    single-person and multi-person tests in that order, then prints the
    names of the result and log files.
    """
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("开始测试QWen模型人员信息提取功能")
    print(f"测试时间: {started_at}")

    # Warn only; the tests still run without the key.
    api_key = os.environ.get('QWEN_API_KEY')
    if not api_key:
        print("⚠️  警告：未设置 QWEN_API_KEY 环境变量，将使用代码中的默认值")

    for run_test in (test_single_person_extraction, test_multiple_person_extraction):
        run_test()

    separator = "=" * 60
    closing_lines = (
        "\n" + separator,
        "测试完成",
        separator,
        "结果文件:",
        "- test_result_single_person.json (单人提取结果)",
        "- test_result_multiple_person.json (多人提取结果)",
        "- test_qwen_extraction.log (测试日志)",
    )
    for line in closing_lines:
        print(line)

# Run the test sequence only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()