123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 测试QWen模型提取人员信息功能
- """
- import os
- import sys
- import json
- import logging
- from datetime import datetime
- # 添加项目根目录到路径
- sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
- # 导入需要测试的函数
- from app.core.data_parse.parse_web import process_webpage_with_QWen
# Configure root logging: INFO and above is sent both to the console and to
# a UTF-8 encoded log file named test_qwen_extraction.log.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('test_qwen_extraction.log', encoding='utf-8'),
    ],
)
def read_markdown_file(file_path):
    """Read a Markdown file and return its text.

    Args:
        file_path: Path of the file to read; the content is decoded as UTF-8.

    Returns:
        The file content as a string, or ``None`` when the file cannot be
        opened or decoded. The failure is logged at ERROR level.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, ValueError) as e:
        # OSError covers missing or unreadable files; ValueError covers
        # UnicodeDecodeError and malformed paths. Other exception types
        # indicate a programming error and are allowed to propagate.
        logging.error("读取文件 %s 失败: %s", file_path, e)
        return None
def save_result_to_file(result, filename):
    """Write *result* to *filename* as pretty-printed UTF-8 JSON.

    Args:
        result: JSON-serializable object to store.
        filename: Destination path; an existing file is overwritten.

    Returns:
        None. Success is logged at INFO level; serialization or I/O failures
        are logged at ERROR level and not raised.
    """
    try:
        # Serialize before opening the file so that a non-serializable
        # result cannot truncate an existing file or leave partial JSON.
        payload = json.dumps(result, ensure_ascii=False, indent=2)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)
        logging.info("结果已保存到: %s", filename)
    except (OSError, TypeError, ValueError) as e:
        # TypeError/ValueError come from json.dumps (unsupported types,
        # circular references); OSError comes from opening or writing.
        logging.error("保存结果到文件失败: %s", e)
def analyze_extraction_quality(result, expected_count=None, test_name=""):
    """Print a quality report for a list of extracted person records.

    Args:
        result: Value returned by the extraction call; a non-empty list with
            one dict per person is expected.
        expected_count: Number of people expected, or ``None`` to skip the
            count comparison.
        test_name: Label shown in the report header.

    Returns:
        ``False`` when *result* is falsy or not a list, ``True`` otherwise.
        A count mismatch, malformed entries or missing fields are only
        reported; they do not change the return value.
    """
    print(f"\n=== {test_name} 提取结果分析 ===")

    if not result:
        print("❌ 提取失败,返回空结果")
        return False

    if not isinstance(result, list):
        print("❌ 返回结果不是数组格式")
        return False

    print(f"✅ 成功提取 {len(result)} 个人员信息")

    # Compare against None so an explicit expected_count of 0 is still checked.
    if expected_count is not None and len(result) != expected_count:
        print(f"⚠️ 警告:期望提取 {expected_count} 个人员,实际提取 {len(result)} 个")

    # Fields every person record is expected to carry.
    required_fields = [
        'name_zh', 'name_en', 'title_zh', 'title_en',
        'hotel_zh', 'hotel_en', 'brand_group', 'career_path', 'pic_url'
    ]
    # (label, key) pairs printed verbatim for each person.
    display_fields = (
        ('中文姓名', 'name_zh'),
        ('英文姓名', 'name_en'),
        ('中文职位', 'title_zh'),
        ('英文职位', 'title_en'),
        ('中文酒店', 'hotel_zh'),
        ('英文酒店', 'hotel_en'),
        ('品牌组合', 'brand_group'),
    )

    for i, person in enumerate(result, 1):
        print(f"\n第 {i} 个人员:")

        # A non-dict entry has no fields to inspect; report it and move on
        # instead of failing on .get().
        if not isinstance(person, dict):
            print(f" ❌ 记录格式错误: 期望字典, 实际为 {type(person).__name__}")
            continue

        for label, key in display_fields:
            print(f" {label}: {person.get(key, '未提取')}")

        # Only string URLs are truncated; a null or non-string value is
        # printed as-is rather than raising on len()/slicing.
        pic_url = person.get('pic_url', '未提取')
        if isinstance(pic_url, str) and len(pic_url) > 80:
            pic_url = f"{pic_url[:80]}..."
        print(f" 照片链接: {pic_url}")

        # career_path may be present but null; count that as zero entries.
        career_path = person.get('career_path')
        career_count = len(career_path) if hasattr(career_path, '__len__') else 0
        print(f" 职业轨迹: {career_count} 条记录")

        # Field presence check (a key with an empty value still counts).
        missing_fields = [field for field in required_fields if field not in person]
        if missing_fields:
            print(f" ❌ 缺失字段: {missing_fields}")
        else:
            print(" ✅ 所有字段都存在")

        # Count fields holding something other than '', None or [].
        non_empty_fields = [
            field for field in required_fields
            if person.get(field) not in ('', None, [])
        ]
        print(f" 📊 非空字段: {len(non_empty_fields)}/{len(required_fields)}")

    return True
def test_single_person_extraction():
    """Run the extraction on the single-appointment sample and report on it.

    Reads the sample Markdown file, passes its text and the string
    "2025-01-15" to ``process_webpage_with_QWen``, saves the returned value
    as JSON and prints a quality analysis that expects one person. If the
    file content is missing or empty the test is abandoned. Exceptions
    raised by the extraction, save or analysis steps are logged and printed
    rather than propagated.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("测试单人任命信息提取")
    print(banner)

    # Load the single-appointment sample; bail out when nothing was read.
    markdown_text = read_markdown_file('新任命单人-markdown格式.md')
    if not markdown_text:
        print("❌ 无法读取单人任命文件")
        return

    try:
        extracted = process_webpage_with_QWen(markdown_text, "2025-01-15")
        save_result_to_file(extracted, 'test_result_single_person.json')
        analyze_extraction_quality(extracted, expected_count=1, test_name="单人任命")
    except Exception as exc:
        failure = f"单人提取测试失败: {exc}"
        logging.error(failure)
        print(f"❌ {failure}")
def test_multiple_person_extraction():
    """Run the extraction on the multi-appointment sample and report on it.

    Reads the sample Markdown file, passes its text and the string
    "2025-01-15" to ``process_webpage_with_QWen``, prints the type and
    length of the returned value for debugging, saves it as JSON and prints
    a quality analysis that expects 16 people. If the file content is
    missing or empty the test is abandoned. Exceptions raised by the
    extraction, save or analysis steps are logged and printed rather than
    propagated.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("测试多人任命信息提取")
    print(banner)

    # Load the multi-appointment sample; bail out when nothing was read.
    markdown_text = read_markdown_file('新任命多人-markdown格式.md')
    if not markdown_text:
        print("❌ 无法读取多人任命文件")
        return

    try:
        extracted = process_webpage_with_QWen(markdown_text, "2025-01-15")

        # Debug output: what kind of value came back and how many items.
        extracted_len = len(extracted) if isinstance(extracted, list) else 'N/A'
        print(f"🔍 调试信息: 函数返回结果类型: {type(extracted)}")
        print(f"🔍 调试信息: 函数返回结果长度: {extracted_len}")

        save_result_to_file(extracted, 'test_result_multiple_person.json')

        # The original author notes the sample file should yield 16 people.
        analyze_extraction_quality(extracted, expected_count=16, test_name="多人任命")
    except Exception as exc:
        failure = f"多人提取测试失败: {exc}"
        logging.error(failure)
        print(f"❌ {failure}")
def main():
    """Run both extraction tests and print a summary of the output files.

    Prints a start banner with the current local time, warns when the
    QWEN_API_KEY environment variable is unset or empty, runs the single-
    and multi-person tests in that order, and finally lists the result and
    log file names.
    """
    print("开始测试QWen模型人员信息提取功能")
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"测试时间: {started_at}")

    # Only a warning: the tests still run without the environment variable.
    api_key = os.environ.get('QWEN_API_KEY')
    if not api_key:
        print("⚠️ 警告:未设置 QWEN_API_KEY 环境变量,将使用代码中的默认值")

    for run_test in (test_single_person_extraction, test_multiple_person_extraction):
        run_test()

    banner = "=" * 60
    closing_lines = (
        "\n" + banner,
        "测试完成",
        banner,
        "结果文件:",
        "- test_result_single_person.json (单人提取结果)",
        "- test_result_multiple_person.json (多人提取结果)",
        "- test_qwen_extraction.log (测试日志)",
    )
    for line in closing_lines:
        print(line)


# Run the tests only when this file is executed as a script.
if __name__ == "__main__":
    main()
|