test_qwen_extraction.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 测试QWen模型提取人员信息功能
  5. """
  6. import os
  7. import sys
  8. import json
  9. import logging
  10. from datetime import datetime
  11. # 添加项目根目录到路径
  12. sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
  13. # 导入需要测试的函数
  14. from app.core.data_parse.parse_web import process_webpage_with_QWen
  15. # 配置日志
  16. logging.basicConfig(
  17. level=logging.INFO,
  18. format='%(asctime)s - %(levelname)s - %(message)s',
  19. handlers=[
  20. logging.StreamHandler(),
  21. logging.FileHandler('test_qwen_extraction.log', encoding='utf-8')
  22. ]
  23. )
  24. def read_markdown_file(file_path):
  25. """读取markdown文件内容"""
  26. try:
  27. with open(file_path, 'r', encoding='utf-8') as f:
  28. return f.read()
  29. except Exception as e:
  30. logging.error(f"读取文件 {file_path} 失败: {str(e)}")
  31. return None
  32. def save_result_to_file(result, filename):
  33. """保存结果到JSON文件"""
  34. try:
  35. with open(filename, 'w', encoding='utf-8') as f:
  36. json.dump(result, f, ensure_ascii=False, indent=2)
  37. logging.info(f"结果已保存到: {filename}")
  38. except Exception as e:
  39. logging.error(f"保存结果到文件失败: {str(e)}")
  40. def analyze_extraction_quality(result, expected_count=None, test_name=""):
  41. """分析提取质量"""
  42. print(f"\n=== {test_name} 提取结果分析 ===")
  43. if not result:
  44. print("❌ 提取失败,返回空结果")
  45. return False
  46. if not isinstance(result, list):
  47. print("❌ 返回结果不是数组格式")
  48. return False
  49. print(f"✅ 成功提取 {len(result)} 个人员信息")
  50. if expected_count and len(result) != expected_count:
  51. print(f"⚠️ 警告:期望提取 {expected_count} 个人员,实际提取 {len(result)} 个")
  52. # 检查每个人员信息的完整性
  53. required_fields = [
  54. 'name_zh', 'name_en', 'title_zh', 'title_en',
  55. 'hotel_zh', 'hotel_en', 'brand_group', 'career_path', 'pic_url'
  56. ]
  57. for i, person in enumerate(result, 1):
  58. print(f"\n第 {i} 个人员:")
  59. print(f" 中文姓名: {person.get('name_zh', '未提取')}")
  60. print(f" 英文姓名: {person.get('name_en', '未提取')}")
  61. print(f" 中文职位: {person.get('title_zh', '未提取')}")
  62. print(f" 英文职位: {person.get('title_en', '未提取')}")
  63. print(f" 中文酒店: {person.get('hotel_zh', '未提取')}")
  64. print(f" 英文酒店: {person.get('hotel_en', '未提取')}")
  65. print(f" 品牌组合: {person.get('brand_group', '未提取')}")
  66. print(f" 照片链接: {person.get('pic_url', '未提取')[:80]}..." if len(person.get('pic_url', '')) > 80 else f" 照片链接: {person.get('pic_url', '未提取')}")
  67. print(f" 职业轨迹: {len(person.get('career_path', []))} 条记录")
  68. # 检查字段完整性
  69. missing_fields = [field for field in required_fields if field not in person]
  70. if missing_fields:
  71. print(f" ❌ 缺失字段: {missing_fields}")
  72. else:
  73. print(f" ✅ 所有字段都存在")
  74. # 显示非空字段的统计
  75. non_empty_fields = [field for field in required_fields if person.get(field) not in ['', None, []]]
  76. print(f" 📊 非空字段: {len(non_empty_fields)}/{len(required_fields)}")
  77. return True
  78. def test_single_person_extraction():
  79. """测试单人提取"""
  80. print("\n" + "="*60)
  81. print("测试单人任命信息提取")
  82. print("="*60)
  83. # 读取单人任命文件
  84. content = read_markdown_file('新任命单人-markdown格式.md')
  85. if not content:
  86. print("❌ 无法读取单人任命文件")
  87. return
  88. try:
  89. # 调用提取函数
  90. result = process_webpage_with_QWen(content, "2025-01-15")
  91. # 保存结果
  92. save_result_to_file(result, 'test_result_single_person.json')
  93. # 分析结果
  94. analyze_extraction_quality(result, expected_count=1, test_name="单人任命")
  95. except Exception as e:
  96. logging.error(f"单人提取测试失败: {str(e)}")
  97. print(f"❌ 单人提取测试失败: {str(e)}")
  98. def test_multiple_person_extraction():
  99. """测试多人提取"""
  100. print("\n" + "="*60)
  101. print("测试多人任命信息提取")
  102. print("="*60)
  103. # 读取多人任命文件
  104. content = read_markdown_file('新任命多人-markdown格式.md')
  105. if not content:
  106. print("❌ 无法读取多人任命文件")
  107. return
  108. try:
  109. # 调用提取函数
  110. result = process_webpage_with_QWen(content, "2025-01-15")
  111. # 添加调试信息
  112. print(f"🔍 调试信息: 函数返回结果类型: {type(result)}")
  113. print(f"🔍 调试信息: 函数返回结果长度: {len(result) if isinstance(result, list) else 'N/A'}")
  114. # 保存结果
  115. save_result_to_file(result, 'test_result_multiple_person.json')
  116. # 分析结果 (根据文件内容,应该有16个人员)
  117. analyze_extraction_quality(result, expected_count=16, test_name="多人任命")
  118. except Exception as e:
  119. logging.error(f"多人提取测试失败: {str(e)}")
  120. print(f"❌ 多人提取测试失败: {str(e)}")
  121. def main():
  122. """主测试函数"""
  123. print("开始测试QWen模型人员信息提取功能")
  124. print(f"测试时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  125. # 检查环境变量
  126. if not os.environ.get('QWEN_API_KEY'):
  127. print("⚠️ 警告:未设置 QWEN_API_KEY 环境变量,将使用代码中的默认值")
  128. # 测试单人提取
  129. test_single_person_extraction()
  130. # 测试多人提取
  131. test_multiple_person_extraction()
  132. print("\n" + "="*60)
  133. print("测试完成")
  134. print("="*60)
  135. print("结果文件:")
  136. print("- test_result_single_person.json (单人提取结果)")
  137. print("- test_result_multiple_person.json (多人提取结果)")
  138. print("- test_qwen_extraction.log (测试日志)")
  139. if __name__ == "__main__":
  140. main()