test_webpage_parse.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 测试网页解析功能
  5. """
  6. import json
  7. import sys
  8. import os
  9. # 添加项目根目录到Python路径
  10. sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  11. from app.core.data_parse.parse_web import process_webpage_with_QWen
  12. def test_single_person():
  13. """测试单人任命信息提取"""
  14. print("=" * 50)
  15. print("测试单人任命信息提取")
  16. print("=" * 50)
  17. try:
  18. # 读取单人任命文件
  19. with open('新任命单人-markdown格式.md', 'r', encoding='utf-8') as f:
  20. single_person_content = f.read()
  21. print("开始解析单人任命信息...")
  22. result = process_webpage_with_QWen(single_person_content, "2025-01-15")
  23. print("解析结果:")
  24. print(json.dumps(result, ensure_ascii=False, indent=2))
  25. # 验证结果
  26. if isinstance(result, list) and len(result) > 0:
  27. person = result[0]
  28. print(f"\n提取的人员信息:")
  29. print(f"中文姓名: {person.get('name_zh', 'N/A')}")
  30. print(f"英文姓名: {person.get('name_en', 'N/A')}")
  31. print(f"中文职位: {person.get('title_zh', 'N/A')}")
  32. print(f"英文职位: {person.get('title_en', 'N/A')}")
  33. print(f"中文酒店: {person.get('hotel_zh', 'N/A')}")
  34. print(f"英文酒店: {person.get('hotel_en', 'N/A')}")
  35. print(f"品牌组合: {person.get('brand_group', 'N/A')}")
  36. print(f"照片链接: {person.get('pic_url', 'N/A')}")
  37. print(f"职业轨迹: {len(person.get('career_path', []))} 条记录")
  38. else:
  39. print("❌ 解析失败:返回结果格式不正确")
  40. except Exception as e:
  41. print(f"❌ 单人测试失败: {str(e)}")
  42. def test_multiple_persons():
  43. """测试多人任命信息提取"""
  44. print("\n" + "=" * 50)
  45. print("测试多人任命信息提取")
  46. print("=" * 50)
  47. try:
  48. # 读取多人任命文件
  49. with open('新任命多人-markdown格式.md', 'r', encoding='utf-8') as f:
  50. multiple_persons_content = f.read()
  51. print("开始解析多人任命信息...")
  52. result = process_webpage_with_QWen(multiple_persons_content, "2025-01-15")
  53. print("解析结果:")
  54. print(json.dumps(result, ensure_ascii=False, indent=2))
  55. # 验证结果
  56. if isinstance(result, list):
  57. print(f"\n✅ 成功提取 {len(result)} 个人员信息:")
  58. for i, person in enumerate(result, 1):
  59. print(f"\n第 {i} 个人员:")
  60. print(f" 中文姓名: {person.get('name_zh', 'N/A')}")
  61. print(f" 英文姓名: {person.get('name_en', 'N/A')}")
  62. print(f" 中文职位: {person.get('title_zh', 'N/A')}")
  63. print(f" 英文职位: {person.get('title_en', 'N/A')}")
  64. print(f" 中文酒店: {person.get('hotel_zh', 'N/A')}")
  65. print(f" 英文酒店: {person.get('hotel_en', 'N/A')}")
  66. print(f" 品牌组合: {person.get('brand_group', 'N/A')}")
  67. print(f" 照片链接: {person.get('pic_url', 'N/A')}")
  68. print(f" 职业轨迹: {len(person.get('career_path', []))} 条记录")
  69. else:
  70. print("❌ 解析失败:返回结果格式不正确")
  71. except Exception as e:
  72. print(f"❌ 多人测试失败: {str(e)}")
  73. def test_field_validation():
  74. """测试字段验证"""
  75. print("\n" + "=" * 50)
  76. print("字段验证测试")
  77. print("=" * 50)
  78. # 测试简单的文本
  79. test_content = """
  80. ![照片](https://example.com/photo.jpg)
  81. **张三先生**
  82. **Mr. Zhang San**
  83. 北京万豪酒店
  84. 总经理
  85. 张三先生拥有15年酒店管理经验...
  86. """
  87. try:
  88. print("测试简单文本解析...")
  89. result = process_webpage_with_QWen(test_content, "2025-01-15")
  90. if isinstance(result, list) and len(result) > 0:
  91. person = result[0]
  92. required_fields = ['name_zh', 'name_en', 'title_zh', 'title_en',
  93. 'hotel_zh', 'hotel_en', 'brand_group', 'career_path', 'pic_url']
  94. print("字段完整性检查:")
  95. all_fields_present = True
  96. for field in required_fields:
  97. if field in person:
  98. print(f" ✅ {field}: {person[field]}")
  99. else:
  100. print(f" ❌ {field}: 缺失")
  101. all_fields_present = False
  102. if all_fields_present:
  103. print("\n✅ 所有必要字段都存在")
  104. else:
  105. print("\n❌ 部分字段缺失")
  106. else:
  107. print("❌ 解析失败")
  108. except Exception as e:
  109. print(f"❌ 字段验证测试失败: {str(e)}")
  110. if __name__ == "__main__":
  111. print("网页解析功能测试")
  112. print("确保已设置 QWEN_API_KEY 环境变量")
  113. # 检查API密钥
  114. if not os.environ.get('QWEN_API_KEY'):
  115. print("⚠️ 警告: 未设置 QWEN_API_KEY 环境变量")
  116. # 检查测试文件是否存在
  117. if not os.path.exists('新任命单人-markdown格式.md'):
  118. print("❌ 测试文件 '新任命单人-markdown格式.md' 不存在")
  119. sys.exit(1)
  120. if not os.path.exists('新任命多人-markdown格式.md'):
  121. print("❌ 测试文件 '新任命多人-markdown格式.md' 不存在")
  122. sys.exit(1)
  123. # 运行测试
  124. test_single_person()
  125. test_multiple_persons()
  126. test_field_validation()
  127. print("\n" + "=" * 50)
  128. print("测试完成")
  129. print("=" * 50)