123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 测试网页解析功能
- """
- import json
- import sys
- import os
- # 添加项目根目录到Python路径
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
- from app.core.data_parse.parse_web import process_webpage_with_QWen
def test_single_person():
    """Run the single-appointment extraction check and print a report.

    Reads ``新任命单人-markdown格式.md`` (a path relative to the current
    working directory), passes its text to ``process_webpage_with_QWen``
    together with the string ``"2025-01-15"``, dumps the raw result as JSON
    and then prints the fields of the first extracted person.

    The result is considered valid only when it is a non-empty list.

    Every exception (missing file, parser error, unexpected result shape) is
    caught and reported on stdout, so this function never raises and always
    returns ``None``.
    """
    banner = "=" * 50
    print(banner)
    print("测试单人任命信息提取")
    print(banner)

    # (label, key) pairs printed for the extracted person, in display order.
    field_labels = (
        ("中文姓名", 'name_zh'),
        ("英文姓名", 'name_en'),
        ("中文职位", 'title_zh'),
        ("英文职位", 'title_en'),
        ("中文酒店", 'hotel_zh'),
        ("英文酒店", 'hotel_en'),
        ("品牌组合", 'brand_group'),
        ("照片链接", 'pic_url'),
    )

    try:
        # Load the single-appointment fixture.
        with open('新任命单人-markdown格式.md', 'r', encoding='utf-8') as f:
            single_person_content = f.read()

        print("开始解析单人任命信息...")
        result = process_webpage_with_QWen(single_person_content, "2025-01-15")

        print("解析结果:")
        print(json.dumps(result, ensure_ascii=False, indent=2))

        # Validate the shape of the result before touching its first element.
        if not (isinstance(result, list) and result):
            print("❌ 解析失败:返回结果格式不正确")
            return

        person = result[0]
        print("\n提取的人员信息:")
        for label, key in field_labels:
            print(f"{label}: {person.get(key, 'N/A')}")

        # The key may be present with a null value; ``.get(key, [])`` would
        # then return None and ``len`` would raise, so fall back explicitly.
        career_path = person.get('career_path') or []
        print(f"职业轨迹: {len(career_path)} 条记录")

    except Exception as e:
        # Best-effort script: report the failure instead of propagating it.
        print(f"❌ 单人测试失败: {str(e)}")
def test_multiple_persons():
    """Run the multi-appointment extraction check and print a report.

    Reads ``新任命多人-markdown格式.md`` (a path relative to the current
    working directory), passes its text to ``process_webpage_with_QWen``
    together with the string ``"2025-01-15"``, dumps the raw result as JSON
    and then prints the fields of every extracted person.

    Any list result is treated as a successful parse, including an empty one
    (which is reported as zero extracted people).

    Every exception (missing file, parser error, unexpected result shape) is
    caught and reported on stdout, so this function never raises and always
    returns ``None``.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("测试多人任命信息提取")
    print(banner)

    # (label, key) pairs printed for each extracted person, in display order.
    field_labels = (
        ("中文姓名", 'name_zh'),
        ("英文姓名", 'name_en'),
        ("中文职位", 'title_zh'),
        ("英文职位", 'title_en'),
        ("中文酒店", 'hotel_zh'),
        ("英文酒店", 'hotel_en'),
        ("品牌组合", 'brand_group'),
        ("照片链接", 'pic_url'),
    )

    try:
        # Load the multi-appointment fixture.
        with open('新任命多人-markdown格式.md', 'r', encoding='utf-8') as f:
            multiple_persons_content = f.read()

        print("开始解析多人任命信息...")
        result = process_webpage_with_QWen(multiple_persons_content, "2025-01-15")

        print("解析结果:")
        print(json.dumps(result, ensure_ascii=False, indent=2))

        # Validate the shape of the result before iterating over it.
        if not isinstance(result, list):
            print("❌ 解析失败:返回结果格式不正确")
            return

        print(f"\n✅ 成功提取 {len(result)} 个人员信息:")
        for i, person in enumerate(result, 1):
            print(f"\n第 {i} 个人员:")
            for label, key in field_labels:
                print(f" {label}: {person.get(key, 'N/A')}")

            # The key may be present with a null value; ``.get(key, [])``
            # would then return None and ``len`` would raise, aborting the
            # report for all remaining people.
            career_path = person.get('career_path') or []
            print(f" 职业轨迹: {len(career_path)} 条记录")

    except Exception as e:
        # Best-effort script: report the failure instead of propagating it.
        print(f"❌ 多人测试失败: {str(e)}")
def test_field_validation():
    """Check that a parsed person record exposes every expected key.

    Feeds a small inline markdown sample to ``process_webpage_with_QWen``
    (with the string ``"2025-01-15"``) and, when the result is a non-empty
    list, prints one line per expected key of the first record stating
    whether it is present, followed by an overall verdict.

    Every exception is caught and reported on stdout, so this function never
    raises and always returns ``None``.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("字段验证测试")
    print(rule)

    # Minimal hand-written sample text.
    sample_text = (
        "\n"
        "\n"
        "**张三先生**\n"
        "**Mr. Zhang San**\n"
        "北京万豪酒店\n"
        "总经理\n"
        "张三先生拥有15年酒店管理经验...\n"
    )

    # Keys every parsed record is expected to contain.
    expected_keys = (
        'name_zh', 'name_en', 'title_zh', 'title_en',
        'hotel_zh', 'hotel_en', 'brand_group', 'career_path', 'pic_url',
    )

    try:
        print("测试简单文本解析...")
        parsed = process_webpage_with_QWen(sample_text, "2025-01-15")

        # Anything other than a non-empty list counts as a parse failure.
        if not isinstance(parsed, list) or len(parsed) == 0:
            print("❌ 解析失败")
            return

        record = parsed[0]
        print("字段完整性检查:")

        missing_count = 0
        for key in expected_keys:
            if key not in record:
                print(f" ❌ {key}: 缺失")
                missing_count += 1
                continue
            print(f" ✅ {key}: {record[key]}")

        if missing_count == 0:
            print("\n✅ 所有必要字段都存在")
        else:
            print("\n❌ 部分字段缺失")

    except Exception as err:
        # Best-effort script: report the failure instead of propagating it.
        print(f"❌ 字段验证测试失败: {err!s}")
if __name__ == "__main__":
    # Script entry point: sanity-check the environment, then run each test
    # function in order and print a closing banner.
    print("网页解析功能测试")
    print("确保已设置 QWEN_API_KEY 环境变量")

    # A missing or empty API key only produces a warning; the run continues.
    api_key = os.environ.get('QWEN_API_KEY')
    if not api_key:
        print("⚠️ 警告: 未设置 QWEN_API_KEY 环境变量")

    # Both fixture files must exist; stop with exit status 1 at the first
    # one that does not.
    for fixture in ('新任命单人-markdown格式.md', '新任命多人-markdown格式.md'):
        if not os.path.exists(fixture):
            print(f"❌ 测试文件 '{fixture}' 不存在")
            sys.exit(1)

    # Run the tests in their fixed order.
    for run_case in (test_single_person, test_multiple_persons, test_field_validation):
        run_case()

    closing_rule = "=" * 50
    print("\n" + closing_rule)
    print("测试完成")
    print(closing_rule)
|