# Listing of: test_process_urls_direct.py (5.3 KB)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script that exercises the web_url_crawl function directly (no Flask application required).
"""
import sys
import os
import json
import time
# Put the directory containing this file at the front of sys.path. This must
# run before the `app` import below; presumably this directory is the project
# root that holds the `app` package - confirm against the repository layout.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from app.core.data_parse.parse_task import web_url_crawl
import logging
# Configure logging: INFO level, "timestamp - level - message" line format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
  19. def test_web_url_crawl_direct():
  20. """直接测试web_url_crawl函数"""
  21. # 测试URL列表 - 包含微信公众号文章URL
  22. test_urls = [
  23. "https://mp.weixin.qq.com/s/4yz-kNAWAlF36aeQ_cgQQg", # 微信公众号文章
  24. "https://httpbin.org/html", # 测试HTML页面
  25. "https://httpbin.org/json", # 测试JSON页面
  26. ]
  27. print("开始直接测试 web_url_crawl 函数...")
  28. print(f"测试URL数量: {len(test_urls)}")
  29. print("-" * 50)
  30. # 调用函数
  31. start_time = time.time()
  32. result = web_url_crawl(test_urls)
  33. end_time = time.time()
  34. # 输出结果
  35. print("爬取结果:")
  36. print(f"成功: {result['success']}")
  37. print(f"消息: {result['message']}")
  38. print(f"总URL数: {result['data']['total_urls']}")
  39. print(f"成功数量: {result['data']['success_count']}")
  40. print(f"失败数量: {result['data']['failed_count']}")
  41. print(f"总耗时: {end_time - start_time:.2f} 秒")
  42. print("\n成功爬取的内容:")
  43. for i, content in enumerate(result['data']['contents']):
  44. print(f"\n{i+1}. URL: {content['url']}")
  45. print(f" 状态: {content['status']}")
  46. print(f" 内容长度: {content['content_length']}")
  47. print(f" 原始长度: {content['original_length']}")
  48. print(f" 状态码: {content['status_code']}")
  49. print(f" 编码: {content['encoding']}")
  50. if 'note' in content:
  51. print(f" 备注: {content['note']}")
  52. # 显示内容预览(前300个字符)
  53. preview = content['data'][:300] + "..." if len(content['data']) > 300 else content['data']
  54. print(f" 内容预览: {preview}")
  55. print("\n失败的URL:")
  56. for i, failed in enumerate(result['data']['failed_items']):
  57. print(f"\n{i+1}. URL: {failed['url']}")
  58. print(f" 错误: {failed['error']}")
  59. print(f" 状态: {failed['status']}")
  60. # 保存结果到文件
  61. timestamp = time.strftime("%Y%m%d_%H%M%S")
  62. filename = f"web_crawl_direct_test_{timestamp}.json"
  63. with open(filename, 'w', encoding='utf-8') as f:
  64. json.dump(result, f, ensure_ascii=False, indent=2)
  65. print(f"\n💾 完整结果已保存到: {filename}")
  66. print("-" * 50)
  67. print("直接测试完成!")
  68. def test_api_interface_logic():
  69. """测试API接口的逻辑(模拟)"""
  70. print("\n开始测试API接口逻辑...")
  71. print("-" * 50)
  72. # 模拟API请求数据
  73. test_data = {
  74. "urlArr": [
  75. "https://mp.weixin.qq.com/s/4yz-kNAWAlF36aeQ_cgQQg",
  76. "https://httpbin.org/html"
  77. ]
  78. }
  79. print(f"模拟API请求数据: {json.dumps(test_data, ensure_ascii=False, indent=2)}")
  80. # 模拟参数验证
  81. if not test_data:
  82. print("❌ 请求数据为空")
  83. return
  84. if 'urlArr' not in test_data:
  85. print("❌ 缺少必填字段: urlArr")
  86. return
  87. url_arr = test_data.get('urlArr')
  88. if not isinstance(url_arr, list):
  89. print("❌ urlArr字段必须是数组格式")
  90. return
  91. if len(url_arr) == 0:
  92. print("❌ urlArr数组不能为空")
  93. return
  94. # 验证每个URL是否为字符串
  95. for i, url in enumerate(url_arr):
  96. if not isinstance(url, str):
  97. print(f"❌ urlArr[{i}]必须是字符串格式,当前类型: {type(url).__name__}")
  98. return
  99. print("✅ 参数验证通过")
  100. # 调用核心业务逻辑
  101. print("调用web_url_crawl函数...")
  102. result = web_url_crawl(url_arr)
  103. # 模拟API响应逻辑
  104. if result.get('success', False):
  105. success_count = result.get('data', {}).get('success_count', 0)
  106. failed_count = result.get('data', {}).get('failed_count', 0)
  107. if failed_count == 0:
  108. status_code = 200 # 完全成功
  109. print(f"✅ 完全成功 (状态码: {status_code})")
  110. elif success_count > 0:
  111. status_code = 206 # 部分成功
  112. print(f"⚠️ 部分成功 (状态码: {status_code})")
  113. else:
  114. status_code = 500 # 完全失败
  115. print(f"❌ 完全失败 (状态码: {status_code})")
  116. else:
  117. status_code = 500 # 服务器错误
  118. print(f"❌ 服务器错误 (状态码: {status_code})")
  119. # 模拟API响应
  120. api_response = {
  121. 'success': result.get('success', False),
  122. 'message': result.get('message', '处理完成'),
  123. 'data': result.get('data', {})
  124. }
  125. print(f"\n模拟API响应:")
  126. print(f"状态码: {status_code}")
  127. print(f"响应内容: {json.dumps(api_response, ensure_ascii=False, indent=2)}")
  128. print("-" * 50)
  129. print("API接口逻辑测试完成!")
  130. if __name__ == "__main__":
  131. # 运行直接测试
  132. test_web_url_crawl_direct()
  133. # 运行API接口逻辑测试
  134. test_api_interface_logic()