test_web_crawl.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 测试 web_url_crawl 函数的脚本
  5. """
  6. import sys
  7. import os
  8. # 添加项目根目录到Python路径
  9. sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
  10. from app.core.data_parse.parse_task import web_url_crawl
  11. import logging
  # Configure root logging: INFO level, timestamped "time - level - message" lines.
  logging.basicConfig(
      format="%(asctime)s - %(levelname)s - %(message)s",
      level=logging.INFO,
  )
  def test_web_url_crawl():
      """Call web_url_crawl on a fixed list of URLs and print a report.

      This is a manual check: it prints the overall status fields, the
      details of every successfully crawled item (with the content cut to
      its first 300 characters followed by "..." when longer) and every
      failed item. Nothing is asserted and nothing is returned.
      """
      # Sample targets handed to the crawler in a single call.
      targets = [
          "https://mp.weixin.qq.com/s/4yz-kNAWAlF36aeQ_cgQQg",  # WeChat official-account article
          "https://httpbin.org/html",  # HTML test page
          "https://httpbin.org/json",  # JSON test page
      ]
      divider = "-" * 50

      print("开始测试 web_url_crawl 函数...")
      print(f"测试URL数量: {len(targets)}")
      print(divider)

      outcome = web_url_crawl(targets)

      # Overall summary of the crawl.
      print("爬取结果:")
      print(f"成功: {outcome['success']}")
      print(f"消息: {outcome['message']}")
      stats = outcome['data']
      print(f"总URL数: {stats['total_urls']}")
      print(f"成功数量: {stats['success_count']}")
      print(f"失败数量: {stats['failed_count']}")

      # Per-item details for everything that was fetched.
      print("\n成功爬取的内容:")
      for position, page in enumerate(stats['contents'], start=1):
          print(f"\n{position}. URL: {page['url']}")
          print(f" 状态: {page['status']}")
          print(f" 内容长度: {page['content_length']}")
          print(f" 原始长度: {page['original_length']}")
          print(f" 状态码: {page['status_code']}")
          print(f" 编码: {page['encoding']}")
          # 'note' is the only field treated as optional.
          if 'note' in page:
              print(f" 备注: {page['note']}")
          # Show only the first 300 characters of long content.
          body = page['data']
          if len(body) > 300:
              preview = body[:300] + "..."
          else:
              preview = body
          print(f" 内容预览: {preview}")

      # Items the crawler reported as failed.
      print("\n失败的URL:")
      for position, failure in enumerate(stats['failed_items'], start=1):
          print(f"\n{position}. URL: {failure['url']}")
          print(f" 错误: {failure['error']}")
          print(f" 状态: {failure['status']}")

      print(divider)
      print("测试完成!")
  # Run the crawl check only when this file is executed directly as a script,
  # not when it is imported as a module.
  if __name__ == "__main__":
      test_web_url_crawl()