#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script for testing the web_url_crawl function.
"""
import logging
import os
import sys

# Add the project root directory to the Python path so the app package resolves
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from app.core.data_parse.parse_task import web_url_crawl

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)


def test_web_url_crawl():
    """Test the web page crawling functionality."""
    # Test URL list - includes a user-provided WeChat Official Account article URL
    test_urls = [
        "https://mp.weixin.qq.com/s/4yz-kNAWAlF36aeQ_cgQQg",  # WeChat Official Account article
        "https://httpbin.org/html",  # test HTML page
        "https://httpbin.org/json",  # test JSON page
    ]

    print("Starting test of web_url_crawl...")
    print(f"Number of test URLs: {len(test_urls)}")
    print("-" * 50)

    # Call the function under test
    result = web_url_crawl(test_urls)
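
    # Expected shape of `result`, inferred from the field accesses below.
    # This is an assumption about web_url_crawl's return contract, not a
    # verified spec:
    # {
    #     'success': bool,
    #     'message': str,
    #     'data': {
    #         'total_urls': int,
    #         'success_count': int,
    #         'failed_count': int,
    #         'contents': [{'url', 'status', 'content_length', 'original_length',
    #                       'status_code', 'encoding', 'data', optionally 'note'}],
    #         'failed_items': [{'url', 'error', 'status'}],
    #     },
    # }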
    # Print the results
    print("Crawl results:")
    print(f"Success: {result['success']}")
    print(f"Message: {result['message']}")
    print(f"Total URLs: {result['data']['total_urls']}")
    print(f"Succeeded: {result['data']['success_count']}")
    print(f"Failed: {result['data']['failed_count']}")

    print("\nSuccessfully crawled content:")
    for i, content in enumerate(result['data']['contents']):
        print(f"\n{i+1}. URL: {content['url']}")
        print(f"   Status: {content['status']}")
        print(f"   Content length: {content['content_length']}")
        print(f"   Original length: {content['original_length']}")
        print(f"   Status code: {content['status_code']}")
        print(f"   Encoding: {content['encoding']}")
        if 'note' in content:
            print(f"   Note: {content['note']}")

        # Show a content preview (first 300 characters)
        preview = content['data'][:300] + "..." if len(content['data']) > 300 else content['data']
        print(f"   Content preview: {preview}")
    print("\nFailed URLs:")
    for i, failed in enumerate(result['data']['failed_items']):
        print(f"\n{i+1}. URL: {failed['url']}")
        print(f"   Error: {failed['error']}")
        print(f"   Status: {failed['status']}")

    print("-" * 50)
    print("Test complete!")


if __name__ == "__main__":
    test_web_url_crawl()
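
# Usage note: the sys.path insert at the top adds this script's own directory,
# which assumes the script sits in the project root so that
# app.core.data_parse.parse_task is importable when it is run directly.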