#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Manual test script for the web_url_crawl function.

Crawls a small list of URLs and prints a human-readable report of the
successes and failures returned by web_url_crawl.
"""

import logging
import os
import sys

# Make the project root importable BEFORE importing project modules.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from app.core.data_parse.parse_task import web_url_crawl

# Basic logging configuration for the test run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)


def test_web_url_crawl():
    """Run web_url_crawl on a fixed URL list and print the results."""
    # URLs under test: one WeChat official-account article plus two
    # httpbin endpoints (HTML and JSON responses).
    test_urls = [
        "https://mp.weixin.qq.com/s/4yz-kNAWAlF36aeQ_cgQQg",  # WeChat article
        "https://httpbin.org/html",  # plain-HTML test page
        "https://httpbin.org/json",  # JSON test page
    ]

    print("开始测试 web_url_crawl 函数...")
    print(f"测试URL数量: {len(test_urls)}")
    print("-" * 50)

    # Invoke the crawler on the whole batch.
    result = web_url_crawl(test_urls)
    data = result['data']

    # Top-level summary of the crawl.
    print("爬取结果:")
    print(f"成功: {result['success']}")
    print(f"消息: {result['message']}")
    print(f"总URL数: {data['total_urls']}")
    print(f"成功数量: {data['success_count']}")
    print(f"失败数量: {data['failed_count']}")

    # Per-URL details for successfully crawled pages.
    print("\n成功爬取的内容:")
    for idx, entry in enumerate(data['contents'], start=1):
        print(f"\n{idx}. URL: {entry['url']}")
        print(f"   状态: {entry['status']}")
        print(f"   内容长度: {entry['content_length']}")
        print(f"   原始长度: {entry['original_length']}")
        print(f"   状态码: {entry['status_code']}")
        print(f"   编码: {entry['encoding']}")
        if 'note' in entry:
            print(f"   备注: {entry['note']}")

        # Show at most the first 300 characters of the crawled body.
        body = entry['data']
        if len(body) > 300:
            preview = body[:300] + "..."
        else:
            preview = body
        print(f"   内容预览: {preview}")

    # Per-URL details for failed crawls.
    print("\n失败的URL:")
    for idx, item in enumerate(data['failed_items'], start=1):
        print(f"\n{idx}. URL: {item['url']}")
        print(f"   错误: {item['error']}")
        print(f"   状态: {item['status']}")

    print("-" * 50)
    print("测试完成!")


if __name__ == "__main__":
    test_web_url_crawl()