123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 直接测试 web_url_crawl 函数的脚本(不依赖Flask应用)
- """
- import sys
- import os
- import json
- import time
- # 添加项目根目录到Python路径
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
- from app.core.data_parse.parse_task import web_url_crawl
- import logging
# Logging setup: INFO threshold, each record rendered as "time - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def test_web_url_crawl_direct():
    """Call web_url_crawl directly on a fixed URL list and report the outcome.

    Prints the summary counters, each crawled item, and each failed item,
    then dumps the complete result to a timestamped JSON file in the
    current working directory.
    """
    # Sample targets: one WeChat official-account article and two httpbin pages.
    targets = [
        "https://mp.weixin.qq.com/s/4yz-kNAWAlF36aeQ_cgQQg",  # WeChat article
        "https://httpbin.org/html",  # HTML test page
        "https://httpbin.org/json",  # JSON test page
    ]
    divider = "-" * 50

    print("开始直接测试 web_url_crawl 函数...")
    print(f"测试URL数量: {len(targets)}")
    print(divider)

    # Time only the crawl call itself.
    started = time.time()
    outcome = web_url_crawl(targets)
    elapsed = time.time() - started

    # Summary section.
    print("爬取结果:")
    print(f"成功: {outcome['success']}")
    print(f"消息: {outcome['message']}")
    payload = outcome['data']
    print(f"总URL数: {payload['total_urls']}")
    print(f"成功数量: {payload['success_count']}")
    print(f"失败数量: {payload['failed_count']}")
    print(f"总耗时: {elapsed:.2f} 秒")

    # Per-item details for everything that was crawled.
    print("\n成功爬取的内容:")
    for position, page in enumerate(payload['contents'], 1):
        print(f"\n{position}. URL: {page['url']}")
        print(f" 状态: {page['status']}")
        print(f" 内容长度: {page['content_length']}")
        print(f" 原始长度: {page['original_length']}")
        print(f" 状态码: {page['status_code']}")
        print(f" 编码: {page['encoding']}")
        if 'note' in page:
            print(f" 备注: {page['note']}")

        # Show at most the first 300 characters, marking truncation with "...".
        body = page['data']
        if len(body) > 300:
            preview = body[:300] + "..."
        else:
            preview = body
        print(f" 内容预览: {preview}")

    # Per-item details for everything that failed.
    print("\n失败的URL:")
    for position, failure in enumerate(payload['failed_items'], 1):
        print(f"\n{position}. URL: {failure['url']}")
        print(f" 错误: {failure['error']}")
        print(f" 状态: {failure['status']}")

    # Persist the full result next to wherever the script was launched from.
    stamp = time.strftime("%Y%m%d_%H%M%S")
    out_path = f"web_crawl_direct_test_{stamp}.json"
    with open(out_path, 'w', encoding='utf-8') as out_file:
        json.dump(outcome, out_file, ensure_ascii=False, indent=2)
    print(f"\n💾 完整结果已保存到: {out_path}")

    print(divider)
    print("直接测试完成!")
def test_api_interface_logic():
    """Replay the API layer's flow around web_url_crawl without a web server.

    Builds a sample request body, runs the same parameter checks an endpoint
    would (printing the reason and returning early on the first failure),
    calls web_url_crawl, and prints the status code and response body that
    would be produced: 200 when nothing failed, 206 when only some URLs
    succeeded, and 500 otherwise.
    """
    divider = "-" * 50

    print("\n开始测试API接口逻辑...")
    print(divider)

    # Sample request body.
    request_body = {
        "urlArr": [
            "https://mp.weixin.qq.com/s/4yz-kNAWAlF36aeQ_cgQQg",
            "https://httpbin.org/html"
        ]
    }

    print(f"模拟API请求数据: {json.dumps(request_body, ensure_ascii=False, indent=2)}")

    # Parameter validation: bail out on the first problem found.
    if not request_body:
        print("❌ 请求数据为空")
        return

    if 'urlArr' not in request_body:
        print("❌ 缺少必填字段: urlArr")
        return

    urls = request_body.get('urlArr')

    if not isinstance(urls, list):
        print("❌ urlArr字段必须是数组格式")
        return

    if not urls:
        print("❌ urlArr数组不能为空")
        return

    # Locate the first entry that is not a string, if there is one.
    offender = next(
        ((pos, entry) for pos, entry in enumerate(urls) if not isinstance(entry, str)),
        None,
    )
    if offender is not None:
        pos, entry = offender
        print(f"❌ urlArr[{pos}]必须是字符串格式,当前类型: {type(entry).__name__}")
        return

    print("✅ 参数验证通过")

    # Core business call.
    print("调用web_url_crawl函数...")
    outcome = web_url_crawl(urls)

    # Derive the HTTP status the endpoint would answer with.
    if not outcome.get('success', False):
        status_code = 500  # server error
        print(f"❌ 服务器错误 (状态码: {status_code})")
    else:
        stats = outcome.get('data', {})
        succeeded = stats.get('success_count', 0)
        failed = stats.get('failed_count', 0)

        if failed == 0:
            status_code = 200  # everything succeeded
            print(f"✅ 完全成功 (状态码: {status_code})")
        elif succeeded > 0:
            status_code = 206  # partial success
            print(f"⚠️ 部分成功 (状态码: {status_code})")
        else:
            status_code = 500  # nothing succeeded
            print(f"❌ 完全失败 (状态码: {status_code})")

    # Response body the endpoint would return.
    api_response = {
        'success': outcome.get('success', False),
        'message': outcome.get('message', '处理完成'),
        'data': outcome.get('data', {})
    }
    rendered = json.dumps(api_response, ensure_ascii=False, indent=2)

    print(f"\n模拟API响应:")
    print(f"状态码: {status_code}")
    print(f"响应内容: {rendered}")

    print(divider)
    print("API接口逻辑测试完成!")
if __name__ == "__main__":
    # Run the direct crawl test first, then the API-logic walkthrough.
    for scenario in (test_web_url_crawl_direct, test_api_interface_logic):
        scenario()
|