#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Exercise ``web_url_crawl`` directly, without going through the Flask app.

Run as a script. It executes two manual checks in sequence:

1. ``test_web_url_crawl_direct`` - crawl a fixed URL list, print a report and
   dump the raw result to a timestamped JSON file in the working directory.
2. ``test_api_interface_logic`` - replay the request validation and status
   code selection that the API endpoint is simulated to perform.

NOTE(review): ``web_url_crawl`` presumably performs real HTTP requests, so a
network connection is likely required - confirm against its implementation.
"""
import json
import logging
import os
import sys
import time

# Put this script's directory (expected to be the project root) at the front
# of sys.path so that the ``app`` package can be imported.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from app.core.data_parse.parse_task import web_url_crawl  # noqa: E402

# Configure logging after the project import, in the same order as before,
# so the effective logging setup is unchanged.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Maximum number of characters of crawled content shown in the report.
_PREVIEW_LIMIT = 300

# (label, key) pairs printed for every successfully crawled item.
_CONTENT_FIELDS = (
    ("状态", "status"),
    ("内容长度", "content_length"),
    ("原始长度", "original_length"),
    ("状态码", "status_code"),
    ("编码", "encoding"),
)

# (label, key) pairs printed for every failed item.
_FAILED_FIELDS = (
    ("错误", "error"),
    ("状态", "status"),
)


def _preview(text, limit=_PREVIEW_LIMIT):
    """Return *text* cut to *limit* characters, with "..." appended if cut."""
    text = text or ""  # tolerate items whose content is missing or None
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def _url_arr_error(payload):
    """Return the validation error line for *payload*, or None if it is valid.

    The checks are: payload is non-empty, has a ``urlArr`` key, the value is
    a non-empty list, and every element of that list is a string.
    """
    if not payload:
        return "❌ 请求数据为空"
    if 'urlArr' not in payload:
        return "❌ 缺少必填字段: urlArr"

    url_arr = payload.get('urlArr')
    if not isinstance(url_arr, list):
        return "❌ urlArr字段必须是数组格式"
    if not url_arr:
        return "❌ urlArr数组不能为空"

    for index, url in enumerate(url_arr):
        if not isinstance(url, str):
            return f"❌ urlArr[{index}]必须是字符串格式,当前类型: {type(url).__name__}"
    return None


def _simulated_status(result):
    """Map a crawl result to the simulated status code and its report label.

    Returns:
        A ``(status_code, label)`` tuple: 200 when nothing failed, 206 when
        some URLs succeeded and some failed, 500 when every URL failed or the
        result is not flagged as successful.
    """
    if not result.get('success', False):
        return 500, "❌ 服务器错误"

    # ``or {}`` also covers a result whose 'data' key is present but None.
    data = result.get('data') or {}
    if data.get('failed_count', 0) == 0:
        return 200, "✅ 完全成功"
    if data.get('success_count', 0) > 0:
        return 206, "⚠️ 部分成功"
    return 500, "❌ 完全失败"


def test_web_url_crawl_direct():
    """Call ``web_url_crawl`` on a fixed URL list and print a report.

    Prints the summary counters, the details of every successful and failed
    item and the elapsed time, then writes the raw result to
    ``web_crawl_direct_test_<timestamp>.json`` in the current directory.

    The result is read with ``.get`` defaults (matching
    ``test_api_interface_logic``) so that a failed crawl whose result lacks
    some keys still produces a report instead of raising ``KeyError``.
    """
    # URLs under test, including a WeChat official-account article.
    test_urls = [
        "https://mp.weixin.qq.com/s/4yz-kNAWAlF36aeQ_cgQQg",  # WeChat article
        "https://httpbin.org/html",  # HTML test page
        "https://httpbin.org/json",  # JSON test page
    ]

    print("开始直接测试 web_url_crawl 函数...")
    print(f"测试URL数量: {len(test_urls)}")
    print("-" * 50)

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    result = web_url_crawl(test_urls)
    elapsed = time.perf_counter() - started

    data = result.get('data') or {}

    # Summary.
    print("爬取结果:")
    print(f"成功: {result.get('success', False)}")
    print(f"消息: {result.get('message', '处理完成')}")
    print(f"总URL数: {data.get('total_urls', len(test_urls))}")
    print(f"成功数量: {data.get('success_count', 0)}")
    print(f"失败数量: {data.get('failed_count', 0)}")
    print(f"总耗时: {elapsed:.2f} 秒")

    print("\n成功爬取的内容:")
    for index, content in enumerate(data.get('contents') or [], start=1):
        print(f"\n{index}. URL: {content.get('url')}")
        for label, key in _CONTENT_FIELDS:
            print(f" {label}: {content.get(key)}")
        if 'note' in content:
            print(f" 备注: {content['note']}")
        # Show only the beginning of the content to keep the report readable.
        print(f" 内容预览: {_preview(content.get('data'))}")

    print("\n失败的URL:")
    for index, failed in enumerate(data.get('failed_items') or [], start=1):
        print(f"\n{index}. URL: {failed.get('url')}")
        for label, key in _FAILED_FIELDS:
            print(f" {label}: {failed.get(key)}")

    # Persist the complete, unmodified result for later inspection.
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f"web_crawl_direct_test_{timestamp}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n💾 完整结果已保存到: {filename}")
    print("-" * 50)
    print("直接测试完成!")


def test_api_interface_logic():
    """Simulate the API endpoint: validate a request, crawl, build a response.

    Uses a hard-coded request body, validates it, calls ``web_url_crawl``
    with its URL list, picks the simulated status code and prints the
    simulated response. Returns early (after printing the reason) when the
    request body fails validation.
    """
    print("\n开始测试API接口逻辑...")
    print("-" * 50)

    # Simulated API request body.
    test_data = {
        "urlArr": [
            "https://mp.weixin.qq.com/s/4yz-kNAWAlF36aeQ_cgQQg",
            "https://httpbin.org/html"
        ]
    }

    print(f"模拟API请求数据: {json.dumps(test_data, ensure_ascii=False, indent=2)}")

    # Simulated parameter validation.
    error = _url_arr_error(test_data)
    if error is not None:
        print(error)
        return
    url_arr = test_data['urlArr']

    print("✅ 参数验证通过")

    # Core business logic.
    print("调用web_url_crawl函数...")
    result = web_url_crawl(url_arr)

    # Simulated status code selection.
    status_code, label = _simulated_status(result)
    print(f"{label} (状态码: {status_code})")

    # Simulated API response body.
    api_response = {
        'success': result.get('success', False),
        'message': result.get('message', '处理完成'),
        'data': result.get('data', {})
    }

    print("\n模拟API响应:")
    print(f"状态码: {status_code}")
    print(f"响应内容: {json.dumps(api_response, ensure_ascii=False, indent=2)}")

    print("-" * 50)
    print("API接口逻辑测试完成!")


if __name__ == "__main__":
    # Run the direct crawl test.
    test_web_url_crawl_direct()

    # Run the simulated API logic test.
    test_api_interface_logic()