#!/bin/bash
#
# DataOps Platform diagnostic script.
# Collects the information usually needed to work out why the application
# failed to start: directory layout, supervisor state, Python environment,
# application import, log tails, listening port and configuration.
#
# Usage: ./<this script>   (uses sudo for supervisor, logs and socket listing)
#
# NOTE: `set -e` is deliberately NOT enabled. Every check is best-effort and
# the script must keep going after a failing check so the report is complete.

# Configuration
readonly APP_NAME="dataops-platform"
readonly APP_DIR="/opt/dataops-platform"
readonly VENV_DIR="${APP_DIR}/venv"
readonly LOG_DIR="${APP_DIR}/logs"
readonly APP_PORT=5500
readonly APP_USER="ubuntu"
readonly APP_GROUP="ubuntu"

# Colour escape sequences (expanded by printf's %b)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# The echo_* helpers print one tagged, coloured line to stdout.
# Arguments: $1 - message text (printed literally; backslashes are not
#                 interpreted because it is passed through %s).
# Everything goes to stdout so the whole report can be captured with a
# single redirection.
echo_info() {
  printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}

echo_warn() {
  printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "${1-}"
}

echo_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${1-}"
}

# Prints a section banner preceded by a blank line.
# Arguments: $1 - section title.
echo_section() {
  printf '\n%b========================================%b\n' "${BLUE}" "${NC}"
  printf '%b  %s%b\n' "${BLUE}" "${1-}" "${NC}"
  printf '%b========================================%b\n' "${BLUE}" "${NC}"
}

# 1. Check directory layout.
# Reports whether the application, virtualenv and log directories exist.
# Side effect: creates the log directory (via sudo) when it is missing.
check_directories() {
  echo_section "1. 检查目录结构"

  echo_info "应用目录: ${APP_DIR}"
  if [[ -d "${APP_DIR}" ]]; then
    echo_info "✓ 应用目录存在"
    ls -la "${APP_DIR}" | head -20
  else
    echo_error "✗ 应用目录不存在"
  fi

  echo ""
  echo_info "虚拟环境: ${VENV_DIR}"
  if [[ -d "${VENV_DIR}" ]]; then
    echo_info "✓ 虚拟环境存在"
  else
    echo_error "✗ 虚拟环境不存在"
  fi

  echo ""
  echo_info "日志目录: ${LOG_DIR}"
  if [[ -d "${LOG_DIR}" ]]; then
    echo_info "✓ 日志目录存在"
    ls -la "${LOG_DIR}"
  else
    echo_error "✗ 日志目录不存在,正在创建..."
    # Report a failed creation instead of silently carrying on.
    if ! sudo mkdir -p "${LOG_DIR}"; then
      echo_error "✗ 无法创建日志目录: ${LOG_DIR}"
    elif ! sudo chown "${APP_USER}:${APP_GROUP}" "${LOG_DIR}"; then
      echo_error "✗ 无法设置日志目录属主: ${APP_USER}:${APP_GROUP}"
    fi
  fi
}

# 2. Check supervisor configuration.
# Prints the program's supervisor config, whether supervisord is running,
# and the status supervisorctl reports for the application.
check_supervisor() {
  local conf_file="/etc/supervisor/conf.d/${APP_NAME}.conf"

  echo_section "2. 检查 Supervisor 配置"

  echo_info "Supervisor 配置文件:"
  if [[ -f "${conf_file}" ]]; then
    echo_info "✓ 配置文件存在"
    cat "${conf_file}"
  else
    echo_error "✗ 配置文件不存在: ${conf_file}"
  fi

  echo ""
  echo_info "Supervisord 进程状态:"
  if pgrep -x "supervisord" > /dev/null; then
    echo_info "✓ supervisord 正在运行"
    # The bracket keeps grep from matching its own command line.
    ps aux | grep '[s]upervisord'
  else
    echo_error "✗ supervisord 未运行"
  fi

  echo ""
  echo_info "应用状态:"
  sudo supervisorctl status "${APP_NAME}" || echo_error "无法获取应用状态"
}

# 3. Check the Python environment.
# Prints the interpreter version, the installed Flask/Gunicorn versions and
# verifies that the Asia/Shanghai time zone can be loaded.
check_python() {
  local python_bin="${VENV_DIR}/bin/python"

  echo_section "3. 检查 Python 环境"

  if [[ ! -f "${python_bin}" ]]; then
    echo_error "Python 虚拟环境不存在"
    return
  fi

  echo_info "Python 版本:"
  "${python_bin}" --version

  echo ""
  echo_info "检查关键依赖:"
  # Import the package (proves it is importable) but read the version from
  # the distribution metadata: flask.__version__ is deprecated since Flask 3.0.
  "${python_bin}" -c \
    'import flask; from importlib.metadata import version; print("Flask:", version("flask"))' 2>&1
  "${python_bin}" -c \
    'import gunicorn; from importlib.metadata import version; print("Gunicorn:", version("gunicorn"))' 2>&1

  echo ""
  echo_info "检查 zoneinfo (时区模块):"
  "${python_bin}" - <<'PY' 2>&1 || echo_error "✗ zoneinfo 不可用或时区数据缺失"
try:
    from zoneinfo import ZoneInfo
    print('✓ 使用标准库 zoneinfo')
except ImportError:
    from backports.zoneinfo import ZoneInfo
    print('✓ 使用 backports.zoneinfo (Python 3.8)')

tz = ZoneInfo('Asia/Shanghai')
print(f'✓ 东八区时区加载成功: {tz}')
PY
}

# 4. Try to import the application.
# Imports `create_app` from the `app` module inside APP_DIR and calls it,
# printing a traceback on failure.
# Returns: 0 when the app instance was created, non-zero otherwise.
test_app_import() {
  local python_bin="${VENV_DIR}/bin/python"

  echo_section "4. 测试应用导入"

  echo_info "尝试导入应用模块..."
  if [[ ! -x "${python_bin}" ]]; then
    echo_error "Python 虚拟环境不存在"
    return 1
  fi

  # Run in a subshell so the directory change does not leak into the checks
  # that follow, and stop if the directory cannot be entered.
  (
    cd "${APP_DIR}" || {
      echo_error "无法进入应用目录: ${APP_DIR}"
      exit 1
    }
    # The application path is passed as an argument rather than spliced into
    # the Python source, so quotes in the path cannot break the snippet.
    "${python_bin}" - "${APP_DIR}" <<'PY' 2>&1
import sys
import traceback

sys.path.insert(0, sys.argv[1])
try:
    from app import create_app
    print('✓ 应用模块导入成功')
    app = create_app()
    print('✓ 应用实例创建成功')
except Exception as e:
    print(f'✗ 导入失败: {e}')
    traceback.print_exc()
    sys.exit(1)
PY
  )
}

# 5. Check log files.
# Prints the tail of the supervisord log, the gunicorn error/access logs and
# the per-program supervisor stderr/stdout logs, warning about missing ones.
check_logs() {
  local supervisor_log_dir="/var/log/supervisor"

  echo_section "5. 检查日志文件"

  echo_info "Supervisor 日志:"
  if [[ -f "${supervisor_log_dir}/supervisord.log" ]]; then
    echo_info "最近 20 行:"
    sudo tail -20 "${supervisor_log_dir}/supervisord.log"
  else
    echo_warn "日志文件不存在"
  fi

  echo ""
  echo_info "应用错误日志:"
  if [[ -f "${LOG_DIR}/gunicorn_error.log" ]]; then
    echo_info "最近 30 行:"
    tail -30 "${LOG_DIR}/gunicorn_error.log"
  else
    echo_warn "应用错误日志不存在: ${LOG_DIR}/gunicorn_error.log"
  fi

  echo ""
  echo_info "应用访问日志:"
  if [[ -f "${LOG_DIR}/gunicorn_access.log" ]]; then
    echo_info "最近 10 行:"
    tail -10 "${LOG_DIR}/gunicorn_access.log"
  else
    echo_warn "应用访问日志不存在: ${LOG_DIR}/gunicorn_access.log"
  fi

  echo ""
  echo_info "Supervisor 应用日志:"
  if [[ -f "${supervisor_log_dir}/${APP_NAME}-stderr.log" ]]; then
    echo_info "stderr 最近 30 行:"
    sudo tail -30 "${supervisor_log_dir}/${APP_NAME}-stderr.log"
  else
    echo_warn "Supervisor stderr 日志不存在"
  fi

  if [[ -f "${supervisor_log_dir}/${APP_NAME}-stdout.log" ]]; then
    echo_info "stdout 最近 20 行:"
    sudo tail -20 "${supervisor_log_dir}/${APP_NAME}-stdout.log"
  else
    echo_warn "Supervisor stdout 日志不存在"
  fi
}

# 6. Check whether the application port is being listened on.
# Uses `ss` when available and falls back to `netstat`; prints the matching
# socket lines.
check_ports() {
  local listing

  echo_section "6. 检查端口占用"

  echo_info "检查 ${APP_PORT} 端口:"
  if command -v ss > /dev/null 2>&1; then
    listing=$(sudo ss -tlnp)
  elif command -v netstat > /dev/null 2>&1; then
    listing=$(sudo netstat -tlnp)
  else
    echo_warn "未找到 ss 或 netstat,无法检查端口"
    return
  fi

  # Require whitespace after the port so that e.g. :55000 does not match.
  if grep -E ":${APP_PORT}[[:space:]]" <<< "${listing}"; then
    echo_info "✓ 端口 ${APP_PORT} 已被占用"
  else
    echo_warn "✗ 端口 ${APP_PORT} 未被占用(应用可能未启动)"
  fi
}

# 7. Check environment variables and configuration.
# Prints the .env file with lines that look sensitive removed.
check_config() {
  local env_file="${APP_DIR}/.env"
  # Case-insensitive keywords, plus URLs that embed user:password@host.
  local sensitive='PASSWORD|PASSWD|SECRET|KEY|TOKEN|://[^/@[:space:]]+:[^/@[:space:]]+@'

  echo_section "7. 检查配置文件"

  if [[ -f "${env_file}" ]]; then
    echo_info "✓ .env 文件存在"
    echo_info "环境变量(隐藏敏感信息):"
    # grep -v exits non-zero when every line was filtered out.
    grep -Eiv -- "${sensitive}" "${env_file}" || echo "无非敏感配置"
  else
    echo_warn "✗ .env 文件不存在"
  fi
}

# 8. Print suggested fixes.
# Only prints commands for the operator to run; nothing is executed here.
provide_suggestions() {
  echo_section "8. 修复建议"

  echo_info "基于诊断结果,尝试以下步骤:"
  echo ""
  echo "1. 如果是 zoneinfo 问题(Python 3.9+ 时区模块):"
  echo "   sudo apt-get update"
  echo "   sudo apt-get install -y tzdata"
  echo ""
  echo "2. 如果日志目录不存在:"
  echo "   sudo mkdir -p ${LOG_DIR}"
  echo "   sudo chown ${APP_USER}:${APP_GROUP} ${LOG_DIR}"
  echo ""
  echo "3. 重新加载 supervisor 配置:"
  echo "   sudo supervisorctl reread"
  echo "   sudo supervisorctl update"
  echo ""
  echo "4. 手动启动应用测试:"
  echo "   cd ${APP_DIR}"
  echo "   source ${VENV_DIR}/bin/activate"
  echo "   gunicorn -c gunicorn_config.py 'app:create_app()'"
  echo ""
  echo "5. 查看实时日志:"
  echo "   sudo tail -f /var/log/supervisor/${APP_NAME}-stderr.log"
}

# Entry point: runs every check in order, then prints the suggestions.
main() {
  echo "=========================================="
  echo "  DataOps Platform 问题诊断"
  echo "=========================================="
  echo ""

  check_directories
  check_supervisor
  check_python
  test_app_import
  check_logs
  check_ports
  check_config
  provide_suggestions

  echo ""
  echo_info "诊断完成!"
}

main "$@"