#!/bin/bash
#
# DataOps Platform 问题诊断脚本
# 用于排查启动失败的原因
#
# Deployment layout: program name plus the install root; the virtualenv
# and the application log directory both live under the install root.
APP_NAME='dataops-platform'
APP_DIR='/opt/dataops-platform'
VENV_DIR="$APP_DIR/venv"
LOG_DIR="$APP_DIR/logs"

# ANSI color sequences for terminal output. They are stored as literal
# backslash escapes (not raw ESC bytes), so whatever prints them has to
# interpret the escapes at output time.
NC='\033[0m' # reset / no color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
# Print an informational line to stdout, tagged with a green "[INFO]".
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed verbatim
# Only the color variables go through %b (escape interpretation). The
# message goes through %s, so backslashes in it (paths, log excerpts) are
# not treated as escape sequences the way `echo -e` would treat them.
echo_info() {
  printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}
# Print a warning line to stdout, tagged with a yellow "[WARN]".
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed verbatim
# The message is formatted with %s rather than `echo -e`, so backslashes
# inside it are printed as-is; only the color variables are expanded.
echo_warn() {
  printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "${1-}"
}
# Print an error line, tagged with a red "[ERROR]".
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim
# Output stays on stdout so error lines remain in sequence with the rest
# of the report when it is piped or captured. The message is formatted
# with %s rather than `echo -e`, so backslashes in it are printed as-is.
echo_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${1-}"
}
# Print a blue section banner: a blank line, a rule, the title, a rule.
# Globals:   BLUE, NC (read)
# Arguments: $1 - section title, printed verbatim
# The title is formatted with %s rather than `echo -e`, so backslashes in
# it are not interpreted; only the color variables are expanded with %b.
echo_section() {
  local rule='========================================'

  printf '\n%b%s%b\n' "${BLUE}" "${rule}" "${NC}"
  printf '%b %s%b\n' "${BLUE}" "${1-}" "${NC}"
  printf '%b%s%b\n' "${BLUE}" "${rule}" "${NC}"
}
# 1. Check the directory layout.
#
# Reports whether the application directory, the virtualenv and the log
# directory exist, listing the contents of the application and log
# directories when present. A missing log directory is created with sudo
# and handed to ubuntu:ubuntu.
# Globals:   APP_DIR, VENV_DIR, LOG_DIR (read)
# Outputs:   report lines on stdout
check_directories() {
  echo_section "1. 检查目录结构"

  echo_info "应用目录: ${APP_DIR}"
  if [[ -d "${APP_DIR}" ]]; then
    echo_info "✓ 应用目录存在"
    ls -la -- "${APP_DIR}" | head -n 20
  else
    echo_error "✗ 应用目录不存在"
  fi

  echo ""
  echo_info "虚拟环境: ${VENV_DIR}"
  if [[ -d "${VENV_DIR}" ]]; then
    echo_info "✓ 虚拟环境存在"
  else
    echo_error "✗ 虚拟环境不存在"
  fi

  echo ""
  echo_info "日志目录: ${LOG_DIR}"
  if [[ -d "${LOG_DIR}" ]]; then
    echo_info "✓ 日志目录存在"
    ls -la -- "${LOG_DIR}"
  else
    echo_error "✗ 日志目录不存在,正在创建..."
    # Only change ownership once the directory really exists, and say so
    # when either step fails instead of silently carrying on.
    if sudo mkdir -p "${LOG_DIR}"; then
      sudo chown ubuntu:ubuntu "${LOG_DIR}" \
        || echo_error "✗ 无法修改日志目录属主: ${LOG_DIR}"
    else
      echo_error "✗ 无法创建日志目录: ${LOG_DIR}"
    fi
  fi
}
# 2. Check the Supervisor setup.
#
# Prints the program's config file from /etc/supervisor/conf.d, reports
# whether a process named exactly "supervisord" is running, and asks
# supervisorctl for the program's status.
# Globals:   APP_NAME (read)
# Outputs:   report lines on stdout
check_supervisor() {
  local conf_file="/etc/supervisor/conf.d/${APP_NAME}.conf"
  local pids

  echo_section "2. 检查 Supervisor 配置"

  echo_info "Supervisor 配置文件:"
  if [[ -f "${conf_file}" ]]; then
    echo_info "✓ 配置文件存在"
    cat -- "${conf_file}"
  else
    echo_error "✗ 配置文件不存在: ${conf_file}"
  fi

  echo ""
  echo_info "Supervisord 进程状态:"
  # Reuse the exact-name pgrep match for the listing. A `ps | grep` text
  # match would also show unrelated processes that merely mention
  # "supervisord" on their command line (e.g. a tail of its log).
  pids=$(pgrep -d, -x "supervisord")
  if [[ -n "${pids}" ]]; then
    echo_info "✓ supervisord 正在运行"
    ps u -p "${pids}"
  else
    echo_error "✗ supervisord 未运行"
  fi

  echo ""
  echo_info "应用状态:"
  # NOTE(review): supervisorctl may also exit non-zero when the program is
  # simply not RUNNING, in which case this message follows a status line
  # that was in fact printed — confirm against the installed version.
  sudo supervisorctl status "${APP_NAME}" || echo_error "无法获取应用状态"
}
# 3. Check the Python environment inside the virtualenv.
#
# Prints the interpreter version, the Flask and Gunicorn versions, and
# verifies that a ZoneInfo implementation (stdlib or backports) can load
# the Asia/Shanghai zone.
# Globals:   VENV_DIR (read)
# Outputs:   report lines on stdout (interpreter stderr is merged in)
check_python() {
  # Quoted everywhere so a virtualenv path containing whitespace works.
  local py="${VENV_DIR}/bin/python"

  echo_section "3. 检查 Python 环境"

  if [[ -f "${py}" ]]; then
    echo_info "Python 版本:"
    "${py}" --version

    echo ""
    echo_info "检查关键依赖:"
    "${py}" -c "import flask; print(f'Flask: {flask.__version__}')" 2>&1
    "${py}" -c "import gunicorn; print(f'Gunicorn: {gunicorn.__version__}')" 2>&1

    echo ""
    echo_info "检查 zoneinfo (时区模块):"
    "${py}" -c "
try:
    from zoneinfo import ZoneInfo
    print('✓ 使用标准库 zoneinfo')
except ImportError:
    from backports.zoneinfo import ZoneInfo
    print('✓ 使用 backports.zoneinfo (Python 3.8)')
tz = ZoneInfo('Asia/Shanghai')
print(f'✓ 东八区时区加载成功: {tz}')
" 2>&1 || echo_error "✗ zoneinfo 不可用或时区数据缺失"
  else
    echo_error "Python 虚拟环境不存在"
  fi
}
# 4. Try to import the application and build an app instance.
#
# Runs the virtualenv interpreter from inside APP_DIR, imports
# `create_app` from the `app` module and calls it, printing a traceback
# on failure.
# Globals:   APP_DIR, VENV_DIR (read)
# Outputs:   report lines on stdout (interpreter stderr is merged in)
# Returns:   non-zero if APP_DIR cannot be entered; otherwise the
#            interpreter's exit status
test_app_import() {
  echo_section "4. 测试应用导入"

  echo_info "尝试导入应用模块..."
  # Subshell: the directory change must not leak into later checks, and a
  # failed cd must not run the import from whatever directory we are in.
  (
    if ! cd "${APP_DIR}"; then
      echo_error "✗ 无法进入应用目录: ${APP_DIR}"
      exit 1
    fi
    # APP_DIR is handed over as an argument (sys.argv[1]) instead of being
    # spliced into the Python source, so quotes or backslashes in the path
    # cannot break the snippet.
    "${VENV_DIR}/bin/python" -c "
import sys
sys.path.insert(0, sys.argv[1])
try:
    from app import create_app
    print('✓ 应用模块导入成功')
    app = create_app()
    print('✓ 应用实例创建成功')
except Exception as e:
    print(f'✗ 导入失败: {e}')
    import traceback
    traceback.print_exc()
" "${APP_DIR}" 2>&1
  )
}
# Helper for check_logs: show the tail of one log file, or warn if absent.
# Arguments: $1 - log file path
#            $2 - number of trailing lines to show
#            $3 - header printed (via echo_info) before the tail
#            $4 - warning printed (via echo_warn) when the file is missing
#            $5... - optional command prefix for tail (e.g. sudo)
_show_log_tail() {
  local file=$1 count=$2 header=$3 missing=$4
  shift 4

  if [[ ! -f "${file}" ]]; then
    echo_warn "${missing}"
    return
  fi
  echo_info "${header}"
  "$@" tail -n "${count}" "${file}"
}

# 5. Show the most recent lines of every relevant log file.
#
# Covers the supervisord log, the gunicorn error/access logs under
# LOG_DIR, and the program's supervisor stderr/stdout logs. Files under
# /var/log/supervisor are read through sudo; missing files only produce a
# warning.
# Globals:   LOG_DIR, APP_NAME (read)
# Outputs:   report lines on stdout
check_logs() {
  local sup_dir="/var/log/supervisor"

  echo_section "5. 检查日志文件"

  echo_info "Supervisor 日志:"
  _show_log_tail "${sup_dir}/supervisord.log" 20 \
    "最近 20 行:" "日志文件不存在" sudo

  echo ""
  echo_info "应用错误日志:"
  _show_log_tail "${LOG_DIR}/gunicorn_error.log" 30 \
    "最近 30 行:" "应用错误日志不存在: ${LOG_DIR}/gunicorn_error.log"

  echo ""
  echo_info "应用访问日志:"
  _show_log_tail "${LOG_DIR}/gunicorn_access.log" 10 \
    "最近 10 行:" "应用访问日志不存在: ${LOG_DIR}/gunicorn_access.log"

  echo ""
  echo_info "Supervisor 应用日志:"
  _show_log_tail "${sup_dir}/${APP_NAME}-stderr.log" 30 \
    "stderr 最近 30 行:" "Supervisor stderr 日志不存在" sudo

  _show_log_tail "${sup_dir}/${APP_NAME}-stdout.log" 20 \
    "stdout 最近 20 行:" "Supervisor stdout 日志不存在" sudo
}
# 6. Check whether something is listening on the application port.
#
# Arguments: $1 - TCP port to look for (optional, default 5500)
# Outputs:   the matching listener lines plus a report line on stdout
check_ports() {
  local port="${1:-5500}"
  local listing

  echo_section "6. 检查端口占用"

  echo_info "检查 ${port} 端口:"
  # netstat comes from net-tools, which is not always installed; fall back
  # to ss, which accepts the same -tlnp flags.
  if command -v netstat > /dev/null 2>&1; then
    listing=$(sudo netstat -tlnp)
  else
    listing=$(sudo ss -tlnp)
  fi

  # Require the port number to end at whitespace or end of line so that
  # e.g. :55001 is not mistaken for :5500.
  if grep -E ":${port}([[:space:]]|\$)" <<< "${listing}"; then
    echo_info "✓ 端口 ${port} 已被占用"
  else
    echo_warn "✗ 端口 ${port} 未被占用(应用可能未启动)"
  fi
}
# 7. Show the non-sensitive part of the application's .env file.
#
# Lines that mention PASSWORD, PASSWD, SECRET, KEY or TOKEN in any
# letter case are withheld from the report.
# NOTE(review): this is a name-based filter; values that embed
# credentials under other names (e.g. a connection URL) are still shown.
# Globals:   APP_DIR (read)
# Outputs:   report lines on stdout
check_config() {
  local env_file="${APP_DIR}/.env"

  echo_section "7. 检查配置文件"

  if [[ -f "${env_file}" ]]; then
    echo_info "✓ .env 文件存在"
    echo_info "环境变量(隐藏敏感信息):"
    # -i: a case-sensitive match would let `db_password=...` through.
    # -E: portable alternation instead of the GNU-only `\|` in BRE.
    grep -viE 'PASSWORD|PASSWD|SECRET|KEY|TOKEN' -- "${env_file}" \
      || echo "无非敏感配置"
  else
    echo_warn "✗ .env 文件不存在"
  fi
}
# 8. Print remediation suggestions.
#
# Emits a fixed checklist of commands to try (tzdata install, log
# directory creation, supervisor reload, manual gunicorn start, live log
# tail), with the configured paths and program name filled in.
# Globals:   LOG_DIR, APP_DIR, VENV_DIR, APP_NAME (read)
# Outputs:   the checklist on stdout
provide_suggestions() {
  echo_section "8. 修复建议"

  echo_info "基于诊断结果,尝试以下步骤:"
  # One argument per output line; empty arguments are the blank lines
  # between the numbered groups.
  printf '%s\n' \
    "" \
    "1. 如果是 zoneinfo 问题(Python 3.9+ 时区模块):" \
    " sudo apt-get update" \
    " sudo apt-get install -y tzdata" \
    "" \
    "2. 如果日志目录不存在:" \
    " sudo mkdir -p ${LOG_DIR}" \
    " sudo chown ubuntu:ubuntu ${LOG_DIR}" \
    "" \
    "3. 重新加载 supervisor 配置:" \
    " sudo supervisorctl reread" \
    " sudo supervisorctl update" \
    "" \
    "4. 手动启动应用测试:" \
    " cd ${APP_DIR}" \
    " source ${VENV_DIR}/bin/activate" \
    " gunicorn -c gunicorn_config.py 'app:create_app()'" \
    "" \
    "5. 查看实时日志:" \
    " sudo tail -f /var/log/supervisor/${APP_NAME}-stderr.log"
}
# Entry point: print the banner, run every diagnostic step in order, then
# print the completion message.
# Arguments: none are used ("$@" is accepted and ignored)
main() {
  local banner='=========================================='
  local step

  printf '%s\n' "${banner}" " DataOps Platform 问题诊断" "${banner}" ""

  for step in \
    check_directories \
    check_supervisor \
    check_python \
    test_app_import \
    check_logs \
    check_ports \
    check_config \
    provide_suggestions; do
    "${step}"
  done

  echo ""
  echo_info "诊断完成!"
}

main "$@"
|