#!/usr/bin/env bash # shellcheck disable=SC2034 # Shared configuration and helpers for DataOps production scripts. APP_NAME="${APP_NAME:-dataops-platform}" APP_DIR="${APP_DIR:-/opt/dataops-platform}" APP_USER="${APP_USER:-ubuntu}" APP_GROUP="${APP_GROUP:-$APP_USER}" VENV_DIR="${APP_DIR}/venv" LOG_DIR="${APP_DIR}/logs" SCRIPTS_DIR="${APP_DIR}/scripts" RUN_SCRIPT="${SCRIPTS_DIR}/run_dataops.sh" ENV_DIR="${ENV_DIR:-/etc/dataops-platform}" ENV_FILE="${ENV_FILE:-${ENV_DIR}/dataops.env}" LISTEN_HOST="${LISTEN_HOST:-0.0.0.0}" LISTEN_PORT="${LISTEN_PORT:-5500}" GUNICORN_WORKERS="${GUNICORN_WORKERS:-4}" GUNICORN_TIMEOUT="${GUNICORN_TIMEOUT:-120}" SUPERVISOR_CONF="/etc/supervisor/conf.d/${APP_NAME}.conf" SUPERVISOR_LOG="/var/log/supervisor/${APP_NAME}.log" RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' NC='\033[0m' echo_info() { echo -e "${GREEN}[INFO]${NC} $1"; } echo_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } echo_error() { echo -e "${RED}[ERROR]${NC} $1"; } echo_step() { echo -e "${BLUE}[STEP]${NC} $1"; } # Windows 编辑的脚本可能带 CRLF,bash 会报 $'\r': command not found normalize_shell_scripts() { local path for path in "$@"; do if [[ -d "${path}" ]]; then local file for file in "${path}"/*.sh; do [[ -f "${file}" ]] && sed -i 's/\r$//' "${file}" done elif [[ -f "${path}" ]]; then sed -i 's/\r$//' "${path}" fi done } # Windows 编辑的 env 文件带 CRLF/BOM 时,source 会报 line 1: #: command not found normalize_env_file() { local file="${1:-${ENV_FILE}}" [[ -f "${file}" && -w "${file}" ]] || return 0 sed -i 's/\r$//' "${file}" sed -i '1s/^\xEF\xBB\xBF//' "${file}" 2>/dev/null || true } source_env_file() { local file="${1:-${ENV_FILE}}" [[ -f "${file}" ]] || return 0 if [[ ! -r "${file}" ]]; then echo_error "无法读取环境变量文件: ${file} (当前用户: $(id -un))" echo_info "请执行: sudo chown root:${APP_GROUP} ${file} && sudo chmod 640 ${file}" return 1 fi set -a # shellcheck disable=SC1090 source <(sed 's/\r$//' "${file}" | sed '1s/^\xEF\xBB\xBF//') set +a } ensure_env_file_permissions() { [[ -f "${ENV_FILE}" ]] || return 0 chown root:"${APP_GROUP}" "${ENV_FILE}" 2>/dev/null \ || chown "${APP_USER}:${APP_GROUP}" "${ENV_FILE}" 2>/dev/null \ || true chmod 640 "${ENV_FILE}" 2>/dev/null || chmod 600 "${ENV_FILE}" 2>/dev/null || true } require_root() { if [[ "${EUID}" -ne 0 ]]; then echo_error "请使用 sudo 运行此脚本" exit 1 fi } load_env_file() { normalize_env_file source_env_file LISTEN_HOST="${LISTEN_HOST:-0.0.0.0}" LISTEN_PORT="${LISTEN_PORT:-5500}" GUNICORN_WORKERS="${GUNICORN_WORKERS:-4}" GUNICORN_TIMEOUT="${GUNICORN_TIMEOUT:-120}" } resolve_listen_port() { load_env_file echo "${LISTEN_PORT}" } ensure_env_file() { if [[ -f "${ENV_FILE}" ]]; then echo_info "环境变量文件: ${ENV_FILE}" return 0 fi local script_dir="${1:-}" local candidates=() if [[ -n "${script_dir}" ]]; then candidates+=( "${script_dir}/../deployment/dataops.env" "${APP_DIR}/deployment/dataops.env" ) fi candidates+=( "${APP_DIR}/deployment/dataops.env" "${APP_DIR}/dataops.env" ) local candidate for candidate in "${candidates[@]}"; do if [[ -f "${candidate}" ]]; then mkdir -p "${ENV_DIR}" install -m 640 "${candidate}" "${ENV_FILE}" chown root:"${APP_GROUP}" "${ENV_FILE}" 2>/dev/null || true normalize_env_file "${ENV_FILE}" echo_info "已从 ${candidate} 安装环境变量到 ${ENV_FILE}" return 0 fi done echo_error "未找到 ${ENV_FILE}" echo_info "请创建该文件,或将 deployment/dataops.env 复制到 ${ENV_FILE} 后重试" exit 1 } check_env_file() { if [[ ! -f "${ENV_FILE}" ]]; then echo_error "环境变量文件不存在: ${ENV_FILE}" echo_info "请先运行: sudo ${APP_DIR}/scripts/deploy_dataops.sh" exit 1 fi if [[ "${EUID}" -eq 0 ]]; then ensure_env_file_permissions elif [[ ! -r "${ENV_FILE}" ]]; then echo_error "无法读取环境变量文件: ${ENV_FILE} (当前用户: $(id -un))" echo_info "请执行: sudo chown root:${APP_GROUP} ${ENV_FILE} && sudo chmod 640 ${ENV_FILE}" exit 1 fi } check_venv() { if [[ ! -d "${VENV_DIR}" ]]; then echo_error "虚拟环境不存在: ${VENV_DIR}" echo_info "请先运行: sudo ${APP_DIR}/scripts/deploy_dataops.sh" exit 1 fi } check_run_script() { if [[ ! -x "${RUN_SCRIPT}" ]]; then echo_error "启动脚本不存在或不可执行: ${RUN_SCRIPT}" echo_info "请先运行: sudo ${SCRIPTS_DIR}/deploy_dataops.sh" exit 1 fi } check_supervisor() { if ! command -v supervisorctl >/dev/null 2>&1; then echo_error "supervisorctl 未安装" exit 1 fi if ! pgrep -x supervisord >/dev/null 2>&1; then echo_warn "supervisord 未运行,正在启动..." supervisord -c /etc/supervisor/supervisord.conf || systemctl start supervisor sleep 2 fi } is_port_listening() { local app_port="$1" if command -v ss >/dev/null 2>&1; then ss -ltn | grep -q ":${app_port} " return $? fi if command -v netstat >/dev/null 2>&1; then netstat -ltn | grep -q ":${app_port} " return $? fi return 1 } diagnose_service() { local app_port app_port="$(resolve_listen_port)" echo_warn "========== 诊断信息 ==========" echo_info "Supervisor 状态:" supervisorctl status "${APP_NAME}" || true echo_info "Supervisor 配置 command:" grep -E '^command=' "${SUPERVISOR_CONF}" 2>/dev/null || true echo_info "端口 ${app_port} 监听情况:" if command -v ss >/dev/null 2>&1; then ss -ltnp | grep -E ":${app_port}\\b" || echo " (未监听)" elif command -v netstat >/dev/null 2>&1; then netstat -ltnp 2>/dev/null | grep -E ":${app_port}\\b" || echo " (未监听)" fi echo_info "Gunicorn 进程:" pgrep -af gunicorn || echo " (无 gunicorn 进程)" echo_info "最近 Supervisor 日志 (${SUPERVISOR_LOG}):" tail -n 30 "${SUPERVISOR_LOG}" 2>/dev/null || echo " (无法读取)" echo_info "最近 Gunicorn 错误日志 (${LOG_DIR}/gunicorn_error.log):" tail -n 30 "${LOG_DIR}/gunicorn_error.log" 2>/dev/null || echo " (无法读取)" echo_warn "==============================" } health_check() { local app_port app_port="$(resolve_listen_port)" echo_info "健康检查: http://127.0.0.1:${app_port}/api/system/health" local max_retries=8 local retry_interval=3 local retry_count=0 local response="000" while [[ ${retry_count} -lt ${max_retries} ]]; do sleep "${retry_interval}" retry_count=$((retry_count + 1)) if ! is_port_listening "${app_port}"; then echo_info "尝试 ${retry_count}/${max_retries}: 端口 ${app_port} 尚未监听,等待..." continue fi response="$(curl -s -o /dev/null -w '%{http_code}' \ "http://127.0.0.1:${app_port}/api/system/health" 2>/dev/null)" || response="000" if [[ "${response}" == "200" ]]; then echo_info "健康检查通过 (HTTP ${response})" return 0 fi echo_info "尝试 ${retry_count}/${max_retries}: HTTP ${response},等待重试..." done echo_warn "健康检查失败,最近 HTTP 状态码: ${response}" diagnose_service return 1 } ensure_wsgi() { if [[ -f "${APP_DIR}/wsgi.py" ]]; then : else cat >"${APP_DIR}/wsgi.py" <<'EOF' """WSGI entry point for production deployment.""" from app import create_app application = create_app() app = application EOF chown "${APP_USER}:${APP_GROUP}" "${APP_DIR}/wsgi.py" echo_info "已创建 ${APP_DIR}/wsgi.py" fi ensure_gunicorn_config "${1:-}" } ensure_gunicorn_config() { if [[ -f "${APP_DIR}/gunicorn_config.py" ]]; then return 0 fi local script_dir="${1:-}" local candidates=() if [[ -n "${script_dir}" ]]; then candidates+=( "${script_dir}/../gunicorn_config.py" "${script_dir}/gunicorn_config.py" ) fi candidates+=( "${APP_DIR}/gunicorn_config.py" "${APP_DIR}/deployment/gunicorn_config.py" ) local candidate for candidate in "${candidates[@]}"; do if [[ -f "${candidate}" ]]; then install -m 644 "${candidate}" "${APP_DIR}/gunicorn_config.py" chown "${APP_USER}:${APP_GROUP}" "${APP_DIR}/gunicorn_config.py" echo_info "已安装 ${APP_DIR}/gunicorn_config.py" return 0 fi done echo_error "缺少 ${APP_DIR}/gunicorn_config.py" echo_info "请将 gunicorn_config.py 放到应用目录后重试" exit 1 } remove_legacy_run_script() { if [[ -f "${APP_DIR}/run_dataops.sh" ]]; then rm -f "${APP_DIR}/run_dataops.sh" echo_warn "已移除旧版 ${APP_DIR}/run_dataops.sh,统一使用 ${RUN_SCRIPT}" fi } ensure_run_script() { if [[ ! -f "${RUN_SCRIPT}" ]]; then echo_error "启动脚本不存在: ${RUN_SCRIPT}" echo_info "请先运行: sudo ${SCRIPTS_DIR}/deploy_dataops.sh" exit 1 fi chmod 755 "${RUN_SCRIPT}" chown "${APP_USER}:${APP_GROUP}" "${RUN_SCRIPT}" 2>/dev/null || true normalize_shell_scripts "${RUN_SCRIPT}" remove_legacy_run_script } remove_legacy_gunicorn_config() { if [[ -f "${APP_DIR}/gunicorn.conf.py" ]]; then mv "${APP_DIR}/gunicorn.conf.py" "${APP_DIR}/gunicorn.conf.py.bak.$(date +%Y%m%d%H%M%S)" echo_warn "已备份旧 gunicorn.conf.py(曾可能导致错误端口 80 绑定)" fi } configure_supervisor() { remove_legacy_gunicorn_config ensure_run_script mkdir -p "${LOG_DIR}" /var/log/supervisor chown -R "${APP_USER}:${APP_GROUP}" "${LOG_DIR}" cat >"${SUPERVISOR_CONF}" <