dataops-common.sh 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. #!/usr/bin/env bash
  2. # shellcheck disable=SC2034
  3. # Shared configuration and helpers for DataOps production scripts.
  4. APP_NAME="${APP_NAME:-dataops-platform}"
  5. APP_DIR="${APP_DIR:-/opt/dataops-platform}"
  6. APP_USER="${APP_USER:-ubuntu}"
  7. APP_GROUP="${APP_GROUP:-$APP_USER}"
  8. VENV_DIR="${APP_DIR}/venv"
  9. LOG_DIR="${APP_DIR}/logs"
  10. SCRIPTS_DIR="${APP_DIR}/scripts"
  11. RUN_SCRIPT="${SCRIPTS_DIR}/run_dataops.sh"
  12. ENV_DIR="${ENV_DIR:-/etc/dataops-platform}"
  13. ENV_FILE="${ENV_FILE:-${ENV_DIR}/dataops.env}"
  14. LISTEN_HOST="${LISTEN_HOST:-0.0.0.0}"
  15. LISTEN_PORT="${LISTEN_PORT:-5500}"
  16. GUNICORN_WORKERS="${GUNICORN_WORKERS:-4}"
  17. GUNICORN_TIMEOUT="${GUNICORN_TIMEOUT:-120}"
  18. SUPERVISOR_CONF="/etc/supervisor/conf.d/${APP_NAME}.conf"
  19. SUPERVISOR_LOG="/var/log/supervisor/${APP_NAME}.log"
  20. RED='\033[0;31m'
  21. GREEN='\033[0;32m'
  22. YELLOW='\033[1;33m'
  23. BLUE='\033[0;34m'
  24. NC='\033[0m'
  25. echo_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
  26. echo_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
  27. echo_error() { echo -e "${RED}[ERROR]${NC} $1"; }
  28. echo_step() { echo -e "${BLUE}[STEP]${NC} $1"; }
  29. # Windows 编辑的脚本可能带 CRLF,bash 会报 $'\r': command not found
  30. normalize_shell_scripts() {
  31. local path
  32. for path in "$@"; do
  33. if [[ -d "${path}" ]]; then
  34. local file
  35. for file in "${path}"/*.sh; do
  36. [[ -f "${file}" ]] && sed -i 's/\r$//' "${file}"
  37. done
  38. elif [[ -f "${path}" ]]; then
  39. sed -i 's/\r$//' "${path}"
  40. fi
  41. done
  42. }
  43. # Windows 编辑的 env 文件带 CRLF/BOM 时,source 会报 line 1: #: command not found
  44. normalize_env_file() {
  45. local file="${1:-${ENV_FILE}}"
  46. [[ -f "${file}" && -w "${file}" ]] || return 0
  47. sed -i 's/\r$//' "${file}"
  48. sed -i '1s/^\xEF\xBB\xBF//' "${file}" 2>/dev/null || true
  49. }
  50. source_env_file() {
  51. local file="${1:-${ENV_FILE}}"
  52. [[ -f "${file}" ]] || return 0
  53. if [[ ! -r "${file}" ]]; then
  54. echo_error "无法读取环境变量文件: ${file} (当前用户: $(id -un))"
  55. echo_info "请执行: sudo chown root:${APP_GROUP} ${file} && sudo chmod 640 ${file}"
  56. return 1
  57. fi
  58. set -a
  59. # shellcheck disable=SC1090
  60. source <(sed 's/\r$//' "${file}" | sed '1s/^\xEF\xBB\xBF//')
  61. set +a
  62. }
  63. ensure_env_file_permissions() {
  64. [[ -f "${ENV_FILE}" ]] || return 0
  65. chown root:"${APP_GROUP}" "${ENV_FILE}" 2>/dev/null \
  66. || chown "${APP_USER}:${APP_GROUP}" "${ENV_FILE}" 2>/dev/null \
  67. || true
  68. chmod 640 "${ENV_FILE}" 2>/dev/null || chmod 600 "${ENV_FILE}" 2>/dev/null || true
  69. }
  70. require_root() {
  71. if [[ "${EUID}" -ne 0 ]]; then
  72. echo_error "请使用 sudo 运行此脚本"
  73. exit 1
  74. fi
  75. }
  76. load_env_file() {
  77. normalize_env_file
  78. source_env_file
  79. LISTEN_HOST="${LISTEN_HOST:-0.0.0.0}"
  80. LISTEN_PORT="${LISTEN_PORT:-5500}"
  81. GUNICORN_WORKERS="${GUNICORN_WORKERS:-4}"
  82. GUNICORN_TIMEOUT="${GUNICORN_TIMEOUT:-120}"
  83. }
  84. resolve_listen_port() {
  85. load_env_file
  86. echo "${LISTEN_PORT}"
  87. }
  88. ensure_env_file() {
  89. if [[ -f "${ENV_FILE}" ]]; then
  90. echo_info "环境变量文件: ${ENV_FILE}"
  91. return 0
  92. fi
  93. local script_dir="${1:-}"
  94. local candidates=()
  95. if [[ -n "${script_dir}" ]]; then
  96. candidates+=(
  97. "${script_dir}/../deployment/dataops.env"
  98. "${APP_DIR}/deployment/dataops.env"
  99. )
  100. fi
  101. candidates+=(
  102. "${APP_DIR}/deployment/dataops.env"
  103. "${APP_DIR}/dataops.env"
  104. )
  105. local candidate
  106. for candidate in "${candidates[@]}"; do
  107. if [[ -f "${candidate}" ]]; then
  108. mkdir -p "${ENV_DIR}"
  109. install -m 640 "${candidate}" "${ENV_FILE}"
  110. chown root:"${APP_GROUP}" "${ENV_FILE}" 2>/dev/null || true
  111. normalize_env_file "${ENV_FILE}"
  112. echo_info "已从 ${candidate} 安装环境变量到 ${ENV_FILE}"
  113. return 0
  114. fi
  115. done
  116. echo_error "未找到 ${ENV_FILE}"
  117. echo_info "请创建该文件,或将 deployment/dataops.env 复制到 ${ENV_FILE} 后重试"
  118. exit 1
  119. }
  120. check_env_file() {
  121. if [[ ! -f "${ENV_FILE}" ]]; then
  122. echo_error "环境变量文件不存在: ${ENV_FILE}"
  123. echo_info "请先运行: sudo ${APP_DIR}/scripts/deploy_dataops.sh"
  124. exit 1
  125. fi
  126. if [[ "${EUID}" -eq 0 ]]; then
  127. ensure_env_file_permissions
  128. elif [[ ! -r "${ENV_FILE}" ]]; then
  129. echo_error "无法读取环境变量文件: ${ENV_FILE} (当前用户: $(id -un))"
  130. echo_info "请执行: sudo chown root:${APP_GROUP} ${ENV_FILE} && sudo chmod 640 ${ENV_FILE}"
  131. exit 1
  132. fi
  133. }
  134. check_venv() {
  135. if [[ ! -d "${VENV_DIR}" ]]; then
  136. echo_error "虚拟环境不存在: ${VENV_DIR}"
  137. echo_info "请先运行: sudo ${APP_DIR}/scripts/deploy_dataops.sh"
  138. exit 1
  139. fi
  140. }
  141. check_run_script() {
  142. if [[ ! -x "${RUN_SCRIPT}" ]]; then
  143. echo_error "启动脚本不存在或不可执行: ${RUN_SCRIPT}"
  144. echo_info "请先运行: sudo ${SCRIPTS_DIR}/deploy_dataops.sh"
  145. exit 1
  146. fi
  147. }
  148. check_supervisor() {
  149. if ! command -v supervisorctl >/dev/null 2>&1; then
  150. echo_error "supervisorctl 未安装"
  151. exit 1
  152. fi
  153. if ! pgrep -x supervisord >/dev/null 2>&1; then
  154. echo_warn "supervisord 未运行,正在启动..."
  155. supervisord -c /etc/supervisor/supervisord.conf || systemctl start supervisor
  156. sleep 2
  157. fi
  158. }
  159. is_port_listening() {
  160. local app_port="$1"
  161. if command -v ss >/dev/null 2>&1; then
  162. ss -ltn | grep -q ":${app_port} "
  163. return $?
  164. fi
  165. if command -v netstat >/dev/null 2>&1; then
  166. netstat -ltn | grep -q ":${app_port} "
  167. return $?
  168. fi
  169. return 1
  170. }
  171. diagnose_service() {
  172. local app_port
  173. app_port="$(resolve_listen_port)"
  174. echo_warn "========== 诊断信息 =========="
  175. echo_info "Supervisor 状态:"
  176. supervisorctl status "${APP_NAME}" || true
  177. echo_info "Supervisor 配置 command:"
  178. grep -E '^command=' "${SUPERVISOR_CONF}" 2>/dev/null || true
  179. echo_info "端口 ${app_port} 监听情况:"
  180. if command -v ss >/dev/null 2>&1; then
  181. ss -ltnp | grep -E ":${app_port}\\b" || echo " (未监听)"
  182. elif command -v netstat >/dev/null 2>&1; then
  183. netstat -ltnp 2>/dev/null | grep -E ":${app_port}\\b" || echo " (未监听)"
  184. fi
  185. echo_info "Gunicorn 进程:"
  186. pgrep -af gunicorn || echo " (无 gunicorn 进程)"
  187. echo_info "最近 Supervisor 日志 (${SUPERVISOR_LOG}):"
  188. tail -n 30 "${SUPERVISOR_LOG}" 2>/dev/null || echo " (无法读取)"
  189. echo_info "最近 Gunicorn 错误日志 (${LOG_DIR}/gunicorn_error.log):"
  190. tail -n 30 "${LOG_DIR}/gunicorn_error.log" 2>/dev/null || echo " (无法读取)"
  191. echo_warn "=============================="
  192. }
  193. health_check() {
  194. local app_port
  195. app_port="$(resolve_listen_port)"
  196. echo_info "健康检查: http://127.0.0.1:${app_port}/api/system/health"
  197. local max_retries=8
  198. local retry_interval=3
  199. local retry_count=0
  200. local response="000"
  201. while [[ ${retry_count} -lt ${max_retries} ]]; do
  202. sleep "${retry_interval}"
  203. retry_count=$((retry_count + 1))
  204. if ! is_port_listening "${app_port}"; then
  205. echo_info "尝试 ${retry_count}/${max_retries}: 端口 ${app_port} 尚未监听,等待..."
  206. continue
  207. fi
  208. response="$(curl -s -o /dev/null -w '%{http_code}' \
  209. "http://127.0.0.1:${app_port}/api/system/health" 2>/dev/null)" || response="000"
  210. if [[ "${response}" == "200" ]]; then
  211. echo_info "健康检查通过 (HTTP ${response})"
  212. return 0
  213. fi
  214. echo_info "尝试 ${retry_count}/${max_retries}: HTTP ${response},等待重试..."
  215. done
  216. echo_warn "健康检查失败,最近 HTTP 状态码: ${response}"
  217. diagnose_service
  218. return 1
  219. }
  220. ensure_wsgi() {
  221. if [[ -f "${APP_DIR}/wsgi.py" ]]; then
  222. :
  223. else
  224. cat >"${APP_DIR}/wsgi.py" <<'EOF'
  225. """WSGI entry point for production deployment."""
  226. from app import create_app
  227. application = create_app()
  228. app = application
  229. EOF
  230. chown "${APP_USER}:${APP_GROUP}" "${APP_DIR}/wsgi.py"
  231. echo_info "已创建 ${APP_DIR}/wsgi.py"
  232. fi
  233. ensure_gunicorn_config "${1:-}"
  234. }
  235. ensure_gunicorn_config() {
  236. if [[ -f "${APP_DIR}/gunicorn_config.py" ]]; then
  237. return 0
  238. fi
  239. local script_dir="${1:-}"
  240. local candidates=()
  241. if [[ -n "${script_dir}" ]]; then
  242. candidates+=(
  243. "${script_dir}/../gunicorn_config.py"
  244. "${script_dir}/gunicorn_config.py"
  245. )
  246. fi
  247. candidates+=(
  248. "${APP_DIR}/gunicorn_config.py"
  249. "${APP_DIR}/deployment/gunicorn_config.py"
  250. )
  251. local candidate
  252. for candidate in "${candidates[@]}"; do
  253. if [[ -f "${candidate}" ]]; then
  254. install -m 644 "${candidate}" "${APP_DIR}/gunicorn_config.py"
  255. chown "${APP_USER}:${APP_GROUP}" "${APP_DIR}/gunicorn_config.py"
  256. echo_info "已安装 ${APP_DIR}/gunicorn_config.py"
  257. return 0
  258. fi
  259. done
  260. echo_error "缺少 ${APP_DIR}/gunicorn_config.py"
  261. echo_info "请将 gunicorn_config.py 放到应用目录后重试"
  262. exit 1
  263. }
  264. remove_legacy_run_script() {
  265. if [[ -f "${APP_DIR}/run_dataops.sh" ]]; then
  266. rm -f "${APP_DIR}/run_dataops.sh"
  267. echo_warn "已移除旧版 ${APP_DIR}/run_dataops.sh,统一使用 ${RUN_SCRIPT}"
  268. fi
  269. }
  270. ensure_run_script() {
  271. if [[ ! -f "${RUN_SCRIPT}" ]]; then
  272. echo_error "启动脚本不存在: ${RUN_SCRIPT}"
  273. echo_info "请先运行: sudo ${SCRIPTS_DIR}/deploy_dataops.sh"
  274. exit 1
  275. fi
  276. chmod 755 "${RUN_SCRIPT}"
  277. chown "${APP_USER}:${APP_GROUP}" "${RUN_SCRIPT}" 2>/dev/null || true
  278. normalize_shell_scripts "${RUN_SCRIPT}"
  279. remove_legacy_run_script
  280. }
  281. remove_legacy_gunicorn_config() {
  282. if [[ -f "${APP_DIR}/gunicorn.conf.py" ]]; then
  283. mv "${APP_DIR}/gunicorn.conf.py" "${APP_DIR}/gunicorn.conf.py.bak.$(date +%Y%m%d%H%M%S)"
  284. echo_warn "已备份旧 gunicorn.conf.py(曾可能导致错误端口 80 绑定)"
  285. fi
  286. }
  287. configure_supervisor() {
  288. remove_legacy_gunicorn_config
  289. ensure_run_script
  290. mkdir -p "${LOG_DIR}" /var/log/supervisor
  291. chown -R "${APP_USER}:${APP_GROUP}" "${LOG_DIR}"
  292. cat >"${SUPERVISOR_CONF}" <<EOF
  293. [program:${APP_NAME}]
  294. command=/usr/bin/env bash ${RUN_SCRIPT}
  295. directory=${APP_DIR}
  296. user=${APP_USER}
  297. autostart=true
  298. autorestart=true
  299. startsecs=8
  300. startretries=5
  301. stopasgroup=true
  302. killasgroup=true
  303. redirect_stderr=true
  304. stdout_logfile=${SUPERVISOR_LOG}
  305. stdout_logfile_maxbytes=50MB
  306. stdout_logfile_backups=5
  307. environment=FLASK_ENV="production",APP_ENV_FILE="${ENV_FILE}",APP_DIR="${APP_DIR}",PATH="${VENV_DIR}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
  308. EOF
  309. supervisorctl reread
  310. supervisorctl update
  311. echo_info "Supervisor 配置已更新: ${SUPERVISOR_CONF}"
  312. }