diagnose_issue.sh 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. #!/bin/bash
  2. #
  3. # DataOps Platform 问题诊断脚本
  4. # 用于排查启动失败的原因
  5. #
  6. # 配置变量
  7. APP_NAME="dataops-platform"
  8. APP_DIR="/opt/dataops-platform"
  9. VENV_DIR="${APP_DIR}/venv"
  10. LOG_DIR="${APP_DIR}/logs"
  11. # 颜色输出
  12. RED='\033[0;31m'
  13. GREEN='\033[0;32m'
  14. YELLOW='\033[1;33m'
  15. BLUE='\033[0;34m'
  16. NC='\033[0m' # No Color
  17. echo_info() {
  18. echo -e "${GREEN}[INFO]${NC} $1"
  19. }
  20. echo_warn() {
  21. echo -e "${YELLOW}[WARN]${NC} $1"
  22. }
  23. echo_error() {
  24. echo -e "${RED}[ERROR]${NC} $1"
  25. }
  26. echo_section() {
  27. echo -e "\n${BLUE}========================================${NC}"
  28. echo -e "${BLUE} $1${NC}"
  29. echo -e "${BLUE}========================================${NC}"
  30. }
  31. # 1. 检查目录结构
  32. check_directories() {
  33. echo_section "1. 检查目录结构"
  34. echo_info "应用目录: ${APP_DIR}"
  35. if [ -d "${APP_DIR}" ]; then
  36. echo_info "✓ 应用目录存在"
  37. ls -la "${APP_DIR}" | head -20
  38. else
  39. echo_error "✗ 应用目录不存在"
  40. fi
  41. echo ""
  42. echo_info "虚拟环境: ${VENV_DIR}"
  43. if [ -d "${VENV_DIR}" ]; then
  44. echo_info "✓ 虚拟环境存在"
  45. else
  46. echo_error "✗ 虚拟环境不存在"
  47. fi
  48. echo ""
  49. echo_info "日志目录: ${LOG_DIR}"
  50. if [ -d "${LOG_DIR}" ]; then
  51. echo_info "✓ 日志目录存在"
  52. ls -la "${LOG_DIR}"
  53. else
  54. echo_error "✗ 日志目录不存在,正在创建..."
  55. sudo mkdir -p "${LOG_DIR}"
  56. sudo chown ubuntu:ubuntu "${LOG_DIR}"
  57. fi
  58. }
  59. # 2. 检查 supervisor 配置
  60. check_supervisor() {
  61. echo_section "2. 检查 Supervisor 配置"
  62. echo_info "Supervisor 配置文件:"
  63. if [ -f "/etc/supervisor/conf.d/${APP_NAME}.conf" ]; then
  64. echo_info "✓ 配置文件存在"
  65. cat "/etc/supervisor/conf.d/${APP_NAME}.conf"
  66. else
  67. echo_error "✗ 配置文件不存在: /etc/supervisor/conf.d/${APP_NAME}.conf"
  68. fi
  69. echo ""
  70. echo_info "Supervisord 进程状态:"
  71. if pgrep -x "supervisord" > /dev/null; then
  72. echo_info "✓ supervisord 正在运行"
  73. ps aux | grep supervisord | grep -v grep
  74. else
  75. echo_error "✗ supervisord 未运行"
  76. fi
  77. echo ""
  78. echo_info "应用状态:"
  79. sudo supervisorctl status ${APP_NAME} || echo_error "无法获取应用状态"
  80. }
  81. # 3. 检查 Python 环境
  82. check_python() {
  83. echo_section "3. 检查 Python 环境"
  84. if [ -f "${VENV_DIR}/bin/python" ]; then
  85. echo_info "Python 版本:"
  86. ${VENV_DIR}/bin/python --version
  87. echo ""
  88. echo_info "检查关键依赖:"
  89. ${VENV_DIR}/bin/python -c "import flask; print(f'Flask: {flask.__version__}')" 2>&1
  90. ${VENV_DIR}/bin/python -c "import gunicorn; print(f'Gunicorn: {gunicorn.__version__}')" 2>&1
  91. echo ""
  92. echo_info "检查 zoneinfo (时区模块):"
  93. ${VENV_DIR}/bin/python -c "
  94. try:
  95. from zoneinfo import ZoneInfo
  96. print('✓ 使用标准库 zoneinfo')
  97. except ImportError:
  98. from backports.zoneinfo import ZoneInfo
  99. print('✓ 使用 backports.zoneinfo (Python 3.8)')
  100. tz = ZoneInfo('Asia/Shanghai')
  101. print(f'✓ 东八区时区加载成功: {tz}')
  102. " 2>&1 || echo_error "✗ zoneinfo 不可用或时区数据缺失"
  103. else
  104. echo_error "Python 虚拟环境不存在"
  105. fi
  106. }
  107. # 4. 测试应用导入
  108. test_app_import() {
  109. echo_section "4. 测试应用导入"
  110. echo_info "尝试导入应用模块..."
  111. cd "${APP_DIR}"
  112. ${VENV_DIR}/bin/python -c "
  113. import sys
  114. sys.path.insert(0, '${APP_DIR}')
  115. try:
  116. from app import create_app
  117. print('✓ 应用模块导入成功')
  118. app = create_app()
  119. print('✓ 应用实例创建成功')
  120. except Exception as e:
  121. print(f'✗ 导入失败: {e}')
  122. import traceback
  123. traceback.print_exc()
  124. " 2>&1
  125. }
  126. # 5. 检查日志文件
  127. check_logs() {
  128. echo_section "5. 检查日志文件"
  129. echo_info "Supervisor 日志:"
  130. if [ -f "/var/log/supervisor/supervisord.log" ]; then
  131. echo_info "最近 20 行:"
  132. sudo tail -20 /var/log/supervisor/supervisord.log
  133. else
  134. echo_warn "日志文件不存在"
  135. fi
  136. echo ""
  137. echo_info "应用错误日志:"
  138. if [ -f "${LOG_DIR}/gunicorn_error.log" ]; then
  139. echo_info "最近 30 行:"
  140. tail -30 "${LOG_DIR}/gunicorn_error.log"
  141. else
  142. echo_warn "应用错误日志不存在: ${LOG_DIR}/gunicorn_error.log"
  143. fi
  144. echo ""
  145. echo_info "应用访问日志:"
  146. if [ -f "${LOG_DIR}/gunicorn_access.log" ]; then
  147. echo_info "最近 10 行:"
  148. tail -10 "${LOG_DIR}/gunicorn_access.log"
  149. else
  150. echo_warn "应用访问日志不存在: ${LOG_DIR}/gunicorn_access.log"
  151. fi
  152. echo ""
  153. echo_info "Supervisor 应用日志:"
  154. if [ -f "/var/log/supervisor/${APP_NAME}-stderr.log" ]; then
  155. echo_info "stderr 最近 30 行:"
  156. sudo tail -30 "/var/log/supervisor/${APP_NAME}-stderr.log"
  157. else
  158. echo_warn "Supervisor stderr 日志不存在"
  159. fi
  160. if [ -f "/var/log/supervisor/${APP_NAME}-stdout.log" ]; then
  161. echo_info "stdout 最近 20 行:"
  162. sudo tail -20 "/var/log/supervisor/${APP_NAME}-stdout.log"
  163. else
  164. echo_warn "Supervisor stdout 日志不存在"
  165. fi
  166. }
  167. # 6. 检查端口占用
  168. check_ports() {
  169. echo_section "6. 检查端口占用"
  170. echo_info "检查 5500 端口:"
  171. if sudo netstat -tlnp | grep :5500; then
  172. echo_info "✓ 端口 5500 已被占用"
  173. else
  174. echo_warn "✗ 端口 5500 未被占用(应用可能未启动)"
  175. fi
  176. }
  177. # 7. 检查环境变量和配置
  178. check_config() {
  179. echo_section "7. 检查配置文件"
  180. if [ -f "${APP_DIR}/.env" ]; then
  181. echo_info "✓ .env 文件存在"
  182. echo_info "环境变量(隐藏敏感信息):"
  183. grep -v "PASSWORD\|SECRET\|KEY" "${APP_DIR}/.env" || echo "无非敏感配置"
  184. else
  185. echo_warn "✗ .env 文件不存在"
  186. fi
  187. }
  188. # 8. 提供修复建议
  189. provide_suggestions() {
  190. echo_section "8. 修复建议"
  191. echo_info "基于诊断结果,尝试以下步骤:"
  192. echo ""
  193. echo "1. 如果是 zoneinfo 问题(Python 3.9+ 时区模块):"
  194. echo " sudo apt-get update"
  195. echo " sudo apt-get install -y tzdata"
  196. echo ""
  197. echo "2. 如果日志目录不存在:"
  198. echo " sudo mkdir -p ${LOG_DIR}"
  199. echo " sudo chown ubuntu:ubuntu ${LOG_DIR}"
  200. echo ""
  201. echo "3. 重新加载 supervisor 配置:"
  202. echo " sudo supervisorctl reread"
  203. echo " sudo supervisorctl update"
  204. echo ""
  205. echo "4. 手动启动应用测试:"
  206. echo " cd ${APP_DIR}"
  207. echo " source ${VENV_DIR}/bin/activate"
  208. echo " gunicorn -c gunicorn_config.py 'app:create_app()'"
  209. echo ""
  210. echo "5. 查看实时日志:"
  211. echo " sudo tail -f /var/log/supervisor/${APP_NAME}-stderr.log"
  212. }
  213. # 主函数
  214. main() {
  215. echo "=========================================="
  216. echo " DataOps Platform 问题诊断"
  217. echo "=========================================="
  218. echo ""
  219. check_directories
  220. check_supervisor
  221. check_python
  222. test_app_import
  223. check_logs
  224. check_ports
  225. check_config
  226. provide_suggestions
  227. echo ""
  228. echo_info "诊断完成!"
  229. }
  230. main "$@"