#!/bin/bash
#
# TSP assistant monitoring script.
#
# Usage: monitor.sh {monitor|check|restart}
#   monitor - continuous monitoring loop (default when no argument is given)
#   check   - one-shot health check
#   restart - restart the service and verify that it came back
#
# External tools used: systemctl, curl, top, free, df, du, awk, tee;
# optionally mail (e-mail alerts) and sqlite3 (database probe).

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
APP_NAME="tsp_assistant" # currently unused; kept for compatibility
SERVICE_NAME="tsp_assistant"
HEALTH_URL="http://localhost:5000/api/health"
LOG_FILE="./logs/monitor.log"
ALERT_EMAIL="admin@example.com"
ALERT_PHONE="13800138000" # only referenced by the disabled SMS alert below

# Upper bound (seconds) for every curl probe, so that a hung endpoint cannot
# stall the monitor loop. May be overridden through the environment.
CURL_MAX_TIME="${CURL_MAX_TIME:-10}"

# ANSI colour sequences (expanded by printf '%b' in _log).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

# Write one timestamped, coloured line to stdout and append it to $LOG_FILE.
# Arguments: $1 - colour sequence, $2 - level label, $3 - message
_log() {
  local color=$1
  local level=$2
  local message=$3
  printf '%b\n' "${color}[$(date '+%Y-%m-%d %H:%M:%S')] ${level}${NC} ${message}" \
    | tee -a "$LOG_FILE"
}

log_info() {
  _log "$GREEN" "INFO" "$1"
}

log_warn() {
  _log "$YELLOW" "WARN" "$1"
}

log_error() {
  _log "$RED" "ERROR" "$1"
}

# ---------------------------------------------------------------------------
# Numeric helpers
# ---------------------------------------------------------------------------

# Succeeds when $1 is a plain non-negative decimal number (e.g. 7 or 12.5).
_is_number() {
  [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]]
}

# Succeeds when $1 is a number strictly greater than the threshold $2.
# Empty or non-numeric values never exceed the threshold.
_exceeds() {
  _is_number "$1" || return 1
  awk -v v="$1" -v t="$2" 'BEGIN { exit !((v + 0) > (t + 0)) }'
}

# ---------------------------------------------------------------------------
# Alerting
# ---------------------------------------------------------------------------

# Log an alert and, when the `mail` command is available, e-mail it.
# Arguments: $1 - alert message, $2 - severity label used in the subject
send_alert() {
  local message=$1
  local level=$2

  log_error "告警: $message"

  # E-mail alert (silently skipped when `mail` is not installed).
  if command -v mail &> /dev/null; then
    printf '%s\n' "$message" | mail -s "[$level] TSP助手告警" "$ALERT_EMAIL"
  fi

  # SMS alert (requires an SMS gateway to be configured):
  # curl -X POST "https://api.sms.com/send" \
  #   -d "phone=$ALERT_PHONE" \
  #   -d "message=$message"
}

# ---------------------------------------------------------------------------
# Individual checks
# ---------------------------------------------------------------------------

# Returns 0 when the systemd unit $SERVICE_NAME is active, 1 otherwise.
check_service_status() {
  if systemctl is-active --quiet "$SERVICE_NAME"; then
    return 0
  fi
  return 1
}

# Returns 0 when $HEALTH_URL answers with HTTP 200 within $CURL_MAX_TIME
# seconds, 1 otherwise.
check_health() {
  local response_code
  response_code=$(curl -s -o /dev/null --max-time "$CURL_MAX_TIME" \
    -w "%{http_code}" "$HEALTH_URL" 2> /dev/null)

  if [[ "$response_code" == "200" ]]; then
    return 0
  fi
  return 1
}

# Returns 0 when a request to $HEALTH_URL completes in at most 5 seconds.
# Returns 1 when it takes longer or when the time could not be measured.
check_response_time() {
  local response_time
  # LC_ALL=C keeps the decimal separator a '.' regardless of system locale.
  response_time=$(LC_ALL=C curl -s -o /dev/null --max-time "$CURL_MAX_TIME" \
    -w "%{time_total}" "$HEALTH_URL" 2> /dev/null)

  # An unparseable measurement cannot confirm the service is responsive.
  if ! _is_number "$response_time"; then
    return 1
  fi

  # More than 5 seconds is considered abnormal.
  if _exceeds "$response_time" 5.0; then
    return 1
  fi
  return 0
}

# Sample CPU, memory and root-filesystem usage, raise a HIGH alert for every
# metric above 80 %, and log a one-line summary. A metric that could not be
# read is left empty in the summary and does not trigger an alert.
check_system_resources() {
  local cpu_usage
  local memory_usage
  local disk_usage

  # User-space CPU percentage from top's summary line. The number is located
  # by the "us" label instead of by field position, so both "1.6 us" and
  # "1.6%us" layouts work, as does a value that abuts the "Cpu(s):" prefix.
  cpu_usage=$(LC_ALL=C top -bn1 | awk '
    /Cpu\(s\)/ {
      if (match($0, /[0-9.]+[ %]*us/)) {
        value = substr($0, RSTART, RLENGTH)
        sub(/[ %]*us$/, "", value)
        print value
      }
      exit
    }')

  # Used / total memory as a percentage. LC_ALL=C keeps the "Mem:" row label
  # untranslated so the pattern matches on localized systems.
  memory_usage=$(LC_ALL=C free \
    | awk '/^Mem/ && $2 > 0 { printf "%.2f", $3 / $2 * 100.0 }')

  # Use% of the root filesystem; -P guarantees one output line per filesystem.
  disk_usage=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

  # Threshold checks.
  if _exceeds "$cpu_usage" 80; then
    send_alert "CPU使用率过高: ${cpu_usage}%" "HIGH"
  fi

  if _exceeds "$memory_usage" 80; then
    send_alert "内存使用率过高: ${memory_usage}%" "HIGH"
  fi

  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && ((disk_usage > 80)); then
    send_alert "磁盘使用率过高: ${disk_usage}%" "HIGH"
  fi

  log_info "系统资源 - CPU: ${cpu_usage}%, 内存: ${memory_usage}%, 磁盘: ${disk_usage}%"
}

# Raise a MEDIUM alert when more than 10 of the last 100 lines of the
# application log contain "ERROR". Does nothing when the log file is absent.
# NOTE(review): the alert text speaks of "the last 5 minutes", but the window
# is really the last 100 lines; no timestamp filtering is performed.
check_log_errors() {
  local log_file="./logs/tsp_assistant.log"
  local error_count

  if [[ ! -f "$log_file" ]]; then
    return 0
  fi

  # grep -c prints "0" (and exits 1) when nothing matches, so its output is
  # used as-is; appending a fallback "0" would produce a two-line value.
  error_count=$(tail -n 100 "$log_file" | grep -c "ERROR" 2> /dev/null)
  error_count=${error_count:-0}

  if [[ "$error_count" =~ ^[0-9]+$ ]] && ((error_count > 10)); then
    send_alert "最近5分钟错误日志过多: $error_count 条" "MEDIUM"
  fi
}

# Log the size of the SQLite database file and verify that it can be queried.
# Returns 1 (after a CRITICAL alert) when the query fails; returns 0 when the
# query succeeds, the file does not exist, or sqlite3 is not installed.
check_database() {
  local db_file="./tsp_assistant.db"
  local db_size

  if [[ ! -f "$db_file" ]]; then
    return 0
  fi

  # Report the database file size.
  db_size=$(du -h "$db_file" | cut -f1)
  log_info "数据库大小: $db_size"

  # A missing client tool is not a database failure; skip the probe instead
  # of raising a false CRITICAL alert on every cycle.
  if ! command -v sqlite3 &> /dev/null; then
    log_warn "未安装 sqlite3,跳过数据库可读性检查"
    return 0
  fi

  # Verify the database is readable.
  if ! sqlite3 "$db_file" "SELECT 1;" > /dev/null 2>&1; then
    send_alert "数据库连接失败" "CRITICAL"
    return 1
  fi

  return 0
}

# ---------------------------------------------------------------------------
# Recovery
# ---------------------------------------------------------------------------

# Restart the service through systemctl, wait 10 seconds, then confirm that
# both the unit and the health endpoint are up.
# Returns 0 on a verified restart, 1 otherwise.
restart_service() {
  log_warn "尝试重启服务..."

  sudo systemctl restart "$SERVICE_NAME"
  sleep 10

  if check_service_status && check_health; then
    log_info "服务重启成功"
    return 0
  fi

  log_error "服务重启失败"
  return 1
}

# ---------------------------------------------------------------------------
# Modes
# ---------------------------------------------------------------------------

# Continuous monitoring: every 60 seconds run all checks. After 3 consecutive
# service/health failures, attempt a restart. Never returns.
monitor_loop() {
  local consecutive_failures=0
  local max_failures=3

  while true; do
    log_info "开始监控检查..."

    if ! check_service_status; then
      # The unit is not active.
      log_error "服务未运行"
      send_alert "TSP助手服务未运行" "CRITICAL"
      consecutive_failures=$((consecutive_failures + 1))
    elif ! check_health; then
      # The unit is active but the health endpoint is failing.
      log_error "健康检查失败"
      send_alert "TSP助手健康检查失败" "HIGH"
      consecutive_failures=$((consecutive_failures + 1))
    else
      # Healthy; a slow response is reported but does not count as a failure.
      if ! check_response_time; then
        log_warn "响应时间过长"
        send_alert "TSP助手响应时间过长" "MEDIUM"
      fi
      consecutive_failures=0
    fi

    check_system_resources
    check_log_errors
    check_database

    # Handle repeated failures.
    if [[ "$consecutive_failures" -ge "$max_failures" ]]; then
      log_error "连续失败次数达到阈值,尝试重启服务"
      if restart_service; then
        consecutive_failures=0
      else
        send_alert "TSP助手服务重启失败,需要人工干预" "CRITICAL"
      fi
    fi

    # Wait for the next cycle.
    sleep 60
  done
}

# One-shot check: exits the script with status 1 when the service is down or
# the health check fails; otherwise runs the remaining checks and returns.
single_check() {
  log_info "执行一次性健康检查..."

  if check_service_status; then
    log_info "✓ 服务运行正常"
  else
    log_error "✗ 服务未运行"
    exit 1
  fi

  if check_health; then
    log_info "✓ 健康检查通过"
  else
    log_error "✗ 健康检查失败"
    exit 1
  fi

  if check_response_time; then
    log_info "✓ 响应时间正常"
  else
    log_warn "⚠ 响应时间过长"
  fi

  check_system_resources
  check_log_errors
  check_database

  log_info "健康检查完成"
}

# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

# Dispatch on the first argument (default: monitor).
main() {
  # Make sure the directory that holds $LOG_FILE exists before logging.
  mkdir -p "$(dirname "$LOG_FILE")"

  case "${1:-monitor}" in
    monitor)
      log_info "启动TSP助手监控服务..."
      monitor_loop
      ;;
    check)
      single_check
      ;;
    restart)
      restart_service
      ;;
    *)
      echo "用法: $0 {monitor|check|restart}"
      echo " monitor - 持续监控模式"
      echo " check - 一次性健康检查"
      echo " restart - 重启服务"
      exit 1
      ;;
  esac
}

main "$@"