#!/bin/bash
# scripts/monitor.sh
# TSP智能助手监控脚本

# Configuration.
# Each setting can be overridden from the environment through the TSP_*
# variable named on its line; the value after ":-" is the default that is
# used when that variable is unset or empty.
# Example: TSP_HEALTH_URL=http://10.0.0.5:8080/api/health ./monitor.sh check
APP_NAME="${TSP_APP_NAME:-tsp_assistant}"
# Unit name passed to systemctl.
SERVICE_NAME="${TSP_SERVICE_NAME:-tsp_assistant}"
# URL probed by the HTTP health and response-time checks.
HEALTH_URL="${TSP_HEALTH_URL:-http://localhost:5000/api/health}"
# File the log_* helpers append to. When overriding it, make sure the
# directory it points into exists.
LOG_FILE="${TSP_MONITOR_LOG_FILE:-./logs/monitor.log}"
# Recipient of e-mail alerts.
ALERT_EMAIL="${TSP_ALERT_EMAIL:-admin@example.com}"
# Only referenced by the commented-out SMS hook in send_alert.
ALERT_PHONE="${TSP_ALERT_PHONE:-13800138000}"

# ANSI colour sequences. They are stored as literal backslash escapes and are
# only turned into real escape characters by whatever prints them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Logging helpers.

# Log an informational message.
# Globals:   GREEN, NC (read); LOG_FILE (appended to)
# Arguments: $1 - message text, emitted verbatim
# Outputs:   a coloured line on stdout and the same line, without colour
#            codes, appended to LOG_FILE
log_info() {
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # %b expands the backslash escapes stored in the colour variables; the
    # message itself goes through %s so backslashes in it are left alone.
    printf '%b[%s] INFO%b %s\n' "$GREEN" "$timestamp" "$NC" "$1"

    # The log file gets plain text: terminal colour codes make it hard to
    # grep and to read in an editor.
    printf '[%s] INFO %s\n' "$timestamp" "$1" >> "$LOG_FILE"
}

# Log a warning.
# Globals:   YELLOW, NC (read); LOG_FILE (appended to)
# Arguments: $1 - message text, emitted verbatim
# Outputs:   a coloured line on stdout and the same line, without colour
#            codes, appended to LOG_FILE
log_warn() {
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # %b expands the backslash escapes stored in the colour variables; the
    # message itself goes through %s so backslashes in it are left alone.
    printf '%b[%s] WARN%b %s\n' "$YELLOW" "$timestamp" "$NC" "$1"

    # The log file gets plain text: terminal colour codes make it hard to
    # grep and to read in an editor.
    printf '[%s] WARN %s\n' "$timestamp" "$1" >> "$LOG_FILE"
}

# Log an error.
# Globals:   RED, NC (read); LOG_FILE (appended to)
# Arguments: $1 - message text, emitted verbatim
# Outputs:   a coloured line on stdout and the same line, without colour
#            codes, appended to LOG_FILE
log_error() {
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # %b expands the backslash escapes stored in the colour variables; the
    # message itself goes through %s so backslashes in it are left alone.
    printf '%b[%s] ERROR%b %s\n' "$RED" "$timestamp" "$NC" "$1"

    # The log file gets plain text: terminal colour codes make it hard to
    # grep and to read in an editor.
    printf '[%s] ERROR %s\n' "$timestamp" "$1" >> "$LOG_FILE"
}

# Raise an alert: always log it, and e-mail it when a mailer is installed.
# Globals:   ALERT_EMAIL (read)
# Arguments: $1 - alert text
#            $2 - severity label placed in the mail subject
# Returns:   0 when no `mail` command exists, otherwise the status of the
#            mail pipeline
send_alert() {
    local message=$1 level=$2

    log_error "告警: $message"

    # SMS alerting is not wired up; it needs an SMS gateway to be configured:
    # curl -X POST "https://api.sms.com/send" \
    #   -d "phone=$ALERT_PHONE" \
    #   -d "message=$message"

    # E-mail is best effort: without a `mail` binary there is nothing to do.
    if ! command -v mail &> /dev/null; then
        return 0
    fi

    echo "$message" | mail -s "[$level] TSP助手告警" "$ALERT_EMAIL"
}

# Report whether the systemd unit is active.
# Globals: SERVICE_NAME (read)
# Returns: 0 when `systemctl is-active` succeeds, 1 in every other case
check_service_status() {
    # systemctl's own non-zero codes are collapsed to a plain 1.
    systemctl is-active --quiet "$SERVICE_NAME" && return 0
    return 1
}

# Probe the HTTP health endpoint.
# Globals: HEALTH_URL (read)
#          HEALTH_CHECK_TIMEOUT (read, optional) - upper bound in seconds for
#          the whole request; defaults to 10
# Returns: 0 when the endpoint answers with HTTP 200, 1 otherwise (including
#          connection failures and timeouts)
check_health() {
    local response_code
    local -r timeout="${HEALTH_CHECK_TIMEOUT:-10}"

    # curl has no overall time limit by default. Without --max-time a service
    # that accepts the connection but never answers would block this call,
    # and with it the whole monitor, so no alert would ever be raised.
    response_code=$(curl -s -o /dev/null --max-time "$timeout" \
        -w "%{http_code}" "$HEALTH_URL" 2>/dev/null)

    if [ "$response_code" = "200" ]; then
        return 0
    else
        return 1
    fi
}

# Measure how long the health endpoint takes to answer.
# Globals: HEALTH_URL (read)
#          HEALTH_CHECK_TIMEOUT (read, optional) - upper bound in seconds for
#          the whole request; defaults to 10
# Returns: 0 when the request completed in at most 5 seconds,
#          1 when it took longer, failed, or produced no usable timing
check_response_time() {
    local response_time
    local -r threshold=5.0
    local -r timeout="${HEALTH_CHECK_TIMEOUT:-10}"

    # --max-time keeps a hanging endpoint from blocking the monitor. A request
    # that fails or times out counts as "too slow" rather than as fast.
    if ! response_time=$(curl -s -o /dev/null --max-time "$timeout" \
        -w "%{time_total}" "$HEALTH_URL" 2>/dev/null); then
        return 1
    fi

    # Only a plain decimal number may reach the comparison; empty or garbled
    # output would otherwise be silently treated as zero.
    if [[ ! "$response_time" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        return 1
    fi

    # awk does the floating-point comparison, so bc is not required.
    if LC_ALL=C awk -v t="$response_time" -v max="$threshold" \
        'BEGIN { exit !(t > max) }'; then
        return 1
    fi
    return 0
}

# Helper for check_system_resources: compare one usage figure with a limit.
# Arguments: $1 - metric label used in the messages
#            $2 - measured value (percentage, integer or decimal)
#            $3 - limit; an alert is sent when the value is strictly greater
# A value that is not a plain number is reported with log_warn instead of
# being fed into the comparison.
_check_usage_threshold() {
    local label=$1 value=$2 limit=$3

    if [[ ! "$value" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        log_warn "无法获取${label}"
        return 0
    fi

    if LC_ALL=C awk -v v="$value" -v max="$limit" 'BEGIN { exit !(v > max) }'; then
        send_alert "${label}过高: ${value}%" "HIGH"
    fi
}

# Sample CPU, memory and root-filesystem usage, alert on any figure above
# 80 percent, and log a one-line summary.
# Returns: the status of the final log_info call
check_system_resources() {
    local cpu_usage memory_usage disk_usage
    local -r limit=80

    # First number after "Cpu(s):" in top's summary line. Stripping by pattern
    # instead of taking the second whitespace-separated field also copes with
    # "%Cpu(s):100.0 us" (no space after the colon) and with the older
    # "1.2%us" layout. LC_ALL=C keeps the decimal separator a dot.
    # NOTE(review): on the top layouts handled here this first figure is the
    # user-space share, not total CPU load - confirm that is the intent.
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
        /Cpu\(s\)/ {
            sub(/^.*Cpu\(s\):[ \t]*/, "")
            sub(/[^0-9.].*$/, "")
            print
            exit
        }')

    # used / total of the "Mem" row; the total is checked to avoid a division
    # by zero. LC_ALL=C keeps the row label untranslated.
    memory_usage=$(LC_ALL=C free | LC_ALL=C awk '
        /^Mem/ && $2 > 0 { printf "%.2f", $3 / $2 * 100.0; exit }')

    # -P selects the POSIX output format, which keeps every filesystem on a
    # single line; in the default format a long device name can push the
    # figures onto the next line.
    disk_usage=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

    _check_usage_threshold "CPU使用率" "$cpu_usage" "$limit"
    _check_usage_threshold "内存使用率" "$memory_usage" "$limit"
    _check_usage_threshold "磁盘使用率" "$disk_usage" "$limit"

    log_info "系统资源 - CPU: ${cpu_usage}%, 内存: ${memory_usage}%, 磁盘: ${disk_usage}%"
}

# Alert when the tail of the application log contains too many errors.
# Looks at the last 100 lines of ./logs/tsp_assistant.log and sends a MEDIUM
# alert when more than 10 of them contain "ERROR". A missing log file is not
# an error.
# NOTE: the window is the last 100 lines, not a time span; the "5 minutes"
# wording in the alert text is kept unchanged for existing alert consumers.
check_log_errors() {
    local log_file="./logs/tsp_assistant.log"
    local error_count
    local -r max_errors=10

    if [ ! -f "$log_file" ]; then
        return 0
    fi

    # grep -c always prints the count - "0" included - and merely exits with
    # status 1 when nothing matched. The exit status is therefore discarded
    # rather than used to append a fallback value, which would produce the
    # two-line value "0<newline>0" and break the integer test below.
    error_count=$(tail -n 100 "$log_file" 2>/dev/null | grep -c "ERROR" || true)

    if [ "${error_count:-0}" -gt "$max_errors" ]; then
        send_alert "最近5分钟错误日志过多: $error_count 条" "MEDIUM"
    fi
}

# Check the SQLite database file ./tsp_assistant.db.
# Logs the file size and, when the sqlite3 client is installed, runs a
# trivial query against the file.
# Returns: 1 (after a CRITICAL alert) when the query fails;
#          0 otherwise, including when the file does not exist or the
#          sqlite3 client is not installed
check_database() {
    local db_file="./tsp_assistant.db"
    local db_size

    if [ ! -f "$db_file" ]; then
        return 0
    fi

    # du prints "<size><TAB><path>"; keep only the size column.
    db_size=$(du -h "$db_file")
    db_size=${db_size%%$'\t'*}
    log_info "数据库大小: $db_size"

    # A missing client says nothing about the database, so it must not raise
    # a CRITICAL alert. Same guard style as the `mail` check in send_alert.
    if ! command -v sqlite3 &> /dev/null; then
        log_warn "未安装sqlite3，跳过数据库可读性检查"
        return 0
    fi

    # NOTE(review): "SELECT 1;" may succeed without reading any database
    # content - confirm whether a query that touches the schema is wanted.
    if ! sqlite3 "$db_file" "SELECT 1;" > /dev/null 2>&1; then
        send_alert "数据库连接失败" "CRITICAL"
        return 1
    fi

    return 0
}

# Restart the service through systemd and verify that it came back.
# Globals: SERVICE_NAME (read)
# Returns: 0 when, 10 seconds after the restart, both the unit status check
#          and the HTTP health check pass; 1 otherwise
restart_service() {
    log_warn "尝试重启服务..."

    sudo systemctl restart "$SERVICE_NAME"
    # Give the service time to start before probing it.
    sleep 10

    # The health probe is only attempted when the unit itself is active.
    if ! check_service_status || ! check_health; then
        log_error "服务重启失败"
        return 1
    fi

    log_info "服务重启成功"
    return 0
}

# Continuous monitoring: run every check once a minute, forever.
# A cycle counts as failed when the unit is not active or the health probe
# fails; a slow response only raises an alert. After 3 failed cycles in a row
# a restart is attempted. The failure counter is reset by a healthy cycle or
# by a successful restart.
monitor_loop() {
    local failures=0
    local threshold=3

    while :; do
        log_info "开始监控检查..."

        if ! check_service_status; then
            # Unit is not active.
            log_error "服务未运行"
            send_alert "TSP助手服务未运行" "CRITICAL"
            failures=$((failures + 1))
        elif ! check_health; then
            # Unit is active but the endpoint is unhealthy.
            log_error "健康检查失败"
            send_alert "TSP助手健康检查失败" "HIGH"
            failures=$((failures + 1))
        else
            # Healthy; latency is only worth a warning.
            if ! check_response_time; then
                log_warn "响应时间过长"
                send_alert "TSP助手响应时间过长" "MEDIUM"
            fi
            failures=0
        fi

        # These run on every cycle regardless of the service state.
        check_system_resources
        check_log_errors
        check_database

        if ((failures >= threshold)); then
            log_error "连续失败次数达到阈值，尝试重启服务"
            if restart_service; then
                failures=0
            else
                send_alert "TSP助手服务重启失败，需要人工干预" "CRITICAL"
            fi
        fi

        # Pause until the next cycle.
        sleep 60
    done
}

# One-shot health check.
# Exits the script with status 1 as soon as the unit is found inactive or the
# health probe fails. A slow response is only logged as a warning. When both
# mandatory checks pass, the resource, log and database checks run as well.
single_check() {
    log_info "执行一次性健康检查..."

    # Mandatory check 1: the systemd unit.
    if ! check_service_status; then
        log_error "✗ 服务未运行"
        exit 1
    fi
    log_info "✓ 服务运行正常"

    # Mandatory check 2: the HTTP health endpoint.
    if ! check_health; then
        log_error "✗ 健康检查失败"
        exit 1
    fi
    log_info "✓ 健康检查通过"

    # Latency is advisory only.
    if ! check_response_time; then
        log_warn "⚠ 响应时间过长"
    else
        log_info "✓ 响应时间正常"
    fi

    check_system_resources
    check_log_errors
    check_database

    log_info "健康检查完成"
}

# Entry point: dispatch on the first command-line argument.
# Globals:   LOG_FILE (read)
# Arguments: $1 - monitor (default) | check | restart
# Returns:   the status of the selected mode; any other argument prints the
#            usage text and exits with status 1
main() {
    # Create the directory that LOG_FILE actually points into, so the log_*
    # helpers keep working when the log location is changed. With the stock
    # ./logs/monitor.log this is the same "logs" directory as before.
    local log_dir
    log_dir=$(dirname -- "$LOG_FILE")
    mkdir -p -- "$log_dir"

    case "${1:-monitor}" in
        monitor)
            log_info "启动TSP助手监控服务..."
            monitor_loop
            ;;
        check)
            single_check
            ;;
        restart)
            restart_service
            ;;
        *)
            echo "用法: $0 {monitor|check|restart}"
            echo "  monitor  - 持续监控模式"
            echo "  check    - 一次性健康检查"
            echo "  restart  - 重启服务"
            exit 1
            ;;
    esac
}

# Run main, forwarding every command-line argument given to the script.
main "$@"