#!/bin/bash
# TSP assistant monitoring script.
# Source path: tsp-assistant/scripts/monitor.sh
#
# The shebang must be the very first line of the file; the repository
# browser text that used to precede it was not shell code.

# ---- Configuration ----
# NOTE(review): APP_NAME is not referenced in the visible part of this file.
APP_NAME="tsp_assistant"
# systemd unit queried and restarted via systemctl.
SERVICE_NAME="tsp_assistant"
# Endpoint polled with curl by the health and response-time checks.
HEALTH_URL="http://localhost:5000/api/health"
# File the log_* helpers append to (relative to the working directory).
LOG_FILE="./logs/monitor.log"
# Recipient of e-mail alerts sent through `mail`.
ALERT_EMAIL="admin@example.com"
# Only used by the commented-out SMS example in send_alert.
ALERT_PHONE="13800138000"

# ---- Colour definitions ----
# Stored as literal backslash sequences; the log_* helpers print them with
# `echo -e`, which turns them into ANSI escape codes.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Logging helpers
#
# Print an INFO line (green tag with timestamp) to stdout and append the same
# line to $LOG_FILE.
# Arguments: $1 - message text (echo -e interprets backslash escapes in it)
# Returns:   exit status of the echo | tee pipeline
log_info() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${GREEN}[${stamp}] INFO${NC} $1" | tee -a "$LOG_FILE"
}
# Print a WARN line (yellow tag with timestamp) to stdout and append the same
# line to $LOG_FILE.
# Arguments: $1 - message text (echo -e interprets backslash escapes in it)
# Returns:   exit status of the echo | tee pipeline
log_warn() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${YELLOW}[${stamp}] WARN${NC} $1" | tee -a "$LOG_FILE"
}
# Print an ERROR line (red tag with timestamp) to stdout and append the same
# line to $LOG_FILE.
# Arguments: $1 - message text (echo -e interprets backslash escapes in it)
# Returns:   exit status of the echo | tee pipeline
log_error() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${RED}[${stamp}] ERROR${NC} $1" | tee -a "$LOG_FILE"
}
# Send an alert.
#
# Always records the alert through log_error; additionally e-mails it to
# $ALERT_EMAIL when a `mail` command is available.
# Arguments: $1 - alert message
#            $2 - severity label placed in the mail subject
# Returns:   0 when `mail` is unavailable, otherwise the status of the
#            echo | mail pipeline
send_alert() {
  local alert_text=$1 severity=$2

  log_error "告警: $alert_text"

  # SMS alert (requires an SMS gateway to be configured):
  # curl -X POST "https://api.sms.com/send" \
  # -d "phone=$ALERT_PHONE" \
  # -d "message=$alert_text"

  # E-mail alert: skipped when no mail client is installed.
  command -v mail &> /dev/null || return 0
  echo "$alert_text" | mail -s "[$severity] TSP助手告警" "$ALERT_EMAIL"
}
# Check the service state.
#
# Asks systemd whether $SERVICE_NAME is active.
# Returns: 0 when active, 1 otherwise (systemctl's own status is collapsed
#          to 1)
check_service_status() {
  systemctl is-active --quiet "$SERVICE_NAME" || return 1
  return 0
}
# Check application health.
#
# Requests $HEALTH_URL and inspects only the HTTP status code. The request is
# bounded by connect and total timeouts so that an unresponsive endpoint
# cannot block the monitoring loop forever.
# Returns: 0 when the status code is exactly 200, 1 for any other value
#          (including an empty result when curl fails or times out)
check_health() {
  local status_code
  status_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 5 --max-time 10 "$HEALTH_URL" 2>/dev/null)
  [ "$status_code" = "200" ]
}
# Check the response time.
#
# Measures curl's total request time for $HEALTH_URL and compares it with a
# 5 second threshold. Only the elapsed time is examined, not the HTTP status.
# The request is capped with --max-time (above the threshold) so a hung
# endpoint is reported as slow instead of blocking the caller.
# Returns: 0 when the measured time is at most 5 seconds,
#          1 when it is longer or when no numeric measurement was obtained
check_response_time() {
  local threshold_seconds=5.0
  local elapsed

  elapsed=$(curl -s -o /dev/null -w "%{time_total}" \
    --connect-timeout 5 --max-time 10 "$HEALTH_URL" 2>/dev/null)

  # Tolerate a comma decimal separator in the measurement.
  elapsed=${elapsed/,/.}

  # No usable measurement (curl missing, empty output) counts as a failure
  # rather than silently passing.
  if [[ ! "$elapsed" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    return 1
  fi

  # Floating-point comparison done in awk, so `bc` is not required.
  LC_ALL=C awk -v t="$elapsed" -v limit="$threshold_seconds" \
    'BEGIN { exit !(t + 0 <= limit + 0) }'
}
# Helper: succeed only when $1 is a plain decimal number strictly greater
# than $2. Empty or non-numeric values never exceed the limit.
_metric_exceeds() {
  [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]] || return 1
  LC_ALL=C awk -v value="$1" -v limit="$2" \
    'BEGIN { exit !(value + 0 > limit + 0) }'
}

# Check system resources.
#
# Collects CPU, memory and root-filesystem usage, raises a "HIGH" alert via
# send_alert for each value above 80 percent, and logs a summary line.
# The tools run under LC_ALL=C so their output is parsed independently of
# the user's locale.
# Returns: exit status of the final log_info call
check_system_resources() {
  local threshold=80
  local cpu_usage memory_usage disk_usage

  # CPU usage = 100 - idle. The idle figure is located by its "id" label so
  # both "98.3 id" and "98.3%id" layouts work, as does a value that abuts
  # the previous field (e.g. "ni,100.0 id").
  cpu_usage=$(LC_ALL=C top -bn1 | LC_ALL=C awk '
    !seen && /Cpu\(s\)/ && match($0, /[0-9.]+[ %]*id/) {
      printf "%.1f", 100 - (substr($0, RSTART, RLENGTH) + 0)
      seen = 1
    }')

  # Memory usage: used / total from the "Mem:" row of free.
  memory_usage=$(LC_ALL=C free \
    | LC_ALL=C awk '/^Mem:/ && $2 > 0 { printf "%.2f", $3 / $2 * 100.0 }')

  # Disk usage of /: -P keeps each filesystem on a single output line.
  disk_usage=$(LC_ALL=C df -P / \
    | LC_ALL=C awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

  # Threshold checks
  if _metric_exceeds "$cpu_usage" "$threshold"; then
    send_alert "CPU使用率过高: ${cpu_usage}%" "HIGH"
  fi
  if _metric_exceeds "$memory_usage" "$threshold"; then
    send_alert "内存使用率过高: ${memory_usage}%" "HIGH"
  fi
  if _metric_exceeds "$disk_usage" "$threshold"; then
    send_alert "磁盘使用率过高: ${disk_usage}%" "HIGH"
  fi

  log_info "系统资源 - CPU: ${cpu_usage}%, 内存: ${memory_usage}%, 磁盘: ${disk_usage}%"
}
# Check the application log for errors.
#
# Counts lines containing "ERROR" among the last 100 lines of the log file
# and raises a "MEDIUM" alert when there are more than 10.
# NOTE: the alert text speaks of "the last 5 minutes", but the window that is
# actually inspected is the last 100 lines, regardless of their age.
# Arguments: $1 - log file path (optional, default ./logs/tsp_assistant.log)
# Returns:   0 when the file is missing or the count is within the limit,
#            otherwise the status of send_alert
check_log_errors() {
  local log_file=${1:-./logs/tsp_assistant.log}
  local max_errors=10
  local error_count

  [ -f "$log_file" ] || return 0

  # grep -c already prints "0" when nothing matches (while exiting 1), so no
  # fallback echo is needed; adding one would produce the two-line value
  # "0\n0" and break the integer comparison below.
  error_count=$(tail -n 100 "$log_file" | grep -c "ERROR" 2>/dev/null)
  error_count=${error_count:-0}

  if [ "$error_count" -gt "$max_errors" ]; then
    send_alert "最近5分钟错误日志过多: $error_count" "MEDIUM"
  fi
}
# Check the database.
#
# When the SQLite file exists, logs its size and verifies that sqlite3 can
# actually read it. The probe queries sqlite_master because a constant query
# such as "SELECT 1;" is answered without reading the database file, so it
# would also succeed on a corrupt or non-database file.
# Arguments: $1 - database file path (optional, default ./tsp_assistant.db)
# Returns:   0 when the file is missing or readable,
#            1 after raising a "CRITICAL" alert when the probe fails
check_database() {
  local db_file=${1:-./tsp_assistant.db}
  local db_size

  [ -f "$db_file" ] || return 0

  # Report the database file size.
  db_size=$(du -h "$db_file" | cut -f1)
  log_info "数据库大小: $db_size"

  # Verify that the database can be read.
  if ! sqlite3 "$db_file" "SELECT count(*) FROM sqlite_master;" > /dev/null 2>&1; then
    send_alert "数据库连接失败" "CRITICAL"
    return 1
  fi
  return 0
}
# Restart the service automatically.
#
# Restarts $SERVICE_NAME through sudo systemctl, waits 10 seconds, then
# confirms recovery with check_service_status followed by check_health.
# The exit status of the restart command itself is not inspected.
# Returns: 0 when both checks pass after the restart, 1 otherwise
restart_service() {
  log_warn "尝试重启服务..."
  sudo systemctl restart "$SERVICE_NAME"

  # Give the service time to come up before probing it.
  sleep 10

  if ! check_service_status || ! check_health; then
    log_error "服务重启失败"
    return 1
  fi

  log_info "服务重启成功"
  return 0
}
# Main monitoring loop.
#
# Runs forever, once every 60 seconds:
#   1. service state -> health -> response time (each step only runs when
#      the previous one passed); a service or health failure extends the
#      failure streak, a passing health check resets it,
#   2. system resources, log errors and database checks,
#   3. after 3 consecutive failures, a restart attempt; a failed restart
#      raises a CRITICAL alert and leaves the streak untouched.
# Never returns.
monitor_loop() {
  local failure_streak=0
  local restart_threshold=3

  while :; do
    log_info "开始监控检查..."

    if ! check_service_status; then
      # Service is not running.
      log_error "服务未运行"
      send_alert "TSP助手服务未运行" "CRITICAL"
      failure_streak=$((failure_streak + 1))
    elif ! check_health; then
      # Service runs but the health endpoint is failing.
      log_error "健康检查失败"
      send_alert "TSP助手健康检查失败" "HIGH"
      failure_streak=$((failure_streak + 1))
    else
      # Healthy: a slow response is alerted but does not count as a failure.
      if ! check_response_time; then
        log_warn "响应时间过长"
        send_alert "TSP助手响应时间过长" "MEDIUM"
      fi
      failure_streak=0
    fi

    # System resources, log errors, database.
    check_system_resources
    check_log_errors
    check_database

    # Handle consecutive failures.
    if (( failure_streak >= restart_threshold )); then
      log_error "连续失败次数达到阈值,尝试重启服务"
      if restart_service; then
        failure_streak=0
      else
        send_alert "TSP助手服务重启失败需要人工干预" "CRITICAL"
      fi
    fi

    # Wait for the next round.
    sleep 60
  done
}
# One-shot check.
#
# Runs every check once. A stopped service or a failed health check
# terminates the script with exit status 1; a slow response is only a
# warning. Resource, log and database checks run afterwards.
# Returns: exit status of the final log_info call
single_check() {
  log_info "执行一次性健康检查..."

  # Hard requirement: the service must be running.
  if ! check_service_status; then
    log_error "✗ 服务未运行"
    exit 1
  fi
  log_info "✓ 服务运行正常"

  # Hard requirement: the health endpoint must pass.
  if ! check_health; then
    log_error "✗ 健康检查失败"
    exit 1
  fi
  log_info "✓ 健康检查通过"

  # Soft requirement: response time.
  if ! check_response_time; then
    log_warn "⚠ 响应时间过长"
  else
    log_info "✓ 响应时间正常"
  fi

  check_system_resources
  check_log_errors
  check_database

  log_info "健康检查完成"
}
# Main function.
#
# Ensures the directory that holds $LOG_FILE exists, then dispatches on the
# first argument.
# Arguments: $1 - mode: monitor (default), check or restart
# Returns:   status of the selected action; exits 1 after printing usage for
#            an unknown mode
main() {
  # Create the log directory. It is derived from LOG_FILE so that the
  # directory and the configured log path cannot drift apart; with the
  # default LOG_FILE this is the same ./logs directory as before.
  mkdir -p "$(dirname -- "${LOG_FILE:-./logs/monitor.log}")"

  case "${1:-monitor}" in
    monitor)
      log_info "启动TSP助手监控服务..."
      monitor_loop
      ;;
    check)
      single_check
      ;;
    restart)
      restart_service
      ;;
    *)
      echo "用法: $0 {monitor|check|restart}"
      echo " monitor - 持续监控模式"
      echo " check - 一次性健康检查"
      echo " restart - 重启服务"
      exit 1
      ;;
  esac
}
# Run the main function, forwarding all command-line arguments unchanged.
main "$@"