Files
tsp-assistant/scripts/monitor.sh
2025-09-08 15:27:22 +08:00

278 lines
6.7 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# TSP智能助手监控脚本
# 配置变量
APP_NAME="tsp_assistant"
SERVICE_NAME="tsp_assistant"
HEALTH_URL="http://localhost:5000/api/health"
LOG_FILE="./logs/monitor.log"
ALERT_EMAIL="admin@example.com"
ALERT_PHONE="13800138000"
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# 日志函数
log_info() {
echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')] INFO${NC} $1" | tee -a "$LOG_FILE"
}
log_warn() {
echo -e "${YELLOW}[$(date '+%Y-%m-%d %H:%M:%S')] WARN${NC} $1" | tee -a "$LOG_FILE"
}
log_error() {
echo -e "${RED}[$(date '+%Y-%m-%d %H:%M:%S')] ERROR${NC} $1" | tee -a "$LOG_FILE"
}
# 发送告警
send_alert() {
local message=$1
local level=$2
log_error "告警: $message"
# 发送邮件告警
if command -v mail &> /dev/null; then
echo "$message" | mail -s "[$level] TSP助手告警" "$ALERT_EMAIL"
fi
# 发送短信告警(需要配置短信服务)
# curl -X POST "https://api.sms.com/send" \
# -d "phone=$ALERT_PHONE" \
# -d "message=$message"
}
# 检查服务状态
check_service_status() {
if systemctl is-active --quiet "$SERVICE_NAME"; then
return 0
else
return 1
fi
}
# 检查健康状态
check_health() {
local response_code
response_code=$(curl -s -o /dev/null -w "%{http_code}" "$HEALTH_URL" 2>/dev/null)
if [ "$response_code" = "200" ]; then
return 0
else
return 1
fi
}
# 检查响应时间
check_response_time() {
local response_time
response_time=$(curl -s -o /dev/null -w "%{time_total}" "$HEALTH_URL" 2>/dev/null)
# 响应时间超过5秒认为异常
if (( $(echo "$response_time > 5.0" | bc -l) )); then
return 1
else
return 0
fi
}
# 检查系统资源
check_system_resources() {
local cpu_usage
local memory_usage
local disk_usage
# CPU使用率
cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | awk -F'%' '{print $1}')
# 内存使用率
memory_usage=$(free | grep Mem | awk '{printf "%.2f", $3/$2 * 100.0}')
# 磁盘使用率
disk_usage=$(df -h / | awk 'NR==2 {print $5}' | sed 's/%//')
# 检查阈值
if (( $(echo "$cpu_usage > 80" | bc -l) )); then
send_alert "CPU使用率过高: ${cpu_usage}%" "HIGH"
fi
if (( $(echo "$memory_usage > 80" | bc -l) )); then
send_alert "内存使用率过高: ${memory_usage}%" "HIGH"
fi
if [ "$disk_usage" -gt 80 ]; then
send_alert "磁盘使用率过高: ${disk_usage}%" "HIGH"
fi
log_info "系统资源 - CPU: ${cpu_usage}%, 内存: ${memory_usage}%, 磁盘: ${disk_usage}%"
}
# 检查日志错误
check_log_errors() {
local log_file="./logs/tsp_assistant.log"
local error_count
if [ -f "$log_file" ]; then
# 检查最近5分钟的错误日志
error_count=$(tail -n 100 "$log_file" | grep -c "ERROR" 2>/dev/null || echo "0")
if [ "$error_count" -gt 10 ]; then
send_alert "最近5分钟错误日志过多: $error_count" "MEDIUM"
fi
fi
}
# 检查数据库连接
check_database() {
local db_file="./tsp_assistant.db"
if [ -f "$db_file" ]; then
# 检查数据库文件大小
local db_size
db_size=$(du -h "$db_file" | cut -f1)
log_info "数据库大小: $db_size"
# 检查数据库是否可读
if ! sqlite3 "$db_file" "SELECT 1;" > /dev/null 2>&1; then
send_alert "数据库连接失败" "CRITICAL"
return 1
fi
fi
return 0
}
# 自动重启服务
restart_service() {
log_warn "尝试重启服务..."
sudo systemctl restart "$SERVICE_NAME"
sleep 10
if check_service_status && check_health; then
log_info "服务重启成功"
return 0
else
log_error "服务重启失败"
return 1
fi
}
# 主监控循环
monitor_loop() {
local consecutive_failures=0
local max_failures=3
while true; do
log_info "开始监控检查..."
# 检查服务状态
if ! check_service_status; then
log_error "服务未运行"
send_alert "TSP助手服务未运行" "CRITICAL"
consecutive_failures=$((consecutive_failures + 1))
else
# 检查健康状态
if ! check_health; then
log_error "健康检查失败"
send_alert "TSP助手健康检查失败" "HIGH"
consecutive_failures=$((consecutive_failures + 1))
else
# 检查响应时间
if ! check_response_time; then
log_warn "响应时间过长"
send_alert "TSP助手响应时间过长" "MEDIUM"
fi
consecutive_failures=0
fi
fi
# 检查系统资源
check_system_resources
# 检查日志错误
check_log_errors
# 检查数据库
check_database
# 连续失败处理
if [ "$consecutive_failures" -ge "$max_failures" ]; then
log_error "连续失败次数达到阈值,尝试重启服务"
if restart_service; then
consecutive_failures=0
else
send_alert "TSP助手服务重启失败需要人工干预" "CRITICAL"
fi
fi
# 等待下次检查
sleep 60
done
}
# 一次性检查
single_check() {
log_info "执行一次性健康检查..."
if check_service_status; then
log_info "✓ 服务运行正常"
else
log_error "✗ 服务未运行"
exit 1
fi
if check_health; then
log_info "✓ 健康检查通过"
else
log_error "✗ 健康检查失败"
exit 1
fi
if check_response_time; then
log_info "✓ 响应时间正常"
else
log_warn "⚠ 响应时间过长"
fi
check_system_resources
check_log_errors
check_database
log_info "健康检查完成"
}
# 主函数
main() {
# 创建日志目录
mkdir -p logs
case ${1:-monitor} in
monitor)
log_info "启动TSP助手监控服务..."
monitor_loop
;;
check)
single_check
;;
restart)
restart_service
;;
*)
echo "用法: $0 {monitor|check|restart}"
echo " monitor - 持续监控模式"
echo " check - 一次性健康检查"
echo " restart - 重启服务"
exit 1
;;
esac
}
# 执行主函数
main "$@"