#!/bin/bash
# TSP Assistant monitoring script.
#
# Usage: <script> {monitor|check|restart}
#   monitor - continuous monitoring loop (default when no argument is given)
#   check   - one-off health check
#   restart - restart the service

# Configuration variables.
# Each setting keeps its built-in default and can be overridden through the
# matching TSP_* environment variable.
APP_NAME="${TSP_APP_NAME:-tsp_assistant}"
SERVICE_NAME="${TSP_SERVICE_NAME:-tsp_assistant}"
HEALTH_URL="${TSP_HEALTH_URL:-http://localhost:5000/api/health}"
# Relative paths are resolved against the directory the script is started from.
LOG_FILE="${TSP_LOG_FILE:-./logs/monitor.log}"
ALERT_EMAIL="${TSP_ALERT_EMAIL:-admin@example.com}"
# Only referenced by the commented-out SMS hook in send_alert.
ALERT_PHONE="${TSP_ALERT_PHONE:-13800138000}"

# Color definitions. The values are stored as backslash escapes and are
# expanded when the log functions print them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Logging functions.

# Print a timestamped INFO line (green tag) to stdout and append the same
# line to $LOG_FILE.
# Globals:   GREEN, NC, LOG_FILE (read)
# Arguments: $1 - message text
# Returns:   exit status of tee
log_info() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # %b expands the escape sequences stored in the color variables; the message
  # goes through %s so backslashes inside it are emitted verbatim instead of
  # being interpreted the way `echo -e` would.
  printf '%b[%s] INFO%b %s\n' "$GREEN" "$timestamp" "$NC" "${1-}" \
    | tee -a "$LOG_FILE"
}

# Print a timestamped WARN line (yellow tag) to stdout and append the same
# line to $LOG_FILE.
# Globals:   YELLOW, NC, LOG_FILE (read)
# Arguments: $1 - message text
# Returns:   exit status of tee
log_warn() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # Colors go through %b (escape expansion), the message through %s so that
  # backslashes in it are not interpreted as `echo -e` would interpret them.
  printf '%b[%s] WARN%b %s\n' "$YELLOW" "$timestamp" "$NC" "${1-}" \
    | tee -a "$LOG_FILE"
}

# Print a timestamped ERROR line (red tag) to stdout and append the same
# line to $LOG_FILE.
# Globals:   RED, NC, LOG_FILE (read)
# Arguments: $1 - message text
# Returns:   exit status of tee
log_error() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # Colors go through %b (escape expansion), the message through %s so that
  # backslashes in it are not interpreted as `echo -e` would interpret them.
  printf '%b[%s] ERROR%b %s\n' "$RED" "$timestamp" "$NC" "${1-}" \
    | tee -a "$LOG_FILE"
}

# Send an alert.
#
# Records the alert through log_error and, when a `mail` command is
# available, e-mails the message to $ALERT_EMAIL. Delivery is best effort:
# a failing `mail` is logged but never aborts the caller.
# Globals:   ALERT_EMAIL (read)
# Arguments: $1 - alert message
#            $2 - severity tag used in the mail subject (default: UNKNOWN)
# Returns:   0 always
send_alert() {
  local message=${1-}
  local level=${2:-UNKNOWN}

  log_error "告警: $message"

  # E-mail alert; skipped when no `mail` command is installed.
  if command -v mail &> /dev/null; then
    # printf instead of echo so a message such as "-n" is sent verbatim.
    if ! printf '%s\n' "$message" \
      | mail -s "[$level] TSP助手告警" "$ALERT_EMAIL"; then
      log_error "邮件告警发送失败: $ALERT_EMAIL"
    fi
  fi

  # SMS alert (requires an SMS service to be configured)
  # curl -X POST "https://api.sms.com/send" \
  #   -d "phone=$ALERT_PHONE" \
  #   -d "message=$message"
  return 0
}

# Check service status.
#
# Asks systemd whether the unit named by $SERVICE_NAME is active.
# Globals: SERVICE_NAME (read)
# Returns: 0 when `systemctl is-active` succeeds, 1 otherwise
check_service_status() {
  systemctl is-active --quiet "$SERVICE_NAME" && return 0
  return 1
}

# Check health status.
#
# Requests $HEALTH_URL and succeeds only when the HTTP status code is 200.
# Globals:   HEALTH_URL (read)
# Arguments: $1 - optional overall request timeout in seconds (default: 10)
# Returns:   0 on HTTP 200, 1 on any other code or when curl produces none
check_health() {
  local max_time=${1:-10}
  local response_code

  # Bound the probe: without a timeout a server that accepts the connection
  # but never answers would block the caller indefinitely.
  response_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 5 --max-time "$max_time" "$HEALTH_URL" 2>/dev/null)

  [[ "$response_code" == "200" ]]
}

# Check response time.
#
# Measures the total time curl needs to fetch $HEALTH_URL and compares it
# with a threshold.
# Globals:   HEALTH_URL (read)
# Arguments: $1 - optional threshold in seconds (default: 5.0)
# Returns:   0 when the measured time is not above the threshold,
#            1 when it is above it or when no usable measurement was obtained
check_response_time() {
  local max_seconds=${1:-5.0}
  local response_time

  # --max-time keeps a stalled endpoint from blocking the caller; curl still
  # reports the elapsed time when the limit is hit.
  response_time=$(curl -s -o /dev/null -w "%{time_total}" \
    --connect-timeout 5 --max-time 10 "$HEALTH_URL" 2>/dev/null)

  # A missing or malformed measurement must not be reported as "fast".
  if [[ ! "$response_time" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    return 1
  fi

  # awk performs the floating-point comparison, so `bc` is not required.
  if LC_ALL=C awk -v t="$response_time" -v max="$max_seconds" \
    'BEGIN { exit !(t > max) }'; then
    return 1
  fi
  return 0
}

# Succeed when $1 is a plain non-negative decimal number strictly greater
# than $2. Empty or malformed readings never count as exceeding the limit.
_usage_exceeds() {
  local value=$1 limit=$2
  [[ "$value" =~ ^[0-9]+([.][0-9]+)?$ ]] || return 1
  LC_ALL=C awk -v v="$value" -v t="$limit" 'BEGIN { exit !(v > t) }'
}

# Check system resources.
#
# Samples CPU, memory and root-filesystem usage, sends a HIGH alert for each
# value above 80 percent and logs a summary line.
# Returns: exit status of the final log_info call
check_system_resources() {
  local cpu_usage memory_usage disk_usage
  local -r threshold=80

  # CPU usage = 100 - idle. The idle figure is located by its "id" label in
  # the comma-separated summary line, so the result covers all non-idle time
  # and does not depend on whitespace (top prints "%Cpu(s):100.0 us" without
  # a space at full load). LC_ALL=C pins the decimal point.
  cpu_usage=$(LC_ALL=C top -bn1 | LC_ALL=C awk '
    /Cpu\(s\)/ {
      n = split($0, part, ",")
      for (i = 1; i <= n; i++) {
        if (part[i] ~ /id/) {
          gsub(/[^0-9.]/, "", part[i])
          printf "%.1f", 100 - part[i]
          exit
        }
      }
    }')

  # Memory usage: used / total from the "Mem:" row of free.
  memory_usage=$(LC_ALL=C free | LC_ALL=C awk '
    /^Mem:/ && $2 > 0 { printf "%.2f", $3 / $2 * 100.0; exit }')

  # Disk usage of /. -P forces the one-line-per-filesystem POSIX layout, so
  # column 5 is the capacity even for long device names.
  disk_usage=$(LC_ALL=C df -P / | LC_ALL=C awk '
    NR == 2 { sub(/%$/, "", $5); print $5 }')

  if [[ -z "$cpu_usage" || -z "$memory_usage" || -z "$disk_usage" ]]; then
    log_warn "部分系统资源指标无法获取"
  fi

  # Threshold checks
  if _usage_exceeds "$cpu_usage" "$threshold"; then
    send_alert "CPU使用率过高: ${cpu_usage}%" "HIGH"
  fi

  if _usage_exceeds "$memory_usage" "$threshold"; then
    send_alert "内存使用率过高: ${memory_usage}%" "HIGH"
  fi

  if _usage_exceeds "$disk_usage" "$threshold"; then
    send_alert "磁盘使用率过高: ${disk_usage}%" "HIGH"
  fi

  log_info "系统资源 - CPU: ${cpu_usage}%, 内存: ${memory_usage}%, 磁盘: ${disk_usage}%"
}

# Check the application log for errors.
#
# Counts the lines containing "ERROR" among the last 100 lines of the log
# file and sends a MEDIUM alert when there are more than 10.
# NOTE(review): the alert text speaks of "the last 5 minutes", but the window
# is the last 100 lines; no timestamps are inspected.
# Arguments: $1 - optional log file path (default: ./logs/tsp_assistant.log)
# Returns:   0 always; a missing log file is not an error
check_log_errors() {
  local log_file=${1:-./logs/tsp_assistant.log}
  local error_count

  [[ -f "$log_file" ]] || return 0

  # grep -c prints the count even when it is 0, but then exits 1. `|| true`
  # only neutralises that status; echoing a fallback "0" here would append a
  # second line to the captured value and break the numeric comparison.
  error_count=$(tail -n 100 "$log_file" | grep -c "ERROR" 2>/dev/null || true)
  error_count=${error_count:-0}

  if (( error_count > 10 )); then
    send_alert "最近5分钟错误日志过多: $error_count 条" "MEDIUM"
  fi
  return 0
}

# Check the database.
#
# When ./tsp_assistant.db exists, logs its size and verifies that sqlite3 can
# run a trivial query against it.
# Returns: 0 when the file is absent, the query succeeds, or sqlite3 is not
#          installed (check skipped); 1 after sending a CRITICAL alert when
#          the query fails
check_database() {
  local db_file="./tsp_assistant.db"
  local db_size

  [[ -f "$db_file" ]] || return 0

  # Report the database file size.
  db_size=$(du -h "$db_file" | cut -f1)
  log_info "数据库大小: $db_size"

  # Without the sqlite3 client the probe below would fail for a reason that
  # has nothing to do with the database and raise a false CRITICAL alert.
  if ! command -v sqlite3 &> /dev/null; then
    log_warn "未找到 sqlite3 命令,跳过数据库可读性检查"
    return 0
  fi

  # Verify that the database is readable.
  if ! sqlite3 "$db_file" "SELECT 1;" > /dev/null 2>&1; then
    send_alert "数据库连接失败" "CRITICAL"
    return 1
  fi

  return 0
}

# Restart the service automatically.
#
# Restarts the systemd unit $SERVICE_NAME, waits, then confirms the restart
# with check_service_status and check_health.
# Globals:   SERVICE_NAME (read)
# Arguments: $1 - optional settle time in seconds before verifying (default: 10)
# Returns:   0 when the restart command succeeded and both checks pass,
#            1 otherwise
restart_service() {
  local wait_seconds=${1:-10}
  local -a restart_cmd=(systemctl restart "$SERVICE_NAME")

  log_warn "尝试重启服务..."

  # sudo is only needed when not already running as root.
  if (( EUID != 0 )); then
    restart_cmd=(sudo "${restart_cmd[@]}")
  fi

  # A failed restart command must never end in "restart succeeded", even if
  # the service happens to answer the checks afterwards.
  if ! "${restart_cmd[@]}"; then
    log_error "重启命令执行失败"
    return 1
  fi

  # Give the service time to come up before probing it.
  sleep "$wait_seconds"

  if check_service_status && check_health; then
    log_info "服务重启成功"
    return 0
  fi

  log_error "服务重启失败"
  return 1
}

# Main monitoring loop.
#
# Runs forever with a 60-second pause between rounds. Each round checks the
# service state, then the health endpoint, then the response time, followed
# by the resource, log and database checks. A round in which the service or
# health check fails extends the failure streak; a round in which both pass
# resets it. Once the streak reaches 3 a restart is attempted, and a failed
# restart raises a CRITICAL alert.
monitor_loop() {
  local failure_streak=0
  local restart_threshold=3

  while :; do
    log_info "开始监控检查..."

    if ! check_service_status; then
      # Service is not running.
      log_error "服务未运行"
      send_alert "TSP助手服务未运行" "CRITICAL"
      failure_streak=$((failure_streak + 1))
    elif ! check_health; then
      # Service runs but the health endpoint does not answer with success.
      log_error "健康检查失败"
      send_alert "TSP助手健康检查失败" "HIGH"
      failure_streak=$((failure_streak + 1))
    else
      # Healthy; a slow response is reported but does not count as a failure.
      check_response_time || {
        log_warn "响应时间过长"
        send_alert "TSP助手响应时间过长" "MEDIUM"
      }
      failure_streak=0
    fi

    # These checks run on every round regardless of the outcome above.
    check_system_resources
    check_log_errors
    check_database

    # Handle consecutive failures.
    if [ "$failure_streak" -ge "$restart_threshold" ]; then
      log_error "连续失败次数达到阈值,尝试重启服务"
      if restart_service; then
        failure_streak=0
      else
        send_alert "TSP助手服务重启失败,需要人工干预" "CRITICAL"
      fi
    fi

    # Wait for the next round.
    sleep 60
  done
}

# One-off check.
#
# Runs every check once. Exits the script with status 1 as soon as the
# service check or the health check fails; a slow response only produces a
# warning. The resource, log and database checks follow, then a closing
# log line.
single_check() {
  log_info "执行一次性健康检查..."

  # Service state: fatal when the service is not running.
  if ! check_service_status; then
    log_error "✗ 服务未运行"
    exit 1
  fi
  log_info "✓ 服务运行正常"

  # Health endpoint: fatal when the probe fails.
  if ! check_health; then
    log_error "✗ 健康检查失败"
    exit 1
  fi
  log_info "✓ 健康检查通过"

  # Response time: reported only.
  if ! check_response_time; then
    log_warn "⚠ 响应时间过长"
  else
    log_info "✓ 响应时间正常"
  fi

  check_system_resources
  check_log_errors
  check_database

  log_info "健康检查完成"
}

# Main function.
#
# Creates the directory that holds $LOG_FILE and dispatches on the first
# argument.
# Globals:   LOG_FILE (read)
# Arguments: $1 - monitor (default) | check | restart
# Returns:   status of the selected action; exits 1 after printing the usage
#            text for any other argument
main() {
  local mode=${1:-monitor}
  local log_dir

  # Create the log directory. It is derived from LOG_FILE so the directory
  # always matches the file the log functions append to.
  log_dir=$(dirname -- "${LOG_FILE:-./logs/monitor.log}")
  if ! mkdir -p -- "$log_dir"; then
    # Keep going: the log functions still print to stdout without the file.
    printf '无法创建日志目录: %s\n' "$log_dir" >&2
  fi

  case "$mode" in
    monitor)
      log_info "启动TSP助手监控服务..."
      monitor_loop
      ;;
    check)
      single_check
      ;;
    restart)
      restart_service
      ;;
    *)
      echo "用法: $0 {monitor|check|restart}"
      echo " monitor - 持续监控模式"
      echo " check - 一次性健康检查"
      echo " restart - 重启服务"
      exit 1
      ;;
  esac
}

# Run the main function with the script's command-line arguments.
main "$@"