first commit

This commit is contained in:
zhaojie
2025-09-06 21:06:18 +08:00
commit 8083f136c9
94 changed files with 20559 additions and 0 deletions

View File

@@ -0,0 +1 @@
# 分析模块

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,432 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TSP助手智能预警系统
支持多种预警规则、实时监控和智能分析
"""
import logging
from typing import Dict, List, Any, Optional, Callable
from datetime import datetime, timedelta
from dataclasses import dataclass
from enum import Enum
import json
from ..core.database import db_manager
from ..core.models import WorkOrder, Conversation, Analytics, Alert
logger = logging.getLogger(__name__)
class AlertLevel(Enum):
    """Severity level attached to an alert rule and to the alerts it raises."""

    INFO = "info"  # informational
    WARNING = "warning"  # warning
    ERROR = "error"  # error
    CRITICAL = "critical"  # critical
class AlertType(Enum):
    """Category an alert rule belongs to."""

    PERFORMANCE = "performance"  # performance alerts
    QUALITY = "quality"  # quality alerts
    VOLUME = "volume"  # volume alerts
    SYSTEM = "system"  # system alerts
    BUSINESS = "business"  # business alerts
@dataclass
class AlertRule:
    """Declarative description of one alert rule."""

    name: str  # display name; alerts store it as rule_name / rule_id
    description: str
    alert_type: AlertType
    level: AlertLevel
    threshold: float
    condition: str  # condition expression, e.g. "error_rate > threshold"
    enabled: bool = True
    check_interval: int = 300  # check interval in seconds
    last_check: Optional[datetime] = None
    cooldown: int = 3600  # cooldown in seconds
class AlertSystem:
"""智能预警系统"""
def __init__(self):
self.rules = self._initialize_rules()
self.alert_history = []
self.active_alerts = {}
def _initialize_rules(self) -> Dict[str, AlertRule]:
    """Build the built-in alert rules, keyed by rule id.

    Returns:
        A dict mapping each rule id to its ``AlertRule``. ``enabled``,
        ``last_check`` and ``cooldown`` keep their dataclass defaults.
    """
    # Columns: rule id, display name, description, type, level,
    # threshold, condition, check interval (seconds).
    specs = [
        ("low_satisfaction", "满意度预警", "用户满意度低于阈值",
         AlertType.QUALITY, AlertLevel.WARNING, 0.6,
         "satisfaction_avg < threshold", 1800),  # 30 minutes
        ("high_resolution_time", "解决时间预警", "平均解决时间过长",
         AlertType.PERFORMANCE, AlertLevel.WARNING, 24,  # 24 hours
         "avg_resolution_time > threshold", 3600),  # 1 hour
        ("low_knowledge_hit_rate", "知识库命中率预警", "知识库命中率过低",
         AlertType.QUALITY, AlertLevel.WARNING, 0.5,
         "knowledge_hit_rate < threshold", 1800),  # 30 minutes
        ("high_error_rate", "错误率预警", "系统错误率过高",
         AlertType.SYSTEM, AlertLevel.ERROR, 0.1,
         "error_rate > threshold", 300),  # 5 minutes
        ("high_volume", "工单量预警", "工单量异常增长",
         AlertType.VOLUME, AlertLevel.INFO, 50,  # 50 work orders per hour
         "hourly_orders > threshold", 600),  # 10 minutes
        # NOTE: the id says "low" but the rule fires on a HIGH response time.
        ("low_response_time", "响应时间预警", "系统响应时间过长",
         AlertType.PERFORMANCE, AlertLevel.WARNING, 5.0,  # 5 seconds
         "avg_response_time > threshold", 300),  # 5 minutes
        ("memory_usage", "内存使用预警", "系统内存使用率过高",
         AlertType.SYSTEM, AlertLevel.WARNING, 80.0,  # 80%
         "memory_usage > threshold", 300),  # 5 minutes
        ("conversation_drop", "对话中断预警", "用户对话中断率过高",
         AlertType.QUALITY, AlertLevel.WARNING, 0.3,  # 30%
         "conversation_drop_rate > threshold", 1800),  # 30 minutes
    ]
    return {
        rule_id: AlertRule(
            name=name,
            description=description,
            alert_type=alert_type,
            level=level,
            threshold=threshold,
            condition=condition,
            check_interval=interval,
        )
        for (rule_id, name, description, alert_type, level,
             threshold, condition, interval) in specs
    }
def check_all_rules(self) -> List[Dict[str, Any]]:
    """Evaluate every enabled rule and return the alerts that fired.

    A rule is skipped while it is inside its cooldown window, i.e. fewer
    than ``rule.cooldown`` seconds have passed since ``rule.last_check``.
    A failure while checking one rule is logged and does not stop the
    remaining rules from being checked.

    Returns:
        The alert dicts created during this pass (possibly empty).
    """
    triggered_alerts = []
    for rule_name, rule in self.rules.items():
        if not rule.enabled:
            continue
        # Cooldown check. total_seconds() is required here: the ``seconds``
        # attribute only holds the sub-day remainder, so a rule last fired
        # more than 24h ago could look like it fired moments ago.
        if rule.last_check is not None:
            elapsed = (datetime.now() - rule.last_check).total_seconds()
            if elapsed < rule.cooldown:
                continue
        try:
            # Fetch the metrics, then evaluate the rule condition.
            data = self._get_rule_data(rule_name)
            if self._evaluate_rule(rule, data):
                triggered_alerts.append(self._create_alert(rule, data))
                # Start the cooldown window for this rule.
                rule.last_check = datetime.now()
        except Exception as e:
            logger.error(f"检查规则 {rule_name} 失败: {e}")
    return triggered_alerts
def _get_rule_data(self, rule_name: str) -> Dict[str, Any]:
    """Collect the metrics that rule conditions are evaluated against.

    Every metric is computed over the last 24 hours; ``rule_name`` is
    accepted but not used to narrow the work. If anything fails, the error
    is logged and whatever was computed so far is returned, so callers must
    tolerate missing keys.

    Returns:
        A dict that may contain ``satisfaction_avg``,
        ``avg_resolution_time`` (hours), ``knowledge_hit_rate``,
        ``error_rate``, ``hourly_orders``, ``avg_response_time``,
        ``memory_usage`` and ``conversation_drop_rate``.
    """
    data = {}
    try:
        with db_manager.get_session() as session:
            # Data from the last 24 hours.
            end_time = datetime.now()
            start_time = end_time - timedelta(hours=24)
            # Work orders.
            work_orders = session.query(WorkOrder).filter(
                WorkOrder.created_at >= start_time,
                WorkOrder.created_at <= end_time
            ).all()
            # Conversations.
            conversations = session.query(Conversation).filter(
                Conversation.timestamp >= start_time,
                Conversation.timestamp <= end_time
            ).all()
            total_orders = len(work_orders)
            # Satisfaction (falsy scores, including 0, are excluded).
            satisfaction_scores = [
                wo.satisfaction_score for wo in work_orders if wo.satisfaction_score
            ]
            data["satisfaction_avg"] = (
                sum(satisfaction_scores) / len(satisfaction_scores)
                if satisfaction_scores else 0
            )
            # Resolution time in hours, for resolved orders only.
            resolution_times = [
                (wo.updated_at - wo.created_at).total_seconds() / 3600
                for wo in work_orders
                if wo.status == "resolved" and wo.updated_at
            ]
            data["avg_resolution_time"] = (
                sum(resolution_times) / len(resolution_times)
                if resolution_times else 0
            )
            # Knowledge-base hit rate. "[]" is treated as "nothing used",
            # matching AnalyticsManager._calculate_knowledge_hit_rate.
            knowledge_hits = len([
                c for c in conversations
                if c.knowledge_used and c.knowledge_used != "[]"
            ])
            data["knowledge_hit_rate"] = (
                knowledge_hits / len(conversations) if conversations else 0
            )
            # Error rate. assistant_response can be empty/None (the drop-rate
            # metric below counts exactly those rows), so guard before lower().
            error_conversations = len([
                c for c in conversations
                if "error" in (c.assistant_response or "").lower()
            ])
            data["error_rate"] = (
                error_conversations / len(conversations) if conversations else 0
            )
            # Work-order volume: 24h total averaged per hour.
            data["hourly_orders"] = total_orders / 24
            # Response time (falsy values are excluded).
            response_times = [c.response_time for c in conversations if c.response_time]
            data["avg_response_time"] = (
                sum(response_times) / len(response_times) if response_times else 0
            )
            # Memory usage.
            from ..utils.helpers import get_memory_usage
            memory_info = get_memory_usage()
            # NOTE(review): assumes "percent" is a 0-1 fraction; if the helper
            # already returns 0-100 this multiplication is wrong - confirm.
            data["memory_usage"] = memory_info.get("percent", 0) * 100
            # Conversation drop rate: user spoke but no assistant response.
            total_conversations = len(conversations)
            dropped_conversations = len([
                c for c in conversations
                if c.user_message and not c.assistant_response
            ])
            data["conversation_drop_rate"] = (
                dropped_conversations / total_conversations
                if total_conversations else 0
            )
    except Exception as e:
        logger.error(f"获取规则数据失败: {e}")
    return data
def _evaluate_rule(self, rule: AlertRule, data: Dict[str, Any]) -> bool:
"""评估规则条件"""
try:
# 简单的条件评估
if rule.condition == "satisfaction_avg < threshold":
return data.get("satisfaction_avg", 0) < rule.threshold
elif rule.condition == "avg_resolution_time > threshold":
return data.get("avg_resolution_time", 0) > rule.threshold
elif rule.condition == "knowledge_hit_rate < threshold":
return data.get("knowledge_hit_rate", 0) < rule.threshold
elif rule.condition == "error_rate > threshold":
return data.get("error_rate", 0) > rule.threshold
elif rule.condition == "hourly_orders > threshold":
return data.get("hourly_orders", 0) > rule.threshold
elif rule.condition == "avg_response_time > threshold":
return data.get("avg_response_time", 0) > rule.threshold
elif rule.condition == "memory_usage > threshold":
return data.get("memory_usage", 0) > rule.threshold
elif rule.condition == "conversation_drop_rate > threshold":
return data.get("conversation_drop_rate", 0) > rule.threshold
return False
except Exception as e:
logger.error(f"评估规则条件失败: {e}")
return False
def _create_alert(self, rule: AlertRule, data: Dict[str, Any]) -> Dict[str, Any]:
    """Build the alert dict for a fired rule, persist it and remember it.

    The alert is saved via ``_save_alert`` and stored in
    ``self.active_alerts`` under the rule's display name, replacing any
    earlier alert for the same rule.

    Returns:
        The alert dict; ``rule_id`` carries the same value as ``rule_name``.
    """
    message = self._generate_alert_message(rule, data)
    alert = dict(
        rule_name=rule.name,
        alert_type=rule.alert_type.value,
        level=rule.level.value,
        message=message,
        data=data,
        timestamp=datetime.now().isoformat(),
        rule_id=rule.name,
    )
    # Persist first, then track it as the rule's active alert.
    self._save_alert(alert)
    self.active_alerts[rule.name] = alert
    return alert
def _generate_alert_message(self, rule: AlertRule, data: Dict[str, Any]) -> str:
    """Render the human-readable message for a fired rule.

    The message is chosen by the rule's display name. A rule whose name is
    not one of the built-in ones gets a generic message. A metric missing
    from ``data`` is rendered as 0.
    """
    # display name -> (metric key, message template)
    templates = {
        "满意度预警": ("satisfaction_avg", "用户满意度较低: {value:.2f} (阈值: {threshold})"),
        "解决时间预警": ("avg_resolution_time", "平均解决时间过长: {value:.1f}小时 (阈值: {threshold}小时)"),
        "知识库命中率预警": ("knowledge_hit_rate", "知识库命中率较低: {value:.2f} (阈值: {threshold})"),
        "错误率预警": ("error_rate", "系统错误率过高: {value:.2f} (阈值: {threshold})"),
        "工单量预警": ("hourly_orders", "工单量异常增长: {value:.1f}个/小时 (阈值: {threshold}个/小时)"),
        "响应时间预警": ("avg_response_time", "系统响应时间过长: {value:.2f}秒 (阈值: {threshold}秒)"),
        "内存使用预警": ("memory_usage", "系统内存使用率过高: {value:.1f}% (阈值: {threshold}%)"),
        "对话中断预警": ("conversation_drop_rate", "用户对话中断率过高: {value:.2f} (阈值: {threshold})"),
    }
    entry = templates.get(rule.name)
    if entry is None:
        return f"触发预警: {rule.name}"
    metric, template = entry
    return template.format(value=data.get(metric, 0), threshold=rule.threshold)
def _save_alert(self, alert: Dict[str, Any]) -> None:
    """Persist an alert dict as an active ``Alert`` row.

    The metric snapshot in ``alert["data"]`` is stored as a JSON string.
    Failures are logged and swallowed.
    """
    try:
        with db_manager.get_session() as session:
            record = Alert(
                rule_name=alert["rule_name"],
                alert_type=alert["alert_type"],
                level=alert["level"],
                message=alert["message"],
                data=json.dumps(alert["data"], ensure_ascii=False),
                is_active=True,
                created_at=datetime.now(),
            )
            session.add(record)
            session.commit()
    except Exception as exc:
        logger.error(f"保存预警失败: {exc}")
def get_active_alerts(self) -> List[Dict[str, Any]]:
    """Return all active alerts from the database, newest first.

    Returns:
        One dict per alert with its stored ``data`` JSON decoded, or an
        empty list if the query fails (the error is logged).
    """
    try:
        with db_manager.get_session() as session:
            query = session.query(Alert).filter(Alert.is_active == True)
            rows = query.order_by(Alert.created_at.desc()).all()
            result = []
            for row in rows:
                result.append({
                    "id": row.id,
                    "rule_name": row.rule_name,
                    "alert_type": row.alert_type,
                    "level": row.level,
                    "message": row.message,
                    "created_at": row.created_at.isoformat(),
                    "data": json.loads(row.data) if row.data else {},
                })
            return result
    except Exception as exc:
        logger.error(f"获取活跃预警失败: {exc}")
        return []
def resolve_alert(self, alert_id: int) -> bool:
    """Mark the alert with the given id as resolved.

    Returns:
        True if the alert was found and updated. False if no such alert
        exists or the update failed (the error is logged).
    """
    try:
        with db_manager.get_session() as session:
            record = session.query(Alert).filter(Alert.id == alert_id).first()
            if not record:
                return False
            record.is_active = False
            record.resolved_at = datetime.now()
            session.commit()
            return True
    except Exception as exc:
        logger.error(f"解决预警失败: {exc}")
        return False
def get_alert_statistics(self) -> Dict[str, Any]:
    """Count stored alerts overall, by activity, by level and by type.

    Returns:
        A dict with ``total_alerts``, ``active_alerts``,
        ``level_distribution`` and ``type_distribution``, or an empty dict
        if a query fails (the error is logged).
    """
    try:
        with db_manager.get_session() as session:
            total = session.query(Alert).count()
            active = session.query(Alert).filter(Alert.is_active == True).count()
            # One count query per enum member.
            by_level = {
                level.value: session.query(Alert).filter(Alert.level == level.value).count()
                for level in AlertLevel
            }
            by_type = {
                kind.value: session.query(Alert).filter(Alert.alert_type == kind.value).count()
                for kind in AlertType
            }
            return {
                "total_alerts": total,
                "active_alerts": active,
                "level_distribution": by_level,
                "type_distribution": by_type,
            }
    except Exception as exc:
        logger.error(f"获取预警统计失败: {exc}")
        return {}
def add_custom_rule(self, rule: AlertRule) -> bool:
    """Register ``rule`` under its ``name``, replacing any rule with that key.

    Returns:
        True on success, False if registration raised (the error is logged).
    """
    try:
        key = rule.name
        self.rules[key] = rule
        logger.info(f"添加自定义规则: {rule.name}")
    except Exception as exc:
        logger.error(f"添加自定义规则失败: {exc}")
        return False
    return True
def update_rule(self, rule_name: str, **kwargs) -> bool:
    """Update attributes of an existing rule.

    Only keyword arguments naming an attribute the rule already has are
    applied; the others are ignored.

    Returns:
        True if the rule exists, False if it does not or the update raised
        (the error is logged).
    """
    try:
        if rule_name not in self.rules:
            return False
        target = self.rules[rule_name]
        for attr in kwargs:
            if not hasattr(target, attr):
                continue
            setattr(target, attr, kwargs[attr])
        logger.info(f"更新规则: {rule_name}")
        return True
    except Exception as exc:
        logger.error(f"更新规则失败: {exc}")
        return False
def delete_rule(self, rule_name: str) -> bool:
    """Remove the rule stored under ``rule_name``.

    Returns:
        True if the rule existed and was removed, False if it did not exist
        or removal raised (the error is logged).
    """
    try:
        if rule_name not in self.rules:
            return False
        del self.rules[rule_name]
        logger.info(f"删除规则: {rule_name}")
        return True
    except Exception as exc:
        logger.error(f"删除规则失败: {exc}")
        return False

View File

@@ -0,0 +1,300 @@
import logging
from typing import Dict, List, Any, Optional
from datetime import datetime, timedelta
import json
from collections import defaultdict
from ..core.database import db_manager
from ..core.models import WorkOrder, Conversation, Analytics, Alert, KnowledgeEntry
logger = logging.getLogger(__name__)
class AnalyticsManager:
"""分析统计管理器"""
def __init__(self):
self.alert_thresholds = {
"low_satisfaction": 0.6,
"high_resolution_time": 24, # 小时
"knowledge_hit_rate": 0.5,
"error_rate": 0.1
}
def generate_daily_analytics(self, date: Optional[datetime] = None) -> Dict[str, Any]:
    """Compute, store and return the analytics report for one calendar day.

    Args:
        date: The day to analyse. Defaults to today. A ``datetime`` is
            reduced to its date so the stored row, the "no data" message
            and the returned ``"date"`` always describe a whole day.

    Returns:
        The report dict on success; ``{"message": ...}`` when the day has
        no work orders; ``{"error": ...}`` if anything raised (the error is
        logged).

    Side effects:
        Adds an ``Analytics`` row and, via ``_check_alerts``, possibly
        ``Alert`` rows. Calling this twice for the same day adds a second
        ``Analytics`` row.
    """
    if date is None:
        date = datetime.now().date()
    elif isinstance(date, datetime):
        # Without this, the time part leaks into the message and the
        # returned "date" even though the query always spans the whole day.
        date = date.date()
    try:
        with db_manager.get_session() as session:
            # Work orders created on the requested day.
            start_time = datetime.combine(date, datetime.min.time())
            end_time = datetime.combine(date, datetime.max.time())
            work_orders = session.query(WorkOrder).filter(
                WorkOrder.created_at >= start_time,
                WorkOrder.created_at <= end_time
            ).all()
            if not work_orders:
                return {"message": f"{date} 没有工单数据"}
            # Basic counts.
            total_orders = len(work_orders)
            resolved_orders = len([wo for wo in work_orders if wo.status == "resolved"])
            # Average resolution time in hours, resolved orders only.
            resolution_times = [
                (wo.updated_at - wo.created_at).total_seconds() / 3600
                for wo in work_orders
                if wo.status == "resolved" and wo.updated_at
            ]
            avg_resolution_time = (
                sum(resolution_times) / len(resolution_times) if resolution_times else 0
            )
            # Average satisfaction (falsy scores, including 0, are excluded).
            satisfaction_scores = [
                wo.satisfaction_score for wo in work_orders if wo.satisfaction_score
            ]
            satisfaction_avg = (
                sum(satisfaction_scores) / len(satisfaction_scores)
                if satisfaction_scores else 0
            )
            # Knowledge-base hit rate over the day's conversations.
            conversations = session.query(Conversation).filter(
                Conversation.timestamp >= start_time,
                Conversation.timestamp <= end_time
            ).all()
            knowledge_hit_rate = self._calculate_knowledge_hit_rate(conversations)
            # Work orders per category.
            category_distribution = defaultdict(int)
            for wo in work_orders:
                category_distribution[wo.category] += 1
            # Persist the result.
            analytics = Analytics(
                date=start_time,
                total_orders=total_orders,
                resolved_orders=resolved_orders,
                avg_resolution_time=avg_resolution_time,
                satisfaction_avg=satisfaction_avg,
                knowledge_hit_rate=knowledge_hit_rate,
                category_distribution=json.dumps(dict(category_distribution))
            )
            session.add(analytics)
            session.commit()
            # Raise alerts for metrics past their thresholds.
            self._check_alerts(
                session,
                satisfaction_avg,
                avg_resolution_time,
                knowledge_hit_rate,
                total_orders
            )
            return {
                "date": date.isoformat(),
                "total_orders": total_orders,
                "resolved_orders": resolved_orders,
                "resolution_rate": resolved_orders / total_orders if total_orders > 0 else 0,
                "avg_resolution_time_hours": round(avg_resolution_time, 2),
                "satisfaction_avg": round(satisfaction_avg, 2),
                "knowledge_hit_rate": round(knowledge_hit_rate, 2),
                "category_distribution": dict(category_distribution)
            }
    except Exception as e:
        logger.error(f"生成每日分析报告失败: {e}")
        return {"error": f"生成失败: {str(e)}"}
def _calculate_knowledge_hit_rate(self, conversations: List[Conversation]) -> float:
    """Return the fraction of conversations that used knowledge entries.

    A conversation counts as a hit when ``knowledge_used`` is truthy and is
    not the string ``"[]"``. An empty input yields 0.0.
    """
    if not conversations:
        return 0.0
    hits = sum(
        1 for conv in conversations
        if conv.knowledge_used and conv.knowledge_used != "[]"
    )
    return hits / len(conversations)
def _check_alerts(
self,
session,
satisfaction_avg: float,
avg_resolution_time: float,
knowledge_hit_rate: float,
total_orders: int
):
"""检查预警条件"""
alerts = []
# 满意度预警
if satisfaction_avg < self.alert_thresholds["low_satisfaction"]:
alerts.append({
"type": "low_satisfaction",
"message": f"客户满意度较低: {satisfaction_avg:.2f}",
"severity": "high"
})
# 解决时间预警
if avg_resolution_time > self.alert_thresholds["high_resolution_time"]:
alerts.append({
"type": "high_resolution_time",
"message": f"平均解决时间过长: {avg_resolution_time:.2f}小时",
"severity": "medium"
})
# 知识库命中率预警
if knowledge_hit_rate < self.alert_thresholds["knowledge_hit_rate"]:
alerts.append({
"type": "low_knowledge_hit_rate",
"message": f"知识库命中率较低: {knowledge_hit_rate:.2f}",
"severity": "medium"
})
# 创建预警记录
for alert_data in alerts:
alert = Alert(
alert_type=alert_data["type"],
message=alert_data["message"],
severity=alert_data["severity"],
is_active=True,
created_at=datetime.now()
)
session.add(alert)
session.commit()
def get_analytics_summary(self, days: int = 7) -> Dict[str, Any]:
    """Summarise the stored daily analytics of the last ``days`` days.

    Averages are plain means over the daily rows (not weighted by order
    count).

    Returns:
        The summary dict with totals, averages and per-day trends;
        ``{"message": ...}`` when no rows exist in the window;
        ``{"error": ...}`` if anything raised (the error is logged).
    """
    try:
        with db_manager.get_session() as session:
            window_end = datetime.now()
            window_start = window_end - timedelta(days=days)
            rows = session.query(Analytics).filter(
                Analytics.date >= window_start,
                Analytics.date <= window_end
            ).order_by(Analytics.date).all()
            if not rows:
                return {"message": f"最近{days}天没有分析数据"}
            row_count = len(rows)
            # Totals and means across the daily rows.
            order_total = sum(row.total_orders for row in rows)
            resolved_total = sum(row.resolved_orders for row in rows)
            mean_resolution = sum(row.avg_resolution_time for row in rows) / row_count
            mean_satisfaction = sum(row.satisfaction_avg for row in rows) / row_count
            mean_hit_rate = sum(row.knowledge_hit_rate for row in rows) / row_count
            # Per-day series, oldest first.
            trends = {
                "orders_trend": [row.total_orders for row in rows],
                "satisfaction_trend": [row.satisfaction_avg for row in rows],
                "resolution_time_trend": [row.avg_resolution_time for row in rows],
            }
            if order_total > 0:
                resolution_rate = resolved_total / order_total
            else:
                resolution_rate = 0
            return {
                "period": f"{days}",
                "total_orders": order_total,
                "total_resolved": resolved_total,
                "resolution_rate": resolution_rate,
                "avg_resolution_time_hours": round(mean_resolution, 2),
                "avg_satisfaction": round(mean_satisfaction, 2),
                "avg_knowledge_hit_rate": round(mean_hit_rate, 2),
                "trends": trends,
            }
    except Exception as exc:
        logger.error(f"获取分析摘要失败: {exc}")
        return {"error": f"获取失败: {str(exc)}"}
def get_active_alerts(self) -> List[Dict[str, Any]]:
    """Return all active alerts from the database, newest first.

    Returns:
        One dict per alert (``id``, ``type``, ``message``, ``severity``,
        ``created_at``), or an empty list if the query fails (the error is
        logged).
    """
    try:
        with db_manager.get_session() as session:
            query = session.query(Alert).filter(Alert.is_active == True)
            rows = query.order_by(Alert.created_at.desc()).all()
            summaries = []
            for row in rows:
                summaries.append({
                    "id": row.id,
                    "type": row.alert_type,
                    "message": row.message,
                    "severity": row.severity,
                    "created_at": row.created_at.isoformat(),
                })
            return summaries
    except Exception as exc:
        logger.error(f"获取活跃预警失败: {exc}")
        return []
def resolve_alert(self, alert_id: int) -> bool:
    """Mark the alert with the given id as resolved.

    Returns:
        True if the alert was found and updated. False if no such alert
        exists or the update failed (the error is logged).
    """
    try:
        with db_manager.get_session() as session:
            record = session.query(Alert).filter(Alert.id == alert_id).first()
            if not record:
                return False
            record.is_active = False
            record.resolved_at = datetime.now()
            session.commit()
            return True
    except Exception as exc:
        logger.error(f"解决预警失败: {exc}")
        return False
def get_category_performance(self, days: int = 30) -> Dict[str, Any]:
    """Report per-category work-order performance for the last ``days`` days.

    Returns:
        A dict mapping each category to ``total_orders``,
        ``resolution_rate``, ``avg_satisfaction`` and
        ``avg_resolution_time_hours`` (all rounded to 2 places), or an
        empty dict if anything raised (the error is logged).
    """
    try:
        with db_manager.get_session() as session:
            window_end = datetime.now()
            window_start = window_end - timedelta(days=days)
            orders = session.query(WorkOrder).filter(
                WorkOrder.created_at >= window_start,
                WorkOrder.created_at <= window_end
            ).all()
            # Accumulate raw figures per category, in first-seen order.
            buckets = {}
            for order in orders:
                bucket = buckets.setdefault(order.category, {
                    "total": 0,
                    "resolved": 0,
                    "satisfaction_scores": [],
                    "resolution_times": [],
                })
                bucket["total"] += 1
                is_resolved = order.status == "resolved"
                if is_resolved:
                    bucket["resolved"] += 1
                if order.satisfaction_score:
                    bucket["satisfaction_scores"].append(order.satisfaction_score)
                if is_resolved and order.updated_at:
                    hours = (order.updated_at - order.created_at).total_seconds() / 3600
                    bucket["resolution_times"].append(hours)
            # Turn the raw figures into rates and averages.
            performance = {}
            for category, bucket in buckets.items():
                total = bucket["total"]
                scores = bucket["satisfaction_scores"]
                times = bucket["resolution_times"]
                resolution_rate = bucket["resolved"] / total if total > 0 else 0
                avg_satisfaction = sum(scores) / len(scores) if scores else 0
                avg_resolution_time = sum(times) / len(times) if times else 0
                performance[category] = {
                    "total_orders": total,
                    "resolution_rate": round(resolution_rate, 2),
                    "avg_satisfaction": round(avg_satisfaction, 2),
                    "avg_resolution_time_hours": round(avg_resolution_time, 2),
                }
            return performance
    except Exception as exc:
        logger.error(f"获取类别性能分析失败: {exc}")
        return {}

View File

@@ -0,0 +1,261 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TSP助手监控服务
实时监控系统状态,执行预警检查
"""
import logging
import threading
import time
from typing import Dict, Any, List
from datetime import datetime, timedelta
from .alert_system import AlertSystem, AlertRule, AlertLevel, AlertType
logger = logging.getLogger(__name__)
class MonitorService:
"""监控服务"""
def __init__(self):
    """Create the alert system and start in the stopped state."""
    self.alert_system = AlertSystem()
    self.is_running = False
    self.monitor_thread = None
    # Seconds between two alert-check passes.
    self.check_interval = 60
def start(self):
    """Start the monitoring loop in a daemon thread.

    Does nothing except log a warning if the service is already running.
    """
    if self.is_running:
        logger.warning("监控服务已在运行")
        return
    self.is_running = True
    worker = threading.Thread(target=self._monitor_loop, daemon=True)
    self.monitor_thread = worker
    worker.start()
    logger.info("监控服务已启动")
def stop(self):
    """Ask the monitoring loop to stop and wait up to 5 seconds for it."""
    self.is_running = False
    worker = self.monitor_thread
    if worker:
        worker.join(timeout=5)
    logger.info("监控服务已停止")
def _monitor_loop(self):
"""监控循环"""
while self.is_running:
try:
# 执行预警检查
triggered_alerts = self.alert_system.check_all_rules()
if triggered_alerts:
logger.info(f"触发 {len(triggered_alerts)} 个预警")
for alert in triggered_alerts:
self._handle_alert(alert)
# 等待下次检查
time.sleep(self.check_interval)
except Exception as e:
logger.error(f"监控循环异常: {e}")
time.sleep(10) # 异常时等待10秒再继续
def _handle_alert(self, alert: Dict[str, Any]):
    """Log a triggered alert and dispatch it to the handler for its level.

    Levels other than ``critical``, ``error`` and ``warning`` go to the
    info handler. Any failure is logged and swallowed.
    """
    try:
        # Record the alert.
        logger.warning(f"预警触发: {alert['message']}")
        # Pick the handler by level.
        handlers = {
            'critical': self._handle_critical_alert,
            'error': self._handle_error_alert,
            'warning': self._handle_warning_alert,
        }
        handler = handlers.get(alert['level'], self._handle_info_alert)
        handler(alert)
    except Exception as exc:
        logger.error(f"处理预警失败: {exc}")
def _handle_critical_alert(self, alert: Dict[str, Any]):
    """Handle a critical alert: notify, log, then try automatic recovery."""
    # Urgent notification.
    self._send_notification(alert, "紧急")
    text = alert['message']
    logger.critical(f"严重预警: {text}")
    # Attempt automatic recovery.
    self._attempt_auto_recovery(alert)
def _handle_error_alert(self, alert: Dict[str, Any]):
    """Handle an error-level alert: notify and log at error level."""
    self._send_notification(alert, "错误")
    text = alert['message']
    logger.error(f"错误预警: {text}")
def _handle_warning_alert(self, alert: Dict[str, Any]):
    """Handle a warning-level alert: notify and log at warning level."""
    self._send_notification(alert, "警告")
    text = alert['message']
    logger.warning(f"警告预警: {text}")
def _handle_info_alert(self, alert: Dict[str, Any]):
    """Handle an info-level alert: log only, no notification."""
    text = alert['message']
    logger.info(f"信息预警: {text}")
def _send_notification(self, alert: Dict[str, Any], level: str):
    """Build the notification payload for an alert and log it.

    Nothing is delivered yet; this is where e-mail, SMS, DingTalk or
    similar channels can be integrated.
    """
    notification = dict(
        level=level,
        message=alert['message'],
        timestamp=alert['timestamp'],
        rule_name=alert['rule_name'],
    )
    # Record the notification.
    logger.info(f"发送通知: {notification}")
    # TODO: implement actual delivery (e-mail, SMS, DingTalk message, ...).
def _attempt_auto_recovery(self, alert: Dict[str, Any]):
"""尝试自动恢复"""
try:
rule_name = alert['rule_name']
if rule_name == "内存使用预警":
# 尝试清理内存
self._cleanup_memory()
elif rule_name == "错误率预警":
# 尝试重启相关服务
self._restart_services()
elif rule_name == "响应时间预警":
# 尝试优化性能
self._optimize_performance()
except Exception as e:
logger.error(f"自动恢复失败: {e}")
def _cleanup_memory(self):
    """Run a garbage-collection pass and log it; failures are logged."""
    try:
        import gc
        gc.collect()
    except Exception as exc:
        logger.error(f"内存清理失败: {exc}")
    else:
        try:
            logger.info("执行内存清理")
        except Exception as exc:
            logger.error(f"内存清理失败: {exc}")
def _restart_services(self):
    """Placeholder for restarting services; currently only logs the attempt."""
    try:
        # Service-restart logic can be implemented here.
        logger.info("尝试重启服务")
    except Exception as exc:
        logger.error(f"重启服务失败: {exc}")
def _optimize_performance(self):
    """Placeholder for performance tuning; currently only logs the attempt."""
    try:
        # Performance-optimization logic can be implemented here.
        logger.info("尝试优化性能")
    except Exception as exc:
        logger.error(f"性能优化失败: {exc}")
def get_system_health(self) -> Dict[str, Any]:
    """Summarise system health from the currently active alerts.

    Returns:
        A dict with ``health_score``, ``status``, ``active_alerts`` (a
        count), ``alert_statistics``, ``monitor_status`` and ``last_check``
        (the time of this call), or ``{"error": ...}`` if anything raised
        (the error is logged).
    """
    try:
        current_alerts = self.alert_system.get_active_alerts()
        statistics = self.alert_system.get_alert_statistics()
        score = self._calculate_health_score(current_alerts, statistics)
        if self.is_running:
            monitor_status = "running"
        else:
            monitor_status = "stopped"
        return {
            "health_score": score,
            "status": self._get_health_status(score),
            "active_alerts": len(current_alerts),
            "alert_statistics": statistics,
            "monitor_status": monitor_status,
            "last_check": datetime.now().isoformat(),
        }
    except Exception as exc:
        logger.error(f"获取系统健康状态失败: {exc}")
        return {"error": str(exc)}
def _calculate_health_score(self, active_alerts: List[Dict[str, Any]], alert_stats: Dict[str, Any]) -> float:
"""计算健康分数"""
try:
base_score = 100.0
# 根据活跃预警扣分
for alert in active_alerts:
if alert['level'] == 'critical':
base_score -= 20
elif alert['level'] == 'error':
base_score -= 10
elif alert['level'] == 'warning':
base_score -= 5
else:
base_score -= 1
# 确保分数在0-100之间
return max(0, min(100, base_score))
except Exception as e:
logger.error(f"计算健康分数失败: {e}")
return 50.0
def _get_health_status(self, health_score: float) -> str:
"""获取健康状态"""
if health_score >= 90:
return "excellent"
elif health_score >= 70:
return "good"
elif health_score >= 50:
return "fair"
elif health_score >= 30:
return "poor"
else:
return "critical"
def add_custom_rule(self, rule: AlertRule) -> bool:
    """Register a custom rule with the underlying alert system."""
    added = self.alert_system.add_custom_rule(rule)
    return added
def update_rule(self, rule_name: str, **kwargs) -> bool:
    """Forward a rule update to the underlying alert system."""
    updated = self.alert_system.update_rule(rule_name, **kwargs)
    return updated
def delete_rule(self, rule_name: str) -> bool:
    """Forward a rule deletion to the underlying alert system."""
    removed = self.alert_system.delete_rule(rule_name)
    return removed
def get_rules(self) -> Dict[str, Any]:
    """Return a plain-dict view of every rule, keyed by rule id.

    Enum fields are reduced to their string values. ``condition`` and
    ``last_check`` are not included.
    """
    summary = {}
    for rule_id, rule in self.alert_system.rules.items():
        summary[rule_id] = {
            "name": rule.name,
            "description": rule.description,
            "alert_type": rule.alert_type.value,
            "level": rule.level.value,
            "threshold": rule.threshold,
            "enabled": rule.enabled,
            "check_interval": rule.check_interval,
            "cooldown": rule.cooldown,
        }
    return summary