Files
assist/src/agent/auto_monitor.py
zhaojie c3560b43fd docs: update README and CLAUDE.md to v2.2.0
- Added documentation for audit tracking (IP address, invocation method).
- Updated database model descriptions for enhanced WorkOrder and Conversation fields.
- Documented the new UnifiedConfig system.
- Reflected enhanced logging transparency for knowledge base parsing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 00:08:09 +08:00

359 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
自动监控服务
实现Agent的主动调用功能
"""
import asyncio跳过系统检查直接启动服务...
import logging
import threading
import time
from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional
import json
logger = logging.getLogger(__name__)
class AutoMonitorService:
"""自动监控服务"""
def __init__(self, agent_assistant):
self.agent_assistant = agent_assistant
self.is_running = False
self.monitor_thread = None
self.check_interval = 300 # 5分钟检查一次
self.last_check_time = None
self.monitoring_stats = {
"total_checks": 0,
"proactive_actions": 0,
"last_action_time": None,
"error_count": 0
}
def start_auto_monitoring(self) -> bool:
"""启动自动监控"""
try:
if self.is_running:
logger.warning("自动监控已在运行中")
return True
self.is_running = True
self.monitor_thread = threading.Thread(target=self._monitoring_loop, daemon=True)
self.monitor_thread.start()
logger.info("自动监控服务已启动")
return True
except Exception as e:
logger.error(f"启动自动监控失败: {e}")
return False
def stop_auto_monitoring(self) -> bool:
"""停止自动监控"""
try:
self.is_running = False
if self.monitor_thread and self.monitor_thread.is_alive():
self.monitor_thread.join(timeout=5)
logger.info("自动监控服务已停止")
return True
except Exception as e:
logger.error(f"停止自动监控失败: {e}")
return False
def _monitoring_loop(self):
"""监控循环"""
logger.info("自动监控循环已启动")
while self.is_running:
try:
# 执行监控检查
self._perform_monitoring_check()
# 等待下次检查
time.sleep(self.check_interval)
except Exception as e:
logger.error(f"监控循环错误: {e}")
self.monitoring_stats["error_count"] += 1
time.sleep(60) # 出错后等待1分钟
def _perform_monitoring_check(self):
"""执行监控检查"""
try:
self.monitoring_stats["total_checks"] += 1
self.last_check_time = datetime.now()
logger.info(f"执行第 {self.monitoring_stats['total_checks']} 次自动监控检查")
# 1. 检查系统健康状态
self._check_system_health()
# 2. 检查预警状态
self._check_alert_status()
# 3. 检查工单积压
self._check_workorder_backlog()
# 4. 检查知识库质量
self._check_knowledge_quality()
# 5. 检查用户满意度
self._check_user_satisfaction()
# 6. 检查系统性能
self._check_system_performance()
except Exception as e:
logger.error(f"执行监控检查失败: {e}")
self.monitoring_stats["error_count"] += 1
def _check_system_health(self):
"""检查系统健康状态"""
try:
health = self.agent_assistant.get_system_health()
health_score = health.get("health_score", 1.0)
if health_score < 0.7:
self._trigger_proactive_action({
"type": "system_health_warning",
"priority": "high",
"description": f"系统健康分数较低: {health_score:.2f}",
"action": "建议立即检查系统状态",
"data": health
})
except Exception as e:
logger.error(f"检查系统健康状态失败: {e}")
def _check_alert_status(self):
"""检查预警状态"""
try:
alerts = self.agent_assistant.get_active_alerts()
alert_count = len(alerts) if isinstance(alerts, list) else 0
if alert_count > 5:
self._trigger_proactive_action({
"type": "alert_overflow",
"priority": "high",
"description": f"活跃预警数量过多: {alert_count}",
"action": "建议立即处理预警",
"data": {"alert_count": alert_count}
})
except Exception as e:
logger.error(f"检查预警状态失败: {e}")
def _check_workorder_backlog(self):
"""检查工单积压"""
try:
# 获取工单统计
workorders = self.agent_assistant.get_workorders()
if isinstance(workorders, list):
open_count = len([w for w in workorders if w.get("status") == "open"])
in_progress_count = len([w for w in workorders if w.get("status") == "in_progress"])
total_pending = open_count + in_progress_count
if total_pending > 10:
self._trigger_proactive_action({
"type": "workorder_backlog",
"priority": "medium",
"description": f"待处理工单过多: {total_pending}",
"action": "建议增加处理人员或优化流程",
"data": {
"open_count": open_count,
"in_progress_count": in_progress_count,
"total_pending": total_pending
}
})
except Exception as e:
logger.error(f"检查工单积压失败: {e}")
def _check_knowledge_quality(self):
"""检查知识库质量"""
try:
stats = self.agent_assistant.knowledge_manager.get_knowledge_stats()
avg_confidence = stats.get("average_confidence", 0.8)
total_entries = stats.get("total_entries", 0)
if avg_confidence < 0.6:
self._trigger_proactive_action({
"type": "knowledge_quality_low",
"priority": "medium",
"description": f"知识库平均置信度较低: {avg_confidence:.2f}",
"action": "建议更新和优化知识库内容",
"data": {"avg_confidence": avg_confidence, "total_entries": total_entries}
})
except Exception as e:
logger.error(f"检查知识库质量失败: {e}")
def _check_user_satisfaction(self):
"""检查用户满意度"""
try:
# 模拟检查用户满意度
# 这里可以从数据库或API获取真实的满意度数据
satisfaction_score = 0.75 # 模拟数据
if satisfaction_score < 0.7:
self._trigger_proactive_action({
"type": "low_satisfaction",
"priority": "high",
"description": f"用户满意度较低: {satisfaction_score:.2f}",
"action": "建议分析低满意度原因并改进服务",
"data": {"satisfaction_score": satisfaction_score}
})
except Exception as e:
logger.error(f"检查用户满意度失败: {e}")
def _check_system_performance(self):
"""检查系统性能"""
try:
# 模拟检查系统性能指标
response_time = 1.2 # 模拟响应时间(秒)
error_rate = 0.02 # 模拟错误率
if response_time > 2.0:
self._trigger_proactive_action({
"type": "slow_response",
"priority": "medium",
"description": f"系统响应时间过慢: {response_time:.2f}",
"action": "建议优化系统性能",
"data": {"response_time": response_time}
})
if error_rate > 0.05:
self._trigger_proactive_action({
"type": "high_error_rate",
"priority": "high",
"description": f"系统错误率过高: {error_rate:.2%}",
"action": "建议立即检查系统错误",
"data": {"error_rate": error_rate}
})
except Exception as e:
logger.error(f"检查系统性能失败: {e}")
def _trigger_proactive_action(self, action_data: Dict[str, Any]):
"""触发主动行动"""
try:
self.monitoring_stats["proactive_actions"] += 1
self.monitoring_stats["last_action_time"] = datetime.now()
logger.info(f"触发主动行动: {action_data['type']} - {action_data['description']}")
# 记录主动行动
self._log_proactive_action(action_data)
# 根据行动类型执行相应操作
self._execute_proactive_action(action_data)
except Exception as e:
logger.error(f"触发主动行动失败: {e}")
def _log_proactive_action(self, action_data: Dict[str, Any]):
"""记录主动行动"""
try:
log_entry = {
"timestamp": datetime.now().isoformat(),
"action_type": action_data["type"],
"priority": action_data["priority"],
"description": action_data["description"],
"action": action_data["action"],
"data": action_data.get("data", {})
}
# 这里可以将日志保存到数据库或文件
logger.info(f"主动行动记录: {json.dumps(log_entry, ensure_ascii=False)}")
except Exception as e:
logger.error(f"记录主动行动失败: {e}")
def _execute_proactive_action(self, action_data: Dict[str, Any]):
"""执行主动行动"""
try:
action_type = action_data["type"]
if action_type == "system_health_warning":
self._handle_system_health_warning(action_data)
elif action_type == "alert_overflow":
self._handle_alert_overflow(action_data)
elif action_type == "workorder_backlog":
self._handle_workorder_backlog(action_data)
elif action_type == "knowledge_quality_low":
self._handle_knowledge_quality_low(action_data)
elif action_type == "low_satisfaction":
self._handle_low_satisfaction(action_data)
elif action_type == "slow_response":
self._handle_slow_response(action_data)
elif action_type == "high_error_rate":
self._handle_high_error_rate(action_data)
else:
logger.warning(f"未知的主动行动类型: {action_type}")
except Exception as e:
logger.error(f"执行主动行动失败: {e}")
def _handle_system_health_warning(self, action_data: Dict[str, Any]):
"""处理系统健康警告"""
logger.info("处理系统健康警告")
# 这里可以实现具体的处理逻辑,如发送通知、重启服务等
def _handle_alert_overflow(self, action_data: Dict[str, Any]):
"""处理预警溢出"""
logger.info("处理预警溢出")
# 这里可以实现具体的处理逻辑,如自动处理预警、发送通知等
def _handle_workorder_backlog(self, action_data: Dict[str, Any]):
"""处理工单积压"""
logger.info("处理工单积压")
# 这里可以实现具体的处理逻辑,如自动分配工单、发送提醒等
def _handle_knowledge_quality_low(self, action_data: Dict[str, Any]):
"""处理知识库质量低"""
logger.info("处理知识库质量低")
# 这里可以实现具体的处理逻辑,如自动更新知识库、发送提醒等
def _handle_low_satisfaction(self, action_data: Dict[str, Any]):
"""处理低满意度"""
logger.info("处理低满意度")
# 这里可以实现具体的处理逻辑,如分析原因、发送通知等
def _handle_slow_response(self, action_data: Dict[str, Any]):
"""处理响应慢"""
logger.info("处理响应慢")
# 这里可以实现具体的处理逻辑,如优化配置、发送通知等
def _handle_high_error_rate(self, action_data: Dict[str, Any]):
"""处理高错误率"""
logger.info("处理高错误率")
# 这里可以实现具体的处理逻辑,如检查日志、发送通知等
def get_monitoring_status(self) -> Dict[str, Any]:
"""获取监控状态"""
return {
"is_running": self.is_running,
"check_interval": self.check_interval,
"last_check_time": self.last_check_time.isoformat() if self.last_check_time else None,
"stats": self.monitoring_stats
}
def update_check_interval(self, interval: int) -> bool:
"""更新检查间隔"""
try:
if interval < 60: # 最少1分钟
logger.warning("检查间隔不能少于60秒")
return False
self.check_interval = interval
logger.info(f"检查间隔已更新为 {interval}")
return True
except Exception as e:
logger.error(f"更新检查间隔失败: {e}")
return False