feat: 四层架构数据分析 Agent

- Layer 1 Planner: 意图规划,将问题转为结构化分析计划
- Layer 2 Explorer: 自适应探索循环,多轮迭代动态生成 SQL
- Layer 3 InsightEngine: 异常检测 + 主动洞察
- Layer 4 ContextManager: 多轮对话上下文记忆

安全设计:AI 只看 Schema + 聚合结果,不接触原始数据。
支持任意 OpenAI 兼容 API(OpenAI / Ollama / DeepSeek / vLLM)
This commit is contained in:
OpenClaw Agent
2026-03-19 12:21:04 +08:00
commit 96927a789d
14 changed files with 1683 additions and 0 deletions

239
insights.py Normal file
View File

@@ -0,0 +1,239 @@
"""
Layer 3: 洞察引擎
对探索结果进行异常检测 + 主动洞察,输出用户没问但值得知道的事。
"""
import json
import re
import statistics
from typing import Any
import openai
from config import LLM_CONFIG
from explorer import ExplorationStep
INSIGHT_SYSTEM = """你是一个数据洞察专家。你会收到探索过程的所有结果,你需要:
1. 从结果中发现异常和有趣现象
2. 对比不同维度,找出差异
3. 输出用户可能没问但值得知道的洞察
## 输出格式(严格 JSON 数组)
```json
[
{
"type": "outlier" | "trend" | "distribution" | "correlation" | "recommendation",
"severity": "high" | "medium" | "low",
"title": "简短标题",
"detail": "详细描述,包含具体数字",
"evidence": "支撑这个洞察的数据来源"
}
]
```
## 洞察类型
- outlier: 离群值(某个分组异常高/低)
- trend: 趋势发现(增长/下降、季节性)
- distribution: 分布异常(不均衡、集中度过高)
- correlation: 关联发现(两个维度的意外关联)
- recommendation: 行动建议(基于数据的建议)
## 分析原则
- 每个洞察必须有具体数字支撑
- 用对比来说话A 比 B 高 X%
- 关注异常,不描述平淡的事实
- 如果没有异常,返回空数组"""
class Insight:
    """One insight item parsed from the LLM's JSON output.

    Missing fields fall back to safe defaults so a partially-formed dict
    never raises.
    """

    # Marker tables keyed by insight type / severity (fallbacks: 📌 / "").
    _TYPE_EMOJI = {
        "outlier": "⚠️",
        "trend": "📈",
        "distribution": "📊",
        "correlation": "🔗",
        "recommendation": "💡",
    }
    _SEVERITY_EMOJI = {"high": "🔴", "medium": "🟡", "low": "🟢"}

    def __init__(self, data: dict):
        """Populate fields from a raw dict produced by the LLM."""
        get = data.get
        self.type = get("type", "unknown")
        self.severity = get("severity", "low")
        self.title = get("title", "")
        self.detail = get("detail", "")
        self.evidence = get("evidence", "")

    @property
    def emoji(self) -> str:
        """Marker for the insight type."""
        return self._TYPE_EMOJI.get(self.type, "📌")

    @property
    def severity_emoji(self) -> str:
        """Marker for the severity level (empty string when unrecognized)."""
        return self._SEVERITY_EMOJI.get(self.severity, "")

    def __str__(self):
        return f"{self.emoji} {self.severity_emoji} {self.title}: {self.detail}"
class InsightEngine:
    """Insight engine: detects anomalies in exploration results via an LLM
    and proactively surfaces them."""

    def __init__(self):
        # Works with any OpenAI-compatible endpoint configured in LLM_CONFIG.
        self.client = openai.OpenAI(
            api_key=LLM_CONFIG["api_key"],
            base_url=LLM_CONFIG["base_url"],
        )
        self.model = LLM_CONFIG["model"]

    def analyze(self, steps: list[ExplorationStep], question: str) -> list[Insight]:
        """Run insight analysis over exploration results.

        Args:
            steps: the exploration steps to analyze.
            question: the user's original question.

        Returns:
            A list of Insight objects (empty when there are no steps or the
            model reports nothing noteworthy).
        """
        if not steps:
            return []
        # Hand the model the full exploration transcript plus the question.
        user_prompt = (
            f"## 用户原始问题\n{question}\n\n"
            f"## 探索历史\n{self._build_history(steps)}\n\n"
            f"请分析以上数据,输出异常和洞察。"
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": INSIGHT_SYSTEM},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.3,
            max_tokens=2048,
        )
        raw = response.choices[0].message.content.strip()
        return [Insight(item) for item in self._extract_json_array(raw)]

    def format_insights(self, insights: list[Insight]) -> str:
        """Render insights as readable markdown, highest severity first."""
        if not insights:
            return ""
        rank = {"high": 0, "medium": 1, "low": 2}  # unknown severities sort last
        ordered = sorted(insights, key=lambda item: rank.get(item.severity, 9))
        out = ["## 💡 主动洞察", "", "_以下是你没问但数据告诉我们的事_\n"]
        for item in ordered:
            out += [
                f"**{item.emoji} {item.title}** {item.severity_emoji}",
                f"  {item.detail}",
                f"  _数据来源: {item.evidence}_",
                "",
            ]
        return "\n".join(out)

    def _build_history(self, steps: list[ExplorationStep]) -> str:
        """Serialize the exploration steps into a text transcript for the LLM."""
        sections = []
        for step in steps:
            if step.action == "done":
                sections.append(f"### 结束\n{step.reasoning}")
            elif step.success:
                sections.append(
                    f"### 第 {step.round_num} 轮:{step.purpose}\n"
                    f"思考: {step.reasoning}\n"
                    f"SQL: `{step.sql}`\n"
                    f"结果 ({step.row_count} 行):\n"
                    f"列: {step.columns}\n"
                    f"数据: {json.dumps(step.rows, ensure_ascii=False)}"
                )
            else:
                sections.append(
                    f"### 第 {step.round_num} 轮:{step.purpose}\n"
                    f"SQL: `{step.sql}`\n"
                    f"结果: 执行失败 - {step.error}"
                )
        return "\n\n".join(sections)

    def _extract_json_array(self, text: str) -> list[dict]:
        """Pull a JSON array out of raw LLM output, tolerating code fences.

        Tries, in order: the whole text, a ```json fence, a bare ``` fence,
        and finally the outermost [...] span. Returns [] when nothing parses
        to a list.
        """
        candidates = [text]
        for fence in (r'```json\s*\n(.*?)\n```', r'```\s*\n(.*?)\n```'):
            fenced = re.search(fence, text, re.DOTALL)
            if fenced:
                candidates.append(fenced.group(1))
        bracketed = re.search(r'\[.*\]', text, re.DOTALL)
        if bracketed:
            candidates.append(bracketed.group())
        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, list):
                return parsed
        return []
# ── Rule-based quick anomaly detection (no LLM required) ─────────────────
def quick_detect(steps: "list[ExplorationStep]") -> list[str]:
    """Rule-based anomaly scan over exploration results, without calling the LLM.

    Detects over-concentrated percentage columns (one group > 50%) and
    heavily skewed count columns (max > 3x the mean).

    Args:
        steps: exploration steps; failed steps and steps with no rows are
            skipped. (Annotation is a string so the module can be inspected
            without evaluating ExplorationStep.)

    Returns:
        A list of human-readable alert strings; empty when nothing stands out.

    Fixes over the previous version:
    - The count-skew check is row-invariant but ran inside the per-row loop,
      appending the identical alert once per row; it now runs once per column.
    - The mean can be 0 even when max > 0 (e.g. negative values); the division
      is now guarded against that.
    """
    pct_columns = {"pct", "percent", "percentage", "占比"}
    count_columns = {"count", "cnt", "n", "total", "order_count"}
    alerts: list[str] = []
    for step in steps:
        if not step.success or not step.rows:
            continue
        for col in step.columns:
            lowered = col.lower()
            if lowered in pct_columns:
                # Per-row check: any single group holding > 50% is too concentrated.
                for row in step.rows:
                    val = row.get(col)
                    if isinstance(val, (int, float)) and val > 50:
                        alerts.append(
                            f"⚠️ {step.purpose} 中某个分组占比 {val}%,超过 50%,集中度过高"
                        )
            if lowered in count_columns:
                # Column-level check, done exactly once per column.
                count_vals = [
                    r.get(col) for r in step.rows
                    if isinstance(r.get(col), (int, float))
                ]
                if len(count_vals) >= 3 and max(count_vals) > 0:
                    mean = sum(count_vals) / len(count_vals)
                    if mean > 0:  # guard: avoid division by a zero/negative mean
                        ratio = max(count_vals) / mean
                        if ratio > 3:
                            alerts.append(
                                f"⚠️ {step.purpose} 中最大值是均值的 {ratio:.1f} 倍,分布极不均衡"
                            )
    return alerts