Files
iov_ana/insights.py
OpenClaw Agent 96927a789d feat: 四层架构数据分析 Agent
- Layer 1 Planner: 意图规划,将问题转为结构化分析计划
- Layer 2 Explorer: 自适应探索循环,多轮迭代动态生成 SQL
- Layer 3 InsightEngine: 异常检测 + 主动洞察
- Layer 4 ContextManager: 多轮对话上下文记忆

安全设计:AI 只看 Schema + 聚合结果,不接触原始数据。
支持任意 OpenAI 兼容 API(OpenAI / Ollama / DeepSeek / vLLM)
2026-03-19 12:21:04 +08:00

240 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Layer 3: 洞察引擎
对探索结果进行异常检测 + 主动洞察,输出用户没问但值得知道的事。
"""
import json
import re
import statistics
from typing import Any
import openai
from config import LLM_CONFIG
from explorer import ExplorationStep
# System prompt for the insight LLM call (consumed by InsightEngine.analyze).
# It instructs the model to return a strict JSON array of insight objects;
# the parser (_extract_json_array) tolerates fenced code blocks around it.
# NOTE(review): "用对比来说话A 比 B 高 X%" below looks like it is missing a
# colon after 说话 — confirm with the prompt author before changing the string.
INSIGHT_SYSTEM = """你是一个数据洞察专家。你会收到探索过程的所有结果,你需要:
1. 从结果中发现异常和有趣现象
2. 对比不同维度,找出差异
3. 输出用户可能没问但值得知道的洞察
## 输出格式(严格 JSON 数组)
```json
[
{
"type": "outlier" | "trend" | "distribution" | "correlation" | "recommendation",
"severity": "high" | "medium" | "low",
"title": "简短标题",
"detail": "详细描述,包含具体数字",
"evidence": "支撑这个洞察的数据来源"
}
]
```
## 洞察类型
- outlier: 离群值(某个分组异常高/低)
- trend: 趋势发现(增长/下降、季节性)
- distribution: 分布异常(不均衡、集中度过高)
- correlation: 关联发现(两个维度的意外关联)
- recommendation: 行动建议(基于数据的建议)
## 分析原则
- 每个洞察必须有具体数字支撑
- 用对比来说话A 比 B 高 X%
- 关注异常,不描述平淡的事实
- 如果没有异常,返回空数组"""
class Insight:
"""单条洞察"""
def __init__(self, data: dict):
self.type = data.get("type", "unknown")
self.severity = data.get("severity", "low")
self.title = data.get("title", "")
self.detail = data.get("detail", "")
self.evidence = data.get("evidence", "")
@property
def emoji(self) -> str:
return {
"outlier": "⚠️",
"trend": "📈",
"distribution": "📊",
"correlation": "🔗",
"recommendation": "💡",
}.get(self.type, "📌")
@property
def severity_emoji(self) -> str:
return {"high": "🔴", "medium": "🟡", "low": "🟢"}.get(self.severity, "")
def __str__(self):
return f"{self.emoji} {self.severity_emoji} {self.title}: {self.detail}"
class InsightEngine:
    """Insight engine: automatic anomaly detection + proactive reporting.

    Sends the aggregated exploration results to an OpenAI-compatible chat
    endpoint (OpenAI / Ollama / DeepSeek / vLLM per LLM_CONFIG) and parses
    the reply into Insight objects. Project-type annotations are quoted so
    they are evaluated lazily.
    """

    def __init__(self):
        # Any OpenAI-compatible backend works; endpoint comes from config.
        self.client = openai.OpenAI(
            api_key=LLM_CONFIG["api_key"],
            base_url=LLM_CONFIG["base_url"],
        )
        self.model = LLM_CONFIG["model"]

    def analyze(self, steps: "list[ExplorationStep]", question: str) -> "list[Insight]":
        """Run insight analysis over the exploration results.

        Args:
            steps: Exploration step list.
            question: Original user question.

        Returns:
            List of Insight objects; empty when there are no steps or the
            model output contains no parseable JSON array.
        """
        if not steps:
            return []
        # Build the exploration-history text shown to the model.
        history = self._build_history(steps)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": INSIGHT_SYSTEM},
                {
                    "role": "user",
                    "content": (
                        f"## 用户原始问题\n{question}\n\n"
                        f"## 探索历史\n{history}\n\n"
                        f"请分析以上数据,输出异常和洞察。"
                    ),
                },
            ],
            temperature=0.3,
            max_tokens=2048,
        )
        # message.content is Optional in the OpenAI SDK (e.g. refusal or
        # tool-call-only replies) — guard against None before .strip().
        content = (response.choices[0].message.content or "").strip()
        insights_data = self._extract_json_array(content)
        return [Insight(d) for d in insights_data]

    def format_insights(self, insights: "list[Insight]") -> str:
        """Format insights as readable markdown; empty string when none."""
        if not insights:
            return ""
        # Order by severity: high -> medium -> low; unknown severities last.
        severity_order = {"high": 0, "medium": 1, "low": 2}
        sorted_insights = sorted(insights, key=lambda i: severity_order.get(i.severity, 9))
        lines = ["## 💡 主动洞察", ""]
        lines.append("_以下是你没问但数据告诉我们的事_\n")
        for insight in sorted_insights:
            lines.append(f"**{insight.emoji} {insight.title}** {insight.severity_emoji}")
            lines.append(f" {insight.detail}")
            lines.append(f" _数据来源: {insight.evidence}_")
            lines.append("")
        return "\n".join(lines)

    def _build_history(self, steps: "list[ExplorationStep]") -> str:
        """Render the exploration steps as prompt text for the LLM."""
        parts = []
        for step in steps:
            if step.action == "done":
                parts.append(f"### 结束\n{step.reasoning}")
                continue
            if step.success:
                parts.append(
                    f"### 第 {step.round_num} 轮:{step.purpose}\n"
                    f"思考: {step.reasoning}\n"
                    f"SQL: `{step.sql}`\n"
                    f"结果 ({step.row_count} 行):\n"
                    f"列: {step.columns}\n"
                    f"数据: {json.dumps(step.rows, ensure_ascii=False)}"
                )
            else:
                parts.append(
                    f"### 第 {step.round_num} 轮:{step.purpose}\n"
                    f"SQL: `{step.sql}`\n"
                    f"结果: 执行失败 - {step.error}"
                )
        return "\n\n".join(parts)

    def _extract_json_array(self, text: str) -> list[dict]:
        """Extract a JSON array from LLM output.

        Tries, in order: the whole text as JSON, a ```json fenced block,
        any fenced block, then the outermost [...] span. Returns [] when
        nothing parses to a list.
        """
        try:
            result = json.loads(text)
            if isinstance(result, list):
                return result
        except json.JSONDecodeError:
            pass
        for pattern in [r'```json\s*\n(.*?)\n```', r'```\s*\n(.*?)\n```']:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                try:
                    result = json.loads(match.group(1))
                    if isinstance(result, list):
                        return result
                except json.JSONDecodeError:
                    continue
        # Last resort: grab the outermost [...] span.
        match = re.search(r'\[.*\]', text, re.DOTALL)
        if match:
            try:
                return json.loads(match.group())
            except json.JSONDecodeError:
                pass
        return []
# ── 基于规则的快速异常检测(无需 LLM)────────────────
def quick_detect(steps: "list[ExplorationStep]") -> list[str]:
    """Rule-based quick anomaly detection, without calling the LLM.

    Scans successful exploration steps for two kinds of anomalies:
    - percentage-like columns where a single group exceeds 50% (over-concentration)
    - count-like columns where the max is more than 3x the mean (imbalance)

    Args:
        steps: Exploration steps to scan; failed or empty steps are skipped.

    Returns:
        A list of human-readable alert strings (empty when nothing stands out).
    """
    alerts: list[str] = []
    pct_cols = ("pct", "percent", "percentage", "占比")
    count_cols = ("count", "cnt", "n", "total", "order_count")
    for step in steps:
        if not step.success or not step.rows:
            continue
        for col in step.columns:
            name = col.lower()
            # Percentage check is per-row: every over-concentrated group alerts.
            if name in pct_cols:
                for row in step.rows:
                    val = row.get(col)
                    if isinstance(val, (int, float)) and val > 50:
                        alerts.append(
                            f"⚠️ {step.purpose} 中某个分组占比 {val}%,超过 50%,集中度过高"
                        )
            # Imbalance check is a per-column aggregate. The original ran it
            # inside the row loop, appending the identical alert once per row;
            # compute it once per column instead.
            if name in count_cols:
                count_vals = [
                    r.get(col) for r in step.rows
                    if isinstance(r.get(col), (int, float))
                ]
                if len(count_vals) >= 3 and max(count_vals) > 0:
                    mean = sum(count_vals) / len(count_vals)
                    # A positive max does not guarantee a nonzero mean when
                    # negative values are present — guard the division.
                    if mean > 0:
                        ratio = max(count_vals) / mean
                        if ratio > 3:
                            alerts.append(
                                f"⚠️ {step.purpose} 中最大值是均值的 {ratio:.1f} 倍,分布极不均衡"
                            )
    return alerts