Files
iov_ana/insights.py

240 lines
7.8 KiB
Python
Raw Normal View History

"""
Layer 3: 洞察引擎
对探索结果进行异常检测 + 主动洞察(输出用户没问但值得知道的事)
"""
import json
import re
import statistics
from typing import Any
import openai
from config import LLM_CONFIG
from explorer import ExplorationStep
# System prompt sent with every insight request (see InsightEngine.analyze).
# It instructs the model to reply with a strict JSON array of objects with the
# keys "type", "severity", "title", "detail" and "evidence"; those are the keys
# Insight.__init__ reads. The text is part of runtime behaviour: edit with care.
# NOTE(review): a few bullet lines below (e.g. the "outlier" entry, which ends
# in a dangling "/", and the "A B X%" comparison principle) look as if
# punctuation or characters were lost — confirm against the intended prompt.
INSIGHT_SYSTEM = """你是一个数据洞察专家。你会收到探索过程的所有结果,你需要:
1. 从结果中发现异常和有趣现象
2. 对比不同维度找出差异
3. 输出用户可能没问但值得知道的洞察
## 输出格式(严格 JSON 数组)
```json
[
{
"type": "outlier" | "trend" | "distribution" | "correlation" | "recommendation",
"severity": "high" | "medium" | "low",
"title": "简短标题",
"detail": "详细描述,包含具体数字",
"evidence": "支撑这个洞察的数据来源"
}
]
```
## 洞察类型
- outlier: 离群值某个分组异常高/
- trend: 趋势发现增长/下降季节性
- distribution: 分布异常不均衡集中度过高
- correlation: 关联发现两个维度的意外关联
- recommendation: 行动建议基于数据的建议
## 分析原则
- 每个洞察必须有具体数字支撑
- 用对比来说话A B X%
- 关注异常不描述平淡的事实
- 如果没有异常返回空数组"""
class Insight:
    """A single insight item parsed from the model's JSON output.

    Missing keys *and* explicit ``null`` values fall back to defaults.
    ``type`` and ``severity`` are normalised (stringified, stripped,
    lower-cased) so that e.g. ``"High"`` gets the same icon and sort bucket
    as ``"high"``; the free-text fields are coerced to ``str``.

    Attributes:
        type: Insight category, ``"unknown"`` when absent.
        severity: ``"high"`` / ``"medium"`` / ``"low"``; ``"low"`` when absent.
        title: Short headline.
        detail: Longer description.
        evidence: Description of the supporting data.
    """

    # Lookup tables live on the class so they are built once, not per access.
    _TYPE_EMOJI = {
        "outlier": "⚠️",
        "trend": "📈",
        "distribution": "📊",
        "correlation": "🔗",
        "recommendation": "💡",
    }
    _SEVERITY_EMOJI = {"high": "🔴", "medium": "🟡", "low": "🟢"}

    def __init__(self, data: dict):
        """Build an insight from one decoded JSON object.

        Args:
            data: Mapping that may contain the keys ``type``, ``severity``,
                ``title``, ``detail`` and ``evidence``.
        """
        self.type = self._label(data.get("type"), "unknown")
        self.severity = self._label(data.get("severity"), "low")
        self.title = self._text(data.get("title"))
        self.detail = self._text(data.get("detail"))
        self.evidence = self._text(data.get("evidence"))

    @staticmethod
    def _label(value: object, default: str) -> str:
        """Normalise a categorical field; ``None``/blank yields ``default``."""
        if value is None:
            return default
        # str() also makes unhashable values (lists, dicts) safe to look up.
        return str(value).strip().lower() or default

    @staticmethod
    def _text(value: object) -> str:
        """Coerce a free-text field to ``str``; ``None`` becomes ``""``."""
        return "" if value is None else str(value)

    @property
    def emoji(self) -> str:
        """Icon for the insight type; a pin for unrecognised types."""
        return self._TYPE_EMOJI.get(self.type, "📌")

    @property
    def severity_emoji(self) -> str:
        """Coloured dot for the severity; empty string if unrecognised."""
        return self._SEVERITY_EMOJI.get(self.severity, "")

    def __str__(self):
        return f"{self.emoji} {self.severity_emoji} {self.title}: {self.detail}"

    def __repr__(self):
        return (
            f"{type(self).__name__}(type={self.type!r}, "
            f"severity={self.severity!r}, title={self.title!r})"
        )
class InsightEngine:
    """Insight engine: asks the LLM for anomalies and unprompted insights.

    Exploration steps are rendered to text, sent to the chat-completions
    endpoint together with ``INSIGHT_SYSTEM``, and the JSON array in the
    reply is turned into ``Insight`` objects.
    """

    def __init__(self):
        """Create the OpenAI client and model name from ``LLM_CONFIG``."""
        self.client = openai.OpenAI(
            api_key=LLM_CONFIG["api_key"],
            base_url=LLM_CONFIG["base_url"],
        )
        self.model = LLM_CONFIG["model"]

    def analyze(self, steps: "list[ExplorationStep]", question: str) -> "list[Insight]":
        """Run insight analysis over the exploration results.

        Args:
            steps: Exploration steps to analyse.
            question: The user's original question.

        Returns:
            The parsed insights; an empty list when there are no steps, the
            reply is empty, or no JSON array can be extracted from it.
        """
        if not steps:
            return []

        history = self._build_history(steps)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": INSIGHT_SYSTEM},
                {
                    "role": "user",
                    "content": (
                        f"## 用户原始问题\n{question}\n\n"
                        f"## 探索历史\n{history}\n\n"
                        f"请分析以上数据,输出异常和洞察。"
                    ),
                },
            ],
            temperature=0.3,
            max_tokens=2048,
        )
        if not response.choices:
            return []
        # The API types message.content as optional; treat None as empty
        # instead of crashing on .strip().
        content = (response.choices[0].message.content or "").strip()
        return [Insight(item) for item in self._extract_json_array(content)]

    def format_insights(self, insights: "list[Insight]") -> str:
        """Render insights as Markdown text, most severe first.

        Returns an empty string when there is nothing to show. Insights with
        an unrecognised severity sort after ``low``.
        """
        if not insights:
            return ""

        severity_order = {"high": 0, "medium": 1, "low": 2}
        ordered = sorted(insights, key=lambda i: severity_order.get(i.severity, 9))

        lines = ["## 💡 主动洞察", ""]
        lines.append("_以下是你没问但数据告诉我们的事_\n")
        for insight in ordered:
            lines.append(f"**{insight.emoji} {insight.title}** {insight.severity_emoji}")
            lines.append(f" {insight.detail}")
            lines.append(f" _数据来源: {insight.evidence}_")
            lines.append("")
        return "\n".join(lines)

    def _build_history(self, steps: "list[ExplorationStep]") -> str:
        """Render the exploration steps as the text block sent to the LLM.

        ``done`` steps contribute only their reasoning; successful steps
        include SQL, columns and the rows as JSON; failed steps include the
        SQL and the error.
        """
        parts = []
        for step in steps:
            if step.action == "done":
                parts.append(f"### 结束\n{step.reasoning}")
                continue
            if step.success:
                # default=str: this text is only read by the model, so cell
                # values json cannot encode natively are stringified rather
                # than aborting the whole analysis with a TypeError.
                rows_json = json.dumps(step.rows, ensure_ascii=False, default=str)
                parts.append(
                    f"### 第 {step.round_num} 轮:{step.purpose}\n"
                    f"思考: {step.reasoning}\n"
                    f"SQL: `{step.sql}`\n"
                    f"结果 ({step.row_count} 行):\n"
                    f"列: {step.columns}\n"
                    f"数据: {rows_json}"
                )
            else:
                parts.append(
                    f"### 第 {step.round_num} 轮:{step.purpose}\n"
                    f"SQL: `{step.sql}`\n"
                    f"结果: 执行失败 - {step.error}"
                )
        return "\n\n".join(parts)

    def _extract_json_array(self, text: str) -> list[dict]:
        """Extract the JSON array of objects from the LLM reply.

        Tried in order: the whole reply, the body of a ```json fence, the
        body of a plain ``` fence, then every ``[`` in the reply as the
        possible start of an array. Non-dict elements are dropped so callers
        can rely on the ``list[dict]`` contract.

        Returns:
            The list of dicts found, or ``[]`` if nothing usable is present.
        """
        if not text:
            return []

        candidates = [text]
        for pattern in (r'```json\s*\n(.*?)\n```', r'```\s*\n(.*?)\n```'):
            fenced = re.search(pattern, text, re.DOTALL)
            if fenced:
                candidates.append(fenced.group(1))
        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, list):
                return [item for item in parsed if isinstance(item, dict)]

        # Fallback: try each '[' as the start of an array. Unlike a greedy
        # first-'[' to last-']' match, this survives stray brackets in the
        # surrounding prose (e.g. "[1]" citations). Arrays without any
        # object are skipped so such citations are not mistaken for output.
        decoder = json.JSONDecoder()
        start = text.find("[")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, list):
                records = [item for item in parsed if isinstance(item, dict)]
                if records:
                    return records
            start = text.find("[", start + 1)
        return []
# ── Rule-based quick anomaly detection (no LLM required) ────────────────

# Lower-cased column names treated as percentage columns / count columns.
_PCT_COLUMNS = frozenset({"pct", "percent", "percentage", "占比"})
_COUNT_COLUMNS = frozenset({"count", "cnt", "n", "total", "order_count"})


def _is_number(value: object) -> bool:
    """Return True for int/float values; bool is deliberately excluded."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _max_to_mean_ratio(values: list) -> "float | None":
    """Return max/mean of the numeric entries, or None when not meaningful.

    None is returned with fewer than three numeric values or a non-positive
    mean (which would otherwise divide by zero or produce a negative ratio).
    """
    numbers = [v for v in values if _is_number(v)]
    if len(numbers) < 3:
        return None
    mean = sum(numbers) / len(numbers)
    if mean <= 0:
        return None
    return max(numbers) / mean


def quick_detect(steps: "list[ExplorationStep]") -> list[str]:
    """Rule-based anomaly detection over exploration results (no LLM call).

    Two rules are applied to every successful step that has rows:

    * percentage columns: one alert per row whose value exceeds 50;
    * count columns: one alert per column and step when the largest value
      is more than 3x the column mean (at least three numeric values).

    Rows are accessed with ``row.get(col)``, i.e. they are expected to be
    mappings keyed by column name.

    Args:
        steps: Exploration steps to scan.

    Returns:
        Human-readable alert strings, in row/column encounter order.
    """
    alerts: list[str] = []
    for step in steps:
        if not step.success or not step.rows:
            continue

        # Count columns already evaluated for this step. The imbalance check
        # looks at the whole column, so it must run once per column, not once
        # per row (which would emit the same alert for every row).
        checked_count_cols = set()

        for row in step.rows:
            for col in step.columns:
                val = row.get(col)
                if not _is_number(val):
                    continue
                name = col.lower()

                # Percentage column: a single group holds more than half.
                if name in _PCT_COLUMNS and val > 50:
                    alerts.append(
                        f"⚠️ {step.purpose} 中某个分组占比 {val}%,超过 50%,集中度过高"
                    )

                # Count column: extreme spread between max and mean.
                if name in _COUNT_COLUMNS and col not in checked_count_cols:
                    checked_count_cols.add(col)
                    ratio = _max_to_mean_ratio([r.get(col) for r in step.rows])
                    if ratio is not None and ratio > 3:
                        alerts.append(
                            f"⚠️ {step.purpose} 中最大值是均值的 {ratio:.1f} 倍,分布极不均衡"
                        )
    return alerts