240 lines
7.8 KiB
Python
240 lines
7.8 KiB
Python
|
|
"""
|
|||
|
|
Layer 3: 洞察引擎
|
|||
|
|
对探索结果进行异常检测 + 主动洞察,输出用户没问但值得知道的事。
|
|||
|
|
"""
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
import statistics
|
|||
|
|
from typing import Any
|
|||
|
|
|
|||
|
|
import openai
|
|||
|
|
from config import LLM_CONFIG
|
|||
|
|
from explorer import ExplorationStep
|
|||
|
|
|
|||
|
|
|
|||
|
|
INSIGHT_SYSTEM = """你是一个数据洞察专家。你会收到探索过程的所有结果,你需要:
|
|||
|
|
|
|||
|
|
1. 从结果中发现异常和有趣现象
|
|||
|
|
2. 对比不同维度,找出差异
|
|||
|
|
3. 输出用户可能没问但值得知道的洞察
|
|||
|
|
|
|||
|
|
## 输出格式(严格 JSON 数组)
|
|||
|
|
```json
|
|||
|
|
[
|
|||
|
|
{
|
|||
|
|
"type": "outlier" | "trend" | "distribution" | "correlation" | "recommendation",
|
|||
|
|
"severity": "high" | "medium" | "low",
|
|||
|
|
"title": "简短标题",
|
|||
|
|
"detail": "详细描述,包含具体数字",
|
|||
|
|
"evidence": "支撑这个洞察的数据来源"
|
|||
|
|
}
|
|||
|
|
]
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
## 洞察类型
|
|||
|
|
- outlier: 离群值(某个分组异常高/低)
|
|||
|
|
- trend: 趋势发现(增长/下降、季节性)
|
|||
|
|
- distribution: 分布异常(不均衡、集中度过高)
|
|||
|
|
- correlation: 关联发现(两个维度的意外关联)
|
|||
|
|
- recommendation: 行动建议(基于数据的建议)
|
|||
|
|
|
|||
|
|
## 分析原则
|
|||
|
|
- 每个洞察必须有具体数字支撑
|
|||
|
|
- 用对比来说话(A 比 B 高 X%)
|
|||
|
|
- 关注异常,不描述平淡的事实
|
|||
|
|
- 如果没有异常,返回空数组"""
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Insight:
    """A single insight parsed from the LLM's JSON output.

    Wraps one entry of the insight array and exposes convenience
    accessors for rendering (emoji markers, readable string form).
    """

    # Marker shown in front of each insight, keyed by insight type.
    _TYPE_MARKERS = {
        "outlier": "⚠️",
        "trend": "📈",
        "distribution": "📊",
        "correlation": "🔗",
        "recommendation": "💡",
    }

    # Traffic-light marker for each severity level.
    _SEVERITY_MARKERS = {"high": "🔴", "medium": "🟡", "low": "🟢"}

    def __init__(self, data: dict):
        self.type = data.get("type", "unknown")
        self.severity = data.get("severity", "low")
        self.title = data.get("title", "")
        self.detail = data.get("detail", "")
        self.evidence = data.get("evidence", "")

    @property
    def emoji(self) -> str:
        """Emoji for this insight's type ("📌" for unrecognized types)."""
        return self._TYPE_MARKERS.get(self.type, "📌")

    @property
    def severity_emoji(self) -> str:
        """Emoji for this insight's severity ("" for unrecognized levels)."""
        return self._SEVERITY_MARKERS.get(self.severity, "")

    def __str__(self):
        return f"{self.emoji} {self.severity_emoji} {self.title}: {self.detail}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
class InsightEngine:
    """Insight engine: detects anomalies in exploration results via the LLM.

    Feeds the full exploration history to the model with INSIGHT_SYSTEM and
    parses the returned JSON array into :class:`Insight` objects.
    """

    def __init__(self):
        # Dedicated client so this layer works independently of the explorer.
        self.client = openai.OpenAI(
            api_key=LLM_CONFIG["api_key"],
            base_url=LLM_CONFIG["base_url"],
        )
        self.model = LLM_CONFIG["model"]

    def analyze(self, steps: list[ExplorationStep], question: str) -> list[Insight]:
        """Run insight analysis over the exploration results.

        Args:
            steps: Exploration steps (SQL attempts and their results).
            question: The user's original question, for context.

        Returns:
            A list of insights; empty when there are no steps or the
            model produced no parseable insights.
        """
        if not steps:
            return []

        # Serialize the full exploration history for the prompt.
        history = self._build_history(steps)

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": INSIGHT_SYSTEM},
                {
                    "role": "user",
                    "content": (
                        f"## 用户原始问题\n{question}\n\n"
                        f"## 探索历史\n{history}\n\n"
                        f"请分析以上数据,输出异常和洞察。"
                    ),
                },
            ],
            temperature=0.3,
            max_tokens=2048,
        )

        # message.content can be None (e.g. refusals) — treat as empty output.
        content = (response.choices[0].message.content or "").strip()
        insights_data = self._extract_json_array(content)

        # Skip malformed entries: Insight expects a JSON object (dict).
        return [Insight(d) for d in insights_data if isinstance(d, dict)]

    def format_insights(self, insights: list[Insight]) -> str:
        """Render insights as a markdown section; "" when there are none."""
        if not insights:
            return ""

        # Most severe first; unknown severities sink to the bottom.
        severity_order = {"high": 0, "medium": 1, "low": 2}
        sorted_insights = sorted(insights, key=lambda i: severity_order.get(i.severity, 9))

        lines = ["## 💡 主动洞察", ""]
        lines.append("_以下是你没问但数据告诉我们的事:_\n")

        for insight in sorted_insights:
            lines.append(f"**{insight.emoji} {insight.title}** {insight.severity_emoji}")
            lines.append(f"  {insight.detail}")
            lines.append(f"  _数据来源: {insight.evidence}_")
            lines.append("")

        return "\n".join(lines)

    def _build_history(self, steps: list[ExplorationStep]) -> str:
        """Serialize exploration steps into the prompt's history text."""
        parts = []
        for step in steps:
            # The terminal "done" step carries only the closing reasoning.
            if step.action == "done":
                parts.append(f"### 结束\n{step.reasoning}")
                continue

            if step.success:
                parts.append(
                    f"### 第 {step.round_num} 轮:{step.purpose}\n"
                    f"思考: {step.reasoning}\n"
                    f"SQL: `{step.sql}`\n"
                    f"结果 ({step.row_count} 行):\n"
                    f"列: {step.columns}\n"
                    f"数据: {json.dumps(step.rows, ensure_ascii=False)}"
                )
            else:
                parts.append(
                    f"### 第 {step.round_num} 轮:{step.purpose}\n"
                    f"SQL: `{step.sql}`\n"
                    f"结果: 执行失败 - {step.error}"
                )

        return "\n\n".join(parts)

    def _extract_json_array(self, text: str) -> list[dict]:
        """Extract a JSON array from LLM output; [] when none is found.

        Tries, in order: the raw text, fenced ```json / ``` code blocks,
        and finally the outermost [...] span.
        """
        try:
            result = json.loads(text)
            if isinstance(result, list):
                return result
        except json.JSONDecodeError:
            pass

        for pattern in [r'```json\s*\n(.*?)\n```', r'```\s*\n(.*?)\n```']:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                try:
                    result = json.loads(match.group(1))
                    if isinstance(result, list):
                        return result
                except json.JSONDecodeError:
                    continue

        # Last resort: the outermost [...] span in the text.
        match = re.search(r'\[.*\]', text, re.DOTALL)
        if match:
            try:
                result = json.loads(match.group())
                # Keep the same list-only contract as the branches above.
                if isinstance(result, list):
                    return result
            except json.JSONDecodeError:
                pass

        return []
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── 基于规则的快速异常检测(无需 LLM)────────────────
|
|||
|
|
|
|||
|
|
def quick_detect(steps: list[ExplorationStep]) -> list[str]:
    """Rule-based quick anomaly detection, no LLM call.

    Detects outliers and unbalanced distributions in successful steps:
    - a percentage column ("pct"/"percent"/"percentage"/"占比") with a
      single group above 50%;
    - a count-like column ("count"/"cnt"/"n"/"total"/"order_count") whose
      maximum exceeds 3x the column mean (needs >= 3 numeric values).

    Args:
        steps: Exploration steps to scan.

    Returns:
        Human-readable alert strings (empty when nothing is anomalous).
    """
    alerts = []

    for step in steps:
        if not step.success or not step.rows:
            continue

        # Per-row check: any single group's share above 50% is flagged.
        for row in step.rows:
            for col in step.columns:
                val = row.get(col)
                if not isinstance(val, (int, float)):
                    continue
                if col.lower() in ("pct", "percent", "percentage", "占比") and val > 50:
                    alerts.append(
                        f"⚠️ {step.purpose} 中某个分组占比 {val}%,超过 50%,集中度过高"
                    )

        # Per-column check: spread between groups. Hoisted out of the row
        # loop — the original recomputed this per row and appended the
        # same alert once per row.
        for col in step.columns:
            if col.lower() not in ("count", "cnt", "n", "total", "order_count"):
                continue
            count_vals = [
                r.get(col) for r in step.rows
                if isinstance(r.get(col), (int, float))
            ]
            if len(count_vals) < 3 or max(count_vals) <= 0:
                continue
            mean = sum(count_vals) / len(count_vals)
            if mean <= 0:
                # Mixed-sign values could make the mean 0 — avoid division by zero.
                continue
            ratio = max(count_vals) / mean
            if ratio > 3:
                alerts.append(
                    f"⚠️ {step.purpose} 中最大值是均值的 {ratio:.1f} 倍,分布极不均衡"
                )

    return alerts
|