""" Layer 3: 洞察引擎 对探索结果进行异常检测 + 主动洞察,输出用户没问但值得知道的事。 """ import json import re import statistics from typing import Any import openai from config import LLM_CONFIG from explorer import ExplorationStep INSIGHT_SYSTEM = """你是一个数据洞察专家。你会收到探索过程的所有结果,你需要: 1. 从结果中发现异常和有趣现象 2. 对比不同维度,找出差异 3. 输出用户可能没问但值得知道的洞察 ## 输出格式(严格 JSON 数组) ```json [ { "type": "outlier" | "trend" | "distribution" | "correlation" | "recommendation", "severity": "high" | "medium" | "low", "title": "简短标题", "detail": "详细描述,包含具体数字", "evidence": "支撑这个洞察的数据来源" } ] ``` ## 洞察类型 - outlier: 离群值(某个分组异常高/低) - trend: 趋势发现(增长/下降、季节性) - distribution: 分布异常(不均衡、集中度过高) - correlation: 关联发现(两个维度的意外关联) - recommendation: 行动建议(基于数据的建议) ## 分析原则 - 每个洞察必须有具体数字支撑 - 用对比来说话(A 比 B 高 X%) - 关注异常,不描述平淡的事实 - 如果没有异常,返回空数组""" class Insight: """单条洞察""" def __init__(self, data: dict): self.type = data.get("type", "unknown") self.severity = data.get("severity", "low") self.title = data.get("title", "") self.detail = data.get("detail", "") self.evidence = data.get("evidence", "") @property def emoji(self) -> str: return { "outlier": "⚠️", "trend": "📈", "distribution": "📊", "correlation": "🔗", "recommendation": "💡", }.get(self.type, "📌") @property def severity_emoji(self) -> str: return {"high": "🔴", "medium": "🟡", "low": "🟢"}.get(self.severity, "") def __str__(self): return f"{self.emoji} {self.severity_emoji} {self.title}: {self.detail}" class InsightEngine: """洞察引擎:自动检测异常 + 主动输出""" def __init__(self): self.client = openai.OpenAI( api_key=LLM_CONFIG["api_key"], base_url=LLM_CONFIG["base_url"], ) self.model = LLM_CONFIG["model"] def analyze(self, steps: list[ExplorationStep], question: str) -> list[Insight]: """ 对探索结果进行洞察分析 Args: steps: 探索步骤列表 question: 原始用户问题 Returns: 洞察列表 """ if not steps: return [] # 构建探索历史文本 history = self._build_history(steps) response = self.client.chat.completions.create( model=self.model, messages=[ {"role": "system", "content": INSIGHT_SYSTEM}, { "role": "user", "content": ( f"## 用户原始问题\n{question}\n\n" f"## 探索历史\n{history}\n\n" f"请分析以上数据,输出异常和洞察。" ), }, ], temperature=0.3, max_tokens=2048, ) content = response.choices[0].message.content.strip() insights_data = self._extract_json_array(content) return [Insight(d) for d in insights_data] def format_insights(self, insights: list[Insight]) -> str: """格式化洞察为可读文本""" if not insights: return "" # 按严重程度排序 severity_order = {"high": 0, "medium": 1, "low": 2} sorted_insights = sorted(insights, key=lambda i: severity_order.get(i.severity, 9)) lines = ["## 💡 主动洞察", ""] lines.append("_以下是你没问但数据告诉我们的事:_\n") for insight in sorted_insights: lines.append(f"**{insight.emoji} {insight.title}** {insight.severity_emoji}") lines.append(f" {insight.detail}") lines.append(f" _数据来源: {insight.evidence}_") lines.append("") return "\n".join(lines) def _build_history(self, steps: list[ExplorationStep]) -> str: """构建探索历史文本""" parts = [] for step in steps: if step.action == "done": parts.append(f"### 结束\n{step.reasoning}") continue if step.success: parts.append( f"### 第 {step.round_num} 轮:{step.purpose}\n" f"思考: {step.reasoning}\n" f"SQL: `{step.sql}`\n" f"结果 ({step.row_count} 行):\n" f"列: {step.columns}\n" f"数据: {json.dumps(step.rows, ensure_ascii=False)}" ) else: parts.append( f"### 第 {step.round_num} 轮:{step.purpose}\n" f"SQL: `{step.sql}`\n" f"结果: 执行失败 - {step.error}" ) return "\n\n".join(parts) def _extract_json_array(self, text: str) -> list[dict]: """从 LLM 输出提取 JSON 数组""" try: result = json.loads(text) if isinstance(result, list): return result except json.JSONDecodeError: pass for pattern in [r'```json\s*\n(.*?)\n```', r'```\s*\n(.*?)\n```']: match = re.search(pattern, text, re.DOTALL) if match: try: result = json.loads(match.group(1)) if isinstance(result, list): return result except json.JSONDecodeError: continue # 找最外层 [] match = re.search(r'\[.*\]', text, re.DOTALL) if match: try: return json.loads(match.group()) except json.JSONDecodeError: pass return [] # ── 基于规则的快速异常检测(无需 LLM)──────────────── def quick_detect(steps: list[ExplorationStep]) -> list[str]: """ 基于规则的快速异常检测,不调 LLM。 检测离群值、不均衡分布等。 """ alerts = [] for step in steps: if not step.success or not step.rows: continue for row in step.rows: for col in step.columns: val = row.get(col) if not isinstance(val, (int, float)): continue # 检查 pct 列:某个分组占比异常 if col.lower() in ("pct", "percent", "percentage", "占比"): if isinstance(val, (int, float)) and val > 50: alerts.append( f"⚠️ {step.purpose} 中某个分组占比 {val}%,超过 50%,集中度过高" ) # 检查 count 列:极值差异 if col.lower() in ("count", "cnt", "n", "total", "order_count"): count_vals = [ r.get(col) for r in step.rows if isinstance(r.get(col), (int, float)) ] if len(count_vals) >= 3 and max(count_vals) > 0: ratio = max(count_vals) / (sum(count_vals) / len(count_vals)) if ratio > 3: alerts.append( f"⚠️ {step.purpose} 中最大值是均值的 {ratio:.1f} 倍,分布极不均衡" ) return alerts