feat: 四层架构数据分析 Agent

- Layer 1 Planner: 意图规划，将问题转为结构化分析计划 - Layer 2 Explorer: 自适应探索循环，多轮迭代动态生成 SQL - Layer 3 InsightEngine: 异常检测 + 主动洞察 - Layer 4 ContextManager: 多轮对话上下文记忆安全设计：AI 只看 Schema + 聚合结果，不接触原始数据。支持任意 OpenAI 兼容 API（OpenAI / Ollama / DeepSeek / vLLM）
2026-03-19 12:21:04 +08:00
commit 96927a789d
14 changed files with 1683 additions and 0 deletions
--- a/explorer.py
+++ b/explorer.py
@@ -0,0 +1,238 @@
+"""
+Layer 2: 自适应探索器
+基于分析计划 + 已有发现，动态决定下一步查什么。
+多轮迭代，直到 AI 判断"够了"或达到上限。
+"""
+import json
+import re
+from typing import Any
+
+import openai
+from config import LLM_CONFIG
+from sandbox_executor import SandboxExecutor
+
+
+EXPLORER_SYSTEM = """你是一个数据分析执行者。你的上级给了你一个分析计划，你需要通过迭代执行 SQL 来完成分析。
+
+## 你的工作方式
+每一轮你看到：
+1. 分析计划（上级给的目标）
+2. 数据库 Schema（表结构、数据画像）
+3. 之前的探索历史（查过什么、得到什么结果）
+
+你决定下一步：
+- 输出一条 SQL 继续探索
+- 或者输出 done 表示分析足够
+
+## 输出格式（严格 JSON）
+```json
+{
+  "action": "query",
+  "reasoning": "为什么要做这个查询",
+  "sql": "SELECT ...",
+  "purpose": "这个查询的目的"
+}
+```
+
+或：
+```json
+{
+  "action": "done",
+  "reasoning": "为什么分析已经足够"
+}
+```
+
+## SQL 规则
+- 只用 SELECT，必须有聚合函数或 GROUP BY
+- 禁止 SELECT *
+- 用 ROUND 控制精度
+- 合理使用 LIMIT（分组结果 15 行以内，时间序列 60 行以内）
+
+## 探索策略
+1. 第一轮：验证核心假设（计划中最关键的查询）
+2. 后续轮：基于已有结果追问
+   - 发现离群值 → 追问为什么
+   - 发现异常比例 → 追问细分维度
+   - 结果平淡 → 换个角度试试
+3. 不要重复查已经确认的事
+4. 每轮要有新发现，否则就该结束"""
+
+
+EXPLORER_CONTINUE = """查询结果：
+
+{result_text}
+
+请基于这个结果决定下一步。如果发现异常或值得深挖的点，继续查询。如果分析足够，输出 done。"""
+
+
+class ExplorationStep:
+    """单步探索结果"""
+    def __init__(self, round_num: int, decision: dict, result: dict):
+        self.round_num = round_num
+        self.reasoning = decision.get("reasoning", "")
+        self.purpose = decision.get("purpose", "")
+        self.sql = decision.get("sql", "")
+        self.action = decision.get("action", "query")
+        self.success = result.get("success", False)
+        self.error = result.get("error")
+        self.columns = result.get("columns", [])
+        self.rows = result.get("rows", [])
+        self.row_count = result.get("row_count", 0)
+
+    def to_dict(self) -> dict:
+        d = {
+            "round": self.round_num,
+            "action": self.action,
+            "reasoning": self.reasoning,
+            "purpose": self.purpose,
+            "sql": self.sql,
+            "success": self.success,
+        }
+        if self.success:
+            d["result"] = {
+                "columns": self.columns,
+                "rows": self.rows,
+                "row_count": self.row_count,
+            }
+        else:
+            d["result"] = {"error": self.error}
+        return d
+
+
+class Explorer:
+    """自适应探索器：多轮迭代执行 SQL"""
+
+    def __init__(self, executor: SandboxExecutor):
+        self.executor = executor
+        self.client = openai.OpenAI(
+            api_key=LLM_CONFIG["api_key"],
+            base_url=LLM_CONFIG["base_url"],
+        )
+        self.model = LLM_CONFIG["model"]
+
+    def explore(
+        self,
+        plan: dict,
+        schema_text: str,
+        max_rounds: int = 6,
+    ) -> list[ExplorationStep]:
+        """
+        执行探索循环
+
+        Args:
+            plan: Planner 生成的分析计划
+            schema_text: Schema 文本描述
+            max_rounds: 最大探索轮数
+
+        Returns:
+            探步列表
+        """
+        steps: list[ExplorationStep] = []
+
+        # 构建初始消息
+        messages = [
+            {"role": "system", "content": EXPLORER_SYSTEM},
+            {
+                "role": "user",
+                "content": (
+                    f"## 分析计划\n```json\n{json.dumps(plan, ensure_ascii=False, indent=2)}\n```\n\n"
+                    f"## 数据库 Schema\n{schema_text}\n\n"
+                    f"请开始第一轮探索。根据计划，先执行最关键的查询。"
+                ),
+            },
+        ]
+
+        for round_num in range(1, max_rounds + 1):
+            print(f"\n  🔄 探索第 {round_num}/{max_rounds} 轮")
+
+            # LLM 决策
+            decision = self._llm_decide(messages)
+            action = decision.get("action", "query")
+            reasoning = decision.get("reasoning", "")
+
+            print(f"     💭 {reasoning[:80]}{'...' if len(reasoning) > 80 else ''}")
+
+            if action == "done":
+                print(f"     ✅ 探索完成")
+                steps.append(ExplorationStep(round_num, decision, {"success": True}))
+                break
+
+            # 执行 SQL
+            sql = decision.get("sql", "")
+            purpose = decision.get("purpose", "")
+
+            if not sql:
+                print(f"     ⚠️ 未生成 SQL，跳过")
+                continue
+
+            print(f"     📝 {purpose}")
+            result = self.executor.execute(sql)
+
+            if result["success"]:
+                print(f"     ✅ {result['row_count']} 行")
+            else:
+                print(f"     ❌ {result['error']}")
+
+            step = ExplorationStep(round_num, decision, result)
+            steps.append(step)
+
+            # 更新对话历史
+            messages.append({
+                "role": "assistant",
+                "content": json.dumps(decision, ensure_ascii=False),
+            })
+
+            result_text = self._format_result(result)
+            messages.append({
+                "role": "user",
+                "content": EXPLORER_CONTINUE.format(result_text=result_text),
+            })
+
+        return steps
+
+    def _llm_decide(self, messages: list[dict]) -> dict:
+        """LLM 决策"""
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=0.2,
+            max_tokens=1024,
+        )
+        content = response.choices[0].message.content.strip()
+        return self._extract_json(content)
+
+    def _format_result(self, result: dict) -> str:
+        """格式化查询结果"""
+        if not result.get("success"):
+            return f"❌ 执行失败: {result.get('error', '未知错误')}"
+
+        rows = result["rows"][:20]
+        return (
+            f"✅ 查询成功，返回 {result['row_count']} 行\n"
+            f"列: {result['columns']}\n"
+            f"数据:\n{json.dumps(rows, ensure_ascii=False, indent=2)}"
+        )
+
+    def _extract_json(self, text: str) -> dict:
+        """从 LLM 输出提取 JSON"""
+        try:
+            return json.loads(text)
+        except json.JSONDecodeError:
+            pass
+
+        for pattern in [r'```json\s*\n(.*?)\n```', r'```\s*\n(.*?)\n```']:
+            match = re.search(pattern, text, re.DOTALL)
+            if match:
+                try:
+                    return json.loads(match.group(1))
+                except json.JSONDecodeError:
+                    continue
+
+        match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', text, re.DOTALL)
+        if match:
+            try:
+                return json.loads(match.group())
+            except json.JSONDecodeError:
+                pass
+
+        return {"action": "done", "reasoning": f"无法解析输出: {text[:100]}"}