Files
iov_ana/layers/explorer.py
openclaw e8f8e2f1ba feat: 四层架构全面增强
安全与稳定性:
- 移除硬编码 API Key,改用 .env + 环境变量
- LLM 调用统一重试机制(指数退避,3 次重试,处理 429/5xx/超时)
- 中文字体检测增强(CJK 关键词兜底 + 无字体时英文 fallback)
- 缺失 API Key 给出友好提示而非崩溃

分析能力提升:
- 异常检测新增 z-score 检测(标准差>2 标记异常)
- 新增变异系数 CV 检测(数据波动性)
- 新增零值/缺失检测
- 上下文管理器升级为关键词语义匹配(替代简单取最近 2 条)

用户体验:
- 报告自动保存为 Markdown(reports/ 目录)
- 新增 export 命令导出查询结果为 CSV
- 新增 reports 命令查看已保存报告
- CLI 支持 readline 命令历史(方向键翻阅)
- CSV 导入工具重写:自动列名映射、容错处理、dry-run 模式
- 新增 .env.example 配置模板
2026-03-31 14:39:17 +08:00

225 lines
8.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Layer 2: 自适应探索器
"""
import json
from typing import Any
from dataclasses import dataclass, field
from core.config import LLM_CONFIG
from core.utils import get_llm_client, llm_chat, extract_json_object
from core.sandbox import SandboxExecutor
EXPLORER_SYSTEM = """你是一个数据分析执行者。你的上级给了你一个分析计划,你需要通过迭代执行 SQL 来完成分析。
## 你的工作方式
每一轮你看到:
1. 分析计划(上级给的目标)
2. 数据库 Schema表结构、数据画像
3. 之前的探索历史(查过什么、得到什么结果)
你决定下一步:
- 输出一条 SQL 继续探索
- 或者输出 done 表示分析足够
## 输出格式(严格 JSON
```json
{
"action": "query",
"reasoning": "为什么要做这个查询",
"sql": "SELECT ...",
"purpose": "这个查询的目的"
}
```
或:
```json
{
"action": "done",
"reasoning": "为什么分析已经足够"
}
```
## SQL 规则(严格遵守,否则会被沙箱拒绝)
- 只用 SELECT
- 每条 SQL 必须包含聚合函数COUNT/SUM/AVG/MIN/MAX或 GROUP BY 或 LIMIT
- 禁止 SELECT *
- 用 ROUND 控制精度
- 合理使用 LIMIT分组结果 15 行以内,时间序列 60 行以内)
- 如果需要查看明细数据,必须加 LIMIT
## 探索策略
1. 第一轮:验证核心假设
2. 后续轮:基于已有结果追问
3. 不要重复查已经确认的事
4. 每轮要有新发现,否则就该结束"""
@dataclass
class ExplorationStep:
"""单步探索结果"""
round_num: int = 0
reasoning: str = ""
purpose: str = ""
sql: str = ""
action: str = "query"
success: bool = False
error: str | None = None
columns: list[str] = field(default_factory=list)
rows: list[dict] = field(default_factory=list)
row_count: int = 0
@classmethod
def from_decision(cls, round_num: int, decision: dict, result: dict) -> "ExplorationStep":
return cls(
round_num=round_num,
reasoning=decision.get("reasoning", ""),
purpose=decision.get("purpose", ""),
sql=decision.get("sql", ""),
action=decision.get("action", "query"),
success=result.get("success", False),
error=result.get("error"),
columns=result.get("columns", []),
rows=result.get("rows", []),
row_count=result.get("row_count", 0),
)
def to_dict(self) -> dict:
d = {
"round": self.round_num, "action": self.action,
"reasoning": self.reasoning, "purpose": self.purpose,
"sql": self.sql, "success": self.success,
}
if self.success:
d["result"] = {"columns": self.columns, "rows": self.rows, "row_count": self.row_count}
else:
d["result"] = {"error": self.error}
return d
class Explorer:
"""自适应探索器"""
def __init__(self, executor: SandboxExecutor):
self.executor = executor
self.client, self.model = get_llm_client(LLM_CONFIG)
def explore(
self, plan: dict, schema_text: str,
max_rounds: int = 6, playbook_result: dict | None = None,
) -> list[ExplorationStep]:
steps: list[ExplorationStep] = []
# 阶段 A: 预设查询
preset_context = ""
if playbook_result and playbook_result.get("preset_queries"):
preset_steps = self._run_preset_queries(playbook_result["preset_queries"])
steps.extend(preset_steps)
preset_context = self._build_preset_context(preset_steps, playbook_result)
# 阶段 B: 自适应探索
preset_used = len([s for s in steps if s.success])
remaining = max(1, max_rounds - preset_used)
initial = (
f"## 分析计划\n```json\n{json.dumps(plan, ensure_ascii=False, indent=2)}\n```\n\n"
f"## 数据库 Schema\n{schema_text}\n\n"
)
# 注入历史上下文
prev_context = plan.pop("_prev_context", None)
if prev_context:
initial += f"## 历史分析参考\n{prev_context}\n\n"
if preset_context:
initial += (
f"## 预设分析结果(已执行)\n{preset_context}\n\n"
f"请基于这些已有数据,决定是否需要进一步探索。\n"
f"重点关注:预设结果中的异常、值得深挖的点。\n"
f"如果预设结果已经足够,直接输出 done。"
)
if playbook_result.get("exploration_hints"):
initial += f"\n\n## 探索提示\n{playbook_result['exploration_hints']}"
else:
initial += "请开始第一轮探索。根据计划,先执行最关键的查询。"
messages = [
{"role": "system", "content": EXPLORER_SYSTEM},
{"role": "user", "content": initial},
]
offset = len(steps)
for round_num in range(offset + 1, offset + remaining + 1):
print(f"\n 🔄 探索第 {round_num}/{max_rounds}")
decision = self._llm_decide(messages)
reasoning = decision.get("reasoning", "")
print(f" 💭 {reasoning[:80]}{'...' if len(reasoning) > 80 else ''}")
if decision.get("action") == "done":
print(f" ✅ 探索完成")
steps.append(ExplorationStep.from_decision(round_num, decision, {"success": True}))
break
sql = decision.get("sql", "")
if not sql:
continue
print(f" 📝 {decision.get('purpose', '')}")
try:
result = self.executor.execute(sql)
except Exception as e:
result = {"success": False, "error": str(e), "sql": sql}
print(f" {'' if result['success'] else ''} {result.get('row_count', result.get('error', ''))}")
steps.append(ExplorationStep.from_decision(round_num, decision, result))
messages.append({"role": "assistant", "content": json.dumps(decision, ensure_ascii=False)})
messages.append({"role": "user", "content": self._format_result(result)})
return steps
def _run_preset_queries(self, preset_queries: list[dict]) -> list[ExplorationStep]:
steps = []
for i, pq in enumerate(preset_queries, 1):
sql, purpose = pq["sql"], pq.get("purpose", f"预设查询 {i}")
print(f"\n 📌 预设查询 {i}/{len(preset_queries)}: {purpose}")
try:
result = self.executor.execute(sql)
except Exception as e:
result = {"success": False, "error": str(e), "sql": sql}
decision = {"action": "query", "reasoning": f"[预设] {purpose}", "sql": sql, "purpose": purpose}
steps.append(ExplorationStep.from_decision(i, decision, result))
print(f" {'' if result['success'] else ''} {result.get('row_count', result.get('error', ''))}")
return steps
def _build_preset_context(self, steps: list[ExplorationStep], playbook_result: dict) -> str:
parts = [f"Playbook: {playbook_result.get('playbook_name', '未知')}"]
for step in steps:
if step.success:
parts.append(
f"### {step.purpose}\nSQL: `{step.sql}`\n"
f"结果 ({step.row_count} 行): {json.dumps(step.rows[:15], ensure_ascii=False)}"
)
else:
parts.append(f"### {step.purpose}\nSQL: `{step.sql}`\n执行失败: {step.error}")
return "\n\n".join(parts)
def _llm_decide(self, messages: list[dict]) -> dict:
content = llm_chat(
self.client, self.model,
messages=messages, temperature=0.2, max_tokens=1024,
)
result = extract_json_object(content)
return result if result else {"action": "done", "reasoning": f"无法解析: {content[:100]}"}
def _format_result(self, result: dict) -> str:
if not result.get("success"):
return f"❌ 执行失败: {result.get('error', '未知错误')}"
rows = result["rows"][:20]
return (
f"查询结果:\n\n✅ 返回 {result['row_count']}\n"
f"列: {result['columns']}\n数据:\n{json.dumps(rows, ensure_ascii=False, indent=2)}\n\n"
f"请基于这个结果决定下一步。如果发现异常或值得深挖的点,继续查询。如果分析足够,输出 done。"
)