安全与稳定性: - 移除硬编码 API Key,改用 .env + 环境变量 - LLM 调用统一重试机制(指数退避,3 次重试,处理 429/5xx/超时) - 中文字体检测增强(CJK 关键词兜底 + 无字体时英文 fallback) - 缺失 API Key 给出友好提示而非崩溃 分析能力提升: - 异常检测新增 z-score 检测(标准差>2 标记异常) - 新增变异系数 CV 检测(数据波动性) - 新增零值/缺失检测 - 上下文管理器升级为关键词语义匹配(替代简单取最近 2 条) 用户体验: - 报告自动保存为 Markdown(reports/ 目录) - 新增 export 命令导出查询结果为 CSV - 新增 reports 命令查看已保存报告 - CLI 支持 readline 命令历史(方向键翻阅) - CSV 导入工具重写:自动列名映射、容错处理、dry-run 模式 - 新增 .env.example 配置模板
225 lines
8.4 KiB
Python
225 lines
8.4 KiB
Python
"""
|
||
Layer 2: 自适应探索器
|
||
"""
|
||
import json
|
||
from typing import Any
|
||
from dataclasses import dataclass, field
|
||
|
||
from core.config import LLM_CONFIG
|
||
from core.utils import get_llm_client, llm_chat, extract_json_object
|
||
from core.sandbox import SandboxExecutor
|
||
|
||
|
||
EXPLORER_SYSTEM = """你是一个数据分析执行者。你的上级给了你一个分析计划,你需要通过迭代执行 SQL 来完成分析。
|
||
|
||
## 你的工作方式
|
||
每一轮你看到:
|
||
1. 分析计划(上级给的目标)
|
||
2. 数据库 Schema(表结构、数据画像)
|
||
3. 之前的探索历史(查过什么、得到什么结果)
|
||
|
||
你决定下一步:
|
||
- 输出一条 SQL 继续探索
|
||
- 或者输出 done 表示分析足够
|
||
|
||
## 输出格式(严格 JSON)
|
||
```json
|
||
{
|
||
"action": "query",
|
||
"reasoning": "为什么要做这个查询",
|
||
"sql": "SELECT ...",
|
||
"purpose": "这个查询的目的"
|
||
}
|
||
```
|
||
|
||
或:
|
||
```json
|
||
{
|
||
"action": "done",
|
||
"reasoning": "为什么分析已经足够"
|
||
}
|
||
```
|
||
|
||
## SQL 规则(严格遵守,否则会被沙箱拒绝)
|
||
- 只用 SELECT
|
||
- 每条 SQL 必须包含聚合函数(COUNT/SUM/AVG/MIN/MAX)或 GROUP BY 或 LIMIT
|
||
- 禁止 SELECT *
|
||
- 用 ROUND 控制精度
|
||
- 合理使用 LIMIT(分组结果 15 行以内,时间序列 60 行以内)
|
||
- 如果需要查看明细数据,必须加 LIMIT
|
||
|
||
## 探索策略
|
||
1. 第一轮:验证核心假设
|
||
2. 后续轮:基于已有结果追问
|
||
3. 不要重复查已经确认的事
|
||
4. 每轮要有新发现,否则就该结束"""
|
||
|
||
|
||
@dataclass
|
||
class ExplorationStep:
|
||
"""单步探索结果"""
|
||
round_num: int = 0
|
||
reasoning: str = ""
|
||
purpose: str = ""
|
||
sql: str = ""
|
||
action: str = "query"
|
||
success: bool = False
|
||
error: str | None = None
|
||
columns: list[str] = field(default_factory=list)
|
||
rows: list[dict] = field(default_factory=list)
|
||
row_count: int = 0
|
||
|
||
@classmethod
|
||
def from_decision(cls, round_num: int, decision: dict, result: dict) -> "ExplorationStep":
|
||
return cls(
|
||
round_num=round_num,
|
||
reasoning=decision.get("reasoning", ""),
|
||
purpose=decision.get("purpose", ""),
|
||
sql=decision.get("sql", ""),
|
||
action=decision.get("action", "query"),
|
||
success=result.get("success", False),
|
||
error=result.get("error"),
|
||
columns=result.get("columns", []),
|
||
rows=result.get("rows", []),
|
||
row_count=result.get("row_count", 0),
|
||
)
|
||
|
||
def to_dict(self) -> dict:
|
||
d = {
|
||
"round": self.round_num, "action": self.action,
|
||
"reasoning": self.reasoning, "purpose": self.purpose,
|
||
"sql": self.sql, "success": self.success,
|
||
}
|
||
if self.success:
|
||
d["result"] = {"columns": self.columns, "rows": self.rows, "row_count": self.row_count}
|
||
else:
|
||
d["result"] = {"error": self.error}
|
||
return d
|
||
|
||
|
||
class Explorer:
|
||
"""自适应探索器"""
|
||
|
||
def __init__(self, executor: SandboxExecutor):
|
||
self.executor = executor
|
||
self.client, self.model = get_llm_client(LLM_CONFIG)
|
||
|
||
def explore(
|
||
self, plan: dict, schema_text: str,
|
||
max_rounds: int = 6, playbook_result: dict | None = None,
|
||
) -> list[ExplorationStep]:
|
||
steps: list[ExplorationStep] = []
|
||
|
||
# 阶段 A: 预设查询
|
||
preset_context = ""
|
||
if playbook_result and playbook_result.get("preset_queries"):
|
||
preset_steps = self._run_preset_queries(playbook_result["preset_queries"])
|
||
steps.extend(preset_steps)
|
||
preset_context = self._build_preset_context(preset_steps, playbook_result)
|
||
|
||
# 阶段 B: 自适应探索
|
||
preset_used = len([s for s in steps if s.success])
|
||
remaining = max(1, max_rounds - preset_used)
|
||
|
||
initial = (
|
||
f"## 分析计划\n```json\n{json.dumps(plan, ensure_ascii=False, indent=2)}\n```\n\n"
|
||
f"## 数据库 Schema\n{schema_text}\n\n"
|
||
)
|
||
|
||
# 注入历史上下文
|
||
prev_context = plan.pop("_prev_context", None)
|
||
if prev_context:
|
||
initial += f"## 历史分析参考\n{prev_context}\n\n"
|
||
|
||
if preset_context:
|
||
initial += (
|
||
f"## 预设分析结果(已执行)\n{preset_context}\n\n"
|
||
f"请基于这些已有数据,决定是否需要进一步探索。\n"
|
||
f"重点关注:预设结果中的异常、值得深挖的点。\n"
|
||
f"如果预设结果已经足够,直接输出 done。"
|
||
)
|
||
if playbook_result.get("exploration_hints"):
|
||
initial += f"\n\n## 探索提示\n{playbook_result['exploration_hints']}"
|
||
else:
|
||
initial += "请开始第一轮探索。根据计划,先执行最关键的查询。"
|
||
|
||
messages = [
|
||
{"role": "system", "content": EXPLORER_SYSTEM},
|
||
{"role": "user", "content": initial},
|
||
]
|
||
|
||
offset = len(steps)
|
||
for round_num in range(offset + 1, offset + remaining + 1):
|
||
print(f"\n 🔄 探索第 {round_num}/{max_rounds} 轮")
|
||
|
||
decision = self._llm_decide(messages)
|
||
reasoning = decision.get("reasoning", "")
|
||
print(f" 💭 {reasoning[:80]}{'...' if len(reasoning) > 80 else ''}")
|
||
|
||
if decision.get("action") == "done":
|
||
print(f" ✅ 探索完成")
|
||
steps.append(ExplorationStep.from_decision(round_num, decision, {"success": True}))
|
||
break
|
||
|
||
sql = decision.get("sql", "")
|
||
if not sql:
|
||
continue
|
||
|
||
print(f" 📝 {decision.get('purpose', '')}")
|
||
try:
|
||
result = self.executor.execute(sql)
|
||
except Exception as e:
|
||
result = {"success": False, "error": str(e), "sql": sql}
|
||
print(f" {'✅' if result['success'] else '❌'} {result.get('row_count', result.get('error', ''))}")
|
||
|
||
steps.append(ExplorationStep.from_decision(round_num, decision, result))
|
||
|
||
messages.append({"role": "assistant", "content": json.dumps(decision, ensure_ascii=False)})
|
||
messages.append({"role": "user", "content": self._format_result(result)})
|
||
|
||
return steps
|
||
|
||
def _run_preset_queries(self, preset_queries: list[dict]) -> list[ExplorationStep]:
|
||
steps = []
|
||
for i, pq in enumerate(preset_queries, 1):
|
||
sql, purpose = pq["sql"], pq.get("purpose", f"预设查询 {i}")
|
||
print(f"\n 📌 预设查询 {i}/{len(preset_queries)}: {purpose}")
|
||
try:
|
||
result = self.executor.execute(sql)
|
||
except Exception as e:
|
||
result = {"success": False, "error": str(e), "sql": sql}
|
||
decision = {"action": "query", "reasoning": f"[预设] {purpose}", "sql": sql, "purpose": purpose}
|
||
steps.append(ExplorationStep.from_decision(i, decision, result))
|
||
print(f" {'✅' if result['success'] else '❌'} {result.get('row_count', result.get('error', ''))}")
|
||
return steps
|
||
|
||
def _build_preset_context(self, steps: list[ExplorationStep], playbook_result: dict) -> str:
|
||
parts = [f"Playbook: {playbook_result.get('playbook_name', '未知')}"]
|
||
for step in steps:
|
||
if step.success:
|
||
parts.append(
|
||
f"### {step.purpose}\nSQL: `{step.sql}`\n"
|
||
f"结果 ({step.row_count} 行): {json.dumps(step.rows[:15], ensure_ascii=False)}"
|
||
)
|
||
else:
|
||
parts.append(f"### {step.purpose}\nSQL: `{step.sql}`\n执行失败: {step.error}")
|
||
return "\n\n".join(parts)
|
||
|
||
def _llm_decide(self, messages: list[dict]) -> dict:
|
||
content = llm_chat(
|
||
self.client, self.model,
|
||
messages=messages, temperature=0.2, max_tokens=1024,
|
||
)
|
||
result = extract_json_object(content)
|
||
return result if result else {"action": "done", "reasoning": f"无法解析: {content[:100]}"}
|
||
|
||
def _format_result(self, result: dict) -> str:
|
||
if not result.get("success"):
|
||
return f"❌ 执行失败: {result.get('error', '未知错误')}"
|
||
rows = result["rows"][:20]
|
||
return (
|
||
f"查询结果:\n\n✅ 返回 {result['row_count']} 行\n"
|
||
f"列: {result['columns']}\n数据:\n{json.dumps(rows, ensure_ascii=False, indent=2)}\n\n"
|
||
f"请基于这个结果决定下一步。如果发现异常或值得深挖的点,继续查询。如果分析足够,输出 done。"
|
||
)
|