SQLite 持久连接 — sandbox 不再每次查询开关连接,改为 __init__ 时建连、close() 时释放
- Explorer 的 system prompt 明确告知 sandbox 规则 — "每条 SQL 必须包含聚合函数或 LIMIT",减少 LLM 生成违规 SQL 浪费轮次
- LLM 客户端单例 — 所有组件共享一个 openai.OpenAI 实例,不再各建各的
- sanitize 顺序修复 — 小样本抑制放在 float round 之前,避免被 round 干扰
- quick_detect 从 O(n²) 改为 O(n) — 按列聚合一次,加去重,不再对每行重复算整列统计
- 历史上下文实际生效 — get_context_for 的结果现在会注入到 Explorer 的初始 prompt 里,多轮分析时 LLM 能看到之前的发现
This commit is contained in:
1
layers/__init__.py
Normal file
1
layers/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""分析层:Planner → Playbook → Explorer → Insights → Context"""
|
||||
80
layers/context.py
Normal file
80
layers/context.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""
|
||||
Layer 4: 上下文管理器
|
||||
"""
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from layers.explorer import ExplorationStep
|
||||
from layers.insights import Insight
|
||||
|
||||
|
||||
@dataclass
class AnalysisSession:
    """Complete record of one analysis run: question, plan, steps, insights, report."""
    question: str
    plan: dict
    steps: list[ExplorationStep]
    insights: list[Insight]
    report: str
    timestamp: float = field(default_factory=time.time)  # creation time, epoch seconds

    def summary(self) -> str:
        """Build a compact markdown summary: question, plan metadata, top findings, insights."""
        lines = [f"**问题**: {self.question}"]
        if self.plan:
            lines.append(f"**分析类型**: {self.plan.get('analysis_type', 'unknown')}")
            lines.append(f"**维度**: {', '.join(self.plan.get('dimensions', []))}")
        findings = []
        for step in self.steps:
            if not (step.success and step.rows):
                continue
            # First result row is treated as the headline finding; drop id-like keys.
            first = step.rows[0]
            cells = ", ".join(f"{k}={v}" for k, v in first.items() if k.lower() != "id")
            findings.append(f"{step.purpose}: {cells}")
        if findings:
            lines.append("**核心发现**:")
            lines.extend(f" - {f}" for f in findings[:5])
        if self.insights:
            lines.append("**洞察**:")
            lines.extend(f" - {i}" for i in self.insights[:3])
        return "\n".join(lines)

    def to_reference_text(self) -> str:
        """Render this session as reference text for injection into later prompts."""
        successful = "\n".join(
            f"- {s.purpose}: {s.row_count} 行" for s in self.steps if s.success
        )
        return (
            f"## 之前的分析\n### 问题\n{self.question}\n### 摘要\n{self.summary()}\n### 发现\n"
            + successful
        )
|
||||
|
||||
|
||||
class ContextManager:
    """Keeps a bounded rolling history of AnalysisSession records."""

    def __init__(self, max_history: int = 10):
        # Oldest sessions are evicted once the cap is exceeded.
        self.sessions: list[AnalysisSession] = []
        self.max_history = max_history

    def add_session(self, question: str, plan: dict, steps: list[ExplorationStep],
                    insights: list[Insight], report: str) -> AnalysisSession:
        """Record one finished analysis and return the stored session."""
        record = AnalysisSession(question=question, plan=plan, steps=steps,
                                 insights=insights, report=report)
        self.sessions.append(record)
        overflow = len(self.sessions) - self.max_history
        if overflow > 0:
            del self.sessions[:overflow]
        return record

    def get_context_for(self, new_question: str) -> Optional[str]:
        """Reference text from the two most recent sessions, or None when empty."""
        if not self.sessions:
            return None
        recent = self.sessions[-2:]
        return "\n\n---\n\n".join(s.to_reference_text() for s in recent)

    def get_history_summary(self) -> str:
        """One line per stored session: index, HH:MM timestamp, question."""
        if not self.sessions:
            return "(无历史分析)"
        out = [f"共 {len(self.sessions)} 次分析:"]
        for idx, sess in enumerate(self.sessions, 1):
            stamp = time.strftime("%H:%M", time.localtime(sess.timestamp))
            out.append(f" {idx}. [{stamp}] {sess.question}")
        return "\n".join(out)

    def clear(self):
        """Drop all stored sessions."""
        self.sessions.clear()
|
||||
224
layers/explorer.py
Normal file
224
layers/explorer.py
Normal file
@@ -0,0 +1,224 @@
|
||||
"""
|
||||
Layer 2: 自适应探索器
|
||||
"""
|
||||
import json
|
||||
from typing import Any
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from core.config import LLM_CONFIG
|
||||
from core.utils import get_llm_client, extract_json_object
|
||||
from core.sandbox import SandboxExecutor
|
||||
|
||||
|
||||
EXPLORER_SYSTEM = """你是一个数据分析执行者。你的上级给了你一个分析计划,你需要通过迭代执行 SQL 来完成分析。
|
||||
|
||||
## 你的工作方式
|
||||
每一轮你看到:
|
||||
1. 分析计划(上级给的目标)
|
||||
2. 数据库 Schema(表结构、数据画像)
|
||||
3. 之前的探索历史(查过什么、得到什么结果)
|
||||
|
||||
你决定下一步:
|
||||
- 输出一条 SQL 继续探索
|
||||
- 或者输出 done 表示分析足够
|
||||
|
||||
## 输出格式(严格 JSON)
|
||||
```json
|
||||
{
|
||||
"action": "query",
|
||||
"reasoning": "为什么要做这个查询",
|
||||
"sql": "SELECT ...",
|
||||
"purpose": "这个查询的目的"
|
||||
}
|
||||
```
|
||||
|
||||
或:
|
||||
```json
|
||||
{
|
||||
"action": "done",
|
||||
"reasoning": "为什么分析已经足够"
|
||||
}
|
||||
```
|
||||
|
||||
## SQL 规则(严格遵守,否则会被沙箱拒绝)
|
||||
- 只用 SELECT
|
||||
- 每条 SQL 必须包含聚合函数(COUNT/SUM/AVG/MIN/MAX)或 GROUP BY 或 LIMIT
|
||||
- 禁止 SELECT *
|
||||
- 用 ROUND 控制精度
|
||||
- 合理使用 LIMIT(分组结果 15 行以内,时间序列 60 行以内)
|
||||
- 如果需要查看明细数据,必须加 LIMIT
|
||||
|
||||
## 探索策略
|
||||
1. 第一轮:验证核心假设
|
||||
2. 后续轮:基于已有结果追问
|
||||
3. 不要重复查已经确认的事
|
||||
4. 每轮要有新发现,否则就该结束"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExplorationStep:
|
||||
"""单步探索结果"""
|
||||
round_num: int = 0
|
||||
reasoning: str = ""
|
||||
purpose: str = ""
|
||||
sql: str = ""
|
||||
action: str = "query"
|
||||
success: bool = False
|
||||
error: str | None = None
|
||||
columns: list[str] = field(default_factory=list)
|
||||
rows: list[dict] = field(default_factory=list)
|
||||
row_count: int = 0
|
||||
|
||||
@classmethod
|
||||
def from_decision(cls, round_num: int, decision: dict, result: dict) -> "ExplorationStep":
|
||||
return cls(
|
||||
round_num=round_num,
|
||||
reasoning=decision.get("reasoning", ""),
|
||||
purpose=decision.get("purpose", ""),
|
||||
sql=decision.get("sql", ""),
|
||||
action=decision.get("action", "query"),
|
||||
success=result.get("success", False),
|
||||
error=result.get("error"),
|
||||
columns=result.get("columns", []),
|
||||
rows=result.get("rows", []),
|
||||
row_count=result.get("row_count", 0),
|
||||
)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
d = {
|
||||
"round": self.round_num, "action": self.action,
|
||||
"reasoning": self.reasoning, "purpose": self.purpose,
|
||||
"sql": self.sql, "success": self.success,
|
||||
}
|
||||
if self.success:
|
||||
d["result"] = {"columns": self.columns, "rows": self.rows, "row_count": self.row_count}
|
||||
else:
|
||||
d["result"] = {"error": self.error}
|
||||
return d
|
||||
|
||||
|
||||
class Explorer:
    """Adaptive explorer (Layer 2).

    Iteratively asks the LLM for the next SQL query, executes it in the
    sandbox, feeds the result back, and stops when the LLM answers "done"
    or the round budget is exhausted.
    """

    def __init__(self, executor: SandboxExecutor):
        self.executor = executor  # sandbox that validates and executes SQL
        self.client, self.model = get_llm_client(LLM_CONFIG)

    def explore(
        self, plan: dict, schema_text: str,
        max_rounds: int = 6, playbook_result: dict | None = None,
    ) -> list[ExplorationStep]:
        """Run preset playbook queries (if any), then adaptive LLM-driven rounds.

        Returns every ExplorationStep in execution order, including failed
        queries and the terminating "done" step when the LLM stops early.
        """
        steps: list[ExplorationStep] = []

        # Phase A: preset queries from a matched playbook.
        preset_context = ""
        if playbook_result and playbook_result.get("preset_queries"):
            preset_steps = self._run_preset_queries(playbook_result["preset_queries"])
            steps.extend(preset_steps)
            preset_context = self._build_preset_context(preset_steps, playbook_result)

        # Phase B: adaptive exploration. Successful presets consume the
        # budget, but always leave at least one adaptive round.
        preset_used = len([s for s in steps if s.success])
        remaining = max(1, max_rounds - preset_used)

        # BUGFIX: pop the transient "_prev_context" key BEFORE the plan is
        # serialized below. Previously it was popped only after json.dumps,
        # so the (potentially long) history text was duplicated inside the
        # plan JSON shown to the LLM instead of appearing once as its own
        # "## 历史分析参考" section.
        prev_context = plan.pop("_prev_context", None)

        initial = (
            f"## 分析计划\n```json\n{json.dumps(plan, ensure_ascii=False, indent=2)}\n```\n\n"
            f"## 数据库 Schema\n{schema_text}\n\n"
        )

        # Inject context carried over from earlier analysis sessions.
        if prev_context:
            initial += f"## 历史分析参考\n{prev_context}\n\n"

        if preset_context:
            initial += (
                f"## 预设分析结果(已执行)\n{preset_context}\n\n"
                f"请基于这些已有数据,决定是否需要进一步探索。\n"
                f"重点关注:预设结果中的异常、值得深挖的点。\n"
                f"如果预设结果已经足够,直接输出 done。"
            )
            if playbook_result.get("exploration_hints"):
                initial += f"\n\n## 探索提示\n{playbook_result['exploration_hints']}"
        else:
            initial += "请开始第一轮探索。根据计划,先执行最关键的查询。"

        messages = [
            {"role": "system", "content": EXPLORER_SYSTEM},
            {"role": "user", "content": initial},
        ]

        # Round numbering continues after the preset steps.
        offset = len(steps)
        for round_num in range(offset + 1, offset + remaining + 1):
            print(f"\n 🔄 探索第 {round_num}/{max_rounds} 轮")

            decision = self._llm_decide(messages)
            reasoning = decision.get("reasoning", "")
            print(f" 💭 {reasoning[:80]}{'...' if len(reasoning) > 80 else ''}")

            if decision.get("action") == "done":
                print(f" ✅ 探索完成")
                steps.append(ExplorationStep.from_decision(round_num, decision, {"success": True}))
                break

            sql = decision.get("sql", "")
            if not sql:
                # Unusable decision (no SQL and not "done"): skip this round.
                continue

            print(f" 📝 {decision.get('purpose', '')}")
            try:
                result = self.executor.execute(sql)
            except Exception as e:
                # Keep the loop alive; the failure is shown to the LLM below.
                result = {"success": False, "error": str(e), "sql": sql}
            print(f" {'✅' if result['success'] else '❌'} {result.get('row_count', result.get('error', ''))}")

            steps.append(ExplorationStep.from_decision(round_num, decision, result))

            # Feed decision + result back so the next round sees the history.
            messages.append({"role": "assistant", "content": json.dumps(decision, ensure_ascii=False)})
            messages.append({"role": "user", "content": self._format_result(result)})

        return steps

    def _run_preset_queries(self, preset_queries: list[dict]) -> list[ExplorationStep]:
        """Execute playbook preset queries verbatim; never raises."""
        steps = []
        for i, pq in enumerate(preset_queries, 1):
            sql, purpose = pq["sql"], pq.get("purpose", f"预设查询 {i}")
            print(f"\n 📌 预设查询 {i}/{len(preset_queries)}: {purpose}")
            try:
                result = self.executor.execute(sql)
            except Exception as e:
                result = {"success": False, "error": str(e), "sql": sql}
            decision = {"action": "query", "reasoning": f"[预设] {purpose}", "sql": sql, "purpose": purpose}
            steps.append(ExplorationStep.from_decision(i, decision, result))
            print(f" {'✅' if result['success'] else '❌'} {result.get('row_count', result.get('error', ''))}")
        return steps

    def _build_preset_context(self, steps: list[ExplorationStep], playbook_result: dict) -> str:
        """Summarize preset query outcomes (capped at 15 rows each) for the prompt."""
        parts = [f"Playbook: {playbook_result.get('playbook_name', '未知')}"]
        for step in steps:
            if step.success:
                parts.append(
                    f"### {step.purpose}\nSQL: `{step.sql}`\n"
                    f"结果 ({step.row_count} 行): {json.dumps(step.rows[:15], ensure_ascii=False)}"
                )
            else:
                parts.append(f"### {step.purpose}\nSQL: `{step.sql}`\n执行失败: {step.error}")
        return "\n\n".join(parts)

    def _llm_decide(self, messages: list[dict]) -> dict:
        """One LLM call; falls back to a 'done' decision when JSON parsing fails."""
        response = self.client.chat.completions.create(
            model=self.model, messages=messages, temperature=0.2, max_tokens=1024,
        )
        content = response.choices[0].message.content.strip()
        result = extract_json_object(content)
        return result if result else {"action": "done", "reasoning": f"无法解析: {content[:100]}"}

    def _format_result(self, result: dict) -> str:
        """Render an execution result (max 20 rows) as the next user message."""
        if not result.get("success"):
            return f"❌ 执行失败: {result.get('error', '未知错误')}"
        rows = result["rows"][:20]
        return (
            f"查询结果:\n\n✅ 返回 {result['row_count']} 行\n"
            f"列: {result['columns']}\n数据:\n{json.dumps(rows, ensure_ascii=False, indent=2)}\n\n"
            f"请基于这个结果决定下一步。如果发现异常或值得深挖的点,继续查询。如果分析足够,输出 done。"
        )
|
||||
148
layers/insights.py
Normal file
148
layers/insights.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
Layer 3: 洞察引擎
|
||||
"""
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from core.config import LLM_CONFIG
|
||||
from core.utils import get_llm_client, extract_json_array
|
||||
from layers.explorer import ExplorationStep
|
||||
|
||||
|
||||
INSIGHT_SYSTEM = """你是一个数据洞察专家。你会收到探索过程的所有结果,你需要:
|
||||
|
||||
1. 从结果中发现异常和有趣现象
|
||||
2. 对比不同维度,找出差异
|
||||
3. 输出用户可能没问但值得知道的洞察
|
||||
|
||||
## 输出格式(严格 JSON 数组)
|
||||
```json
|
||||
[
|
||||
{
|
||||
"type": "outlier" | "trend" | "distribution" | "correlation" | "recommendation",
|
||||
"severity": "high" | "medium" | "low",
|
||||
"title": "简短标题",
|
||||
"detail": "详细描述,包含具体数字",
|
||||
"evidence": "支撑这个洞察的数据来源"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## 分析原则
|
||||
- 每个洞察必须有具体数字支撑
|
||||
- 用对比来说话(A 比 B 高 X%)
|
||||
- 关注异常,不描述平淡的事实
|
||||
- 如果没有异常,返回空数组"""
|
||||
|
||||
|
||||
class Insight:
    """A single insight parsed from the LLM's JSON output."""

    def __init__(self, data: dict):
        """Populate fields from an LLM-produced dict, with safe defaults."""
        self.type = data.get("type", "unknown")
        self.severity = data.get("severity", "low")
        self.title = data.get("title", "")
        self.detail = data.get("detail", "")
        self.evidence = data.get("evidence", "")

    @property
    def emoji(self) -> str:
        """Icon for the insight type (fallback: 📌)."""
        icons = {
            "outlier": "⚠️",
            "trend": "📈",
            "distribution": "📊",
            "correlation": "🔗",
            "recommendation": "💡",
        }
        return icons.get(self.type, "📌")

    @property
    def severity_emoji(self) -> str:
        """Traffic-light icon for severity (empty string if unknown)."""
        return {"high": "🔴", "medium": "🟡", "low": "🟢"}.get(self.severity, "")

    def __str__(self):
        return f"{self.emoji} {self.severity_emoji} {self.title}: {self.detail}"
|
||||
|
||||
|
||||
class InsightEngine:
    """Insight engine (Layer 3): turns exploration results into Insight objects via one LLM call."""

    def __init__(self):
        # Shared LLM client and model name obtained from the global config.
        self.client, self.model = get_llm_client(LLM_CONFIG)

    def analyze(self, steps: list[ExplorationStep], question: str) -> list[Insight]:
        """Send the full exploration history to the LLM and parse a JSON array of insights.

        Returns an empty list when there are no steps or the response yields
        no parseable JSON array (extract_json_array's fallback).
        """
        if not steps:
            return []

        history = self._build_history(steps)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": INSIGHT_SYSTEM},
                {"role": "user", "content": f"## 用户问题\n{question}\n\n## 探索历史\n{history}\n\n请分析以上数据,输出异常和洞察。"},
            ],
            temperature=0.3, max_tokens=2048,
        )
        content = response.choices[0].message.content.strip()
        return [Insight(d) for d in extract_json_array(content)]

    def format_insights(self, insights: list[Insight]) -> str:
        """Render insights as a markdown section, highest severity first."""
        if not insights:
            return ""
        # Unknown severities sort last (rank 9).
        severity_order = {"high": 0, "medium": 1, "low": 2}
        sorted_insights = sorted(insights, key=lambda i: severity_order.get(i.severity, 9))
        lines = ["## 💡 主动洞察", "", "_以下是你没问但数据告诉我们的事:_\n"]
        for insight in sorted_insights:
            lines.append(f"**{insight.emoji} {insight.title}** {insight.severity_emoji}")
            lines.append(f" {insight.detail}")
            lines.append(f" _数据来源: {insight.evidence}_")
            lines.append("")
        return "\n".join(lines)

    def _build_history(self, steps: list[ExplorationStep]) -> str:
        """Format each step (success, failure, or terminating 'done') as a markdown section."""
        parts = []
        for step in steps:
            if step.action == "done":
                parts.append(f"### 结束\n{step.reasoning}")
            elif step.success:
                parts.append(
                    f"### 第 {step.round_num} 轮:{step.purpose}\n"
                    f"SQL: `{step.sql}`\n结果 ({step.row_count} 行):\n"
                    f"数据: {json.dumps(step.rows, ensure_ascii=False)}"
                )
            else:
                parts.append(f"### 第 {step.round_num} 轮:{step.purpose}\nSQL: `{step.sql}`\n失败: {step.error}")
        return "\n\n".join(parts)
|
||||
|
||||
|
||||
def quick_detect(steps: list[ExplorationStep]) -> list[str]:
    """Rule-based fast anomaly scan over exploration results; no LLM calls.

    Two heuristics, one pass per column:
      * percentage-like columns where some group exceeds 50% (concentration),
      * count-like columns (>= 3 values) where the max is over 3x the mean.
    At most one alert per (rule, step.purpose) pair, deduplicated.
    """
    alerts: list[str] = []
    emitted = set()  # dedup keys already reported

    pct_names = ("pct", "percent", "percentage", "占比")
    count_names = ("count", "cnt", "n", "total", "order_count")

    for step in steps:
        if not step.success or not step.rows:
            continue

        for col in step.columns:
            numeric = [row.get(col) for row in step.rows
                       if isinstance(row.get(col), (int, float))]
            if not numeric:
                continue

            name = col.lower()

            # Percentage column: flag the first group above 50%.
            if name in pct_names:
                for value in numeric:
                    if value > 50:
                        key = f"pct_{step.purpose}"
                        if key not in emitted:
                            emitted.add(key)
                            alerts.append(f"⚠️ {step.purpose} 中某个分组占比 {value}%,集中度过高")
                        break

            # Count column: flag a max far above the mean.
            if name in count_names and len(numeric) >= 3:
                mean = sum(numeric) / len(numeric)
                if mean > 0:
                    spread = max(numeric) / mean
                    if spread > 3:
                        key = f"count_{step.purpose}"
                        if key not in emitted:
                            emitted.add(key)
                            alerts.append(f"⚠️ {step.purpose} 中最大值是均值的 {spread:.1f} 倍")

    return alerts
|
||||
74
layers/planner.py
Normal file
74
layers/planner.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
Layer 1: 意图规划器
|
||||
"""
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from core.config import LLM_CONFIG
|
||||
from core.utils import get_llm_client, extract_json_object
|
||||
|
||||
PROMPT = """你是一个数据分析规划专家。
|
||||
|
||||
## 你的任务
|
||||
根据用户的分析问题和数据库 Schema,生成一个结构化的分析计划。
|
||||
|
||||
## 输出格式(严格 JSON)
|
||||
```json
|
||||
{
|
||||
"intent": "一句话描述用户想了解什么",
|
||||
"analysis_type": "ranking" | "distribution" | "trend" | "comparison" | "anomaly" | "overview",
|
||||
"primary_table": "主要分析的表名",
|
||||
"dimensions": ["分组维度列名"],
|
||||
"metrics": ["需要聚合的数值列名"],
|
||||
"aggregations": ["SUM", "AVG", "COUNT", ...],
|
||||
"filters": [{"column": "列名", "condition": "过滤条件(可选)"}],
|
||||
"join_needed": false,
|
||||
"join_info": {"tables": [], "on": ""},
|
||||
"expected_rounds": 3,
|
||||
"rationale": "为什么这样规划,需要关注什么"
|
||||
}
|
||||
```
|
||||
|
||||
## 分析类型说明
|
||||
- ranking: 按某维度排名
|
||||
- distribution: 分布/占比
|
||||
- trend: 时间趋势
|
||||
- comparison: 对比分析
|
||||
- anomaly: 异常检测
|
||||
- overview: 全局概览
|
||||
|
||||
## 规划原则
|
||||
1. 只选择与问题相关的表和列
|
||||
2. 如果需要 JOIN,说明关联条件
|
||||
3. 预估需要几轮探索(1-6)
|
||||
4. 标注可能的异常关注点
|
||||
5. metrics 不要包含 id 列"""
|
||||
|
||||
|
||||
class Planner:
    """Intent planner (Layer 1): turns a user question plus schema into a structured plan."""

    def __init__(self):
        self.client, self.model = get_llm_client(LLM_CONFIG)

    def plan(self, question: str, schema_text: str) -> dict[str, Any]:
        """Ask the LLM for an analysis plan, then normalize it with safe defaults."""
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": f"## Schema\n{schema_text}\n\n## 用户问题\n{question}"},
            ],
            temperature=0.1,
            max_tokens=1024,
        )
        raw = completion.choices[0].message.content.strip()
        parsed = extract_json_object(raw)

        # Fallback: degrade to a minimal overview plan instead of failing.
        if not parsed:
            parsed = {"intent": raw[:100], "analysis_type": "overview"}

        # Guarantee the keys downstream layers rely on.
        for key, default in (
            ("analysis_type", "overview"),
            ("expected_rounds", 3),
            ("filters", []),
            ("join_needed", False),
        ):
            parsed.setdefault(key, default)
        return parsed
|
||||
179
layers/playbook.py
Normal file
179
layers/playbook.py
Normal file
@@ -0,0 +1,179 @@
|
||||
"""
|
||||
Layer 1.5: 预设分析剧本
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from core.config import LLM_CONFIG
|
||||
from core.utils import get_llm_client, extract_json_object, extract_json_array
|
||||
|
||||
|
||||
class Playbook:
    """One preset analysis playbook: canned queries plus follow-up exploration hints."""

    def __init__(self, data: dict):
        # "name" and "description" are mandatory; everything else is optional.
        self.name = data["name"]
        self.description = data["description"]
        self.tags = data.get("tags", [])
        self.preset_queries: list[dict] = data.get("preset_queries", [])
        self.exploration_hints = data.get("exploration_hints", "")
        self.placeholders = data.get("placeholders", {})

    def to_summary(self) -> str:
        """One-line description used when listing playbooks to the matcher LLM."""
        tag_text = ", ".join(self.tags)
        return f"[{self.name}] {self.description} (标签: {tag_text})"

    def render_queries(self, schema: dict) -> list[dict]:
        """Substitute {{placeholder}} tokens into each preset query's sql and purpose."""
        rendered = []
        for query in self.preset_queries:
            sql = query["sql"]
            purpose = query.get("purpose", "")
            for token, replacement in self.placeholders.items():
                marker = "{{" + token + "}}"
                sql = sql.replace(marker, replacement)
                purpose = purpose.replace(marker, replacement)
            rendered.append({"sql": sql, "purpose": purpose})
        return rendered
|
||||
|
||||
|
||||
class PlaybookManager:
    """Loads playbooks from disk, auto-generates new ones via LLM, and matches
    analysis plans against the loaded set."""

    def __init__(self, playbook_dir: str = ""):
        self.playbooks: list[Playbook] = []
        self.client, self.model = get_llm_client(LLM_CONFIG)
        # Directory loading is optional; a missing/empty dir is skipped silently.
        if playbook_dir and os.path.isdir(playbook_dir):
            self._load_from_dir(playbook_dir)

    def _load_from_dir(self, dir_path: str):
        """Load every *.json file; each file may hold one playbook or a list of them."""
        for fname in sorted(os.listdir(dir_path)):
            if not fname.endswith(".json"):
                continue
            try:
                with open(os.path.join(dir_path, fname), "r", encoding="utf-8") as f:
                    data = json.load(f)
                items = data if isinstance(data, list) else [data]
                for item in items:
                    self.playbooks.append(Playbook(item))
            except (json.JSONDecodeError, KeyError) as e:
                # Malformed file: report and keep loading the rest.
                print(f" ⚠️ 加载 playbook 失败 {fname}: {e}")

    def add(self, playbook: Playbook):
        """Register a playbook constructed in code."""
        self.playbooks.append(playbook)

    def auto_generate(self, schema_text: str, save_dir: str = "") -> list[Playbook]:
        """Ask the LLM to generate 3-5 playbooks from the schema.

        Valid playbooks are appended to self.playbooks and, when save_dir is
        given, persisted one file each. Best-effort: returns [] on any
        LLM/parse error so startup never fails because of generation.
        """
        prompt = f"""你是一个数据分析专家。根据以下数据库 Schema,生成 3-5 个预设分析剧本。

## 数据库 Schema
{schema_text}

## 输出格式(严格 JSON 数组)
```json
[
  {{
    "name": "剧本名称",
    "description": "一句话描述",
    "tags": ["关键词1", "关键词2"],
    "preset_queries": [
      {{"purpose": "查询目的", "sql": "SELECT ... GROUP BY ..."}}
    ],
    "exploration_hints": "后续探索提示"
  }}
]
```

## SQL 规则
- 只用 SELECT,必须有聚合函数或 GROUP BY
- 禁止 SELECT *,用 ROUND 控制精度,合理 LIMIT
- 直接使用实际表名和列名"""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "你是数据分析专家。只输出 JSON,不要其他内容。"},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3, max_tokens=4096,
            )
            content = response.choices[0].message.content.strip()
            playbooks_data = extract_json_array(content)
            if not playbooks_data:
                return []

            generated = []
            for i, pb_data in enumerate(playbooks_data):
                # Fill optional fields so Playbook() only requires name/description.
                pb_data.setdefault("tags", [])
                pb_data.setdefault("exploration_hints", "")
                pb_data.setdefault("placeholders", {})
                try:
                    pb = Playbook(pb_data)
                    self.playbooks.append(pb)
                    generated.append(pb)
                    if save_dir:
                        os.makedirs(save_dir, exist_ok=True)
                        # Filename-safe name: keep word chars and CJK, replace the rest.
                        safe = re.sub(r'[^\w\u4e00-\u9fff]', '_', pb.name)[:30]
                        fpath = os.path.join(save_dir, f"auto_{i+1}_{safe}.json")
                        with open(fpath, "w", encoding="utf-8") as f:
                            json.dump(pb_data, f, ensure_ascii=False, indent=2)
                except (KeyError, TypeError) as e:
                    print(f" ⚠️ 跳过无效 Playbook: {e}")
            return generated
        except Exception as e:
            # Broad catch is deliberate: generation is an optional enhancement.
            print(f" ⚠️ 自动生成 Playbook 出错: {e}")
            return []

    def match(self, plan: dict, schema_text: str) -> Optional[dict]:
        """Ask the LLM whether the current plan fits one of the loaded playbooks.

        Returns a dict with the rendered preset queries and hints on a match,
        or None on no-match, out-of-range index, or any error (best-effort).
        """
        if not self.playbooks:
            return None

        pb_summaries = []
        for i, pb in enumerate(self.playbooks):
            queries_desc = "\n".join(f" - {q.get('purpose', '')}: {q['sql'][:100]}" for q in pb.preset_queries)
            pb_summaries.append(f"{i+1}. {pb.to_summary()}\n 预设查询:\n{queries_desc}")

        prompt = f"""判断当前分析计划是否适合使用某个预设剧本。

## 分析计划
```json
{json.dumps(plan, ensure_ascii=False, indent=2)}
```

## Schema
{schema_text}

## 可用剧本
{chr(10).join(pb_summaries)}

## 输出(严格 JSON)
匹配: {{"matched": true, "playbook_index": 1, "reasoning": "原因", "placeholders": {{}}}}
不匹配: {{"matched": false, "reasoning": "原因"}}"""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "你是分析计划匹配器。"},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1, max_tokens=512,
            )
            result = extract_json_object(response.choices[0].message.content.strip())
            if not result.get("matched"):
                return None

            # The LLM returns a 1-based index into the summary list.
            idx = result.get("playbook_index", 1) - 1
            if idx < 0 or idx >= len(self.playbooks):
                return None

            pb = self.playbooks[idx]
            # LLM-proposed placeholder values override the playbook defaults.
            pb.placeholders = {**pb.placeholders, **result.get("placeholders", {})}
            return {
                "matched": True, "playbook_name": pb.name,
                "reasoning": result.get("reasoning", ""),
                "preset_queries": pb.render_queries({}),
                "exploration_hints": pb.exploration_hints,
            }
        except Exception as e:
            # Broad catch is deliberate: matching failure just means "no playbook".
            print(f" ⚠️ Playbook 匹配出错: {e}")
            return None
|
||||
Reference in New Issue
Block a user