SQLite 持久连接 — sandbox 不再每次查询开关连接，改为 __init__ 时建连、close() 时释放

Explorer 的 system prompt 明确告知 sandbox 规则 — "每条 SQL 必须包含聚合函数或 LIMIT"，减少 LLM 生成违规 SQL 浪费轮次

LLM 客户端单例 — 所有组件共享一个 openai.OpenAI 实例，不再各建各的

sanitize 顺序修复 — 小样本抑制放在 float round 之前，避免被 round 干扰

quick_detect 从 O(n²) 改为 O(n) — 按列聚合一次，加去重，不再对每行重复算整列统计

历史上下文实际生效 — get_context_for 的结果现在会注入到 Explorer 的初始 prompt 里，多轮分析时 LLM 能看到之前的发现
2026-03-20 13:20:31 +08:00
parent 96927a789d
commit b7a27b12bd
39 changed files with 2637 additions and 1133 deletions
--- a/output/__init__.py
+++ b/output/__init__.py
@@ -0,0 +1 @@
+"""输出层：报告、图表、整合"""
--- a/output/chart.py
+++ b/output/chart.py
@@ -0,0 +1,247 @@
+"""
+图表生成器 —— 根据探索结果自动生成可视化图表
+"""
+import json
+import os
+import re
+from typing import Any
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.font_manager as fm
+
+from core.config import LLM_CONFIG
+from core.utils import get_llm_client, extract_json_array
+from layers.explorer import ExplorationStep
+
+
+def _setup_chinese_font():
+    candidates = [
+        "SimHei", "Microsoft YaHei", "STHeiti", "WenQuanYi Micro Hei",
+        "Noto Sans CJK SC", "PingFang SC", "Source Han Sans CN",
+    ]
+    available = {f.name for f in fm.fontManager.ttflist}
+    for font in candidates:
+        if font in available:
+            plt.rcParams["font.sans-serif"] = [font]
+            plt.rcParams["axes.unicode_minus"] = False
+            return font
+    plt.rcParams["axes.unicode_minus"] = False
+    return None
+
+_setup_chinese_font()
+
+
+# Prompt template for chart planning. It is filled in with str.format using a
+# single ``exploration_summary`` field; the doubled braces are literal JSON
+# braces escaped for str.format.
+CHART_PLAN_PROMPT = """你是一个数据可视化专家。根据以下分析结果，规划需要生成的图表。
+
+## 探索结果
+{exploration_summary}
+
+## 规划规则
+1. 每个有意义的查询结果生成 1 张图，最多 5 张
+2. 图表类型：bar / horizontal_bar / pie / line / stacked_bar
+3. 跳过数据量太少（<2 行）的结果
+4. 标题要简洁
+
+## 输出格式（纯 JSON 数组，不要代码块）
+[
+  {{
+    "step_index": 0,
+    "chart_type": "bar",
+    "title": "图表标题",
+    "x_column": "分类轴列名",
+    "y_column": "数值轴列名",
+    "y2_column": null,
+    "top_n": 10,
+    "sort_desc": true
+  }}
+]"""
+
+
+class ChartGenerator:
+    """图表生成器"""
+
+    def __init__(self, output_dir: str = "charts"):
+        self.output_dir = output_dir
+        self.client, self.model = get_llm_client(LLM_CONFIG)
+
+    def generate(self, steps: list[ExplorationStep], question: str) -> list[dict]:
+        valid_steps = [(i, s) for i, s in enumerate(steps) if s.success and s.rows and s.row_count >= 2 and s.action != "done"]
+        if not valid_steps:
+            return []
+
+        plans = self._plan_charts(valid_steps, question)
+        if not plans:
+            return []
+
+        self._clean_old_charts()
+        os.makedirs(self.output_dir, exist_ok=True)
+
+        charts = []
+        for i, plan in enumerate(plans):
+            try:
+                path = self._render_chart(plan, steps, i)
+                if path:
+                    charts.append({"path": path, "title": plan.get("title", f"图表 {i+1}")})
+            except Exception as e:
+                print(f"     ⚠️ 图表生成失败: {e}")
+        return charts
+
+    def _plan_charts(self, valid_steps: list[tuple[int, ExplorationStep]], question: str) -> list[dict]:
+        summary_parts = []
+        for idx, step in valid_steps:
+            summary_parts.append(
+                f"### 步骤 {idx}: {step.purpose}\n列: {step.columns}\n行数: {step.row_count}\n"
+                f"前 5 行: {json.dumps(step.rows[:5], ensure_ascii=False)}"
+            )
+
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {"role": "system", "content": "你是数据可视化专家。只输出纯 JSON 数组，不要 markdown 代码块。"},
+                    {"role": "user", "content": CHART_PLAN_PROMPT.format(exploration_summary="\n\n".join(summary_parts))},
+                ],
+                temperature=0.1, max_tokens=1024,
+            )
+            plans = extract_json_array(response.choices[0].message.content.strip())
+            return plans if plans else self._fallback_plan(valid_steps)
+        except Exception as e:
+            print(f"     ⚠️ 图表规划失败: {e}，使用 fallback")
+            return self._fallback_plan(valid_steps)
+
+    def _fallback_plan(self, valid_steps: list[tuple[int, ExplorationStep]]) -> list[dict]:
+        plans = []
+        for idx, step in valid_steps[:4]:
+            if len(step.columns) < 2 or step.row_count < 2:
+                continue
+            x_col = step.columns[0]
+            y_col = None
+            for col in step.columns[1:]:
+                if isinstance(step.rows[0].get(col), (int, float)):
+                    y_col = col
+                    break
+            if not y_col:
+                continue
+
+            chart_type = "bar"
+            if any(kw in x_col for kw in ("月", "日期", "时间", "month", "date")):
+                chart_type = "line"
+            elif step.row_count <= 6:
+                chart_type = "pie"
+            elif len(str(step.rows[0].get(x_col, ""))) > 10:
+                chart_type = "horizontal_bar"
+
+            plans.append({
+                "step_index": idx, "chart_type": chart_type,
+                "title": f"各{x_col}的{y_col}",
+                "x_column": x_col, "y_column": y_col,
+                "y2_column": None, "top_n": 10,
+                "sort_desc": chart_type != "line",
+            })
+        return plans
+
+    def _render_chart(self, plan: dict, steps: list[ExplorationStep], chart_idx: int) -> str | None:
+        step_idx = plan.get("step_index", 0)
+        if step_idx >= len(steps):
+            return None
+        step = steps[step_idx]
+        if not step.success or not step.rows:
+            return None
+
+        chart_type = plan.get("chart_type", "bar")
+        title = plan.get("title", f"图表 {chart_idx + 1}")
+        x_col, y_col = plan.get("x_column", ""), plan.get("y_column", "")
+        y2_col = plan.get("y2_column")
+        top_n = plan.get("top_n", 15)
+        sort_desc = plan.get("sort_desc", True)
+
+        rows = step.rows[:top_n] if top_n else step.rows
+        x_vals = [str(r.get(x_col, "")) for r in rows]
+        y_vals = [self._to_number(r.get(y_col, 0)) for r in rows]
+
+        if sort_desc and chart_type not in ("line",):
+            paired = sorted(zip(x_vals, y_vals), key=lambda p: p[1], reverse=True)
+            x_vals, y_vals = zip(*paired) if paired else ([], [])
+
+        if not x_vals or not y_vals:
+            return None
+
+        fig, ax = plt.subplots(figsize=(10, 6))
+
+        if chart_type == "bar":
+            bars = ax.bar(range(len(x_vals)), y_vals, color="#4C78A8")
+            ax.set_xticks(range(len(x_vals)))
+            ax.set_xticklabels(x_vals, rotation=45, ha="right", fontsize=9)
+            self._add_bar_labels(ax, bars)
+        elif chart_type == "horizontal_bar":
+            bars = ax.barh(range(len(x_vals)), y_vals, color="#4C78A8")
+            ax.set_yticks(range(len(x_vals)))
+            ax.set_yticklabels(x_vals, fontsize=9)
+            ax.invert_yaxis()
+        elif chart_type == "pie":
+            filtered = [(x, y) for x, y in zip(x_vals, y_vals) if y > 0]
+            if not filtered:
+                plt.close(fig)
+                return None
+            x_vals, y_vals = zip(*filtered)
+            ax.pie(y_vals, labels=x_vals, autopct="%1.1f%%", startangle=90, textprops={"fontsize": 9})
+        elif chart_type == "line":
+            ax.plot(range(len(x_vals)), y_vals, marker="o", color="#4C78A8", linewidth=2)
+            ax.set_xticks(range(len(x_vals)))
+            ax.set_xticklabels(x_vals, rotation=45, ha="right", fontsize=9)
+            ax.fill_between(range(len(x_vals)), y_vals, alpha=0.1, color="#4C78A8")
+            if y2_col:
+                y2_vals = [self._to_number(r.get(y2_col, 0)) for r in rows]
+                ax2 = ax.twinx()
+                ax2.plot(range(len(x_vals)), y2_vals, marker="s", color="#E45756", linewidth=2, linestyle="--")
+                ax2.set_ylabel(y2_col, fontsize=10, color="#E45756")
+        elif chart_type == "stacked_bar":
+            ax.bar(range(len(x_vals)), y_vals, label=y_col, color="#4C78A8")
+            if y2_col:
+                y2_vals = [self._to_number(r.get(y2_col, 0)) for r in rows]
+                ax.bar(range(len(x_vals)), y2_vals, bottom=y_vals, label=y2_col, color="#E45756")
+            ax.set_xticks(range(len(x_vals)))
+            ax.set_xticklabels(x_vals, rotation=45, ha="right", fontsize=9)
+            ax.legend()
+
+        ax.set_title(title, fontsize=13, fontweight="bold", pad=12)
+        if chart_type not in ("pie",):
+            ax.set_xlabel(x_col, fontsize=10)
+            if chart_type != "horizontal_bar":
+                ax.set_ylabel(y_col, fontsize=10)
+            ax.grid(axis="y", alpha=0.3)
+
+        plt.tight_layout()
+        fname = f"chart_{chart_idx + 1}.png"
+        fpath = os.path.join(self.output_dir, fname)
+        fig.savefig(fpath, dpi=150, bbox_inches="tight")
+        plt.close(fig)
+        return fpath
+
+    def _clean_old_charts(self):
+        if os.path.isdir(self.output_dir):
+            for f in os.listdir(self.output_dir):
+                if f.endswith(".png"):
+                    try:
+                        os.remove(os.path.join(self.output_dir, f))
+                    except OSError:
+                        pass
+
+    def _add_bar_labels(self, ax, bars):
+        for bar in bars:
+            h = bar.get_height()
+            if h > 0:
+                label = f"{h:.1f}" if isinstance(h, float) else str(int(h))
+                ax.text(bar.get_x() + bar.get_width() / 2, h, label, ha="center", va="bottom", fontsize=8)
+
+    def _to_number(self, val) -> float:
+        if isinstance(val, (int, float)):
+            return float(val)
+        if isinstance(val, str):
+            try:
+                return float(val.replace("<", "").replace(",", "").strip())
+            except ValueError:
+                return 0.0
+        return 0.0
--- a/output/consolidator.py
+++ b/output/consolidator.py
@@ -0,0 +1,84 @@
+"""
+报告整合器 —— 将多次分析结果合并为一份完整报告
+"""
+import json
+
+from core.config import LLM_CONFIG
+from core.utils import get_llm_client
+from layers.context import AnalysisSession
+
+
+# Prompt template for the consolidated report. It is filled in with str.format
+# using the fields ``question``, ``sections`` and ``charts_text``.
+CONSOLIDATE_PROMPT = """你是一个高级数据分析总监。下面是你的团队针对同一份数据做的多次分析，请整合为一份完整的综合报告。
+
+## 核心问题
+{question}
+
+## 各次分析结果
+{sections}
+
+## 可用图表
+{charts_text}
+
+## 整合要求
+1. **执行摘要**：3-5 句话概括全局结论
+2. **核心发现**：从所有分析中提炼最重要的发现，去重，按重要性排列
+3. **交叉洞察**：不同维度之间的关联
+4. **图表引用**：用 `![标题](路径)` 嵌入相关段落
+5. **风险与建议**：按优先级排列
+6. **数据附录**：关键统计数字
+
+中文，专业简报风格。先结论后细节。"""
+
+
+class ReportConsolidator:
+    """报告整合器"""
+
+    def __init__(self):
+        self.client, self.model = get_llm_client(LLM_CONFIG)
+
+    def consolidate(self, sessions: list[AnalysisSession], question: str = "",
+                    charts: list[dict] | None = None) -> str:
+        if not sessions:
+            return "（无分析数据可整合）"
+        if not question:
+            question = sessions[0].question
+
+        sections = self._build_sections(sessions)
+        charts_text = "\n".join(f"{i}. {c['title']}: {c['path']}" for i, c in enumerate(charts or [], 1)) or "无图表。"
+
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {"role": "system", "content": "你是高级数据分析总监，整合多维度分析结果。"},
+                    {"role": "user", "content": CONSOLIDATE_PROMPT.format(question=question, sections=sections, charts_text=charts_text)},
+                ],
+                temperature=0.3, max_tokens=4096,
+            )
+            return response.choices[0].message.content
+        except Exception as e:
+            print(f"  ⚠️ LLM 整合失败: {e}，使用拼接模式")
+            return self._fallback_concat(sessions, charts)
+
+    def _build_sections(self, sessions: list[AnalysisSession]) -> str:
+        parts = []
+        for i, session in enumerate(sessions, 1):
+            section = f"### 分析 {i}: {session.question}\n"
+            section += f"类型: {session.plan.get('analysis_type', '未知')}\n\n"
+            for step in session.steps:
+                if not step.success or not step.rows or step.action == "done":
+                    continue
+                section += f"- {step.purpose} ({step.row_count} 行)\n"
+                section += f"  数据: {json.dumps(step.rows[:8], ensure_ascii=False)}\n\n"
+            if session.insights:
+                section += "#### 洞察\n" + "\n".join(f"- {i}" for i in session.insights) + "\n"
+            parts.append(section)
+        return "\n---\n".join(parts)
+
+    def _fallback_concat(self, sessions: list[AnalysisSession], charts: list[dict] | None) -> str:
+        parts = ["# 综合分析报告\n"]
+        for i, s in enumerate(sessions, 1):
+            parts.append(f"## 第 {i} 部分: {s.question}\n{s.report}\n")
+        if charts:
+            parts.append("\n## 可视化\n" + "\n".join(f"![{c['title']}]({c['path']})" for c in charts))
+        return "\n".join(parts)
--- a/output/reporter.py
+++ b/output/reporter.py
@@ -0,0 +1,84 @@
+"""
+报告生成器 —— 单次分析报告
+"""
+import json
+from typing import Any
+
+from core.config import LLM_CONFIG
+from core.utils import get_llm_client
+from layers.explorer import ExplorationStep
+from layers.insights import Insight
+
+
+# Prompt template for a single-analysis report. It is filled in with str.format
+# using the fields ``question``, ``plan``, ``exploration``, ``insights_text``
+# and ``charts_text``.
+REPORT_PROMPT = """你是一个数据分析报告撰写专家。基于以下信息撰写报告。
+
+## 用户问题
+{question}
+
+## 分析计划
+{plan}
+
+## 探索过程
+{exploration}
+
+## 主动洞察
+{insights_text}
+
+## 可用图表
+{charts_text}
+
+## 撰写要求
+1. **开头**：一句话总结核心结论
+2. **核心发现**：按重要性排列，带具体数字
+3. **图表引用**：用 `![标题](路径)` 嵌入到相关段落
+4. **深入洞察**：异常、趋势、关联
+5. **建议**：基于数据的行动建议
+6. **审计**：末尾附上所有 SQL
+
+中文，专业简报风格。图表自然嵌入对应段落。"""
+
+
+class ReportGenerator:
+    """报告生成器"""
+
+    def __init__(self):
+        self.client, self.model = get_llm_client(LLM_CONFIG)
+
+    def generate(self, question: str, plan: dict, steps: list[ExplorationStep],
+                 insights: list[Insight], charts: list[dict] | None = None) -> str:
+        exploration = self._build_exploration(steps)
+        insights_text = "\n".join(str(i) for i in insights) if insights else "未检测到异常。"
+        charts_text = "\n".join(f"{i}. 标题: {c['title']}, 路径: {c['path']}" for i, c in enumerate(charts or [], 1)) or "无图表。"
+
+        prompt = REPORT_PROMPT.format(
+            question=question,
+            plan=json.dumps(plan, ensure_ascii=False, indent=2),
+            exploration=exploration,
+            insights_text=insights_text,
+            charts_text=charts_text,
+        )
+
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "你是专业的数据分析师，撰写清晰、有洞察力的分析报告。"},
+                {"role": "user", "content": prompt},
+            ],
+            temperature=0.3, max_tokens=4096,
+        )
+        return response.choices[0].message.content
+
+    def _build_exploration(self, steps: list[ExplorationStep]) -> str:
+        parts = []
+        for step in steps:
+            if step.action == "done":
+                parts.append(f"### 结束\n{step.reasoning}")
+            elif step.success:
+                parts.append(
+                    f"### 第 {step.round_num} 轮：{step.purpose}\n"
+                    f"SQL: `{step.sql}`\n结果 ({step.row_count} 行):\n"
+                    f"数据: {json.dumps(step.rows, ensure_ascii=False)}"
+                )
+            else:
+                parts.append(f"### 第 {step.round_num} 轮：{step.purpose}\nSQL: `{step.sql}`\n失败: {step.error}")
+        return "\n\n".join(parts) if parts else "无探索步骤"