feat: 四层架构数据分析 Agent

- Layer 1 Planner: 意图规划，将问题转为结构化分析计划
- Layer 2 Explorer: 自适应探索循环，多轮迭代动态生成 SQL
- Layer 3 InsightEngine: 异常检测 + 主动洞察
- Layer 4 ContextManager: 多轮对话上下文记忆

安全设计：AI 只看 Schema + 聚合结果，不接触原始数据。支持任意 OpenAI 兼容 API（OpenAI / Ollama / DeepSeek / vLLM）
2026-03-19 12:21:04 +08:00
commit 96927a789d
14 changed files with 1683 additions and 0 deletions
--- a/schema_extractor.py
+++ b/schema_extractor.py
@@ -0,0 +1,126 @@
+"""
+Schema 提取器 —— 只提取表结构，不碰数据
+"""
+import sqlite3
+from typing import Any
+
+
+def _quote_ident(name: str) -> str:
+    """Return *name* as a double-quoted SQLite identifier, doubling embedded quotes."""
+    return '"' + name.replace('"', '""') + '"'
+
+
+def _round2(value: Any) -> Any:
+    """Round real numbers to 2 places; pass anything else (e.g. stray text) through."""
+    return round(value, 2) if isinstance(value, (int, float)) else value
+
+
+def _describe_table(cur: sqlite3.Cursor, table: str, max_enum_values: int) -> dict[str, Any]:
+    """Collect columns, foreign keys, row count and data profile for one table."""
+    tbl = _quote_ident(table)
+    cur.execute(f"PRAGMA table_info({tbl})")
+    columns = [
+        {
+            "name": col["name"],
+            "type": col["type"],
+            "nullable": col["notnull"] == 0,
+            # pk is the 1-based position within the key, so composite keys use 2, 3, ...
+            "is_primary_key": col["pk"] > 0,
+        }
+        for col in cur.fetchall()
+    ]
+    cur.execute(f"PRAGMA foreign_key_list({tbl})")
+    fks = [
+        {"column": fk["from"], "references_table": fk["table"], "references_column": fk["to"]}
+        for fk in cur.fetchall()
+    ]
+    cur.execute(f"SELECT COUNT(*) AS cnt FROM {tbl}")
+    row_count = cur.fetchone()["cnt"]
+    data_profile = {}
+    for col in columns:
+        col_name, col_type = col["name"], (col["type"] or "").upper()
+        ident = _quote_ident(col_name)
+        if any(t in col_type for t in ("VARCHAR", "TEXT", "CHAR")):
+            # Fetch one value past the cap so high-cardinality columns are detected and skipped.
+            cur.execute(
+                f"SELECT DISTINCT {ident} FROM {tbl} WHERE {ident} IS NOT NULL LIMIT ?",
+                (max_enum_values + 1,),
+            )
+            vals = [row[0] for row in cur.fetchall()]
+            if len(vals) <= max_enum_values:
+                data_profile[col_name] = {"type": "enum", "distinct_count": len(vals), "values": vals}
+        elif any(t in col_type for t in ("INT", "REAL", "FLOAT", "DOUBLE", "DECIMAL", "NUMERIC")):
+            cur.execute(
+                f"SELECT MIN({ident}) AS min_val, MAX({ident}) AS max_val, AVG({ident}) AS avg_val, "
+                f"COUNT(DISTINCT {ident}) AS distinct_count FROM {tbl} WHERE {ident} IS NOT NULL"
+            )
+            row = cur.fetchone()
+            if row and row["min_val"] is not None:
+                data_profile[col_name] = {
+                    "type": "numeric",
+                    "min": _round2(row["min_val"]),
+                    "max": _round2(row["max_val"]),
+                    "avg": _round2(row["avg_val"]),
+                    "distinct_count": row["distinct_count"],
+                }
+    return {
+        "name": table,
+        "columns": columns,
+        "foreign_keys": fks,
+        "row_count": row_count,
+        "data_profile": data_profile,
+    }
+
+
+def extract_schema(db_path: str, *, max_enum_values: int = 20) -> dict[str, Any]:
+    """
+    Extract structural metadata from the SQLite database at ``db_path``.
+
+    Returns ``{"tables": [...]}``; each entry has ``name``, ``columns``,
+    ``foreign_keys``, ``row_count`` and ``data_profile``. Text columns with at
+    most ``max_enum_values`` distinct non-NULL values are profiled as enums
+    (higher-cardinality ones are omitted); numeric columns get min/max/avg and
+    a distinct count. Raises ``sqlite3.Error`` if a query fails.
+    """
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    try:  # always release the connection, even when a query raises
+        cur = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
+        names = [row["name"] for row in cur.fetchall()]
+        return {"tables": [_describe_table(cur, name, max_enum_values) for name in names]}
+    finally:
+        conn.close()
+
+
+def schema_to_text(schema: dict) -> str:
+    """Render a schema dict (``{"tables": [...]}``) as indented text for an LLM prompt.
+
+    Enum profiles list at most their first 10 values; a blank line follows each table.
+    """
+    out = ["=== 数据库 Schema ===\n"]
+    for tbl in schema["tables"]:
+        out.append(f"📋 表: {tbl['name']} (共 {tbl['row_count']} 行)")
+        out.append("  列:")
+        for column in tbl["columns"]:
+            pk_tag = " [PK]" if column["is_primary_key"] else ""
+            null_tag = " NULL" if column["nullable"] else " NOT NULL"
+            out.append(f'    - {column["name"]}: {column["type"]}{pk_tag}{null_tag}')
+        if tbl["foreign_keys"]:
+            out.append("  外键:")
+            out.extend(
+                f'    - {fk["column"]} → {fk["references_table"]}.{fk["references_column"]}'
+                for fk in tbl["foreign_keys"]
+            )
+        if tbl["data_profile"]:
+            out.append("  数据画像:")
+            for name, info in tbl["data_profile"].items():
+                if info["type"] == "enum":
+                    shown = ", ".join(map(str, info["values"][:10]))
+                    out.append(f'    - {name}: 枚举值({info["distinct_count"]}个) = [{shown}]')
+                elif info["type"] == "numeric":
+                    out.append(
+                        f'    - {name}: 范围[{info["min"]} ~ {info["max"]}], '
+                        f'均值{info["avg"]}, {info["distinct_count"]}个不同值'
+                    )
+        out.append("")
+    return "\n".join(out)