feat: 四层架构数据分析 Agent
- Layer 1 Planner: 意图规划,将问题转为结构化分析计划
- Layer 2 Explorer: 自适应探索循环,多轮迭代动态生成 SQL
- Layer 3 InsightEngine: 异常检测 + 主动洞察
- Layer 4 ContextManager: 多轮对话上下文记忆

安全设计:AI 只看 Schema + 聚合结果,不接触原始数据。
支持任意 OpenAI 兼容 API(OpenAI / Ollama / DeepSeek / vLLM)
This commit is contained in:
126
schema_extractor.py
Normal file
126
schema_extractor.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""
|
||||
Schema 提取器 —— 只提取表结构,不碰数据
|
||||
"""
|
||||
import sqlite3
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Substrings of a declared column type that select the "enum" (text) profile.
_TEXT_TYPE_MARKERS = ("VARCHAR", "TEXT", "CHAR")
# Substrings of a declared column type that select the "numeric" profile.
_NUMERIC_TYPE_MARKERS = ("INT", "REAL", "FLOAT", "DOUBLE", "DECIMAL", "NUMERIC")


def _quote_identifier(name: str) -> str:
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def _round_stat(value: Any) -> Any:
    """Round a numeric aggregate to two decimals; pass other values through."""
    # SQLite is dynamically typed, so MIN/MAX over a column declared numeric
    # can still return text; round() would raise TypeError on it.
    if isinstance(value, (int, float)):
        return round(value, 2)
    return value


def _profile_columns(
    cur: sqlite3.Cursor,
    table_sql: str,
    columns: list[dict[str, Any]],
    max_enum_values: int,
) -> dict[str, Any]:
    """Build the per-column data profile (enum values or numeric stats) of a table."""
    # A negative LIMIT means "no limit" in SQLite, so clamp before adding 1.
    enum_cap = max(max_enum_values, 0)
    data_profile: dict[str, Any] = {}

    for col in columns:
        col_name = col["name"]
        col_sql = _quote_identifier(col_name)
        col_type = (col["type"] or "").upper()

        if any(marker in col_type for marker in _TEXT_TYPE_MARKERS):
            # Fetch one value more than the cap: that is the only way to tell
            # "at most enum_cap distinct values" from "more than enum_cap".
            cur.execute(
                f"SELECT DISTINCT {col_sql} FROM {table_sql} "
                f"WHERE {col_sql} IS NOT NULL LIMIT ?",
                (enum_cap + 1,),
            )
            vals = [row[0] for row in cur.fetchall()]
            # High-cardinality text columns are treated as raw data and are
            # left out of the profile entirely.
            if len(vals) <= enum_cap:
                data_profile[col_name] = {
                    "type": "enum",
                    "distinct_count": len(vals),
                    "values": vals,
                }
        elif any(marker in col_type for marker in _NUMERIC_TYPE_MARKERS):
            cur.execute(
                f"SELECT MIN({col_sql}) AS min_val, MAX({col_sql}) AS max_val, "
                f"AVG({col_sql}) AS avg_val, "
                f"COUNT(DISTINCT {col_sql}) AS distinct_count "
                f"FROM {table_sql} WHERE {col_sql} IS NOT NULL"
            )
            row = cur.fetchone()
            # min_val is NULL when the column has no non-NULL values.
            if row and row["min_val"] is not None:
                data_profile[col_name] = {
                    "type": "numeric",
                    "min": _round_stat(row["min_val"]),
                    "max": _round_stat(row["max_val"]),
                    "avg": _round_stat(row["avg_val"]),
                    "distinct_count": row["distinct_count"],
                }

    return data_profile


def _describe_table(cur: sqlite3.Cursor, table: str, max_enum_values: int) -> dict[str, Any]:
    """Collect columns, foreign keys, row count and data profile for one table."""
    table_sql = _quote_identifier(table)

    # Column information.
    cur.execute(f"PRAGMA table_info({table_sql})")
    columns = [
        {
            "name": col["name"],
            "type": col["type"],
            "nullable": col["notnull"] == 0,
            # "pk" is the 1-based position inside the primary key (0 when the
            # column is not part of it), so every positive value is a PK column.
            "is_primary_key": col["pk"] > 0,
        }
        for col in cur.fetchall()
    ]

    # Foreign keys.
    cur.execute(f"PRAGMA foreign_key_list({table_sql})")
    fks = [
        {
            "column": fk["from"],
            "references_table": fk["table"],
            "references_column": fk["to"],
        }
        for fk in cur.fetchall()
    ]

    # Row count.
    cur.execute(f"SELECT COUNT(*) AS cnt FROM {table_sql}")
    row_count = cur.fetchone()["cnt"]

    return {
        "name": table,
        "columns": columns,
        "foreign_keys": fks,
        "row_count": row_count,
        "data_profile": _profile_columns(cur, table_sql, columns, max_enum_values),
    }


def extract_schema(db_path: str, *, max_enum_values: int = 20) -> dict[str, Any]:
    """Extract structural information from a SQLite database.

    For every user table the result contains:

    - table name, column names and declared types
    - nullability and primary-key membership of each column
    - foreign keys
    - row count
    - a data profile: the distinct values of low-cardinality text columns and
      min / max / avg / distinct count of numeric columns

    Args:
        db_path: Path handed to ``sqlite3.connect``.
        max_enum_values: Largest number of distinct non-NULL values a text
            column may have and still be reported as an enum. Columns with
            more distinct values are omitted from the profile.

    Returns:
        ``{"tables": [...]}`` where each entry has the keys ``name``,
        ``columns``, ``foreign_keys``, ``row_count`` and ``data_profile``.

    Raises:
        sqlite3.Error: Propagated unchanged when a query fails; the connection
            is closed in that case as well.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        cur = conn.cursor()

        # List all user tables. "_" is a LIKE wildcard, so it is escaped to
        # skip only names that really start with the "sqlite_" prefix.
        cur.execute(
            "SELECT name FROM sqlite_master "
            "WHERE type='table' AND name NOT LIKE 'sqlite\\_%' ESCAPE '\\'"
        )
        tables = [row["name"] for row in cur.fetchall()]

        return {
            "tables": [_describe_table(cur, table, max_enum_values) for table in tables]
        }
    finally:
        # Release the connection even when one of the queries raised.
        conn.close()
|
||||
|
||||
|
||||
def schema_to_text(schema: dict) -> str:
    """Render an extracted schema as readable text for an LLM prompt.

    Each table contributes a header line with its name and row count, one line
    per column (type, primary-key marker, nullability), an optional foreign-key
    section and an optional data-profile section, followed by a blank line.

    Args:
        schema: Mapping with a ``"tables"`` list whose entries carry the keys
            ``name``, ``row_count``, ``columns``, ``foreign_keys`` and
            ``data_profile``.

    Returns:
        All generated lines joined with newlines.
    """
    out = ["=== 数据库 Schema ===\n"]

    for tbl in schema["tables"]:
        out.append(f"📋 表: {tbl['name']} (共 {tbl['row_count']} 行)")

        # Column section: always present.
        out.append(" 列:")
        for column in tbl["columns"]:
            pk_flag = " [PK]" if column["is_primary_key"] else ""
            null_flag = " NULL" if column["nullable"] else " NOT NULL"
            out.append(f' - {column["name"]}: {column["type"]}{pk_flag}{null_flag}')

        # Foreign-key section: only when the table declares any.
        foreign_keys = tbl["foreign_keys"]
        if foreign_keys:
            out.append(" 外键:")
            out.extend(
                f' - {fk["column"]} → {fk["references_table"]}.{fk["references_column"]}'
                for fk in foreign_keys
            )

        # Data-profile section: only when at least one column was profiled.
        profiles = tbl["data_profile"]
        if profiles:
            out.append(" 数据画像:")
            for column_name, stats in profiles.items():
                kind = stats["type"]
                if kind == "enum":
                    # Only the first ten values are shown.
                    shown = ", ".join(map(str, stats["values"][:10]))
                    out.append(
                        f' - {column_name}: 枚举值({stats["distinct_count"]}个) = [{shown}]'
                    )
                elif kind == "numeric":
                    value_range = f' - {column_name}: 范围[{stats["min"]} ~ {stats["max"]}], '
                    summary = f'均值{stats["avg"]}, {stats["distinct_count"]}个不同值'
                    out.append(value_range + summary)

        # Blank separator line after every table.
        out.append("")

    return "\n".join(out)
|
||||
Reference in New Issue
Block a user