feat: 四层架构全面增强

安全与稳定性:
- 移除硬编码 API Key,改用 .env + 环境变量
- LLM 调用统一重试机制(指数退避,3 次重试,处理 429/5xx/超时)
- 中文字体检测增强(CJK 关键词兜底 + 无字体时英文 fallback)
- 缺失 API Key 给出友好提示而非崩溃

分析能力提升:
- 异常检测新增 z-score 检测(标准差>2 标记异常)
- 新增变异系数 CV 检测(数据波动性)
- 新增零值/缺失检测
- 上下文管理器升级为关键词语义匹配(替代简单取最近 2 条)

用户体验:
- 报告自动保存为 Markdown(reports/ 目录)
- 新增 export 命令导出查询结果为 CSV
- 新增 reports 命令查看已保存报告
- CLI 支持 readline 命令历史(方向键翻阅)
- CSV 导入工具重写:自动列名映射、容错处理、dry-run 模式
- 新增 .env.example 配置模板
This commit is contained in:
openclaw
2026-03-31 14:39:17 +08:00
parent b7a27b12bd
commit e8f8e2f1ba
14 changed files with 588 additions and 115 deletions

21
.env.example Normal file
View File

@@ -0,0 +1,21 @@
# LLM 配置(兼容 OpenAI API 格式)
LLM_API_KEY=sk-your-api-key-here
LLM_BASE_URL=https://api.openai.com/v1
LLM_MODEL=gpt-4o-mini
# 其他可用配置:
# Ollama本地部署
# LLM_API_KEY=ollama
# LLM_BASE_URL=http://localhost:11434/v1
# LLM_MODEL=qwen2.5-coder:7b
# DeepSeek
# LLM_API_KEY=sk-xxx
# LLM_BASE_URL=https://api.deepseek.com
# LLM_MODEL=deepseek-chat
# 数据库路径(可选,默认 demo.db)
# DB_PATH=/path/to/your.db
# 探索轮数(可选,默认 6)
# MAX_ROUNDS=6

View File

@@ -9,10 +9,11 @@ Layer 4: Context 上下文记忆
Output: Reporter + Chart + Consolidator Output: Reporter + Chart + Consolidator
""" """
import os import os
import time
from typing import Optional from typing import Optional
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from core.config import DB_PATH, MAX_EXPLORATION_ROUNDS, PLAYBOOK_DIR, CHARTS_DIR from core.config import DB_PATH, MAX_EXPLORATION_ROUNDS, PLAYBOOK_DIR, CHARTS_DIR, PROJECT_ROOT
from core.schema import extract_schema, schema_to_text from core.schema import extract_schema, schema_to_text
from core.sandbox import SandboxExecutor from core.sandbox import SandboxExecutor
from layers.planner import Planner from layers.planner import Planner
@@ -47,6 +48,10 @@ class DataAnalysisAgent:
# 累积图表 # 累积图表
self._all_charts: list[dict] = [] self._all_charts: list[dict] = []
# 报告输出目录
self.reports_dir = os.path.join(PROJECT_ROOT, "reports")
os.makedirs(self.reports_dir, exist_ok=True)
# 自动生成 Playbook # 自动生成 Playbook
if not self.playbook_mgr.playbooks: if not self.playbook_mgr.playbooks:
print("\n🤖 [Playbook] 未发现预设剧本AI 自动生成中...") print("\n🤖 [Playbook] 未发现预设剧本AI 自动生成中...")
@@ -145,6 +150,9 @@ class DataAnalysisAgent:
# Layer 4: 记录上下文 # Layer 4: 记录上下文
self.context.add_session(question=question, plan=plan, steps=steps, insights=insights, report=report) self.context.add_session(question=question, plan=plan, steps=steps, insights=insights, report=report)
# 自动保存报告
self.save_report(report, question, charts=charts)
return report return report
def full_report(self, question: str = "") -> str: def full_report(self, question: str = "") -> str:
@@ -173,3 +181,54 @@ class DataAnalysisAgent:
def close(self): def close(self):
"""释放资源""" """释放资源"""
self.executor.close() self.executor.close()
def save_report(self, report: str, question: str, charts: list[dict] | None = None) -> str:
"""将报告保存为 Markdown 文件,返回文件路径"""
ts = time.strftime("%Y%m%d_%H%M%S")
# 取问题前 20 字作为文件名
import re
safe_q = re.sub(r'[^\w\u4e00-\u9fff]', '_', question)[:20].strip('_')
fname = f"{ts}_{safe_q}.md"
fpath = os.path.join(self.reports_dir, fname)
with open(fpath, "w", encoding="utf-8") as f:
f.write(f"# 分析报告: {question}\n\n")
f.write(f"_生成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}_\n\n")
f.write(report)
if charts:
f.write("\n\n---\n\n## 📊 图表索引\n\n")
for c in charts:
f.write(f"### {c['title']}\n![{c['title']}]({os.path.abspath(c['path'])})\n\n")
print(f" 💾 报告已保存: {fpath}")
return fpath
def export_data(self, steps: list, format: str = "csv") -> str | None:
"""导出探索结果为 CSV"""
import csv
import io
all_rows = []
all_cols = set()
for step in steps:
if step.success and step.rows:
for row in step.rows:
row["_query"] = step.purpose
all_rows.append(row)
all_cols.update(row.keys())
if not all_rows:
return None
ts = time.strftime("%Y%m%d_%H%M%S")
fname = f"export_{ts}.csv"
fpath = os.path.join(self.reports_dir, fname)
cols = sorted(all_cols)
with open(fpath, "w", encoding="utf-8-sig", newline="") as f:
writer = csv.DictWriter(f, fieldnames=cols)
writer.writeheader()
writer.writerows(all_rows)
print(f" 📁 数据已导出: {fpath} ({len(all_rows)} 行)")
return fpath

64
cli.py
View File

@@ -1,5 +1,5 @@
""" """
交互式 CLI —— 四层架构自适应分析 交互式 CLI —— 四层架构自适应分析(增强版)
用法: python cli.py [数据库路径] 用法: python cli.py [数据库路径]
""" """
import os import os
@@ -7,7 +7,7 @@ import sys
sys.path.insert(0, os.path.dirname(__file__)) sys.path.insert(0, os.path.dirname(__file__))
from core.config import DB_PATH, LLM_CONFIG, MAX_EXPLORATION_ROUNDS, PLAYBOOK_DIR from core.config import DB_PATH, LLM_CONFIG, MAX_EXPLORATION_ROUNDS, PLAYBOOK_DIR, PROJECT_ROOT
from agent import DataAnalysisAgent from agent import DataAnalysisAgent
@@ -17,6 +17,8 @@ def print_help():
<问题> 分析一个问题 <问题> 分析一个问题
rounds=<N> <问题> 设置探索轮数 rounds=<N> <问题> 设置探索轮数
report [主题] 整合所有分析,生成综合报告 report [主题] 整合所有分析,生成综合报告
export 导出最近一次分析结果为 CSV
reports 列出已保存的报告文件
schema 查看数据库 Schema schema 查看数据库 Schema
playbooks 查看已加载的预设剧本 playbooks 查看已加载的预设剧本
regen 重新生成预设剧本 regen 重新生成预设剧本
@@ -28,6 +30,39 @@ def print_help():
""") """)
def cmd_reports(agent):
    """List the saved Markdown reports in the agent's reports directory."""
    reports_dir = agent.reports_dir
    if not os.path.isdir(reports_dir):
        # Fixed unbalanced parenthesis in the original message.
        print("(reports 目录不存在)")
        return
    files = sorted(f for f in os.listdir(reports_dir) if f.endswith(".md"))
    if not files:
        print("(尚无保存的报告)")
        return
    print(f"\n📁 已保存 {len(files)} 份报告:")
    for f in files:
        fpath = os.path.join(reports_dir, f)
        size = os.path.getsize(fpath)
        print(f" 📄 {f} ({size/1024:.1f} KB)")
def setup_readline():
    """Enable persistent command history where readline is available.

    Loads ``.cli_history`` from the project root and registers a hook to
    write it back on exit. Silently a no-op on platforms without the
    readline module (e.g. vanilla Windows builds).
    """
    try:
        import readline
    except ImportError:
        return
    histfile = os.path.join(PROJECT_ROOT, ".cli_history")
    try:
        readline.read_history_file(histfile)
    except FileNotFoundError:
        pass  # first run: no history file yet
    import atexit
    atexit.register(readline.write_history_file, histfile)
    readline.set_history_length(100)
def main(): def main():
db_path = sys.argv[1] if len(sys.argv) > 1 else DB_PATH db_path = sys.argv[1] if len(sys.argv) > 1 else DB_PATH
@@ -36,9 +71,15 @@ def main():
sys.exit(1) sys.exit(1)
if not LLM_CONFIG["api_key"]: if not LLM_CONFIG["api_key"]:
print("⚠️ 未配置 LLM_API_KEY") print(" LLM_API_KEY 未配置!")
print(" 请设置环境变量或创建 .env 文件:")
print(" LLM_API_KEY=your-key")
print(" LLM_BASE_URL=https://api.openai.com/v1")
print(" LLM_MODEL=gpt-4o-mini")
sys.exit(1) sys.exit(1)
setup_readline()
agent = DataAnalysisAgent(db_path) agent = DataAnalysisAgent(db_path)
print("=" * 60) print("=" * 60)
@@ -50,6 +91,8 @@ def main():
print(f"📋 预设剧本: {len(agent.playbook_mgr.playbooks)}") print(f"📋 预设剧本: {len(agent.playbook_mgr.playbooks)}")
print(f"\n💬 输入分析问题help 查看命令)\n") print(f"\n💬 输入分析问题help 查看命令)\n")
last_steps = None # 记录最近一次分析的 steps用于 export
while True: while True:
try: try:
user_input = input("📊 > ").strip() user_input = input("📊 > ").strip()
@@ -75,7 +118,19 @@ def main():
print(agent.get_audit()) print(agent.get_audit())
elif cmd == "clear": elif cmd == "clear":
agent.clear_history() agent.clear_history()
last_steps = None
print("✅ 历史已清空") print("✅ 历史已清空")
elif cmd == "reports":
cmd_reports(agent)
elif cmd == "export":
if last_steps:
fpath = agent.export_data(last_steps)
if fpath:
print(f"✅ 导出成功: {fpath}")
else:
print("⚠️ 无数据可导出")
else:
print("⚠️ 请先执行一次分析")
elif cmd.startswith("report"): elif cmd.startswith("report"):
topic = user_input[6:].strip() topic = user_input[6:].strip()
try: try:
@@ -120,6 +175,9 @@ def main():
try: try:
report = agent.analyze(question, max_rounds=max_rounds) report = agent.analyze(question, max_rounds=max_rounds)
# 保存 steps 用于 export
if agent.context.sessions:
last_steps = agent.context.sessions[-1].steps
print("\n" + report) print("\n" + report)
print("\n" + "~" * 60) print("\n" + "~" * 60)
except Exception as e: except Exception as e:

View File

@@ -1,13 +1,38 @@
""" """
配置文件 配置文件 —— 支持环境变量 + .env 文件
""" """
import os import os
def _load_dotenv(path: str = ".env"):
"""简易 .env 加载器,不依赖 python-dotenv"""
if not os.path.isfile(path):
return
with open(path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
if "=" not in line:
continue
key, _, val = line.partition("=")
key, val = key.strip(), val.strip().strip('"').strip("'")
if key and key not in os.environ: # 环境变量优先
os.environ[key] = val
# 项目根目录(先定义,.env 加载需要用到)
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
# 加载 .env项目根目录优先其次当前目录
_load_dotenv(os.path.join(PROJECT_ROOT, ".env"))
_load_dotenv(".env")
# LLM 配置(兼容 OpenAI API 格式,包括 Ollama / vLLM / DeepSeek 等) # LLM 配置(兼容 OpenAI API 格式,包括 Ollama / vLLM / DeepSeek 等)
LLM_CONFIG = { LLM_CONFIG = {
"api_key": os.getenv("LLM_API_KEY", "sk-c44i1hy64xgzwox6x08o4zug93frq6rgn84oqugf2pje1tg4"), "api_key": os.getenv("LLM_API_KEY", ""),
"base_url": os.getenv("LLM_BASE_URL", "https://api.xiaomimimo.com/v1"), "base_url": os.getenv("LLM_BASE_URL", "https://api.openai.com/v1"),
"model": os.getenv("LLM_MODEL", "mimo-v2-flash"), "model": os.getenv("LLM_MODEL", "gpt-4o-mini"),
} }
# 沙箱安全规则 # 沙箱安全规则

View File

@@ -1,8 +1,9 @@
""" """
公共工具 —— JSON 提取、LLM 客户端单例 公共工具 —— JSON 提取、LLM 客户端单例、重试机制
""" """
import json import json
import re import re
import time
from typing import Any from typing import Any
import openai import openai
@@ -17,14 +18,77 @@ def get_llm_client(config: dict) -> tuple[openai.OpenAI, str]:
"""获取 LLM 客户端(单例),避免每个组件各建一个""" """获取 LLM 客户端(单例),避免每个组件各建一个"""
global _llm_client, _llm_model global _llm_client, _llm_model
if _llm_client is None: if _llm_client is None:
api_key = config.get("api_key", "")
if not api_key:
raise RuntimeError(
"LLM_API_KEY 未配置!请设置环境变量或在 .env 文件中添加:\n"
" LLM_API_KEY=your-key\n"
" LLM_BASE_URL=https://api.openai.com/v1\n"
" LLM_MODEL=gpt-4o-mini"
)
_llm_client = openai.OpenAI( _llm_client = openai.OpenAI(
api_key=config["api_key"], api_key=api_key,
base_url=config["base_url"], base_url=config["base_url"],
) )
_llm_model = config["model"] _llm_model = config["model"]
return _llm_client, _llm_model return _llm_client, _llm_model
# ── LLM 调用重试包装 ────────────────────────────────
class LLMCallError(Exception):
    """Raised when an LLM call still fails after exhausting all retries."""
    pass
def llm_chat(client: openai.OpenAI, model: str, messages: list[dict],
             max_retries: int = 3, **kwargs) -> str:
    """Call the chat-completions API with exponential-backoff retries.

    Retries on rate limiting (429, honoring Retry-After via
    ``_get_retry_delay``), timeouts, connection errors and 5xx status
    errors. Unknown exceptions are retried too, but the last one is
    re-raised unwrapped so callers see the original type.

    Args:
        client: Configured OpenAI-compatible client.
        model: Model name.
        messages: Chat messages in OpenAI format.
        max_retries: Total number of attempts (not extra retries).
        **kwargs: Forwarded to ``chat.completions.create``.

    Returns:
        The stripped assistant message content.

    Raises:
        LLMCallError: When all retryable attempts failed.
    """
    last_err = None
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model, messages=messages, **kwargs
            )
            return response.choices[0].message.content.strip()
        except openai.RateLimitError as e:
            last_err = e
            # Bugfix: don't sleep (up to 30s) after the final attempt —
            # there is nothing left to retry.
            if attempt < max_retries - 1:
                wait = _get_retry_delay(e, attempt)
                print(f" ⏳ 限频,等待 {wait:.1f}s 后重试 ({attempt+1}/{max_retries})...")
                time.sleep(wait)
        except (openai.APITimeoutError, openai.APIConnectionError, openai.APIStatusError) as e:
            last_err = e
            if attempt < max_retries - 1:
                wait = min(2 ** attempt * 2, 30)
                print(f" ⚠️ API 错误: {type(e).__name__},等待 {wait:.1f}s ({attempt+1}/{max_retries})...")
                time.sleep(wait)
        except Exception as e:
            last_err = e
            if attempt < max_retries - 1:
                time.sleep(min(2 ** attempt * 2, 30))
                continue
            # Preserve the original exception type on the last attempt.
            raise
    raise LLMCallError(f"LLM 调用失败({max_retries} 次重试): {last_err}")
def _get_retry_delay(error, attempt: int) -> float:
"""从错误响应中提取重试等待时间"""
try:
if hasattr(error, 'response') and error.response is not None:
retry_after = error.response.headers.get('Retry-After')
if retry_after:
return float(retry_after)
except Exception:
pass
# 指数退避: 2s, 4s, 8s, 最大 30s
return min(2 ** (attempt + 1), 30)
# ── JSON 提取 ──────────────────────────────────────── # ── JSON 提取 ────────────────────────────────────────
def extract_json_object(text: str) -> dict: def extract_json_object(text: str) -> dict:

View File

@@ -1,21 +1,198 @@
""" """
将工单 CSV 数据导入 SQLite 数据库 将工单 CSV 数据导入 SQLite 数据库 —— 增强版
- 自动检测列名映射(兼容中英文)
- 空值/异常数据容错
- 数据类型自动推断
- 导入前完整性校验
""" """
import csv import csv
import sqlite3 import sqlite3
import os import os
import sys import sys
import re
from typing import Any, Optional
def import_csv(csv_path: str, db_path: str):
"""将工单 CSV 导入 SQLite""" # ── 列名别名映射(兼容不同版本 CSV─────────────────
# Map of canonical (Chinese) column names to the CSV header aliases that
# should be recognized for each, so exports from different tool versions
# (English or Chinese headers, varying spellings) all import correctly.
# Order matters: the first alias found in the CSV header wins.
COLUMN_ALIASES = {
    "工单号": ["工单号", "ticket_id", "ticket_no", "id", "工单编号"],
    "来源": ["来源", "source", "渠道"],
    "创建日期": ["创建日期", "created_date", "create_date"],
    "问题类型": ["问题类型", "issue_type", "type", "问题分类"],
    "问题描述": ["问题描述", "description", "描述"],
    "处理过程": ["处理过程", "process", "处理流程"],
    "跟踪记录": ["跟踪记录", "tracking", "跟踪"],
    "严重程度": ["严重程度", "severity", "priority", "优先级"],
    "工单状态": ["工单状态", "status", "状态"],
    "模块": ["模块", "module", "功能模块"],
    "责任人": ["责任人", "assignee", "负责人"],
    "关闭日期": ["关闭日期", "closed_date", "close_date"],
    "车型": ["车型", "vehicle_model", "car_model"],
    "VIN": ["VIN", "vin", "车架号"],
    "SIM": ["SIM", "sim", "sim卡号"],
    "Notes": ["Notes", "notes", "备注"],
    "Attachment": ["Attachment", "attachment", "附件"],
    "创建人": ["创建人", "creator", "创建者"],
    "关闭时长_天": ["关闭时长(天)", "关闭时长_天", "close_duration", "duration_days"],
    "创建日期_解析": ["创建日期_解析", "created_date_parsed"],
    "关闭日期_解析": ["关闭日期_解析", "closed_date_parsed"],
}
def detect_column_mapping(headers: list[str]) -> dict[str, Optional[str]]:
    """Resolve canonical column names to the actual CSV header names.

    Matching is whitespace- and case-insensitive, driven by
    COLUMN_ALIASES (first matching alias wins). Returns a dict of
    {canonical_name: actual_header_or_None}.
    """
    normalized = {h.strip().lower(): h for h in headers}
    result: dict[str, Optional[str]] = {}
    for canonical, aliases in COLUMN_ALIASES.items():
        result[canonical] = next(
            (normalized[key] for key in (a.strip().lower() for a in aliases)
             if key in normalized),
            None,
        )
    return result
def safe_float(val: Any) -> Optional[float]:
    """Convert a value to float; None/blank/garbage yields None, never raises."""
    if val is None:
        return None
    text = str(val).strip()
    if not text:
        return None
    try:
        return float(text)
    except (ValueError, TypeError):
        return None
def safe_str(val: Any) -> str:
    """Stringify with surrounding whitespace trimmed; None becomes ""."""
    return "" if val is None else str(val).strip()
def validate_row(row: dict, mapping: dict) -> tuple[bool, list[str]]:
    """Validate a single CSV row; returns (ok, issue_descriptions).

    Currently the only hard requirement is a non-empty ticket-id value
    in the column mapped to "工单号".
    """
    issues: list[str] = []
    id_col = mapping.get("工单号", "")
    if not safe_str(row.get(id_col, "")):
        issues.append("缺少工单号")
    return not issues, issues
def import_csv(csv_path: str, db_path: str, dry_run: bool = False) -> dict:
"""
导入 CSV 到 SQLite。
返回统计信息 dict。
"""
stats = {
"total": 0, "imported": 0, "skipped": 0,
"warnings": [], "columns_detected": {}, "columns_missing": [],
}
if not os.path.isfile(csv_path):
print(f"❌ CSV 文件不存在: {csv_path}")
return stats
# ── 读取 CSV ──────────────────────────────
with open(csv_path, "r", encoding="utf-8-sig") as f:
reader = csv.DictReader(f)
headers = reader.fieldnames or []
rows = list(reader)
stats["total"] = len(rows)
print(f"📄 读取 CSV: {csv_path}")
print(f" 列: {headers}")
print(f" 行数: {len(rows)}")
# ── 检测列映射 ────────────────────────────
mapping = detect_column_mapping(headers)
detected = {k: v for k, v in mapping.items() if v is not None}
missing = [k for k, v in mapping.items() if v is None]
stats["columns_detected"] = detected
stats["columns_missing"] = missing
print(f"\n🔍 列名映射:")
for std, actual in detected.items():
print(f"{std}{actual}")
for m in missing:
print(f" ⚠️ {m} ← 未找到(将使用空值)")
if "工单号" not in detected:
print(f"\n❌ 至少需要「工单号」列,无法继续!")
return stats
# ── 数据预处理 + 校验 ──────────────────────
processed = []
for i, row in enumerate(rows):
valid, issues = validate_row(row, mapping)
if not valid:
stats["skipped"] += 1
if len(stats["warnings"]) < 10:
stats["warnings"].append(f"{i+2}: {', '.join(issues)}")
continue
def get_col(std_name: str, default: str = "") -> str:
actual = mapping.get(std_name)
return safe_str(row.get(actual, default)) if actual else default
def get_float(std_name: str) -> Optional[float]:
actual = mapping.get(std_name)
if not actual:
return None
return safe_float(row.get(actual))
processed.append((
get_col("工单号"),
get_col("来源"),
get_col("创建日期"),
get_col("问题类型"),
get_col("问题描述"),
get_col("处理过程"),
get_col("跟踪记录"),
get_col("严重程度"),
get_col("工单状态"),
get_col("模块"),
get_col("责任人"),
get_col("关闭日期"),
get_col("车型"),
get_col("VIN"),
get_col("SIM"),
get_col("Notes"),
get_col("Attachment"),
get_col("创建人"),
get_float("关闭时长_天"),
get_col("创建日期_解析"),
get_col("关闭日期_解析"),
))
stats["imported"] = len(processed)
print(f"\n✅ 预处理完成: {len(processed)} 条有效, {stats['skipped']} 条跳过")
if stats["warnings"]:
print(f" 警告:")
for w in stats["warnings"][:5]:
print(f" ⚠️ {w}")
if dry_run:
print(" (dry_run 模式,未写入数据库)")
return stats
# ── 写入数据库 ─────────────────────────────
if os.path.exists(db_path): if os.path.exists(db_path):
os.remove(db_path) os.remove(db_path)
print(f"🗑️ 已删除旧数据库: {db_path}") print(f"\n🗑️ 已删除旧数据库: {db_path}")
conn = sqlite3.connect(db_path) conn = sqlite3.connect(db_path)
cur = conn.cursor() cur = conn.cursor()
# 创建工单表
cur.execute(""" cur.execute("""
CREATE TABLE tickets ( CREATE TABLE tickets (
工单号 TEXT PRIMARY KEY, 工单号 TEXT PRIMARY KEY,
@@ -42,62 +219,39 @@ def import_csv(csv_path: str, db_path: str):
) )
""") """)
with open(csv_path, "r", encoding="utf-8-sig") as f: cur.executemany(
reader = csv.DictReader(f) "INSERT INTO tickets VALUES (" + ",".join(["?"] * 21) + ")",
rows = list(reader) processed,
)
for row in rows:
cur.execute("""
INSERT INTO tickets VALUES (
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
)
""", (
row.get("工单号", ""),
row.get("来源", ""),
row.get("创建日期", ""),
row.get("问题类型", ""),
row.get("问题描述", ""),
row.get("处理过程", ""),
row.get("跟踪记录", ""),
row.get("严重程度", ""),
row.get("工单状态", ""),
row.get("模块", ""),
row.get("责任人", ""),
row.get("关闭日期", ""),
row.get("车型", ""),
row.get("VIN", ""),
row.get("SIM", ""),
row.get("Notes", ""),
row.get("Attachment", ""),
row.get("创建人", ""),
float(row["关闭时长(天)"]) if row.get("关闭时长(天)") else None,
row.get("创建日期_解析", ""),
row.get("关闭日期_解析", ""),
))
conn.commit() conn.commit()
print(f"✅ 导入 {len(rows)} 条工单到 {db_path}")
# 验证 # ── 验证 ──────────────────────────────────
cur.execute("SELECT COUNT(*) FROM tickets") cur.execute("SELECT COUNT(*) FROM tickets")
print(f" 数据库中共 {cur.fetchone()[0]} 条记录") db_count = cur.fetchone()[0]
print(f"\n✅ 写入完成: 数据库中 {db_count} 条记录")
cur.execute("SELECT DISTINCT 问题类型 FROM tickets") # 打印维度信息
types = [r[0] for r in cur.fetchall()] for col in ("问题类型", "工单状态", "车型", "模块", "来源"):
print(f" 问题类型: {', '.join(types)}") actual = mapping.get(col)
if actual:
cur.execute("SELECT DISTINCT 工单状态 FROM tickets") cur.execute(f'SELECT DISTINCT "{col}" FROM tickets WHERE "{col}" != ""')
statuses = [r[0] for r in cur.fetchall()] vals = [r[0] for r in cur.fetchall()]
print(f" 工单状态: {', '.join(statuses)}") if vals:
print(f" {col}: {', '.join(vals[:10])}{'...' if len(vals) > 10 else ''}")
cur.execute("SELECT DISTINCT 车型 FROM tickets")
models = [r[0] for r in cur.fetchall()]
print(f" 车型: {', '.join(models)}")
conn.close() conn.close()
return stats
if __name__ == "__main__": if __name__ == "__main__":
csv_path = sys.argv[1] if len(sys.argv) > 1 else "cleaned_data.csv" csv_path = sys.argv[1] if len(sys.argv) > 1 else "cleaned_data.csv"
db_path = os.path.join(os.path.dirname(__file__), "demo.db") db_path = os.path.join(os.path.dirname(__file__), "demo.db")
import_csv(csv_path, db_path) dry_run = "--dry-run" in sys.argv
stats = import_csv(csv_path, db_path, dry_run=dry_run)
if stats["columns_missing"]:
print(f"\n💡 提示: 以下列在 CSV 中未找到,已用空值填充:")
for m in stats["columns_missing"]:
print(f" - {m}")

View File

@@ -1,7 +1,10 @@
""" """
Layer 4: 上下文管理器 Layer 4: 上下文管理器 —— 增强版
- 关键词语义匹配,替代简单取最近 N 条
- 会话摘要去重
""" """
import time import time
import re
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
@@ -19,6 +22,26 @@ class AnalysisSession:
report: str report: str
timestamp: float = field(default_factory=time.time) timestamp: float = field(default_factory=time.time)
@property
def keywords(self) -> set[str]:
    """Keyword set used for similarity matching.

    Combines contiguous CJK runs and lower-cased English words (2+
    letters) drawn from the question, the plan intent and the plan
    dimensions. NOTE(review): CJK text is kept as whole runs, not split
    per character, so Chinese matching requires exact phrase overlap.
    """
    dims = self.plan.get('dimensions', [])
    blob = " ".join([self.question, self.plan.get('intent', ''), *dims])
    cjk_runs = re.findall(r'[\u4e00-\u9fff]+', blob)
    latin_words = re.findall(r'[a-zA-Z]{2,}', blob.lower())
    return set(cjk_runs) | set(latin_words)
def similarity(self, question: str) -> float:
    """Overlap score in [0, 1] between this session and a new question.

    Computes the fraction of the new question's keywords (CJK runs +
    English words) that also appear in this session's keyword set —
    an asymmetric Jaccard-style measure.
    """
    query_kw = (set(re.findall(r'[\u4e00-\u9fff]+', question))
                | set(re.findall(r'[a-zA-Z]{2,}', question.lower())))
    if not query_kw:
        return 0.0
    return len(self.keywords & query_kw) / len(query_kw)
def summary(self) -> str: def summary(self) -> str:
parts = [f"**问题**: {self.question}"] parts = [f"**问题**: {self.question}"]
if self.plan: if self.plan:
@@ -48,9 +71,9 @@ class AnalysisSession:
class ContextManager: class ContextManager:
"""上下文管理器""" """上下文管理器 —— 语义匹配增强版"""
def __init__(self, max_history: int = 10): def __init__(self, max_history: int = 20):
self.sessions: list[AnalysisSession] = [] self.sessions: list[AnalysisSession] = []
self.max_history = max_history self.max_history = max_history
@@ -63,9 +86,25 @@ class ContextManager:
return session return session
def get_context_for(self, new_question: str) -> Optional[str]:
    """Select the most relevant prior analyses as context for a question.

    Sessions with keyword similarity above 0.3 are used, best first,
    capped at three. When none clears the threshold, the most recent
    session is returned as a fallback. Returns None only when there is
    no history at all.
    """
    if not self.sessions:
        return None
    ranked = sorted(
        ((s.similarity(new_question), s) for s in self.sessions),
        key=lambda pair: pair[0],
        reverse=True,
    )
    top = [s for score, s in ranked if score > 0.3][:3]
    if not top:
        return self.sessions[-1].to_reference_text()
    return "\n\n---\n\n".join(s.to_reference_text() for s in top)
def get_history_summary(self) -> str: def get_history_summary(self) -> str:
if not self.sessions: if not self.sessions:

View File

@@ -6,7 +6,7 @@ from typing import Any
from dataclasses import dataclass, field from dataclasses import dataclass, field
from core.config import LLM_CONFIG from core.config import LLM_CONFIG
from core.utils import get_llm_client, extract_json_object from core.utils import get_llm_client, llm_chat, extract_json_object
from core.sandbox import SandboxExecutor from core.sandbox import SandboxExecutor
@@ -206,10 +206,10 @@ class Explorer:
return "\n\n".join(parts) return "\n\n".join(parts)
def _llm_decide(self, messages: list[dict]) -> dict: def _llm_decide(self, messages: list[dict]) -> dict:
response = self.client.chat.completions.create( content = llm_chat(
model=self.model, messages=messages, temperature=0.2, max_tokens=1024, self.client, self.model,
messages=messages, temperature=0.2, max_tokens=1024,
) )
content = response.choices[0].message.content.strip()
result = extract_json_object(content) result = extract_json_object(content)
return result if result else {"action": "done", "reasoning": f"无法解析: {content[:100]}"} return result if result else {"action": "done", "reasoning": f"无法解析: {content[:100]}"}

View File

@@ -5,7 +5,7 @@ import json
from typing import Any from typing import Any
from core.config import LLM_CONFIG from core.config import LLM_CONFIG
from core.utils import get_llm_client, extract_json_array from core.utils import get_llm_client, llm_chat, extract_json_array
from layers.explorer import ExplorationStep from layers.explorer import ExplorationStep
@@ -68,15 +68,14 @@ class InsightEngine:
return [] return []
history = self._build_history(steps) history = self._build_history(steps)
response = self.client.chat.completions.create( content = llm_chat(
model=self.model, self.client, self.model,
messages=[ messages=[
{"role": "system", "content": INSIGHT_SYSTEM}, {"role": "system", "content": INSIGHT_SYSTEM},
{"role": "user", "content": f"## 用户问题\n{question}\n\n## 探索历史\n{history}\n\n请分析以上数据,输出异常和洞察。"}, {"role": "user", "content": f"## 用户问题\n{question}\n\n## 探索历史\n{history}\n\n请分析以上数据,输出异常和洞察。"},
], ],
temperature=0.3, max_tokens=2048, temperature=0.3, max_tokens=2048,
) )
content = response.choices[0].message.content.strip()
return [Insight(d) for d in extract_json_array(content)] return [Insight(d) for d in extract_json_array(content)]
def format_insights(self, insights: list[Insight]) -> str: def format_insights(self, insights: list[Insight]) -> str:
@@ -109,9 +108,9 @@ class InsightEngine:
def quick_detect(steps: list[ExplorationStep]) -> list[str]: def quick_detect(steps: list[ExplorationStep]) -> list[str]:
"""基于规则的快速异常检测,不调 LLM""" """基于规则的快速异常检测(零 LLM 成本)"""
alerts = [] alerts = []
seen = set() # 去重 seen = set()
for step in steps: for step in steps:
if not step.success or not step.rows: if not step.success or not step.rows:
@@ -119,22 +118,21 @@ def quick_detect(steps: list[ExplorationStep]) -> list[str]:
for col in step.columns: for col in step.columns:
vals = [r.get(col) for r in step.rows if isinstance(r.get(col), (int, float))] vals = [r.get(col) for r in step.rows if isinstance(r.get(col), (int, float))]
if not vals: if len(vals) < 2:
continue continue
col_lower = col.lower() col_lower = col.lower()
# 占比列:某个分组占比过高 # ── 占比列:集中度过高 ──
if col_lower in ("pct", "percent", "percentage", "占比"): if col_lower in ("pct", "percent", "percentage", "占比"):
for v in vals: max_pct = max(vals)
if v > 50: if max_pct > 50:
key = f"pct_{step.purpose}" key = f"pct_{step.purpose}"
if key not in seen: if key not in seen:
seen.add(key) seen.add(key)
alerts.append(f"⚠️ {step.purpose} 中某个分组占比 {v}%,集中度过高") alerts.append(f"⚠️ {step.purpose}: 最高占比 {max_pct}%,集中度过高")
break
# 计数列:极值差异 # ── 计数列:极值差异 ──
if col_lower in ("count", "cnt", "n", "total", "order_count") and len(vals) >= 3: if col_lower in ("count", "cnt", "n", "total", "order_count") and len(vals) >= 3:
avg = sum(vals) / len(vals) avg = sum(vals) / len(vals)
if avg > 0: if avg > 0:
@@ -143,6 +141,47 @@ def quick_detect(steps: list[ExplorationStep]) -> list[str]:
key = f"count_{step.purpose}" key = f"count_{step.purpose}"
if key not in seen: if key not in seen:
seen.add(key) seen.add(key)
alerts.append(f"⚠️ {step.purpose} 最大值是均值的 {ratio:.1f}") alerts.append(f"⚠️ {step.purpose}: 最大值是均值的 {ratio:.1f}")
# ── Z-Score 异常检测 ──
if len(vals) >= 5 and col_lower not in ("id", "year", "month"):
mean = sum(vals) / len(vals)
variance = sum((v - mean) ** 2 for v in vals) / len(vals)
std = variance ** 0.5
if std > 0:
outliers = [(i, v) for i, v in enumerate(vals) if abs(v - mean) / std > 2]
if outliers:
key = f"zscore_{step.purpose}_{col}"
if key not in seen:
seen.add(key)
outlier_desc = ", ".join(f"{v:.1f}" for _, v in outliers[:3])
alerts.append(
f"⚠️ {step.purpose}{col}」发现 {len(outliers)} 个异常值 "
f"(均值={mean:.1f}, σ={std:.1f}, 异常值={outlier_desc})"
)
# ── 离散度检测(变异系数 CV──
if len(vals) >= 3 and col_lower not in ("id", "year", "month"):
mean = sum(vals) / len(vals)
if mean != 0:
variance = sum((v - mean) ** 2 for v in vals) / len(vals)
std = variance ** 0.5
cv = std / abs(mean)
if cv > 1.0:
key = f"cv_{step.purpose}_{col}"
if key not in seen:
seen.add(key)
alerts.append(f"⚠️ {step.purpose}{col}」离散度高 (CV={cv:.2f}),数据波动大")
# ── 零值/缺失检测 ──
if col_lower in ("count", "cnt", "total", "amount", "sum", "关闭时长"):
zero_count = sum(1 for v in vals if v == 0)
if zero_count > 0 and zero_count < len(vals):
pct = zero_count / len(vals) * 100
if pct > 10:
key = f"zero_{step.purpose}_{col}"
if key not in seen:
seen.add(key)
alerts.append(f"⚠️ {step.purpose}{col}」有 {zero_count} 个零值 ({pct:.0f}%)")
return alerts return alerts

View File

@@ -5,7 +5,7 @@ import json
from typing import Any from typing import Any
from core.config import LLM_CONFIG from core.config import LLM_CONFIG
from core.utils import get_llm_client, extract_json_object from core.utils import get_llm_client, llm_chat, extract_json_object
PROMPT = """你是一个数据分析规划专家。 PROMPT = """你是一个数据分析规划专家。
@@ -52,8 +52,8 @@ class Planner:
self.client, self.model = get_llm_client(LLM_CONFIG) self.client, self.model = get_llm_client(LLM_CONFIG)
def plan(self, question: str, schema_text: str) -> dict[str, Any]: def plan(self, question: str, schema_text: str) -> dict[str, Any]:
response = self.client.chat.completions.create( content = llm_chat(
model=self.model, self.client, self.model,
messages=[ messages=[
{"role": "system", "content": PROMPT}, {"role": "system", "content": PROMPT},
{"role": "user", "content": f"## Schema\n{schema_text}\n\n## 用户问题\n{question}"}, {"role": "user", "content": f"## Schema\n{schema_text}\n\n## 用户问题\n{question}"},
@@ -61,7 +61,6 @@ class Planner:
temperature=0.1, temperature=0.1,
max_tokens=1024, max_tokens=1024,
) )
content = response.choices[0].message.content.strip()
plan = extract_json_object(content) plan = extract_json_object(content)
if not plan: if not plan:

View File

@@ -7,7 +7,7 @@ import re
from typing import Optional from typing import Optional
from core.config import LLM_CONFIG from core.config import LLM_CONFIG
from core.utils import get_llm_client, extract_json_object, extract_json_array from core.utils import get_llm_client, llm_chat, extract_json_object, extract_json_array
class Playbook: class Playbook:
@@ -87,15 +87,14 @@ class PlaybookManager:
- 直接使用实际表名和列名""" - 直接使用实际表名和列名"""
try: try:
response = self.client.chat.completions.create( content = llm_chat(
model=self.model, self.client, self.model,
messages=[ messages=[
{"role": "system", "content": "你是数据分析专家。只输出 JSON不要其他内容。"}, {"role": "system", "content": "你是数据分析专家。只输出 JSON不要其他内容。"},
{"role": "user", "content": prompt}, {"role": "user", "content": prompt},
], ],
temperature=0.3, max_tokens=4096, temperature=0.3, max_tokens=4096,
) )
content = response.choices[0].message.content.strip()
playbooks_data = extract_json_array(content) playbooks_data = extract_json_array(content)
if not playbooks_data: if not playbooks_data:
return [] return []
@@ -150,15 +149,15 @@ class PlaybookManager:
不匹配: {{"matched": false, "reasoning": "原因"}}""" 不匹配: {{"matched": false, "reasoning": "原因"}}"""
try: try:
response = self.client.chat.completions.create( content = llm_chat(
model=self.model, self.client, self.model,
messages=[ messages=[
{"role": "system", "content": "你是分析计划匹配器。"}, {"role": "system", "content": "你是分析计划匹配器。"},
{"role": "user", "content": prompt}, {"role": "user", "content": prompt},
], ],
temperature=0.1, max_tokens=512, temperature=0.1, max_tokens=512,
) )
result = extract_json_object(response.choices[0].message.content.strip()) result = extract_json_object(content)
if not result.get("matched"): if not result.get("matched"):
return None return None

View File

@@ -12,11 +12,12 @@ import matplotlib.pyplot as plt
import matplotlib.font_manager as fm import matplotlib.font_manager as fm
from core.config import LLM_CONFIG from core.config import LLM_CONFIG
from core.utils import get_llm_client, extract_json_array from core.utils import get_llm_client, llm_chat, extract_json_array
from layers.explorer import ExplorationStep from layers.explorer import ExplorationStep
def _setup_chinese_font(): def _setup_chinese_font():
"""尝试加载中文字体,找不到时用英文显示(不崩溃)"""
candidates = [ candidates = [
"SimHei", "Microsoft YaHei", "STHeiti", "WenQuanYi Micro Hei", "SimHei", "Microsoft YaHei", "STHeiti", "WenQuanYi Micro Hei",
"Noto Sans CJK SC", "PingFang SC", "Source Han Sans CN", "Noto Sans CJK SC", "PingFang SC", "Source Han Sans CN",
@@ -27,10 +28,27 @@ def _setup_chinese_font():
plt.rcParams["font.sans-serif"] = [font] plt.rcParams["font.sans-serif"] = [font]
plt.rcParams["axes.unicode_minus"] = False plt.rcParams["axes.unicode_minus"] = False
return font return font
# 兜底:尝试找任何 CJK 字体
for f in fm.fontManager.ttflist:
if any(kw in f.name.lower() for kw in ("cjk", "chinese", "hei", "song", "ming", "fang")):
plt.rcParams["font.sans-serif"] = [f.name]
plt.rcParams["axes.unicode_minus"] = False
return f.name
plt.rcParams["axes.unicode_minus"] = False plt.rcParams["axes.unicode_minus"] = False
return None return None # 后续图表标题会用英文 fallback
_setup_chinese_font()
_CN_FONT = _setup_chinese_font()
def _safe_title(title: str) -> str:
    """Return a chart title that the active font can actually render.

    With a CJK font loaded the title passes through unchanged. Without
    one, non-ASCII characters are stripped so matplotlib does not draw
    tofu boxes. Bugfix: the original pattern relied on ``\\w``, which in
    Python 3 matches CJK characters by default, so nothing was stripped;
    ``re.ASCII`` restricts ``\\w``/``\\s`` to ASCII as intended.
    """
    if _CN_FONT:
        return title
    import re
    # Keep ASCII word characters, whitespace and common punctuation only.
    clean = re.sub(r'[^\w\s.,;:!?%/()\-+]', '', title, flags=re.ASCII)
    return clean if clean.strip() else "Chart"
CHART_PLAN_PROMPT = """你是一个数据可视化专家。根据以下分析结果,规划需要生成的图表。 CHART_PLAN_PROMPT = """你是一个数据可视化专家。根据以下分析结果,规划需要生成的图表。
@@ -97,15 +115,15 @@ class ChartGenerator:
) )
try: try:
response = self.client.chat.completions.create( content = llm_chat(
model=self.model, self.client, self.model,
messages=[ messages=[
{"role": "system", "content": "你是数据可视化专家。只输出纯 JSON 数组,不要 markdown 代码块。"}, {"role": "system", "content": "你是数据可视化专家。只输出纯 JSON 数组,不要 markdown 代码块。"},
{"role": "user", "content": CHART_PLAN_PROMPT.format(exploration_summary="\n\n".join(summary_parts))}, {"role": "user", "content": CHART_PLAN_PROMPT.format(exploration_summary="\n\n".join(summary_parts))},
], ],
temperature=0.1, max_tokens=1024, temperature=0.1, max_tokens=1024,
) )
plans = extract_json_array(response.choices[0].message.content.strip()) plans = extract_json_array(content)
return plans if plans else self._fallback_plan(valid_steps) return plans if plans else self._fallback_plan(valid_steps)
except Exception as e: except Exception as e:
print(f" ⚠️ 图表规划失败: {e},使用 fallback") print(f" ⚠️ 图表规划失败: {e},使用 fallback")
@@ -206,11 +224,11 @@ class ChartGenerator:
ax.set_xticklabels(x_vals, rotation=45, ha="right", fontsize=9) ax.set_xticklabels(x_vals, rotation=45, ha="right", fontsize=9)
ax.legend() ax.legend()
ax.set_title(title, fontsize=13, fontweight="bold", pad=12) ax.set_title(_safe_title(title), fontsize=13, fontweight="bold", pad=12)
if chart_type not in ("pie",): if chart_type not in ("pie",):
ax.set_xlabel(x_col, fontsize=10) ax.set_xlabel(_safe_title(x_col), fontsize=10)
if chart_type != "horizontal_bar": if chart_type != "horizontal_bar":
ax.set_ylabel(y_col, fontsize=10) ax.set_ylabel(_safe_title(y_col), fontsize=10)
ax.grid(axis="y", alpha=0.3) ax.grid(axis="y", alpha=0.3)
plt.tight_layout() plt.tight_layout()

View File

@@ -4,7 +4,7 @@
import json import json
from core.config import LLM_CONFIG from core.config import LLM_CONFIG
from core.utils import get_llm_client from core.utils import get_llm_client, llm_chat
from layers.context import AnalysisSession from layers.context import AnalysisSession
@@ -47,15 +47,14 @@ class ReportConsolidator:
charts_text = "\n".join(f"{i}. {c['title']}: {c['path']}" for i, c in enumerate(charts or [], 1)) or "无图表。" charts_text = "\n".join(f"{i}. {c['title']}: {c['path']}" for i, c in enumerate(charts or [], 1)) or "无图表。"
try: try:
response = self.client.chat.completions.create( return llm_chat(
model=self.model, self.client, self.model,
messages=[ messages=[
{"role": "system", "content": "你是高级数据分析总监,整合多维度分析结果。"}, {"role": "system", "content": "你是高级数据分析总监,整合多维度分析结果。"},
{"role": "user", "content": CONSOLIDATE_PROMPT.format(question=question, sections=sections, charts_text=charts_text)}, {"role": "user", "content": CONSOLIDATE_PROMPT.format(question=question, sections=sections, charts_text=charts_text)},
], ],
temperature=0.3, max_tokens=4096, temperature=0.3, max_tokens=4096,
) )
return response.choices[0].message.content
except Exception as e: except Exception as e:
print(f" ⚠️ LLM 整合失败: {e},使用拼接模式") print(f" ⚠️ LLM 整合失败: {e},使用拼接模式")
return self._fallback_concat(sessions, charts) return self._fallback_concat(sessions, charts)

View File

@@ -5,7 +5,7 @@ import json
from typing import Any from typing import Any
from core.config import LLM_CONFIG from core.config import LLM_CONFIG
from core.utils import get_llm_client from core.utils import get_llm_client, llm_chat
from layers.explorer import ExplorationStep from layers.explorer import ExplorationStep
from layers.insights import Insight from layers.insights import Insight
@@ -58,15 +58,14 @@ class ReportGenerator:
charts_text=charts_text, charts_text=charts_text,
) )
response = self.client.chat.completions.create( return llm_chat(
model=self.model, self.client, self.model,
messages=[ messages=[
{"role": "system", "content": "你是专业的数据分析师,撰写清晰、有洞察力的分析报告。"}, {"role": "system", "content": "你是专业的数据分析师,撰写清晰、有洞察力的分析报告。"},
{"role": "user", "content": prompt}, {"role": "user", "content": prompt},
], ],
temperature=0.3, max_tokens=4096, temperature=0.3, max_tokens=4096,
) )
return response.choices[0].message.content
def _build_exploration(self, steps: list[ExplorationStep]) -> str: def _build_exploration(self, steps: list[ExplorationStep]) -> str:
parts = [] parts = []