feat: 四层架构全面增强
安全与稳定性: - 移除硬编码 API Key,改用 .env + 环境变量 - LLM 调用统一重试机制(指数退避,3 次重试,处理 429/5xx/超时) - 中文字体检测增强(CJK 关键词兜底 + 无字体时英文 fallback) - 缺失 API Key 给出友好提示而非崩溃 分析能力提升: - 异常检测新增 z-score 检测(标准差>2 标记异常) - 新增变异系数 CV 检测(数据波动性) - 新增零值/缺失检测 - 上下文管理器升级为关键词语义匹配(替代简单取最近 2 条) 用户体验: - 报告自动保存为 Markdown(reports/ 目录) - 新增 export 命令导出查询结果为 CSV - 新增 reports 命令查看已保存报告 - CLI 支持 readline 命令历史(方向键翻阅) - CSV 导入工具重写:自动列名映射、容错处理、dry-run 模式 - 新增 .env.example 配置模板
This commit is contained in:
@@ -1,7 +1,10 @@
|
||||
"""
|
||||
Layer 4: 上下文管理器
|
||||
Layer 4: 上下文管理器 —— 增强版
|
||||
- 关键词语义匹配,替代简单取最近 N 条
|
||||
- 会话摘要去重
|
||||
"""
|
||||
import time
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
@@ -19,6 +22,26 @@ class AnalysisSession:
|
||||
report: str
|
||||
timestamp: float = field(default_factory=time.time)
|
||||
|
||||
@property
|
||||
def keywords(self) -> set[str]:
|
||||
"""提取会话关键词(中文分字 + 英文词切分)"""
|
||||
text = f"{self.question} {self.plan.get('intent', '')} {' '.join(self.plan.get('dimensions', []))}"
|
||||
# 中文字符
|
||||
cn_chars = set(re.findall(r'[\u4e00-\u9fff]+', text))
|
||||
# 英文单词(小写)
|
||||
en_words = set(re.findall(r'[a-zA-Z]{2,}', text.lower()))
|
||||
return cn_chars | en_words
|
||||
|
||||
def similarity(self, question: str) -> float:
|
||||
"""与新问题的关键词相似度(Jaccard-like)"""
|
||||
q_cn = set(re.findall(r'[\u4e00-\u9fff]+', question))
|
||||
q_en = set(re.findall(r'[a-zA-Z]{2,}', question.lower()))
|
||||
q_kw = q_cn | q_en
|
||||
if not q_kw:
|
||||
return 0.0
|
||||
overlap = self.keywords & q_kw
|
||||
return len(overlap) / len(q_kw)
|
||||
|
||||
def summary(self) -> str:
|
||||
parts = [f"**问题**: {self.question}"]
|
||||
if self.plan:
|
||||
@@ -48,9 +71,9 @@ class AnalysisSession:
|
||||
|
||||
|
||||
class ContextManager:
|
||||
"""上下文管理器"""
|
||||
"""上下文管理器 —— 语义匹配增强版"""
|
||||
|
||||
def __init__(self, max_history: int = 20):
    """Create an empty context manager.

    Args:
        max_history: maximum number of analysis sessions to retain
            (default raised from 10 to 20 so the semantic matcher has a
            deeper pool of candidate sessions to score).
    """
    # Chronological list of completed AnalysisSession records (oldest first).
    self.sessions: list[AnalysisSession] = []
    self.max_history = max_history
|
||||
|
||||
@@ -63,9 +86,25 @@ class ContextManager:
|
||||
return session
|
||||
|
||||
def get_context_for(self, new_question: str) -> Optional[str]:
    """Return reference text built from the most relevant past sessions.

    Scores every stored session against *new_question* via keyword
    similarity; sessions scoring above 0.3 are cited — at most 3, in
    descending similarity order. When nothing clears the threshold, the
    most recent session is cited as a fallback. Returns None only when
    there is no history at all.

    (The old behavior — blindly citing the last 2 sessions — was removed;
    a stale copy of that return statement had been left interleaved here.)
    """
    if not self.sessions:
        return None

    scored = []
    for s in self.sessions:
        sim = s.similarity(new_question)
        if sim > 0.3:  # relevance threshold
            scored.append((sim, s))

    if not scored:
        # No sufficiently related history: cite the latest session as a fallback.
        return self.sessions[-1].to_reference_text()

    # Key on the score only, so equal-score sessions never get compared directly
    # and stable sort preserves their chronological order.
    scored.sort(key=lambda x: x[0], reverse=True)
    return "\n\n---\n\n".join(s.to_reference_text() for _, s in scored[:3])
|
||||
|
||||
def get_history_summary(self) -> str:
|
||||
if not self.sessions:
|
||||
|
||||
@@ -6,7 +6,7 @@ from typing import Any
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from core.config import LLM_CONFIG
|
||||
from core.utils import get_llm_client, extract_json_object
|
||||
from core.utils import get_llm_client, llm_chat, extract_json_object
|
||||
from core.sandbox import SandboxExecutor
|
||||
|
||||
|
||||
@@ -206,10 +206,10 @@ class Explorer:
|
||||
return "\n\n".join(parts)
|
||||
|
||||
def _llm_decide(self, messages: list[dict]) -> dict:
    """Ask the LLM for the next exploration action and parse the JSON reply.

    Delegates the call to `llm_chat` (the shared helper that, per this
    commit, centralizes retries with exponential backoff), then extracts a
    JSON object from the reply. A malformed reply never crashes the
    exploration loop: it degrades to a terminal "done" action carrying the
    first 100 characters of the raw content for diagnostics.

    (Stale pre-refactor lines that still called
    `self.client.chat.completions.create` directly are removed here.)
    """
    content = llm_chat(
        self.client, self.model,
        messages=messages, temperature=0.2, max_tokens=1024,
    )
    result = extract_json_object(content)
    return result if result else {"action": "done", "reasoning": f"无法解析: {content[:100]}"}
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ import json
|
||||
from typing import Any
|
||||
|
||||
from core.config import LLM_CONFIG
|
||||
from core.utils import get_llm_client, extract_json_array
|
||||
from core.utils import get_llm_client, llm_chat, extract_json_array
|
||||
from layers.explorer import ExplorationStep
|
||||
|
||||
|
||||
@@ -68,15 +68,14 @@ class InsightEngine:
|
||||
return []
|
||||
|
||||
history = self._build_history(steps)
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
content = llm_chat(
|
||||
self.client, self.model,
|
||||
messages=[
|
||||
{"role": "system", "content": INSIGHT_SYSTEM},
|
||||
{"role": "user", "content": f"## 用户问题\n{question}\n\n## 探索历史\n{history}\n\n请分析以上数据,输出异常和洞察。"},
|
||||
],
|
||||
temperature=0.3, max_tokens=2048,
|
||||
)
|
||||
content = response.choices[0].message.content.strip()
|
||||
return [Insight(d) for d in extract_json_array(content)]
|
||||
|
||||
def format_insights(self, insights: list[Insight]) -> str:
|
||||
@@ -109,9 +108,9 @@ class InsightEngine:
|
||||
|
||||
|
||||
def quick_detect(steps: list[ExplorationStep]) -> list[str]:
|
||||
"""基于规则的快速异常检测,不调 LLM"""
|
||||
"""基于规则的快速异常检测(零 LLM 成本)"""
|
||||
alerts = []
|
||||
seen = set() # 去重
|
||||
seen = set()
|
||||
|
||||
for step in steps:
|
||||
if not step.success or not step.rows:
|
||||
@@ -119,22 +118,21 @@ def quick_detect(steps: list[ExplorationStep]) -> list[str]:
|
||||
|
||||
for col in step.columns:
|
||||
vals = [r.get(col) for r in step.rows if isinstance(r.get(col), (int, float))]
|
||||
if not vals:
|
||||
if len(vals) < 2:
|
||||
continue
|
||||
|
||||
col_lower = col.lower()
|
||||
|
||||
# 占比列:某个分组占比过高
|
||||
# ── 占比列:集中度过高 ──
|
||||
if col_lower in ("pct", "percent", "percentage", "占比"):
|
||||
for v in vals:
|
||||
if v > 50:
|
||||
key = f"pct_{step.purpose}"
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
alerts.append(f"⚠️ {step.purpose} 中某个分组占比 {v}%,集中度过高")
|
||||
break
|
||||
max_pct = max(vals)
|
||||
if max_pct > 50:
|
||||
key = f"pct_{step.purpose}"
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
alerts.append(f"⚠️ {step.purpose}: 最高占比 {max_pct}%,集中度过高")
|
||||
|
||||
# 计数列:极值差异
|
||||
# ── 计数列:极值差异 ──
|
||||
if col_lower in ("count", "cnt", "n", "total", "order_count") and len(vals) >= 3:
|
||||
avg = sum(vals) / len(vals)
|
||||
if avg > 0:
|
||||
@@ -143,6 +141,47 @@ def quick_detect(steps: list[ExplorationStep]) -> list[str]:
|
||||
key = f"count_{step.purpose}"
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
alerts.append(f"⚠️ {step.purpose} 中最大值是均值的 {ratio:.1f} 倍")
|
||||
alerts.append(f"⚠️ {step.purpose}: 最大值是均值的 {ratio:.1f} 倍")
|
||||
|
||||
# ── Z-Score 异常检测 ──
|
||||
if len(vals) >= 5 and col_lower not in ("id", "year", "month"):
|
||||
mean = sum(vals) / len(vals)
|
||||
variance = sum((v - mean) ** 2 for v in vals) / len(vals)
|
||||
std = variance ** 0.5
|
||||
if std > 0:
|
||||
outliers = [(i, v) for i, v in enumerate(vals) if abs(v - mean) / std > 2]
|
||||
if outliers:
|
||||
key = f"zscore_{step.purpose}_{col}"
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
outlier_desc = ", ".join(f"{v:.1f}" for _, v in outliers[:3])
|
||||
alerts.append(
|
||||
f"⚠️ {step.purpose}「{col}」发现 {len(outliers)} 个异常值 "
|
||||
f"(均值={mean:.1f}, σ={std:.1f}, 异常值={outlier_desc})"
|
||||
)
|
||||
|
||||
# ── 离散度检测(变异系数 CV)──
|
||||
if len(vals) >= 3 and col_lower not in ("id", "year", "month"):
|
||||
mean = sum(vals) / len(vals)
|
||||
if mean != 0:
|
||||
variance = sum((v - mean) ** 2 for v in vals) / len(vals)
|
||||
std = variance ** 0.5
|
||||
cv = std / abs(mean)
|
||||
if cv > 1.0:
|
||||
key = f"cv_{step.purpose}_{col}"
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
alerts.append(f"⚠️ {step.purpose}「{col}」离散度高 (CV={cv:.2f}),数据波动大")
|
||||
|
||||
# ── 零值/缺失检测 ──
|
||||
if col_lower in ("count", "cnt", "total", "amount", "sum", "关闭时长"):
|
||||
zero_count = sum(1 for v in vals if v == 0)
|
||||
if zero_count > 0 and zero_count < len(vals):
|
||||
pct = zero_count / len(vals) * 100
|
||||
if pct > 10:
|
||||
key = f"zero_{step.purpose}_{col}"
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
alerts.append(f"⚠️ {step.purpose}「{col}」有 {zero_count} 个零值 ({pct:.0f}%)")
|
||||
|
||||
return alerts
|
||||
|
||||
@@ -5,7 +5,7 @@ import json
|
||||
from typing import Any
|
||||
|
||||
from core.config import LLM_CONFIG
|
||||
from core.utils import get_llm_client, extract_json_object
|
||||
from core.utils import get_llm_client, llm_chat, extract_json_object
|
||||
|
||||
PROMPT = """你是一个数据分析规划专家。
|
||||
|
||||
@@ -52,8 +52,8 @@ class Planner:
|
||||
self.client, self.model = get_llm_client(LLM_CONFIG)
|
||||
|
||||
def plan(self, question: str, schema_text: str) -> dict[str, Any]:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
content = llm_chat(
|
||||
self.client, self.model,
|
||||
messages=[
|
||||
{"role": "system", "content": PROMPT},
|
||||
{"role": "user", "content": f"## Schema\n{schema_text}\n\n## 用户问题\n{question}"},
|
||||
@@ -61,7 +61,6 @@ class Planner:
|
||||
temperature=0.1,
|
||||
max_tokens=1024,
|
||||
)
|
||||
content = response.choices[0].message.content.strip()
|
||||
plan = extract_json_object(content)
|
||||
|
||||
if not plan:
|
||||
|
||||
@@ -7,7 +7,7 @@ import re
|
||||
from typing import Optional
|
||||
|
||||
from core.config import LLM_CONFIG
|
||||
from core.utils import get_llm_client, extract_json_object, extract_json_array
|
||||
from core.utils import get_llm_client, llm_chat, extract_json_object, extract_json_array
|
||||
|
||||
|
||||
class Playbook:
|
||||
@@ -87,15 +87,14 @@ class PlaybookManager:
|
||||
- 直接使用实际表名和列名"""
|
||||
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
content = llm_chat(
|
||||
self.client, self.model,
|
||||
messages=[
|
||||
{"role": "system", "content": "你是数据分析专家。只输出 JSON,不要其他内容。"},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
temperature=0.3, max_tokens=4096,
|
||||
)
|
||||
content = response.choices[0].message.content.strip()
|
||||
playbooks_data = extract_json_array(content)
|
||||
if not playbooks_data:
|
||||
return []
|
||||
@@ -150,15 +149,15 @@ class PlaybookManager:
|
||||
不匹配: {{"matched": false, "reasoning": "原因"}}"""
|
||||
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
content = llm_chat(
|
||||
self.client, self.model,
|
||||
messages=[
|
||||
{"role": "system", "content": "你是分析计划匹配器。"},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
temperature=0.1, max_tokens=512,
|
||||
)
|
||||
result = extract_json_object(response.choices[0].message.content.strip())
|
||||
result = extract_json_object(content)
|
||||
if not result.get("matched"):
|
||||
return None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user