Files
assist/src/knowledge_base/knowledge_manager.py

758 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import logging
from typing import List, Dict, Optional, Any
from datetime import datetime
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sqlalchemy import func, Integer
from ..core.database import db_manager
from ..core.models import KnowledgeEntry, WorkOrder, Conversation
from ..core.llm_client import QwenClient
from ..core.embedding_client import EmbeddingClient
from ..core.vector_store import vector_store
from ..config.unified_config import get_config
logger = logging.getLogger(__name__)
class KnowledgeManager:
"""知识库管理器"""
def __init__(self):
self.llm_client = QwenClient()
self.embedding_client = EmbeddingClient()
self.embedding_enabled = get_config().embedding.enabled
self.similarity_threshold = get_config().embedding.similarity_threshold
self.vectorizer = TfidfVectorizer(
max_features=1000,
stop_words=None, # 不使用英文停用词,因为数据是中文
ngram_range=(1, 2)
)
self._load_vectorizer()
# 加载向量索引embedding 模式)
if self.embedding_enabled:
vector_store.load_from_db()
def _load_vectorizer(self):
"""加载向量化器"""
try:
logger.debug("正在初始化知识库向量化器...")
with db_manager.get_session() as session:
entries = session.query(KnowledgeEntry).filter(
KnowledgeEntry.is_active == True
).all()
if entries:
texts = [entry.question + " " + entry.answer for entry in entries]
self.vectorizer.fit(texts)
logger.debug(f"向量化器加载成功: 共处理 {len(entries)} 个知识条目")
else:
logger.warning("知识库尚无活跃条目,向量化器将保持空状态")
except Exception as e:
logger.error(f"加载向量化器失败: {e}")
def learn_from_work_order(self, work_order_id: int) -> bool:
"""从工单中学习知识"""
try:
with db_manager.get_session() as session:
work_order = session.query(WorkOrder).filter(
WorkOrder.id == work_order_id
).first()
if not work_order or not work_order.resolution:
return False
# 提取问题和答案
question = work_order.title + " " + work_order.description
answer = work_order.resolution
logger.info(f"开始从工单 {work_order_id} 学习知识: 标题长度={len(work_order.title)}, 描述长度={len(work_order.description)}")
# 检查是否已存在相似条目
existing_entry = self._find_similar_entry(question, session)
if existing_entry:
# 更新现有条目
logger.info(f"检测到相似知识条目 (ID: {existing_entry.id}),执行更新操作")
existing_entry.answer = answer
existing_entry.usage_count += 1
existing_entry.updated_at = datetime.now()
if work_order.satisfaction_score:
existing_entry.confidence_score = work_order.satisfaction_score
# 更新 embedding
if self.embedding_enabled:
vec = self.embedding_client.embed_text(question + " " + answer)
if vec:
existing_entry.vector_embedding = json.dumps(vec)
vector_store.update(existing_entry.id, vec)
else:
# 创建新条目
logger.info(f"未发现相似条目,正在为工单 {work_order_id} 创建新知识点")
embedding_json = None
vec = None
if self.embedding_enabled:
vec = self.embedding_client.embed_text(question + " " + answer)
if vec:
embedding_json = json.dumps(vec)
new_entry = KnowledgeEntry(
question=question,
answer=answer,
category=work_order.category,
tenant_id=work_order.tenant_id,
confidence_score=work_order.satisfaction_score or 0.5,
usage_count=1,
vector_embedding=embedding_json
)
session.add(new_entry)
session.flush() # 获取 ID
if vec and new_entry.id:
vector_store.add(new_entry.id, vec)
session.commit()
logger.info(f"从工单 {work_order_id} 学习知识成功")
return True
except Exception as e:
logger.error(f"从工单学习知识失败: {e}")
return False
def _find_similar_entry(self, question: str, session) -> Optional[KnowledgeEntry]:
"""查找相似的知识库条目"""
try:
# 优先使用 embedding 查找
if self.embedding_enabled:
query_vec = self.embedding_client.embed_text(question)
if query_vec:
candidates = vector_store.search(query_vec, top_k=1, threshold=0.8)
if candidates:
entry_id, score = candidates[0]
entry = session.query(KnowledgeEntry).filter(
KnowledgeEntry.id == entry_id,
KnowledgeEntry.is_active == True
).first()
if entry:
logger.info(f"Embedding 匹配成功: 相似度 {score:.4f}, ID={entry_id}")
return entry
# 降级TF-IDF 匹配
entries = session.query(KnowledgeEntry).filter(
KnowledgeEntry.is_active == True
).all()
if not entries:
return None
texts = [entry.question for entry in entries]
question_vector = self.vectorizer.transform([question])
entry_vectors = self.vectorizer.transform(texts)
similarities = cosine_similarity(question_vector, entry_vectors)[0]
max_similarity_idx = np.argmax(similarities)
max_score = similarities[max_similarity_idx]
logger.debug(f"TF-IDF 相似度检索: 最高分值={max_score:.4f}")
if max_score > 0.8:
return entries[max_similarity_idx]
return None
except Exception as e:
logger.error(f"查找相似条目失败: {e}")
return None
def search_knowledge(self, query: str, top_k: int = 3, verified_only: bool = True, tenant_id: Optional[str] = None) -> List[Dict[str, Any]]:
"""搜索知识库 — 优先使用 embedding 语义检索,降级为关键词匹配"""
try:
# 尝试 embedding 语义检索
if self.embedding_enabled:
results = self._search_by_embedding(query, top_k, verified_only, tenant_id=tenant_id)
if results:
return results
logger.debug("Embedding 检索无结果,降级为关键词匹配")
# 降级:关键词匹配
return self._search_by_keyword(query, top_k, verified_only, tenant_id=tenant_id)
except Exception as e:
logger.error(f"搜索知识库失败: {e}")
return []
def _search_by_embedding(self, query: str, top_k: int = 3, verified_only: bool = True, tenant_id: Optional[str] = None) -> List[Dict[str, Any]]:
"""基于 embedding 向量的语义检索"""
try:
query_vec = self.embedding_client.embed_text(query)
if query_vec is None:
return []
# 向量检索
candidates = vector_store.search(
query_vector=query_vec,
top_k=top_k * 3, # 多取一些,后面过滤
threshold=self.similarity_threshold
)
if not candidates:
return []
# 从 DB 获取完整条目并过滤
candidate_ids = [cid for cid, _ in candidates]
score_map = {cid: score for cid, score in candidates}
with db_manager.get_session() as session:
query_filter = session.query(KnowledgeEntry).filter(
KnowledgeEntry.id.in_(candidate_ids),
KnowledgeEntry.is_active == True
)
if tenant_id is not None:
query_filter = query_filter.filter(KnowledgeEntry.tenant_id == tenant_id)
if verified_only:
query_filter = query_filter.filter(KnowledgeEntry.is_verified == True)
entries = query_filter.all()
# 如果 verified_only 没结果,回退到全部
if not entries and verified_only:
fallback_filter = session.query(KnowledgeEntry).filter(
KnowledgeEntry.id.in_(candidate_ids),
KnowledgeEntry.is_active == True
)
if tenant_id is not None:
fallback_filter = fallback_filter.filter(KnowledgeEntry.tenant_id == tenant_id)
entries = fallback_filter.all()
results = []
for entry in entries:
results.append({
"id": entry.id,
"question": entry.question,
"answer": entry.answer,
"category": entry.category,
"confidence_score": entry.confidence_score,
"similarity_score": score_map.get(entry.id, 0.0),
"usage_count": entry.usage_count,
"is_verified": entry.is_verified
})
results.sort(key=lambda x: x['similarity_score'], reverse=True)
results = results[:top_k]
logger.info(f"Embedding 搜索 '{query[:30]}' 返回 {len(results)} 个结果")
return results
except Exception as e:
logger.error(f"Embedding 搜索失败: {e}")
return []
def _search_by_keyword(self, query: str, top_k: int = 3, verified_only: bool = True, tenant_id: Optional[str] = None) -> List[Dict[str, Any]]:
"""基于关键词的搜索(降级方案)"""
try:
with db_manager.get_session() as session:
# 构建查询条件
query_filter = session.query(KnowledgeEntry).filter(
KnowledgeEntry.is_active == True
)
if tenant_id is not None:
query_filter = query_filter.filter(KnowledgeEntry.tenant_id == tenant_id)
# 如果只搜索已验证的知识库
if verified_only:
query_filter = query_filter.filter(KnowledgeEntry.is_verified == True)
entries = query_filter.all()
# 若已验证为空,则回退到全部活跃条目
if not entries and verified_only:
fallback_filter = session.query(KnowledgeEntry).filter(KnowledgeEntry.is_active == True)
if tenant_id is not None:
fallback_filter = fallback_filter.filter(KnowledgeEntry.tenant_id == tenant_id)
entries = fallback_filter.all()
if not entries:
logger.warning("知识库中没有活跃条目")
return []
# 如果查询为空,返回所有条目
if not query.strip():
logger.info("查询为空,返回所有条目")
return [{
"id": entry.id,
"question": entry.question,
"answer": entry.answer,
"category": entry.category,
"confidence_score": entry.confidence_score,
"similarity_score": 1.0,
"usage_count": entry.usage_count,
"is_verified": entry.is_verified
} for entry in entries[:top_k]]
# 使用简化的关键词匹配搜索
q = query.strip().lower()
results = []
for entry in entries:
# 组合问题和答案进行搜索
search_text = (entry.question + " " + entry.answer).lower()
# 计算匹配分数
score = 0.0
# 完全匹配
if q in search_text:
score = 1.0
else:
# 分词匹配
query_words = q.split()
text_words = search_text.split()
# 计算单词匹配度
matched_words = 0
for word in query_words:
if word in text_words:
matched_words += 1
if matched_words > 0:
score = matched_words / len(query_words) * 0.8
# 如果分数大于0添加到结果中
if score > 0:
results.append({
"id": entry.id,
"question": entry.question,
"answer": entry.answer,
"category": entry.category,
"confidence_score": entry.confidence_score,
"similarity_score": score,
"usage_count": entry.usage_count,
"is_verified": entry.is_verified
})
# 按相似度排序并返回top_k个结果
results.sort(key=lambda x: x['similarity_score'], reverse=True)
results = results[:top_k]
logger.info(f"搜索查询 '{query}' 返回 {len(results)} 个结果")
return results
except Exception as e:
logger.error(f"搜索知识库失败: {e}")
return []
def add_knowledge_entry(
self,
question: str,
answer: str,
category: str,
confidence_score: float = 0.5,
is_verified: bool = False,
tenant_id: Optional[str] = None
) -> bool:
"""添加知识库条目"""
try:
# 确定 tenant_id优先使用传入值否则取配置默认值
effective_tenant_id = tenant_id if tenant_id is not None else get_config().server.tenant_id
# 生成 embedding
embedding_json = None
vec = None
text_for_embedding = question + " " + answer
if self.embedding_enabled:
vec = self.embedding_client.embed_text(text_for_embedding)
if vec:
embedding_json = json.dumps(vec)
with db_manager.get_session() as session:
entry = KnowledgeEntry(
question=question,
answer=answer,
category=category,
confidence_score=confidence_score,
usage_count=0,
is_verified=is_verified,
tenant_id=effective_tenant_id,
vector_embedding=embedding_json
)
session.add(entry)
session.commit()
entry_id = entry.id
# 更新向量索引
if vec and entry_id:
vector_store.add(entry_id, vec)
# 重新训练 TF-IDF 向量化器
self._load_vectorizer()
logger.info(f"添加知识库条目成功: {question[:50]}...")
return True
except Exception as e:
logger.error(f"添加知识库条目失败: {e}")
return False
def update_knowledge_entry(
self,
entry_id: int,
question: str = None,
answer: str = None,
category: str = None,
confidence_score: float = None
) -> bool:
"""更新知识库条目"""
try:
with db_manager.get_session() as session:
entry = session.query(KnowledgeEntry).filter(
KnowledgeEntry.id == entry_id
).first()
if not entry:
return False
content_changed = False
if question:
entry.question = question
content_changed = True
if answer:
entry.answer = answer
content_changed = True
if category:
entry.category = category
if confidence_score is not None:
entry.confidence_score = confidence_score
# 内容变更时重新生成 embedding
if content_changed and self.embedding_enabled:
text_for_embedding = (question or entry.question) + " " + (answer or entry.answer)
vec = self.embedding_client.embed_text(text_for_embedding)
if vec:
entry.vector_embedding = json.dumps(vec)
vector_store.update(entry_id, vec)
entry.updated_at = datetime.now()
session.commit()
logger.info(f"更新知识库条目成功: {entry_id}")
return True
except Exception as e:
logger.error(f"更新知识库条目失败: {e}")
return False
def get_knowledge_entries(self, page: int = 1, per_page: int = 10) -> Dict[str, Any]:
"""获取知识库条目(分页)"""
try:
with db_manager.get_session() as session:
# 计算偏移量
offset = (page - 1) * per_page
# 获取总数
total = session.query(KnowledgeEntry).filter(
KnowledgeEntry.is_active == True
).count()
# 获取分页数据
entries = session.query(KnowledgeEntry).filter(
KnowledgeEntry.is_active == True
).order_by(KnowledgeEntry.created_at.desc()).offset(offset).limit(per_page).all()
# 转换为字典格式
knowledge_list = []
for entry in entries:
knowledge_list.append({
"id": entry.id,
"question": entry.question,
"answer": entry.answer,
"category": entry.category,
"confidence_score": entry.confidence_score,
"usage_count": entry.usage_count,
"created_at": entry.created_at.isoformat() if entry.created_at else None,
"is_verified": getattr(entry, 'is_verified', False) # 添加验证状态
})
return {
"knowledge": knowledge_list,
"total": total,
"page": page,
"per_page": per_page,
"total_pages": (total + per_page - 1) // per_page
}
except Exception as e:
logger.error(f"获取知识库条目失败: {e}")
return {"knowledge": [], "total": 0, "page": 1, "per_page": per_page, "total_pages": 0}
def verify_knowledge_entry(self, entry_id: int, verified_by: str = "admin") -> bool:
"""验证知识库条目"""
try:
with db_manager.get_session() as session:
entry = session.query(KnowledgeEntry).filter(
KnowledgeEntry.id == entry_id
).first()
if not entry:
return False
entry.is_verified = True
entry.verified_by = verified_by
entry.verified_at = datetime.now()
session.commit()
logger.info(f"知识库条目验证成功: {entry_id}")
return True
except Exception as e:
logger.error(f"验证知识库条目失败: {e}")
return False
def unverify_knowledge_entry(self, entry_id: int) -> bool:
"""取消验证知识库条目"""
try:
with db_manager.get_session() as session:
entry = session.query(KnowledgeEntry).filter(
KnowledgeEntry.id == entry_id
).first()
if not entry:
return False
entry.is_verified = False
entry.verified_by = None
entry.verified_at = None
session.commit()
logger.info(f"知识库条目取消验证成功: {entry_id}")
return True
except Exception as e:
logger.error(f"取消验证知识库条目失败: {e}")
return False
def delete_knowledge_entry(self, entry_id: int) -> bool:
"""删除知识库条目(软删除)"""
try:
with db_manager.get_session() as session:
entry = session.query(KnowledgeEntry).filter(
KnowledgeEntry.id == entry_id
).first()
if not entry:
logger.warning(f"知识库条目不存在: {entry_id}")
return False
entry.is_active = False
session.commit()
# 从向量索引中移除
vector_store.remove(entry_id)
# 重新训练向量化器(如果还有活跃条目)
try:
self._load_vectorizer()
except Exception as vectorizer_error:
logger.warning(f"重新加载向量化器失败: {vectorizer_error}")
logger.info(f"删除知识库条目成功: {entry_id}")
return True
except Exception as e:
logger.error(f"删除知识库条目失败: {e}")
return False
def get_knowledge_stats(self, tenant_id: Optional[str] = None) -> Dict[str, Any]:
"""获取知识库统计信息"""
try:
with db_manager.get_session() as session:
# 基础过滤条件
base_filter = [KnowledgeEntry.is_active == True]
if tenant_id is not None:
base_filter.append(KnowledgeEntry.tenant_id == tenant_id)
# 只统计活跃(未删除)的条目
total_entries = session.query(KnowledgeEntry).filter(
*base_filter
).count()
# 统计已验证的条目
verified_entries = session.query(KnowledgeEntry).filter(
*base_filter,
KnowledgeEntry.is_verified == True
).count()
# 按类别统计(仅限活跃条目)
category_stats = session.query(
KnowledgeEntry.category,
func.count(KnowledgeEntry.id)
).filter(
*base_filter
).group_by(KnowledgeEntry.category).all()
# 平均置信度(仅限活跃条目)
avg_confidence = session.query(
func.avg(KnowledgeEntry.confidence_score)
).filter(
*base_filter
).scalar() or 0.0
result = {
"total_entries": total_entries,
"active_entries": verified_entries, # 将 active_entries 复用为已验证数量,或前端相应修改
"category_distribution": dict(category_stats),
"average_confidence": float(avg_confidence)
}
if tenant_id is not None:
result["tenant_id"] = tenant_id
return result
except Exception as e:
logger.error(f"获取知识库统计失败: {e}")
return {}
def get_tenant_summary(self) -> List[Dict[str, Any]]:
"""按 tenant_id 聚合活跃知识条目,返回租户汇总列表。
返回格式: [
{
"tenant_id": "market_a",
"entry_count": 42,
"verified_count": 30,
"category_distribution": {"FAQ": 20, "故障排查": 22}
}, ...
]
按 entry_count 降序排列。
"""
try:
with db_manager.get_session() as session:
# 主聚合查询:按 tenant_id 统计 entry_count 和 verified_count
summary_rows = session.query(
KnowledgeEntry.tenant_id,
func.count(KnowledgeEntry.id).label('entry_count'),
func.sum(
func.cast(KnowledgeEntry.is_verified, Integer)
).label('verified_count')
).filter(
KnowledgeEntry.is_active == True
).group_by(
KnowledgeEntry.tenant_id
).order_by(
func.count(KnowledgeEntry.id).desc()
).all()
if not summary_rows:
return []
# 类别分布查询:按 tenant_id + category 统计
category_rows = session.query(
KnowledgeEntry.tenant_id,
KnowledgeEntry.category,
func.count(KnowledgeEntry.id).label('cat_count')
).filter(
KnowledgeEntry.is_active == True
).group_by(
KnowledgeEntry.tenant_id,
KnowledgeEntry.category
).all()
# 构建 tenant_id -> {category: count} 映射
category_map: Dict[str, Dict[str, int]] = {}
for row in category_rows:
if row.tenant_id not in category_map:
category_map[row.tenant_id] = {}
category_map[row.tenant_id][row.category] = row.cat_count
# 组装结果
result = []
for row in summary_rows:
result.append({
"tenant_id": row.tenant_id,
"entry_count": row.entry_count,
"verified_count": int(row.verified_count or 0),
"category_distribution": category_map.get(row.tenant_id, {})
})
return result
except Exception as e:
logger.error(f"获取租户汇总失败: {e}")
return []
def update_usage_count(self, entry_ids: List[int]) -> bool:
"""更新知识库条目的使用次数"""
try:
with db_manager.get_session() as session:
# 批量更新使用次数
session.query(KnowledgeEntry).filter(
KnowledgeEntry.id.in_(entry_ids)
).update({
"usage_count": KnowledgeEntry.usage_count + 1,
"updated_at": datetime.now()
}, synchronize_session=False)
session.commit()
logger.info(f"成功更新 {len(entry_ids)} 个知识库条目的使用次数")
return True
except Exception as e:
logger.error(f"更新知识库使用次数失败: {e}")
return False
def get_knowledge_paginated(self, page: int = 1, per_page: int = 10, category_filter: str = '', verified_filter: str = '', tenant_id: Optional[str] = None) -> Dict[str, Any]:
"""获取知识库条目(分页和过滤)"""
try:
with db_manager.get_session() as session:
query = session.query(KnowledgeEntry).filter(KnowledgeEntry.is_active == True)
if tenant_id is not None:
query = query.filter(KnowledgeEntry.tenant_id == tenant_id)
if category_filter:
query = query.filter(KnowledgeEntry.category == category_filter)
if verified_filter:
if verified_filter == 'true':
query = query.filter(KnowledgeEntry.is_verified == True)
elif verified_filter == 'false':
query = query.filter(KnowledgeEntry.is_verified == False)
query = query.order_by(KnowledgeEntry.created_at.desc())
total = query.count()
knowledge_entries = query.offset((page - 1) * per_page).limit(per_page).all()
knowledge_data = []
for entry in knowledge_entries:
knowledge_data.append({
'id': entry.id,
'question': entry.question,
'answer': entry.answer,
'category': entry.category,
'confidence_score': entry.confidence_score,
'usage_count': entry.usage_count,
'is_verified': entry.is_verified,
'is_active': entry.is_active,
'created_at': entry.created_at.isoformat() if entry.created_at else None,
'updated_at': entry.updated_at.isoformat() if entry.updated_at else None
})
total_pages = (total + per_page - 1) // per_page
return {
'knowledge': knowledge_data,
'page': page,
'per_page': per_page,
'total': total,
'total_pages': total_pages
}
except Exception as e:
logger.error(f"获取分页知识库失败: {e}")
# 返回一个空的结构以避免在调用方出现错误
return {
'knowledge': [],
'page': page,
'per_page': per_page,
'total': 0,
'total_pages': 0
}