import json
import logging
from typing import List, Dict, Optional, Any
from datetime import datetime

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sqlalchemy import func

from ..core.database import db_manager
from ..core.models import KnowledgeEntry, WorkOrder, Conversation
from ..core.llm_client import QwenClient
from ..core.embedding_client import EmbeddingClient
from ..core.vector_store import vector_store
from ..config.unified_config import get_config

logger = logging.getLogger(__name__)


class KnowledgeManager:
    """Knowledge-base manager.

    Learns Q/A entries from resolved work orders and serves retrieval over
    them, preferring embedding-based semantic search (via ``vector_store``)
    and falling back to TF-IDF / keyword matching when embeddings are
    disabled or return nothing.
    """

    def __init__(self):
        self.llm_client = QwenClient()
        self.embedding_client = EmbeddingClient()
        self.embedding_enabled = get_config().embedding.enabled
        self.similarity_threshold = get_config().embedding.similarity_threshold
        # TF-IDF fallback retriever. No English stop words: corpus is Chinese.
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words=None,
            ngram_range=(1, 2),
        )
        self._load_vectorizer()
        # Populate the in-memory vector index when embedding mode is on.
        if self.embedding_enabled:
            vector_store.load_from_db()

    def _load_vectorizer(self):
        """Fit the TF-IDF vectorizer on every active knowledge entry.

        Leaves the vectorizer unfitted when the knowledge base is empty;
        ``_find_similar_entry`` checks for that before calling transform.
        """
        try:
            logger.info("正在初始化知识库向量化器...")
            with db_manager.get_session() as session:
                entries = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.is_active == True
                ).all()
                if entries:
                    texts = [entry.question + " " + entry.answer for entry in entries]
                    self.vectorizer.fit(texts)
                    logger.info(f"向量化器加载成功: 共处理 {len(entries)} 个知识条目")
                else:
                    logger.warning("知识库尚无活跃条目,向量化器将保持空状态")
        except Exception as e:
            logger.error(f"加载向量化器失败: {e}")

    def learn_from_work_order(self, work_order_id: int) -> bool:
        """Create or update a knowledge entry from a resolved work order.

        Returns True on success, False when the work order is missing,
        has no resolution, or an error occurred.
        """
        try:
            with db_manager.get_session() as session:
                work_order = session.query(WorkOrder).filter(
                    WorkOrder.id == work_order_id
                ).first()
                if not work_order or not work_order.resolution:
                    return False

                # FIX: title/description may be NULL in the DB — guard so
                # string concatenation cannot raise TypeError.
                title = work_order.title or ""
                description = work_order.description or ""
                question = title + " " + description
                answer = work_order.resolution
                logger.info(f"开始从工单 {work_order_id} 学习知识: 标题长度={len(title)}, 描述长度={len(description)}")

                # Deduplicate: merge into an existing similar entry if any.
                existing_entry = self._find_similar_entry(question, session)
                if existing_entry:
                    logger.info(f"检测到相似知识条目 (ID: {existing_entry.id}),执行更新操作")
                    existing_entry.answer = answer
                    existing_entry.usage_count += 1
                    existing_entry.updated_at = datetime.now()
                    if work_order.satisfaction_score:
                        existing_entry.confidence_score = work_order.satisfaction_score
                    # Refresh the stored embedding for the updated content.
                    if self.embedding_enabled:
                        vec = self.embedding_client.embed_text(question + " " + answer)
                        if vec:
                            existing_entry.vector_embedding = json.dumps(vec)
                            vector_store.update(existing_entry.id, vec)
                else:
                    logger.info(f"未发现相似条目,正在为工单 {work_order_id} 创建新知识点")
                    embedding_json = None
                    vec = None
                    if self.embedding_enabled:
                        vec = self.embedding_client.embed_text(question + " " + answer)
                        if vec:
                            embedding_json = json.dumps(vec)
                    new_entry = KnowledgeEntry(
                        question=question,
                        answer=answer,
                        category=work_order.category,
                        confidence_score=work_order.satisfaction_score or 0.5,
                        usage_count=1,
                        vector_embedding=embedding_json,
                    )
                    session.add(new_entry)
                    session.flush()  # assigns new_entry.id before commit
                    if vec and new_entry.id:
                        vector_store.add(new_entry.id, vec)

                session.commit()
                logger.info(f"从工单 {work_order_id} 学习知识成功")
                return True
        except Exception as e:
            logger.error(f"从工单学习知识失败: {e}")
            return False

    def _find_similar_entry(self, question: str, session) -> Optional[KnowledgeEntry]:
        """Find a near-duplicate active entry for *question*, or None.

        Tries embedding search first (threshold 0.8), then falls back to
        TF-IDF cosine similarity over all active entries.
        """
        try:
            # Preferred path: embedding similarity.
            if self.embedding_enabled:
                query_vec = self.embedding_client.embed_text(question)
                if query_vec:
                    candidates = vector_store.search(query_vec, top_k=1, threshold=0.8)
                    if candidates:
                        entry_id, score = candidates[0]
                        entry = session.query(KnowledgeEntry).filter(
                            KnowledgeEntry.id == entry_id,
                            KnowledgeEntry.is_active == True
                        ).first()
                        if entry:
                            logger.info(f"Embedding 匹配成功: 相似度 {score:.4f}, ID={entry_id}")
                            return entry

            # Fallback: TF-IDF cosine similarity.
            # FIX: skip when the vectorizer was never fitted (empty KB at
            # startup) — transform() would raise NotFittedError.
            if not hasattr(self.vectorizer, 'vocabulary_'):
                return None

            entries = session.query(KnowledgeEntry).filter(
                KnowledgeEntry.is_active == True
            ).all()
            if not entries:
                return None

            texts = [entry.question for entry in entries]
            question_vector = self.vectorizer.transform([question])
            entry_vectors = self.vectorizer.transform(texts)
            similarities = cosine_similarity(question_vector, entry_vectors)[0]
            max_similarity_idx = np.argmax(similarities)
            max_score = similarities[max_similarity_idx]
            logger.debug(f"TF-IDF 相似度检索: 最高分值={max_score:.4f}")
            if max_score > 0.8:
                return entries[max_similarity_idx]
            return None
        except Exception as e:
            logger.error(f"查找相似条目失败: {e}")
            return None

    def search_knowledge(self, query: str, top_k: int = 3,
                         verified_only: bool = True) -> List[Dict[str, Any]]:
        """Search the knowledge base.

        Prefers embedding semantic retrieval; degrades to keyword matching
        when embeddings are disabled or return no hits.
        """
        try:
            if self.embedding_enabled:
                results = self._search_by_embedding(query, top_k, verified_only)
                if results:
                    return results
                logger.debug("Embedding 检索无结果,降级为关键词匹配")
            return self._search_by_keyword(query, top_k, verified_only)
        except Exception as e:
            logger.error(f"搜索知识库失败: {e}")
            return []

    def _search_by_embedding(self, query: str, top_k: int = 3,
                             verified_only: bool = True) -> List[Dict[str, Any]]:
        """Semantic retrieval via the embedding vector index."""
        try:
            query_vec = self.embedding_client.embed_text(query)
            if query_vec is None:
                return []

            # Over-fetch so DB-side filtering still leaves top_k candidates.
            candidates = vector_store.search(
                query_vector=query_vec,
                top_k=top_k * 3,
                threshold=self.similarity_threshold,
            )
            if not candidates:
                return []

            candidate_ids = [cid for cid, _ in candidates]
            score_map = {cid: score for cid, score in candidates}

            with db_manager.get_session() as session:
                query_filter = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.id.in_(candidate_ids),
                    KnowledgeEntry.is_active == True
                )
                if verified_only:
                    query_filter = query_filter.filter(KnowledgeEntry.is_verified == True)
                entries = query_filter.all()

                # If the verified-only filter empties the result, fall back
                # to all active candidates rather than returning nothing.
                if not entries and verified_only:
                    entries = session.query(KnowledgeEntry).filter(
                        KnowledgeEntry.id.in_(candidate_ids),
                        KnowledgeEntry.is_active == True
                    ).all()

                results = []
                for entry in entries:
                    results.append({
                        "id": entry.id,
                        "question": entry.question,
                        "answer": entry.answer,
                        "category": entry.category,
                        "confidence_score": entry.confidence_score,
                        "similarity_score": score_map.get(entry.id, 0.0),
                        "usage_count": entry.usage_count,
                        "is_verified": entry.is_verified,
                    })

                results.sort(key=lambda x: x['similarity_score'], reverse=True)
                results = results[:top_k]
                logger.info(f"Embedding 搜索 '{query[:30]}' 返回 {len(results)} 个结果")
                return results
        except Exception as e:
            logger.error(f"Embedding 搜索失败: {e}")
            return []

    def _search_by_keyword(self, query: str, top_k: int = 3,
                           verified_only: bool = True) -> List[Dict[str, Any]]:
        """Keyword-match retrieval (degraded mode).

        Scores 1.0 for a substring hit, otherwise a word-overlap ratio
        scaled by 0.8; empty queries return the first top_k entries.
        """
        try:
            with db_manager.get_session() as session:
                query_filter = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.is_active == True
                )
                if verified_only:
                    query_filter = query_filter.filter(KnowledgeEntry.is_verified == True)
                entries = query_filter.all()

                # Verified set empty → fall back to all active entries.
                if not entries and verified_only:
                    entries = session.query(KnowledgeEntry).filter(
                        KnowledgeEntry.is_active == True
                    ).all()

                if not entries:
                    logger.warning("知识库中没有活跃条目")
                    return []

                # Empty query: return everything (capped at top_k).
                if not query.strip():
                    logger.info("查询为空,返回所有条目")
                    return [{
                        "id": entry.id,
                        "question": entry.question,
                        "answer": entry.answer,
                        "category": entry.category,
                        "confidence_score": entry.confidence_score,
                        "similarity_score": 1.0,
                        "usage_count": entry.usage_count,
                        "is_verified": entry.is_verified,
                    } for entry in entries[:top_k]]

                q = query.strip().lower()
                results = []
                for entry in entries:
                    # Match against question and answer combined.
                    search_text = (entry.question + " " + entry.answer).lower()
                    score = 0.0
                    if q in search_text:
                        # Exact substring match wins outright.
                        score = 1.0
                    else:
                        # Token-overlap score, discounted vs. exact match.
                        query_words = q.split()
                        text_words = search_text.split()
                        matched_words = 0
                        for word in query_words:
                            if word in text_words:
                                matched_words += 1
                        if matched_words > 0:
                            score = matched_words / len(query_words) * 0.8

                    if score > 0:
                        results.append({
                            "id": entry.id,
                            "question": entry.question,
                            "answer": entry.answer,
                            "category": entry.category,
                            "confidence_score": entry.confidence_score,
                            "similarity_score": score,
                            "usage_count": entry.usage_count,
                            "is_verified": entry.is_verified,
                        })

                results.sort(key=lambda x: x['similarity_score'], reverse=True)
                results = results[:top_k]
                logger.info(f"搜索查询 '{query}' 返回 {len(results)} 个结果")
                return results
        except Exception as e:
            logger.error(f"搜索知识库失败: {e}")
            return []

    def add_knowledge_entry(
        self,
        question: str,
        answer: str,
        category: str,
        confidence_score: float = 0.5,
        is_verified: bool = False
    ) -> bool:
        """Add a knowledge entry; returns True on success."""
        try:
            # FIX: vec must exist even when embeddings are disabled —
            # it was previously unbound and raised NameError below.
            vec = None
            embedding_json = None
            text_for_embedding = question + " " + answer
            if self.embedding_enabled:
                vec = self.embedding_client.embed_text(text_for_embedding)
                if vec:
                    embedding_json = json.dumps(vec)

            with db_manager.get_session() as session:
                entry = KnowledgeEntry(
                    question=question,
                    answer=answer,
                    category=category,
                    confidence_score=confidence_score,
                    usage_count=0,
                    is_verified=is_verified,
                    vector_embedding=embedding_json,
                )
                session.add(entry)
                session.flush()  # assign the primary key
                entry_id = entry.id
                session.commit()

                # Keep the vector index in sync with the new row.
                if vec and entry_id:
                    vector_store.add(entry_id, vec)

            # Refresh the TF-IDF fallback model with the new corpus.
            self._load_vectorizer()
            logger.info(f"添加知识库条目成功: {question[:50]}...")
            return True
        except Exception as e:
            logger.error(f"添加知识库条目失败: {e}")
            return False

    def update_knowledge_entry(
        self,
        entry_id: int,
        question: str = None,
        answer: str = None,
        category: str = None,
        confidence_score: float = None
    ) -> bool:
        """Partially update an entry; only non-None/truthy fields change."""
        try:
            with db_manager.get_session() as session:
                entry = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.id == entry_id
                ).first()
                if not entry:
                    return False

                content_changed = False
                if question:
                    entry.question = question
                    content_changed = True
                if answer:
                    entry.answer = answer
                    content_changed = True
                if category:
                    entry.category = category
                if confidence_score is not None:
                    entry.confidence_score = confidence_score

                # Re-embed only when the searchable text changed.
                if content_changed and self.embedding_enabled:
                    text_for_embedding = (question or entry.question) + " " + (answer or entry.answer)
                    vec = self.embedding_client.embed_text(text_for_embedding)
                    if vec:
                        entry.vector_embedding = json.dumps(vec)
                        vector_store.update(entry_id, vec)

                entry.updated_at = datetime.now()
                session.commit()
                logger.info(f"更新知识库条目成功: {entry_id}")
                return True
        except Exception as e:
            logger.error(f"更新知识库条目失败: {e}")
            return False

    def get_knowledge_entries(self, page: int = 1, per_page: int = 10) -> Dict[str, Any]:
        """Return active entries, paginated, newest first."""
        try:
            with db_manager.get_session() as session:
                offset = (page - 1) * per_page
                total = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.is_active == True
                ).count()
                entries = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.is_active == True
                ).order_by(KnowledgeEntry.created_at.desc()).offset(offset).limit(per_page).all()

                knowledge_list = []
                for entry in entries:
                    knowledge_list.append({
                        "id": entry.id,
                        "question": entry.question,
                        "answer": entry.answer,
                        "category": entry.category,
                        "confidence_score": entry.confidence_score,
                        "usage_count": entry.usage_count,
                        "created_at": entry.created_at.isoformat() if entry.created_at else None,
                        "is_verified": getattr(entry, 'is_verified', False),
                    })

                return {
                    "knowledge": knowledge_list,
                    "total": total,
                    "page": page,
                    "per_page": per_page,
                    "total_pages": (total + per_page - 1) // per_page,
                }
        except Exception as e:
            logger.error(f"获取知识库条目失败: {e}")
            return {"knowledge": [], "total": 0, "page": 1, "per_page": per_page, "total_pages": 0}

    def verify_knowledge_entry(self, entry_id: int, verified_by: str = "admin") -> bool:
        """Mark an entry as verified; returns False when it doesn't exist."""
        try:
            with db_manager.get_session() as session:
                entry = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.id == entry_id
                ).first()
                if not entry:
                    return False
                entry.is_verified = True
                entry.verified_by = verified_by
                entry.verified_at = datetime.now()
                session.commit()
                logger.info(f"知识库条目验证成功: {entry_id}")
                return True
        except Exception as e:
            logger.error(f"验证知识库条目失败: {e}")
            return False

    def unverify_knowledge_entry(self, entry_id: int) -> bool:
        """Clear an entry's verified flag; returns False when it doesn't exist."""
        try:
            with db_manager.get_session() as session:
                entry = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.id == entry_id
                ).first()
                if not entry:
                    return False
                entry.is_verified = False
                entry.verified_by = None
                entry.verified_at = None
                session.commit()
                logger.info(f"知识库条目取消验证成功: {entry_id}")
                return True
        except Exception as e:
            logger.error(f"取消验证知识库条目失败: {e}")
            return False

    def delete_knowledge_entry(self, entry_id: int) -> bool:
        """Soft-delete an entry and remove it from the retrieval indexes."""
        try:
            with db_manager.get_session() as session:
                entry = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.id == entry_id
                ).first()
                if not entry:
                    logger.warning(f"知识库条目不存在: {entry_id}")
                    return False

                entry.is_active = False
                session.commit()

                # Drop it from the embedding index too.
                vector_store.remove(entry_id)

                # Re-fit TF-IDF on the remaining corpus; best-effort.
                try:
                    self._load_vectorizer()
                except Exception as vectorizer_error:
                    logger.warning(f"重新加载向量化器失败: {vectorizer_error}")

                logger.info(f"删除知识库条目成功: {entry_id}")
                return True
        except Exception as e:
            logger.error(f"删除知识库条目失败: {e}")
            return False

    def get_knowledge_stats(self) -> Dict[str, Any]:
        """Aggregate counts/averages over active (non-deleted) entries."""
        try:
            with db_manager.get_session() as session:
                total_entries = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.is_active == True
                ).count()
                verified_entries = session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.is_active == True,
                    KnowledgeEntry.is_verified == True
                ).count()
                category_stats = session.query(
                    KnowledgeEntry.category,
                    func.count(KnowledgeEntry.id)
                ).filter(
                    KnowledgeEntry.is_active == True
                ).group_by(KnowledgeEntry.category).all()
                avg_confidence = session.query(
                    func.avg(KnowledgeEntry.confidence_score)
                ).filter(
                    KnowledgeEntry.is_active == True
                ).scalar() or 0.0

                return {
                    "total_entries": total_entries,
                    # NOTE: "active_entries" actually carries the VERIFIED
                    # count — callers/frontend rely on this naming.
                    "active_entries": verified_entries,
                    "category_distribution": dict(category_stats),
                    "average_confidence": float(avg_confidence),
                }
        except Exception as e:
            logger.error(f"获取知识库统计失败: {e}")
            return {}

    def update_usage_count(self, entry_ids: List[int]) -> bool:
        """Increment usage_count for the given entry IDs in one statement."""
        try:
            # FIX: skip the DB round-trip entirely for an empty ID list.
            if not entry_ids:
                return True
            with db_manager.get_session() as session:
                session.query(KnowledgeEntry).filter(
                    KnowledgeEntry.id.in_(entry_ids)
                ).update({
                    "usage_count": KnowledgeEntry.usage_count + 1,
                    "updated_at": datetime.now(),
                }, synchronize_session=False)
                session.commit()
                logger.info(f"成功更新 {len(entry_ids)} 个知识库条目的使用次数")
                return True
        except Exception as e:
            logger.error(f"更新知识库使用次数失败: {e}")
            return False

    def get_knowledge_paginated(self, page: int = 1, per_page: int = 10,
                                category_filter: str = '',
                                verified_filter: str = '') -> Dict[str, Any]:
        """Return active entries with optional category/verified filters.

        verified_filter accepts 'true'/'false' (strings, as received from
        query parameters); any other value applies no verified filter.
        """
        try:
            with db_manager.get_session() as session:
                query = session.query(KnowledgeEntry).filter(KnowledgeEntry.is_active == True)
                if category_filter:
                    query = query.filter(KnowledgeEntry.category == category_filter)
                if verified_filter:
                    if verified_filter == 'true':
                        query = query.filter(KnowledgeEntry.is_verified == True)
                    elif verified_filter == 'false':
                        query = query.filter(KnowledgeEntry.is_verified == False)

                query = query.order_by(KnowledgeEntry.created_at.desc())
                total = query.count()
                knowledge_entries = query.offset((page - 1) * per_page).limit(per_page).all()

                knowledge_data = []
                for entry in knowledge_entries:
                    knowledge_data.append({
                        'id': entry.id,
                        'question': entry.question,
                        'answer': entry.answer,
                        'category': entry.category,
                        'confidence_score': entry.confidence_score,
                        'usage_count': entry.usage_count,
                        'is_verified': entry.is_verified,
                        'is_active': entry.is_active,
                        'created_at': entry.created_at.isoformat() if entry.created_at else None,
                        'updated_at': entry.updated_at.isoformat() if entry.updated_at else None,
                    })

                total_pages = (total + per_page - 1) // per_page
                return {
                    'knowledge': knowledge_data,
                    'page': page,
                    'per_page': per_page,
                    'total': total,
                    'total_pages': total_pages,
                }
        except Exception as e:
            logger.error(f"获取分页知识库失败: {e}")
            # Return an empty structure so callers don't have to null-check.
            return {
                'knowledge': [],
                'page': page,
                'per_page': per_page,
                'total': 0,
                'total_pages': 0,
            }