大改,未验证
This commit is contained in:
@@ -10,6 +10,9 @@ from sqlalchemy import func
|
||||
from ..core.database import db_manager
|
||||
from ..core.models import KnowledgeEntry, WorkOrder, Conversation
|
||||
from ..core.llm_client import QwenClient
|
||||
from ..core.embedding_client import EmbeddingClient
|
||||
from ..core.vector_store import vector_store
|
||||
from ..config.unified_config import get_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -18,12 +21,18 @@ class KnowledgeManager:
|
||||
|
||||
def __init__(self):
    """Create the LLM and embedding clients, the TF-IDF vectorizer and, in embedding mode, the vector index."""
    self.llm_client = QwenClient()
    self.embedding_client = EmbeddingClient()

    # Embedding switch and similarity threshold both come from the unified config.
    self.embedding_enabled = get_config().embedding.enabled
    self.similarity_threshold = get_config().embedding.similarity_threshold

    # No English stop-word list is used because the data is Chinese.
    tfidf_options = {
        "max_features": 1000,
        "stop_words": None,
        "ngram_range": (1, 2),
    }
    self.vectorizer = TfidfVectorizer(**tfidf_options)
    self._load_vectorizer()

    # The vector index is only loaded when embedding mode is switched on.
    if not self.embedding_enabled:
        return
    vector_store.load_from_db()
|
||||
|
||||
def _load_vectorizer(self):
|
||||
"""加载向量化器"""
|
||||
@@ -71,17 +80,34 @@ class KnowledgeManager:
|
||||
existing_entry.updated_at = datetime.now()
|
||||
if work_order.satisfaction_score:
|
||||
existing_entry.confidence_score = work_order.satisfaction_score
|
||||
# 更新 embedding
|
||||
if self.embedding_enabled:
|
||||
vec = self.embedding_client.embed_text(question + " " + answer)
|
||||
if vec:
|
||||
existing_entry.vector_embedding = json.dumps(vec)
|
||||
vector_store.update(existing_entry.id, vec)
|
||||
else:
|
||||
# 创建新条目
|
||||
logger.info(f"未发现相似条目,正在为工单 {work_order_id} 创建新知识点")
|
||||
embedding_json = None
|
||||
vec = None
|
||||
if self.embedding_enabled:
|
||||
vec = self.embedding_client.embed_text(question + " " + answer)
|
||||
if vec:
|
||||
embedding_json = json.dumps(vec)
|
||||
|
||||
new_entry = KnowledgeEntry(
|
||||
question=question,
|
||||
answer=answer,
|
||||
category=work_order.category,
|
||||
confidence_score=work_order.satisfaction_score or 0.5,
|
||||
usage_count=1
|
||||
usage_count=1,
|
||||
vector_embedding=embedding_json
|
||||
)
|
||||
session.add(new_entry)
|
||||
session.flush() # 获取 ID
|
||||
if vec and new_entry.id:
|
||||
vector_store.add(new_entry.id, vec)
|
||||
|
||||
session.commit()
|
||||
logger.info(f"从工单 {work_order_id} 学习知识成功")
|
||||
@@ -94,6 +120,22 @@ class KnowledgeManager:
|
||||
def _find_similar_entry(self, question: str, session) -> Optional[KnowledgeEntry]:
|
||||
"""查找相似的知识库条目"""
|
||||
try:
|
||||
# 优先使用 embedding 查找
|
||||
if self.embedding_enabled:
|
||||
query_vec = self.embedding_client.embed_text(question)
|
||||
if query_vec:
|
||||
candidates = vector_store.search(query_vec, top_k=1, threshold=0.8)
|
||||
if candidates:
|
||||
entry_id, score = candidates[0]
|
||||
entry = session.query(KnowledgeEntry).filter(
|
||||
KnowledgeEntry.id == entry_id,
|
||||
KnowledgeEntry.is_active == True
|
||||
).first()
|
||||
if entry:
|
||||
logger.info(f"Embedding 匹配成功: 相似度 {score:.4f}, ID={entry_id}")
|
||||
return entry
|
||||
|
||||
# 降级:TF-IDF 匹配
|
||||
entries = session.query(KnowledgeEntry).filter(
|
||||
KnowledgeEntry.is_active == True
|
||||
).all()
|
||||
@@ -101,7 +143,6 @@ class KnowledgeManager:
|
||||
if not entries:
|
||||
return None
|
||||
|
||||
# 计算相似度
|
||||
texts = [entry.question for entry in entries]
|
||||
question_vector = self.vectorizer.transform([question])
|
||||
entry_vectors = self.vectorizer.transform(texts)
|
||||
@@ -110,13 +151,11 @@ class KnowledgeManager:
|
||||
max_similarity_idx = np.argmax(similarities)
|
||||
max_score = similarities[max_similarity_idx]
|
||||
|
||||
logger.debug(f"相似度检索完成: 最高分值={max_score:.4f}, 目标ID={entries[max_similarity_idx].id if entries else 'N/A'}")
|
||||
logger.debug(f"TF-IDF 相似度检索: 最高分值={max_score:.4f}")
|
||||
|
||||
if max_score > 0.8: # 相似度阈值
|
||||
logger.info(f"匹配成功: 相似度 {max_score:.4f} 超过阈值 0.8")
|
||||
if max_score > 0.8:
|
||||
return entries[max_similarity_idx]
|
||||
|
||||
logger.debug(f"匹配跳过: 相似度 {max_score:.4f} 未达到阈值 0.8")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
@@ -124,7 +163,85 @@ class KnowledgeManager:
|
||||
return None
|
||||
|
||||
def search_knowledge(self, query: str, top_k: int = 3, verified_only: bool = True) -> List[Dict[str, Any]]:
    """Search the knowledge base, preferring embedding retrieval over keyword matching.

    Args:
        query: Text to search for.
        top_k: Maximum number of results, forwarded to the underlying search.
        verified_only: Forwarded to the underlying search to restrict results
            to verified entries.

    Returns:
        The embedding search results when embedding mode is enabled and that
        search returned something; otherwise the keyword search results.
        An empty list if any exception is raised.
    """
    try:
        use_embedding = self.embedding_enabled
        semantic_hits = (
            self._search_by_embedding(query, top_k, verified_only)
            if use_embedding
            else []
        )
        if semantic_hits:
            return semantic_hits

        if use_embedding:
            # Embedding mode was tried but produced nothing; note the downgrade.
            logger.debug("Embedding 检索无结果,降级为关键词匹配")

        # Fallback path: keyword matching.
        return self._search_by_keyword(query, top_k, verified_only)
    except Exception as e:
        logger.error(f"搜索知识库失败: {e}")
        return []
|
||||
|
||||
def _search_by_embedding(self, query: str, top_k: int = 3, verified_only: bool = True) -> List[Dict[str, Any]]:
    """Run a semantic search over the knowledge base using embedding vectors.

    Args:
        query: Text that is embedded and matched against the vector index.
        top_k: Maximum number of results to return. A non-positive value
            yields an empty list.
        verified_only: Prefer verified entries. When none of the candidates
            is verified, all active candidates are used instead.

    Returns:
        Result dicts with the keys ``id``, ``question``, ``answer``,
        ``category``, ``confidence_score``, ``similarity_score``,
        ``usage_count`` and ``is_verified``, sorted by descending
        ``similarity_score`` and truncated to ``top_k``. An empty list when
        no embedding is available, nothing matches, or an exception is raised.
    """
    # A non-positive limit can never produce results. Returning early also
    # avoids a pointless embedding call and a negative slice, which would
    # silently drop entries from the end of the list instead of limiting it.
    if top_k <= 0:
        return []

    try:
        query_vec = self.embedding_client.embed_text(query)
        # Treat an empty vector the same as a missing one.
        if not query_vec:
            return []

        # Over-fetch from the vector index because the DB filters below may
        # discard some of the candidates.
        candidates = vector_store.search(
            query_vector=query_vec,
            top_k=top_k * 3,
            threshold=self.similarity_threshold,
        )
        if not candidates:
            return []

        candidate_ids = [cid for cid, _ in candidates]
        score_map = {cid: score for cid, score in candidates}

        with db_manager.get_session() as session:
            # Shared base query: active entries among the candidates.
            base_query = session.query(KnowledgeEntry).filter(
                KnowledgeEntry.id.in_(candidate_ids),
                KnowledgeEntry.is_active == True,
            )

            if verified_only:
                entries = base_query.filter(KnowledgeEntry.is_verified == True).all()
                # No verified hit: fall back to every active candidate.
                if not entries:
                    entries = base_query.all()
            else:
                entries = base_query.all()

            # Build plain dicts while the session is still open.
            results = [
                {
                    "id": entry.id,
                    "question": entry.question,
                    "answer": entry.answer,
                    "category": entry.category,
                    "confidence_score": entry.confidence_score,
                    "similarity_score": score_map.get(entry.id, 0.0),
                    "usage_count": entry.usage_count,
                    "is_verified": entry.is_verified,
                }
                for entry in entries
            ]

        results.sort(key=lambda item: item["similarity_score"], reverse=True)
        results = results[:top_k]

        logger.info(f"Embedding 搜索 '{query[:30]}' 返回 {len(results)} 个结果")
        return results

    except Exception as e:
        logger.error(f"Embedding 搜索失败: {e}")
        return []
|
||||
|
||||
def _search_by_keyword(self, query: str, top_k: int = 3, verified_only: bool = True) -> List[Dict[str, Any]]:
|
||||
"""基于关键词的搜索(降级方案)"""
|
||||
try:
|
||||
with db_manager.get_session() as session:
|
||||
# 构建查询条件
|
||||
@@ -221,6 +338,14 @@ class KnowledgeManager:
|
||||
) -> bool:
|
||||
"""添加知识库条目"""
|
||||
try:
|
||||
# 生成 embedding
|
||||
embedding_json = None
|
||||
text_for_embedding = question + " " + answer
|
||||
if self.embedding_enabled:
|
||||
vec = self.embedding_client.embed_text(text_for_embedding)
|
||||
if vec:
|
||||
embedding_json = json.dumps(vec)
|
||||
|
||||
with db_manager.get_session() as session:
|
||||
entry = KnowledgeEntry(
|
||||
question=question,
|
||||
@@ -228,12 +353,18 @@ class KnowledgeManager:
|
||||
category=category,
|
||||
confidence_score=confidence_score,
|
||||
usage_count=0,
|
||||
is_verified=is_verified
|
||||
is_verified=is_verified,
|
||||
vector_embedding=embedding_json
|
||||
)
|
||||
session.add(entry)
|
||||
session.commit()
|
||||
entry_id = entry.id
|
||||
|
||||
# 重新训练向量化器
|
||||
# 更新向量索引
|
||||
if vec and entry_id:
|
||||
vector_store.add(entry_id, vec)
|
||||
|
||||
# 重新训练 TF-IDF 向量化器
|
||||
self._load_vectorizer()
|
||||
|
||||
logger.info(f"添加知识库条目成功: {question[:50]}...")
|
||||
@@ -261,15 +392,26 @@ class KnowledgeManager:
|
||||
if not entry:
|
||||
return False
|
||||
|
||||
content_changed = False
|
||||
if question:
|
||||
entry.question = question
|
||||
content_changed = True
|
||||
if answer:
|
||||
entry.answer = answer
|
||||
content_changed = True
|
||||
if category:
|
||||
entry.category = category
|
||||
if confidence_score is not None:
|
||||
entry.confidence_score = confidence_score
|
||||
|
||||
# 内容变更时重新生成 embedding
|
||||
if content_changed and self.embedding_enabled:
|
||||
text_for_embedding = (question or entry.question) + " " + (answer or entry.answer)
|
||||
vec = self.embedding_client.embed_text(text_for_embedding)
|
||||
if vec:
|
||||
entry.vector_embedding = json.dumps(vec)
|
||||
vector_store.update(entry_id, vec)
|
||||
|
||||
entry.updated_at = datetime.now()
|
||||
session.commit()
|
||||
|
||||
@@ -383,12 +525,14 @@ class KnowledgeManager:
|
||||
entry.is_active = False
|
||||
session.commit()
|
||||
|
||||
# 从向量索引中移除
|
||||
vector_store.remove(entry_id)
|
||||
|
||||
# 重新训练向量化器(如果还有活跃条目)
|
||||
try:
|
||||
self._load_vectorizer()
|
||||
except Exception as vectorizer_error:
|
||||
logger.warning(f"重新加载向量化器失败: {vectorizer_error}")
|
||||
# 即使向量化器加载失败,删除操作仍然成功
|
||||
|
||||
logger.info(f"删除知识库条目成功: {entry_id}")
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user