feat: 重大功能更新 v1.4.0 - 飞书集成、AI语义相似度、前端优化
主要更新内容: - 🚀 飞书多维表格集成,支持工单数据同步 - 🤖 AI建议与人工描述语义相似度计算 - 🎨 前端UI全面优化,现代化设计 - 📊 智能知识库入库策略(AI准确率<90%使用人工描述) - 🔧 代码重构,模块化架构优化 - 📚 完整文档整合和更新 - 🐛 修复配置导入和数据库字段问题 技术特性: - 使用sentence-transformers进行语义相似度计算 - 快速模式结合TF-IDF和语义方法 - 响应式设计,支持移动端 - 加载状态和动画效果 - 配置化AI准确率阈值
This commit is contained in:
@@ -76,18 +76,24 @@ def extract_keywords(text: str, max_keywords: int = 10) -> List[str]:
|
||||
return [word for word, count in sorted_words[:max_keywords]]
|
||||
|
||||
def calculate_similarity(text1: str, text2: str) -> float:
    """Compute the similarity between two texts using semantic similarity.

    Tries the sentence-transformers based semantic method first; if that
    fails (model missing, import error, etc.) it falls back to a plain
    TF-IDF cosine similarity.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        Similarity score in [0, 1]; 0.0 if both methods fail.
    """
    try:
        from src.utils.semantic_similarity import calculate_semantic_similarity
        return calculate_semantic_similarity(text1, text2)
    except Exception as e:
        logging.error(f"计算语义相似度失败: {e}")
        # Fall back to the traditional TF-IDF method.
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.metrics.pairwise import cosine_similarity

            vectorizer = TfidfVectorizer()
            vectors = vectorizer.fit_transform([text1, text2])
            similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
            return float(similarity)
        except Exception as e2:
            logging.error(f"计算TF-IDF相似度失败: {e2}")
            return 0.0
|
||||
|
||||
def format_time_duration(seconds: float) -> str:
|
||||
"""格式化时间持续时间"""
|
||||
|
||||
256
src/utils/semantic_similarity.py
Normal file
256
src/utils/semantic_similarity.py
Normal file
@@ -0,0 +1,256 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
语义相似度计算服务
|
||||
使用sentence-transformers进行更准确的语义相似度计算
|
||||
"""
|
||||
|
||||
import logging
|
||||
import numpy as np
|
||||
from typing import List, Tuple, Optional
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import torch
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SemanticSimilarityCalculator:
    """Semantic similarity calculator.

    Scores text pairs with a sentence-transformers model and transparently
    falls back to TF-IDF cosine similarity whenever the model cannot be
    loaded or fails at runtime.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the calculator and eagerly load the model.

        Args:
            model_name: Name of the pretrained sentence-transformers model.
                - all-MiniLM-L6-v2: English model, fast; recommended for production
                - paraphrase-multilingual-MiniLM-L12-v2: multilingual, supports Chinese
                - paraphrase-multilingual-mpnet-base-v2: multilingual, higher accuracy
        """
        self.model_name = model_name
        # None means "model unavailable": every scoring call then routes to TF-IDF.
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the pretrained model; on any failure leave ``self.model`` as None."""
        try:
            logger.info(f"正在加载语义相似度模型: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            logger.info("语义相似度模型加载成功")
        except Exception as e:
            logger.error(f"加载语义相似度模型失败: {e}")
            # Degrade gracefully instead of raising at construction time.
            self.model = None

    def calculate_similarity(self, text1: str, text2: str, fast_mode: bool = True) -> float:
        """Score the semantic similarity of two texts.

        Args:
            text1: First text.
            text2: Second text.
            fast_mode: When True, use a cheap TF-IDF pre-screen and only run
                the model on mid-range scores, blending both results.

        Returns:
            Similarity score in the range [0, 1].
        """
        if not text1 or not text2:
            return 0.0

        try:
            if fast_mode:
                tfidf_sim = self._calculate_tfidf_similarity(text1, text2)
                # Confident at the extremes — skip the expensive model call.
                if tfidf_sim >= 0.9 or tfidf_sim <= 0.3:
                    return tfidf_sim
                if self.model is None:
                    return tfidf_sim
                # Mid-range score: blend lexical and semantic evidence.
                semantic_sim = self._calculate_semantic_similarity(text1, text2)
                return tfidf_sim * 0.3 + semantic_sim * 0.7

            # Full mode: semantic when a model is available, TF-IDF otherwise.
            if self.model is None:
                return self._calculate_tfidf_similarity(text1, text2)
            return self._calculate_semantic_similarity(text1, text2)

        except Exception as e:
            logger.error(f"计算语义相似度失败: {e}")
            return self._calculate_tfidf_similarity(text1, text2)

    def _calculate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Encode both texts and return their cosine similarity clamped to [0, 1]."""
        try:
            vec_a, vec_b = self.model.encode([text1, text2])
            similarity = self._cosine_similarity(vec_a, vec_b)
            # Embedding cosines can stray slightly outside [0, 1]; clamp.
            similarity = min(1.0, max(0.0, similarity))
            logger.debug(f"语义相似度计算: {similarity:.4f}")
            return float(similarity)
        except Exception as e:
            logger.error(f"语义相似度计算失败: {e}")
            return self._calculate_tfidf_similarity(text1, text2)

    def _calculate_tfidf_similarity(self, text1: str, text2: str) -> float:
        """TF-IDF cosine similarity — the fallback scoring method."""
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.metrics.pairwise import cosine_similarity

            matrix = TfidfVectorizer(max_features=1000, stop_words=None).fit_transform(
                [text1, text2]
            )
            similarity = cosine_similarity(matrix[0:1], matrix[1:2])[0][0]
            logger.debug(f"TF-IDF相似度计算: {similarity:.4f}")
            return float(similarity)
        except Exception as e:
            logger.error(f"TF-IDF相似度计算失败: {e}")
            return 0.0

    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Cosine similarity of two 1-D vectors; 0.0 when either has zero norm."""
        try:
            norm_a = np.linalg.norm(vec1)
            norm_b = np.linalg.norm(vec2)
            # Guard against division by zero for degenerate vectors.
            if norm_a == 0 or norm_b == 0:
                return 0.0
            return float(np.dot(vec1, vec2) / (norm_a * norm_b))
        except Exception as e:
            logger.error(f"余弦相似度计算失败: {e}")
            return 0.0

    def batch_calculate_similarity(self, text_pairs: List[Tuple[str, str]]) -> List[float]:
        """Score many text pairs at once.

        Args:
            text_pairs: List of (text1, text2) pairs.

        Returns:
            One similarity score per input pair.
        """
        if not text_pairs:
            return []

        try:
            if self.model is None:
                return [self._calculate_tfidf_similarity(a, b) for a, b in text_pairs]
            return self._batch_semantic_similarity(text_pairs)
        except Exception as e:
            logger.error(f"批量相似度计算失败: {e}")
            return [0.0] * len(text_pairs)

    def _batch_semantic_similarity(self, text_pairs: List[Tuple[str, str]]) -> List[float]:
        """Encode all pair members in one model call, then score each pair."""
        try:
            # Flatten pairs so the model encodes everything in a single batch.
            flat_texts = [text for pair in text_pairs for text in pair]
            embeddings = self.model.encode(flat_texts)

            # Even indices are the first elements, odd indices the second.
            return [
                float(self._cosine_similarity(left, right))
                for left, right in zip(embeddings[::2], embeddings[1::2])
            ]
        except Exception as e:
            logger.error(f"批量语义相似度计算失败: {e}")
            return [self._calculate_tfidf_similarity(a, b) for a, b in text_pairs]

    def get_similarity_explanation(self, text1: str, text2: str, similarity: float) -> str:
        """Map a similarity score to a human-readable recommendation.

        Args:
            text1: First text (unused in the mapping; kept for API symmetry).
            text2: Second text (unused in the mapping; kept for API symmetry).
            similarity: Similarity score in [0, 1].

        Returns:
            Explanation string describing the score band.
        """
        bands = (
            (0.95, "语义高度相似,建议自动审批"),
            (0.8, "语义较为相似,建议人工审核"),
            (0.6, "语义部分相似,需要人工判断"),
            (0.4, "语义相似度较低,建议重新生成"),
        )
        for lower_bound, verdict in bands:
            if similarity >= lower_bound:
                return verdict
        return "语义差异较大,建议重新生成"

    def is_model_available(self) -> bool:
        """Return True when the semantic model loaded successfully."""
        return self.model is not None
|
||||
|
||||
# Module-level singleton instance, created lazily on first use.
_similarity_calculator = None


def get_similarity_calculator() -> SemanticSimilarityCalculator:
    """Return the process-wide similarity calculator, creating it on first call."""
    global _similarity_calculator
    if _similarity_calculator is None:
        # Model loading is expensive; do it exactly once per process.
        _similarity_calculator = SemanticSimilarityCalculator()
    return _similarity_calculator
|
||||
|
||||
def calculate_semantic_similarity(text1: str, text2: str, fast_mode: bool = True) -> float:
    """Convenience wrapper: score two texts with the shared calculator.

    Args:
        text1: First text.
        text2: Second text.
        fast_mode: Whether to use the fast TF-IDF pre-screen mode.

    Returns:
        Similarity score in [0, 1].
    """
    return get_similarity_calculator().calculate_similarity(text1, text2, fast_mode)
|
||||
|
||||
def batch_calculate_semantic_similarity(text_pairs: List[Tuple[str, str]]) -> List[float]:
    """Convenience wrapper: score many text pairs with the shared calculator.

    Args:
        text_pairs: List of (text1, text2) pairs.

    Returns:
        One similarity score per input pair.
    """
    return get_similarity_calculator().batch_calculate_similarity(text_pairs)
|
||||
Reference in New Issue
Block a user