feat: Introduce LLM response caching and streaming, add application configuration, and enhance session data with progress and history tracking.

This commit is contained in:
2026-01-24 12:52:35 +08:00
parent 162f5c4da4
commit fbbb5a2470
10 changed files with 1015 additions and 4 deletions

View File

@@ -5,8 +5,17 @@ LLM调用辅助模块
import asyncio
import yaml
from typing import Optional, Callable, AsyncIterator
from config.llm_config import LLMConfig
from config.app_config import app_config
from utils.fallback_openai_client import AsyncFallbackOpenAIClient
from utils.cache_manager import LLMCacheManager
# Module-wide LLM response cache. Both its storage directory and its
# on/off switch are read from the application configuration at import time.
llm_cache = LLMCacheManager(
    cache_dir=app_config.llm_cache_dir,
    enabled=app_config.llm_cache_enabled,
)
class LLMHelper:
"""LLM调用辅助类支持同步和异步调用"""
@@ -82,6 +91,104 @@ class LLMHelper:
print(f"原始响应: {response}")
return {}
async def close(self):
    """Close the underlying LLM client.

    Awaits ``self.client.close()`` exactly once, so the client is never
    asked to shut down a second time after it has already been closed.
    """
    await self.client.close()
async def async_call_with_cache(
    self,
    prompt: str,
    system_prompt: Optional[str] = None,
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    use_cache: bool = True
) -> str:
    """Asynchronous LLM call backed by the module-level response cache.

    Args:
        prompt: User message sent to the model.
        system_prompt: Optional system message placed before the user message.
        max_tokens: Forwarded unchanged to ``async_call``.
        temperature: Forwarded unchanged to ``async_call``.
        use_cache: Per-call switch. Caching is only active when this is true
            AND ``app_config.llm_cache_enabled`` is true.

    Returns:
        The cached response when a non-empty entry exists for the request,
        otherwise the fresh result of ``async_call``.
    """
    caching_active = use_cache and app_config.llm_cache_enabled
    if not caching_active:
        # Skip building messages and hashing a key that would never be used.
        return await self.async_call(prompt, system_prompt, max_tokens, temperature)

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    # NOTE(review): the key is derived from the messages and the model name
    # only, so calls that differ solely in max_tokens/temperature share one
    # cache entry. Confirm that this is intended.
    cache_key = llm_cache.get_cache_key_from_messages(messages, self.config.model)

    cached_response = llm_cache.get(cache_key)
    if cached_response:
        print("💾 使用LLM缓存响应")
        return cached_response

    response = await self.async_call(prompt, system_prompt, max_tokens, temperature)

    # Empty/falsy responses are not stored, so a failed call is retried next time.
    if response:
        llm_cache.set(cache_key, response)
    return response
def call_with_cache(
    self,
    prompt: str,
    system_prompt: Optional[str] = None,
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    use_cache: bool = True
) -> str:
    """Synchronous wrapper around ``async_call_with_cache``.

    Runs the coroutine to completion on the current event loop, creating and
    installing a new loop when none is available. All arguments are passed
    through to ``async_call_with_cache`` unchanged.

    Returns:
        Whatever ``async_call_with_cache`` returns for the same arguments.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    if loop.is_running():
        # Re-entering a running loop (e.g. from a notebook or from inside
        # another coroutine) needs nest_asyncio. Import it only here so the
        # plain synchronous path neither requires the package nor patches
        # asyncio globally.
        import nest_asyncio
        nest_asyncio.apply()

    return loop.run_until_complete(
        self.async_call_with_cache(prompt, system_prompt, max_tokens, temperature, use_cache)
    )
async def async_call_stream(
    self,
    prompt: str,
    system_prompt: Optional[str] = None,
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    callback: Optional[Callable[[str], None]] = None
) -> AsyncIterator[str]:
    """Stream an LLM completion as an async generator of text fragments.

    Args:
        prompt: User message sent to the model.
        system_prompt: Optional system message placed before the user message.
        max_tokens: Token limit; ``None`` falls back to ``self.config.max_tokens``.
        temperature: Sampling temperature; ``None`` falls back to
            ``self.config.temperature``. An explicit ``0`` / ``0.0`` is honoured.
        callback: Optional callable invoked with each fragment just before
            that fragment is yielded.

    Yields:
        Each non-empty content fragment in arrival order. If the request or
        the stream raises, the error is printed and a single empty string is
        yielded instead of propagating the exception.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    # Compare against None rather than using `or`, so that a caller-supplied
    # 0 / 0.0 is not silently replaced by the configured default.
    kwargs = {
        'stream': True,
        'max_tokens': self.config.max_tokens if max_tokens is None else max_tokens,
        'temperature': self.config.temperature if temperature is None else temperature
    }

    try:
        response = await self.client.chat_completions_create(
            messages=messages,
            **kwargs
        )
        async for chunk in response:
            # Some chunks carry no choices at all; indexing [0] on them
            # would raise IndexError and abort the whole stream.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if not content:
                continue
            if callback:
                callback(content)
            yield content
    except Exception as e:
        # Best-effort: report the failure and end the stream with an empty
        # fragment rather than raising into the consumer.
        print(f"流式LLM调用失败: {e}")
        yield ""