feat: Introduce LLM response caching and streaming, add application configuration, and enhance session data with progress and history tracking.

This commit is contained in:
2026-01-24 12:52:35 +08:00
parent 162f5c4da4
commit fbbb5a2470
10 changed files with 1015 additions and 4 deletions

View File

@@ -2,6 +2,17 @@
import os
import pandas as pd
import io
import hashlib
from pathlib import Path
from typing import Optional, Iterator
from config.app_config import app_config
from utils.cache_manager import CacheManager
# Module-level cache manager shared by the loaders in this module.
# Location and on/off switch come from app_config; when disabled the
# CacheManager is still constructed but (presumably) no-ops — confirm
# against utils.cache_manager.
data_cache = CacheManager(
    cache_dir=app_config.cache_dir,
    enabled=app_config.data_cache_enabled
)
def load_and_profile_data(file_paths: list) -> str:
"""
@@ -88,3 +99,119 @@ def load_and_profile_data(file_paths: list) -> str:
profile_summary += f"❌ 读取或分析文件失败: {str(e)}\n\n"
return profile_summary
def get_file_hash(file_path: str) -> str:
    """Build a cache key for *file_path*.

    The key is the MD5 hex digest of the path string plus, when the file
    exists, its last-modification time — so the key changes whenever the
    file is replaced or touched, invalidating stale cache entries.
    """
    key_parts = [file_path]
    if os.path.exists(file_path):
        # Fold the mtime in so edits to the file produce a fresh key.
        key_parts.append(str(os.path.getmtime(file_path)))
    return hashlib.md5("".join(key_parts).encode()).hexdigest()
def load_data_chunked(file_path: str, chunksize: Optional[int] = None) -> Iterator[pd.DataFrame]:
    """Stream a large file as successive DataFrame chunks.

    Args:
        file_path: Path to a ``.csv``, ``.xlsx`` or ``.xls`` file.
        chunksize: Rows per chunk; defaults to ``app_config.chunk_size``.

    Yields:
        pd.DataFrame: Chunks of at most ``chunksize`` rows. On read
        failure an error is printed and iteration stops (best-effort,
        matching the module's other loaders).
    """
    if chunksize is None:
        chunksize = app_config.chunk_size
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.csv':
        # pandas decodes CSV data lazily, so a UnicodeDecodeError can
        # surface in the middle of iteration — after chunks were already
        # yielded. Track that: retrying with another encoding after
        # partial output would restart from row 0 and duplicate rows
        # (this was a bug in the previous version).
        for encoding in ['utf-8', 'gbk', 'latin1']:
            yielded = False
            try:
                for chunk in pd.read_csv(file_path, encoding=encoding, chunksize=chunksize):
                    yielded = True
                    yield chunk
                break
            except UnicodeDecodeError:
                if yielded:
                    # Partial data already emitted — do not restart.
                    print("❌ 读取CSV文件失败: 文件编码不一致")
                    break
                continue
            except Exception as e:
                print(f"❌ 读取CSV文件失败: {e}")
                break
        else:
            # Every candidate encoding failed before producing any data.
            print("❌ 读取CSV文件失败: 无法识别文件编码")
    elif ext in ['.xlsx', '.xls']:
        # Excel has no native chunked reader: load fully, then slice.
        try:
            df = pd.read_excel(file_path)
            for start in range(0, len(df), chunksize):
                yield df.iloc[start:start + chunksize]
        except Exception as e:
            print(f"❌ 读取Excel文件失败: {e}")
def load_data_with_cache(file_path: str, force_reload: bool = False) -> Optional[pd.DataFrame]:
    """Load a data file, serving from the module-level cache when possible.

    Args:
        file_path: Path to a ``.csv``, ``.xlsx`` or ``.xls`` file.
        force_reload: When True, skip the cache lookup and re-read the file
            (the fresh result is still written back to the cache).

    Returns:
        The loaded DataFrame, or None if the file is missing, the format
        is unsupported, or reading failed (an error message is printed).
    """
    if not os.path.exists(file_path):
        print(f"⚠️ 文件不存在: {file_path}")
        return None
    # Warn (but still proceed) for files over the configured size limit;
    # callers should prefer load_data_chunked() for those.
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    if file_size_mb > app_config.max_file_size_mb:
        print(f"⚠️ 文件过大 ({file_size_mb:.1f}MB),建议使用 load_data_chunked() 流式处理")
    # Cache key incorporates the file's mtime, so edits invalidate it.
    cache_key = get_file_hash(file_path)
    if not force_reload and app_config.data_cache_enabled:
        cached_data = data_cache.get(cache_key)
        if cached_data is not None:
            print(f"💾 从缓存加载数据: {os.path.basename(file_path)}")
            return cached_data
    ext = os.path.splitext(file_path)[1].lower()
    df = None
    try:
        if ext == '.csv':
            # Try common encodings in order; first one that decodes wins.
            for encoding in ['utf-8', 'gbk', 'latin1']:
                try:
                    df = pd.read_csv(file_path, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue
            if df is None:
                # Fix: previously this case fell through and returned None
                # silently; now report it like every other failure path.
                print("❌ 加载数据失败: 无法识别文件编码")
                return None
        elif ext in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path)
        else:
            print(f"⚠️ 不支持的文件格式: {ext}")
            return None
        if df is not None and app_config.data_cache_enabled:
            data_cache.set(cache_key, df)
            print(f"✅ 数据已缓存: {os.path.basename(file_path)}")
        return df
    except Exception as e:
        print(f"❌ 加载数据失败: {e}")
        return None