feat: Introduce LLM response caching and streaming, add application configuration, and enhance session data with progress and history tracking.
@@ -2,6 +2,17 @@
import os
import pandas as pd
import io
import hashlib
from pathlib import Path
from typing import Optional, Iterator
from config.app_config import app_config
from utils.cache_manager import CacheManager

# Initialize the cache manager
data_cache = CacheManager(
    cache_dir=app_config.cache_dir,
    enabled=app_config.data_cache_enabled
)
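# Note (editor's annotation, not part of this commit): utils/cache_manager.py
# is not shown in this diff. From its use below, the assumed interface is:
#   data_cache.get(key)        -> cached object, or None on a miss
#   data_cache.set(key, value) -> store the object under the key
# Entries live under app_config.cache_dir, and the whole layer can be
# switched off via app_config.data_cache_enabled.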

def load_and_profile_data(file_paths: list) -> str:
    """
@@ -88,3 +99,119 @@ def load_and_profile_data(file_paths: list) -> str:
            profile_summary += f"❌ Failed to read or analyze file: {str(e)}\n\n"

    return profile_summary


def get_file_hash(file_path: str) -> str:
    """Compute a file hash, used as the cache key."""
    hasher = hashlib.md5()
    hasher.update(file_path.encode())

    # Mix in the file's modification time so the key changes when the file does
    if os.path.exists(file_path):
        mtime = os.path.getmtime(file_path)
        hasher.update(str(mtime).encode())

    return hasher.hexdigest()
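
# Illustrative sketch only, not part of this commit: because the key is salted
# with mtime, it rotates whenever the file changes on disk, so stale cache
# entries are never served for modified files. The scratch path is hypothetical.
def _demo_key_rotation(path: str = "/tmp/demo_hash.csv") -> None:
    Path(path).write_text("a,b\n1,2\n")
    key_before = get_file_hash(path)
    os.utime(path, (0, 0))  # force a different mtime (epoch)
    key_after = get_file_hash(path)
    print("key rotated:", key_before != key_after)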

def load_data_chunked(file_path: str, chunksize: Optional[int] = None) -> Iterator[pd.DataFrame]:
    """
    Stream a large file, yielding it as DataFrame chunks.

    Args:
        file_path: Path to the file
        chunksize: Rows per chunk; defaults to the configured value

    Yields:
        DataFrame chunks
    """
    if chunksize is None:
        chunksize = app_config.chunk_size

    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.csv':
        # Try multiple encodings. Caveat: a UnicodeDecodeError raised
        # mid-iteration falls through to the next encoding and restarts
        # from the top of the file, re-yielding any chunks already emitted.
        for encoding in ['utf-8', 'gbk', 'latin1']:
            try:
                chunks = pd.read_csv(file_path, encoding=encoding, chunksize=chunksize)
                for chunk in chunks:
                    yield chunk
                break
            except UnicodeDecodeError:
                continue
            except Exception as e:
                print(f"❌ Failed to read CSV file: {e}")
                break
    elif ext in ['.xlsx', '.xls']:
        # Excel readers do not support chunksize; read the whole file
        try:
            df = pd.read_excel(file_path)
            # Chunk manually
            for i in range(0, len(df), chunksize):
                yield df.iloc[i:i + chunksize]
        except Exception as e:
            print(f"❌ Failed to read Excel file: {e}")
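
# Illustrative sketch only, not part of this commit: a streaming column total
# over a large CSV without holding it all in memory ("amount" stands in for
# whatever numeric column the data actually has).
def _demo_chunked_total(path: str) -> float:
    total = 0.0
    for chunk in load_data_chunked(path, chunksize=50_000):
        total += float(chunk["amount"].sum())
    return total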

def load_data_with_cache(file_path: str, force_reload: bool = False) -> Optional[pd.DataFrame]:
    """
    Load data with caching.

    Args:
        file_path: Path to the file
        force_reload: Whether to bypass the cache and reload from disk

    Returns:
        A DataFrame, or None on failure
    """
    if not os.path.exists(file_path):
        print(f"⚠️ File does not exist: {file_path}")
        return None

    # Check the file size
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

    # For large files, recommend streaming instead
    if file_size_mb > app_config.max_file_size_mb:
        print(f"⚠️ Large file ({file_size_mb:.1f}MB); consider streaming it with load_data_chunked()")

    # Build the cache key
    cache_key = get_file_hash(file_path)

    # Try the cache first
    if not force_reload and app_config.data_cache_enabled:
        cached_data = data_cache.get(cache_key)
        if cached_data is not None:
            print(f"💾 Loaded data from cache: {os.path.basename(file_path)}")
            return cached_data

    # Load from disk
    ext = os.path.splitext(file_path)[1].lower()
    df = None

    try:
        if ext == '.csv':
            # Try multiple encodings
            for encoding in ['utf-8', 'gbk', 'latin1']:
                try:
                    df = pd.read_csv(file_path, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue
        elif ext in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path)
        else:
            print(f"⚠️ Unsupported file format: {ext}")
            return None

        # Cache the loaded data
        if df is not None and app_config.data_cache_enabled:
            data_cache.set(cache_key, df)
            print(f"✅ Data cached: {os.path.basename(file_path)}")

        return df

    except Exception as e:
        print(f"❌ Failed to load data: {e}")
        return None
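
# Illustrative sketch only, not part of this commit: the second call should be
# a cache hit (same path, unchanged mtime, hence the same key), while
# force_reload=True always goes back to disk.
def _demo_cached_load(path: str) -> None:
    first = load_data_with_cache(path)             # disk read, then cached
    second = load_data_with_cache(path)            # served from the cache
    load_data_with_cache(path, force_reload=True)  # bypasses the cache
    if first is not None and second is not None:
        print("shapes match:", first.shape == second.shape)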