feat: adjust report format and enforce image persistence

2026-01-06 19:44:17 +08:00
commit fcbdec1298
20 changed files with 11171 additions and 0 deletions
--- a/utils/data_loader.py
+++ b/utils/data_loader.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+import os
+import pandas as pd
+import io
+
+def _read_table(file_path: str, ext: str) -> pd.DataFrame:
+    """Read a ``.csv``, ``.xlsx`` or ``.xls`` file into a DataFrame."""
+    if ext != '.csv':
+        return pd.read_excel(file_path)
+
+    # The CSV encoding is not known up front: try UTF-8, then GBK, and
+    # finally latin1, which maps every byte value and so never fails to decode.
+    try:
+        return pd.read_csv(file_path, encoding='utf-8')
+    except UnicodeDecodeError:
+        pass
+    try:
+        return pd.read_csv(file_path, encoding='gbk')
+    except Exception:
+        return pd.read_csv(file_path, encoding='latin1')
+
+
+def _profile_column(name, series: pd.Series, total_rows: int) -> list:
+    """Return the Markdown fragments that describe a single column."""
+    types = pd.api.types
+    dtype = series.dtype
+    lines = [f"#### {name} ({dtype})\n"]
+
+    null_count = int(series.isnull().sum())
+    if null_count > 0:
+        # A positive null count implies total_rows > 0, so no division by zero.
+        null_ratio = (null_count / total_rows) * 100
+        lines.append(f"- ⚠️ 空值: {null_count} ({null_ratio:.1f}%)\n")
+
+    # pandas reports bool as a numeric dtype, but describe() on a bool Series
+    # yields count/unique/top/freq (no min/max/mean), so booleans are profiled
+    # as categorical values instead.
+    is_bool = types.is_bool_dtype(dtype)
+
+    if types.is_numeric_dtype(dtype) and not is_bool:
+        desc = series.describe()
+        lines.append(
+            f"- 统计: Min={desc['min']:.2f}, Max={desc['max']:.2f}, Mean={desc['mean']:.2f}\n"
+        )
+    elif (
+        is_bool
+        or types.is_object_dtype(dtype)
+        or types.is_string_dtype(dtype)
+        or isinstance(dtype, pd.CategoricalDtype)
+    ):
+        unique_count = series.nunique()
+        lines.append(f"- 唯一值数量: {unique_count}\n")
+
+        # Show the most frequent values whenever the column has any non-null
+        # value; this is what surfaces "high-frequency issues" in the report.
+        if unique_count > 0:
+            top_n = series.value_counts().head(5)
+            top_items_str = ", ".join(f"{k}({v})" for k, v in top_n.items())
+            lines.append(f"- **TOP 5 高频值**: {top_items_str}\n")
+    elif types.is_datetime64_any_dtype(dtype):
+        lines.append(f"- 范围: {series.min()} 至 {series.max()}\n")
+
+    lines.append("\n")
+    return lines
+
+
+def load_and_profile_data(file_paths: list) -> str:
+    """
+    Load each data file and build a Markdown data-profile report.
+
+    For every path the report contains a section with the table dimensions,
+    the column names and, per column, the dtype, the null count (when
+    non-zero) and a type-specific summary: min/max/mean for numeric columns,
+    unique count plus the five most frequent values for text, categorical
+    and boolean columns, and the min/max range for datetime columns.
+
+    Missing files, unsupported extensions and read/analysis errors are
+    reported inline in the returned text; they are not raised.
+
+    Args:
+        file_paths: List of paths to ``.csv``, ``.xlsx`` or ``.xls`` files.
+
+    Returns:
+        The data profile as a Markdown string.
+    """
+    # Collect fragments and join once instead of repeated string concatenation.
+    parts = ["# 数据画像报告 (Data Profile)\n\n"]
+
+    if not file_paths:
+        parts.append("未提供数据文件。")
+        return "".join(parts)
+
+    for file_path in file_paths:
+        parts.append(f"## 文件: {os.path.basename(file_path)}\n\n")
+
+        if not os.path.exists(file_path):
+            parts.append(f"⚠️ 文件不存在: {file_path}\n\n")
+            continue
+
+        # The loader is chosen from the (case-insensitive) file extension.
+        ext = os.path.splitext(file_path)[1].lower()
+        if ext not in ('.csv', '.xlsx', '.xls'):
+            parts.append(f"⚠️ 不支持的文件格式: {ext}\n\n")
+            continue
+
+        try:
+            df = _read_table(file_path, ext)
+
+            rows, cols = df.shape
+            parts.append(f"- **维度**: {rows} 行 x {cols} 列\n")
+            # Column labels are not always strings (e.g. numeric Excel headers).
+            parts.append(f"- **列名**: `{', '.join(map(str, df.columns))}`\n\n")
+
+            parts.append("### 列详细分布:\n")
+
+            # items() yields one Series per column even if labels are duplicated.
+            for col, series in df.items():
+                parts.extend(_profile_column(col, series, rows))
+
+        except Exception as e:
+            # Best-effort report: one unreadable file must not abort the others.
+            parts.append(f"❌ 读取或分析文件失败: {str(e)}\n\n")
+
+    return "".join(parts)