更新readme文档

2026-01-09 16:52:45 +08:00
parent e51cdfea6f
commit b1d0cc5462
22 changed files with 1871 additions and 174 deletions
--- a/data_analysis_agent.py
+++ b/data_analysis_agent.py
@@ -136,30 +136,29 @@ class DataAnalysisAgent:
            print(f"   🔍 分析: {analysis}")


-            # 记录图片信息
-            collected_figures.append(
-                {
-                    "figure_number": figure_number,
-                    "filename": filename,
-                    "file_path": file_path,
-                    "description": description,
-                    "analysis": analysis,
-                }
-            )
+            # 使用seen_paths集合来去重，防止重复收集
+            seen_paths = set()
+            
            # 验证文件是否存在
            # 只有文件真正存在时才加入列表，防止报告出现裂图
            if file_path and os.path.exists(file_path):
-                print(f"   ✅ 文件存在: {file_path}")
-                # 记录图片信息
-                collected_figures.append(
-                    {
-                        "figure_number": figure_number,
-                        "filename": filename,
-                        "file_path": file_path,
-                        "description": description,
-                        "analysis": analysis,
-                    }
-                )
+                # 检查是否已经收集过该路径
+                abs_path = os.path.abspath(file_path)
+                if abs_path not in seen_paths:
+                    print(f"   ✅ 文件存在: {file_path}")
+                    # 记录图片信息
+                    collected_figures.append(
+                        {
+                            "figure_number": figure_number,
+                            "filename": filename,
+                            "file_path": file_path,
+                            "description": description,
+                            "analysis": analysis,
+                        }
+                    )
+                    seen_paths.add(abs_path)
+                else:
+                    print(f"   ⚠️ 跳过重复图片: {file_path}")
            else:
                if file_path:
                    print(f"   ⚠️ 文件不存在: {file_path}")
@@ -224,7 +223,7 @@ class DataAnalysisAgent:
                "continue": True,
            }

-    def analyze(self, user_input: str, files: List[str] = None, session_output_dir: str = None) -> Dict[str, Any]:
+    def analyze(self, user_input: str, files: List[str] = None, session_output_dir: str = None, reset_session: bool = True, max_rounds: int = None) -> Dict[str, Any]:
        """
        开始分析流程

@@ -232,59 +231,91 @@ class DataAnalysisAgent:
            user_input: 用户的自然语言需求
            files: 数据文件路径列表
            session_output_dir: 指定的会话输出目录（可选）
+            reset_session: 是否重置会话 (True: 新开启分析; False: 在现有上下文中继续)
+            max_rounds: 本次分析的最大轮数 (可选，如果不填则使用默认值)

        Returns:
            分析结果字典
        """
-        # 重置状态
-        self.conversation_history = []
-        self.analysis_results = []
-        self.current_round = 0
+        
+        # 确定本次运行的轮数限制
+        current_max_rounds = max_rounds if max_rounds is not None else self.max_rounds

-        # 创建本次分析的专用输出目录
-        if session_output_dir:
-             self.session_output_dir = session_output_dir
+        if reset_session:
+            # --- 初始化新会话 ---
+            self.conversation_history = []
+            self.analysis_results = []
+            self.current_round = 0
+            
+            # 创建本次分析的专用输出目录
+            if session_output_dir:
+                 self.session_output_dir = session_output_dir
+            else:
+                self.session_output_dir = create_session_output_dir(
+                    self.base_output_dir, user_input
+                )
+
+            # 初始化代码执行器，使用会话目录
+            self.executor = CodeExecutor(self.session_output_dir)
+
+            # 设置会话目录变量到执行环境中
+            self.executor.set_variable("session_output_dir", self.session_output_dir)
+
+            # 设用工具生成数据画像
+            data_profile = ""
+            if files:
+                print("🔍 正在生成数据画像...")
+                try:
+                    data_profile = load_and_profile_data(files)
+                    print("✅ 数据画像生成完毕")
+                except Exception as e:
+                    print(f"⚠️ 数据画像生成失败: {e}")
+            
+            # 保存到实例变量供最终报告使用
+            self.data_profile = data_profile
+
+            # 构建初始prompt
+            initial_prompt = f"""用户需求: {user_input}"""
+            if files:
+                initial_prompt += f"\n数据文件: {', '.join(files)}"
+            
+            if data_profile:
+                initial_prompt += f"\n\n{data_profile}\n\n请根据上述【数据画像】中的统计信息（如高频值、缺失率、数据范围）来制定分析策略。如果发现明显的高频问题或异常分布，请优先进行深度分析。"
+
+            print(f"🚀 开始数据分析任务")
+            print(f"📝 用户需求: {user_input}")
+            if files:
+                print(f"📁 数据文件: {', '.join(files)}")
+            print(f"📂 输出目录: {self.session_output_dir}")
+            
+            # 添加到对话历史
+            self.conversation_history.append({"role": "user", "content": initial_prompt})
+            
        else:
-            self.session_output_dir = create_session_output_dir(
-                self.base_output_dir, user_input
-            )
+            # --- 继续现有会话 ---
+            # 如果是追问，且没有指定轮数，默认减少轮数，避免过度分析
+            if max_rounds is None:
+                current_max_rounds = 10 # 追问通常不需要那么长的思考链，10轮足够
+            
+            print(f"\n🚀 继续分析任务 (追问模式)")
+            print(f"📝 后续需求: {user_input}")
+            
+            # 重置当前轮数计数器，以便给新任务足够的轮次
+            self.current_round = 0 
+            
+            # 添加到对话历史
+            # 提示Agent这是后续追问，可以简化步骤
+            follow_up_prompt = f"后续需求: {user_input}\n(注意：这是后续追问，请直接针对该问题进行分析，无需从头开始执行完整SOP。)"
+            self.conversation_history.append({"role": "user", "content": follow_up_prompt})

-
-        # 初始化代码执行器，使用会话目录
-        self.executor = CodeExecutor(self.session_output_dir)
-
-        # 设置会话目录变量到执行环境中
-        self.executor.set_variable("session_output_dir", self.session_output_dir)
-
-        # 设用工具生成数据画像
-        data_profile = ""
-        if files:
-            print("🔍 正在生成数据画像...")
-            data_profile = load_and_profile_data(files)
-            print("✅ 数据画像生成完毕")
-        
-        # 保存到实例变量供最终报告使用
-        self.data_profile = data_profile
-
-        # 构建初始prompt
-        initial_prompt = f"""用户需求: {user_input}"""
-        if files:
-            initial_prompt += f"\n数据文件: {', '.join(files)}"
-        
-        if data_profile:
-            initial_prompt += f"\n\n{data_profile}\n\n请根据上述【数据画像】中的统计信息（如高频值、缺失率、数据范围）来制定分析策略。如果发现明显的高频问题或异常分布，请优先进行深度分析。"
-
-        print(f"🚀 开始数据分析任务")
-        print(f"📝 用户需求: {user_input}")
-        if files:
-            print(f"📁 数据文件: {', '.join(files)}")
-        print(f"📂 输出目录: {self.session_output_dir}")
-        print(f"🔢 最大轮数: {self.max_rounds}")
+        print(f"🔢 本次最大轮数: {current_max_rounds}")
        if self.force_max_rounds:
-            print(f"⚡ 强制模式: 将运行满 {self.max_rounds} 轮（忽略AI完成信号）")
+            print(f"⚡ 强制模式: 将运行满 {current_max_rounds} 轮（忽略AI完成信号）")
        print("=" * 60)
-        # 添加到对话历史
-        self.conversation_history.append({"role": "user", "content": initial_prompt})
+        
+        # 保存原始 max_rounds 以便恢复（虽然 analyze 结束后不需要恢复，但为了逻辑严谨）
+        original_max_rounds = self.max_rounds
+        self.max_rounds = current_max_rounds

        while self.current_round < self.max_rounds:
            self.current_round += 1
@@ -311,6 +342,15 @@ class DataAnalysisAgent:
                process_result = self._process_response(response)

                # 根据处理结果决定是否继续（仅在非强制模式下）
+                if process_result.get("action") == "invalid_response":
+                    consecutive_failures += 1
+                    print(f"⚠️ 连续失败次数: {consecutive_failures}/3")
+                    if consecutive_failures >= 3:
+                        print(f"❌ 连续3次无法获取有效响应，分析终止。请检查网络或配置。")
+                        break
+                else:
+                    consecutive_failures = 0  # 重置计数器
+
                if not self.force_max_rounds and not process_result.get(
                    "continue", True
                ):
@@ -406,6 +446,35 @@ class DataAnalysisAgent:

        print(f"\n📊 开始生成最终分析报告...")
        print(f"📂 输出目录: {self.session_output_dir}")
+        
+        # --- 自动补全/发现图片机制 ---
+        # 扫描目录下所有的png文件
+        try:
+            import glob
+            existing_pngs = glob.glob(os.path.join(self.session_output_dir, "*.png"))
+            
+            # 获取已收集的图片路径集合
+            collected_paths = set()
+            for fig in all_figures:
+                if fig.get("file_path"):
+                    collected_paths.add(os.path.abspath(fig.get("file_path")))
+            
+            # 检查是否有漏网之鱼
+            for png_path in existing_pngs:
+                abs_png_path = os.path.abspath(png_path)
+                if abs_png_path not in collected_paths:
+                    print(f"🔍 [自动发现] 补充未显式收集的图片: {os.path.basename(png_path)}")
+                    all_figures.append({
+                        "figure_number": "Auto",
+                        "filename": os.path.basename(png_path),
+                        "file_path": abs_png_path,
+                        "description": f"自动发现的分析图表: {os.path.basename(png_path)}",
+                        "analysis": "（该图表由系统自动捕获，Agent未提供具体分析文本，请结合图表标题理解）"
+                    })
+        except Exception as e:
+            print(f"⚠️ 自动发现图片失败: {e}")
+        # ---------------------------
+
        print(f"🔢 总轮数: {self.current_round}")
        print(f"📈 收集图片: {len(all_figures)} 个")

@@ -419,28 +488,19 @@ class DataAnalysisAgent:
                max_tokens=16384,  # 设置较大的token限制以容纳完整报告
            )

-            # 解析响应，提取最终报告
-            try:
-                # 尝试解析YAML
-                yaml_data = self.llm.parse_yaml_response(response)
-                
-                # 情况1: 标准YAML格式，包含 action: analysis_complete
-                if yaml_data.get("action") == "analysis_complete":
-                     final_report_content = yaml_data.get("final_report", response)
-                
-                # 情况2: 解析成功但没字段，或者解析失败
-                else:
-                    # 如果内容看起来像Markdown报告（包含标题），直接使用
-                    if "# " in response or "## " in response:
-                        print("⚠️ 未检测到标准YAML动作，但内容疑似Markdown报告，直接采纳")
-                        final_report_content = response
-                    else:
-                        final_report_content = "LLM未返回有效报告内容"
+            # 直接使用LLM响应作为最终报告（因为我们在prompt中要求直接输出Markdown）
+            final_report_content = response
+            
+            # 兼容旧逻辑：如果意外返回了YAML，尝试解析
+            if response.strip().startswith("action:") or "final_report:" in response:
+                try:
+                    yaml_data = self.llm.parse_yaml_response(response)
+                    if yaml_data.get("action") == "analysis_complete":
+                         final_report_content = yaml_data.get("final_report", response)
+                except:
+                    pass # 解析失败则保持原样

-            except Exception as e:
-                # 解析完全失败，直接使用原始响应
-                print(f"⚠️ YAML解析失败 ({e})，直接使用原始响应作为报告")
-                final_report_content = response
+            print("✅ 最终报告生成完成")

            print("✅ 最终报告生成完成")