Waiting to start...+
No report generated yet.
+diff --git a/.env.example b/.env.example index 25df76c..bfcc831 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,7 @@ # 火山引擎配置 OPENAI_API_KEY=sk-c44i1hy64xgzwox6x08o4zug93frq6rgn84oqugf2pje1tg4 -OPENAI_BASE_URL=https://api.xiaomimimo.com/v1/chat/completions +OPENAI_BASE_URL=https://api.xiaomimimo.com/v1 # 文本模型 OPENAI_MODEL=mimo-v2-flash # OPENAI_MODEL=deepseek-r1-250528 diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 5d9664b..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2025 Data Analysis Agent Team - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/README.md b/README.md index 1f624d0..ce07d7d 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,7 @@ agent = DataAnalysisAgent(llm_config) # 开始分析 files = ["your_data.csv"] report = agent.analyze( - user_input="分析销售数据,生成趋势图表和关键指标", + user_input="分析XXXXXXXXX数据,生成趋势图表和关键指标", files=files ) @@ -191,9 +191,9 @@ report = quick_analysis( ```python # 示例:茅台财务分析 -files = ["贵州茅台利润表.csv"] +files = ["XXXXXXXXx.csv"] report = agent.analyze( - user_input="基于贵州茅台的数据,输出五个重要的统计指标,并绘制相关图表。最后生成汇报给我。", + user_input="基于数据,输出五个重要的统计指标,并绘制相关图表。最后生成汇报给我。", files=files ) ``` diff --git a/bootstrap.py b/bootstrap.py new file mode 100644 index 0000000..a6233c8 --- /dev/null +++ b/bootstrap.py @@ -0,0 +1,62 @@ +import sys +import subprocess +import importlib.metadata +import os + +def check_dependencies(): + """Checks if dependencies in requirements.txt are installed.""" + requirements_file = "requirements.txt" + if not os.path.exists(requirements_file): + print(f"Warning: {requirements_file} not found. Skipping dependency check.") + return + + print("Checking dependencies...") + missing_packages = [] + + with open(requirements_file, "r") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + + # Simple parsing for package name. + # This handles 'package>=version', 'package==version', 'package' + # It does NOT handle complex markers perfectly, but suffices for basic checking. 
+ package_name = line.split("=")[0].split(">")[0].split("<")[0].strip() + + try: + importlib.metadata.version(package_name) + except importlib.metadata.PackageNotFoundError: + missing_packages.append(line) + + if missing_packages: + print(f"Missing dependencies: {', '.join(missing_packages)}") + print("Installing missing dependencies...") + try: + subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", requirements_file]) + print("Dependencies installed successfully.") + except subprocess.CalledProcessError as e: + print(f"Error installing dependencies: {e}") + sys.exit(1) + else: + print("All dependencies checked.") + +def main(): + check_dependencies() + + print("Starting application...") + try: + # Run the main application + # Using sys.executable ensures we use the same python interpreter + subprocess.run([sys.executable, "main.py"], check=True) + except subprocess.CalledProcessError as e: + print(f"Application exited with error: {e}") + sys.exit(e.returncode) + except KeyboardInterrupt: + print("\nApplication stopped by user.") + except Exception as e: + print(f"An unexpected error occurred: {e}") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/config/llm_config.py b/config/llm_config.py index ffadb9a..ce97a0c 100644 --- a/config/llm_config.py +++ b/config/llm_config.py @@ -17,8 +17,8 @@ load_dotenv() class LLMConfig: """LLM配置""" - provider: str = os.environ.get("LLM_PROVIDER", "gemini") # openai, gemini, etc. - api_key: str = os.environ.get("OPENAI_API_KEY", "sk---c44i1hy64xgzwox6x08o4zug93frq6rgn84oqugf2pje1tg4") + provider: str = os.environ.get("LLM_PROVIDER", "openai") # openai, gemini, etc. 
+ api_key: str = os.environ.get("OPENAI_API_KEY", "sk-c44i1hy64xgzwox6x08o4zug93frq6rgn84oqugf2pje1tg4") base_url: str = os.environ.get("OPENAI_BASE_URL", "https://api.xiaomimimo.com/v1") model: str = os.environ.get("OPENAI_MODEL", "mimo-v2-flash") temperature: float = 0.5 diff --git a/data_analysis_agent.py b/data_analysis_agent.py index 2087845..82afafe 100644 --- a/data_analysis_agent.py +++ b/data_analysis_agent.py @@ -136,30 +136,29 @@ class DataAnalysisAgent: print(f" 🔍 分析: {analysis}") - # 记录图片信息 - collected_figures.append( - { - "figure_number": figure_number, - "filename": filename, - "file_path": file_path, - "description": description, - "analysis": analysis, - } - ) + # 使用seen_paths集合来去重,防止重复收集 + seen_paths = set() + # 验证文件是否存在 # 只有文件真正存在时才加入列表,防止报告出现裂图 if file_path and os.path.exists(file_path): - print(f" ✅ 文件存在: {file_path}") - # 记录图片信息 - collected_figures.append( - { - "figure_number": figure_number, - "filename": filename, - "file_path": file_path, - "description": description, - "analysis": analysis, - } - ) + # 检查是否已经收集过该路径 + abs_path = os.path.abspath(file_path) + if abs_path not in seen_paths: + print(f" ✅ 文件存在: {file_path}") + # 记录图片信息 + collected_figures.append( + { + "figure_number": figure_number, + "filename": filename, + "file_path": file_path, + "description": description, + "analysis": analysis, + } + ) + seen_paths.add(abs_path) + else: + print(f" ⚠️ 跳过重复图片: {file_path}") else: if file_path: print(f" ⚠️ 文件不存在: {file_path}") @@ -224,7 +223,7 @@ class DataAnalysisAgent: "continue": True, } - def analyze(self, user_input: str, files: List[str] = None, session_output_dir: str = None) -> Dict[str, Any]: + def analyze(self, user_input: str, files: List[str] = None, session_output_dir: str = None, reset_session: bool = True, max_rounds: int = None) -> Dict[str, Any]: """ 开始分析流程 @@ -232,59 +231,91 @@ class DataAnalysisAgent: user_input: 用户的自然语言需求 files: 数据文件路径列表 session_output_dir: 指定的会话输出目录(可选) + reset_session: 是否重置会话 (True: 新开启分析; False: 
在现有上下文中继续) + max_rounds: 本次分析的最大轮数 (可选,如果不填则使用默认值) Returns: 分析结果字典 """ - # 重置状态 - self.conversation_history = [] - self.analysis_results = [] - self.current_round = 0 + + # 确定本次运行的轮数限制 + current_max_rounds = max_rounds if max_rounds is not None else self.max_rounds - # 创建本次分析的专用输出目录 - if session_output_dir: - self.session_output_dir = session_output_dir + if reset_session: + # --- 初始化新会话 --- + self.conversation_history = [] + self.analysis_results = [] + self.current_round = 0 + + # 创建本次分析的专用输出目录 + if session_output_dir: + self.session_output_dir = session_output_dir + else: + self.session_output_dir = create_session_output_dir( + self.base_output_dir, user_input + ) + + # 初始化代码执行器,使用会话目录 + self.executor = CodeExecutor(self.session_output_dir) + + # 设置会话目录变量到执行环境中 + self.executor.set_variable("session_output_dir", self.session_output_dir) + + # 设用工具生成数据画像 + data_profile = "" + if files: + print("🔍 正在生成数据画像...") + try: + data_profile = load_and_profile_data(files) + print("✅ 数据画像生成完毕") + except Exception as e: + print(f"⚠️ 数据画像生成失败: {e}") + + # 保存到实例变量供最终报告使用 + self.data_profile = data_profile + + # 构建初始prompt + initial_prompt = f"""用户需求: {user_input}""" + if files: + initial_prompt += f"\n数据文件: {', '.join(files)}" + + if data_profile: + initial_prompt += f"\n\n{data_profile}\n\n请根据上述【数据画像】中的统计信息(如高频值、缺失率、数据范围)来制定分析策略。如果发现明显的高频问题或异常分布,请优先进行深度分析。" + + print(f"🚀 开始数据分析任务") + print(f"📝 用户需求: {user_input}") + if files: + print(f"📁 数据文件: {', '.join(files)}") + print(f"📂 输出目录: {self.session_output_dir}") + + # 添加到对话历史 + self.conversation_history.append({"role": "user", "content": initial_prompt}) + else: - self.session_output_dir = create_session_output_dir( - self.base_output_dir, user_input - ) + # --- 继续现有会话 --- + # 如果是追问,且没有指定轮数,默认减少轮数,避免过度分析 + if max_rounds is None: + current_max_rounds = 10 # 追问通常不需要那么长的思考链,10轮足够 + + print(f"\n🚀 继续分析任务 (追问模式)") + print(f"📝 后续需求: {user_input}") + + # 重置当前轮数计数器,以便给新任务足够的轮次 + self.current_round = 0 + + # 添加到对话历史 + # 
提示Agent这是后续追问,可以简化步骤 + follow_up_prompt = f"后续需求: {user_input}\n(注意:这是后续追问,请直接针对该问题进行分析,无需从头开始执行完整SOP。)" + self.conversation_history.append({"role": "user", "content": follow_up_prompt}) - - # 初始化代码执行器,使用会话目录 - self.executor = CodeExecutor(self.session_output_dir) - - # 设置会话目录变量到执行环境中 - self.executor.set_variable("session_output_dir", self.session_output_dir) - - # 设用工具生成数据画像 - data_profile = "" - if files: - print("🔍 正在生成数据画像...") - data_profile = load_and_profile_data(files) - print("✅ 数据画像生成完毕") - - # 保存到实例变量供最终报告使用 - self.data_profile = data_profile - - # 构建初始prompt - initial_prompt = f"""用户需求: {user_input}""" - if files: - initial_prompt += f"\n数据文件: {', '.join(files)}" - - if data_profile: - initial_prompt += f"\n\n{data_profile}\n\n请根据上述【数据画像】中的统计信息(如高频值、缺失率、数据范围)来制定分析策略。如果发现明显的高频问题或异常分布,请优先进行深度分析。" - - print(f"🚀 开始数据分析任务") - print(f"📝 用户需求: {user_input}") - if files: - print(f"📁 数据文件: {', '.join(files)}") - print(f"📂 输出目录: {self.session_output_dir}") - print(f"🔢 最大轮数: {self.max_rounds}") + print(f"🔢 本次最大轮数: {current_max_rounds}") if self.force_max_rounds: - print(f"⚡ 强制模式: 将运行满 {self.max_rounds} 轮(忽略AI完成信号)") + print(f"⚡ 强制模式: 将运行满 {current_max_rounds} 轮(忽略AI完成信号)") print("=" * 60) - # 添加到对话历史 - self.conversation_history.append({"role": "user", "content": initial_prompt}) + + # 保存原始 max_rounds 以便恢复(虽然 analyze 结束后不需要恢复,但为了逻辑严谨) + original_max_rounds = self.max_rounds + self.max_rounds = current_max_rounds while self.current_round < self.max_rounds: self.current_round += 1 @@ -311,6 +342,15 @@ class DataAnalysisAgent: process_result = self._process_response(response) # 根据处理结果决定是否继续(仅在非强制模式下) + if process_result.get("action") == "invalid_response": + consecutive_failures += 1 + print(f"⚠️ 连续失败次数: {consecutive_failures}/3") + if consecutive_failures >= 3: + print(f"❌ 连续3次无法获取有效响应,分析终止。请检查网络或配置。") + break + else: + consecutive_failures = 0 # 重置计数器 + if not self.force_max_rounds and not process_result.get( "continue", True ): @@ -406,6 +446,35 @@ class 
DataAnalysisAgent: print(f"\n📊 开始生成最终分析报告...") print(f"📂 输出目录: {self.session_output_dir}") + + # --- 自动补全/发现图片机制 --- + # 扫描目录下所有的png文件 + try: + import glob + existing_pngs = glob.glob(os.path.join(self.session_output_dir, "*.png")) + + # 获取已收集的图片路径集合 + collected_paths = set() + for fig in all_figures: + if fig.get("file_path"): + collected_paths.add(os.path.abspath(fig.get("file_path"))) + + # 检查是否有漏网之鱼 + for png_path in existing_pngs: + abs_png_path = os.path.abspath(png_path) + if abs_png_path not in collected_paths: + print(f"🔍 [自动发现] 补充未显式收集的图片: {os.path.basename(png_path)}") + all_figures.append({ + "figure_number": "Auto", + "filename": os.path.basename(png_path), + "file_path": abs_png_path, + "description": f"自动发现的分析图表: {os.path.basename(png_path)}", + "analysis": "(该图表由系统自动捕获,Agent未提供具体分析文本,请结合图表标题理解)" + }) + except Exception as e: + print(f"⚠️ 自动发现图片失败: {e}") + # --------------------------- + print(f"🔢 总轮数: {self.current_round}") print(f"📈 收集图片: {len(all_figures)} 个") @@ -419,28 +488,19 @@ class DataAnalysisAgent: max_tokens=16384, # 设置较大的token限制以容纳完整报告 ) - # 解析响应,提取最终报告 - try: - # 尝试解析YAML - yaml_data = self.llm.parse_yaml_response(response) - - # 情况1: 标准YAML格式,包含 action: analysis_complete - if yaml_data.get("action") == "analysis_complete": - final_report_content = yaml_data.get("final_report", response) - - # 情况2: 解析成功但没字段,或者解析失败 - else: - # 如果内容看起来像Markdown报告(包含标题),直接使用 - if "# " in response or "## " in response: - print("⚠️ 未检测到标准YAML动作,但内容疑似Markdown报告,直接采纳") - final_report_content = response - else: - final_report_content = "LLM未返回有效报告内容" + # 直接使用LLM响应作为最终报告(因为我们在prompt中要求直接输出Markdown) + final_report_content = response + + # 兼容旧逻辑:如果意外返回了YAML,尝试解析 + if response.strip().startswith("action:") or "final_report:" in response: + try: + yaml_data = self.llm.parse_yaml_response(response) + if yaml_data.get("action") == "analysis_complete": + final_report_content = yaml_data.get("final_report", response) + except: + pass # 解析失败则保持原样 - except Exception as e: 
- # 解析完全失败,直接使用原始响应 - print(f"⚠️ YAML解析失败 ({e}),直接使用原始响应作为报告") - final_report_content = response + print("✅ 最终报告生成完成") print("✅ 最终报告生成完成") diff --git a/main.py b/main.py index 7075138..239605e 100644 --- a/main.py +++ b/main.py @@ -39,11 +39,29 @@ def setup_logging(log_dir): def main(): llm_config = LLMConfig() - files = ["./cleaned_data.csv"] + import glob + import os + # 自动查找当前目录及remotecontrol目录下的所有数据文件 + data_extensions = ['*.csv', '*.xlsx', '*.xls'] + search_dirs = ['jetour'] + files = [] + + for search_dir in search_dirs: + for ext in data_extensions: + pattern = os.path.join(search_dir, ext) + files.extend(glob.glob(pattern)) + + if not files: + print("⚠️ 未在当前目录找到数据文件 (.csv, .xlsx),尝试使用默认文件") + files = ["./cleaned_data.csv"] + else: + print(f"📂 自动识别到以下数据文件: {files}") + analysis_requirement = """ 基于所有运维工单,整理一份工单健康度报告,包括但不限于对所有车联网技术支持工单的全面数据分析, -深入挖掘工单处理过程中的关键问题、效率瓶颈及改进机会。涵盖工单状态、问题类型、模块分布、严重程度、责任人负载、车型分布、来源渠道及处理时长等多个维度。 -通过多轮交叉分析与趋势洞察,为提升车联网服务质量、优化资源配置及降低运营风险提供数据驱动的决策依据,问题总揽,高频问题、重点问题分析,输出若干个重要的统计指标,并绘制相关图表;结合图表,总结一份,车联网运维工单健康度报告,汇报给我。 +深入挖掘工单处理过程中的关键问题、效率瓶颈及改进机会。请从车型,模块,功能角度,分别展示工单数据、问题类型、模块分布、严重程度、责任人负载、车型分布、来源渠道及处理时长等多个维度。 +通过多轮交叉分析与趋势洞察,为提升车联网服务质量、优化资源配置及降低运营风险提供数据驱动的决策依据,问题总揽,高频问题、重点问题分析,输出若干个重要的统计指标,并绘制相关图表; +结合图表,总结一份,车联网运维工单健康度报告,汇报给我。 """ # 在主函数中先创建会话目录,以便存放日志 @@ -57,12 +75,33 @@ def main(): # 如果希望强制运行到最大轮数,设置 force_max_rounds=True agent = DataAnalysisAgent(llm_config, force_max_rounds=False) - report = agent.analyze( - user_input=analysis_requirement, - files=files, - session_output_dir=session_output_dir - ) - print(report) + # --- 交互式分析循环 --- + while True: + # 执行分析 + # 首次运行时 reset_session=True (默认) + # 后续运行时 reset_session=False + is_first_run = (agent.current_round == 0 and not agent.conversation_history) + + report = agent.analyze( + user_input=analysis_requirement, + files=files if is_first_run else None, # 后续轮次不需要重复传文件路径,agent已有上下文 + session_output_dir=session_output_dir, + reset_session=is_first_run, + max_rounds=None if is_first_run else 10 # 
追问时限制为10轮 + ) + print("\n" + "="*30 + " 当前阶段分析完成 " + "="*30) + + # 询问用户是否继续 + print("\n💡 你可以继续对数据提出分析需求,或者输入 'exit'/'quit' 结束程序。") + user_response = input("👉 请输入后续分析需求 (直接回车退出): ").strip() + + if not user_response or user_response.lower() in ['exit', 'quit', 'n', 'no']: + print("👋 分析结束,再见!") + break + + # 更新需求,进入下一轮循环 + analysis_requirement = user_response + print(f"\n🔄 收到新需求,正在继续分析...") if __name__ == "__main__": diff --git a/merge_excel.py b/merge_excel.py new file mode 100644 index 0000000..1894621 --- /dev/null +++ b/merge_excel.py @@ -0,0 +1,83 @@ + +import pandas as pd +import glob +import os + +def merge_excel_files(source_dir="remotecontrol", output_file="merged_all_files.csv"): + """ + 将指定目录下的所有 Excel 文件 (.xlsx, .xls) 合并为一个 CSV 文件。 + """ + print(f"🔍 正在扫描目录: {source_dir} ...") + + # 支持 xlsx 和 xls + files_xlsx = glob.glob(os.path.join(source_dir, "*.xlsx")) + files_xls = glob.glob(os.path.join(source_dir, "*.xls")) + files = files_xlsx + files_xls + + if not files: + print("⚠️ 未找到 Excel 文件。") + return + + # 按文件名中的数字进行排序 (例如: 1.xlsx, 2.xlsx, ..., 10.xlsx) + try: + files.sort(key=lambda x: int(os.path.basename(x).split('.')[0])) + print("🔢 已按文件名数字顺序排序") + except ValueError: + # 如果文件名不是纯数字,退回到字母排序 + files.sort() + print("🔤 已按文件名包含非数字字符,使用字母顺序排序") + + print(f"📂 找到 {len(files)} 个文件: {files}") + + all_dfs = [] + for file in files: + try: + print(f"📖 读取: {file}") + # 使用 ExcelFile 读取所有 sheet + xls = pd.ExcelFile(file) + print(f" 📑 包含 Sheets: {xls.sheet_names}") + + file_dfs = [] + for sheet_name in xls.sheet_names: + df = pd.read_excel(xls, sheet_name=sheet_name) + if not df.empty: + print(f" ✅ Sheet '{sheet_name}' 读取成功: {len(df)} 行") + file_dfs.append(df) + else: + print(f" ⚠️ Sheet '{sheet_name}' 为空,跳过") + + if file_dfs: + # 合并该文件的所有非空 sheet + file_merged_df = pd.concat(file_dfs, ignore_index=True) + # 可选:添加一列标记来源文件 + file_merged_df['Source_File'] = os.path.basename(file) + all_dfs.append(file_merged_df) + else: + print(f"⚠️ 文件 {file} 所有 Sheet 均为空") + + except 
Exception as e: + print(f"❌ 读取 {file} 失败: {e}") + + if all_dfs: + print("🔄 正在合并数据...") + merged_df = pd.concat(all_dfs, ignore_index=True) + + # 按 SendTime 排序 + if 'SendTime' in merged_df.columns: + print("⏳ 正在按 SendTime 排序...") + merged_df['SendTime'] = pd.to_datetime(merged_df['SendTime'], errors='coerce') + merged_df = merged_df.sort_values(by='SendTime') + else: + print("⚠️ 未找到 SendTime 列,跳过排序") + + print(f"💾 保存到: {output_file}") + merged_df.to_csv(output_file, index=False, encoding="utf-8-sig") + + print(f"✅ 合并及排序完成!总行数: {len(merged_df)}") + print(f" 输出文件: {os.path.abspath(output_file)}") + else: + print("⚠️ 没有成功读取到任何数据。") + +if __name__ == "__main__": + # 如果需要在当前目录运行并合并 remotecontrol 文件夹下的内容 + merge_excel_files(source_dir="remotecontrol", output_file="remotecontrol_merged.csv") diff --git a/prompts.py b/prompts.py index 85f67c4..ff3b0c4 100644 --- a/prompts.py +++ b/prompts.py @@ -28,7 +28,21 @@ data_analysis_system_prompt = """你是一个专业的数据分析助手,运 2. 图片必须保存到指定的会话目录中,输出绝对路径,禁止使用plt.show(),饼图的标签全部放在图例里面,用颜色区分。 3. 表格输出控制:超过15行只显示前5行和后5行 4. 所有生成的图片必须保存,保存路径格式:os.path.join(session_output_dir, '图片名称.png') -5. 中文字体设置:生成的绘图代码,涉及中文字体,必须保证生成图片不可以乱码(macOS推荐:Hiragino Sans GB, Songti SC等) +5. 中文字体设置:生成的绘图代码,必须在开头加入以下代码以解决中文乱码问题: + ```python + import matplotlib.pyplot as plt + import platform + + system_name = platform.system() + if system_name == 'Darwin': # macOS + plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'PingFang SC', 'Heiti SC', 'sans-serif'] + elif system_name == 'Windows': + plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'sans-serif'] + else: # Linux + plt.rcParams['font.sans-serif'] = ['WenQuanYi Micro Hei', 'SimHei', 'sans-serif'] + + plt.rcParams['axes.unicode_minus'] = False # 解决负号显示问题 + ``` 6. 输出格式严格使用YAML 📁 输出目录管理: @@ -39,53 +53,64 @@ data_analysis_system_prompt = """你是一个专业的数据分析助手,运 - 所有生成的图片必须执行处理图片收集动作并保存,保存路径格式:os.path.join(session_output_dir, '图片名称.png') - 输出绝对路径:使用os.path.abspath()获取图片的完整路径 +🚨 **关键红线 (Critical Rules)**: +1. 
**图片保存铁律**:每次 `plt.plot()` 后**必须**紧接着调用 `plt.savefig()` 和 `plt.close()`。虽然系统有自动补救机制,但你必须显式保存每一张图。 +2. **绝对禁止伪造数据**:无论遇到何种报错,绝对不可以使用 `pd.DataFrame({{...}})` 手动创建虚假数据来展示。如果无法读取数据,必须诚实报告错误并停止分析。 +3. **文件存在性验证**:在读取前必须使用 `os.path.exists()` 检查文件是否存在。 +4. **扩展名陷阱**:如果用户说是 `.xlsx` 但读取失败,请检查目录下是否有同名的 `.csv` 文件。 + 📊 数据分析工作流程(必须严格按顺序执行): -**阶段1:数据探索(使用 generate_code 动作)** -- 首次数据加载时尝试多种编码:['utf-8', 'gbk', 'gb18030', 'gb2312', 'latin1'] -- 特殊处理:如果读取失败,尝试指定分隔符 `sep=','` 和错误处理 `on_bad_lines='skip'` (pandas 2.0+标准) -- 使用df.head()查看前几行数据,检查数据是否正确读取 -- 使用df.info()了解数据类型和缺失值情况 -- 重点检查:如果数值列显示为NaN但应该有值,说明读取或解析有问题 -- 使用df.dtypes查看每列的数据类型,确保日期列不是float64 -- 打印所有列名:df.columns.tolist() -- 绝对不要假设列名,必须先查看实际的列名 +**阶段1:数据探索与智能加载(使用 generate_code 动作)** +- **Excel文件深度加载策略**: + - 首选:`pd.read_excel(file, engine='openpyxl')` + - 失败B计划:尝试 `pd.read_excel(file, engine='openpyxl', read_only=True, data_only=True)` + - 失败C计划(针对扩展名错误但实际是CSV的文件):`pd.read_csv(file)` + - 失败D计划(针对超大文件或格式异常):使用 `zipfile` + `xml.etree` 手动解析 `sharedStrings.xml` 和 `sheet1.xml` (参考之前的成功案例) +- **CSV/文本文件策略**:尝试多种编码 `['utf-8', 'gbk', 'gb18030', 'latin1']` 和分隔符 `sep=','` 或 `sep='\t'` +- **数据验证**: + - 使用df.head()查看前几行 + - 使用df.info()检查数据类型和缺失值 + - 打印列名:`print(df.columns.tolist())` **阶段2:数据清洗和检查(使用 generate_code 动作)** -- 日期列识别:查找包含'date', 'time', 'Date', 'Time'关键词的列 -- 日期解析:尝试多种格式 ['%d/%m/%Y', '%Y-%m-%d', '%m/%d/%Y', '%Y/%m/%d', '%d-%m-%Y'] -- 类型转换:使用pd.to_datetime()转换日期列,指定format参数和errors='coerce' -- 空值处理:检查哪些列应该有值但显示NaN,可能是数据读取问题 -- 检查数据的时间范围和排序 -- 数据质量检查:确认数值列是否正确,字符串列是否被错误识别 +- 日期列识别与标准化:查找 'date', 'time', '创建', '关闭' 等列,统一转为 datetime格式。 +- 关键字段对齐:将 'Model', 'Car Model', '车型' 统一重命名为 '车型';'Module', '模块' 统一重命名为 '模块'。 +- 缺失值与异常值标记:统计关键维度(车型、模块)的缺失率。 +- **多文件数据合并**:如果识别到 source_file 列,确保按文件顺序或时间列进行排序。 +**阶段3:多维度业务分析和可视化(使用 generate_code 动作)** +- **必须覆盖的分析维度(基于用户IOV业务需求)**: + 1. **车型维度 (Vehicle Model)**:各车型工单量分布、车型-问题类型热力图。 + 2. **模块/功能维度 (Module/Function)**:故障高发模块/功能 Top10、模块-严重程度交叉分析。 + 3. 
**问题类型维度 (Issue Type)**:各类问题占比、各类问题的平均处理时长。 + 4. **严重程度分布 (Severity)**:严重/一般问题的比重及趋势。 + 5. **责任人负载 (Owner Load)**:责任人处理工单数量 Top10 vs 平均处理时长(效率分析)。 + 6. **来源渠道 (Source)**:不同来源(电话、APP、后台)的工单分布及有效率。 + 7. **处理时长 (Duration)**:处理时长分布(直方图)、超长工单特征分析。 + 8. **文本挖掘 (Text Mining)**:基于 '问题描述' 的 N-gram 短语提取(如 "TBOX离线", "远程启动失败"),排除停用词。 +- **图表生成规则**: + - 每一轮只专注于生成 1-2 个重点图表。 + - 图片保存到会话目录,严禁 `plt.show()`。 + - 类别 > 5 时使用水平条形图。 + - **严禁覆盖**:每个文件名必须唯一,建议加上步骤前缀,如 `01_工单量分布.png`。 -**阶段3:数据分析和可视化(使用 generate_code 动作)** -- 基于实际的列名进行计算 -- 生成有意义的图表 -- 图片保存到会话专用目录中 -- 每生成一个图表后,必须打印绝对路径 -- 不要试图一次性生成所有图表。你应该将任务拆分为多个小的代码块,分批次执行。 -- 每一轮只专注于生成 1-2 个复杂的图表或 2-3 个简单的图表,确保代码正确且图片保存成功。 -- 只有在前一轮代码成功执行并保存图片后,再进行下一轮。 -- 必做分析1. **超长工单问题类型分布**(从处理时长分布中筛选) - 2. **车型-问题热力图**(发现特定车型的高频故障) - 3. **车型分布**(整体工单在不同车型的占比) - 4. **处理时长分布**(直方图/KDE) - 5. **处理时长箱线图**(按问题类型或责任人分组,识别异常点) - 6. **高频关键词词云**(基于Text Cleaning和N-gram结果) - 7. **工单来源分布** - 8. **工单状态分布** - 9. **模块分布** - 10. **未关闭工单状态分布** - 11. **问题类型分布** - 12. **严重程度分布** - 13. **远程控制(Remote Control)问题模块分布**(专项分析) - 14. **月度工单趋势** - 15. **月度关闭率趋势** - 16. **责任人分布** - 17. **责任人工作量与效率对比**(散点图或双轴图) -- 图片保存必须使用 `plt.savefig(path, bbox_inches='tight')`。保存后必须显示打印绝对路径。严禁使用 `plt.show()`。 +**标准化分析SOP (Standard Operating Procedure)**: +请严格按照以下顺序执行分析,不要跳跃: +1. **数据质量检查**:加载数据 -> 打印 info/head -> 检查 '车型'/'模块' 列的唯一值数量。 +2. **基础分布分析**: + - 生成 `01_车型分布.png` (水平条形图) + - 生成 `02_模块Top10分布.png` (水平条形图) + - 生成 `03_问题类型Top10分布.png` (水平条形图) +3. **时序与来源分析**: + - 生成 `04_工单来源分布.png` (饼图或条形图) + - 生成 `05_月度工单趋势.png` (折线图) +4. **深度交叉分析**: + - 生成 `06_车型_问题类型热力图.png` (Heatmap) + - 生成 `07_模块_严重程度堆叠图.png` (Stacked Bar) +5. 
**效率分析**: + - 生成 `08_处理时长分布.png` (直方图) + - 生成 `09_责任人效率分析.png` (散点图: 工单量 vs 平均时长) **阶段4:深度挖掘与高级分析(使用 generate_code 动作)** - 主动评估数据特征**:在执行前,先分析数据适合哪种高级挖掘: @@ -147,7 +172,7 @@ data_analysis_system_prompt = """你是一个专业的数据分析助手,运 - **可视化增强**:不要只画折线图。使用 `seaborn` 的 `pairplot`, `heatmap`, `lmplot` 等高级图表。 📝 动作选择指南: -- **需要执行Python代码** → 使用 "generate_code" +- **需要执行代码列表** → 使用 "generate_code" - **已生成多个图表,需要收集分析** → 使用 "collect_figures" - **所有分析完成,输出最终报告** → 使用 "analysis_complete" - **遇到错误需要修复代码** → 使用 "generate_code" @@ -262,6 +287,7 @@ final_report_system_prompt = """你是一位**资深数据分析专家 (Senior D ### 报告结构模板使用说明 (Template Instructions) - **固定格式 (Format)**:所有的 Markdown 标题 (`#`, `##`)、列表项前缀 (`- **...**`)、表格表头是必须保留的**骨架**。 - **写作指引 (Prompts)**:方括号 `[...]` 内的文字是给你的**写作提示**,请根据实际分析将其**替换**为具体内容,**不要**在最终报告中保留方括号。 +- **直接输出Markdown**:不要使用JSON或YAML包裹,直接输出Markdown内容。 --- @@ -314,13 +340,12 @@ final_report_system_prompt = """你是一位**资深数据分析专家 (Senior D - **用户与业务影响**:已导致[估算的]用户投诉上升、[功能]使用率下降、潜在[NPS下降分值]。 - **当前缓解状态**:[如:已暂停该版本推送,影响面控制在X%。] -### 3.2 [业务主题二:例如“高价值用户的核心使用场景与流失预警”] -- **核心发现**:[例如:功能A是留存关键,但其失败率在核心用户中最高。] +### 3.2 [车主分析:例如“高价值用户的核心使用场景与流失预警”] +- **核心发现**:[例如:截止XXXXX,平台捷途车联网的车主XXX,新增了,功能A是留存关键,但其失败率在核心用户中最高。] - **现象与数据表现**:[同上结构] - **证据链与深度归因**: >  > **每周使用功能A超过3次的用户,其90天留存率是低频用户的2.5倍**,该功能是用户粘性的关键驱动力。 - > >  > 然而,正是这批高价值用户,遭遇功能A失败的概率比新用户高40%,**体验瓶颈出现在用户最依赖的环节**。 - **问题回溯与当前影响**:[同上结构] @@ -334,7 +359,7 @@ final_report_system_prompt = """你是一位**资深数据分析专家 (Senior D | **[风险2:体验一致性]** | [如:Android用户关键路径失败率为iOS的2倍] | 高 | 中 | **中高** | 应用商店差评中OS提及率上升 | | **[风险3:合规性]** | [描述] | 低 | 高 | **中** | [相关法规更新节点] | -## 5. 改进建议与方案探讨 (Suggestions & Solutions for Review) +## 5. 
改进建议与方案探讨 > **重要提示**:以下内容仅基于数据分析结果提出初步探讨方向。**具体实施方案、责任分配及落地时间必须由人工专家(PM/研发/运营)结合实际业务资源与约束最终确认**。 | 建议方向 (Direction) | 关联问题 (Issue) | 初步方案思路 (Draft Proposal) | 需人工评估点 (Points for Human Review) | diff --git a/requirements.txt b/requirements.txt index c7155d8..2bf6b86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -50,3 +50,8 @@ flake8>=6.0.0 # 字体支持(用于matplotlib中文显示) fonttools>=4.38.0 + +# Web Interface dependencies +fastapi>=0.109.0 +uvicorn>=0.27.0 +python-multipart>=0.0.9 diff --git a/sort_csv.py b/sort_csv.py new file mode 100644 index 0000000..c03a1b3 --- /dev/null +++ b/sort_csv.py @@ -0,0 +1,45 @@ + +import pandas as pd +import os + +def sort_csv_by_time(file_path="remotecontrol_merged.csv", time_col="SendTime"): + """ + 读取 CSV 文件,按时间列排序,并保存。 + """ + if not os.path.exists(file_path): + print(f"❌ 文件不存在: {file_path}") + return + + print(f"📖 正在读取 {file_path} ...") + try: + # 读取 CSV + df = pd.read_csv(file_path, low_memory=False) + print(f" 📊 数据行数: {len(df)}") + + if time_col not in df.columns: + print(f"❌ 未找到时间列: {time_col}") + print(f" 可用列: {list(df.columns)}") + return + + print(f"🔄 正在解析时间列 '{time_col}' ...") + # 转换为 datetime 对象,无法解析的设为 NaT + df[time_col] = pd.to_datetime(df[time_col], errors='coerce') + + # 检查无效时间 + nat_count = df[time_col].isna().sum() + if nat_count > 0: + print(f"⚠️ 发现 {nat_count} 行无效时间数据,排序时将排在最后") + + print("🔄 正在按时间排序...") + df_sorted = df.sort_values(by=time_col) + + print(f"💾 正在保存及覆盖文件: {file_path} ...") + df_sorted.to_csv(file_path, index=False, encoding="utf-8-sig") + + print("✅ 排序并保存完成!") + + except Exception as e: + print(f"❌处理失败: {e}") + +if __name__ == "__main__": + sort_csv_by_time() diff --git a/start.bat b/start.bat new file mode 100644 index 0000000..92b7064 --- /dev/null +++ b/start.bat @@ -0,0 +1,4 @@ +@echo off +echo Starting IOV Data Analysis Agent... 
+python bootstrap.py +pause diff --git a/start.sh b/start.sh new file mode 100755 index 0000000..e62793c --- /dev/null +++ b/start.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo "Starting IOV Data Analysis Agent..." +python3 bootstrap.py diff --git a/start_web.bat b/start_web.bat new file mode 100644 index 0000000..50ba277 --- /dev/null +++ b/start_web.bat @@ -0,0 +1,5 @@ +@echo off +echo Starting IOV Data Analysis Agent Web Interface... +echo Please open http://localhost:8000 in your browser. +python -m uvicorn web.main:app --reload --host 0.0.0.0 --port 8000 +pause diff --git a/start_web.sh b/start_web.sh new file mode 100755 index 0000000..99558ac --- /dev/null +++ b/start_web.sh @@ -0,0 +1,4 @@ +#!/bin/bash +echo "Starting IOV Data Analysis Agent Web Interface..." +echo "Please open http://localhost:8000 in your browser." +python3 -m uvicorn web.main:app --reload --host 0.0.0.0 --port 8000 diff --git a/test.py b/test.py new file mode 100644 index 0000000..ae8265b --- /dev/null +++ b/test.py @@ -0,0 +1,13 @@ + +import openai + +client = openai.OpenAI( + api_key="sk-2187174de21548b0b8b0c92129700199", + base_url="http://127.0.0.1:9999/v1" +) + +response = client.chat.completions.create( + model="gemini-3-flash", + messages=[{"role": "user", "content": "你好,请自我介绍"}] +) +print(response.choices[0].message.content) \ No newline at end of file diff --git a/utils/code_executor.py b/utils/code_executor.py index b3d774c..accb3de 100644 --- a/utils/code_executor.py +++ b/utils/code_executor.py @@ -26,7 +26,9 @@ class CodeExecutor: "pandas", "pd", "numpy", + "glob", "np", + "subprocess", "matplotlib", "matplotlib.pyplot", "plt", @@ -36,6 +38,14 @@ class CodeExecutor: "scipy", "sklearn", "sklearn.feature_extraction.text", + "sklearn.preprocessing", + "sklearn.model_selection", + "sklearn.metrics", + "sklearn.ensemble", + "sklearn.linear_model", + "sklearn.cluster", + "sklearn.decomposition", + "sklearn.manifold", "statsmodels", "plotly", "dash", @@ -230,12 +240,16 @@ from 
IPython.display import display for node in ast.walk(tree): if isinstance(node, ast.Import): for alias in node.names: - if alias.name not in self.ALLOWED_IMPORTS: + # 获取根包名 (e.g. sklearn.preprocessing -> sklearn) + root_package = alias.name.split('.')[0] + if root_package not in self.ALLOWED_IMPORTS and alias.name not in self.ALLOWED_IMPORTS: return False, f"不允许的导入: {alias.name}" elif isinstance(node, ast.ImportFrom): - if node.module not in self.ALLOWED_IMPORTS: - return False, f"不允许的导入: {node.module}" + if node.module: + root_package = node.module.split('.')[0] + if root_package not in self.ALLOWED_IMPORTS and node.module not in self.ALLOWED_IMPORTS: + return False, f"不允许的导入: {node.module}" # 检查属性访问(防止通过os.system等方式绕过) elif isinstance(node, ast.Attribute): @@ -381,6 +395,33 @@ from IPython.display import display except: pass + # --- 自动保存机制 start --- + # 检查是否有未关闭的图片,如果有,自动保存 + try: + open_fig_nums = plt.get_fignums() + if open_fig_nums: + for fig_num in open_fig_nums: + fig = plt.figure(fig_num) + # 生成自动保存的文件名 + auto_filename = f"autosave_fig_{self.image_counter}_{fig_num}.png" + auto_filepath = os.path.join(self.output_dir, auto_filename) + + try: + # 尝试保存 + fig.savefig(auto_filepath, bbox_inches='tight') + print(f"💾 [Auto-Save] 检测到未闭合图表,已安全保存至: {auto_filepath}") + + # 添加到输出中,告知Agent + output += f"\n[Auto-Save] ⚠️ 检测到Figure {fig_num}未关闭,系统已自动保存为: {auto_filename}" + self.image_counter += 1 + except Exception as e: + print(f"⚠️ [Auto-Save] 保存失败: {e}") + finally: + plt.close(fig_num) + except Exception as e: + print(f"⚠️ [Auto-Save Global] 异常: {e}") + # --- 自动保存机制 end --- + return { "success": True, "output": output, diff --git a/utils/llm_helper.py b/utils/llm_helper.py index f24d967..cf88b6c 100644 --- a/utils/llm_helper.py +++ b/utils/llm_helper.py @@ -75,7 +75,8 @@ class LLMHelper: else: yaml_content = response.strip() - return yaml.safe_load(yaml_content) + parsed = yaml.safe_load(yaml_content) + return parsed if parsed is not None else {} except Exception as e: 
# Residue of the preceding diff hunk, kept verbatim as comments because the
# definition it belongs to starts outside this chunk:
#   print(f"YAML解析失败: {e}")
#   print(f"原始响应: {response}")
#
# diff --git a/web/main.py b/web/main.py
# new file mode 100644
# index 0000000..bc428f9
# --- /dev/null
# +++ b/web/main.py
"""FastAPI front end for the data analysis agent.

Endpoints let a client upload CSV files, start an analysis session as a
background task, poll its status/log, fetch the generated report and figures,
export a session directory as a zip archive, and trigger the Excel-merge and
CSV-sort helper tools.
"""

import asyncio
import glob
import json
import os
import shutil
import sys
import threading
import uuid
import zipfile
from datetime import datetime
from typing import Optional, Dict, List

from fastapi import FastAPI, UploadFile, File, BackgroundTasks, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse, JSONResponse
from pydantic import BaseModel

# Add parent directory to path to import agent modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from data_analysis_agent import DataAnalysisAgent
from config.llm_config import LLMConfig
from utils.create_session_dir import create_session_output_dir
from merge_excel import merge_excel_files
from sort_csv import sort_csv_by_time

app = FastAPI(title="IOV Data Analysis Agent")

# CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# --- Session Management ---

class SessionData:
    """Mutable, in-memory state of one analysis session."""

    def __init__(self, session_id: str):
        self.session_id = session_id
        self.is_running = False
        self.output_dir: Optional[str] = None
        self.generated_report: Optional[str] = None
        self.log_file: Optional[str] = None
        self.analysis_results: List[Dict] = []  # Store analysis results for gallery


class SessionManager:
    """Registry of sessions keyed by id; every access to the dict holds the lock."""

    def __init__(self):
        self.sessions: Dict[str, SessionData] = {}
        self.lock = threading.Lock()

    def create_session(self) -> str:
        """Register a new session under a fresh UUID4 string and return that id."""
        with self.lock:
            session_id = str(uuid.uuid4())
            self.sessions[session_id] = SessionData(session_id)
            return session_id

    def get_session(self, session_id: str) -> Optional[SessionData]:
        """Return the session for ``session_id``, or ``None`` if it is unknown."""
        with self.lock:
            return self.sessions.get(session_id)

    def list_sessions(self):
        """Return a list snapshot of all known session ids."""
        with self.lock:
            return list(self.sessions.keys())


session_manager = SessionManager()

# Mount static files
os.makedirs("web/static", exist_ok=True)
os.makedirs("uploads", exist_ok=True)
os.makedirs("outputs", exist_ok=True)

app.mount("/static", StaticFiles(directory="web/static"), name="static")
app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs")


# --- Helper Functions ---

# The agent reports progress with print(), so its output is captured by
# replacing the process-wide sys.stdout.  That replacement affects every
# thread, so analyses are serialized with this lock: only one session at a
# time may own the redirected stdout.
_analysis_lock = threading.Lock()


class _TeeLogger:
    """File-like object that writes each message to the real terminal and to a log file."""

    def __init__(self, filename):
        self.terminal = sys.__stdout__
        # Line-buffered append so /api/status can read the log while it grows.
        self.log = open(filename, "a", encoding="utf-8", buffering=1)

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()

    def close(self):
        self.log.close()


def _web_path(output_dir: str) -> str:
    """Return ``output_dir`` as a '/'-rooted URL path relative to the working directory."""
    relative = os.path.relpath(output_dir, os.getcwd())
    # relpath uses the OS separator; URLs always use forward slashes.
    return "/" + relative.replace(os.sep, "/")


def run_analysis_task(session_id: str, files: list, user_requirement: str):
    """
    Runs the analysis agent in a background thread for a specific session.

    Creates the session output directory, captures everything printed during
    the run into ``process.log`` inside it, stores the report path and the
    analysis results on the session and persists the results to
    ``results.json``.  Errors are printed, never raised; ``is_running`` is
    always reset when the task ends.
    """
    session = session_manager.get_session(session_id)
    if not session:
        print(f"Error: Session {session_id} not found in background task.")
        return

    session.is_running = True
    try:
        base_output_dir = "outputs"
        session_output_dir = create_session_output_dir(base_output_dir, user_requirement)
        session.output_dir = session_output_dir
        session.log_file = os.path.join(session_output_dir, "process.log")

        with open(session.log_file, "w", encoding="utf-8") as f:
            f.write(f"--- Session {session_id} Started ---\n")

        # Hold the lock for the whole redirected section (see _analysis_lock).
        with _analysis_lock:
            previous_stdout = sys.stdout
            logger = _TeeLogger(session.log_file)
            sys.stdout = logger
            try:
                llm_config = LLMConfig()
                agent = DataAnalysisAgent(llm_config, force_max_rounds=False, output_dir=base_output_dir)

                result = agent.analyze(
                    user_input=user_requirement,
                    files=files,
                    session_output_dir=session_output_dir
                )

                session.generated_report = result.get("report_file_path", None)
                session.analysis_results = result.get("analysis_results", [])

                # Save results to json for persistence
                results_path = os.path.join(session_output_dir, "results.json")
                with open(results_path, "w", encoding="utf-8") as f:
                    json.dump(session.analysis_results, f, default=str)

            except Exception as e:
                # Printed while stdout is still redirected so it lands in the session log.
                print(f"Error during analysis: {e}")
            finally:
                # Restore whatever stream was active before this run.
                sys.stdout = previous_stdout
                logger.close()

    except Exception as e:
        print(f"System Error: {e}")
    finally:
        session.is_running = False


# --- Pydantic Models ---

class StartRequest(BaseModel):
    """Request body of POST /api/start."""

    requirement: str


# --- API Endpoints ---

@app.get("/")
async def read_root():
    """Serve the single-page front end."""
    return FileResponse("web/static/index.html")


@app.post("/api/upload")
async def upload_files(files: list[UploadFile] = File(...)):
    """
    Save the uploaded files into ``uploads/`` and return their paths.

    Only the final path component of each client-supplied filename is used,
    so a name such as ``../../x`` cannot write outside the upload directory.
    Raises HTTP 400 when a filename is empty or has no usable base name.
    """
    saved_files = []
    for file in files:
        # Normalise Windows separators before taking the base name.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
        if not safe_name or safe_name in (".", ".."):
            raise HTTPException(status_code=400, detail="Invalid file name")
        file_location = f"uploads/{safe_name}"
        with open(file_location, "wb") as file_object:
            # Stream in chunks instead of loading the whole upload into memory.
            shutil.copyfileobj(file.file, file_object)
        saved_files.append(file_location)
    return {"info": f"Saved {len(saved_files)} files", "paths": saved_files}


@app.post("/api/start")
async def start_analysis(request: StartRequest, background_tasks: BackgroundTasks):
    """
    Create a session and schedule the analysis as a background task.

    Uses every ``uploads/*.csv`` file, falling back to ``cleaned_data.csv`` in
    the working directory; raises HTTP 400 when neither exists.
    """
    session_id = session_manager.create_session()

    files = glob.glob("uploads/*.csv")
    if not files:
        if os.path.exists("cleaned_data.csv"):
            files = ["cleaned_data.csv"]
        else:
            raise HTTPException(status_code=400, detail="No CSV files found")

    files = [os.path.abspath(f) for f in files]  # Only use absolute paths

    background_tasks.add_task(run_analysis_task, session_id, files, request.requirement)
    return {"status": "started", "session_id": session_id}


@app.get("/api/status")
async def get_status(session_id: str = Query(..., description="Session ID")):
    """Return the running flag, the full log text so far and the report path of a session."""
    session = session_manager.get_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    log_content = ""
    if session.log_file and os.path.exists(session.log_file):
        with open(session.log_file, "r", encoding="utf-8") as f:
            log_content = f.read()

    return {
        "is_running": session.is_running,
        "log": log_content,
        "has_report": session.generated_report is not None,
        "report_path": session.generated_report
    }


@app.get("/api/report")
async def get_report(session_id: str = Query(..., description="Session ID")):
    """
    Return the report text with ``](./`` links rewritten to the session's web path.

    Responds with a "Report not ready." placeholder while no report file exists.
    """
    session = session_manager.get_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    if not session.generated_report or not os.path.exists(session.generated_report):
        return {"content": "Report not ready."}

    with open(session.generated_report, "r", encoding="utf-8") as f:
        content = f.read()

    # Fix image paths
    web_base_path = _web_path(session.output_dir)
    content = content.replace("](./", f"]({web_base_path}/")

    return {"content": content, "base_path": web_base_path}


@app.get("/api/figures")
async def get_figures(session_id: str = Query(..., description="Session ID")):
    """
    Return the figures of a session, each enriched with a ``web_url``.

    Figures come from ``collect_figures`` entries of the analysis results
    (in memory, else ``results.json``); if none are found, every ``*.png`` in
    the session directory is listed instead.
    """
    session = session_manager.get_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    # We can try to get from memory first
    results = session.analysis_results

    # If empty in memory, try to load the persisted json
    if not results and session.output_dir:
        json_path = os.path.join(session.output_dir, "results.json")
        if os.path.exists(json_path):
            with open(json_path, "r", encoding="utf-8") as f:
                results = json.load(f)

    web_base_path = _web_path(session.output_dir) if session.output_dir else None

    # The 'collect_figures' action is the reliable source as per agent design
    figures = []
    for item in results or []:
        if item.get("action") != "collect_figures":
            continue
        for fig in item.get("collected_figures", []):
            if web_base_path is not None:
                # Assume filename is present
                fig["web_url"] = f"{web_base_path}/{fig.get('filename')}"
            figures.append(fig)

    # Auto-discovery fallback if list is empty but pngs exist
    if not figures and session.output_dir:
        for png_path in glob.glob(os.path.join(session.output_dir, "*.png")):
            fname = os.path.basename(png_path)
            figures.append({
                "filename": fname,
                "description": "Auto-discovered image",
                "analysis": "No analysis available",
                "web_url": f"{web_base_path}/{fname}"
            })

    return {"figures": figures}


@app.get("/api/export")
async def export_report(session_id: str = Query(..., description="Session ID")):
    """
    Zip the session's .md/.png/.csv/.log/.json/.yaml files and return the archive.

    The archive is written to ``outputs/report_<timestamp>.zip``.
    Raises HTTP 404 when the session is unknown or has no output directory.
    """
    session = session_manager.get_session(session_id)
    if not session or not session.output_dir:
        raise HTTPException(status_code=404, detail="Session not found")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_filename = f"report_{timestamp}.zip"

    export_dir = "outputs"
    os.makedirs(export_dir, exist_ok=True)
    temp_zip_path = os.path.join(export_dir, zip_filename)

    exported_suffixes = ('.md', '.png', '.csv', '.log', '.json', '.yaml')
    with zipfile.ZipFile(temp_zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _dirs, names in os.walk(session.output_dir):
            for name in names:
                if name.endswith(exported_suffixes):
                    abs_path = os.path.join(root, name)
                    rel_path = os.path.relpath(abs_path, session.output_dir)
                    zf.write(abs_path, arcname=rel_path)

    return FileResponse(
        path=temp_zip_path,
        filename=zip_filename,
        media_type='application/zip'
    )


# --- Tools API ---

class ToolRequest(BaseModel):
    """Request body shared by the tool endpoints."""

    # NOTE(review): these paths come straight from the client and are passed to
    # the tools unchanged; restrict them if this API is exposed to untrusted users.
    source_dir: Optional[str] = "uploads"
    output_filename: Optional[str] = "merged_output.csv"
    target_file: Optional[str] = None


@app.post("/api/tools/merge")
async def tool_merge_excel(req: ToolRequest):
    """
    Trigger Excel Merge Tool

    Runs ``merge_excel_files(source_dir, output_filename)`` in the default
    executor.  Raises HTTP 500 when the tool raises or when the output file
    does not exist afterwards.
    """
    source = req.source_dir
    output = req.output_filename

    try:
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, merge_excel_files, source, output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    output_abs = os.path.abspath(output)
    if not os.path.exists(output_abs):
        # Previously this case fell through and returned an empty 200 response.
        raise HTTPException(status_code=500, detail="Merge finished but no output file was created")
    return {"status": "success", "message": "Merge completed", "output_file": output_abs}


@app.post("/api/tools/sort")
async def tool_sort_csv(req: ToolRequest):
    """
    Trigger CSV Sort Tool

    Raises HTTP 400 when ``target_file`` is missing and HTTP 500 when the
    tool itself raises.
    """
    target = req.target_file
    # Validate outside the try block so the 400 is not rewrapped as a 500.
    if not target:
        raise HTTPException(status_code=400, detail="Target file required")

    try:
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, sort_csv_by_time, target)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"status": "success", "message": f"Sorted {target} by time"}


# --- Help API ---

@app.get("/api/help/troubleshooting")
async def get_troubleshooting_guide():
    """
    Returns the content of troubleshooting_guide.md

    The guide is read from a fixed path under the current user's home
    directory; a markdown placeholder is returned when it is missing or
    cannot be read.
    """
    guide_path = os.path.expanduser("~/.gemini/antigravity/brain/3ff617fe-5f27-4ab8-b61b-c634f2e75255/troubleshooting_guide.md")

    if not os.path.exists(guide_path):
        return {"content": "# Troubleshooting Guide Not Found\n\nCould not locate the guide artifact."}

    try:
        with open(guide_path, "r", encoding="utf-8") as f:
            content = f.read()
        return {"content": content}
    except Exception as e:
        return {"content": f"# Error Loading Guide\n\n{e}"}


# diff --git a/web/static/index.html b/web/static/index.html
# new file mode 100644
# index 0000000..687a624
# --- /dev/null
# +++ b/web/static/index.html
+ + +Waiting to start...+
No report generated yet.
+Failed to load report.
'; + console.error(e); + } +} + +async function loadGallery() { + if (!currentSessionId) return; + + // Switch to gallery view logic if we were already there + // But this is just data loading + try { + const res = await fetch(`/api/figures?session_id=${currentSessionId}`); + const data = await res.json(); + + const galleryGrid = document.getElementById('galleryContainer'); + if (!data.figures || data.figures.length === 0) { + galleryGrid.innerHTML = ` +No images generated in this session.
+${fig.description || 'No description'}
+ ${fig.analysis ? `