"""性能测试 - 验证系统性能指标。 测试内容: 1. 数据理解阶段性能(< 30秒) 2. 完整分析流程性能(< 30分钟) 3. 大数据集处理(100万行) 4. 内存使用 需求:NFR-1.1, NFR-1.2 """ import pytest import time import pandas as pd import numpy as np import psutil import os from pathlib import Path from typing import Dict, Any from src.main import run_analysis from src.data_access import DataAccessLayer from src.engines.data_understanding import understand_data class TestDataUnderstandingPerformance: """测试数据理解阶段的性能。""" def test_small_dataset_performance(self, tmp_path): """测试小数据集(1000行)的性能。""" # 生成测试数据 data_file = tmp_path / "small_data.csv" df = self._generate_test_data(rows=1000, cols=10) df.to_csv(data_file, index=False) # 测试性能 start_time = time.time() dal = DataAccessLayer.load_from_file(str(data_file)) profile = understand_data(dal) elapsed = time.time() - start_time # 验证:应该在5秒内完成 assert elapsed < 5, f"小数据集理解耗时 {elapsed:.2f}秒,超过5秒限制" assert profile.row_count == 1000 assert profile.column_count == 10 def test_medium_dataset_performance(self, tmp_path): """测试中等数据集(10万行)的性能。""" # 生成测试数据 data_file = tmp_path / "medium_data.csv" df = self._generate_test_data(rows=100000, cols=20) df.to_csv(data_file, index=False) # 测试性能 start_time = time.time() dal = DataAccessLayer.load_from_file(str(data_file)) profile = understand_data(dal) elapsed = time.time() - start_time # 验证:应该在15秒内完成 assert elapsed < 15, f"中等数据集理解耗时 {elapsed:.2f}秒,超过15秒限制" assert profile.row_count == 100000 assert profile.column_count == 20 def test_large_dataset_performance(self, tmp_path): """测试大数据集(100万行)的性能。 需求:NFR-1.1 - 数据理解阶段 < 30秒 需求:NFR-1.2 - 支持最大100万行数据 """ # 生成测试数据 data_file = tmp_path / "large_data.csv" df = self._generate_test_data(rows=1000000, cols=30) df.to_csv(data_file, index=False) # 测试性能 start_time = time.time() dal = DataAccessLayer.load_from_file(str(data_file)) profile = understand_data(dal) elapsed = time.time() - start_time # 验证:应该在30秒内完成 assert elapsed < 30, f"大数据集理解耗时 {elapsed:.2f}秒,超过30秒限制" assert profile.row_count == 1000000 assert profile.column_count == 30 print(f"✓ 大数据集(100万行)理解耗时: {elapsed:.2f}秒") def _generate_test_data(self, rows: int, cols: int) -> pd.DataFrame: """生成测试数据。""" data = {} # 生成不同类型的列 for i in range(cols): col_type = i % 4 if col_type == 0: # 数值列 data[f'numeric_{i}'] = np.random.randn(rows) elif col_type == 1: # 分类列 categories = ['A', 'B', 'C', 'D', 'E'] data[f'category_{i}'] = np.random.choice(categories, rows) elif col_type == 2: # 日期列 start_date = pd.Timestamp('2020-01-01') data[f'date_{i}'] = pd.date_range(start_date, periods=rows, freq='H') else: # 文本列 data[f'text_{i}'] = [f'text_{j}' for j in range(rows)] return pd.DataFrame(data) class TestFullAnalysisPerformance: """测试完整分析流程的性能。""" @pytest.mark.slow def test_small_dataset_full_analysis(self, tmp_path): """测试小数据集的完整分析流程。""" # 生成测试数据 data_file = tmp_path / "test_data.csv" df = self._generate_ticket_data(rows=1000) df.to_csv(data_file, index=False) # 设置输出目录 output_dir = tmp_path / "output" # 测试性能 start_time = time.time() result = run_analysis( data_file=str(data_file), user_requirement="分析工单数据", output_dir=str(output_dir) ) elapsed = time.time() - start_time # 验证:应该在5分钟内完成 assert elapsed < 300, f"小数据集完整分析耗时 {elapsed:.2f}秒,超过5分钟限制" assert result['success'] is True print(f"✓ 小数据集(1000行)完整分析耗时: {elapsed:.2f}秒") @pytest.mark.slow @pytest.mark.skipif( os.getenv('SKIP_LONG_TESTS') == '1', reason="跳过长时间运行的测试" ) def test_large_dataset_full_analysis(self, tmp_path): """测试大数据集的完整分析流程。 需求:NFR-1.1 - 完整分析流程 < 30分钟 """ # 生成测试数据 data_file = tmp_path / "large_test_data.csv" df = 


class TestFullAnalysisPerformance:
    """Performance of the full analysis pipeline."""

    @pytest.mark.slow
    def test_small_dataset_full_analysis(self, tmp_path):
        """Full analysis on a small dataset."""
        # Generate test data
        data_file = tmp_path / "test_data.csv"
        df = self._generate_ticket_data(rows=1000)
        df.to_csv(data_file, index=False)

        # Set up the output directory
        output_dir = tmp_path / "output"

        # Time the run
        start_time = time.time()
        result = run_analysis(
            data_file=str(data_file),
            user_requirement="分析工单数据",
            output_dir=str(output_dir)
        )
        elapsed = time.time() - start_time

        # Should finish within 5 minutes
        assert elapsed < 300, f"Full analysis of small dataset took {elapsed:.2f}s, over the 5-minute limit"
        assert result['success'] is True
        print(f"✓ Full analysis of small dataset (1,000 rows) took {elapsed:.2f}s")

    @pytest.mark.slow
    @pytest.mark.skipif(
        os.getenv('SKIP_LONG_TESTS') == '1',
        reason="Skipping long-running tests"
    )
    def test_large_dataset_full_analysis(self, tmp_path):
        """Full analysis on a large dataset.

        Requirement: NFR-1.1 - full analysis pipeline < 30 minutes
        """
        # Generate test data
        data_file = tmp_path / "large_test_data.csv"
        df = self._generate_ticket_data(rows=100000)
        df.to_csv(data_file, index=False)

        # Set up the output directory
        output_dir = tmp_path / "output"

        # Time the run
        start_time = time.time()
        result = run_analysis(
            data_file=str(data_file),
            user_requirement="分析工单健康度",
            output_dir=str(output_dir)
        )
        elapsed = time.time() - start_time

        # Should finish within 30 minutes
        assert elapsed < 1800, f"Full analysis of large dataset took {elapsed:.2f}s, over the 30-minute limit"
        assert result['success'] is True
        print(f"✓ Full analysis of large dataset (100,000 rows) took {elapsed:.2f}s")

    def _generate_ticket_data(self, rows: int) -> pd.DataFrame:
        """Generate ticket-shaped test data."""
        statuses = ['待处理', '处理中', '已关闭', '已解决']
        priorities = ['低', '中', '高', '紧急']
        types = ['故障', '咨询', '投诉', '建议']
        models = ['Model A', 'Model B', 'Model C', 'Model D']

        data = {
            'ticket_id': [f'T{i:06d}' for i in range(rows)],
            'status': np.random.choice(statuses, rows),
            'priority': np.random.choice(priorities, rows),
            'type': np.random.choice(types, rows),
            'model': np.random.choice(models, rows),
            'created_at': pd.date_range('2023-01-01', periods=rows, freq='5min'),
            'closed_at': pd.date_range('2023-01-01', periods=rows, freq='5min') + pd.Timedelta(hours=24),
            'duration_hours': np.random.randint(1, 100, rows),
        }
        return pd.DataFrame(data)


class TestMemoryUsage:
    """Memory usage tests."""

    def test_data_loading_memory(self, tmp_path):
        """Memory cost of loading data."""
        # Generate test data
        data_file = tmp_path / "memory_test.csv"
        df = self._generate_test_data(rows=100000, cols=50)
        df.to_csv(data_file, index=False)

        # Record the baseline memory
        process = psutil.Process()
        initial_memory = process.memory_info().rss / 1024 / 1024  # MB

        # Load the data
        dal = DataAccessLayer.load_from_file(str(data_file))
        profile = understand_data(dal)

        # Record the final memory
        final_memory = process.memory_info().rss / 1024 / 1024  # MB
        memory_increase = final_memory - initial_memory

        # Growth should stay reasonable (under 500 MB)
        assert memory_increase < 500, f"Memory grew by {memory_increase:.2f} MB, over the 500 MB limit"
        print(f"✓ Memory growth while loading data: {memory_increase:.2f} MB")

    def test_large_dataset_memory(self, tmp_path):
        """Memory usage on a large dataset.

        Requirement: NFR-1.2 - support CSV files up to 100 MB
        """
        # Generate roughly 100 MB of test data
        data_file = tmp_path / "large_memory_test.csv"
        df = self._generate_test_data(rows=500000, cols=50)
        df.to_csv(data_file, index=False)

        # Check the file size
        file_size = os.path.getsize(data_file) / 1024 / 1024  # MB
        print(f"Test file size: {file_size:.2f} MB")

        # Record the baseline memory
        process = psutil.Process()
        initial_memory = process.memory_info().rss / 1024 / 1024  # MB

        # Load the data
        dal = DataAccessLayer.load_from_file(str(data_file))
        profile = understand_data(dal)

        # Record the final memory
        final_memory = process.memory_info().rss / 1024 / 1024  # MB
        memory_increase = final_memory - initial_memory

        # Growth should stay reasonable (under 1 GB)
        assert memory_increase < 1024, f"Memory grew by {memory_increase:.2f} MB, over the 1 GB limit"
        print(f"✓ Memory growth on large dataset: {memory_increase:.2f} MB")

    def _generate_test_data(self, rows: int, cols: int) -> pd.DataFrame:
        """Generate test data with a mix of column types."""
        data = {}
        for i in range(cols):
            col_type = i % 4
            if col_type == 0:
                data[f'col_{i}'] = np.random.randn(rows)
            elif col_type == 1:
                data[f'col_{i}'] = np.random.choice(['A', 'B', 'C', 'D'], rows)
            elif col_type == 2:
                data[f'col_{i}'] = pd.date_range('2020-01-01', periods=rows, freq='h')
            else:
                data[f'col_{i}'] = [f'text_{j % 1000}' for j in range(rows)]
        return pd.DataFrame(data)
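

# Caveat on the RSS deltas above: they are noisy, since the allocator rarely
# returns freed pages to the OS and garbage from earlier tests can inflate
# the baseline. A slightly more stable measurement (a hypothetical helper,
# not used by the tests above) forces a garbage-collection pass before each
# reading:
import gc


def measure_rss_increase_mb(fn):
    """Run fn() and return (result, approximate RSS growth in MB)."""
    process = psutil.Process()
    gc.collect()  # settle the baseline before reading RSS
    before = process.memory_info().rss
    result = fn()
    gc.collect()  # drop temporaries created inside fn before re-reading
    after = process.memory_info().rss
    return result, (after - before) / 1024 / 1024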


class TestStagePerformance:
    """Per-stage performance targets."""

    def test_data_understanding_stage(self, tmp_path):
        """Performance of the data understanding stage."""
        # Generate test data
        data_file = tmp_path / "stage_test.csv"
        df = self._generate_test_data(rows=50000, cols=30)
        df.to_csv(data_file, index=False)

        # Time the run
        start_time = time.time()
        dal = DataAccessLayer.load_from_file(str(data_file))
        profile = understand_data(dal)
        elapsed = time.time() - start_time

        # Should finish within 20 seconds
        assert elapsed < 20, f"Data understanding stage took {elapsed:.2f}s, over the 20s limit"
        print(f"✓ Data understanding stage (50,000 rows) took {elapsed:.2f}s")

    def _generate_test_data(self, rows: int, cols: int) -> pd.DataFrame:
        """Generate test data with a mix of column types."""
        data = {}
        for i in range(cols):
            if i % 3 == 0:
                data[f'col_{i}'] = np.random.randn(rows)
            elif i % 3 == 1:
                data[f'col_{i}'] = np.random.choice(['A', 'B', 'C'], rows)
            else:
                data[f'col_{i}'] = pd.date_range('2020-01-01', periods=rows, freq='min')
        return pd.DataFrame(data)


@pytest.fixture
def performance_report(tmp_path):
    """Collect and echo a performance test report."""
    report_file = tmp_path / "performance_report.txt"
    yield report_file
    # After the test, print the report if one was written
    if report_file.exists():
        print("\n" + "=" * 60)
        print("Performance Test Report")
        print("=" * 60)
        print(report_file.read_text())
        print("=" * 60)


class TestOptimizationEffectiveness:
    """Effectiveness of the performance optimizations."""

    def test_memory_optimization(self, tmp_path):
        """Effect of memory optimization on a loaded DataFrame."""
        # Generate test data
        data_file = tmp_path / "optimization_test.csv"
        df = self._generate_test_data(rows=100000, cols=30)
        df.to_csv(data_file, index=False)

        # Load without memory optimization
        dal_no_opt = DataAccessLayer.load_from_file(str(data_file), optimize_memory=False)
        memory_no_opt = dal_no_opt._data.memory_usage(deep=True).sum() / 1024 / 1024

        # Load with memory optimization
        dal_opt = DataAccessLayer.load_from_file(str(data_file), optimize_memory=True)
        memory_opt = dal_opt._data.memory_usage(deep=True).sum() / 1024 / 1024

        # Optimization should reduce memory usage
        memory_saved = memory_no_opt - memory_opt
        savings_percent = (memory_saved / memory_no_opt) * 100
        print(f"✓ Memory optimization: {memory_no_opt:.2f} MB -> {memory_opt:.2f} MB")
        print(f"✓ Memory saved: {memory_saved:.2f} MB ({savings_percent:.1f}%)")
        assert memory_saved > 0, "Memory optimization should reduce memory usage"

    def test_cache_effectiveness(self, tmp_path):
        """The LLM cache returns stored responses."""
        from src.performance_optimization import LLMCache

        cache_dir = tmp_path / "cache"
        cache = LLMCache(str(cache_dir))

        # First call: not yet cached
        prompt = "测试提示"
        response = {"result": "测试响应"}

        # Populate the cache
        cache.set(prompt, response)

        # Second call: should hit the cache
        cached_response = cache.get(prompt)
        assert cached_response is not None
        assert cached_response == response
        print("✓ Cache works as expected")

    def test_batch_processing(self):
        """Batch processing produces correct results."""
        from src.performance_optimization import BatchProcessor

        processor = BatchProcessor(batch_size=10)

        # Test data
        items = list(range(100))

        # Per-item processing function
        def process_item(item):
            return item * 2

        # Run the batch job
        start_time = time.time()
        results = processor.process_batch(items, process_item)
        elapsed = time.time() - start_time

        # Verify the results
        assert len(results) == 100
        assert results[0] == 0
        assert results[50] == 100
        print(f"✓ Batch-processed 100 items in {elapsed:.3f}s")

    def _generate_test_data(self, rows: int, cols: int) -> pd.DataFrame:
        """Generate test data with a mix of column types."""
        data = {}
        for i in range(cols):
            if i % 3 == 0:
                data[f'col_{i}'] = np.random.randint(0, 100, rows)
            elif i % 3 == 1:
                data[f'col_{i}'] = np.random.choice(['A', 'B', 'C', 'D'], rows)
            else:
                data[f'col_{i}'] = [f'text_{j % 100}' for j in range(rows)]
        return pd.DataFrame(data)


class TestPerformanceMonitoring:
    """Tests for the performance monitoring utilities."""

    def test_performance_monitor(self):
        """The performance monitor aggregates recorded metrics."""
        from src.performance_optimization import PerformanceMonitor

        monitor = PerformanceMonitor()

        # Record a few samples
        monitor.record("test_metric", 1.5)
        monitor.record("test_metric", 2.0)
        monitor.record("test_metric", 1.8)

        # Fetch the aggregated statistics
        stats = monitor.get_stats("test_metric")
        assert stats['count'] == 3
        assert stats['mean'] == pytest.approx(1.767, rel=0.01)
        assert stats['min'] == 1.5
        assert stats['max'] == 2.0
        print("✓ Performance monitor works as expected")

    def test_timed_decorator(self):
        """The @timed decorator records function runtime."""
        from src.performance_optimization import timed, PerformanceMonitor

        monitor = PerformanceMonitor()

        @timed(metric_name="test_function", monitor=monitor)
        def slow_function():
            time.sleep(0.1)
            return "done"

        # Call the function
        result = slow_function()
        assert result == "done"

        # The runtime should have been recorded
        stats = monitor.get_stats("test_function")
        assert stats['count'] == 1
        assert stats['mean'] >= 0.1
        print("✓ @timed decorator works as expected")
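

# For reference: the contract the monitoring tests above assume of
# src.performance_optimization could be met by something as small as the
# sketch below. These are illustrative stand-ins (note the _sketch names),
# not the project's actual implementation.
import functools
from collections import defaultdict


class _SketchPerformanceMonitor:
    """Minimal monitor satisfying the record/get_stats contract used above."""

    def __init__(self):
        self._samples = defaultdict(list)

    def record(self, name, value):
        self._samples[name].append(value)

    def get_stats(self, name):
        values = self._samples.get(name)
        if not values:
            return None
        return {
            'count': len(values),
            'mean': sum(values) / len(values),
            'min': min(values),
            'max': max(values),
        }


def _sketch_timed(metric_name, monitor):
    """Decorator recording wall-clock runtime into the given monitor."""
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            start = time.perf_counter()
            try:
                return fn(*args, **kwargs)
            finally:
                monitor.record(metric_name, time.perf_counter() - start)
        return wrapper
    return decorator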


class TestEndToEndPerformance:
    """End-to-end performance tests."""

    def test_performance_report_generation(self, tmp_path):
        """Per-stage metrics are collected during data understanding."""
        from src.performance_optimization import get_global_monitor

        # Generate test data
        data_file = tmp_path / "e2e_test.csv"
        df = self._generate_ticket_data(rows=5000)
        df.to_csv(data_file, index=False)

        # Reset the global performance monitor
        monitor = get_global_monitor()
        monitor.clear()

        # Run data understanding
        dal = DataAccessLayer.load_from_file(str(data_file))
        profile = understand_data(dal)

        # Dump the collected statistics
        stats = monitor.get_all_stats()
        print("\nPerformance statistics:")
        for metric_name, metric_stats in stats.items():
            if metric_stats:
                print(f"  {metric_name}: {metric_stats['mean']:.3f}s")

        assert profile is not None

    def _generate_ticket_data(self, rows: int) -> pd.DataFrame:
        """Generate ticket-shaped test data."""
        statuses = ['待处理', '处理中', '已关闭']
        types = ['故障', '咨询', '投诉']
        data = {
            'ticket_id': [f'T{i:06d}' for i in range(rows)],
            'status': np.random.choice(statuses, rows),
            'type': np.random.choice(types, rows),
            'created_at': pd.date_range('2023-01-01', periods=rows, freq='5min'),
            'duration': np.random.randint(1, 100, rows),
        }
        return pd.DataFrame(data)


class TestPerformanceBenchmarks:
    """Performance benchmarks."""

    def test_data_loading_benchmark(self, tmp_path, benchmark_report):
        """Benchmark data loading across dataset sizes."""
        sizes = [1000, 10000, 100000]
        results = []

        for size in sizes:
            data_file = tmp_path / f"benchmark_{size}.csv"
            df = self._generate_test_data(rows=size, cols=20)
            df.to_csv(data_file, index=False)

            start_time = time.time()
            dal = DataAccessLayer.load_from_file(str(data_file))
            elapsed = time.time() - start_time

            results.append({
                'size': size,
                'time': elapsed,
                'rows_per_second': size / elapsed
            })

        # Print the benchmark results
        print("\nData loading benchmark:")
        print(f"{'Rows':<10} {'Time (s)':<12} {'Rows/s':<15}")
        print("-" * 40)
        for r in results:
            print(f"{r['size']:<10} {r['time']:<12.3f} {r['rows_per_second']:<15.0f}")

    def test_data_understanding_benchmark(self, tmp_path):
        """Benchmark data understanding across dataset sizes."""
        sizes = [1000, 10000, 50000]
        results = []

        for size in sizes:
            data_file = tmp_path / f"understanding_{size}.csv"
            df = self._generate_test_data(rows=size, cols=20)
            df.to_csv(data_file, index=False)

            dal = DataAccessLayer.load_from_file(str(data_file))

            start_time = time.time()
            profile = understand_data(dal)
            elapsed = time.time() - start_time

            results.append({
                'size': size,
                'time': elapsed,
                'rows_per_second': size / elapsed
            })

        # Print the benchmark results
        print("\nData understanding benchmark:")
        print(f"{'Rows':<10} {'Time (s)':<12} {'Rows/s':<15}")
        print("-" * 40)
        for r in results:
            print(f"{r['size']:<10} {r['time']:<12.3f} {r['rows_per_second']:<15.0f}")

    def _generate_test_data(self, rows: int, cols: int) -> pd.DataFrame:
        """Generate test data with a mix of column types."""
        data = {}
        for i in range(cols):
            if i % 3 == 0:
                data[f'col_{i}'] = np.random.randn(rows)
            elif i % 3 == 1:
                data[f'col_{i}'] = np.random.choice(['A', 'B', 'C'], rows)
            else:
                data[f'col_{i}'] = pd.date_range('2020-01-01', periods=rows, freq='min')
        return pd.DataFrame(data)


@pytest.fixture
def benchmark_report():
    """Benchmark report fixture."""
    yield
    # A report file could be generated here after the benchmarks run
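

# Running this suite: the custom `slow` marker used above should be registered
# so pytest does not warn about unknown marks, e.g. in pytest.ini (the file
# path and exact config style are project assumptions):
#
#     [pytest]
#     markers =
#         slow: long-running performance tests
#
# Typical invocations:
#     pytest tests/test_performance.py -s              # -s shows the printed reports
#     pytest tests/test_performance.py -m "not slow"   # skip the long full-pipeline runs
#     SKIP_LONG_TESTS=1 pytest tests/test_performance.py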