"""性能测试 - 验证系统性能指标。

测试内容:
1. 数据理解阶段性能(< 30秒)
2. 完整分析流程性能(< 30分钟)
3. 大数据集处理(100万行)
4. 内存使用

需求:NFR-1.1, NFR-1.2
"""

import os
import time
from pathlib import Path
from typing import Any, Dict

import numpy as np
import pandas as pd
import psutil
import pytest

from src.data_access import DataAccessLayer
from src.engines.data_understanding import understand_data
from src.main import run_analysis


class TestDataUnderstandingPerformance:
|
||
"""测试数据理解阶段的性能。"""
|
||
|
||
def test_small_dataset_performance(self, tmp_path):
|
||
"""测试小数据集(1000行)的性能。"""
|
||
# 生成测试数据
|
||
data_file = tmp_path / "small_data.csv"
|
||
df = self._generate_test_data(rows=1000, cols=10)
|
||
df.to_csv(data_file, index=False)
|
||
|
||
# 测试性能
|
||
start_time = time.time()
|
||
dal = DataAccessLayer.load_from_file(str(data_file))
|
||
profile = understand_data(dal)
|
||
elapsed = time.time() - start_time
|
||
|
||
# 验证:应该在5秒内完成
|
||
assert elapsed < 5, f"小数据集理解耗时 {elapsed:.2f}秒,超过5秒限制"
|
||
assert profile.row_count == 1000
|
||
assert profile.column_count == 10
|
||
|
||
def test_medium_dataset_performance(self, tmp_path):
|
||
"""测试中等数据集(10万行)的性能。"""
|
||
# 生成测试数据
|
||
data_file = tmp_path / "medium_data.csv"
|
||
df = self._generate_test_data(rows=100000, cols=20)
|
||
df.to_csv(data_file, index=False)
|
||
|
||
# 测试性能
|
||
start_time = time.time()
|
||
dal = DataAccessLayer.load_from_file(str(data_file))
|
||
profile = understand_data(dal)
|
||
elapsed = time.time() - start_time
|
||
|
||
# 验证:应该在15秒内完成
|
||
assert elapsed < 15, f"中等数据集理解耗时 {elapsed:.2f}秒,超过15秒限制"
|
||
assert profile.row_count == 100000
|
||
assert profile.column_count == 20
|
||
|
||
def test_large_dataset_performance(self, tmp_path):
|
||
"""测试大数据集(100万行)的性能。
|
||
|
||
需求:NFR-1.1 - 数据理解阶段 < 30秒
|
||
需求:NFR-1.2 - 支持最大100万行数据
|
||
"""
|
||
# 生成测试数据
|
||
data_file = tmp_path / "large_data.csv"
|
||
df = self._generate_test_data(rows=1000000, cols=30)
|
||
df.to_csv(data_file, index=False)
|
||
|
||
# 测试性能
|
||
start_time = time.time()
|
||
dal = DataAccessLayer.load_from_file(str(data_file))
|
||
profile = understand_data(dal)
|
||
elapsed = time.time() - start_time
|
||
|
||
# 验证:应该在30秒内完成
|
||
assert elapsed < 30, f"大数据集理解耗时 {elapsed:.2f}秒,超过30秒限制"
|
||
assert profile.row_count == 1000000
|
||
assert profile.column_count == 30
|
||
|
||
print(f"✓ 大数据集(100万行)理解耗时: {elapsed:.2f}秒")
|
||
|
||
def _generate_test_data(self, rows: int, cols: int) -> pd.DataFrame:
|
||
"""生成测试数据。"""
|
||
data = {}
|
||
|
||
# 生成不同类型的列
|
||
for i in range(cols):
|
||
col_type = i % 4
|
||
|
||
if col_type == 0: # 数值列
|
||
data[f'numeric_{i}'] = np.random.randn(rows)
|
||
elif col_type == 1: # 分类列
|
||
categories = ['A', 'B', 'C', 'D', 'E']
|
||
data[f'category_{i}'] = np.random.choice(categories, rows)
|
||
elif col_type == 2: # 日期列
|
||
start_date = pd.Timestamp('2020-01-01')
|
||
data[f'date_{i}'] = pd.date_range(start_date, periods=rows, freq='H')
|
||
else: # 文本列
|
||
data[f'text_{i}'] = [f'text_{j}' for j in range(rows)]
|
||
|
||
return pd.DataFrame(data)
|
||
|
||
|
||
class TestFullAnalysisPerformance:
    """End-to-end timing tests for the complete analysis pipeline."""

    @pytest.mark.slow
    def test_small_dataset_full_analysis(self, tmp_path):
        """A 1,000-row dataset must finish the full pipeline within 5 minutes."""
        # Prepare the input CSV.
        source = tmp_path / "test_data.csv"
        self._generate_ticket_data(rows=1000).to_csv(source, index=False)

        # Where the pipeline writes its artifacts.
        out_dir = tmp_path / "output"

        # Time the whole run.
        t0 = time.time()
        result = run_analysis(
            data_file=str(source),
            user_requirement="分析工单数据",
            output_dir=str(out_dir)
        )
        elapsed = time.time() - t0

        # Hard limit: 5 minutes.
        assert elapsed < 300, f"小数据集完整分析耗时 {elapsed:.2f}秒,超过5分钟限制"
        assert result['success'] is True

        print(f"✓ 小数据集(1000行)完整分析耗时: {elapsed:.2f}秒")

    @pytest.mark.slow
    @pytest.mark.skipif(
        os.getenv('SKIP_LONG_TESTS') == '1',
        reason="跳过长时间运行的测试"
    )
    def test_large_dataset_full_analysis(self, tmp_path):
        """A 100k-row dataset must finish the full pipeline within 30 minutes.

        Requirement: NFR-1.1 - complete analysis < 30 minutes.
        """
        # Prepare the input CSV.
        source = tmp_path / "large_test_data.csv"
        self._generate_ticket_data(rows=100000).to_csv(source, index=False)

        # Where the pipeline writes its artifacts.
        out_dir = tmp_path / "output"

        # Time the whole run.
        t0 = time.time()
        result = run_analysis(
            data_file=str(source),
            user_requirement="分析工单健康度",
            output_dir=str(out_dir)
        )
        elapsed = time.time() - t0

        # Hard limit: 30 minutes.
        assert elapsed < 1800, f"大数据集完整分析耗时 {elapsed:.2f}秒,超过30分钟限制"
        assert result['success'] is True

        print(f"✓ 大数据集(10万行)完整分析耗时: {elapsed:.2f}秒")

    def _generate_ticket_data(self, rows: int) -> pd.DataFrame:
        """Synthesize a ticket table with ids, categorical fields and timestamps."""
        opened = pd.date_range('2023-01-01', periods=rows, freq='5min')
        return pd.DataFrame({
            'ticket_id': [f'T{i:06d}' for i in range(rows)],
            'status': np.random.choice(['待处理', '处理中', '已关闭', '已解决'], rows),
            'priority': np.random.choice(['低', '中', '高', '紧急'], rows),
            'type': np.random.choice(['故障', '咨询', '投诉', '建议'], rows),
            'model': np.random.choice(['Model A', 'Model B', 'Model C', 'Model D'], rows),
            'created_at': opened,
            'closed_at': opened + pd.Timedelta(hours=24),
            'duration_hours': np.random.randint(1, 100, rows),
        })


class TestMemoryUsage:
|
||
"""测试内存使用。"""
|
||
|
||
def test_data_loading_memory(self, tmp_path):
|
||
"""测试数据加载的内存使用。"""
|
||
# 生成测试数据
|
||
data_file = tmp_path / "memory_test.csv"
|
||
df = self._generate_test_data(rows=100000, cols=50)
|
||
df.to_csv(data_file, index=False)
|
||
|
||
# 记录初始内存
|
||
process = psutil.Process()
|
||
initial_memory = process.memory_info().rss / 1024 / 1024 # MB
|
||
|
||
# 加载数据
|
||
dal = DataAccessLayer.load_from_file(str(data_file))
|
||
profile = understand_data(dal)
|
||
|
||
# 记录最终内存
|
||
final_memory = process.memory_info().rss / 1024 / 1024 # MB
|
||
memory_increase = final_memory - initial_memory
|
||
|
||
# 验证:内存增长应该合理(不超过500MB)
|
||
assert memory_increase < 500, f"内存增长 {memory_increase:.2f}MB,超过500MB限制"
|
||
|
||
print(f"✓ 数据加载内存增长: {memory_increase:.2f}MB")
|
||
|
||
def test_large_dataset_memory(self, tmp_path):
|
||
"""测试大数据集的内存使用。
|
||
|
||
需求:NFR-1.2 - 支持最大100MB的CSV文件
|
||
"""
|
||
# 生成测试数据(约100MB)
|
||
data_file = tmp_path / "large_memory_test.csv"
|
||
df = self._generate_test_data(rows=500000, cols=50)
|
||
df.to_csv(data_file, index=False)
|
||
|
||
# 检查文件大小
|
||
file_size = os.path.getsize(data_file) / 1024 / 1024 # MB
|
||
print(f"测试文件大小: {file_size:.2f}MB")
|
||
|
||
# 记录初始内存
|
||
process = psutil.Process()
|
||
initial_memory = process.memory_info().rss / 1024 / 1024 # MB
|
||
|
||
# 加载数据
|
||
dal = DataAccessLayer.load_from_file(str(data_file))
|
||
profile = understand_data(dal)
|
||
|
||
# 记录最终内存
|
||
final_memory = process.memory_info().rss / 1024 / 1024 # MB
|
||
memory_increase = final_memory - initial_memory
|
||
|
||
# 验证:内存增长应该合理(不超过1GB)
|
||
assert memory_increase < 1024, f"内存增长 {memory_increase:.2f}MB,超过1GB限制"
|
||
|
||
print(f"✓ 大数据集内存增长: {memory_increase:.2f}MB")
|
||
|
||
def _generate_test_data(self, rows: int, cols: int) -> pd.DataFrame:
|
||
"""生成测试数据。"""
|
||
data = {}
|
||
|
||
for i in range(cols):
|
||
col_type = i % 4
|
||
|
||
if col_type == 0:
|
||
data[f'col_{i}'] = np.random.randn(rows)
|
||
elif col_type == 1:
|
||
data[f'col_{i}'] = np.random.choice(['A', 'B', 'C', 'D'], rows)
|
||
elif col_type == 2:
|
||
data[f'col_{i}'] = pd.date_range('2020-01-01', periods=rows, freq='H')
|
||
else:
|
||
data[f'col_{i}'] = [f'text_{j % 1000}' for j in range(rows)]
|
||
|
||
return pd.DataFrame(data)
|
||
|
||
|
||
class TestStagePerformance:
|
||
"""测试各阶段的性能指标。"""
|
||
|
||
def test_data_understanding_stage(self, tmp_path):
|
||
"""测试数据理解阶段的性能。"""
|
||
# 生成测试数据
|
||
data_file = tmp_path / "stage_test.csv"
|
||
df = self._generate_test_data(rows=50000, cols=30)
|
||
df.to_csv(data_file, index=False)
|
||
|
||
# 测试性能
|
||
start_time = time.time()
|
||
dal = DataAccessLayer.load_from_file(str(data_file))
|
||
profile = understand_data(dal)
|
||
elapsed = time.time() - start_time
|
||
|
||
# 验证:应该在20秒内完成
|
||
assert elapsed < 20, f"数据理解阶段耗时 {elapsed:.2f}秒,超过20秒限制"
|
||
|
||
print(f"✓ 数据理解阶段(5万行)耗时: {elapsed:.2f}秒")
|
||
|
||
def _generate_test_data(self, rows: int, cols: int) -> pd.DataFrame:
|
||
"""生成测试数据。"""
|
||
data = {}
|
||
|
||
for i in range(cols):
|
||
if i % 3 == 0:
|
||
data[f'col_{i}'] = np.random.randn(rows)
|
||
elif i % 3 == 1:
|
||
data[f'col_{i}'] = np.random.choice(['A', 'B', 'C'], rows)
|
||
else:
|
||
data[f'col_{i}'] = pd.date_range('2020-01-01', periods=rows, freq='min')
|
||
|
||
return pd.DataFrame(data)
|
||
|
||
|
||
@pytest.fixture
def performance_report(tmp_path):
    """Yield a path for a performance report; dump its contents on teardown."""
    report_path = tmp_path / "performance_report.txt"

    yield report_path

    # Teardown: if the test wrote a report, echo it to stdout.
    if report_path.exists():
        banner = "=" * 60
        print("\n" + banner)
        print("性能测试报告")
        print(banner)
        print(report_path.read_text())
        print(banner)


class TestOptimizationEffectiveness:
|
||
"""测试性能优化的有效性。"""
|
||
|
||
def test_memory_optimization(self, tmp_path):
|
||
"""测试内存优化的效果。"""
|
||
# 生成测试数据
|
||
data_file = tmp_path / "optimization_test.csv"
|
||
df = self._generate_test_data(rows=100000, cols=30)
|
||
df.to_csv(data_file, index=False)
|
||
|
||
# 不优化内存
|
||
dal_no_opt = DataAccessLayer.load_from_file(str(data_file), optimize_memory=False)
|
||
memory_no_opt = dal_no_opt._data.memory_usage(deep=True).sum() / 1024 / 1024
|
||
|
||
# 优化内存
|
||
dal_opt = DataAccessLayer.load_from_file(str(data_file), optimize_memory=True)
|
||
memory_opt = dal_opt._data.memory_usage(deep=True).sum() / 1024 / 1024
|
||
|
||
# 验证:优化后内存应该减少
|
||
memory_saved = memory_no_opt - memory_opt
|
||
savings_percent = (memory_saved / memory_no_opt) * 100
|
||
|
||
print(f"✓ 内存优化效果: {memory_no_opt:.2f}MB -> {memory_opt:.2f}MB")
|
||
print(f"✓ 节省内存: {memory_saved:.2f}MB ({savings_percent:.1f}%)")
|
||
|
||
# 验证:至少节省10%的内存
|
||
assert memory_saved > 0, "内存优化应该减少内存使用"
|
||
|
||
def test_cache_effectiveness(self, tmp_path):
|
||
"""测试缓存的有效性。"""
|
||
from src.performance_optimization import LLMCache
|
||
|
||
cache_dir = tmp_path / "cache"
|
||
cache = LLMCache(str(cache_dir))
|
||
|
||
# 第一次调用(未缓存)
|
||
prompt = "测试提示"
|
||
response = {"result": "测试响应"}
|
||
|
||
# 设置缓存
|
||
cache.set(prompt, response)
|
||
|
||
# 第二次调用(应该命中缓存)
|
||
cached_response = cache.get(prompt)
|
||
|
||
assert cached_response is not None
|
||
assert cached_response == response
|
||
|
||
print("✓ 缓存功能正常工作")
|
||
|
||
def test_batch_processing(self):
|
||
"""测试批处理的效果。"""
|
||
from src.performance_optimization import BatchProcessor
|
||
|
||
processor = BatchProcessor(batch_size=10)
|
||
|
||
# 测试数据
|
||
items = list(range(100))
|
||
|
||
# 批处理函数
|
||
def process_item(item):
|
||
return item * 2
|
||
|
||
# 执行批处理
|
||
start_time = time.time()
|
||
results = processor.process_batch(items, process_item)
|
||
elapsed = time.time() - start_time
|
||
|
||
# 验证结果
|
||
assert len(results) == 100
|
||
assert results[0] == 0
|
||
assert results[50] == 100
|
||
|
||
print(f"✓ 批处理100个项目耗时: {elapsed:.3f}秒")
|
||
|
||
def _generate_test_data(self, rows: int, cols: int) -> pd.DataFrame:
|
||
"""生成测试数据。"""
|
||
data = {}
|
||
|
||
for i in range(cols):
|
||
if i % 3 == 0:
|
||
data[f'col_{i}'] = np.random.randint(0, 100, rows)
|
||
elif i % 3 == 1:
|
||
data[f'col_{i}'] = np.random.choice(['A', 'B', 'C', 'D'], rows)
|
||
else:
|
||
data[f'col_{i}'] = [f'text_{j % 100}' for j in range(rows)]
|
||
|
||
return pd.DataFrame(data)
|
||
|
||
|
||
class TestPerformanceMonitoring:
    """Exercises the performance-monitoring utilities."""

    def test_performance_monitor(self):
        """Recorded samples must be summarized with count/mean/min/max."""
        from src.performance_optimization import PerformanceMonitor

        monitor = PerformanceMonitor()

        # Feed a few samples into one metric.
        for sample in (1.5, 2.0, 1.8):
            monitor.record("test_metric", sample)

        # Pull the aggregated statistics back out.
        summary = monitor.get_stats("test_metric")

        assert summary['count'] == 3
        assert summary['mean'] == pytest.approx(1.767, rel=0.01)
        assert summary['min'] == 1.5
        assert summary['max'] == 2.0

        print("✓ 性能监控器正常工作")

    def test_timed_decorator(self):
        """@timed must record a duration at least as long as the call itself."""
        from src.performance_optimization import timed, PerformanceMonitor

        monitor = PerformanceMonitor()

        @timed(metric_name="test_function", monitor=monitor)
        def slow_function():
            time.sleep(0.1)
            return "done"

        # Invoke the decorated function.
        outcome = slow_function()

        assert outcome == "done"

        # The decorator should have logged exactly one sample >= the sleep time.
        recorded = monitor.get_stats("test_function")
        assert recorded['count'] == 1
        assert recorded['mean'] >= 0.1

        print("✓ 计时装饰器正常工作")


class TestEndToEndPerformance:
|
||
"""端到端性能测试。"""
|
||
|
||
def test_performance_report_generation(self, tmp_path):
|
||
"""测试性能报告生成。"""
|
||
from src.performance_optimization import get_global_monitor
|
||
|
||
# 生成测试数据
|
||
data_file = tmp_path / "e2e_test.csv"
|
||
df = self._generate_ticket_data(rows=5000)
|
||
df.to_csv(data_file, index=False)
|
||
|
||
# 获取性能监控器
|
||
monitor = get_global_monitor()
|
||
monitor.clear()
|
||
|
||
# 执行数据理解
|
||
dal = DataAccessLayer.load_from_file(str(data_file))
|
||
profile = understand_data(dal)
|
||
|
||
# 获取性能统计
|
||
stats = monitor.get_all_stats()
|
||
|
||
print("\n性能统计:")
|
||
for metric_name, metric_stats in stats.items():
|
||
if metric_stats:
|
||
print(f" {metric_name}: {metric_stats['mean']:.3f}秒")
|
||
|
||
assert profile is not None
|
||
|
||
def _generate_ticket_data(self, rows: int) -> pd.DataFrame:
|
||
"""生成工单测试数据。"""
|
||
statuses = ['待处理', '处理中', '已关闭']
|
||
types = ['故障', '咨询', '投诉']
|
||
|
||
data = {
|
||
'ticket_id': [f'T{i:06d}' for i in range(rows)],
|
||
'status': np.random.choice(statuses, rows),
|
||
'type': np.random.choice(types, rows),
|
||
'created_at': pd.date_range('2023-01-01', periods=rows, freq='5min'),
|
||
'duration': np.random.randint(1, 100, rows),
|
||
}
|
||
|
||
return pd.DataFrame(data)
|
||
|
||
|
||
class TestPerformanceBenchmarks:
|
||
"""性能基准测试。"""
|
||
|
||
def test_data_loading_benchmark(self, tmp_path, benchmark_report):
|
||
"""数据加载性能基准。"""
|
||
sizes = [1000, 10000, 100000]
|
||
results = []
|
||
|
||
for size in sizes:
|
||
data_file = tmp_path / f"benchmark_{size}.csv"
|
||
df = self._generate_test_data(rows=size, cols=20)
|
||
df.to_csv(data_file, index=False)
|
||
|
||
start_time = time.time()
|
||
dal = DataAccessLayer.load_from_file(str(data_file))
|
||
elapsed = time.time() - start_time
|
||
|
||
results.append({
|
||
'size': size,
|
||
'time': elapsed,
|
||
'rows_per_second': size / elapsed
|
||
})
|
||
|
||
# 打印基准结果
|
||
print("\n数据加载性能基准:")
|
||
print(f"{'行数':<10} {'耗时(秒)':<12} {'行/秒':<15}")
|
||
print("-" * 40)
|
||
for r in results:
|
||
print(f"{r['size']:<10} {r['time']:<12.3f} {r['rows_per_second']:<15.0f}")
|
||
|
||
def test_data_understanding_benchmark(self, tmp_path):
|
||
"""数据理解性能基准。"""
|
||
sizes = [1000, 10000, 50000]
|
||
results = []
|
||
|
||
for size in sizes:
|
||
data_file = tmp_path / f"understanding_{size}.csv"
|
||
df = self._generate_test_data(rows=size, cols=20)
|
||
df.to_csv(data_file, index=False)
|
||
|
||
dal = DataAccessLayer.load_from_file(str(data_file))
|
||
|
||
start_time = time.time()
|
||
profile = understand_data(dal)
|
||
elapsed = time.time() - start_time
|
||
|
||
results.append({
|
||
'size': size,
|
||
'time': elapsed,
|
||
'rows_per_second': size / elapsed
|
||
})
|
||
|
||
# 打印基准结果
|
||
print("\n数据理解性能基准:")
|
||
print(f"{'行数':<10} {'耗时(秒)':<12} {'行/秒':<15}")
|
||
print("-" * 40)
|
||
for r in results:
|
||
print(f"{r['size']:<10} {r['time']:<12.3f} {r['rows_per_second']:<15.0f}")
|
||
|
||
def _generate_test_data(self, rows: int, cols: int) -> pd.DataFrame:
|
||
"""生成测试数据。"""
|
||
data = {}
|
||
|
||
for i in range(cols):
|
||
if i % 3 == 0:
|
||
data[f'col_{i}'] = np.random.randn(rows)
|
||
elif i % 3 == 1:
|
||
data[f'col_{i}'] = np.random.choice(['A', 'B', 'C'], rows)
|
||
else:
|
||
data[f'col_{i}'] = pd.date_range('2020-01-01', periods=rows, freq='min')
|
||
|
||
return pd.DataFrame(data)
|
||
|
||
|
||
@pytest.fixture
def benchmark_report():
    """Placeholder fixture for benchmark reporting."""
    yield None
    # Teardown hook: a report file could be generated here.