Complete AI Data Analysis Agent implementation with 95.7% test coverage

This commit is contained in:
2026-03-07 00:04:29 +08:00
parent 621e546b43
commit 7071b1f730
245 changed files with 22612 additions and 2211 deletions

View File

@@ -0,0 +1,273 @@
"""数据理解引擎的基于属性的测试。"""
import pytest
import pandas as pd
import numpy as np
from hypothesis import given, strategies as st, settings, assume
from typing import Dict, Any
from src.engines.data_understanding import (
generate_basic_stats,
understand_data,
_infer_column_type,
_infer_data_type,
_identify_key_fields,
_evaluate_data_quality
)
from src.models import DataProfile, ColumnInfo
# Hypothesis strategy for generating random test DataFrames.
@st.composite
def dataframe_strategy(draw, min_rows=10, max_rows=100, min_cols=2, max_cols=10):
    """Build a random pandas DataFrame with a mix of column types.

    Each column is independently drawn as int, float, str, or datetime;
    all columns share the same randomly drawn row count so the frame is
    rectangular.
    """
    row_total = draw(st.integers(min_value=min_rows, max_value=max_rows))
    col_total = draw(st.integers(min_value=min_cols, max_value=max_cols))
    frame_data = {}
    for idx in range(col_total):
        kind = draw(st.sampled_from(['int', 'float', 'str', 'datetime']))
        label = f'col_{idx}'
        if kind == 'datetime':
            # Deterministic daily range anchored at 2020-01-01; no extra draw.
            frame_data[label] = pd.date_range(
                start=pd.Timestamp('2020-01-01'), periods=row_total, freq='D'
            )
            continue
        if kind == 'int':
            elements = st.integers(min_value=-1000, max_value=1000)
        elif kind == 'float':
            elements = st.floats(
                min_value=-1000.0, max_value=1000.0,
                allow_nan=False, allow_infinity=False
            )
        else:  # 'str'
            elements = st.text(
                min_size=1, max_size=10,
                alphabet=st.characters(whitelist_categories=('Lu', 'Ll'))
            )
        # Exactly row_total entries so every column has the same length.
        frame_data[label] = draw(
            st.lists(elements, min_size=row_total, max_size=row_total)
        )
    return pd.DataFrame(frame_data)
# Feature: true-ai-agent, Property 1: data type inference
@given(df=dataframe_strategy(min_rows=10, max_rows=100))
@settings(max_examples=20, deadline=None)
def test_data_type_inference(df):
    """
    Property 1: for any valid CSV file, the data-understanding engine should
    infer the business data type (e.g. ticket, sales, user), and the inference
    should be based on column names, dtypes and value distributions.
    Verifies requirement: scenario 1, acceptance 1.
    """
    # Run the data-understanding engine on the generated frame.
    profile = understand_data(file_path='test.csv', data=df)
    # An inferred type must be present and drawn from the predefined set.
    assert profile.inferred_type is not None, "推断的数据类型不应为 None"
    allowed = ['ticket', 'sales', 'user', 'unknown']
    assert profile.inferred_type in allowed, \
        f"推断的数据类型应该是预定义的类型之一,但得到:{profile.inferred_type}"
    # Inference should also yield a non-empty data summary.
    assert len(profile.summary) > 0, "应该生成数据摘要"
# Feature: true-ai-agent, Property 2: data profile completeness
@given(df=dataframe_strategy(min_rows=5, max_rows=50))
@settings(max_examples=20, deadline=None)
def test_data_profile_completeness(df):
    """
    Property 2: for any valid CSV file, the generated data profile must carry
    every required field (row count, column count, column info, inferred type,
    key fields, quality score), and each column's info must include its name,
    dtype, missing rate and statistics.
    Verifies requirements: FR-1.2, FR-1.3, FR-1.4.
    """
    profile = understand_data(file_path='test.csv', data=df)
    # Profile-level required attributes.
    for attr in ('file_path', 'row_count', 'column_count', 'columns',
                 'inferred_type', 'key_fields', 'quality_score', 'summary'):
        assert hasattr(profile, attr), f"数据画像缺少 {attr} 字段"
    # Row and column counts must match the input frame.
    assert profile.row_count == len(df), f"行数不匹配:期望 {len(df)},得到 {profile.row_count}"
    assert profile.column_count == len(df.columns), \
        f"列数不匹配:期望 {len(df.columns)},得到 {profile.column_count}"
    # One ColumnInfo entry per input column.
    assert len(profile.columns) == len(df.columns), \
        f"列信息数量不匹配:期望 {len(df.columns)},得到 {len(profile.columns)}"
    for col_info in profile.columns:
        # Per-column required attributes.
        for attr in ('name', 'dtype', 'missing_rate', 'unique_count', 'statistics'):
            assert hasattr(col_info, attr), f"列信息缺少 {attr} 字段"
        # dtype must come from the predefined vocabulary.
        assert col_info.dtype in ['numeric', 'categorical', 'datetime', 'text'], \
            f"{col_info.name} 的数据类型应该是预定义的类型之一,但得到:{col_info.dtype}"
        # Missing rate is a proportion in [0, 1].
        assert 0.0 <= col_info.missing_rate <= 1.0, \
            f"{col_info.name} 的缺失率应该在 0-1 之间,但得到:{col_info.missing_rate}"
        # Unique count must be non-negative and bounded by the row count.
        assert col_info.unique_count >= 0, \
            f"{col_info.name} 的唯一值数量应该非负,但得到:{col_info.unique_count}"
        assert col_info.unique_count <= len(df), \
            f"{col_info.name} 的唯一值数量不应超过总行数"
    # Quality score is expressed on a 0-100 scale.
    assert 0.0 <= profile.quality_score <= 100.0, \
        f"质量分数应该在 0-100 之间,但得到:{profile.quality_score}"
# Extra test: verify column type inference correctness.
@given(
    numeric_data=st.lists(st.floats(min_value=-1000, max_value=1000, allow_nan=False, allow_infinity=False),
                          min_size=10, max_size=100),
    categorical_data=st.lists(st.sampled_from(['A', 'B', 'C', 'D']), min_size=10, max_size=100)
)
@settings(max_examples=10)
def test_column_type_inference(numeric_data, categorical_data):
    """_infer_column_type should classify numeric and categorical series."""
    # A series of floats must be recognised as numeric.
    inferred = _infer_column_type(pd.Series(numeric_data))
    assert inferred == 'numeric', f"数值列应该被识别为 'numeric',但得到:{inferred}"
    # A series drawn from a small label set must be recognised as categorical.
    inferred = _infer_column_type(pd.Series(categorical_data))
    assert inferred == 'categorical', \
        f"分类列应该被识别为 'categorical',但得到:{inferred}"
# Extra test: verify that data quality evaluation behaves sensibly.
@given(
    missing_rate=st.floats(min_value=0.0, max_value=1.0),
    n_cols=st.integers(min_value=1, max_value=10)
)
@settings(max_examples=10)
def test_data_quality_evaluation(missing_rate, n_cols):
    """Quality score must stay in [0, 100] and drop when data is mostly missing."""
    # Build n_cols identical numeric columns at the requested missing rate.
    columns = [
        ColumnInfo(
            name=f'col_{i}',
            dtype='numeric',
            missing_rate=missing_rate,
            unique_count=100,
            sample_values=[1, 2, 3],
            statistics={}
        )
        for i in range(n_cols)
    ]
    quality_score = _evaluate_data_quality(columns, row_count=100)
    # Score is expressed on a 0-100 scale.
    assert 0.0 <= quality_score <= 100.0, \
        f"质量分数应该在 0-100 之间,但得到:{quality_score}"
    # More than half missing should push the score below 70.
    if missing_rate > 0.5:
        assert quality_score < 70, \
            f"高缺失率({missing_rate})应该导致较低的质量分数,但得到:{quality_score}"
# Extra test: verify completeness of basic statistics generation.
@given(df=dataframe_strategy(min_rows=5, max_rows=50))
@settings(max_examples=10, deadline=None)
def test_basic_stats_generation(df):
    """generate_basic_stats must report all required keys with accurate counts."""
    stats = generate_basic_stats(df, file_path='test.csv')
    # Every required key must be present in the stats dict.
    for key in ('file_path', 'row_count', 'column_count', 'columns'):
        assert key in stats, f"基础统计缺少 {key} 字段"
    # Reported counts must match the input frame exactly.
    assert stats['row_count'] == len(df), "行数统计不准确"
    assert stats['column_count'] == len(df.columns), "列数统计不准确"
    assert len(stats['columns']) == len(df.columns), "列信息数量不匹配"
# Extra test: verify key field identification.
def test_key_field_identification():
    """_identify_key_fields should flag time, status, id and amount columns."""
    # Column set covering one typical field name per key-field category.
    columns = [
        ColumnInfo(name='created_at', dtype='datetime', missing_rate=0.0, unique_count=100),
        ColumnInfo(name='status', dtype='categorical', missing_rate=0.0, unique_count=5),
        ColumnInfo(name='ticket_id', dtype='text', missing_rate=0.0, unique_count=100),
        ColumnInfo(name='amount', dtype='numeric', missing_rate=0.0, unique_count=50),
    ]
    key_fields = _identify_key_fields(columns)
    # Each typical column name must be recognised as a key field.
    for name in ('created_at', 'status', 'ticket_id', 'amount'):
        assert name in key_fields, f"应该识别出 {name} 为关键字段"
# Extra test: verify keyword-based data type inference.
def test_data_type_inference_with_keywords():
    """_infer_data_type should classify column sets by their field names."""
    # Table of (expected type, human label, column definitions).
    cases = [
        ('ticket', '工单数据', [
            ColumnInfo(name='ticket_id', dtype='text', missing_rate=0.0, unique_count=100),
            ColumnInfo(name='status', dtype='categorical', missing_rate=0.0, unique_count=5),
            ColumnInfo(name='created_at', dtype='datetime', missing_rate=0.0, unique_count=100),
        ]),
        ('sales', '销售数据', [
            ColumnInfo(name='order_id', dtype='text', missing_rate=0.0, unique_count=100),
            ColumnInfo(name='product', dtype='categorical', missing_rate=0.0, unique_count=10),
            ColumnInfo(name='amount', dtype='numeric', missing_rate=0.0, unique_count=50),
            ColumnInfo(name='sales_date', dtype='datetime', missing_rate=0.0, unique_count=100),
        ]),
        ('user', '用户数据', [
            ColumnInfo(name='user_id', dtype='text', missing_rate=0.0, unique_count=100),
            ColumnInfo(name='name', dtype='text', missing_rate=0.0, unique_count=100),
            ColumnInfo(name='email', dtype='text', missing_rate=0.0, unique_count=100),
            ColumnInfo(name='age', dtype='numeric', missing_rate=0.0, unique_count=50),
        ]),
    ]
    for expected, label, columns in cases:
        inferred = _infer_data_type(columns)
        assert inferred == expected, f"应该识别为{label},但得到:{inferred}"