优化数据预处理
This commit is contained in:
82
data_preprocessing/sorter.py
Normal file
82
data_preprocessing/sorter.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
数据排序模块
|
||||
|
||||
按时间列对 CSV 文件进行排序
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
from typing import Optional
|
||||
from .config import default_config
|
||||
|
||||
|
||||
def sort_by_time(
    input_path: str,
    output_path: Optional[str] = None,
    time_column: Optional[str] = None,
    inplace: bool = False
) -> str:
    """Sort a CSV file by its time column and write the result to disk.

    The time column is parsed with ``pd.to_datetime(..., errors='coerce')``,
    so values that cannot be parsed become NaT; those rows are kept and
    placed at the end of the output. The parsed (datetime) column replaces
    the original column in the written file.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Destination path. Ignored when ``inplace`` is True.
            When None and ``inplace`` is False, the file is written to
            ``default_config.cleaned_data_dir`` as ``<name>_sorted<ext>``.
        time_column: Name of the time column. A falsy value (None or "")
            falls back to ``default_config.default_time_column``.
        inplace: When True, overwrite ``input_path`` with the sorted data.

    Returns:
        Absolute path of the written file.

    Raises:
        FileNotFoundError: ``input_path`` does not exist.
        KeyError: ``time_column`` is not a column of the CSV.
    """
    # Argument handling: fall back to the configured column name.
    time_column = time_column or default_config.default_time_column

    if not os.path.exists(input_path):
        raise FileNotFoundError(f"文件不存在: {input_path}")

    # Resolve the output path; ``inplace`` takes precedence over an
    # explicitly supplied ``output_path``.
    if inplace:
        output_path = input_path
    elif output_path is None:
        # Presumably creates the configured directories (including
        # cleaned_data_dir) -- defined in .config, not visible here.
        default_config.ensure_dirs()
        basename = os.path.basename(input_path)
        name, ext = os.path.splitext(basename)
        output_path = os.path.join(
            default_config.cleaned_data_dir,
            f"{name}_sorted{ext}"
        )

    print(f"[READ] 正在读取: {input_path}")
    df = pd.read_csv(input_path, low_memory=False)
    print(f" 数据行数: {len(df)}")

    # Fail early with the list of available columns if the time column
    # is missing.
    if time_column not in df.columns:
        available_cols = list(df.columns)
        raise KeyError(
            f"未找到时间列 '{time_column}'。可用列: {available_cols}"
        )

    print(f"[PARSE] 正在解析时间列 '{time_column}'...")
    df[time_column] = pd.to_datetime(df[time_column], errors='coerce')

    # Report how many rows have an unparseable (NaT) or missing time value.
    nat_count = df[time_column].isna().sum()
    if nat_count > 0:
        print(f"[WARN] 发现 {nat_count} 行无效时间数据,排序时将排在最后")

    print("[SORT] 正在按时间排序...")
    # Use a stable sort so rows sharing the same timestamp (and the NaT
    # rows at the end) keep their original relative order; the default
    # quicksort gives no such guarantee.
    df_sorted = df.sort_values(
        by=time_column,
        na_position='last',
        kind='mergesort'
    )

    # Make sure the destination directory exists so a caller-supplied
    # output_path in a new directory does not fail after all the work above.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    print(f"[SAVE] 正在保存: {output_path}")
    df_sorted.to_csv(output_path, index=False, encoding=default_config.csv_encoding)

    abs_output = os.path.abspath(output_path)
    print(f"[OK] 排序完成!输出文件: {abs_output}")

    return abs_output
|
||||
Reference in New Issue
Block a user