优化数据预处理

2026-02-02 09:44:07 +08:00
parent c8fe5e6d6f
commit b033eb61cc
12 changed files with 516 additions and 39 deletions
--- a/data_preprocessing/sorter.py
+++ b/data_preprocessing/sorter.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+"""
+数据排序模块
+
+按时间列对 CSV 文件进行排序
+"""
+
+import os
+import pandas as pd
+from typing import Optional
+from .config import default_config
+
+
+def sort_by_time(
+    input_path: str,
+    output_path: Optional[str] = None,
+    time_column: str = None,
+    inplace: bool = False
+) -> str:
+    """
+    按时间列对 CSV 文件排序
+    
+    Args:
+        input_path: 输入 CSV 文件路径
+        output_path: 输出路径。如果为 None 且 inplace=False，则输出到 cleaned_data 目录
+        time_column: 时间列名，默认使用配置中的 default_time_column
+        inplace: 是否原地覆盖输入文件
+        
+    Returns:
+        输出文件的绝对路径
+        
+    Raises:
+        FileNotFoundError: 输入文件不存在
+        KeyError: 时间列不存在
+    """
+    # 参数处理
+    time_column = time_column or default_config.default_time_column
+    
+    if not os.path.exists(input_path):
+        raise FileNotFoundError(f"文件不存在: {input_path}")
+    
+    # 确定输出路径
+    if inplace:
+        output_path = input_path
+    elif output_path is None:
+        default_config.ensure_dirs()
+        basename = os.path.basename(input_path)
+        name, ext = os.path.splitext(basename)
+        output_path = os.path.join(
+            default_config.cleaned_data_dir, 
+            f"{name}_sorted{ext}"
+        )
+    
+    print(f"[READ] 正在读取: {input_path}")
+    df = pd.read_csv(input_path, low_memory=False)
+    print(f"    数据行数: {len(df)}")
+    
+    # 检查时间列是否存在
+    if time_column not in df.columns:
+        available_cols = list(df.columns)
+        raise KeyError(
+            f"未找到时间列 '{time_column}'。可用列: {available_cols}"
+        )
+    
+    print(f"[PARSE] 正在解析时间列 '{time_column}'...")
+    df[time_column] = pd.to_datetime(df[time_column], errors='coerce')
+    
+    # 统计无效时间
+    nat_count = df[time_column].isna().sum()
+    if nat_count > 0:
+        print(f"[WARN] 发现 {nat_count} 行无效时间数据，排序时将排在最后")
+    
+    print("[SORT] 正在按时间排序...")
+    df_sorted = df.sort_values(by=time_column, na_position='last')
+    
+    print(f"[SAVE] 正在保存: {output_path}")
+    df_sorted.to_csv(output_path, index=False, encoding=default_config.csv_encoding)
+    
+    abs_output = os.path.abspath(output_path)
+    print(f"[OK] 排序完成！输出文件: {abs_output}")
+    
+    return abs_output