# Files | 2026-02-02 09:44:07 +08:00 | 141 lines | 4.0 KiB | Python
# -*- coding: utf-8 -*-
"""
Command-line interface for data preprocessing.

Usage examples:
    # Merge Excel files
    python -m data_preprocessing.cli merge --source raw_data/remotecontrol --output cleaned_data/merged.csv
    # Merge and sort
    python -m data_preprocessing.cli merge --source raw_data/remotecontrol --sort-by SendTime
    # Sort an existing CSV
    python -m data_preprocessing.cli sort --input data.csv --output sorted.csv --time-col SendTime
    # Initialize the directory structure
    python -m data_preprocessing.cli init
"""
import argparse
import sys
from .config import default_config
from .sorter import sort_by_time
from .merger import merge_files
def _build_parser():
    """Build the argument parser with the ``merge``, ``sort`` and ``init`` sub-commands."""
    parser = argparse.ArgumentParser(
        prog="data_preprocessing",
        description="数据预处理工具:排序、合并",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Written as explicit lines so the emitted help text does not depend
        # on how this source file is indented.
        epilog=(
            "\n"
            "示例:\n"
            "%(prog)s merge --source raw_data/remotecontrol --sort-by SendTime\n"
            "%(prog)s sort --input data.csv --time-col CreateTime\n"
            "%(prog)s init\n"
        ),
    )
    subparsers = parser.add_subparsers(dest="command", help="可用命令")

    # ---------- merge: combine files of the same kind ----------
    merge_parser = subparsers.add_parser("merge", help="合并同类文件")
    merge_parser.add_argument(
        "--source", "-s",
        required=True,
        help="源数据目录路径",
    )
    merge_parser.add_argument(
        "--output", "-o",
        default=None,
        help="输出文件路径 (默认: cleaned_data/<目录名>_merged.csv)",
    )
    merge_parser.add_argument(
        "--pattern", "-p",
        default="*.xlsx",
        help="文件匹配模式 (默认: *.xlsx)",
    )
    merge_parser.add_argument(
        "--sort-by",
        default=None,
        dest="time_column",
        help="合并后按此时间列排序",
    )
    merge_parser.add_argument(
        "--no-source-col",
        action="store_true",
        help="不添加来源文件列",
    )

    # ---------- sort: order a CSV by a time column ----------
    sort_parser = subparsers.add_parser("sort", help="按时间排序 CSV")
    sort_parser.add_argument(
        "--input", "-i",
        required=True,
        help="输入 CSV 文件路径",
    )
    sort_parser.add_argument(
        "--output", "-o",
        default=None,
        help="输出文件路径 (默认: cleaned_data/<文件名>_sorted.csv)",
    )
    sort_parser.add_argument(
        "--time-col", "-t",
        default=None,
        dest="time_column",
        help=f"时间列名 (默认: {default_config.default_time_column})",
    )
    sort_parser.add_argument(
        "--inplace",
        action="store_true",
        help="原地覆盖输入文件",
    )

    # ---------- init: create the directory layout ----------
    # This sub-command takes no options, so the returned parser is not kept.
    subparsers.add_parser("init", help="初始化目录结构")

    return parser


def main(argv=None) -> None:
    """Parse command-line arguments and run the selected sub-command.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the original
            behaviour of reading the process arguments.

    Exits:
        Status 0 after printing the help text when no sub-command is given.
        Status 1 when the sub-command raises any ``Exception``; the error
        message is written to stderr.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        sys.exit(0)

    try:
        if args.command == "merge":
            result = merge_files(
                source_dir=args.source,
                output_file=args.output,
                pattern=args.pattern,
                time_column=args.time_column,
                add_source_column=not args.no_source_col,
            )
            print(f"\n✅ 合并成功: {result}")
        elif args.command == "sort":
            result = sort_by_time(
                input_path=args.input,
                output_path=args.output,
                time_column=args.time_column,
                inplace=args.inplace,
            )
            print(f"\n✅ 排序成功: {result}")
        elif args.command == "init":
            default_config.ensure_dirs()
            print("\n✅ 目录初始化完成")
    except (FileNotFoundError, KeyError) as exc:
        # str(KeyError) is the repr of its argument and adds stray quotes,
        # so show the raw first argument for KeyError instead.
        if isinstance(exc, KeyError) and exc.args:
            detail = exc.args[0]
        else:
            detail = exc
        # Diagnostics go to stderr so stdout carries only success output.
        print(f"\n❌ 错误: {detail}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        # Top-level boundary: report anything unexpected and exit non-zero.
        print(f"\n❌ 未知错误: {exc}", file=sys.stderr)
        sys.exit(1)
# Run the CLI only when this module is executed as a script
# (e.g. via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()