141 lines
4.0 KiB
Python
141 lines
4.0 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
||
|
|
"""
Command-line interface for data preprocessing.

Usage examples:
    # Merge Excel files
    python -m data_preprocessing.cli merge --source raw_data/remotecontrol --output cleaned_data/merged.csv

    # Merge and sort
    python -m data_preprocessing.cli merge --source raw_data/remotecontrol --sort-by SendTime

    # Sort an existing CSV
    python -m data_preprocessing.cli sort --input data.csv --output sorted.csv --time-col SendTime

    # Initialize the directory structure
    python -m data_preprocessing.cli init
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import sys
|
||
|
|
from .config import default_config
|
||
|
|
from .sorter import sort_by_time
|
||
|
|
from .merger import merge_files
|
||
|
|
|
||
|
|
|
||
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser with the ``merge``, ``sort`` and ``init`` sub-commands.

    Returns:
        The configured top-level parser. The selected sub-command name is
        stored in ``args.command``; it is ``None`` when no sub-command is given.
    """
    parser = argparse.ArgumentParser(
        prog="data_preprocessing",
        description="数据预处理工具:排序、合并",
        # The raw formatter keeps the line breaks of the epilog examples intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "示例:\n"
            "%(prog)s merge --source raw_data/remotecontrol --sort-by SendTime\n"
            "%(prog)s sort --input data.csv --time-col CreateTime\n"
            "%(prog)s init\n"
        ),
    )

    subparsers = parser.add_subparsers(dest="command", help="可用命令")

    # ---------- merge command ----------
    merge_parser = subparsers.add_parser("merge", help="合并同类文件")
    merge_parser.add_argument(
        "--source", "-s",
        required=True,
        help="源数据目录路径",
    )
    merge_parser.add_argument(
        "--output", "-o",
        default=None,
        help="输出文件路径 (默认: cleaned_data/<目录名>_merged.csv)",
    )
    merge_parser.add_argument(
        "--pattern", "-p",
        default="*.xlsx",
        help="文件匹配模式 (默认: *.xlsx)",
    )
    merge_parser.add_argument(
        "--sort-by",
        default=None,
        # Stored under the same attribute name the ``sort`` command uses.
        dest="time_column",
        help="合并后按此时间列排序",
    )
    merge_parser.add_argument(
        "--no-source-col",
        action="store_true",
        help="不添加来源文件列",
    )

    # ---------- sort command ----------
    sort_parser = subparsers.add_parser("sort", help="按时间排序 CSV")
    sort_parser.add_argument(
        "--input", "-i",
        required=True,
        help="输入 CSV 文件路径",
    )
    sort_parser.add_argument(
        "--output", "-o",
        default=None,
        help="输出文件路径 (默认: cleaned_data/<文件名>_sorted.csv)",
    )
    sort_parser.add_argument(
        "--time-col", "-t",
        default=None,
        dest="time_column",
        help=f"时间列名 (默认: {default_config.default_time_column})",
    )
    sort_parser.add_argument(
        "--inplace",
        action="store_true",
        help="原地覆盖输入文件",
    )

    # ---------- init command ----------
    # Takes no options, so the returned sub-parser does not need to be kept.
    subparsers.add_parser("init", help="初始化目录结构")

    return parser


def main(argv=None) -> None:
    """Parse command-line arguments and run the selected sub-command.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, so existing ``main()`` callers are unaffected.

    Raises:
        SystemExit: With status 0 after printing help when no sub-command is
            given, and with status 1 when the sub-command raises. argparse
            itself also exits on invalid arguments or ``--help``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        sys.exit(0)

    try:
        if args.command == "merge":
            result = merge_files(
                source_dir=args.source,
                output_file=args.output,
                pattern=args.pattern,
                time_column=args.time_column,
                # The CLI flag is negative; the callee takes the positive form.
                add_source_column=not args.no_source_col,
            )
            print(f"\n✅ 合并成功: {result}")

        elif args.command == "sort":
            result = sort_by_time(
                input_path=args.input,
                output_path=args.output,
                time_column=args.time_column,
                inplace=args.inplace,
            )
            print(f"\n✅ 排序成功: {result}")

        elif args.command == "init":
            default_config.ensure_dirs()
            print("\n✅ 目录初始化完成")

    except (FileNotFoundError, KeyError) as e:
        # These two error types share one message. Diagnostics go to stderr
        # so they do not mix with regular output on stdout.
        print(f"\n❌ 错误: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report any other failure and exit non-zero
        # instead of showing a traceback.
        print(f"\n❌ 未知错误: {e}", file=sys.stderr)
        sys.exit(1)
|
||
|
|
|
||
|
|
|
||
|
|
# Run the CLI only when this module is executed as a script (for example via
# ``python -m data_preprocessing.cli``), not when it is imported.
if __name__ == "__main__":
    main()
|