Files

84 lines
3.2 KiB
Python
Raw Permalink Normal View History

2026-01-09 16:52:45 +08:00
import pandas as pd
import glob
import os
def merge_excel_files(source_dir="remotecontrol", output_file="merged_all_files.csv"):
"""
将指定目录下的所有 Excel 文件 (.xlsx, .xls) 合并为一个 CSV 文件
"""
2026-01-31 18:00:05 +08:00
print(f"[SEARCH] 正在扫描目录: {source_dir} ...")
2026-01-09 16:52:45 +08:00
# 支持 xlsx 和 xls
files_xlsx = glob.glob(os.path.join(source_dir, "*.xlsx"))
files_xls = glob.glob(os.path.join(source_dir, "*.xls"))
files = files_xlsx + files_xls
if not files:
2026-01-31 18:00:05 +08:00
print("[WARN] 未找到 Excel 文件。")
2026-01-09 16:52:45 +08:00
return
# 按文件名中的数字进行排序 (例如: 1.xlsx, 2.xlsx, ..., 10.xlsx)
try:
files.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
2026-01-31 18:00:05 +08:00
print("[NUM] 已按文件名数字顺序排序")
2026-01-09 16:52:45 +08:00
except ValueError:
# 如果文件名不是纯数字,退回到字母排序
files.sort()
2026-01-31 18:00:05 +08:00
print("[TEXT] 已按文件名包含非数字字符,使用字母顺序排序")
2026-01-09 16:52:45 +08:00
2026-01-31 18:00:05 +08:00
print(f"[DIR] 找到 {len(files)} 个文件: {files}")
2026-01-09 16:52:45 +08:00
all_dfs = []
for file in files:
try:
2026-01-31 18:00:05 +08:00
print(f"[READ] 读取: {file}")
2026-01-09 16:52:45 +08:00
# 使用 ExcelFile 读取所有 sheet
xls = pd.ExcelFile(file)
2026-01-31 18:00:05 +08:00
print(f" [PAGES] 包含 Sheets: {xls.sheet_names}")
2026-01-09 16:52:45 +08:00
file_dfs = []
for sheet_name in xls.sheet_names:
df = pd.read_excel(xls, sheet_name=sheet_name)
if not df.empty:
2026-01-31 18:00:05 +08:00
print(f" [OK] Sheet '{sheet_name}' 读取成功: {len(df)}")
2026-01-09 16:52:45 +08:00
file_dfs.append(df)
else:
2026-01-31 18:00:05 +08:00
print(f" [WARN] Sheet '{sheet_name}' 为空,跳过")
2026-01-09 16:52:45 +08:00
if file_dfs:
# 合并该文件的所有非空 sheet
file_merged_df = pd.concat(file_dfs, ignore_index=True)
# 可选:添加一列标记来源文件
file_merged_df['Source_File'] = os.path.basename(file)
all_dfs.append(file_merged_df)
else:
2026-01-31 18:00:05 +08:00
print(f"[WARN] 文件 {file} 所有 Sheet 均为空")
2026-01-09 16:52:45 +08:00
except Exception as e:
2026-01-31 18:00:05 +08:00
print(f"[ERROR] 读取 {file} 失败: {e}")
2026-01-09 16:52:45 +08:00
if all_dfs:
2026-01-31 18:00:05 +08:00
print("[LOOP] 正在合并数据...")
2026-01-09 16:52:45 +08:00
merged_df = pd.concat(all_dfs, ignore_index=True)
# 按 SendTime 排序
if 'SendTime' in merged_df.columns:
2026-01-31 18:00:05 +08:00
print("[TIMER] 正在按 SendTime 排序...")
2026-01-09 16:52:45 +08:00
merged_df['SendTime'] = pd.to_datetime(merged_df['SendTime'], errors='coerce')
merged_df = merged_df.sort_values(by='SendTime')
else:
2026-01-31 18:00:05 +08:00
print("[WARN] 未找到 SendTime 列,跳过排序")
2026-01-09 16:52:45 +08:00
2026-01-31 18:00:05 +08:00
print(f"[CACHE] 保存到: {output_file}")
2026-01-09 16:52:45 +08:00
merged_df.to_csv(output_file, index=False, encoding="utf-8-sig")
2026-01-31 18:00:05 +08:00
print(f"[OK] 合并及排序完成!总行数: {len(merged_df)}")
2026-01-09 16:52:45 +08:00
print(f" 输出文件: {os.path.abspath(output_file)}")
else:
2026-01-31 18:00:05 +08:00
print("[WARN] 没有成功读取到任何数据。")
2026-01-09 16:52:45 +08:00
if __name__ == "__main__":
# 如果需要在当前目录运行并合并 remotecontrol 文件夹下的内容
merge_excel_files(source_dir="remotecontrol", output_file="remotecontrol_merged.csv")