149 lines
5.5 KiB
Python
149 lines
5.5 KiB
Python
|
|
import os
|
|||
|
|
import csv
|
|||
|
|
import glob
|
|||
|
|
import pandas as pd
|
|||
|
|
import json
|
|||
|
|
|
|||
|
|
def _read_results_csv(path):
    """Read one result CSV, tolerating both UTF-8(-BOM) and GBK encodings.

    Newer exports are written as UTF-8 with BOM; older ones came from a
    Chinese-Windows environment and are GBK-encoded, so fall back on a
    decode failure only (a genuinely malformed file still raises).
    """
    try:
        return pd.read_csv(path, encoding='utf-8-sig')
    except (UnicodeDecodeError, UnicodeError):
        return pd.read_csv(path, encoding='gbk')


def _parse_json_result(result):
    """Return *result* parsed as a JSON object, or None when it is not one.

    Non-string cells (pandas yields float NaN for blanks) and invalid JSON
    are treated as "not JSON" rather than errors.
    """
    if isinstance(result, str) and result.strip().startswith('{'):
        try:
            parsed = json.loads(result)
        except json.JSONDecodeError:
            return None
        if isinstance(parsed, dict):
            return parsed
    return None


def _row_to_entry(row, seen_vins):
    """Convert one CSV row into a merged-output dict, or None to skip it.

    Rows with an empty, NaN, or already-seen VIN are skipped (first
    occurrence wins).  When the result column holds a JSON object, its
    ``intervalList``/``daysList`` pair is flattened into per-interval
    columns.
    """
    raw_vin = row.get('VIN', '')
    # pandas yields NaN (a float) for blank cells; previously this became
    # the literal string 'NAN' and produced a bogus merged row.
    if pd.isna(raw_vin):
        return None
    vin = str(raw_vin).strip().upper()
    if not vin or vin in seen_vins:
        return None

    # Fall back to the legacy column name used by earlier export versions.
    result = row.get('结果', row.get('单车在线时长', ''))

    entry = {
        'VIN': vin,
        '查询时间': row.get('查询时间', ''),
        '状态': row.get('状态', ''),
        '原始结果': result,
    }

    parsed = _parse_json_result(result)
    if parsed is not None:
        intervals = parsed.get('intervalList', [])
        days = parsed.get('daysList', [])
        # Only trust the pairing when the two lists line up one-to-one.
        if len(intervals) == len(days):
            for interval, day_count in zip(intervals, days):
                entry[f'时长区间_{interval}'] = day_count
    return entry


def _minimum_online_hours(row, weight_map):
    """Sum day-counts x interval lower bounds for one row.

    Uses each interval's lower bound so the result is an "at least this
    many hours online" estimate; non-numeric or missing cells contribute 0.
    """
    total = 0.0
    for col, weight in weight_map.items():
        if col not in row:
            continue
        try:
            days = float(row[col])
        except (TypeError, ValueError):
            continue
        if pd.notnull(days):
            total += days * weight
    return total


def merge_csv_files(output_file='merged_results.csv'):
    """Merge all temp_results_*.csv (plus query_results.csv, if present)
    from the current directory into one deduplicated CSV.

    Rows are deduplicated by VIN (first occurrence wins).  JSON result
    payloads are flattened into ``时长区间_*`` columns and a lower-bound
    estimate of total online hours is derived from them.

    Parameters
    ----------
    output_file : str
        Path the merged CSV is written to (UTF-8 with BOM, Excel-friendly).
    """
    print("🚀 开始合并 CSV 文件...")

    # Collect every candidate result file from the working directory.
    csv_files = glob.glob('temp_results_*.csv')
    if os.path.exists('query_results.csv'):
        csv_files.append('query_results.csv')

    if not csv_files:
        print("⚠️ 未找到任何结果文件")
        return

    print(f"📋 找到 {len(csv_files)} 个文件: {csv_files}")

    all_data = []
    seen_vins = set()

    for file in csv_files:
        try:
            df = _read_results_csv(file)
            for _, row in df.iterrows():
                entry = _row_to_entry(row, seen_vins)
                if entry is not None:
                    all_data.append(entry)
                    seen_vins.add(entry['VIN'])
        except Exception as e:
            # Best-effort merge: one broken file must not abort the rest.
            print(f"❌ 处理文件 {file} 出错: {e}")

    if not all_data:
        print("⚠️ 没有有效数据")
        return

    result_df = pd.DataFrame(all_data)

    # Interval -> lower-bound weight (hours).  '≤0.5' counts as 0 so the
    # total is a guaranteed "at least" figure, per the stated requirement.
    weight_map = {
        '时长区间_≤0.5': 0,
        '时长区间_(0.5,1]': 0.5,
        '时长区间_(1,2]': 1,
        '时长区间_(2,3]': 2,
        '时长区间_(3,4]': 3,
        '时长区间_(4,6]': 4,
        '时长区间_>6': 6,
    }

    # Only compute the estimate when at least one interval column exists.
    if any(col in result_df.columns for col in weight_map):
        result_df['估算至少在线总时长(小时)'] = result_df.apply(
            lambda row: _minimum_online_hours(row, weight_map), axis=1)
        print("✅ 已计算‘至少在线总时长’")

    # Column order: key identifiers first, then the dynamically generated
    # interval columns, then everything else.
    first_cols = ['VIN', '估算至少在线总时长(小时)', '状态', '查询时间']
    interval_cols = [c for c in result_df.columns if c.startswith('时长区间_')]
    other_cols = [c for c in result_df.columns
                  if c not in first_cols and c not in interval_cols]
    result_df = result_df.reindex(columns=first_cols + interval_cols + other_cols)

    result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"\n✅ 合并完成!结果已保存至: {output_file}")
    print(f"📊 总计合并数据: {len(result_df)} 条")
if __name__ == '__main__':
    # NOTE: pandas is imported unconditionally at module top ("import
    # pandas as pd"), so a missing install aborts before this guard is
    # ever reached — the former `try: import pandas ... except
    # ImportError:` fallback here was unreachable dead code and has been
    # removed.
    merge_csv_files()