Files
tsp_v_onlinetime_pa/merge_results.py

149 lines
5.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import csv
import glob
import pandas as pd
import json
def merge_csv_files(output_file='merged_results.csv'):
"""合并当前目录下的所有 temp_results_*.csv 和 query_results.csv 文件"""
print("🚀 开始合并 CSV 文件...")
# 查找所有相关 CSV 文件
csv_files = glob.glob('temp_results_*.csv')
if os.path.exists('query_results.csv'):
csv_files.append('query_results.csv')
if not csv_files:
print("⚠️ 未找到任何结果文件")
return
print(f"📋 找到 {len(csv_files)} 个文件: {csv_files}")
all_data = []
seen_vins = set()
for file in csv_files:
try:
# 尝试使用 pandas 读取,处理编码问题
try:
df = pd.read_csv(file, encoding='utf-8-sig')
except:
df = pd.read_csv(file, encoding='gbk')
# 标准化列名(处理之前版本可能存在的列名差异)
# 假设现在的列是 ['VIN', '结果', '查询时间', '状态']
# 旧的可能是 ['VIN', '单车在线时长', ...]
for index, row in df.iterrows():
vin = str(row.get('VIN', '')).strip().upper()
if not vin or vin in seen_vins:
continue
# 获取结果数据
result = row.get('结果', row.get('单车在线时长', ''))
# 尝试解析 JSON 结果
parsed_data = {}
is_json = False
try:
if result and result.strip().startswith('{'):
parsed_data = json.loads(result)
is_json = True
except:
pass
# 基础信息
entry = {
'VIN': vin,
'查询时间': row.get('查询时间', ''),
'状态': row.get('状态', ''),
'原始结果': result
}
# 如果是 JSON 数据,提取详细字段
if is_json:
# 提取 intervalList 和 daysList
intervals = parsed_data.get('intervalList', [])
days = parsed_data.get('daysList', [])
if len(intervals) == len(days):
for i, interval in enumerate(intervals):
entry[f'时长区间_{interval}'] = days[i]
# 也可以提取 startTime 和 endTime
# entry['startTime'] = parsed_data.get('startTime', '')
# entry['endTime'] = parsed_data.get('endTime', '')
all_data.append(entry)
seen_vins.add(vin)
except Exception as e:
print(f"❌ 处理文件 {file} 出错: {e}")
if not all_data:
print("⚠️ 没有有效数据")
return
# 转换为 DataFrame
result_df = pd.DataFrame(all_data)
# 计算估算在线总时长
# 映射规则:取区间下限作为估算值('≤0.5'按0.5算,'>6'按6算区间按平均值或下限算
# 用户需求是“至少在线时长”,所以我们统一使用区间下限
# 区间列表: ['≤0.5', '(0.5,1]', '(1,2]', '(2,3]', '(3,4]', '(4,6]', '>6']
# 对应权重: [0, 0.5, 1, 2, 3, 4, 6] (≤0.5 算作 0确保是“至少”)
weight_map = {
'时长区间_≤0.5': 0,
'时长区间_(0.5,1]': 0.5,
'时长区间_(1,2]': 1,
'时长区间_(2,3]': 2,
'时长区间_(3,4]': 3,
'时长区间_(4,6]': 4,
'时长区间_>6': 6
}
def calculate_min_duration(row):
total_hours = 0
for col, weight in weight_map.items():
if col in row:
try:
days = float(row[col])
if pd.notnull(days):
total_hours += days * weight
except:
pass
return total_hours
# 只有当包含相关列时才计算
if any(col in result_df.columns for col in weight_map.keys()):
result_df['估算至少在线总时长(小时)'] = result_df.apply(calculate_min_duration, axis=1)
print("✅ 已计算‘至少在线总时长’")
# 调整列顺序
# 优先显示的列
first_cols = ['VIN', '估算至少在线总时长(小时)', '状态', '查询时间']
# 动态生成的区间列
interval_cols = [c for c in result_df.columns if c.startswith('时长区间_')]
# 其他列
other_cols = [c for c in result_df.columns if c not in first_cols and c not in interval_cols]
final_cols = first_cols + interval_cols + other_cols
result_df = result_df.reindex(columns=final_cols)
# 导出
result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"\n✅ 合并完成!结果已保存至: {output_file}")
print(f"📊 总计合并数据: {len(result_df)}")
if __name__ == '__main__':
# 检查 pandas 是否安装
try:
import pandas
merge_csv_files()
except ImportError:
print("❌ 需要安装 pandas 库: pip install pandas")
# 降级处理使用标准库简单的合并不解析JSON
print("⚠️ 降级模式:仅简单合并文件")
# ... (简易合并逻辑可以省略,通常用户环境已有 pandas)