#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
File-encoding checker.

Walks the project tree and verifies that every text file with a known
extension is UTF-8 encoded, and that every Python file carries a PEP 263
encoding declaration.
"""

import os
import re
import sys
from pathlib import Path
from typing import Optional

try:
    import chardet
except ImportError:
    # Keep the module importable: the __main__ guard prints install
    # instructions instead of the script dying with a raw traceback here.
    chardet = None

UTF8_BOM = b'\xef\xbb\xbf'

# File extensions that are inspected.
CHECK_EXTENSIONS = frozenset({
    '.py', '.json', '.md', '.txt', '.html', '.css', '.js', '.sql', '.bat',
    '.sh',
})

# Directory names that are never descended into.
EXCLUDE_DIRS = frozenset({
    '.git', '.venv', '__pycache__', 'node_modules', '.idea', 'logs', 'data',
    'dist', 'build',
})

# PEP 263 encoding-declaration pattern, applied to raw bytes so that the
# header can be found even when the file itself is not valid UTF-8.
_CODING_DECLARATION_RE = re.compile(
    rb'^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)'
)


def _guess_encoding(raw_data: bytes) -> tuple:
    """Return chardet's ``(encoding, confidence)`` guess for *raw_data*.

    Falls back to ``('unknown', 0.0)`` when chardet is not installed or
    cannot decide (chardet reports ``None`` for the encoding in that case).
    """
    if chardet is None:
        return 'unknown', 0.0
    guess = chardet.detect(raw_data)
    return guess.get('encoding') or 'unknown', guess.get('confidence') or 0.0


def check_file_encoding(file_path: Path) -> dict:
    """Inspect the encoding of a single file.

    The UTF-8 verdict comes from a strict decode of the file's bytes rather
    than from a statistical guess; chardet is only consulted to name the
    likely encoding of files that are *not* valid UTF-8.

    Args:
        file_path: File to inspect.

    Returns:
        On success, a dict with the keys ``file``, ``encoding``,
        ``confidence``, ``has_bom``, ``is_utf8`` and ``size``.
        If the file cannot be read, a dict with ``file`` and ``error``.
    """
    try:
        raw_data = Path(file_path).read_bytes()
    except OSError as e:
        return {'file': str(file_path), 'error': str(e)}

    has_bom = raw_data.startswith(UTF8_BOM)

    try:
        raw_data.decode('utf-8')
    except UnicodeDecodeError:
        is_utf8 = False
        encoding, confidence = _guess_encoding(raw_data)
    else:
        is_utf8 = True
        confidence = 1.0
        if has_bom:
            encoding = 'utf-8-sig'
        elif raw_data and raw_data.isascii():
            encoding = 'ascii'
        else:
            # Empty files are trivially valid UTF-8.
            encoding = 'utf-8'

    return {
        'file': str(file_path),
        'encoding': encoding,
        'confidence': confidence,
        'has_bom': has_bom,
        'is_utf8': is_utf8,
        'size': len(raw_data),
    }


def check_python_file_header(file_path: Path) -> bool:
    """Return True if the Python file has a PEP 263 encoding declaration.

    Only the first two lines are examined, because Python ignores a
    declaration placed anywhere else. The lines are read as bytes so the
    result does not depend on the file being valid UTF-8.

    Args:
        file_path: Python source file to inspect.

    Returns:
        True when a ``# ... coding[:=] <name>`` comment is found on line 1
        or 2; False otherwise, including when the file cannot be read.
    """
    try:
        with open(file_path, 'rb') as f:
            first_lines = [f.readline() for _ in range(2)]
    except OSError:
        return False

    # A UTF-8 BOM would otherwise hide a declaration on the first line.
    if first_lines[0].startswith(UTF8_BOM):
        first_lines[0] = first_lines[0][len(UTF8_BOM):]

    return any(_CODING_DECLARATION_RE.match(line) for line in first_lines)


def _iter_target_files(project_root: Path):
    """Yield every file under *project_root* that should be checked."""
    for root, dirs, files in os.walk(project_root):
        # Prune excluded directories in place so os.walk skips them;
        # sorting keeps the report order stable between runs.
        dirs[:] = sorted(d for d in dirs if d not in EXCLUDE_DIRS)
        for name in sorted(files):
            file_path = Path(root) / name
            if file_path.suffix.lower() in CHECK_EXTENSIONS:
                yield file_path


def _print_banner(title: str) -> None:
    """Print *title* framed by two separator rules."""
    print("=" * 80)
    print(title)
    print("=" * 80)


def main(project_root: Optional[Path] = None) -> bool:
    """Check every eligible file under the project root and print a report.

    Args:
        project_root: Directory to scan. Defaults to the directory that
            contains this script.

    Returns:
        True when every file was readable, every file is UTF-8, and every
        Python file has an encoding declaration; False otherwise.
    """
    if project_root is None:
        project_root = Path(__file__).parent

    results = []
    python_files_without_encoding = []

    _print_banner("文件编码检查工具")
    print()

    for file_path in _iter_target_files(Path(project_root)):
        results.append(check_file_encoding(file_path))

        # Same case-insensitive suffix test as the extension filter.
        if (file_path.suffix.lower() == '.py'
                and not check_python_file_header(file_path)):
            python_files_without_encoding.append(file_path)

    # Unreadable files are tracked separately so they are not miscounted
    # as "non UTF-8" in the summary.
    errors = [r for r in results if 'error' in r]
    non_utf8 = [r for r in results if 'error' not in r and not r['is_utf8']]
    total_files = len(results)
    utf8_files = total_files - len(errors) - len(non_utf8)

    print(f"总计检查文件: {total_files}")
    print(f"UTF-8 编码文件: {utf8_files}")
    print(f"非 UTF-8 编码文件: {len(non_utf8)}")
    if errors:
        print(f"检查出错文件: {len(errors)}")
    print()

    # Files that are not UTF-8.
    if non_utf8:
        _print_banner("⚠️ 非 UTF-8 编码文件:")
        for result in non_utf8:
            print(f" {result['file']}")
            print(f" 编码: {result['encoding']} (置信度: {result['confidence']:.2%})")
            if result.get('has_bom'):
                print(" ⚠️ 包含 BOM")
        print()

    # Python files without an encoding declaration.
    if python_files_without_encoding:
        _print_banner("⚠️ Python 文件缺少编码声明:")
        for file_path in python_files_without_encoding:
            print(f" {file_path}")
        print()
        print("建议在这些文件开头添加: # -*- coding: utf-8 -*-")
        print()

    # Files that could not be read.
    if errors:
        _print_banner("❌ 检查出错的文件:")
        for result in errors:
            print(f" {result['file']}: {result['error']}")
        print()

    # Summary. Read errors still fail the run, as they did before.
    success = (
        not non_utf8
        and not errors
        and not python_files_without_encoding
    )
    print("=" * 80)
    if success:
        print("✅ 所有文件编码检查通过!")
    else:
        print("⚠️ 发现编码问题,请根据上述信息修复")
    print("=" * 80)

    return success


if __name__ == "__main__":
    if chardet is None:
        print("错误: 需要安装 chardet 库")
        print("运行: pip install chardet")
        sys.exit(1)

    sys.exit(0 if main() else 1)