Files
assist/check_encoding.py

158 lines
4.9 KiB
Python
Raw Normal View History

2025-11-05 10:43:36 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
文件编码检查工具
检查项目中所有文件是否使用UTF-8编码
"""
import os
import sys
import chardet
from pathlib import Path
def check_file_encoding(file_path: Path) -> dict:
    """Inspect a file's bytes and report whether it is UTF-8 compatible.

    The file is read in binary mode. Content that decodes strictly as UTF-8
    (which includes pure ASCII and empty files) is accepted as-is; chardet's
    heuristic detection is only consulted when strict decoding fails, so a
    wrong or missing guess can no longer flag a valid UTF-8 file.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        On success, a dict with the keys ``file`` (str), ``encoding`` (str),
        ``confidence`` (number), ``has_bom`` (bool), ``is_utf8`` (bool) and
        ``size`` (byte count). If anything goes wrong, a dict with only
        ``file`` and ``error`` (the exception message). Never raises.
    """
    try:
        with open(file_path, 'rb') as f:
            raw_data = f.read()

        # UTF-8 byte order mark at the very start of the file.
        has_bom = raw_data.startswith(b'\xef\xbb\xbf')

        try:
            raw_data.decode('utf-8')
        except UnicodeDecodeError:
            decodes_as_utf8 = False
        else:
            decodes_as_utf8 = True

        if decodes_as_utf8:
            # Strict decoding is authoritative, so no heuristic is needed.
            if has_bom:
                encoding = 'utf-8-sig'
            elif max(raw_data, default=0) < 0x80:
                # Only 7-bit bytes (or no bytes at all).
                encoding = 'ascii'
            else:
                encoding = 'utf-8'
            confidence = 1.0
        else:
            result = chardet.detect(raw_data)
            # The detector may store None under 'encoding' when it has no
            # guess; dict.get's default does not cover that case, so fall
            # back explicitly instead of crashing on None.lower() below.
            encoding = result.get('encoding') or 'unknown'
            confidence = result.get('confidence') or 0
            if has_bom:
                encoding = 'utf-8-sig'

        return {
            'file': str(file_path),
            'encoding': encoding,
            'confidence': confidence,
            'has_bom': has_bom,
            'is_utf8': decodes_as_utf8
            or encoding.lower() in ('utf-8', 'utf-8-sig', 'ascii'),
            'size': len(raw_data),
        }
    except Exception as e:
        # Boundary handler: callers expect an error record, not an exception.
        return {
            'file': str(file_path),
            'error': str(e),
        }
def check_python_file_header(file_path: Path) -> bool:
    """Return True if a Python file carries a PEP 263 encoding declaration.

    Follows the rule the interpreter itself applies: the declaration must be
    a comment matching ``coding[:=]<name>`` on the first line, or on the
    second line when the first line is blank or comment-only (for example a
    shebang). The file is read as bytes so that files which are not valid
    UTF-8 can still be inspected, and a leading UTF-8 BOM is ignored.

    Args:
        file_path: Path of the Python source file to inspect.

    Returns:
        True when a declaration is found; False when there is none or the
        file cannot be read.
    """
    import re  # local import keeps this function self-contained

    # Patterns from PEP 263, applied to raw bytes.
    declaration = br'^[ \t\f]*#.*?coding[:=][ \t]*[-_.a-zA-Z0-9]+'
    comment_or_blank = br'^[ \t\f]*(?:[#\r\n]|$)'

    try:
        with open(file_path, 'rb') as f:
            first_line = f.readline()
            second_line = f.readline()
    except OSError:
        return False

    if first_line.startswith(b'\xef\xbb\xbf'):
        # Drop the BOM so it does not hide a comment on line 1.
        first_line = first_line[3:]

    if re.match(declaration, first_line):
        return True
    if not re.match(comment_or_blank, first_line):
        # Line 1 holds code or a docstring, so line 2 is not consulted.
        return False
    return re.match(declaration, second_line) is not None
def main():
    """Scan the directory containing this script and print an encoding report.

    Walks the tree rooted at this file's parent directory, skipping the
    excluded directory names, and runs ``check_file_encoding`` on every file
    whose (case-insensitive) extension is in the checked set. Python files
    that could be read are additionally checked for an encoding declaration
    via ``check_python_file_header``.

    Files that could not be checked are reported in their own section and
    are not counted as non-UTF-8 files, but they still make the run fail.

    Returns:
        bool: True when no non-UTF-8 file, no unreadable file and no Python
        file lacking an encoding declaration was found; False otherwise.
    """
    project_root = Path(__file__).parent

    # File extensions that are checked.
    check_extensions = {'.py', '.json', '.md', '.txt', '.html', '.css', '.js', '.sql', '.bat', '.sh'}
    # Directory names that are never descended into.
    exclude_dirs = {'.git', '.venv', '__pycache__', 'node_modules', '.idea', 'logs', 'data', 'dist', 'build'}

    results = []
    python_files_without_encoding = []

    print("=" * 80)
    print("文件编码检查工具")
    print("=" * 80)
    print()

    for root, dirs, files in os.walk(project_root):
        # Slice assignment mutates the list os.walk holds, which is what
        # prunes the excluded directories from the traversal.
        dirs[:] = [d for d in dirs if d not in exclude_dirs]

        for file in files:
            file_path = Path(root) / file
            suffix = file_path.suffix.lower()
            if suffix not in check_extensions:
                continue

            result = check_file_encoding(file_path)
            results.append(result)

            # Same case-insensitive suffix as the filter above. Unreadable
            # files are already reported as errors, so they are not also
            # listed as missing a declaration.
            if (
                suffix == '.py'
                and 'error' not in result
                and not check_python_file_header(file_path)
            ):
                python_files_without_encoding.append(file_path)

    # Keep failed checks apart from genuine non-UTF-8 findings.
    errors = [r for r in results if 'error' in r]
    non_utf8_results = [
        r for r in results
        if 'error' not in r and not r.get('is_utf8', False)
    ]

    total_files = len(results)
    non_utf8_files = len(non_utf8_results)
    utf8_files = total_files - non_utf8_files - len(errors)

    print(f"总计检查文件: {total_files}")
    print(f"UTF-8 编码文件: {utf8_files}")
    print(f"非 UTF-8 编码文件: {non_utf8_files}")
    if errors:
        print(f"检查出错文件: {len(errors)}")
    print()

    # Files detected as something other than UTF-8.
    if non_utf8_results:
        print("=" * 80)
        print("⚠️ 非 UTF-8 编码文件:")
        print("=" * 80)
        for result in non_utf8_results:
            print(f" {result['file']}")
            print(f" 编码: {result['encoding']} (置信度: {result['confidence']:.2%})")
            if result.get('has_bom'):
                print(" ⚠️ 包含 BOM")
            print()

    # Python files without an encoding declaration.
    if python_files_without_encoding:
        print("=" * 80)
        print("⚠️ Python 文件缺少编码声明:")
        print("=" * 80)
        for file_path in python_files_without_encoding:
            print(f" {file_path}")
        print()
        print("建议在这些文件开头添加: # -*- coding: utf-8 -*-")
        print()

    # Files that could not be checked at all.
    if errors:
        print("=" * 80)
        print("❌ 检查出错的文件:")
        print("=" * 80)
        for result in errors:
            print(f" {result['file']}: {result['error']}")
        print()

    # Errors still count as a failure, as they did when they were folded
    # into the non-UTF-8 count.
    success = (
        non_utf8_files == 0
        and not errors
        and not python_files_without_encoding
    )

    print("=" * 80)
    if success:
        print("✅ 所有文件编码检查通过!")
    else:
        print("⚠️ 发现编码问题,请根据上述信息修复")
    print("=" * 80)

    return success
if __name__ == "__main__":
    # Script entry point: verify the chardet dependency, run the check and
    # exit with status 0 on success or 1 on any failure.
    # NOTE(review): chardet is also imported unconditionally at the top of
    # this file, so a missing package presumably fails there before this
    # friendly message can be printed - confirm and consider guarding that
    # import instead.
    try:
        import chardet
    except ImportError:
        print("错误: 需要安装 chardet 库")
        print("运行: pip install chardet")
        sys.exit(1)

    if main():
        sys.exit(0)
    sys.exit(1)