158 lines
4.9 KiB
Python
158 lines
4.9 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
文件编码检查工具
|
|
检查项目中所有文件是否使用UTF-8编码
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import chardet
|
|
from pathlib import Path
|
|
|
|
def check_file_encoding(file_path: Path) -> dict:
    """Detect the encoding of a file and report whether it is UTF-8.

    The file is read as raw bytes and handed to ``chardet.detect``. A leading
    UTF-8 byte-order mark overrides the detected encoding with ``utf-8-sig``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        On success, a dict with the keys ``file``, ``encoding``,
        ``confidence``, ``has_bom``, ``is_utf8`` and ``size``.
        If reading or detection raises, a dict with only ``file`` and
        ``error`` (the exception message).
    """
    try:
        with open(file_path, 'rb') as f:
            raw_data = f.read()

        if not raw_data:
            # Zero bytes decode cleanly as UTF-8, and there is nothing for
            # the detector to look at, so answer directly instead of
            # reporting the file as a failure.
            return {
                'file': str(file_path),
                'encoding': 'utf-8',
                'confidence': 1.0,
                'has_bom': False,
                'is_utf8': True,
                'size': 0,
            }

        result = chardet.detect(raw_data)
        # The detector can return the key with a None value, in which case a
        # ``.get`` default is never used; ``or`` covers both cases so the
        # ``.lower()`` call below cannot fail on None.
        encoding = result.get('encoding') or 'unknown'
        confidence = result.get('confidence') or 0

        # A UTF-8 BOM is decisive regardless of what the detector guessed.
        has_bom = raw_data.startswith(b'\xef\xbb\xbf')
        if has_bom:
            encoding = 'utf-8-sig'

        return {
            'file': str(file_path),
            'encoding': encoding,
            'confidence': confidence,
            'has_bom': has_bom,
            'is_utf8': encoding.lower() in ('utf-8', 'utf-8-sig', 'ascii'),
            'size': len(raw_data),
        }
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the whole
        # scan; the caller reports entries that carry an 'error' key.
        return {
            'file': str(file_path),
            'error': str(e),
        }
|
|
|
|
def check_python_file_header(file_path: Path) -> bool:
    """Return True if a Python file carries a source-encoding declaration.

    Only a comment on the first or second line that matches the PEP 263
    pattern (``coding:`` or ``coding=`` followed by an encoding name) counts.
    The second line is considered only when the first line is blank or a
    comment, mirroring how the interpreter looks for the declaration.

    The file is read as bytes, so a UTF-8 BOM or a non-UTF-8 body does not
    prevent the header from being inspected.

    Args:
        file_path: Path of the Python source file.

    Returns:
        True when a declaration is present; False when it is absent or the
        file cannot be read.
    """
    # Local import: the rest of the file does not import ``re``.
    import re

    cookie_re = re.compile(rb'^[ \t\f]*#.*?coding[:=][ \t]*[-\w.]+')
    blank_or_comment_re = re.compile(rb'^[ \t\f]*(?:[#\r\n]|$)')

    try:
        with open(file_path, 'rb') as f:
            first_line = f.readline()
            second_line = f.readline()
    except OSError:
        return False

    # A BOM precedes the comment marker and would defeat the anchored match.
    if first_line.startswith(b'\xef\xbb\xbf'):
        first_line = first_line[3:]

    if cookie_re.match(first_line):
        return True
    if not blank_or_comment_re.match(first_line):
        # Code on line 1 means a declaration on line 2 is not honoured.
        return False
    return cookie_re.match(second_line) is not None
|
|
|
|
def main():
    """Scan the script's directory tree and print an encoding report.

    Walks the directory that contains this file, skipping the excluded
    directory names. Every file whose (lower-cased) extension is in the
    allow-list is passed to ``check_file_encoding``; Python files are also
    passed to ``check_python_file_header``. The report is printed to stdout.

    Returns:
        True when no non-UTF-8 file, no file that failed to be checked and
        no Python file without an encoding declaration was found;
        False otherwise.
    """
    project_root = Path(__file__).parent

    # File extensions that are checked.
    check_extensions = {'.py', '.json', '.md', '.txt', '.html', '.css', '.js', '.sql', '.bat', '.sh'}

    # Directory names that are never descended into.
    exclude_dirs = {'.git', '.venv', '__pycache__', 'node_modules', '.idea', 'logs', 'data', 'dist', 'build'}

    results = []
    python_files_without_encoding = []

    print("=" * 80)
    print("文件编码检查工具")
    print("=" * 80)
    print()

    for root, dirs, files in os.walk(project_root):
        # Prune in place so os.walk does not descend into excluded directories.
        dirs[:] = [d for d in dirs if d not in exclude_dirs]

        for file in files:
            file_path = Path(root) / file

            # Lower-case once and use it for both tests, so that e.g. ``.PY``
            # files get the header check as well as the encoding check.
            suffix = file_path.suffix.lower()
            if suffix not in check_extensions:
                continue

            results.append(check_file_encoding(file_path))

            if suffix == '.py' and not check_python_file_header(file_path):
                python_files_without_encoding.append(file_path)

    # Keep files that could not be checked apart from files that were
    # checked and found not to be UTF-8; they are reported in separate
    # sections and must not inflate the non-UTF-8 count.
    errors = [r for r in results if 'error' in r]
    non_utf8_results = [
        r for r in results
        if 'error' not in r and not r.get('is_utf8', False)
    ]

    total_files = len(results)
    utf8_files = sum(1 for r in results if r.get('is_utf8', False))
    non_utf8_files = len(non_utf8_results)

    print(f"总计检查文件: {total_files}")
    print(f"UTF-8 编码文件: {utf8_files}")
    print(f"非 UTF-8 编码文件: {non_utf8_files}")
    print()

    # Files detected as something other than UTF-8.
    if non_utf8_results:
        print("=" * 80)
        print("⚠️ 非 UTF-8 编码文件:")
        print("=" * 80)
        for result in non_utf8_results:
            print(f" {result['file']}")
            print(f" 编码: {result['encoding']} (置信度: {result['confidence']:.2%})")
            # NOTE(review): check_file_encoding labels UTF-8-BOM files as
            # 'utf-8-sig', which counts as UTF-8, so this branch looks
            # unreachable for entries in this list — confirm intent.
            if result.get('has_bom'):
                print(" ⚠️ 包含 BOM")
            print()

    # Python files without an encoding declaration.
    if python_files_without_encoding:
        print("=" * 80)
        print("⚠️ Python 文件缺少编码声明:")
        print("=" * 80)
        for file_path in python_files_without_encoding:
            print(f" {file_path}")
        print()
        print("建议在这些文件开头添加: # -*- coding: utf-8 -*-")
        print()

    # Files that could not be checked at all.
    if errors:
        print("=" * 80)
        print("❌ 检查出错的文件:")
        print("=" * 80)
        for result in errors:
            print(f" {result['file']}: {result['error']}")
        print()

    # Summary. A file that failed to be checked still fails the run.
    success = not (non_utf8_results or errors or python_files_without_encoding)

    print("=" * 80)
    if success:
        print("✅ 所有文件编码检查通过!")
    else:
        print("⚠️ 发现编码问题,请根据上述信息修复")
    print("=" * 80)

    return success
|
|
|
|
if __name__ == "__main__":
    # Print an installation hint and exit with status 1 when chardet is
    # not importable.
    # NOTE(review): chardet is also imported at the top of this file, so a
    # missing package would presumably raise there before this guard runs.
    try:
        import chardet
    except ImportError:
        for hint in ("错误: 需要安装 chardet 库", "运行: pip install chardet"):
            print(hint)
        sys.exit(1)

    # Exit status 0 when every check passed, 1 otherwise.
    exit_status = 0 if main() else 1
    sys.exit(exit_status)
|
|
|