# assist/check_encoding.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
File encoding checker.
Checks whether every file in the project is UTF-8 encoded.
"""

import os
import re
import sys
from pathlib import Path

import chardet

def check_file_encoding(file_path: Path) -> dict:
    """Detect the encoding of one file and report whether it is valid UTF-8.

    The raw bytes are first decoded strictly as UTF-8; that decode is the
    authoritative answer for ``is_utf8``. chardet is consulted only when the
    strict decode fails, to suggest what the encoding might be instead.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        On success, a dict with the keys ``file`` (str), ``encoding`` (str),
        ``confidence`` (number), ``has_bom`` (bool), ``is_utf8`` (bool) and
        ``size`` (byte count). If the file cannot be processed, a dict with
        only ``file`` and ``error`` (the exception message).
    """
    try:
        with open(file_path, 'rb') as f:
            raw_data = f.read()

        # A UTF-8 byte order mark is the three bytes EF BB BF at offset 0.
        has_bom = raw_data.startswith(b'\xef\xbb\xbf')

        try:
            text = raw_data.decode('utf-8')
        except UnicodeDecodeError:
            is_utf8 = False
            result = chardet.detect(raw_data)
            # chardet stores None under these keys when it cannot decide, so
            # a .get() default would not apply; fall back explicitly.
            encoding = result.get('encoding') or 'unknown'
            confidence = result.get('confidence') or 0
        else:
            is_utf8 = True
            confidence = 1.0
            if has_bom:
                encoding = 'utf-8-sig'
            elif len(text) == len(raw_data):
                # Every non-ASCII character needs at least two UTF-8 bytes,
                # so equal lengths mean the content is pure ASCII (this also
                # covers the empty file).
                encoding = 'ascii'
            else:
                encoding = 'utf-8'

        return {
            'file': str(file_path),
            'encoding': encoding,
            'confidence': confidence,
            'has_bom': has_bom,
            'is_utf8': is_utf8,
            'size': len(raw_data)
        }
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the scan;
        # the failure is reported back to the caller instead.
        return {
            'file': str(file_path),
            'error': str(e)
        }

# Encoding-declaration pattern from PEP 263: a comment containing
# "coding:" or "coding=" followed by an encoding name.
_CODING_DECLARATION_RE = re.compile(
    r'^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)'
)


def check_python_file_header(file_path: Path) -> bool:
    """Return True if the Python file carries a PEP 263 encoding declaration.

    Only the first two lines are examined, because that is where PEP 263
    requires the declaration to be, and only a comment of the form
    ``# ... coding[:=] <name>`` counts. A plain occurrence of the word
    "coding" or "encoding" in code or text does not.

    Args:
        file_path: Path of the Python source file to inspect.

    Returns:
        True when a declaration is found; False when none is found or when
        the file cannot be opened or is not decodable as UTF-8.
    """
    try:
        # utf-8-sig drops a leading BOM, which would otherwise sit in front
        # of the '#' and stop the anchored pattern from matching.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            first_lines = [f.readline() for _ in range(2)]
    except (OSError, UnicodeError):
        return False

    return any(_CODING_DECLARATION_RE.match(line) for line in first_lines)

def main():
    """Scan the directory containing this script and print an encoding report.

    Walks the tree rooted at ``Path(__file__).parent``, skipping the excluded
    directory names, and runs ``check_file_encoding`` on every file whose
    extension is in the checked set. Files with a ``.py`` extension are also
    passed to ``check_python_file_header``. A summary, the non-UTF-8 files,
    the Python files without an encoding declaration and the files that
    could not be checked are printed to stdout.

    Returns:
        bool: True when no file is non-UTF-8, no file failed to be checked
        and no Python file lacks an encoding declaration; False otherwise.
    """
    # NOTE(review): only the directory holding this script is scanned. If the
    # script lives in a subdirectory of the project, the rest of the project
    # is not covered -- confirm this is intended.
    project_root = Path(__file__).parent

    # File extensions that are checked.
    check_extensions = {'.py', '.json', '.md', '.txt', '.html', '.css', '.js', '.sql', '.bat', '.sh'}

    # Directory names that are never descended into.
    exclude_dirs = {'.git', '.venv', '__pycache__', 'node_modules', '.idea', 'logs', 'data', 'dist', 'build'}

    results = []
    python_files_without_encoding = []

    print("=" * 80)
    print("文件编码检查工具")
    print("=" * 80)
    print()

    for root, dirs, files in os.walk(project_root):
        # Assign in place so os.walk itself skips the excluded directories;
        # sorting keeps the report order stable between runs.
        dirs[:] = sorted(d for d in dirs if d not in exclude_dirs)

        for file in sorted(files):
            file_path = Path(root) / file

            # Compare the lowercased suffix everywhere so that e.g. "X.PY"
            # is treated the same as "x.py" by both checks.
            suffix = file_path.suffix.lower()
            if suffix not in check_extensions:
                continue

            results.append(check_file_encoding(file_path))

            if suffix == '.py' and not check_python_file_header(file_path):
                python_files_without_encoding.append(file_path)

    # Files that could not be checked are tracked separately: they are not
    # known to be non-UTF-8 and must not inflate that count.
    errors = [r for r in results if 'error' in r]
    non_utf8_results = [
        r for r in results
        if 'error' not in r and not r.get('is_utf8', False)
    ]
    utf8_files = sum(1 for r in results if r.get('is_utf8', False))

    print(f"总计检查文件: {len(results)}")
    print(f"UTF-8 编码文件: {utf8_files}")
    print(f"非 UTF-8 编码文件: {len(non_utf8_results)}")
    print(f"检查出错的文件: {len(errors)}")
    print()

    # Non-UTF-8 files.
    if non_utf8_results:
        print("=" * 80)
        print("⚠️  非 UTF-8 编码文件:")
        print("=" * 80)
        for result in non_utf8_results:
            print(f"  {result['file']}")
            print(f"    编码: {result['encoding']} (置信度: {result['confidence']:.2%})")
            if result.get('has_bom'):
                print("    ⚠️  包含 BOM")
            print()

    # Python files without an encoding declaration.
    if python_files_without_encoding:
        print("=" * 80)
        print("⚠️  Python 文件缺少编码声明:")
        print("=" * 80)
        for file_path in python_files_without_encoding:
            print(f"  {file_path}")
        print()
        print("建议在这些文件开头添加: # -*- coding: utf-8 -*-")
        print()

    # Files that could not be checked.
    if errors:
        print("=" * 80)
        print("❌ 检查出错的文件:")
        print("=" * 80)
        for result in errors:
            print(f"  {result['file']}: {result['error']}")
        print()

    # A file that could not be checked still fails the run.
    passed = (
        not non_utf8_results
        and not errors
        and not python_files_without_encoding
    )

    print("=" * 80)
    if passed:
        print("✅ 所有文件编码检查通过！")
    else:
        print("⚠️  发现编码问题，请根据上述信息修复")
    print("=" * 80)

    return passed

if __name__ == "__main__":
    # Print an install hint and exit with status 1 if chardet is missing.
    # NOTE(review): chardet is also imported at module level, so a missing
    # package would already have raised before this guard runs -- confirm
    # whether the hint is ever reachable.
    try:
        import chardet
    except ImportError:
        print("错误: 需要安装 chardet 库")
        print("运行: pip install chardet")
        sys.exit(1)

    # Exit status 0 when main() reports success, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)