文本合并

特殊格式文本合并

import os
import chardet  # pip install chardet

# Directory to scan (change this to your own directory).
directory = r'D:\txthb'  # example: r'D:\data\texts'

# Output paths: the merged text and the error log are both written into
# the scanned directory itself.
output_file = os.path.join(directory, 'merged.txt')
error_log = os.path.join(directory, 'errors.log')

# Fallback encodings, tried in order until one decodes without error.
# Order matters:
#   * 'utf-8-sig' comes before 'utf-8' so a leading BOM is stripped instead
#     of surviving as U+FEFF in the text ('utf-8-sig' also reads BOM-less
#     UTF-8, so nothing is lost).
#   * 'gb2312' comes before its superset 'gbk'.
#   * 'latin1' accepts every possible byte sequence, so it must be last;
#     anything listed after it could never be reached.
common_encodings = ['utf-8-sig', 'utf-8', 'gb2312', 'gbk', 'big5', 'utf-16', 'cp1252', 'latin1']

def read_file_with_encoding(file_path):
    """Read a text file of unknown encoding and return its stripped content.

    The file is read once as raw bytes. chardet's guess is tried first;
    if chardet has no guess, or its guess cannot decode the data, each
    entry of the module-level ``common_encodings`` list is tried in order.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text with line endings normalised to ``\\n`` and
        leading/trailing whitespace removed.

    Raises:
        ValueError: If no candidate encoding can decode the file.
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read()

    detected = chardet.detect(raw_data)['encoding']

    # Detected encoding first, then the fallbacks (skipping a duplicate of
    # the detected name so the same codec is not tried twice).
    candidates = [detected] if detected else []
    candidates.extend(
        enc for enc in common_encodings
        if not detected or enc.lower() != detected.lower()
    )

    last_error = None
    for enc in candidates:
        try:
            text = raw_data.decode(enc)
        except (UnicodeDecodeError, LookupError) as exc:
            # UnicodeDecodeError: wrong guess for this data.
            # LookupError: the detector named a codec Python does not have.
            last_error = exc
            continue
        # Decoding bytes directly skips the newline translation that
        # text-mode open() performs, so apply the same normalisation here.
        return text.replace('\r\n', '\n').replace('\r', '\n').strip()

    raise ValueError(f"无法读取文件 {file_path}，编码不支持。") from last_error

# Walk the directory and merge every .txt file into the output file,
# one source file per output line. Failures are logged and do not stop
# the run.
with open(output_file, 'w', encoding='utf-8') as out, open(error_log, 'w', encoding='utf-8') as log:
    # The output file lives inside the scanned directory and ends in .txt,
    # so it has to be excluded explicitly; otherwise the script would read
    # its own partially written output and merge it into itself.
    output_key = os.path.normcase(os.path.abspath(output_file))

    # os.listdir() returns names in arbitrary order; sort them so the
    # merged file has a reproducible line order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):  # only .txt files are merged (adjust if needed)
            continue
        file_path = os.path.join(directory, filename)
        if os.path.normcase(os.path.abspath(file_path)) == output_key:
            continue
        try:
            content = read_file_with_encoding(file_path)
            # Flatten to a single line: newlines become spaces; the text is
            # otherwise left untouched (no simplified/traditional conversion).
            content_one_line = content.replace('\n', ' ').replace('\r', ' ')
            out.write(content_one_line + '\n')  # one output line per source file
            print(f"成功合并: {filename}")
        except Exception as e:
            # Batch boundary: record the failure and continue with the
            # remaining files.
            error_msg = f"错误处理 {filename}: {e}\n"
            log.write(error_msg)
            print(error_msg)

print(f"合并完成，输出文件: {output_file}")
print(f"如果有错误，查看: {error_log}")