特殊格式文本合并
import os
import chardet # pip install chardet
# Directory that holds the text files to merge (change this to your own directory).
directory = r'D:\txthb'  # example: r'D:\data\texts'
# Output paths: the merged text file and the log of files that could not be processed.
output_file = os.path.join(directory, 'merged.txt')
error_log = os.path.join(directory, 'errors.log')
# Fallback encodings, tried in order when detection gives no usable answer.
# Order matters:
#   * 'utf-8-sig' comes before 'utf-8' so a leading BOM is stripped instead of
#     surviving as U+FEFF in the text ('utf-8-sig' also decodes BOM-less UTF-8).
#   * 'latin1' decodes any byte sequence, so it must be the very last entry;
#     anything listed after it could never be reached.
common_encodings = ['utf-8-sig', 'utf-8', 'gbk', 'gb2312', 'big5', 'utf-16', 'cp1252', 'latin1']
def read_file_with_encoding(file_path):
    """Read a text file of unknown encoding and return its stripped content.

    The file is read once as raw bytes. The encoding guessed by ``chardet``
    is tried first; when there is no guess, or the guess is not a codec
    Python knows, or it cannot decode the data, every entry of the
    module-level ``common_encodings`` list is tried in order.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text, with CRLF / CR line endings translated to LF (as
        text-mode ``open`` would do) and leading/trailing whitespace removed.

    Raises:
        ValueError: If none of the candidate encodings can decode the file.
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read()

    # chardet reports None as the encoding when it cannot make a guess.
    detected = chardet.detect(raw_data)
    encoding = detected['encoding'] if detected['encoding'] else None

    # Detected encoding first, then the generic fallbacks.
    candidates = [encoding] if encoding else []
    candidates.extend(common_encodings)

    for enc in candidates:
        try:
            text = raw_data.decode(enc)
        except (UnicodeError, LookupError):
            # UnicodeError: the bytes are not valid in this encoding.
            # LookupError: the detector named a codec Python does not provide.
            continue
        # Decoding bytes directly skips text-mode newline translation, so
        # normalise line endings here to keep the returned text unchanged.
        return text.replace('\r\n', '\n').replace('\r', '\n').strip()

    if encoding:
        raise ValueError(f"检测到编码 {encoding} 但读取失败:{file_path}")
    raise ValueError(f"无法读取文件 {file_path},编码不支持。")
# Walk the directory and merge every .txt file into the output file,
# writing the content of each input file as a single output line.
with open(output_file, 'w', encoding='utf-8') as out, open(error_log, 'w', encoding='utf-8') as log:
    # os.listdir returns names in arbitrary order; sort them so the line
    # order of the merged file is reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):  # only .txt files are merged (adjust if needed)
            continue
        file_path = os.path.join(directory, filename)
        # The output file sits in the same directory and also ends in .txt;
        # skip it so the script never reads the file it is currently writing.
        if os.path.normcase(file_path) == os.path.normcase(output_file):
            continue
        try:
            content = read_file_with_encoding(file_path)
            # Collapse to one line: line breaks become spaces; the text itself
            # is otherwise left untouched (no Simplified/Traditional conversion).
            content_one_line = content.replace('\n', ' ').replace('\r', ' ')
            out.write(content_one_line + '\n')  # one output line per input file
            print(f"成功合并: {filename}")
        except Exception as e:
            # Best-effort batch job: record the failure and carry on with the
            # remaining files instead of aborting the whole merge.
            error_msg = f"错误处理 {filename}: {e}\n"
            log.write(error_msg)
            print(error_msg)
print(f"合并完成,输出文件: {output_file}")
print(f"如果有错误,查看: {error_log}")
❤️ 转载文章请注明出处,谢谢!❤️