# WSL doc2docx 并进行替换
import os
import subprocess
import tempfile
import shutil
from docx import Document
from doc2docx import convert
# Replacement lookup table: each key found in paragraph text or in
# field-instruction text is replaced by the corresponding value.
REPLACEMENT_DICT = {
'陕西省': '甘肃省',
'西安': '兰州',
'咸阳': '天水',
'SXCDCSOP': 'GSCDCSOP',
}
def _ps_single_quote(value):
    """Return *value* as a PowerShell single-quoted string literal."""
    # Nothing is expanded inside single quotes ($, backtick and double quote
    # stay literal); an embedded single quote is written by doubling it.
    return "'" + value.replace("'", "''") + "'"


def _remove_source_doc(doc_path):
    """Delete the original .doc file, reporting (not raising) on failure."""
    try:
        os.remove(doc_path)
        print(f"已删除原文件: {doc_path}")
    except OSError as e:
        print(f"删除原文件失败: {doc_path}, 错误: {str(e)}")


def convert_doc_to_docx(doc_path):
    """Convert one .doc file to .docx by driving Word through PowerShell.

    The original .doc file is deleted once a .docx with the same base name
    exists, either because it was already there or because the conversion
    just produced it.

    Args:
        doc_path: Linux (WSL) path of the .doc file.

    Returns:
        The path of the .docx file, or None when the conversion failed.
    """
    docx_path = os.path.splitext(doc_path)[0] + ".docx"

    # Nothing to convert: do this check first so no external process is
    # started when the target is already present.
    if os.path.exists(docx_path):
        print(f"目标文件已存在: {docx_path}")
        _remove_source_doc(doc_path)
        return docx_path

    print(f"正在转换文件: {doc_path}")
    try:
        # Translate the Linux path into the Windows path Word understands.
        win_path = subprocess.check_output(['wslpath', '-w', doc_path]).decode().strip()
        # Derive the target from the translated source path instead of asking
        # wslpath about a file that does not exist yet.
        extension = os.path.splitext(doc_path)[1]
        win_stem = win_path[:len(win_path) - len(extension)] if extension else win_path
        win_docx_path = win_stem + ".docx"

        # try/finally on the PowerShell side guarantees that Word is closed
        # even when Open or SaveAs throws; 16 is the .docx save format.
        ps_script = "\n".join([
            "$ErrorActionPreference = 'Stop'",
            "$word = New-Object -ComObject Word.Application",
            "try {",
            "    $word.Visible = $false",
            # 0 = no alerts: an invisible Word must never wait on a dialog.
            "    $word.DisplayAlerts = 0",
            f"    $doc = $word.Documents.Open({_ps_single_quote(win_path)})",
            "    try {",
            f"        $doc.SaveAs([ref] {_ps_single_quote(win_docx_path)}, [ref] 16)",
            "    } finally {",
            "        $doc.Close()",
            "    }",
            "} finally {",
            "    $word.Quit()",
            "    [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($word)",
            "}",
        ])

        result = subprocess.run(
            ['powershell.exe', '-NoProfile', '-NonInteractive', '-Command', ps_script],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
        print(f"转换过程中发生错误: {str(e)}")
        return None

    # Success is judged by the presence of the output file.
    if os.path.exists(docx_path):
        print(f"转换成功: {docx_path}")
        _remove_source_doc(doc_path)
        return docx_path

    print(f"转换失败: {doc_path}")
    if result.stderr:
        # Console output may not be UTF-8; never let decoding hide the error.
        print(f"错误信息: {result.stderr.decode(errors='replace')}")
    return None
def convert_all_docs_to_docx(directory):
    """Recursively convert every .doc file under *directory* to .docx.

    Files recognised by ``is_temp_file`` are skipped and are not counted.

    Args:
        directory: Root directory to walk.

    Returns:
        A ``(converted_files, error_files)`` tuple of counters.
    """
    converted_files = 0
    error_files = 0
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # A name ending in ".doc" can never end in ".docx", so a single
            # suffix test is enough to exclude already-converted files.
            if not file.lower().endswith('.doc'):
                continue
            file_path = os.path.join(root, file)
            if is_temp_file(file_path):
                continue
            try:
                docx_path = convert_doc_to_docx(file_path)
            except Exception as e:
                # Per-file boundary: one bad file must not stop the batch.
                error_files += 1
                print(f"转换文件 {file_path} 时发生错误: {str(e)}")
                continue
            if docx_path and os.path.exists(docx_path):
                converted_files += 1
            else:
                error_files += 1
    return converted_files, error_files
def is_temp_file(file_path):
    """Return True when the file's base name contains a temp/backup marker.

    Only the final path component is inspected; the markers are matched as
    case-sensitive substrings anywhere in that name.
    """
    file_name = os.path.basename(file_path)
    temp_patterns = ('~$', '~WR', 'Backup of', '.tmp', '.bak')
    return any(pattern in file_name for pattern in temp_patterns)
def _apply_replacements(text):
    """Return *text* with every REPLACEMENT_DICT key replaced by its value."""
    for target, replacement in REPLACEMENT_DICT.items():
        text = text.replace(target, replacement)
    return text


def replace_text_in_paragraph(paragraph):
    """Apply REPLACEMENT_DICT to the runs of one paragraph.

    The text is taken from the paragraph's runs, because the runs are the
    only place the result can be written back to. When every match lies
    inside a single run, only the affected runs are rewritten, so the other
    runs keep their text and formatting. When a match straddles a run
    boundary, the whole text is placed in the first run and the remaining
    runs are emptied.

    Args:
        paragraph: Object exposing ``runs``, each with a writable ``text``.

    Returns:
        True when at least one run was changed, otherwise False.
    """
    runs = list(paragraph.runs)
    if not runs:
        # Nothing can be written back, so nothing counts as modified.
        return False

    run_texts = [run.text for run in runs]
    original_text = "".join(run_texts)
    new_text = _apply_replacements(original_text)
    if new_text == original_text:
        return False

    per_run_texts = [_apply_replacements(text) for text in run_texts]
    if "".join(per_run_texts) == new_text:
        # No match crosses a run boundary: edit in place, keep formatting.
        for run, old, new in zip(runs, run_texts, per_run_texts):
            if new != old:
                run.text = new
    else:
        # A match spans several runs. Always write into the first run, even
        # when it is currently empty, so the text is never lost.
        runs[0].text = new_text
        for run in runs[1:]:
            run.text = ""
    return True
def _replace_in_container(container):
    """Run the replacement over a container's paragraphs and its tables.

    *container* is anything exposing ``paragraphs`` and ``tables`` (the
    document body, a header or a footer). Returns True if anything changed.
    """
    modified = False
    for paragraph in container.paragraphs:
        if replace_text_in_paragraph(paragraph):
            modified = True
    for table in container.tables:
        for row in table.rows:
            for cell in row.cells:
                if replace_text_in_table_cell(cell):
                    modified = True
    return modified


def process_word_file(file_path):
    """Open one .docx file and apply the text replacements in memory.

    The body, every section's header and footer (paragraphs and tables), and
    the field codes are processed. The document is NOT saved here; the
    caller decides whether to save it.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        ``(doc, modified)``: the opened document and whether anything was
        changed. ``(None, False)`` when the file could not be processed; the
        error is printed rather than raised.
    """
    try:
        doc = Document(file_path)

        # Body paragraphs and tables.
        modified = _replace_in_container(doc)

        # Headers and footers, including any tables they contain.
        for section in doc.sections:
            for part in (section.header, section.footer):
                if _replace_in_container(part):
                    modified = True

        # Field codes are not part of the run text handled above.
        if process_field_codes(doc):
            modified = True

        return doc, modified
    except Exception as e:
        # Per-file boundary: report and let the batch continue.
        print(f"处理文件 {file_path} 时发生错误: {str(e)}")
        return None, False
def replace_text_in_table_cell(cell):
    """Apply the text replacements to everything inside one table cell.

    Covers the cell's own paragraphs and, recursively, the cells of any
    table nested inside it.

    Args:
        cell: Object exposing ``paragraphs`` and ``tables``.

    Returns:
        True when at least one paragraph was changed, otherwise False.
    """
    modified = False
    # No short-circuiting: every paragraph must be visited.
    for paragraph in cell.paragraphs:
        if replace_text_in_paragraph(paragraph):
            modified = True
    for table in cell.tables:
        for row in table.rows:
            for nested_cell in row.cells:
                if replace_text_in_table_cell(nested_cell):
                    modified = True
    return modified
def process_field_codes(doc):
    """Apply the text replacements to field-instruction text (best effort).

    Every ``w:instrText`` element returned by the document element's XPath
    query is rewritten when it contains a REPLACEMENT_DICT key. A failure is
    reported and does not abort processing of the file.

    Args:
        doc: Document whose ``_element`` supports ``xpath``.

    Returns:
        True when at least one element was changed, otherwise False.
    """
    modified = False
    try:
        for element in doc._element.xpath('//w:instrText'):
            original_text = element.text
            if not original_text:
                continue
            new_text = original_text
            for target, replacement in REPLACEMENT_DICT.items():
                new_text = new_text.replace(target, replacement)
            if new_text != original_text:
                element.text = new_text
                modified = True
    except Exception as e:
        # Still best effort, but no longer silent (and no longer swallowing
        # KeyboardInterrupt/SystemExit as the bare ``except`` did).
        print(f"处理字段代码时发生错误: {str(e)}")
    return modified
def _rename_entry(parent, name, old, new, label):
    """Rename one file or directory inside *parent*.

    *label* is the noun used in the printed messages. Returns the new name
    on success, or None when the target already exists or the rename failed.
    """
    old_path = os.path.join(parent, name)
    new_name = name.replace(old, new)
    new_path = os.path.join(parent, new_name)
    # Explicit check: os.rename may silently overwrite an existing file.
    if os.path.exists(new_path):
        print(f"目标{label}已存在,无法重命名: {new_path}")
        return None
    try:
        os.rename(old_path, new_path)
    except OSError as e:
        print(f"重命名{label}失败: {old_path}, 错误: {str(e)}")
        return None
    print(f"重命名{label}: {old_path} -> {new_path}")
    return new_name


def rename_files_in_directory(directory, old='SXCDCSOP', new='GSCDCSOP'):
    """Recursively rename files and directories whose name contains *old*.

    Every occurrence of *old* in a name is replaced by *new*. The root
    *directory* itself is never renamed. An entry whose target name already
    exists is left alone and counted as an error.

    Args:
        directory: Root directory to walk.
        old: Substring to look for in names (default ``'SXCDCSOP'``).
        new: Replacement substring (default ``'GSCDCSOP'``).

    Returns:
        A ``(renamed_count, error_count)`` tuple of counters.

    Raises:
        ValueError: If *old* is empty.
    """
    if not old:
        raise ValueError("old must be a non-empty string")

    renamed_count = 0
    error_count = 0
    for root, dirs, files in os.walk(directory):
        for file in files:
            if old not in file:
                continue
            if _rename_entry(root, file, old, new, "文件") is None:
                error_count += 1
            else:
                renamed_count += 1

        for i, dir_name in enumerate(dirs):
            if old not in dir_name:
                continue
            new_dir_name = _rename_entry(root, dir_name, old, new, "目录")
            if new_dir_name is None:
                error_count += 1
                continue
            renamed_count += 1
            # os.walk (top-down) descends using this list, so it must carry
            # the new name for the walk to enter the renamed directory.
            dirs[i] = new_dir_name
    return renamed_count, error_count
def main(directory_path):
    """Run the whole pipeline on *directory_path*.

    Steps: (1) convert every .doc file to .docx, (2) apply the text
    replacements to every .docx file and save the ones that changed,
    (3) rename files and directories. Progress and statistics are printed.

    Args:
        directory_path: Root directory to process. When it does not exist, a
            message is printed and nothing else happens.
    """
    if not os.path.exists(directory_path):
        print(f"目录不存在: {directory_path}")
        return

    print("第一步:转换所有.doc文件为.docx格式")
    converted_files, convert_errors = convert_all_docs_to_docx(directory_path)
    print(f"转换完成!成功转换: {converted_files} 个文件,失败: {convert_errors} 个文件\n")

    print("第二步:进行文本替换")
    total_files = 0
    modified_files = 0
    error_files = 0
    skipped_temp_files = 0
    try:
        for root, _dirs, files in os.walk(directory_path):
            for file in files:
                if not file.lower().endswith('.docx'):
                    continue
                file_path = os.path.join(root, file)
                if is_temp_file(file_path):
                    skipped_temp_files += 1
                    continue
                total_files += 1
                try:
                    doc, modified = process_word_file(file_path)
                    if doc is None:
                        # The file could not be processed (the reason was
                        # already printed): count it as a failure instead of
                        # reporting it as "no change needed".
                        error_files += 1
                    elif modified:
                        doc.save(file_path)
                        modified_files += 1
                        print(f"已成功修改并保存: {file_path}")
                    else:
                        print(f"无需修改: {file_path}")
                except Exception as e:
                    error_files += 1
                    print(f"处理文件 {file_path} 时发生错误: {str(e)}")

        print("\n处理完成!统计信息:")
        print(f"总计处理文件数: {total_files}")
        print(f"成功修改内容文件数: {modified_files}")
        print(f"处理失败文件数: {error_files}")
        print(f"未作修改文件数: {total_files - modified_files - error_files}")
        print(f"跳过的临时文件数: {skipped_temp_files}")

        print("\n第三步:重命名文件和目录")
        renamed_count, rename_error_count = rename_files_in_directory(directory_path)
        print(f"重命名完成!成功重命名: {renamed_count} 个文件/目录,失败: {rename_error_count} 个")
    except Exception as e:
        # Top-level boundary of the script: report instead of a traceback.
        print(f"处理过程中发生错误: {str(e)}")
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default directory.
    default_directory_path = "/mnt/c/Users/xuefliang/Downloads/PCV24项目SOP 6.17修订版/"
    directory_path = sys.argv[1] if len(sys.argv) > 1 else default_directory_path
    main(directory_path)
# 评论
# 发表评论