WSL doc2docx 并进行替换

import os
import subprocess
import tempfile
import shutil
from docx import Document
from doc2docx import convert

# Replacement table: every occurrence of a key found in document text is
# replaced by the corresponding value.
REPLACEMENT_DICT = {
    '陕西省': '甘肃省',
    '西安': '兰州',
    '咸阳': '天水',
    'SXCDCSOP': 'GSCDCSOP',
}

def _ps_single_quoted(value):
    """Return *value* as a PowerShell single-quoted string literal.

    Single-quoted PowerShell strings perform no ``$variable`` or backtick
    expansion; the only characters that need escaping are the quote
    characters themselves (ASCII and the typographic variants PowerShell
    also accepts as single quotes), which are escaped by doubling.
    """
    quote_chars = "'\u2018\u2019\u201a\u201b"
    escaped = "".join(ch * 2 if ch in quote_chars else ch for ch in value)
    return f"'{escaped}'"


def _remove_original_doc(doc_path):
    """Delete the source .doc file, reporting (not raising) any failure."""
    try:
        os.remove(doc_path)
        print(f"已删除原文件: {doc_path}")
    except Exception as e:
        print(f"删除原文件失败: {doc_path}, 错误: {str(e)}")


def convert_doc_to_docx(doc_path):
    """Convert a .doc file to .docx by driving Word through Windows PowerShell.

    The Linux path is translated to a Windows path with ``wslpath -w`` and a
    short PowerShell script opens the document in Word and saves it next to
    the original with a ``.docx`` extension.

    Args:
        doc_path: Linux-side path of the .doc file.

    Returns:
        The path of the .docx file when it exists after the call (either it
        was already present or the conversion produced it); ``None`` when the
        conversion failed or any exception occurred.

    Side effects:
        The original .doc file is deleted whenever the .docx file exists.
    """
    try:
        # Translate Linux paths to Windows paths for Word.
        win_path = subprocess.check_output(['wslpath', '-w', doc_path]).decode().strip()
        docx_path = os.path.splitext(doc_path)[0] + ".docx"
        win_docx_path = subprocess.check_output(['wslpath', '-w', docx_path]).decode().strip()

        # Target already present: skip the conversion, just drop the source.
        if os.path.exists(docx_path):
            print(f"目标文件已存在: {docx_path}")
            _remove_original_doc(doc_path)
            return docx_path

        print(f"正在转换文件: {doc_path}")

        # Paths are embedded as single-quoted literals so that characters such
        # as `$`, backticks or double quotes in file names are taken verbatim
        # instead of being expanded by PowerShell.
        src_literal = _ps_single_quoted(win_path)
        dst_literal = _ps_single_quoted(win_docx_path)

        # 16 is the file-format code passed to SaveAs for the .docx format.
        ps_script = f'''
        $word = New-Object -ComObject Word.Application
        $word.Visible = $false
        $doc = $word.Documents.Open({src_literal})
        $doc.SaveAs([ref] {dst_literal}, [ref] 16)
        $doc.Close()
        $word.Quit()
        [System.Runtime.Interopservices.Marshal]::ReleaseComObject($word)
        '''

        # Run the PowerShell script and wait for it to finish.
        cmd = ['powershell.exe', '-Command', ps_script]
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        _output, error = process.communicate()

        # Success is judged by the presence of the output file.
        if os.path.exists(docx_path):
            print(f"转换成功: {docx_path}")
            _remove_original_doc(doc_path)
            return docx_path

        print(f"转换失败: {doc_path}")
        if error:
            # PowerShell may emit text in a non-UTF-8 code page; never let a
            # decoding problem hide the actual error message.
            print(f"错误信息: {error.decode(errors='replace')}")
        return None
    except Exception as e:
        print(f"转换过程中发生错误: {str(e)}")
        return None

def convert_all_docs_to_docx(directory):
    """Walk *directory* recursively and convert each .doc file to .docx.

    Temporary files (as judged by ``is_temp_file``) are skipped.

    Returns:
        A ``(converted, failed)`` tuple of counts.
    """
    succeeded = 0
    failed = 0

    for current_dir, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            lowered = filename.lower()
            # Only legacy .doc files are of interest here.
            if not lowered.endswith('.doc') or lowered.endswith('.docx'):
                continue

            source_path = os.path.join(current_dir, filename)
            if is_temp_file(source_path):
                continue

            try:
                result_path = convert_doc_to_docx(source_path)
                converted = bool(result_path) and os.path.exists(result_path)
            except Exception as exc:
                failed += 1
                print(f"转换文件 {source_path} 时发生错误: {str(exc)}")
                continue

            if converted:
                succeeded += 1
            else:
                failed += 1

    return succeeded, failed

def is_temp_file(file_path):
    """Return True when the file's base name contains a temp/backup marker."""
    base_name = os.path.basename(file_path)
    for marker in ('~$', '~WR', 'Backup of', '.tmp', '.bak'):
        if marker in base_name:
            return True
    return False

def _apply_replacements(text):
    """Return *text* with every REPLACEMENT_DICT key replaced by its value."""
    for target, replacement in REPLACEMENT_DICT.items():
        text = text.replace(target, replacement)
    return text


def replace_text_in_paragraph(paragraph):
    """Apply REPLACEMENT_DICT to the text of one paragraph.

    Args:
        paragraph: An object exposing ``text`` and ``runs`` (each run having a
            writable ``text``), such as a python-docx paragraph.

    Returns:
        True when the paragraph's runs were rewritten, False when the text
        needed no change or the paragraph has no runs to write into.
    """
    original_text = paragraph.text
    new_text = _apply_replacements(original_text)
    if new_text == original_text:
        return False

    runs = paragraph.runs
    if not runs:
        # Nothing we can write the new text into; report "not modified"
        # rather than claiming a change that did not happen.
        return False

    # Preferred path: when every match lies inside a single run, rewrite only
    # the affected runs so each run keeps its own formatting.
    per_run_texts = [_apply_replacements(run.text) for run in runs]
    if "".join(per_run_texts) == new_text:
        for run, run_text in zip(runs, per_run_texts):
            if run.text != run_text:
                run.text = run_text
        return True

    # Fallback: a match spans several runs. Put the whole text into the first
    # run and blank the rest. The first run is written unconditionally; the
    # earlier version skipped it when it was empty, and then blanked every
    # other run, which erased the paragraph's text.
    runs[0].text = new_text
    for run in runs[1:]:
        run.text = ""
    return True
def _replace_in_tables(tables):
    """Run the replacement over every cell of *tables*, including nested tables."""
    modified = False
    for table in tables:
        for row in table.rows:
            for cell in row.cells:
                if replace_text_in_table_cell(cell):
                    modified = True
                # Cells can themselves contain tables.
                if _replace_in_tables(cell.tables):
                    modified = True
    return modified


def _replace_in_story(story):
    """Run the replacement over the paragraphs and tables of a body, header or footer."""
    modified = False
    for paragraph in story.paragraphs:
        if replace_text_in_paragraph(paragraph):
            modified = True
    if _replace_in_tables(story.tables):
        modified = True
    return modified


def process_word_file(file_path):
    """Open a .docx file and apply the text replacements throughout it.

    Covers body paragraphs and tables (including nested tables), every
    header/footer variant of each section (primary, first-page, even-page)
    together with their tables, and field instruction text.

    Args:
        file_path: Path of the .docx file to process.

    Returns:
        ``(doc, modified)`` where ``doc`` is the opened document (not yet
        saved) and ``modified`` tells whether anything was changed. On any
        error the problem is printed and ``(None, False)`` is returned.
    """
    try:
        doc = Document(file_path)

        # Document body.
        modified = _replace_in_story(doc)

        # Headers and footers.
        for section in doc.sections:
            parts = (
                section.header,
                section.first_page_header,
                section.even_page_header,
                section.footer,
                section.first_page_footer,
                section.even_page_footer,
            )
            for part in parts:
                # A linked header/footer has no content of its own: it reuses
                # the previous section's definition, which is handled there.
                # Skipping it also avoids creating an empty definition just
                # by reading it.
                if part.is_linked_to_previous:
                    continue
                if _replace_in_story(part):
                    modified = True

        # Field codes (instruction text).
        if process_field_codes(doc):
            modified = True

        return doc, modified

    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {str(e)}")
        return None, False
def replace_text_in_table_cell(cell):
    """Apply the paragraph replacement to every paragraph of a table cell.

    Returns True when at least one paragraph was changed.
    """
    # Build the full list first so every paragraph is processed; a lazy
    # any() over a generator would stop at the first change.
    outcomes = [replace_text_in_paragraph(para) for para in cell.paragraphs]
    return any(outcomes)
def process_field_codes(doc):
    """Apply REPLACEMENT_DICT to field instruction text (``w:instrText``).

    This is a best-effort step: any failure is reported and the function
    returns whatever modification state it reached, instead of raising.

    Args:
        doc: A document object whose ``_element`` supports ``xpath``.

    Returns:
        True when at least one instruction text element was changed.
    """
    modified = False

    try:
        for element in doc._element.xpath('//w:instrText'):
            original_text = element.text
            if not original_text:
                continue

            new_text = original_text
            for target, replacement in REPLACEMENT_DICT.items():
                new_text = new_text.replace(target, replacement)

            if new_text != original_text:
                element.text = new_text
                modified = True
    except Exception as e:
        # Still best-effort, but no longer silent, and no longer swallowing
        # KeyboardInterrupt/SystemExit as the bare `except:` did.
        print(f"处理字段代码时发生错误: {str(e)}")

    return modified

def rename_files_in_directory(directory):
    """Recursively rename files and sub-directories containing 'SXCDCSOP'.

    Every occurrence of 'SXCDCSOP' in a file or directory name below
    *directory* is replaced with 'GSCDCSOP'. An entry whose new name already
    exists is left alone and counted as an error.

    Returns:
        A ``(renamed, errors)`` tuple of counts.
    """
    successes = 0
    failures = 0

    for parent, subdir_names, file_names in os.walk(directory):
        # Files of the current directory first.
        for name in file_names:
            if 'SXCDCSOP' not in name:
                continue

            source = os.path.join(parent, name)
            destination = os.path.join(parent, name.replace('SXCDCSOP', 'GSCDCSOP'))

            try:
                # Never overwrite an existing file.
                if os.path.exists(destination):
                    print(f"目标文件已存在，无法重命名: {destination}")
                    failures += 1
                    continue

                os.rename(source, destination)
                print(f"重命名文件: {source} -> {destination}")
                successes += 1
            except Exception as exc:
                print(f"重命名文件失败: {source}, 错误: {str(exc)}")
                failures += 1

        # Then the sub-directories, last to first.
        for index in reversed(range(len(subdir_names))):
            current_name = subdir_names[index]
            if 'SXCDCSOP' not in current_name:
                continue

            renamed = current_name.replace('SXCDCSOP', 'GSCDCSOP')
            source_dir = os.path.join(parent, current_name)
            destination_dir = os.path.join(parent, renamed)

            try:
                # Never merge into an existing directory.
                if os.path.exists(destination_dir):
                    print(f"目标目录已存在，无法重命名: {destination_dir}")
                    failures += 1
                    continue

                os.rename(source_dir, destination_dir)
                print(f"重命名目录: {source_dir} -> {destination_dir}")
                successes += 1
                # os.walk descends using this very list, so it must carry the
                # new name for the walk to enter the renamed directory.
                subdir_names[index] = renamed
            except Exception as exc:
                print(f"重命名目录失败: {source_dir}, 错误: {str(exc)}")
                failures += 1

    return successes, failures


def main(directory_path):
    """Run the whole pipeline on *directory_path*.

    Steps:
        1. Convert every .doc file to .docx.
        2. Apply the text replacements to every .docx file and save the
           files that changed.
        3. Rename files and directories.

    Progress and statistics are printed; nothing is returned. If the
    directory does not exist the function prints a message and returns.
    """
    if not os.path.exists(directory_path):
        print(f"目录不存在: {directory_path}")
        return

    print("第一步：转换所有.doc文件为.docx格式")
    converted_files, error_files = convert_all_docs_to_docx(directory_path)
    print(f"转换完成！成功转换: {converted_files} 个文件，失败: {error_files} 个文件\n")

    print("第二步：进行文本替换")
    total_files = 0
    modified_files = 0
    error_files = 0
    skipped_temp_files = 0

    try:
        for root, dirs, files in os.walk(directory_path):
            for file in files:
                if not file.lower().endswith('.docx'):
                    continue

                file_path = os.path.join(root, file)

                if is_temp_file(file_path):
                    skipped_temp_files += 1
                    continue

                total_files += 1

                try:
                    doc, modified = process_word_file(file_path)
                    if doc is None:
                        # process_word_file signals failure with a None
                        # document (it has already printed the reason).
                        # Count it as an error instead of reporting the
                        # file as "no change needed".
                        error_files += 1
                    elif modified:
                        doc.save(file_path)
                        modified_files += 1
                        print(f"已成功修改并保存: {file_path}")
                    else:
                        print(f"无需修改: {file_path}")
                except Exception as e:
                    error_files += 1
                    print(f"处理文件 {file_path} 时发生错误: {str(e)}")

        print("\n处理完成！统计信息：")
        print(f"总计处理文件数: {total_files}")
        print(f"成功修改内容文件数: {modified_files}")
        print(f"处理失败文件数: {error_files}")
        print(f"未作修改文件数: {total_files - modified_files - error_files}")
        print(f"跳过的临时文件数: {skipped_temp_files}")

        print("\n第三步：重命名文件和目录")
        renamed_count, rename_error_count = rename_files_in_directory(directory_path)
        print(f"重命名完成！成功重命名: {renamed_count} 个文件/目录，失败: {rename_error_count} 个")

    except Exception as e:
        print(f"处理过程中发生错误: {str(e)}")



if __name__ == "__main__":
    import sys

    # Default target directory; an optional first command-line argument
    # overrides it so the script can be reused without editing the source.
    default_directory = "/mnt/c/Users/xuefliang/Downloads/PCV24项目SOP 6.17修订版/"
    directory_path = sys.argv[1] if len(sys.argv) > 1 else default_directory
    main(directory_path)
搜索此博客

xuefliang

WSL doc2docx 并进行替换

评论

发表评论

此博客中的热门博文

windows 命令行下查看端口占用情况的方法

V2ray websocket(ws)+tls+nginx分流

Rstudio 使用代理

WSL doc2docx 并进行替换

评论

发表评论

此博客中的热门博文

windows 命令行下 查看端口占用情况的方法

V2ray websocket(ws)+tls+nginx分流

Rstudio 使用代理

windows 命令行下查看端口占用情况的方法