博文

目前显示的是 2025 年 12 月的博文

将docx中的表格读入pandas

  from docx import Document import pandas as pd import polars as pl def tables_to_pd ( word_file ):     doc = Document ( word_file )     all_data = []   # 存储所有表格的数据行     headers = None   # 存储表头     for i , table in enumerate ( doc . tables ):         for j , row in enumerate ( table . rows ):             row_data = [ cell . text for cell in row . cells ]             if j == 0 :   # 第一行作为表头                 if headers is None :                     headers = row_data   # 只保存第一个表格的表头             else :   # 数据行                 all_data . append ( row_data )     # 创建一个DataFrame     if headers and all_data :       ...

WSL doc2docx 并进行替换

  import os import subprocess import tempfile import shutil from docx import Document from doc2docx import convert # 定义替换对照表 REPLACEMENT_DICT = {     '陕西省' : '甘肃省' ,     '西安' : '兰州' ,     '咸阳' : '天水' ,     'SXCDCSOP' : 'GSCDCSOP' , } def convert_doc_to_docx ( doc_path ):     """使用 Windows PowerShell 调用 Word 来转换文件,并在转换成功后删除原doc文件"""     try :         # 将 Linux 路径转换为 Windows 路径         win_path = subprocess.check_output([ 'wslpath' , '-w' , doc_path]).decode().strip()         docx_path = os.path.splitext(doc_path)[ 0 ] + ".docx"         win_docx_path = subprocess.check_output([ 'wslpath' , '-w' , docx_path]).decode().strip()                 # 检查目标文件是否已存在         if os.path.exists(docx_path):             print ( f "目标文件已存在: { docx_path } " ) ...

polars 长转宽

import polars as pl # pl.show_versions() yg = pl . read_excel ( "/mnt/c/Users/xuefliang/Downloads/乙肝.xlsx" ). with_columns (     pl . col ( "有效证件号" ). str . strip_chars ( "'" ),     pl . col ( '卡片ID' ). str . strip_chars ( "'" ) )   jz = (     jz . sort ( 'jz_sj' )     . with_columns (         pl . col ( 'grda_et_lsh' )         . cum_count ()         . over ( 'grda_et_lsh' )         . alias ( 'jc' )     ) ) #pivot 长转宽 jz = (     pl . read_database_uri ( query = query , uri = uri )     . rename ( lambda col : col . lower ())     . with_columns ( pl . col ( "jz_zc" ). cast ( pl . Int32 ))     . pivot ( index = "zjhm" , on = "jz_zc" , values = "jz_sj" , aggregate_function = "first" ) ) (     yg . select (         pl . concat_str ([ pl . lit ( "'" ), pl . col ( '有效证件号' ), pl . lit ...

detect_encoding

  import chardet def detect_encoding ( filename : str ) -> str :     with open (filename, 'rb' ) as f:         raw_data = f.read()         result = chardet.detect(raw_data)         return result[ 'encoding' ] detect_encoding( 'data/directors.csv' ) directors = pl.read_csv( "data/directors.csv" , encoding = "EUC-JP" )