xuefliang

博文

目前显示的是十二月, 2025的博文

将docx中的表格读入pandas

十二月 24, 2025

from docx import Document import pandas as pd import polars as pl def tables_to_pd ( word_file ): doc = Document ( word_file ) all_data = [] # 存储所有表格的数据行 headers = None # 存储表头 for i , table in enumerate ( doc . tables ): for j , row in enumerate ( table . rows ): row_data = [ cell . text for cell in row . cells ] if j == 0 : # 第一行作为表头 if headers is None : headers = row_data # 只保存第一个表格的表头 else : # 数据行 all_data . append ( row_data ) # 创建一个DataFrame if headers and all_data : ...

阅读全文

WSL doc2docx 并进行替换

十二月 09, 2025

import os import subprocess import tempfile import shutil from docx import Document from doc2docx import convert # 定义替换对照表 REPLACEMENT_DICT = { '陕西省' : '甘肃省' , '西安' : '兰州' , '咸阳' : '天水' , 'SXCDCSOP' : 'GSCDCSOP' , } def convert_doc_to_docx ( doc_path ): """使用 Windows PowerShell 调用 Word 来转换文件，并在转换成功后删除原doc文件""" try : # 将 Linux 路径转换为 Windows 路径 win_path = subprocess.check_output([ 'wslpath' , '-w' , doc_path]).decode().strip() docx_path = os.path.splitext(doc_path)[ 0 ] + ".docx" win_docx_path = subprocess.check_output([ 'wslpath' , '-w' , docx_path]).decode().strip() # 检查目标文件是否已存在 if os.path.exists(docx_path): print ( f "目标文件已存在: { docx_path } " ) ...

阅读全文

polars 长转宽

十二月 08, 2025

import polars as pl # pl.show_versions() yg = pl . read_excel ( "/mnt/c/Users/xuefliang/Downloads/乙肝.xlsx" ). with_columns ( pl . col ( "有效证件号" ). str . strip_chars ( "'" ), pl . col ( '卡片ID' ). str . strip_chars ( "'" ) ) jz = ( jz . sort ( 'jz_sj' ) . with_columns ( pl . col ( 'grda_et_lsh' ) . cum_count () . over ( 'grda_et_lsh' ) . alias ( 'jc' ) ) ) #pivot 长转宽 jz = ( pl . read_database_uri ( query = query , uri = uri ) . rename ( lambda col : col . lower ()) . with_columns ( pl . col ( "jz_zc" ). cast ( pl . Int32 )) . pivot ( index = "zjhm" , on = "jz_zc" , values = "jz_sj" , aggregate_function = "first" ) ) ( yg . select ( pl . concat_str ([ pl . lit ( "'" ), pl . col ( '有效证件号' ), pl . lit ...

阅读全文

detect_encoding

十二月 07, 2025

import chardet


def detect_encoding(filename: str) -> str:
    """Guess the text encoding of a file from its raw bytes.

    The file is opened in binary mode, read in full, and handed to
    ``chardet.detect``; the ``'encoding'`` entry of that result is returned.

    NOTE(review): chardet may report no encoding for some inputs, in which
    case the returned value would not be a ``str`` -- confirm against callers.
    """
    with open(filename, 'rb') as stream:
        payload = stream.read()
    guess = chardet.detect(payload)
    return guess['encoding']


# Inspect what chardet reports for the CSV (the return value is not stored).
detect_encoding('data/directors.csv')

# NOTE(review): `pl` is not imported in this excerpt -- presumably polars.
# The encoding is passed explicitly here, not taken from the call above.
directors = pl.read_csv("data/directors.csv", encoding="EUC-JP")

阅读全文