出生队列接种率不合格乡镇率
import math
import pandas as pd
import numpy as np
import janitor
import oracledb
shi_bm=pd.DataFrame.from_dict({'地区名称': {0: '兰州市',
1: '嘉峪关市',
2: '金昌市',
3: '白银市',
4: '天水市',
5: '武威市',
6: '张掖市',
7: '平凉市',
8: '酒泉市',
9: '庆阳市',
10: '定西市',
11: '陇南市',
12: '临夏回族自治州',
13: '甘南藏族自治州',
14: '兰州新区'},
'地区编码': {0: 6201,
1: 6202,
2: 6203,
3: 6204,
4: 6205,
5: 6206,
6: 6207,
7: 6208,
8: 6209,
9: 6210,
10: 6211,
11: 6212,
12: 6229,
13: 6230,
14: 6232}}
).astype({'地区编码':'str'})
xian_bm=pd.DataFrame.from_dict({'地区名称': {1: '城关区',
37: '七里河区',
73: '西固区',
109: '安宁区',
145: '红古区',
181: '永登县',
217: '皋兰县',
253: '榆中县',
289: '嘉峪关市',
325: '金川区',
361: '永昌县',
397: '白银区',
433: '平川区',
469: '靖远县',
505: '会宁县',
541: '景泰县',
577: '秦州区',
613: '麦积区',
649: '清水县',
685: '秦安县',
721: '甘谷县',
757: '武山县',
793: '张家川回族自治县',
829: '凉州区',
865: '民勤县',
901: '古浪县',
937: '天祝藏族自治县',
973: '甘州区',
1009: '肃南裕固族自治县',
1045: '民乐县',
1081: '临泽县',
1117: '高台县',
1153: '山丹县',
1189: '崆峒区',
1225: '泾川县',
1261: '灵台县',
1297: '崇信县',
1333: '庄浪县',
1369: '静宁县',
1405: '华亭市',
1441: '肃州区',
1477: '金塔县',
1513: '瓜州县',
1549: '肃北蒙古族自治县',
1585: '阿克塞哈萨克族自治县',
1621: '玉门市',
1657: '敦煌市',
1693: '西峰区',
1729: '庆城县',
1765: '环县',
1801: '华池县',
1837: '合水县',
1873: '正宁县',
1909: '宁县',
1945: '镇原县',
1981: '安定区',
2017: '通渭县',
2053: '陇西县',
2089: '渭源县',
2125: '临洮县',
2161: '漳县',
2197: '岷县',
2233: '武都区',
2269: '成县',
2305: '文县',
2341: '宕昌县',
2377: '康县',
2413: '西和县',
2449: '礼县',
2485: '徽县',
2521: '两当县',
2557: '临夏市',
2593: '临夏县',
2629: '康乐县',
2665: '永靖县',
2701: '广河县',
2737: '和政县',
2773: '东乡族自治县',
2809: '积石山保安族东乡族撒拉族自治县',
2845: '合作市',
2881: '临潭县',
2917: '卓尼县',
2953: '舟曲县',
2989: '迭部县',
3025: '玛曲县',
3061: '碌曲县',
3097: '夏河县',
3133: '兰州新区'},
'地区编码': {1: 620102,
37: 620103,
73: 620104,
109: 620105,
145: 620111,
181: 620121,
217: 620122,
253: 620123,
289: 620201,
325: 620302,
361: 620321,
397: 620402,
433: 620403,
469: 620421,
505: 620422,
541: 620423,
577: 620502,
613: 620503,
649: 620521,
685: 620522,
721: 620523,
757: 620524,
793: 620525,
829: 620602,
865: 620621,
901: 620622,
937: 620623,
973: 620702,
1009: 620721,
1045: 620722,
1081: 620723,
1117: 620724,
1153: 620725,
1189: 620802,
1225: 620821,
1261: 620822,
1297: 620823,
1333: 620825,
1369: 620826,
1405: 620881,
1441: 620902,
1477: 620921,
1513: 620922,
1549: 620923,
1585: 620924,
1621: 620981,
1657: 620982,
1693: 621002,
1729: 621021,
1765: 621022,
1801: 621023,
1837: 621024,
1873: 621025,
1909: 621026,
1945: 621027,
1981: 621102,
2017: 621121,
2053: 621122,
2089: 621123,
2125: 621124,
2161: 621125,
2197: 621126,
2233: 621202,
2269: 621221,
2305: 621222,
2341: 621223,
2377: 621224,
2413: 621225,
2449: 621226,
2485: 621227,
2521: 621228,
2557: 622901,
2593: 622921,
2629: 622922,
2665: 622923,
2701: 622924,
2737: 622925,
2773: 622926,
2809: 622927,
2845: 623001,
2881: 623021,
2917: 623022,
2953: 623023,
2989: 623024,
3025: 623025,
3061: 623026,
3097: 623027,
3133: 623201}}).astype({'地区编码':'str'})
def windows_to_wsl_path(windows_path):
wsl_prefix = '/mnt/'
path = windows_path.replace(':', '').replace('\\', '/').lower()
wsl_path = wsl_prefix + path
return wsl_path
def clean_file(input_path):
# Read Excel file and perform data cleaning
df = (
pd.read_excel(input_path, sheet_name='sheet1', skiprows=3)
.clean_names()
.apply(lambda x: x.fillna(method="ffill") if x.name in ['unnamed_0','unnamed_1','unnamed_2'] else x, axis=0)
.filter(regex='^un|率')
.query("unnamed_2 not in ['总人数','全程接种']")
.apply(lambda x: pd.to_numeric(x.str.replace("%", ""), errors='coerce') if '率' in x.name else x, axis=0)
)
# Rename columns
rename_dict = {df.columns[0]: 'dq', df.columns[1]: 'bm', df.columns[2]: 'ym', df.columns[3]: 'jc'}
for i in range(4, 22):
rename_dict[df.columns[i]] = f'age_{i - 3}'
df.rename(columns=rename_dict, inplace=True)
# Fill missing values based on conditions
df = df.apply(lambda x: x.fillna(0) if pd.api.types.is_numeric_dtype(x.dtype) else x.fillna('.'))
return df
# Save cleaned dataframe to a new Excel file
df.to_excel(output_path)
input_path = windows_to_wsl_path(r'C:\Users\xuefeng\Downloads\gnldet_jzl.xlsx')
output_path = windows_to_wsl_path(r'C:\Users\xuefeng\Downloads\gnldet_jzltest.xlsx')
jzl=(
clean_file(input_path)
.assign(shi=lambda x:x['bm'].apply(lambda x: str(x)[:4]))
.assign(xian=lambda x:x['bm'].apply(lambda x: str(x)[:6]))
.assign(bm=lambda x:x['bm'].apply(lambda x: str(x)[:8]))
.replace({'jc': {'.': 1}}))
shi=(
jzl
.groupby(['shi'])
.agg(乡镇数=('bm', lambda x: x.nunique()))
)
xian=(
jzl
.groupby(['xian'])
.agg(乡镇数=('bm', lambda x: x.nunique()))
)
def jzl_age(df,level, age_col,ym_list):
if level == 'shi':
group_cols = ['shi', 'ym', 'jc']
merge_df = shi
elif level == 'xian':
group_cols = ['xian', 'ym', 'jc']
merge_df = xian
age_df = (
df.query(f"{age_col} < 90")
.groupby(group_cols)
.agg(不合格乡镇数=('bm', lambda x: x.nunique()))
.reset_index()
.merge(merge_df, how='left', left_on=group_cols[0], right_on=group_cols[0])
.assign(不合格率=lambda x: x.不合格乡镇数 / x.乡镇数 * 100)
.astype({'jc': 'str'})
.assign(ym_jc=lambda x: x.ym.str.cat(x.jc, sep='_'))
.query("ym_jc.isin(@ym_list)")
.assign(age=age_col)
)
return age_df
age1_ym_list = ['卡介苗_1','乙肝疫苗_1','乙肝疫苗_2','乙肝疫苗_3','脊灰疫苗_1','脊灰疫苗_2','脊灰疫苗_3','百白破疫苗_1','百白破疫苗_2','百白破疫苗_3','含麻疹类疫苗_1','乙脑疫苗_1','A群流脑多糖疫苗_1','A群流脑多糖疫苗_2']
age2_ym_list=['甲肝疫苗_1','百白破疫苗_4','含麻疹类疫苗_2']
age3_ym_list=['乙脑疫苗_2']
age4_ym_list=['A群C群流脑多糖疫苗_1']
age5_ym_list=['脊灰疫苗_4']
age7_ym_list=['A群C群流脑多糖疫苗_2','白破疫苗_1']
shi_age1=jzl_age(df=jzl, level='shi',age_col='age_1', ym_list=age1_ym_list)
shi_age2=jzl_age(df=jzl, level='shi',age_col='age_2', ym_list=age2_ym_list)
shi_age3=jzl_age(df=jzl, level='shi',age_col='age_3', ym_list=age3_ym_list)
shi_age4=jzl_age(df=jzl, level='shi',age_col='age_4', ym_list=age4_ym_list)
shi_age5=jzl_age(df=jzl, level='shi',age_col='age_5', ym_list=age5_ym_list)
shi_age7=jzl_age(df=jzl, level='shi',age_col='age_7', ym_list=age7_ym_list)
(
pd.concat([shi_age1,shi_age2,shi_age3,shi_age4,shi_age5,shi_age7])
.drop('ym_jc',axis=1)
.merge(shi_bm,how='left',left_on='shi',right_on='地区编码')
.drop('shi',axis=1)
.assign(不合格率=lambda x: round(x.不合格率.astype(float), 2)) # 保留2位小数
.to_excel(windows_to_wsl_path(r'C:\Users\xuefeng\Downloads\shi_age.xlsx'),index=False)
)
xian_age1 = jzl_age(df=jzl, level='xian',age_col='age_1', ym_list=age1_ym_list)
xian_age2 = jzl_age(df=jzl, level='xian',age_col='age_2', ym_list=age2_ym_list)
xian_age3 = jzl_age(df=jzl, level='xian',age_col='age_3', ym_list=age3_ym_list)
xian_age4 = jzl_age(df=jzl, level='xian',age_col='age_4', ym_list=age4_ym_list)
xian_age5 = jzl_age(df=jzl, level='xian',age_col='age_5', ym_list=age5_ym_list)
xian_age7 = jzl_age(df=jzl, level='xian',age_col='age_7', ym_list=age7_ym_list)
(
pd.concat([xian_age1,xian_age2,xian_age3,xian_age4,xian_age5,xian_age7])
.drop('ym_jc',axis=1)
.merge(xian_bm,how='left',left_on='xian',right_on='地区编码')
.drop('xian',axis=1)
.assign(不合格率=lambda x: round(x.不合格率.astype(float), 2)) # 保留2位小数
.to_excel(windows_to_wsl_path(r'C:\Users\xuefeng\Downloads\xian_age.xlsx'),index=False)
)
评论
发表评论