ibis实战
import ibis
import pandas as pd
from ibis import _
import ibis.selectors as s
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
ibis.options.interactive = True
con=ibis.duckdb.connect()
brk=con.read_xlsx("/mnt/c/users/Administrator/Downloads/report.xlsx")
tj=(
brk
.mutate(mon=_.时间.substr(5,2))
.group_by('mon')
.aggregate(
fbs=_.发病数.sum()
)
.order_by('mon')
.execute()
)
tj.to_excel("/mnt/c/users/Administrator/Downloads/tj.xlsx")
bl=con.read_csv("/mnt/c/users/Administrator/Downloads/2020.csv")
tmp=(
bl
.mutate(age=ibis.case().when(_.年龄.contains('月'),0)
.when(_.年龄.contains('岁'),_.年龄.re_extract(r'(\d+)岁', 1).cast('int'))
.else_(0)
.end())
.mutate(age_group=ibis.case().when(_.age==0,'0岁')
.when((_.age<=4) & (_.age>=1),'1-4岁')
.when((_.age>=5) & (_.age<=9),'5-9岁')
.when(_.age>=10,'>=10岁')
.else_('未知')
.end())
)
(
tmp
.group_by("age_group")
.aggregate(
n=_.年龄.count()
)
.mutate(prop=lambda x: (x.n / x.n.sum()*100).round(2))
.execute()
.to_excel("/mnt/c/users/Administrator/Downloads/f2020.xlsx")
)
(
bl
.mutate(zjhm=_.有效证件号.replace("'", ""))
.filter(_.zjhm.notnull())
.execute()
.to_excel("/mnt/c/users/Administrator/Downloads/tmp.xlsx")
)
(
bl
.mutate(zjhm=_.有效证件号.replace("'", ""))
.filter(_.zjhm.isin(['621226202411207222','640381201605302517']))
)
#####################################
import ibis
import ibis.selectors as s
from ibis import _
ibis.options.interactive = True
con=ibis.duckdb.connect()
brk=con.read_csv("/mnt/c/Users/xuefl/Downloads/报告卡2025-03-21+11_36_40.csv")
my=con.read_xlsx('/mnt/c/users/xuefl/Downloads/免疫史查询导入模板.xlsx')
brk=(
brk
.mutate(age=ibis.case().when(_.年龄.contains('月'),0)
.when(_.年龄.contains('岁'),_.年龄.re_extract(r'(\d+)岁', 1).cast('int'))
.else_(0)
.end())
.mutate(age_group=ibis.case().when(_.age==0,'0岁')
.when((_.age<=4) & (_.age>=1),'1-4岁')
.when((_.age>=5) & (_.age<=9),'5-9岁')
.when(_.age>=10,'>=10岁')
.else_('未知')
.end())
.mutate(zjhm=_.有效证件号.replace("'",""))
.asof_join(my,on=('zjhm','身份证号码'))
.mutate(shi=_.报告单位地区编码.cast('string').substr(0,4))
)
# tmp=(
# brk
# .filter(lambda t:t.shi.re_search("^62"))
# .execute()
# )
tmp=(
brk
.filter(_.shi.re_search("^62"))
.group_by('age_group','百白破疫苗免疫史')
.aggregate(n=_.卡片ID.count())
.pivot_wider(names_from='age_group',values_from='n')
.order_by('百白破疫苗免疫史')
.select('百白破疫苗免疫史','0岁','1-4岁','5-9岁','>=10岁')
.execute()
)
# tmp2=(
# brk
# .mutate(yue=_.发病日期.cast('date')-_.出生日期)
# .execute()
# )
tmp2 = (
brk
.mutate(发病日期=_.发病日期.cast('date'))
.mutate(
# 计算年份差
years_diff=(_.发病日期.year() - _.出生日期.year()),
# 计算月份差
months_diff=(_.发病日期.month() - _.出生日期.month()),
# 计算日差的调整项
day_adjust=ibis.case()
.when(_.发病日期.day() < _.出生日期.day(), -1)
.else_(0)
.end()
)
# 最终月龄计算:年份差×12 + 月份差 + 日调整
.mutate(月龄=_.years_diff * 12 + _.months_diff + _.day_adjust)
)
tj=(
tmp2
.filter((_.月龄>3) & (_.月龄<18*12))
.group_by('百白破疫苗免疫史')
.aggregate(n=_.卡片ID.count())
.mutate(prop=_.n/5038*100)
)
tj.execute().to_excel('/mnt/c/users/xuefl/Downloads/tj.xlsx',index=False)
评论
发表评论