ibis 中使用duckdb
import duckdb
import pandas as pd
import numpy as np
import ibis
from ibis import _
import ibis.selectors as s
import ibis.expr.operations as ops
ibis.options.interactive = True
# Create a DuckDB connection
# con = duckdb.connect('/mnt/d/duckdb.db')
# # Create a table
# con.execute("CREATE TABLE XG as select * from read_csv_auto('/mnt/c/Users/xuefliang/Downloads/新冠存疑2.csv')")
# con.sql('SHOW ALL TABLES')
con = ibis.connect("duckdb:/mnt/d/duckdb.db")
# con.create_table(
# "penguins", ibis.examples.penguins.fetch().to_pyarrow(), overwrite = True
# )
con.list_tables()
xg = con.table("XG")
xg.head()
(
xg
.group_by("YM_PH")
.aggregate(
count=lambda _: _.count(),
distinct_count=lambda _: _.ZJHM.nunique()
)
)
tmp=(
xg
.mutate(CSRQ=_.CSRQ.cast("string"))
.filter(_.ZJHM.length() == 18)
.mutate(
id_csrq=(
_.ZJHM.substr(6, 4)
.concat('-')
.concat(_.ZJHM.substr(10, 2))
.concat('-')
.concat(_.ZJHM.substr(12, 2))
)
)
.filter(_.id_csrq.notnull() & _.CSRQ.notnull())
.filter(_.id_csrq != _.CSRQ)
)
tmp.compile()
tmp2=tmp.to_pandas()
# 使用 execute 执行查询后保存
result = tmp.execute()
con.create_table('tmp', result, overwrite=True)
#将结果保存到 DuckDB 表中
con.create_table('tmp', tmp.execute(), overwrite=True)
#删除临时表
con.drop_table('tmp')
con.list_tables()
# 重命名
xg.rename(str.lower)
xg.rename(grda_code='GRDA_CODE')
# 查看表结构
xg.schema
(
xg
.group_by("YM_PH")
.aggregate(
count=_.count(),
distinct_count=_.ZJHM.nunique()
)
)
tmp=(
xg
.mutate(CSRQ=_.CSRQ.cast("string"))
.filter(_.ZJHM.length() == 18)
.mutate(
csrq=(
_.ZJHM.substr(6, 4)
.concat('-')
.concat(_.ZJHM.substr(10, 2))
.concat('-')
.concat(_.ZJHM.substr(12, 2))
)
)
.filter(_.csrq.notnull() & _.CSRQ.notnull())
.filter(_.csrq != _.CSRQ)
.execute()
)
#条件赋值
xg = xg.mutate(
ZJHM=ibis.case()
.when(_.ZJHM=='622924198402249014', None)
.else_(xg.ZJHM)
.end()
)
#筛选缺失值
(
xg
.filter(_.ZJHM.isnull())
)
评论
发表评论