ibis 中使用duckdb

import duckdb
import pandas as pd
import numpy as np
import ibis
from ibis import _
import ibis.selectors as s
import ibis.expr.operations as ops
ibis.options.interactive = True

# Create a DuckDB connection
# con = duckdb.connect('/mnt/d/duckdb.db')

# # Create a table
# con.execute("CREATE TABLE XG as select * from read_csv_auto('/mnt/c/Users/xuefliang/Downloads/新冠存疑2.csv')")
# con.sql('SHOW ALL TABLES')

con = ibis.connect("duckdb:/mnt/d/duckdb.db")

# con.create_table(
#     "penguins", ibis.examples.penguins.fetch().to_pyarrow(), overwrite = True
# )
con.list_tables()

xg = con.table("XG")

xg.head()

(
    xg
    .group_by("YM_PH")
    .aggregate(
        count=lambda _: _.count(),
        distinct_count=lambda _: _.ZJHM.nunique()
    )
)

tmp=(
    xg
    .mutate(CSRQ=_.CSRQ.cast("string"))
    .filter(_.ZJHM.length() == 18)
    .mutate(
        id_csrq=(
            _.ZJHM.substr(6, 4)
            .concat('-')
            .concat(_.ZJHM.substr(10, 2))
            .concat('-')
            .concat(_.ZJHM.substr(12, 2))
        )
    )
    .filter(_.id_csrq.notnull() & _.CSRQ.notnull())
    .filter(_.id_csrq != _.CSRQ)
)


tmp.compile()

tmp2=tmp.to_pandas()

# 使用 execute 执行查询后保存
result = tmp.execute()
con.create_table('tmp', result, overwrite=True)

#将结果保存到 DuckDB 表中
con.create_table('tmp', tmp.execute(), overwrite=True)

#删除临时表
con.drop_table('tmp')

con.list_tables()

# 重命名
xg.rename(str.lower)

xg.rename(grda_code='GRDA_CODE')

# 查看表结构
xg.schema

(
    xg
    .group_by("YM_PH")
    .aggregate(
        count=_.count(),
        distinct_count=_.ZJHM.nunique()
    )
)


tmp=(
    xg
    .mutate(CSRQ=_.CSRQ.cast("string"))
    .filter(_.ZJHM.length() == 18)
    .mutate(
        csrq=(
            _.ZJHM.substr(6, 4)
            .concat('-')
            .concat(_.ZJHM.substr(10, 2))
            .concat('-')
            .concat(_.ZJHM.substr(12, 2))
        )
    )
    .filter(_.csrq.notnull() & _.CSRQ.notnull())
    .filter(_.csrq != _.CSRQ)
    .execute()
)

#条件赋值
xg = xg.mutate(
    ZJHM=ibis.case()
        .when(_.ZJHM=='622924198402249014', None)
        .else_(xg.ZJHM)
        .end()
)

#筛选缺失值
(
    xg
    .filter(_.ZJHM.isnull())
)



评论

此博客中的热门博文

V2ray websocket(ws)+tls+nginx分流

Rstudio 使用代理