polars
import polars as pl
df = pl.read_csv(
"/mnt/c/Users/Administrator/Downloads/百日咳小于2月大于7岁接种1.csv",
schema_overrides={
"编码": pl.Utf8,
"出生日期": pl.Date,
"百日咳接种日期": pl.Date,
}, # 将编码列指定为字符串类型
)
df.filter(pl.col("出生日期") > pl.date(2025, 1, 25))
df.glimpse()
df.head()
df.group_by(pl.col("出生日期")).agg(pl.col("编码").n_unique()).sort("出生日期")
df.group_by(pl.col("出生日期")).agg(pl.col("编码").count()).sort("出生日期")
df = pl.DataFrame(
{
"name": ["Alice", "Bob", "Charlie", "David"],
"department": ["HR", "IT", "Finance", "IT"],
"salary": [70000, 80000, 120000, 95000],
}
)
df = df.with_columns(
[pl.col("salary").cum_sum().over("department").alias("cumulative_salary")]
)
df = pl.DataFrame(
{
"name": ["Alice", "Bob", "Charlie", "David"],
"age": [25, 30, 35, 28],
"salary": [70000, 80000, 120000, 95000],
}
)
df_filter = df.filter((pl.col("age") > 30) & (pl.col("salary") > 80000))
df = pl.DataFrame(
{
"department": ["HR", "IT", "Finance", "IT", "HR", "Finance"],
"name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank"],
"salary": [70000, 80000, 120000, 95000, 75000, 130000],
}
)
df.group_by(["department"]).agg([pl.col("salary").mean().alias("avg_salary")])
df.group_by("department").agg([pl.col("salary").mean().alias("avg_salary")])
test = df.group_by("department").agg([pl.col("salary").sum().alias("sum_salary")])
df.group_by(pl.col("department")).agg([pl.col("salary").sum().alias("sum_salary")])
df = pl.read_csv(
"/mnt/c/Users/Administrator/Downloads/2013百白破.csv",
schema_overrides={
"GRDA_CODE": pl.Utf8,
"GRDA_XM": pl.Utf8,
"CSRQ": pl.Datetime, # 改为Datetime类型
"YM_MC": pl.Utf8,
"JZ_SJ": pl.Datetime,
"JZ_ZC": pl.UInt32,
"YM_YXQ": pl.Utf8,
"YM_PH": pl.Utf8,
"YMSCCJ_MC": pl.Utf8,
"ZZXS": pl.UInt16,
"JZDD_MC": pl.Utf8,
"JZDD_DM": pl.Utf8,
"JZYS_MC": pl.Utf8,
"GLDW_MC": pl.Utf8,
"GLDW_BM": pl.UInt64,
},
encoding="GB18030",
)
df = df.rename(lambda col_name: col_name.lower())
df = df.with_columns(
[
(pl.col("jz_sj").dt.date() - pl.col("csrq").dt.date())
.dt.total_days()
.alias("interval_days"),
((pl.col("jz_sj").dt.date() - pl.col("csrq").dt.date()).dt.total_days() / 28)
.floor()
.cast(pl.Int32)
.alias("months"),
(pl.col("jzdd_dm").str.slice(0, 4).alias("shi")),
(pl.col("jzdd_dm").str.slice(0, 6).alias("xian")),
]
)
df.filter((pl.col("months") < 2) & (pl.col("months") > 0)).group_by("shi", "xian").agg(
pl.col("grda_code").count().alias("数量")
).sort("shi")
tj = (
df.filter(
(pl.col("months") < 2)
& (pl.col("months") > 0)
& (pl.col("shi").str.starts_with("62"))
)
.group_by([pl.col("shi"), pl.col("xian")])
.agg(pl.col("grda_code").count().alias("数量"))
.sort("shi")
)
tj = (
df.filter(
(pl.col("months") < 2)
& (pl.col("months") > 0)
& (pl.col("shi").str.contains("^62"))
)
.group_by([pl.col("shi"), pl.col("xian")])
.agg(pl.col("grda_code").count().alias("数量"))
.sort("shi")
)
tj.to_pandas().to_excel('/mnt/c/Users/Administrator/Downloads/小于2月龄.xlsx')
(
df.lazy()
.filter((pl.col("months") > 12 * 7) & (pl.col("shi").str.starts_with("62")))
.group_by("shi")
.agg(pl.col("grda_code").count())
.sort(pl.col('shi'))
.collect()
)
# 条件赋值
df_conditional = df.select(
pl.col("nrs"),
pl.when(pl.col("nrs") > 2)
.then(True)
.otherwise(False)
.alias("conditional"),
)
# 利用前值进行填充
df = pl.DataFrame(
{
"a": [1, 2, None, 4],
"b": [0.5, 4, None, 13],
}
)
df.fill_null(99) #用单个值
df.fill_null(strategy="forward")
import polars as pl
df=pl.read_csv("/mnt/c/Users/Administrator/Downloads/jzjlcf.csv",
schema_overrides={'GRDA_ET_LSH':pl.Utf8,'GLDW_BM':pl.Utf8,'XT_DJJGDM':pl.Utf8,'XT_XGJGDM':pl.Utf8,
'JZDD_DM':pl.Utf8,'GRDA_CODE':pl.Utf8,'YM_BM':pl.Utf8,'config_ymxx_lsh':pl.Utf8})
df = df.rename({col: col.lower() for col in df.columns})
df.select('grda_code').n_unique()
df = df.with_columns(
pl.col('jz_sj').str.split(" ").list.first().str.to_date(strict=False).alias('jz_sj'),
pl.col('ym_bm').str.slice(0,2).alias('ymdl')
).with_columns(
pl.concat_str([
pl.col('grda_code'),
pl.col('ymdl'),
pl.col('jz_sj').dt.strftime('%Y-%m-%d')
], separator='_').alias('id')
)
tmp=df.filter(pl.col('id').is_duplicated())
df.group_by(['grda_code']).agg(pl.col('grda_code').count().alias("N"))
tmp=df.filter(pl.col('grda_code').is_unique())
result = (df
.with_columns(
pl.when(pl.col("ym_bm").str.slice(0, 2) != "28")
.then(pl.len().over("id"))
.otherwise(0)
.alias("dup_count")
)
.filter(pl.col("dup_count") > 1)
.drop("dup_count")
)
duplicate_df = (
df.filter(pl.col("ym_bm").str.slice(0, 2) != "28")
.group_by("id")
.agg(pl.len().alias("count"))
.filter(pl.col("count") > 1)
.select("id")
)
result = df.join(duplicate_df, on="id", how="inner")
# 选择重复
result=(df.filter(pl.col("ym_bm").str.slice(0, 2) != "28").filter(pl.col('id').is_duplicated()).sort(pl.col('id')))
df.select(pl.col('grda_code').count()).item()
df.select(pl.col('grda_code').len()).item()
df.select('grda_code').count()
pl.show_versions()
df.describe()
评论
发表评论