博文

目前显示的是 十二月, 2021的博文

pandas eval及query

import pandas as pd data = pd.DataFrame({     "Student": ["A", "B", "C", "D"],      "Physics": [89,34,23,56],      "Chemistry": [34,56,98,56],      "Math": [34,94,50,59],     'Name':["LI","Zhang","Wang","Xiao"]     }) pd.eval("data.Physics+data.Chemistry+data.Math") data.eval("total=Physics+Chemistry+Math").query('Math>50') data.eval('total2=Student',engine='python') data.eval('total2=Student+Name',engine='python') # 'numexpr': This default engine evaluates pandas objects using # numexpr for large speed ups in complex expressions with large frames. # 'python': Performs operations as if you had eval’d in top # level python. This engine is generally not that useful. (data.eval('total=sqrt(Math)')  .query('Chemistry>34')) data=pd.read_excel(r"C:\Users\xuefe\Desk

onehot 编码

 from sklearn.preprocessing import LabelEncoder #Auto encodes any dataframe column of type category or object. def dummyEncode(df):         columnsToEncode = list(df.select_dtypes(include=['category','object']))         le = LabelEncoder()         for feature in columnsToEncode:             try:                 df[feature] = le.fit_transform(df[feature])             except:                 print('Error encoding '+feature)         return df      X=dummyEncode(X)

pandas 去重及选择重复

 import cx_Oracle import os import pandas as pd import numpy as np import geopandas as gpd yg=pd.read_excel(r"C:\Users\xuefe\Documents\乙肝项目.xlsx") yg.info() # duplicated 判断是否有重复项 mask=yg[['GRDA_CODE','YM_MC']].duplicated(keep=False) #选择重复 yg[mask] yg.duplicated() #去重 yg[~mask] yg['YM_MC'].drop_duplicates() yg.drop_duplicates(['YM_MC']) # 当keep=False时,就是去掉所有的重复行  # 当keep=‘first'时,就是保留第一次出现的重复行  # 当keep='last'时就是保留最后一次出现的重复行。

pandas 条件赋值

 sqltxt = '''   select t.grda_code,        t.grda_xm,        e.csrq,        t.ym_mc,        t.jz_sj,        t.jz_zc,        t.ym_yxq,        t.ym_ph,        t.ym_bm,        t.ymsccj_mc,        t.jzdd_mc,        t.jzdd_dm   from inoc_jzjl t, grda_cr e  where e.grda_cr_lsh = t.grda_et_lsh  and t.config_ymxx_lsh in ('2901','2902','2903','2904','2905','2906')  and  t.jz_sj>=to_date('2021-01-01','yyyy-mm-dd')  and jcqk='1'  '''   jzjl=pd.read_sql(sqltxt,con) jzjl.columns=[str(item).lower() for item in jzjl.columns] jzjl['jz_sj']=jzjl['jz_sj'].dt.date jzjl['csrq']=pd.to_datetime(jzjl['csrq'],format='%Y%m%d') jzjl=jzjl[jzjl['ym_ph'].isin(['20210411','20210310','20210205','20210206','20201212','20210102'])] test=jzjl test['pc']=np.where(test['ym_ph'].isin(['20210411','202103

install contextily

  conda install contextily --channel conda-forge

pandas 长转宽

 import cx_Oracle import os import re import math import pandas as pd import numpy as np from sqlalchemy import create_engine from sqlalchemy.engine import url from matplotlib import pyplot as plt  os.environ['NLS_LANG'] = 'AMERICAN_AMERICA.ZHS16GBK' con = cx_Oracle.connect("ipvsdb", "Mygh@2021#", "192.168.30.48/JZDB1") sqltxt = '''  select i.grda_code,        e.zjhm,        e.csrq,        i.jz_zc,        i.jz_sj,        i.jzdd_dm,        i.ym_bm   from inoc_jzjl i, grda_cr e  where e.grda_cr_lsh = i.grda_et_lsh    and e.jcqk = '1'    and i.config_ymxx_lsh in ('1801','1802','1803')    and i.jz_sj >=to_date('2020-01-01','yyyy-mm-dd')  '''   cxr=pd.read_sql(sqltxt,con) cxr.columns=[str.lower(item) for item in cxr.columns] cxr['shi']=cxr['jzdd_dm'].str[0:4] cxr['xian']=cxr['jzdd_dm'].str[0:6] cxr['age']=((cxr['jz_sj']-cxr[