Posts

Showing posts from September, 2021

Group-wise sampling with a per-group sample size

library(dplyr)
library(purrr)
library(tidyr)

df <- data.frame(id  = 1:15,
                 grp = rep(1:3, each = 5),
                 frq = rep(c(3, 2, 4), each = 5))
set.seed(22)
df %>%
  group_by(grp) %>%                                 # for every group
  summarise(d = list(data.frame(id = id)),          # create a data frame of ids
            frq = unique(frq)) %>%                  # get the unique frq value
  mutate(v = map2(d, frq, ~ sample_n(.x, .y))) %>%  # sample frq rows from each id data frame
  unnest(v) %>%                                     # unnest the sampled values
  select(-frq)                                      # drop the frq column (if needed)
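
The same group-wise sampling can be sketched in pandas as well; this is a rough equivalent of the R pipeline, assuming the same id/grp/frq layout, and is only an illustration rather than part of the original post:

import pandas as pd

df = pd.DataFrame({'id': range(1, 16),
                   'grp': [1] * 5 + [2] * 5 + [3] * 5,
                   'frq': [3] * 5 + [2] * 5 + [4] * 5})

# frq is constant within a group, so each group can draw its own sample size
sampled = (df.groupby('grp', group_keys=False)
             .apply(lambda g: g.sample(n=g['frq'].iloc[0], random_state=22)))
print(sampled[['grp', 'id']])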

Reading a CSV file with a specified encoding

library(readr)

# Base R: specify the file encoding explicitly
xzqh <- read.csv('./Documents/地区编码.csv', fileEncoding = "GBK", stringsAsFactors = FALSE)

# readr: locale() is an important argument of read_csv(); it specifies the month and
# day names used for dates, the time zone, the character encoding, the date/time
# formats, and the decimal and grouping marks for numbers.
xzqh <- read_csv('./Documents/地区编码.csv',
                 locale = locale(date_format = "%Y/%m/%d",
                                 time_format = "h%Hm%Ms%S %p",
                                 encoding = 'GBK',
                                 tz = 'Asia/Shanghai'))
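
For reference, the pandas equivalent is a single argument; a minimal sketch assuming the same GBK-encoded file as above (reading the codes as strings is my choice here, not something stated in the post):

import pandas as pd

# encoding='gbk' plays the same role as fileEncoding / locale(encoding =) in R;
# dtype=str keeps region codes from being parsed as numbers and losing leading zeros
xzqh = pd.read_csv('./Documents/地区编码.csv', encoding='gbk', dtype=str)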

Grouped statistics with pandas

import numpy as np
import pandas as pd

df = pd.read_excel(r'C:\Users\ruiying\Desktop\评审结果.xlsx')

# take the first 8 characters of jgbm as the xiang-level code
df['xiang'] = df['jgbm'].str[0:8]
df['xiang'].count()
df['xiang'].value_counts()
df[df['pdjb_bm'] >= 3]['pdjb_bm'].count()
df['xiang'].drop_duplicates().count()
df.describe()

# per-xiang count of records with pdjb_bm >= 3
result = (df.groupby('xiang')
            .apply(lambda x: x[x['pdjb_bm'] >= 3]['pdjb_bm'].count())
            .to_frame()
            .reset_index())   # reset_index so xiang becomes a regular column
type(result)
result.head()
result.columns = ['xiang', 'num']
result.describe()
result.columns
result.rename(columns={'xiang': 'xian'}, inplace=True)
result.to_csv(r'C:\Users\ruiying\Desktop\结果.csv')
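
The groupby().apply() step above can also be written as a boolean sum, which is usually faster and easier to read; a sketch on made-up rows (the column names jgbm and pdjb_bm come from the post, the values do not):

import pandas as pd

# toy rows standing in for the Excel file
df = pd.DataFrame({'jgbm': ['43020100011', '43020100012', '43020200021', '43020200022'],
                   'pdjb_bm': [1, 3, 4, 2]})
df['xiang'] = df['jgbm'].str[0:8]

# per-xiang count of rows with pdjb_bm >= 3, as a boolean sum instead of groupby().apply()
result = (df.assign(ge3=df['pdjb_bm'] >= 3)
            .groupby('xiang', as_index=False)['ge3'].sum()
            .rename(columns={'ge3': 'num'}))
print(result)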

Connecting to Oracle with cx_Oracle

import cx_Oracle
import os
import re
import math
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.engine import url
from id_validator import validator
from matplotlib import pyplot as plt

# set the Oracle client character set to GBK so Chinese text decodes correctly
os.environ['NLS_LANG'] = 'AMERICAN_AMERICA.ZHS16GBK'

sqltxt = '''
select jz_sj, ym_mc, jzdd_dm
from inoc_jzjl
where jz_sj >= to_date('2020-01-01', 'yyyy-mm-dd')
  and jz_sj <= to_date('2020-08-30', 'yyyy-mm-dd')
  and sfmf = 1
'''

con = cx_Oracle.connect("****", "********", "192.168.30.48/JZDB1")
jzjl = pd.read_sql(sqltxt, con)
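
read_sql also accepts a SQLAlchemy engine, which may be why create_engine is imported above; a sketch with placeholder credentials (USER and PASSWORD stand in for the masked values, and treating JZDB1 as a service name is an assumption):

import pandas as pd
from sqlalchemy import create_engine

# USER/PASSWORD are placeholders for the masked credentials above;
# JZDB1 is assumed to be a service name
engine = create_engine('oracle+cx_oracle://USER:PASSWORD@192.168.30.48/?service_name=JZDB1')
sqltxt = "select jz_sj, ym_mc, jzdd_dm from inoc_jzjl where sfmf = 1"
jzjl = pd.read_sql(sqltxt, engine)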

Data preprocessing in Python

import pandas as pd
import numpy as np
from sklearn import datasets, preprocessing
from sklearn.model_selection import train_test_split

#%% load iris as a DataFrame
iris = datasets.load_iris()
x = pd.DataFrame(iris['data'], columns=iris['feature_names'])
y = pd.DataFrame(iris['target'].astype(str), columns=['target_names'])
df = pd.concat([x, y], axis=1)

#%% check & fill missing values
df.isna().sum()
df = df.fillna(0)                    # fill with 0
df = df.fillna(method='pad')         # fill with the previous row's value
df = df.fillna(method='bfill')       # fill with the next row's value
df['target_names'] = df['target_names'].fillna(df['target_names'].mode()[0])              # mode
df['sepal length (cm)'] = df['sepal length (cm)'].fillna(df['sepal length (cm)'].mean())  # mean

#%% data transformation: min-max scaling of the numeric features
min_max_scaler = preprocessing.MinMaxScaler()
df[iris['feature_names']] = min_max_scaler.fit_transform(df[iris['feature_names']])

#%% one-hot encode, then split off the dependent variable
one_hot = pd.get_dummies(df)
labels = np.array(one_hot['sepal length (cm)'])       # dependent variable
features = one_hot.drop('sepal length (cm)', axis=1)  # the source is truncated here; dropping the label column is assumed
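
train_test_split is imported above but the snippet breaks off before it is used; a typical next step, sketched on the same iris features (the split ratio and random_state are arbitrary choices, not taken from the post):

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
labels = np.array(df['sepal length (cm)'])        # dependent variable, as in the snippet above
features = df.drop('sepal length (cm)', axis=1)

# hold out 25% of the rows for testing
x_train, x_test, y_train, y_test = train_test_split(features, labels,
                                                    test_size=0.25, random_state=42)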

Adding a column conditionally in pandas

import pandas as pd
import numpy as np

df = pd.DataFrame({'amount': [100, 200, 300, 400, 500],
                   'list': ['', '商品1', '商品2', '', '商品3']})

# three ways to add a column that keeps amount where list is non-empty and is 0 otherwise
df['x1'] = df.apply(lambda x: x.amount if x.list != "" else 0, axis=1)  # row-wise apply
df['x2'] = np.where(df['list'] == '', 0, df['amount'])                  # vectorised np.where
df.loc[df['list'] != '', 'x3'] = df['amount']                           # boolean .loc assignment
df = df.fillna(0)                                                       # rows not matched by .loc are NaN
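
A fourth equivalent, for reference, uses Series.where, which keeps a value where the condition holds and substitutes the fallback elsewhere (the column name x4 is introduced here for illustration, not in the post):

import pandas as pd

df = pd.DataFrame({'amount': [100, 200, 300, 400, 500],
                   'list': ['', '商品1', '商品2', '', '商品3']})

# keep amount where list is non-empty, otherwise 0
df['x4'] = df['amount'].where(df['list'] != '', 0)
print(df)   # x4 is 0, 200, 300, 0, 500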