博文

目前显示的是 八月, 2021的博文

年龄分组

 cr %>%    filter(str_detect(shi,'^62')) %>%    group_by(shi) %>%    summarise(v1=sum(year>=18,na.rm = T),v2=sum(year<18,na.rm = T),v3=sum(year>=0),n=n()) cr %>%    mutate(age_group=case_when(     year<6 ~ '<6',     year>=6 & year<12 ~ '6-11',     year>=12 & year<15 ~ '12~14',     year>=15 & year<18 ~ '15-17',     year>=18 & year<59 ~ '18-59',     year>=60 & year<70 ~ '60-69',     year>=70 ~ '>=70')) %>%    filter(str_detect(shi,'^62')) %>%    group_by(age_group) %>%   summarise(n=n())   

批量读文件夹excel

 first_category_name = list.files("./Documents/礼县附件二/")             #list.files命令得到文件夹下所有文件夹的名称【修改】 dir = paste("./Documents/礼县附件二/",first_category_name,sep="")    #用paste命令构建路径变量dir, n = length(dir)    datalist = list() for(i in 1:n){   if(str_detect(dir[i],'xlsx')){     datalist[[i]]=readxl::read_xlsx(dir[i],skip = 3)     }   else{     datalist[[i]]=readxl::read_xls(dir[i],skip = 3)   } } nrows=sapply(datalist, nrow) sum(nrows) n = length(dir)    i=17 datalist = data.frame() if (str_detect(dir[i], 'xlsx')) {   datalist = readxl::read_xlsx(dir[i], skip = 2, sheet = 1) } else {   datalist = readxl::read_xls(dir[i], skip = 2, sheet = 1) }

svm

 data(iris) iris <- iris[,-c(5)] library(rminer)  model <- fit(Sepal.Length~.,data=iris,model="svm",              kpar=list(sigma=0.1),C=1) svm <- Importance(model,iris,measure="AAD") svm$imp par(mfrow=c(1,3)) boxplot(iris$Sepal.Width)$out boxplot(iris$Petal.Length)$out boxplot(iris$Petal.Width)$out #因為離群值不多,所以先不轉 n <- nrow(iris) set.seed(1117) subiris <- sample(seq_len(n), size = round(0.7 * n)) traindata <- iris[subiris,] testdata <- iris[ - subiris,] library(e1071) tune.model <- tune.svm(Sepal.Length~.,data=traindata,type="eps-regression",kernel="radial",                        range=list(cost = 2^c(-8,-4,-2,0), epsilon = seq(0,10,0.1))) tune.model$best.model #挑 lowest MSE  model <- svm(Sepal.Length~.,data=traindata,type="eps-regression",kernel="radial",cost=1,epsilon=0.1,gamma=0.3333333) future <- predict(model,testdata) future <- as.data.frame(future) final <- cbind(future,testdata) lib

xgb模型

 #GB library(gbm) set.seed(123) model <- gbm(formula = Sepal.Length ~ .,distribution = "gaussian",              data = iris,n.trees = 479,              interaction.depth = 1,              shrinkage = 0.1,n.minobsinnode = 15,              bag.fraction = 1,              train.fraction = 0.5,cv.folds = 10) #分類用 bernoulli 迴歸用gaussian(RMSE) #XGB data("iris") par(mfrow=c(1,4)) boxplot(iris$Sepal.Length)$out boxplot(iris$Sepal.Width)$out boxplot(iris$Petal.Length)$out boxplot(iris$Petal.Width)$out #因為離群值不多,所以先不轉 n <- nrow(iris) set.seed(1117) subiris <- sample(seq_len(n), size = round(0.7 * n)) traindata <- iris[subiris,] testdata <- iris[ - subiris,] features <- setdiff(names(iris), "Sepal.Length") library(vtreat)#單熱  library(dplyr) onehot <- vtreat::designTreatmentsZ(iris, features, verbose = FALSE) 單熱 <- onehot %>%   magrittr::use_series(scoreFrame) %>%           dplyr::filter(code %in% c("clean", "lev")) %>

随机森林

 library(naniar)  data("iris") iris <- iris[,-c(5)] any_na(iris) library(randomForest) model <- randomForest(Sepal.Length ~ Sepal.Width+Petal.Length+Petal.Width, data = iris,                          importane = T, proximity = T,                        do.trace = 100,na.action = na.roughfix) par(mfrow=c(1,1)) plot(model)  importance(model) library(rminer) rf <- Importance(model,iris,measure="AAD") rf$imp par(mfrow=c(1,3)) boxplot(iris$Sepal.Width)$out boxplot(iris$Petal.Length)$out boxplot(iris$Petal.Width)$out #因為離群值不多,所以先不轉 n <- nrow(iris) set.seed(1117) subiris <- sample(seq_len(n), size = round(0.7 * n)) traindata <- iris[subiris,] testdata <- iris[ - subiris,] features <- setdiff(x = names(traindata), y = "Sepal.Length") par(mfrow=c(1,1)) set.seed(123) tuneRF(x = traindata[features], y = traindata$Sepal.Length,        mtryStart = 1,ntreeTry = 450) set.seed(123) model <- randomForest(Sepal.Length ~ Sepal.Width+Petal.Length+Pe

决策树

library(naniar)  data("iris") iris <- subset(iris,Species!="setosa") any_na(iris) n <- nrow(iris) set.seed(1117) new <- iris[sample(n),] t_idx <- sample(seq_len(n), size = round(0.7 * n)) traindata <- iris[t_idx,] testdata <- iris[ - t_idx,] library(rpart)  library(rpart.plot) dtreeM <- rpart(Species ~ ., data = traindata,                  method = "class") #數值型改anova rpart.plot(dtreeM,digits=2,varlen=20)#圖 printcp(dtreeM) #最佳cp=min(xerror) 每次分割能改善模型 剪枝 <- prune(dtreeM,cp = dtreeM$cptable[which.min(dtreeM$cptable[,"xerror"]),"CP"]) future <- predict(剪枝 , testdata, type = "class") future <- as.data.frame(future) final <- cbind(future,testdata) confusion <- table(final$Species,final$future, dnn = c("实际", "预测")) confusion accuracy <- sum(diag(confusion)) / sum(confusion) accuracy  

多项式回归

 data(iris) iris <- iris[,-c(5)] #找自變數 library(rminer) library(tidyverse) model <- lm(Sepal.Length~Sepal.Width+Petal.Length+Petal.Width,iris) lm <- Importance(model,iris,measure="AAD") lm$imp #檢查離群值 par(mfrow=c(1,3)) boxplot(iris$Sepal.Width)$out boxplot(iris$Petal.Length)$out boxplot(iris$Petal.Width)$out par(mfrow=c(1,3)) scatter.smooth(x=iris$Sepal.Width, y=iris$Sepal.Length) scatter.smooth(x=iris$Petal.Length, y=iris$Sepal.Length) scatter.smooth(x=iris$Petal.Width, y=iris$Sepal.Length) #隨機抽樣 n <- nrow(iris) set.seed(1117) subiris <- sample(seq_len(n), size = round(0.7 * n)) traindata <- iris[subiris,] testdata <- iris[ - subiris,] #建模 残差的三个检验>0.05 model <- lm(Sepal.Length~poly(Sepal.Width,3)+Petal.Length+Petal.Width,traindata) #條件2~4 library(car) ncvTest(model)#>a 殘差變異數有同質性 shapiro.test(model$residuals) #>a 殘差常態 library(lmtest) dwtest(model)#>a 殘差獨立 vif(model)  #<10 ok 10~100可能過度配適 #預測 future <- predict(model,testdata) future <

lasso回归

 library(glmnet) library(tidyverse) data("iris") iris <- iris[,-c(5)] #自变量重要性 #ridge 自变量重要性变不为0 ,lasso 自变量重要性变为0 par(mfrow=c(1,3)) boxplot((iris$Sepal.Width))$out boxplot(iris$Petal.Length)$out boxplot(iris$Petal.Width)$out set.seed(1117) subiris <- sample(seq_len(nrow(iris)),size = round(0.7*nrow(iris))) traindata <- iris[subiris,] %>% as.matrix() testdata <- iris[-subiris,] %>% as.matrix() trainx <- traindata[,c(2:4)] trainy <- traindata[,c(1)] testx <- testdata[,c(2:4)] testy <- testdata[,c(1)] ridge <- cv.glmnet(x=trainx,y=trainy,alpha=0) #alpha=0为ridge,=1为lasso,k=10,交叉验证 #視覺化&選自變量 coef(ridge, s = "lambda.min") %>%   as.matrix() %>%   as.data.frame() %>%   add_rownames(var = "var") %>%   rename(coef=`1`) %>%    filter(var != "(Intercept)") %>% #剔除截距項   top_n(3, wt = coef) %>%   ggplot(aes(coef, reorder(var, coef))) +   geom_bar(stat = "identity", width=0.2,            color=&

合并列及拆分列,向下填充NA

  df03 <- billboard[,3] separate(data = df03,          col = date.entered,          into = c("year", "month", "day"),          sep = "\\-",          remove = F) -> df04 df04 unite(data = df04,       col = date,       year, month, day,       sep = "/") df05 <- read_xlsx('./Documents/批号价格.xlsx',sheet = 2) df05 %>%    fill(c(企业名称,批号,`价格/剂(元)`),.direction='down') %>%    unite(col = id,企业名称,批号,sep='-')

pip 安装报proxy错

 HKEY_CURRENT_USER\SOFTWARE\Microsoft\Windows\CurrentVersion\Internet Settings Find a file with the name 'ProxyServer' and delete it.