xgb模型

 #GB

library(gbm)

set.seed(123)

model <- gbm(formula = Sepal.Length ~ .,distribution = "gaussian",

             data = iris,n.trees = 479,

             interaction.depth = 1,

             shrinkage = 0.1,n.minobsinnode = 15,

             bag.fraction = 1,

             train.fraction = 0.5,cv.folds = 10)

#分類用 bernoulli 迴歸用gaussian(RMSE)


#XGB

data("iris")

par(mfrow=c(1,4))

boxplot(iris$Sepal.Length)$out

boxplot(iris$Sepal.Width)$out

boxplot(iris$Petal.Length)$out

boxplot(iris$Petal.Width)$out

#因為離群值不多,所以先不轉

n <- nrow(iris)

set.seed(1117)

subiris <- sample(seq_len(n), size = round(0.7 * n))

traindata <- iris[subiris,]

testdata <- iris[ - subiris,]


features <- setdiff(names(iris), "Sepal.Length")

library(vtreat)#單熱 

library(dplyr)

onehot <- vtreat::designTreatmentsZ(iris, features, verbose = FALSE)

單熱 <- onehot %>%

  magrittr::use_series(scoreFrame) %>%        

  dplyr::filter(code %in% c("clean", "lev")) %>% 

  magrittr::use_series(varName)     

trainx <- vtreat::prepare(onehot, traindata, varRestriction = 單熱) %>% as.matrix()

trainy <- traindata$Sepal.Length

testx <- vtreat::prepare(onehot, testdata, varRestriction = 單熱) %>% as.matrix()

testy <- testdata$Sepal.Length

library(xgboost)

set.seed(123)

model <-  xgboost(data = trainx,

                  label = trainy,

                  nrounds = 531,eta=0.1,

                  min_child_weight=3,max_depth=1,

                  subsample=1,colsample_bytree=0.9,

                  objective = "reg:squarederror",verbose = 0)


importance <- xgb.importance(model = model)

par(mfrow=c(1,1))

xgb.plot.importance(importance, top_n = 3, measure = "Gain")


hyper_grid <- expand.grid(eta = c( .05, .1),

                          max_depth = c( 5, 7),

                          min_child_weight = c( 3, 5, 7),

                          subsample = c(.65, .8), 

                          colsample_bytree = c(.8, .9, 1),

                          nrounds = 0, 

                          RMSE = 0) 

nrow(hyper_grid)


for(i in 1:nrow(hyper_grid)) {

  params <- list(

    eta = hyper_grid$eta[i],

    max_depth = hyper_grid$max_depth[i],

    min_child_weight = hyper_grid$min_child_weight[i],

    subsample = hyper_grid$subsample[i],

    colsample_bytree = hyper_grid$colsample_bytree[i]

  )

  set.seed(123)

  xgb.tune <- xgb.cv(

    params = params,

    data = trainx[,-c(4:6)],

    label = trainy,

    nrounds = 500, 

    nfold = 5,

    objective = "reg:squarederror",  # binary:logistic分類

    verbose = 0,               

    early_stopping_rounds = 10 

  )

  


hyper_grid %>% dplyr::arrange(RMSE) %>% head(10)


set.seed(123)

model <-  xgboost(data = trainx[,-c(4:6)],

                  label = trainy,

                  nrounds = 105,eta=0.05,

                  min_child_weight=3,max_depth=7,

                  subsample=0.8,colsample_bytree=1,

                  objective = "reg:squarederror",verbose = 0) 


future <- predict(model,testx[,-c(4:6)])

future <- as.data.frame(future)

final <- cbind(future,testy,testx[,-c(4:6)])

final <- mutate(final,mape=abs(future-testy)/testy)

mean(final$mape)


hyper_grid$nrounds[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)

hyper_grid$RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)

}



评论

此博客中的热门博文

V2ray websocket(ws)+tls+nginx分流

Rstudio 使用代理