XGB model
# GBM
library(gbm)
set.seed(123)
model <- gbm(formula = Sepal.Length ~ .,
             distribution = "gaussian",
             data = iris,
             n.trees = 479,
             interaction.depth = 1,
             shrinkage = 0.1,
             n.minobsinnode = 15,
             bag.fraction = 1,
             train.fraction = 0.5,
             cv.folds = 10)
# use distribution = "bernoulli" for classification and "gaussian" for regression (squared error / RMSE)
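To check whether 479 trees is actually near the optimum, gbm can read the best iteration off the 10-fold CV error via gbm.perf(); a minimal sketch (best_iter is just an illustrative name):
# estimated optimal number of boosting iterations from the CV error curve
best_iter <- gbm.perf(model, method = "cv")
best_iter
# relative influence of each predictor at that iteration
summary(model, n.trees = best_iter)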
# XGB
data("iris")
par(mfrow = c(1, 4))  # show the four boxplots side by side; $out prints the outlying values
boxplot(iris$Sepal.Length)$out
boxplot(iris$Sepal.Width)$out
boxplot(iris$Petal.Length)$out
boxplot(iris$Petal.Width)$out
# few outliers, so skip any transformation for now
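The same judgment can be made numerically; a minimal sketch counting points beyond the 1.5 * IQR whiskers, the same rule the boxplots use:
# count of outliers per numeric column under the 1.5 * IQR rule
sapply(iris[, 1:4], function(x) {
  q <- quantile(x, c(0.25, 0.75))
  sum(x < q[1] - 1.5 * IQR(x) | x > q[2] + 1.5 * IQR(x))
})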
n <- nrow(iris)
set.seed(1117)
subiris <- sample(seq_len(n), size = round(0.7 * n))  # 70/30 train/test split
traindata <- iris[subiris, ]
testdata <- iris[-subiris, ]
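Since the split is random, a quick sanity check that all three species land in both partitions:
# class balance of Species across the split
table(traindata$Species)
table(testdata$Species)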
features <- setdiff(names(iris), "Sepal.Length")
library(vtreat)  # one-hot encoding
library(dplyr)
onehot <- vtreat::designTreatmentsZ(iris, features, verbose = FALSE)  # y-free treatment plan
new_vars <- onehot %>%
  magrittr::use_series(scoreFrame) %>%
  dplyr::filter(code %in% c("clean", "lev")) %>%  # keep cleaned numerics and level indicators
  magrittr::use_series(varName)
trainx <- vtreat::prepare(onehot, traindata, varRestriction = new_vars) %>% as.matrix()
trainy <- traindata$Sepal.Length
testx <- vtreat::prepare(onehot, testdata, varRestriction = new_vars) %>% as.matrix()
testy <- testdata$Sepal.Length
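It helps to inspect what prepare() produced, since the tuning code below drops columns 4:6 by position (assumed here to be the three Species indicator columns):
colnames(trainx)  # expected: three numeric predictors plus three Species level indicators
dim(trainx)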
library(xgboost)
set.seed(123)
model <- xgboost(data = trainx, label = trainy,
                 nrounds = 531, eta = 0.1,
                 min_child_weight = 3, max_depth = 1,
                 subsample = 1, colsample_bytree = 0.9,
                 objective = "reg:squarederror", verbose = 0)
importance <- xgb.importance(model = model)
par(mfrow=c(1,1))
xgb.plot.importance(importance, top_n = 3, measure = "Gain")
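For a reference point before tuning, a baseline test error for this untuned model; a minimal sketch (pred0 is just an illustrative name):
# baseline test RMSE of the untuned model
pred0 <- predict(model, testx)
sqrt(mean((pred0 - testy)^2))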
hyper_grid <- expand.grid(eta = c(0.05, 0.1),
                          max_depth = c(5, 7),
                          min_child_weight = c(3, 5, 7),
                          subsample = c(0.65, 0.8),
                          colsample_bytree = c(0.8, 0.9, 1),
                          nrounds = 0,  # filled in by the search below
                          RMSE = 0)     # filled in by the search below
nrow(hyper_grid)  # 72 combinations
for (i in 1:nrow(hyper_grid)) {
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i],
    min_child_weight = hyper_grid$min_child_weight[i],
    subsample = hyper_grid$subsample[i],
    colsample_bytree = hyper_grid$colsample_bytree[i]
  )
  set.seed(123)
  xgb.tune <- xgb.cv(
    params = params,
    data = trainx[, -c(4:6)],  # drop the Species indicator columns
    label = trainy,
    nrounds = 500,
    nfold = 5,
    objective = "reg:squarederror",  # use "binary:logistic" for classification
    verbose = 0,
    early_stopping_rounds = 10
  )
  # record this combination's best iteration and its CV RMSE
  hyper_grid$nrounds[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
  hyper_grid$RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
}

# rank the parameter combinations by CV RMSE
hyper_grid %>% dplyr::arrange(RMSE) %>% head(10)

# refit on the training set with the best hyperparameters found
set.seed(123)
model <- xgboost(data = trainx[, -c(4:6)], label = trainy,
                 nrounds = 105, eta = 0.05,
                 min_child_weight = 3, max_depth = 7,
                 subsample = 0.8, colsample_bytree = 1,
                 objective = "reg:squarederror", verbose = 0)

# predict on the test set and compute the MAPE
future <- predict(model, testx[, -c(4:6)])
future <- as.data.frame(future)
final <- cbind(future, testy, testx[, -c(4:6)])
final <- mutate(final, mape = abs(future - testy) / testy)
mean(final$mape)
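MAPE is scale-free; as a complement, the test RMSE can be computed from the same final frame built above; a minimal sketch:
# test-set RMSE of the tuned model
sqrt(mean((final$future - final$testy)^2))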