# Examples of Section 3
library("adabag")
library("rpart")    # attached by adabag, loaded explicitly for rpart() and rpart.control()
library("mlbench")  # provides the Vehicle data set used in Section 4.2

data("iris")
train <- c(sample(1:50, 25), sample(51:100, 25), sample(101:150, 25))

iris.adaboost <- boosting(Species ~ ., data = iris[train, ], mfinal = 10,
  control = rpart.control(maxdepth = 1))
iris.adaboost

barplot(iris.adaboost$imp[order(iris.adaboost$imp, decreasing = TRUE)],
  ylim = c(0, 100), main = "Variables relative importance", col = "lightblue")

# Training error
table(iris.adaboost$class, iris$Species[train],
  dnn = c("Predicted Class", "Observed Class"))
1 - sum(iris.adaboost$class == iris$Species[train]) / length(iris$Species[train])

# Test error
iris.predboosting <- predict.boosting(iris.adaboost, newdata = iris[-train, ])
iris.predboosting

# 10-fold cross-validation
iris.boostcv <- boosting.cv(Species ~ ., v = 10, data = iris, mfinal = 10,
  control = rpart.control(maxdepth = 1))
iris.boostcv

iris.bagging <- bagging(Species ~ ., data = iris[train, ], mfinal = 10,
  control = rpart.control(maxdepth = 1))
iris.bagging

barplot(iris.bagging$imp[order(iris.bagging$imp, decreasing = TRUE)],
  ylim = c(0, 100), main = "Variables relative importance", col = "lightblue")

# Training error
table(iris.bagging$class, iris$Species[train],
  dnn = c("Predicted Class", "Observed Class"))
1 - sum(iris.bagging$class == iris$Species[train]) / length(iris$Species[train])

# Test error
iris.predbagging <- predict.bagging(iris.bagging, newdata = iris[-train, ])
iris.predbagging

# 10-fold cross-validation
iris.baggingcv <- bagging.cv(Species ~ ., v = 10, data = iris, mfinal = 10,
  control = rpart.control(maxdepth = 1))
iris.baggingcv

# 3.3. The margins and errorevol functions
iris.bagging.margins <- margins(iris.bagging, iris[train, ])           # training set
iris.bagging.predmargins <- margins(iris.predbagging, iris[-train, ])  # test set
iris.bagging.margins      # training set
iris.bagging.predmargins  # test set

margins.test <- iris.bagging.predmargins[[1]]
margins.train <- iris.bagging.margins[[1]]
plot(sort(margins.train), (1:length(margins.train)) / length(margins.train),
  type = "l", xlim = c(-1, 1), main = "Margin cumulative distribution graph",
  xlab = "m", ylab = "% observations", col = "blue3", lwd = 2)
abline(v = 0, col = "red", lty = 2, lwd = 2)
lines(sort(margins.test), (1:length(margins.test)) / length(margins.test),
  type = "l", cex = 0.5, col = "green", lwd = 2)
legend("topleft", c("test", "train"), col = c("green", "blue3"), lty = 1, lwd = 2)

evol.test <- errorevol(iris.adaboost, iris[-train, ])
evol.train <- errorevol(iris.adaboost, iris[train, ])
plot(evol.test$error, type = "l", ylim = c(0, 1),
  main = "Boosting error versus number of trees", xlab = "Iterations",
  ylab = "Error", col = "red", lwd = 2)
lines(evol.train$error, cex = 0.5, col = "blue", lty = 2, lwd = 2)
legend("topleft", c("test", "train"), col = c("red", "blue"), lty = 1:2, lwd = 2)

# 4. Examples
# 4.1. A dichotomous example
n <- 12000
p <- 10
set.seed(100)
x <- matrix(rnorm(n * p), ncol = p)
y <- as.factor(c(-1, 1)[as.numeric(apply(x^2, 1, sum) > 9.34) + 1])
data <- data.frame(y, x)
train <- sample(1:n, 2000, FALSE)
formula <- y ~ .
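# The threshold 9.34 is approximately the median of a chi-squared distribution
# with p = 10 degrees of freedom (the distribution of the sum of squares of
# ten standard normals), so the two classes are roughly balanced. A quick
# sanity check (not part of the original script):
qchisq(0.5, df = p)  # approximately 9.34
table(y) / n         # each class proportion should be close to 0.5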
vardep <- data[, as.character(formula[[2]])]
cntrl <- rpart.control(maxdepth = 1, minsplit = 0, cp = -1)
mfinal <- 400

data.boosting <- boosting(formula = formula, data = data[train, ],
  mfinal = mfinal, coeflearn = "Breiman", boos = FALSE, control = cntrl)
data.boostingBreimanFalse <- data.boosting
# Training error
table(data.boosting$class, vardep[train],
  dnn = c("Predicted Class", "Observed Class"))
1 - sum(data.boosting$class == vardep[train]) / length(vardep[train])
# Test error
data.predboost <- predict.boosting(data.boosting, newdata = data[-train, ])
data.predboost$confusion
data.predboost$error
data.boosting$imp

data.boosting <- boosting(formula = formula, data = data[train, ],
  mfinal = mfinal, coeflearn = "Breiman", boos = TRUE, control = cntrl)
data.boostingBreimanTrue <- data.boosting
# Training error
table(data.boosting$class, vardep[train],
  dnn = c("Predicted Class", "Observed Class"))
1 - sum(data.boosting$class == vardep[train]) / length(vardep[train])
# Test error
data.predboost <- predict.boosting(data.boosting, newdata = data[-train, ])
data.predboost$confusion
data.predboost$error
data.boosting$imp

data.boosting <- boosting(formula = formula, data = data[train, ],
  mfinal = mfinal, coeflearn = "Freund", boos = FALSE, control = cntrl)
data.boostingFreundFalse <- data.boosting
# Training error
table(data.boosting$class, vardep[train],
  dnn = c("Predicted Class", "Observed Class"))
1 - sum(data.boosting$class == vardep[train]) / length(vardep[train])
# Test error
data.predboost <- predict.boosting(data.boosting, newdata = data[-train, ])
data.predboost$confusion
data.predboost$error
data.boosting$imp

data.boosting <- boosting(formula = formula, data = data[train, ],
  mfinal = mfinal, coeflearn = "Freund", boos = TRUE, control = cntrl)
data.boostingFreundTrue <- data.boosting
# Training error
table(data.boosting$class, vardep[train],
  dnn = c("Predicted Class", "Observed Class"))
1 - sum(data.boosting$class == vardep[train]) / length(vardep[train])
# Test error
data.predboost <- predict.boosting(data.boosting, newdata = data[-train, ])
data.predboost$confusion
data.predboost$error
data.boosting$imp

# The error evolution plots
data.boosting <- data.boostingBreimanFalse
errorevol.train <- errorevol(data.boosting, data[train, ])
errorevol.test <- errorevol(data.boosting, data[-train, ])
plot(errorevol.test[[1]], type = "l", ylim = c(0, 0.5),
  main = "Adaboost error versus number of trees", xlab = "Iterations",
  ylab = "Error", col = "red", lwd = 2)
lines(errorevol.train[[1]], cex = 0.5, col = "blue", lty = 1, lwd = 2)
legend("topright", c("test", "train"), col = c("red", "blue"), lty = 1, lwd = 2)
abline(h = min(errorevol.test[[1]]), col = "red", lty = 2, lwd = 2)
abline(h = min(errorevol.train[[1]]), col = "blue", lty = 2, lwd = 2)

data.boosting <- data.boostingBreimanTrue
errorevol.train <- errorevol(data.boosting, data[train, ])
errorevol.test <- errorevol(data.boosting, data[-train, ])
plot(errorevol.test[[1]], type = "l", ylim = c(0, 0.5),
  main = "Adaboost error versus number of trees", xlab = "Iterations",
  ylab = "Error", col = "red", lwd = 2)
lines(errorevol.train[[1]], cex = 0.5, col = "blue", lty = 1, lwd = 2)
legend("topright", c("test", "train"), col = c("red", "blue"), lty = 1, lwd = 2)
abline(h = min(errorevol.test[[1]]), col = "red", lty = 2, lwd = 2)
abline(h = min(errorevol.train[[1]]), col = "blue", lty = 2, lwd = 2)

data.boosting <- data.boostingFreundFalse
errorevol.train <- errorevol(data.boosting, data[train, ])
errorevol.test <- errorevol(data.boosting, data[-train, ])
plot(errorevol.test[[1]], type = "l", ylim = c(0, 0.5),
  main = "Adaboost error versus number of trees", xlab = "Iterations",
  ylab = "Error", col = "red", lwd = 2)
lines(errorevol.train[[1]], cex = 0.5, col = "blue", lty = 1, lwd = 2)
legend("topright", c("test", "train"), col = c("red", "blue"), lty = 1, lwd = 2)
abline(h = min(errorevol.test[[1]]), col = "red", lty = 2, lwd = 2)
abline(h = min(errorevol.train[[1]]), col = "blue", lty = 2, lwd = 2)

data.boosting <- data.boostingFreundTrue
errorevol.train <- errorevol(data.boosting, data[train, ])
errorevol.test <- errorevol(data.boosting, data[-train, ])
plot(errorevol.test[[1]], type = "l", ylim = c(0, 0.5),
  main = "Adaboost error versus number of trees", xlab = "Iterations",
  ylab = "Error", col = "red", lwd = 2)
lines(errorevol.train[[1]], cex = 0.5, col = "blue", lty = 1, lwd = 2)
legend("topright", c("test", "train"), col = c("red", "blue"), lty = 1, lwd = 2)
abline(h = min(errorevol.test[[1]]), col = "red", lty = 2, lwd = 2)
abline(h = min(errorevol.train[[1]]), col = "blue", lty = 2, lwd = 2)
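# The fixed value newmfinal = 253 in the pruning call below presumably
# corresponds to the iteration with the lowest test error. A minimal sketch
# (not part of the original script) to locate it, assuming errorevol.test
# comes from the last model fitted above:
which.min(errorevol.test[[1]])  # iteration with the minimum test error
min(errorevol.test[[1]])        # the corresponding error rate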
# Pruning the ensemble: predict again using only the first 253 trees
data.prune <- predict.boosting(data.boosting, newdata = data[-train, ],
  newmfinal = 253)
data.prune$confusion
data.prune$error

# 4.2. A multiclass example
data("Vehicle")

# Repeat 50 iterations to calculate the average error
rows <- 50
col <- 4
errortrain <- array(0, c(rows, col))
errortest <- array(0, c(rows, col))
l <- length(Vehicle[, 1])
maxdepth <- 5
mfinal <- 50
matrix.sub <- array(0, c(2 * l / 3, rows))
cntrl <- rpart.control(maxdepth = 5, cp = -1, minsplit = 0)

for (m in 1:rows) {
  sub <- sample(1:l, 2 * l / 3)
  matrix.sub[, m] <- sub

  # rpart
  Vehicle.rpart <- rpart(Class ~ ., data = Vehicle[sub, ], maxdepth = maxdepth)
  # Training error
  Vehicle.rpart.pred <- predict(Vehicle.rpart, newdata = Vehicle[sub, ],
    type = "class")
  tb <- table(Vehicle.rpart.pred, Vehicle$Class[sub])
  errortrain[m, 1] <- 1 - (sum(diag(tb)) / sum(tb))
  # Test error
  Vehicle.rpart.pred <- predict(Vehicle.rpart, newdata = Vehicle[-sub, ],
    type = "class")
  tb <- table(Vehicle.rpart.pred, Vehicle$Class[-sub])
  errortest[m, 1] <- 1 - (sum(diag(tb)) / sum(tb))

  # Bagging
  Vehicle.bagging <- bagging(Class ~ ., data = Vehicle[sub, ],
    mfinal = mfinal, control = cntrl)
  # Training error
  errortrain[m, 2] <- 1 - sum(Vehicle.bagging$class == Vehicle$Class[sub]) /
    length(Vehicle$Class[sub])
  # Test error
  Vehicle.predbagging <- predict.bagging(Vehicle.bagging,
    newdata = Vehicle[-sub, ])
  errortest[m, 2] <- Vehicle.predbagging$error

  # AdaBoost.M1
  Vehicle.adaboost <- boosting(Class ~ ., data = Vehicle[sub, ],
    mfinal = mfinal, coeflearn = "Freund", boos = TRUE, control = cntrl)
  # Training error
  errortrain[m, 3] <- 1 - sum(Vehicle.adaboost$class == Vehicle$Class[sub]) /
    length(Vehicle$Class[sub])
  # Test error
  Vehicle.adaboost.pred <- predict.boosting(Vehicle.adaboost,
    newdata = Vehicle[-sub, ])
  errortest[m, 3] <- Vehicle.adaboost.pred$error

  # SAMME
  Vehicle.SAMME <- boosting(Class ~ ., data = Vehicle[sub, ],
    mfinal = mfinal, coeflearn = "Zhu", boos = TRUE, control = cntrl)
  # Training error
  # table(Vehicle.SAMME$class, Vehicle$Class[sub],
  #   dnn = c("Predicted Class", "Observed Class"))
  errortrain[m, 4] <- 1 - sum(Vehicle.SAMME$class == Vehicle$Class[sub]) /
    length(Vehicle$Class[sub])
  # Test error
  Vehicle.SAMME.pred <- predict.boosting(Vehicle.SAMME,
    newdata = Vehicle[-sub, ])
  # Vehicle.SAMME.pred$confusion
  errortest[m, 4] <- Vehicle.SAMME.pred$error

  # Keep the best model of each method across the 50 iterations
  if (m == 1) {
    Vehicle.rpart.best <- Vehicle.rpart
    Vehicle.bagging.best <- Vehicle.bagging
    Vehicle.adaboost.best <- Vehicle.adaboost
    Vehicle.SAMME.best <- Vehicle.SAMME
  } else {
    if (errortest[m, 1]