# This program demonstrates the sampling process needed for cross-validation.
# Modified by Andy Tsao based on procedures by Drago Chen, 2010.11.02.
#
# Given a data frame (the iris data is used as an example), proceed with
# some preliminary exploratory data analysis.
iris
summary(iris)
pairs(iris)

N <- dim(iris)[1]
UD <- iris

# Randomly choose a proportion (TrainingRate) of the rows as training data
# and keep the rest as the testing/generalization data.
TrainingRate <- 1 / 3; TR <- TrainingRate
index <- 1:N
TrainingNumber <- floor(N * TR); TN <- TrainingNumber
GeneratingNumber <- N - TN; GN <- GeneratingNumber
GeneratingIndex <- sample(index, GN); GI <- GeneratingIndex
GeneratingData <- UD[GI, ]; GD <- GeneratingData
TrainingData <- UD[-GI, ]; TD <- TrainingData

# K-fold cross-validation: shuffle the rows once so that consecutive
# chunks of UD form random folds.
kv <- sample(index, N, replace = FALSE)
UD <- UD[kv, ]

# Refer to adaboost.M1 {adabag} and predict.boosting {adabag}
# for training and testing using AdaBoost (M1).
# In addition, for k-fold cross-validation, refer to boosting.cv {adabag}.

# This procedure is written by Drago Chen for SVM experiments.
# SETE is the abbreviation of Support vector machine Error and Tree Error:
# it computes the generalization error of an SVM and of a classification
# tree fitted on the same training data.
#
# TrainingData and GeneratingData are data frames with the response
# variable in the last column.
# NOTE(review): the formula `Y ~ .` and the comparisons against `GD$Y`
# additionally assume that response column is literally named "Y" —
# confirm against the data actually passed in.
#
# Level: the discrete levels the (possibly real-valued) predictions are
#        snapped to via R2ML before scoring.
# Delta: tolerance; a prediction counts as an error when its snapped
#        value differs from the true response by more than Delta.
#
# Returns a 6-element vector: start time, SVM error rate, SVM end time,
# tree start time, tree error rate, tree end time. Because date() returns
# character strings, the whole result is coerced to character.
SETE <- function(TrainingData, GeneratingData, Level = c(1:3), Delta = 0.5) {
  # Packages for the tree and SVM models.
  library(rpart)
  library(e1071)

  # Input data: training data and generalization data.
  d <- Delta
  result <- {}
  TD <- TrainingData; TN <- dim(TD)[1]; C <- dim(TD)[2]
  GD <- GeneratingData; GN <- dim(GD)[1]

  # SVM prediction and running time.
  result[1] <- date()
  SM <- svm(TD[, -C], TD[, C], kernel = "linear")
  STp <- predict(SM, TD[, -C])
  SGp <- predict(SM, GD[, -C])
  SLGp <- R2ML(SGp, Level)
  SVME <- length(which(abs(SLGp - GD$Y) > d)) / GN
  result[2] <- SVME
  result[3] <- date()

  # Tree prediction and running time.
  result[4] <- date()
  TM <- rpart(Y ~ ., data = TD)
  TTp <- predict(TM, type = "vector")
  TGp <- predict(TM, GD[, -C], type = "vector")
  TLGp <- R2ML(TGp, Level)
  TreeE <- length(which(abs(TLGp - GD$Y) > d)) / GN
  result[5] <- TreeE
  result[6] <- date()

  # Return result.
  return(result)
}

# R2ML ("Real to Multi-Level"): snap each element of InputData to the
# nearest value in Level. Ties keep the earliest level in Level, because
# the update condition below is a strict inequality.
#
# InputData: numeric vector of raw predictions.
# Level:     numeric vector of permitted discrete levels.
# Returns a numeric vector the same length as InputData.
#
# Fixes applied in review: removed a stray '"' character that opened an
# unterminated string inside the loop (the file did not parse), and
# replaced 1:length(Level) with seq_along(Level).
R2ML <- function(InputData, Level = c(-1, 1)) {
  # Running best distance, initialized to the distance to the first level.
  Distance <- abs(InputData - Level[1])
  # Start with every element mapped to the first level.
  LevelData <- InputData - InputData + Level[1]
  for (i in seq_along(Level)) {
    NewDistance <- abs(InputData - Level[i])
    # Elements strictly closer to Level[i] than to their current level.
    WhichNegative <- which(NewDistance - Distance < 0)
    LevelData[WhichNegative] <- Level[i]
    Distance[WhichNegative] <- NewDistance[WhichNegative]
  }
  return(LevelData)
}