# This program demonstrates the sampling process needed for cross-validation.
# Modified by Andy Tsao based on procedures by Drago Chen, 2010.11.02.
#
# Given a data frame (the iris data is used as an example), proceed with
# some preliminary exploratory data analysis.
iris
summary(iris)
pairs(iris)

N <- dim(iris)[1]
UD <- iris

# Randomly choose a proportion (TrainingRate) of the rows as training data
# and keep the rest as the testing/generalization data.
TrainingRate <- 1 / 3; TR <- TrainingRate
index <- 1:N
TrainingNumber <- floor(N * TR); TN <- TrainingNumber
GeneratingNumber <- N - TN; GN <- GeneratingNumber
GeneratingIndex <- sample(index, GN); GI <- GeneratingIndex
GeneratingData <- UD[GI, ]; GD <- GeneratingData
TrainingData <- UD[-GI, ]; TD <- TrainingData

# K-fold cross-validation: shuffle the rows once so that consecutive
# chunks of UD form random folds.
kv <- sample(index, N, replace = FALSE)
UD <- UD[kv, ]

# Refer to adaboost.M1 {adabag} and predict.boosting {adabag}
# for training and testing using AdaBoost (M1).
# In addition, for k-fold cross-validation, refer to boosting.cv {adabag}.

# This procedure is written by Drago Chen for SVM experiments.
# SETE is the abbreviation of Support vector machine Error and Tree Error:
# it computes the generalization error of an SVM and of a classification
# tree fitted on the same training data.
#
# TrainingData and GeneratingData are data frames with the response
# variable in the last column.
# NOTE(review): the formula `Y ~ .` and the comparisons against `GD$Y`
# additionally assume that response column is literally named "Y" —
# confirm against the data actually passed in.
#
# Level: the discrete levels the (possibly real-valued) predictions are
#        snapped to via R2ML before scoring.
# Delta: tolerance; a prediction counts as an error when its snapped
#        value differs from the true response by more than Delta.
#
# Returns a 6-element vector: start time, SVM error rate, SVM end time,
# tree start time, tree error rate, tree end time. Because date() returns
# character strings, the whole result is coerced to character.
SETE <- function(TrainingData, GeneratingData, Level = c(1:3), Delta = 0.5) {
  # Packages for the tree and SVM models.
  library(rpart)
  library(e1071)

  # Input data: training data and generalization data.
  d <- Delta
  result <- {}
  TD <- TrainingData; TN <- dim(TD)[1]; C <- dim(TD)[2]
  GD <- GeneratingData; GN <- dim(GD)[1]

  # SVM prediction and running time.
  result[1] <- date()
  SM <- svm(TD[, -C], TD[, C], kernel = "linear")
  STp <- predict(SM, TD[, -C])
  SGp <- predict(SM, GD[, -C])
  SLGp <- R2ML(SGp, Level)
  SVME <- length(which(abs(SLGp - GD$Y) > d)) / GN
  result[2] <- SVME
  result[3] <- date()

  # Tree prediction and running time.
  result[4] <- date()
  TM <- rpart(Y ~ ., data = TD)
  TTp <- predict(TM, type = "vector")
  TGp <- predict(TM, GD[, -C], type = "vector")
  TLGp <- R2ML(TGp, Level)
  TreeE <- length(which(abs(TLGp - GD$Y) > d)) / GN
  result[5] <- TreeE
  result[6] <- date()

  # Return result.
  return(result)
}

# R2ML ("Real to Multi-Level"): snap each element of InputData to the
# nearest value in Level. Ties keep the earliest level in Level, because
# the update condition below is a strict inequality.
#
# InputData: numeric vector of raw predictions.
# Level:     numeric vector of permitted discrete levels.
# Returns a numeric vector the same length as InputData.
#
# Fixes applied in review: removed a stray '"' character that opened an
# unterminated string inside the loop (the file did not parse), and
# replaced 1:length(Level) with seq_along(Level).
R2ML <- function(InputData, Level = c(-1, 1)) {
  # Running best distance, initialized to the distance to the first level.
  Distance <- abs(InputData - Level[1])
  # Start with every element mapped to the first level.
  LevelData <- InputData - InputData + Level[1]
  for (i in seq_along(Level)) {
    NewDistance <- abs(InputData - Level[i])
    # Elements strictly closer to Level[i] than to their current level.
    WhichNegative <- which(NewDistance - Distance < 0)
    LevelData[WhichNegative] <- Level[i]
    Distance[WhichNegative] <- NewDistance[WhichNegative]
  }
  return(LevelData)
}