# GLM_Handout for Model Selection Y. L. Tseng crab<-read.table(file = "horseshoe_all.txt", header=FALSE, col.names = c("color", "spine","width","satellite", "weight")) y<-ifelse(crab$satellite>0, 1,0) ndark<-ifelse(crab$color<5, 1,0) crab5<-data.frame("c"=relevel(as.factor(crab$color),ref="5"),"s"=relevel(as.factor(crab$spine),ref="3"),"satellite"=crab$satellite,"y"=y,"w"=crab$width,"wt"=crab$weight/1000,"dark"=ndark) # Table 5.1, p. 139 m0<-glm(formula = y ~ c+s+wt+w, data = crab5, family = binomial(link = logit), na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) summary(m0) dG2<-m0$null.deviance-m0$deviance dfdG2<-m0$df.null-m0$df.residual pvalue<-1-pchisq(dG2,dfdG2) cbind(dG2, dfdG2, pvalue) # For testing if any of the predictors is sig. c.f. p.138 # This shows extremely strong evidence that at least one predictor has an eefect. # Although this overall test is highly sig., the Table 5.1 results, p. 139, are discouraging. # --> Small p-value for the overall test yet the lack of sig. for individual effects # is a warning sign of multicollinearity. # Width is sig. when consider only color and width as predictors, as shown in last handout; # controlling for weight, color and spine condition, little evidence remains for a width effect. cov(crab5$w,crab5$wt)/(var(crab5$w)*var(crab5$wt))^(0.5) # strong correlation between w and wt # ==> Further analysis uses w, c and s as predictors, then. # For stepwise variable selection algorrithms 5.1.3. # (1) Forward selection adds terms sequentially until further additions do not improve the fit. # (2) Backwar elimination begins with a complex model and sequentially removes terms # (highest-order terms with largest p-value........), the process stops when any further deletion # leads to a sig.'ly poorer fit. # Caution!!! stat.ly sig. =/= practically sig. Read p.140, please. # EX: Backward eliminationfor horseshoe crab data # To get Table 5.2, p. 141 m1<-glm(formula = y ~ c+s+w+c*s+c*w+s*w, data = crab5, family = binomial(link = logit), na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) m2<-glm(formula = y ~ c+s+w, data = crab5, family = binomial(link = logit), na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) m3a<-glm(formula = y ~ c+s, data = crab5, family = binomial(link = logit), na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) m3b<-glm(formula = y ~ s+w, data = crab5, family = binomial(link = logit), na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) m3c<-glm(formula = y ~ c+w, data = crab5, family = binomial(link = logit), na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) m4a<-glm(formula = y ~ c, data = crab5, family = binomial(link = logit), na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) m4b<-glm(formula = y ~ w, data = crab5, family = binomial(link = logit), na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) m5<-glm(formula = y ~ dark+w, data = crab5, family = binomial(link = logit), na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) m6<-glm(formula = y ~ 1, data = crab5, family = binomial(link = logit), na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) G2dfAic<-rbind(c(m1$deviance,m1$df.residual,m1$aic), + c(m2$deviance,m2$df.residual,m2$aic), + c(m3a$deviance,m3a$df.residual,m3a$aic), + c(m3b$deviance,m3b$df.residual,m3b$aic), + c(m3c$deviance,m3c$df.residual,m3c$aic), + c(m4a$deviance,m4a$df.residual,m4a$aic), + c(m4b$deviance,m4b$df.residual,m4b$aic), + c(m5$deviance,m5$df.residual,m5$aic), + c(m6$deviance,m6$df.residual,m6$aic)) G2dfAic anova(m2,m1) anova(m3a,m2) anova(m3b,m2) anova(m3c,m2) anova(m4a,m3c) anova(m4b,m3c) anova(m5,m3c) anova(m6,m5)