> # Handout for GLM Y. L. Tseng > # On using Deviance for Goodness-of-Test and Models' comparison > # With Plum root stock survival data > > length<-c("long", "long", "short", "short") > plantingTime<-c("Now", "Spring", "Now", "Spring") > Noofsurvive<-c(156, 84, 107, 31) > Freq<-c(240, 240, 240, 240) > plum.root<-data.frame("length"=length, "plantingTime"=plantingTime, "Noofsurvive"=Noofsurvive, "Freq"=Freq) > plum.root length plantingTime Noofsurvive Freq 1 long Now 156 240 2 long Spring 84 240 3 short Now 107 240 4 short Spring 31 240 > > ## options(contrasts=c("contr.sum", "contr.poly")) > # What difference in the design matrix if you use this option? > > > M1<-glm(formula = Noofsurvive/Freq ~ 1, data = plum.root, weight=Freq, family = binomial(link = logit), + na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) Deviance = 153.3278 Iterations - 1 Deviance = 151.0195 Iterations - 2 Deviance = 151.0193 Iterations - 3 > summary(M1) Call: glm(formula = Noofsurvive/Freq ~ 1, family = binomial(link = logit), data = plum.root, weights = Freq, na.action = na.exclude, control = list(epsilon = 1e-04, maxit = 50, trace = T)) Deviance Residuals: 1 2 3 4 8.006 -1.397 1.640 -9.071 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -0.43158 0.06605 -6.534 6.41e-11 *** --- Signif. codes: 0 .***・ 0.001 .**・ 0.01 .*・ 0.05 ..・ 0.1 . ・ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 151.02 on 3 degrees of freedom Residual deviance: 151.02 on 3 degrees of freedom AIC: 175.76 Number of Fisher Scoring iterations: 3 > > M2<-glm(formula = Noofsurvive/Freq ~ length, data = plum.root, weight=Freq, family = binomial(link = logit), + na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) Deviance = 107.6734 Iterations - 1 Deviance = 105.1845 Iterations - 2 Deviance = 105.1824 Iterations - 3 > summary(M2) Call: glm(formula = Noofsurvive/Freq ~ length, family = binomial(link = logit), data = plum.root, weights = Freq, na.action = na.exclude, control = list(epsilon = 1e-04, maxit = 50, trace = T)) Deviance Residuals: 1 2 3 4 4.684 -4.684 5.200 -5.854 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -1.135e-16 9.129e-02 -1.24e-15 1 lengthshort -9.076e-01 1.360e-01 -6.675 2.47e-11 *** --- Signif. codes: 0 .***・ 0.001 .**・ 0.01 .*・ 0.05 ..・ 0.1 . ・ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 151.02 on 3 degrees of freedom Residual deviance: 105.18 on 2 degrees of freedom AIC: 131.92 Number of Fisher Scoring iterations: 3 > > M3<-glm(formula = Noofsurvive/Freq ~ plantingTime, data = plum.root, weight=Freq, family = binomial(link = logit), + na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) Deviance = 54.46386 Iterations - 1 Deviance = 53.44108 Iterations - 2 Deviance = 53.44041 Iterations - 3 > summary(M3) Call: glm(formula = Noofsurvive/Freq ~ plantingTime, family = binomial(link = logit), data = plum.root, weights = Freq, na.action = na.exclude, control = list(epsilon = 1e-04, maxit = 50, trace = T)) Deviance Residuals: 1 2 3 4 3.211 3.837 -3.168 -4.286 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 0.19226 0.09171 2.096 0.0360 * plantingTimeSpring -1.34722 0.14082 -9.567 <2e-16 *** --- Signif. codes: 0 .***・ 0.001 .**・ 0.01 .*・ 0.05 ..・ 0.1 . ・ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 151.02 on 3 degrees of freedom Residual deviance: 53.44 on 2 degrees of freedom AIC: 80.183 Number of Fisher Scoring iterations: 3 > > M4<-glm(formula = Noofsurvive/Freq ~ length + plantingTime, data = plum.root, weight=Freq, family = binomial(link = logit), + na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) Deviance = 2.29652 Iterations - 1 Deviance = 2.293839 Iterations - 2 Deviance = 2.293839 Iterations - 3 > summary(M4) Call: glm(formula = Noofsurvive/Freq ~ length + plantingTime, family = binomial(link = logit), data = plum.root, weights = Freq, na.action = na.exclude, control = list(epsilon = 1e-04, maxit = 50, trace = T)) Deviance Residuals: 1 2 3 4 -0.6966 0.6966 0.6642 -0.9393 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 0.7138 0.1217 5.867 4.45e-09 *** lengthshort -1.0177 0.1455 -6.995 2.64e-12 *** plantingTimeSpring -1.4275 0.1465 -9.747 < 2e-16 *** --- Signif. codes: 0 .***・ 0.001 .**・ 0.01 .*・ 0.05 ..・ 0.1 . ・ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 151.0193 on 3 degrees of freedom Residual deviance: 2.2938 on 1 degrees of freedom AIC: 31.036 Number of Fisher Scoring iterations: 3 > G2<-c(M1$deviance,M2$deviance,M3$deviance, M4$deviance) > DF<-c(M1$df.residual,M2$df.residual,M3$df.residual,M4$df.residual) > cbind(G2,DF) G2 DF [1,] 151.019316 3 [2,] 105.182415 2 [3,] 53.440412 2 [4,] 2.293839 1 > 1-pchisq(G2,DF) # p-values for G-O-F tests [1] 0.000000e+00 0.000000e+00 2.486344e-12 1.298883e-01 > > # Test for length effect > DGlength<-c(M1$deviance-M2$deviance,M3$deviance-M4$deviance) > DFlength<-c(M1$df.residual-M2$df.residual,M3$df.residual-M4$df.residual) > cbind(DGlength, DFlength) DGlength DFlength [1,] 45.83690 1 [2,] 51.14657 1 > 1-pchisq(DGlength,DFlength) [1] 1.285194e-11 8.572032e-13 > > # Test for plantingTime effect > DGplantingTime<-c(M1$deviance-M3$deviance,M2$deviance-M4$deviance) > DFplantingTime<-c(M1$df.residual-M3$df.residual,M2$df.residual-M4$df.residual) > cbind(DGplantingTime, DFplantingTime) DGplantingTime DFplantingTime [1,] 97.5789 1 [2,] 102.8886 1 > 1-pchisq(DGplantingTime,DFplantingTime) [1] 0 0 > > # Test if both predictors are of no effect > c(M1$deviance-M4$deviance, M1$df.residual-M4$df.residual) [1] 148.7255 2.0000 > 1-pchisq(M1$deviance-M4$deviance, M1$df.residual-M4$df.residual) [1] 0 > > > #Check what the X matrix looks like > model.matrix(M1) (Intercept) 1 1 2 1 3 1 4 1 attr(,"assign") [1] 0 > model.matrix(M2) (Intercept) lengthshort 1 1 0 2 1 0 3 1 1 4 1 1 attr(,"assign") [1] 0 1 attr(,"contrasts") attr(,"contrasts")$length [1] "contr.treatment" > model.matrix(M3) (Intercept) plantingTimeSpring 1 1 0 2 1 1 3 1 0 4 1 1 attr(,"assign") [1] 0 1 attr(,"contrasts") attr(,"contrasts")$plantingTime [1] "contr.treatment" > model.matrix(M4) (Intercept) lengthshort plantingTimeSpring 1 1 0 0 2 1 0 1 3 1 1 0 4 1 1 1 attr(,"assign") [1] 0 1 2 attr(,"contrasts") attr(,"contrasts")$length [1] "contr.treatment" attr(,"contrasts")$plantingTime [1] "contr.treatment" > # Handout for GLM Y. L. Tseng > # Logistic model with two predictors:one conti., qualitative > # Example: Horseshoe crabs with color and width predictors > > # Part I: Color as factor with 4 levels....... p. 116 > > crab<-read.table(file = "horseshoe_all.txt", header=FALSE, col.names = c("color", "spine","width","satellite", "weight")) > crab4.6<-data.frame("color"=as.factor(crab$color),"satellite"=crab$satellite,"width"=crab$width) > y<-ifelse(crab4.6$satellite>0, 1,0) > crab4<-data.frame("color"=as.factor(crab$color),"satellite"=crab$satellite,"y"=y,"width"=crab$width) > > # Table 4.6, p.116 > m1<-glm(formula = y ~ color+width , data = crab4, family = binomial(link = logiv), + na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) Deviance = 187.9823 Iterations - 1 Deviance = 187.4589 Iterations - 2 Deviance = 187.4570 Iterations - 3 > summary(m1) # Not right! Baseline is dark color....... Call: glm(formula = y ~ color + width, family = binomial(link = logit), data = crab4, na.action = na.exclude, control = list(epsilon = 1e-04, maxit = 50, trace = T)) Deviance Residuals: Min 1Q Median 3Q Max -2.1124 -0.9848 0.5243 0.8513 2.1413 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -11.3847 2.8636 -3.976 7.02e-05 *** color3 0.0724 0.7390 0.098 0.922 color4 -0.2238 0.7762 -0.288 0.773 color5 -1.3299 0.8516 -1.562 0.118 width 0.4679 0.1051 4.450 8.58e-06 *** --- Signif. codes: 0 .***・ 0.001 .**・ 0.01 .*・ 0.05 ..・ 0.1 . ・ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 225.76 on 172 degrees of freedom Residual deviance: 187.46 on 168 degrees of freedom AIC: 197.46 Number of Fisher Scoring iterations: 3 > > # Let c=5 be the baseline reference > crab.dark<-data.frame(crab4, color.new = relevel(crab4$color, ref = "5")) > mod.fit<-glm(formula = y ~ color.new+width , data = crab.dark, family = binomial(link = logit), + na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) Deviance = 187.9823 Iterations - 1 Deviance = 187.4589 Iterations - 2 Deviance = 187.4570 Iterations - 3 > summary(mod.fit) # c.f. Table 4.6 Call: glm(formula = y ~ color.new + width, family = binomial(link = logit), data = crab.dark, na.action = na.exclude, control = list(epsilon = 1e-04, maxit = 50, trace = T)) Deviance Residuals: Min 1Q Median 3Q Max -2.1124 -0.9848 0.5243 0.8513 2.1413 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -12.7146 2.7513 -4.621 3.81e-06 *** color.new2 1.3299 0.8516 1.562 0.1184 color.new3 1.4023 0.5478 2.560 0.0105 * color.new4 1.1061 0.5914 1.870 0.0614 . width 0.4679 0.1052 4.450 8.58e-06 *** --- Signif. codes: 0 .***・ 0.001 .**・ 0.01 .*・ 0.05 ..・ 0.1 . ・ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 225.76 on 172 degrees of freedom Residual deviance: 187.46 on 168 degrees of freedom AIC: 197.46 Number of Fisher Scoring iterations: 3 > anova(mod.fit) Deviance = 212.5084 Iterations - 1 Deviance = 212.0612 Iterations - 2 Deviance = 212.0608 Iterations - 3 Analysis of Deviance Table Model: binomial, link: logit Response: y Terms added sequentially (first to last) Df Deviance Resid. Df Resid. Dev NULL 172 225.76 color.new 3 13.698 169 212.06 width 1 24.604 168 187.46 > mod.fit2<-glm(formula = y ~ width+color.new , data = crab.dark, family = binomial(link = logit), + na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) Deviance = 187.9823 Iterations - 1 Deviance = 187.4589 Iterations - 2 Deviance = 187.4570 Iterations - 3 > summary(mod.fit2) Call: glm(formula = y ~ width + color.new, family = binomial(link = logit), data = crab.dark, na.action = na.exclude, control = list(epsilon = 1e-04, maxit = 50, trace = T)) Deviance Residuals: Min 1Q Median 3Q Max -2.1124 -0.9848 0.5243 0.8513 2.1413 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -12.7146 2.7513 -4.621 3.81e-06 *** width 0.4679 0.1052 4.450 8.58e-06 *** color.new2 1.3299 0.8516 1.562 0.1184 color.new3 1.4023 0.5478 2.560 0.0105 * color.new4 1.1061 0.5914 1.870 0.0614 . --- Signif. codes: 0 .***・ 0.001 .**・ 0.01 .*・ 0.05 ..・ 0.1 . ・ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 225.76 on 172 degrees of freedom Residual deviance: 187.46 on 168 degrees of freedom AIC: 197.46 Number of Fisher Scoring iterations: 3 > anova(mod.fit2) Deviance = 194.7263 Iterations - 1 Deviance = 194.4533 Iterations - 2 Deviance = 194.4527 Iterations - 3 Analysis of Deviance Table Model: binomial, link: logit Response: y Terms added sequentially (first to last) Df Deviance Resid. Df Resid. Dev NULL 172 225.76 width 1 31.3059 171 194.45 color.new 3 6.9956 168 187.46 > > # Fig. 4.4, p. 117 > windows() > plot(crab.dark$width, crab.dark$y, ylab="Predicted Probability", + main=" Logistic reg. model using width and color predictors", + xlab="Width \n (red solid c=5, dashed blue c=2, dotted green c=3, dot-dashed black c=4)") > > xx <- seq(18,34, length=400) > lines(xx, exp(mod.fit$coef[1]+mod.fit$coef[5]*xx)/(1+exp(mod.fit$coef[1]+mod.fit$coef[5]*xx)), + col='red',lty=1,lwd=3) > lines(xx, exp(mod.fit$coef[1]+mod.fit$coef[2]+mod.fit$coef[5]*xx)/(1+exp(mod.fit$coef[1]+mod.fit$coef[2]+mod.fit$coef[5]*xx)), + col='blue',lty=2,lwd=3) > lines(xx, exp(mod.fit$coef[1]+mod.fit$coef[3]+mod.fit$coef[5]*xx)/(1+exp(mod.fit$coef[1]+mod.fit$coef[3]+mod.fit$coef[5]*xx)), + col='green',lty=3,lwd=3) > lines(xx, exp(mod.fit$coef[1]+mod.fit$coef[4]+mod.fit$coef[5]*xx)/(1+exp(mod.fit$coef[1]+mod.fit$coef[4]+mod.fit$coef[5]*xx)), + col='black',lty=4,lwd=3) > > # For 4.4.4, p. 119 - 121 > ndark<-ifelse(crab$color<5, 1,0) > crab.ndark<-data.frame(crab4, "c"=ndark) > mod.nointact<-glm(formula = y ~ c+width, data = crab.ndark, family = binomial(link = logit), + na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) Deviance = 188.4575 Iterations - 1 Deviance = 187.9595 Iterations - 2 Deviance = 187.9579 Iterations - 3 > summary(mod.nointact) Call: glm(formula = y ~ c + width, family = binomial(link = logit), data = crab.ndark, na.action = na.exclude, control = list(epsilon = 1e-04, maxit = 50, trace = T)) Deviance Residuals: Min 1Q Median 3Q Max -2.0820 -0.9932 0.5274 0.8606 2.1553 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -12.9791 2.7174 -4.776 1.79e-06 *** c 1.3005 0.5253 2.476 0.0133 * width 0.4782 0.1038 4.608 4.06e-06 *** --- Signif. codes: 0 .***・ 0.001 .**・ 0.01 .*・ 0.05 ..・ 0.1 . ・ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 225.76 on 172 degrees of freedom Residual deviance: 187.96 on 170 degrees of freedom AIC: 193.96 Number of Fisher Scoring iterations: 3 > anova(mod.nointact) Deviance = 215.3512 Iterations - 1 Deviance = 214.7933 Iterations - 2 Deviance = 214.7929 Iterations - 3 Analysis of Deviance Table Model: binomial, link: logit Response: y Terms added sequentially (first to last) Df Deviance Resid. Df Resid. Dev NULL 172 225.76 c 1 10.966 171 214.79 width 1 26.835 170 187.96 > mod.intact<-glm(formula = y ~ c+width+c*width , data = crab.ndark, family = binomial(link = logit), + na.action = na.exclude, control = list(epsilon = 0.0001, maxit = 50, trace = T)) Deviance = 187.5344 Iterations - 1 Deviance = 186.7909 Iterations - 2 Deviance = 186.7864 Iterations - 3 > summary(mod.intact) Call: glm(formula = y ~ c + width + c * width, family = binomial(link = logit), data = crab.ndark, na.action = na.exclude, control = list(epsilon = 1e-04, maxit = 50, trace = T)) Deviance Residuals: Min 1Q Median 3Q Max -2.1365 -0.9344 0.4996 0.8554 1.7753 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -5.8538 6.6899 -0.875 "0.382 c -6.9565 7.3070 -0.952 0.341 width 0.2004 0.2615 0.766 0.443 c:width 0.3217 0.2852 1.128 0.259 (Dispersion parameter for binomial family taken to be 1) Null deviance: 225.76 on 172 degrees of freedom Residual deviance: 186.79 on 169 degrees of freedom AIC: 194.79 Number of Fisher Scoring iterations: 3 > anova(mod.intact) Deviance = 215.3512 Iterations - 1 Deviance = 214.7933 Iterations - 2 Deviance = 214.7929 Iterations - 3 Deviance = 188.4575 Iterations - 1 Deviance = 187.9595 Iterations - 2 Deviance = 187.9579 Iterations - 3 Analysis of Deviance Table Model: binomial, link: logit Response: y Terms added sequentially (first to last) Df Deviance Resid. Df Resid. Dev NULL 172 225.76 c 1 10.9656 171 214.79 width 1 26.8351 170 187.96 c:width 1 1.1715 169 186.79 > > windows() > plot(crab.ndark$width, crab.ndark$y, ylab="Predicted Probability", + main="Logistic model with width and whether color is dark as predictors", + xlab="Width \n (red for c=0, solid lines = no interaction model)") > > xx <- seq(18,34, length=400) > lines(xx, exp(mod.nointact$coef[1]+mod.nointact$coef[3]*xx)/(1+exp(mod.nointact$coef[1]+mod.nointact$coef[3]*xx)), + col='red',lty=1,lwd=4) > lines(xx, exp(mod.nointact$coef[1]+mod.nointact$coef[2]+mod.nointact$coef[3]*xx)/(1+exp(mod.nointact$coef[1]+mod.nointact$coef[2]+mod.nointact$coef[3]*xx)), + col='blue',lty=1,lwd=2) > lines(xx, exp(mod.intact$coef[1]+mod.intact$coef[3]*xx)/(1+exp(mod.intact$coef[1]+mod.intact$coef[3]*xx)), + col='red',lty=3,lwd=4) > lines(xx, exp(mod.intact$coef[1]+mod.intact$coef[2]+(mod.intact$coef[3]+mod.intact$coef[4])*xx)/(1+exp(mod.intact$coef[1]+mod.intact$coef[2]+(mod.intact$coef[3]+mod.intact$coef[4])*xx)), + col='blue',lty=3,lwd=2) >