LOG IN R CONCERNING OVER AND UNDER DISPERSION IN GLM'S ======================================================= 4/18/08 > pbcdat = read.table("pbcdata.asc", header=T) ### 216 X 10 > names(pbcdat) [1] "OBS" "IDNUM" "DTH" "EVTTIME" "TREATGP" "LOGBILI" "AGEVAR" [8] "CIRRH" "CCHOL" "ALBUMIN" > ind = pbcdat$DTH==1 | pbcdat$EVTTIME >= 41 Surv41 = as.numeric(pbcdat$EVTTIME < 41) pbcdat = cbind.data.frame(pbcdat[,3:10], Surv41)[ind,] ## 172 x 9 > tmpglm = glm(cbind(Surv41, 1-Surv41) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family = binomial) > tmpprbt = glm(cbind(Surv41, 1-Surv41) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family = binomial(link="probit")) > cbind(table(trunc(5*tmpglm$fit),pbcdat$Surv41), table(trunc(5*tmpprbt$fit),pbcdat$Surv41)) 0 1 0 1 0 66 5 66 5 1 22 13 22 12 2 11 10 11 11 3 12 16 12 16 ## First two columns logit, 2nd two probit 4 0 17 0 17 ## Almost no difference !? > c(logit.dev=tmpglm$dev, probit.dev=tmpprbt$dev) logit.dev probit.dev 155.9506 154.9561 ### small difference in deviance is not enough to determine ## model choice! ============================================================= Now consider OVER/UNDER-DISPERSION: this really makes sense only when there is a natural way to estimate scale parameter. ## First simulate data from pbcdat design matrix, but in ## batches n_i = 10 with and without intracluster correlation. ## Intracluster correlation comes from random effect term ## common to all members of same cluster > sum(abs(tmpglm$fitted - plogis(tmpglm$lin))) [1] 9.957313e-16 > summary(tmpglm$lin) Min. 1st Qu. Median Mean 3rd Qu. Max. -4.4100 -2.3430 -0.9497 -0.9290 0.4246 3.7280 reff = rnorm(172)*2 yrsp1 = rbinom(172, rep(10,172),plogis(tmpglm$lin + reff)) ## If these yrsp1 were sums of indep indicators, then we ## would expect average within-cluster variance > mean(dlogis(tmpglm$lin + reff)) ## = 0.1474842 ## But what we actually see for the average cluster ## sample variances is > mean((10/9)*.1*yrsp1*(1-0.1*yrsp1)) ### = 0.1052300 ## This reflects UNDERdispersion because the members of a cluster are more similar to each other than to the general population. glmfitA = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=binomial) glmfitB = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=quasibinomial) ### Exactly the same coefficient fits! But: > summary(glmfitB)$disp [1] 4.565743 > summary(glmfitA)$disp [1] 1 ### NOTE that what we call intraclass correlation results in ## INTERclass overdispersion ! > sum(yrsp1*(1-yrsp1/10)) [1] 167.1 > sum(glmfitA$fit*(1-glmfitA$fit/10)) [1] 68.40343 > var(yrsp1) [1] 14.75462 > var(10*glmfitA$fit) [1] 6.085447 ## How was dispersion estimated ? According to formula (7.8) ## of Venables-Ripley: > sum(10*(yrsp1/10-glmfitB$fit)^2/(glmfitB$fit*(1-glmfitB$fit)))/(172-4) [1] 4.565743 #### PERFECT !! > prbtA = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=binomial(link="probit")) > prbtA$coef (Intercept) LOGBILI TREATGP AGEVAR -3.127706893 1.510092331 0.419532408 0.005579737 ### same as > prbtB= glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=quasibinomial(var = "mu(1-mu)", link="probit")) ### and again, we find > summary(prbtB)$disp #### = 4.548416 glmfitC = glm(cbind(yrsp1, 10-yrsp1) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=quasi(link="logit", var="mu(1-mu)")) ### does not let us transform by logit in cases of 0 successes or failures! ### Now compare the same sort of fits when we re-simulate with actual ### binomial responses yrsp2 = rbinom(172, rep(10,172),tmpglm$fit) glmfitC = glm(cbind(yrsp2, 10-yrsp2) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=quasibinomial) > summary(glmfitC)$disp [1] 1.024752 ### This indicates that the quasilikelihood estimation ### CAN be used to distinguish over or under dispersion ### from an appropriate logistic regression model ! ### So now we return to the "original" (modified) PBC data: > summary(update(tmpglm, family=quasibinomial))$disp ## 0.8799445 > prbtC = glm(cbind(Surv41, 1-Surv41) ~ LOGBILI + TREATGP + AGEVAR, data=pbcdat, family=quasibinomial(link="probit")) [1] 0.8770686