# This file contains the code used in the Examples of manuscript: # Calcagno & de Mazancourt J. Stat. Soft. # It uses the R package glmulti (version 0.6-1 or later) # Vincent Calcagno, 2009 library("glmulti") # Using the "d" method # Section 3.1 # Figure 2 ################################# # How the number of models increases with the number of predictors dd = matrix(nc= 6, nr = 6) ddsimple= matrix(nc= 6, nr = 6) ddmarg= matrix(nc= 6, nr = 6) # make a dummy data frame dummyfactor=as.factor(c(rep("y",5),rep("yn",5),rep("n",10))) dummycovariate = 1:20 dod = data.frame(2:21,dummyfactor,dummyfactor,dummyfactor,dummyfactor,dummyfactor,dummyfactor,dummycovariate,dummycovariate,dummycovariate,dummycovariate,dummycovariate,dummycovariate) # compute for (i in 1:6) for (j in 1:6) dd[i,j]= glmulti(names(dod)[1],names(dod)[c(2:(2+i-1),8:(8+j-1))], data=dod, method="d") for (i in 1:6) for (j in 1:6) ddsimple[i,j]= glmulti(names(dod)[1],names(dod)[c(2:(2+i-1),8:(8+j-1))], data=dod, method="d",level=1) for (i in 1:6) for (j in 1) ddmarg[i,j]= glmulti(names(dod)[1],names(dod)[c(2:(2+i-1),8:(8+j-1))], data=dod, method="d", marginality=T) for (i in 1) for (j in 1:6) ddmarg[i,j]= glmulti(names(dod)[1],names(dod)[c(2:(2+i-1),8:(8+j-1))], data=dod, method="d", marginality=T) # the increase is not the same depending on whether factors or covariates are added dd # this is not true when interactions are not considered ddsimple # plots Figure 2 A par(mfrow=c(2,1)) dd = .readRDS(file="dd") plot(log(dd[,1],10),type="l",xlab="Number of predictors added",ylab="Log10 number of candidate models",cex.axis=1.2, cex.lab=1.2, cex.main=1.2, lwd=2) lines(log(dd[1,],10),col="red",lwd=2) points(log(ddsimple[,1],10)+0.05,lwd=2) points(log(ddsimple[1,],10),col="red",lwd=2) lines(log(ddmarg[1,],10),col="red", lty=21, lwd=2) # What is the complexity of the models? # With 1 factor and 5 covariates # plots Figure 2 B dd = integer(40) for (i in 1:40) dd[i]= glmulti("X2.21", names(dod)[c(2:(2+1-1),8:(8+5-1))], data=dod, method="d", minK=(i), maxK=(i)) plot(dd/sum(dd),cex.axis=1.2, cex.lab=1.2, cex.main=1.2, lwd=2, type="b",xlab="Model complexity (K)",ylab="Density of candidate models") # With 3 factors and 2 covariates dd = integer(40) for (i in 1:40) dd[i]= glmulti("X2.21", names(dod)[c(2:(2+3-1),8:(8+2-1))], data=dod, method="d", minK=(i), maxK=(i)) lines(dd/sum(dd),col="red",cex.axis=1.2, cex.lab=1.2, cex.main=1.2, lwd=2) # This shows that most models have intermediate complexity # This implies that setting constraints on model complexity is unlikely to reduce the number of candidate models importantly # We now use simulated datasets # Section 3.2 ################################ # this generates datasets gaga = function(ss, nono=1) { a = runif(ss) b = runif(ss) c = rexp(ss) f1 = as.factor(sample(c(rep("y",round(ss/3)),rep("n",ss-round(ss/3))))) f2 = as.factor(sample(c(rep("y",round(ss/3)),rep("n",ss-round(ss/3))))) y = function (i) { if (f1[i]=="y") return(-2*b[i]*a[i] + 3*a[i] -2 + rnorm(1,sd=nono)) else return(-2*b[i]*a[i] -3*a[i] + 2 + rnorm(1,sd=nono)) } resp = numeric(ss) for (j in 1:ss) resp[j] = y(j) data.frame(r=resp,a=a,b=b,c=c,f1=f1,f2=f2) } # estimate R2 of full model riri = 0 for (i in 1:100) riri = riri+ summary(lm(r~(.)^2, data=gaga(100, nono = 0.5)))$r.squared print(riri/100) # number of models to be compared glmulti(r~a*b*c*f1*f2, data= gaga(20), plotty=F, report=F, fitfunc=lm, crit=bic, method="d") # example (Figure 3) obj <- glmulti(r~a*b*c*f1*f2, data= gaga(20), plotty=F, report=F, fitfunc=lm, crit=bic, method="h") plot(obj, highlight=c("f2:f1")) plot(obj, type = "w", highlight=c("f2:f1")) plot(obj, type = "s") # Replicate exhaustive screening with n=20, 50, 100 and 150 # uses bic and no marginality aloha= list() aloharaw = list() for (ss in c(20, 50, 100, 150)) { billy = matrix(0,nr=100, nc=15) for (r in 1:100) { datamock = gaga(ss, nono=0.5) rez <- glmulti(r~a*b*c*f1*f2, data=datamock, plotty=F, report=F, fitfunc=lm, marginality=F, crit=bic, confsetsize=100) aloharaw = c(aloharow, rez) billy[r,]=summary(rez)$termweights } aloha=c(aloha,billy) } # is the true model found ? # box plot of results (Figure 4) nous = as.numeric(unlist(aloha)) par (mfrow=c(1,4)) for (ss in c(0:3)) { bibi = matrix(nous[(1+ss*1500):(1500+ss*1500)], nc=15) dimnames(bibi) = list(c(),names(rez@codes[,!is.na(rez@codes[1,])])) boxplot(as.data.frame(bibi), col=c("gray", "white", "gray", gray(0.9), rep("white",2), "gray", rep("white", 2), "gray", rep("white",5)), xlab="Terms", ylab=c("","Importance")[ss==0], main=paste("n=",c(20,50,100,10000,150)[ss+1],sep=""),las=2,cex.axis=1.2, cex.lab=1.2, cex.main=2) abline(h=0.8, col="red") points(y= rep(1.015,4), x=c(1,3,7,10), pch=19, col="red") } # Type I and Type II errors nbs = 50 alpha = matrix(nc=nbs,nr=5) beta = matrix(nc=nbs,nr=5) alphaP = matrix(nc=nbs,nr=5) betaP = matrix(nc=nbs,nr=5) for (ss in 0:3) { bibi = matrix(nous[(1+ss*1500):(1500+ss*1500)], nc=15) dimnames(bibi) = list(c(),names(rez@codes[,!is.na(rez@codes[1,])])) for (t in 1:nbs) { baba = bibi>=(0.99*t/nbs) alf = (rep(1,100)%*%baba)/100 alpha[ss+1, t] = sum(alf[-c(1,3,7,10)]) baba = !baba alf = (rep(1,100)%*%baba)/100 beta[ss+1, t] = sum(alf[c(1,3,7,10)]) baba = bibi>=(0.99*t/nbs) alf = (baba[,-c(1,3,7,10)])%*%rep(1,11)>= 1 alphaP[ss+1, t] = sum(alf)/100 baba = !baba alf = (baba[,c(1,3,7,10)])%*%rep(1,4)>= 1 betaP[ss+1, t] = sum(alf)/100 } } # plots Figure 5 plot(alphaP[1,],type="l", ylim=c(0,1), col=grey(0.5), xlab="Importance threshold",lwd=2,cex.axis=1.2, cex.lab=1.2, cex.main=1.2, ylab="Probabilities (types I and II)", main="Prob(at least one mistake)",axes=F) axis(1,at=c(0,10,20,30,40,50), labels=round(0.99*c(0,10,20,30,40,50)/50,1)) axis(2,at=c(0,0.2,0.4,0.6,0.8,1), labels=c(0,0.2,0.4,0.6,0.8,1)) lines(alphaP[2,], col=grey(0.3), lwd=2) lines(alphaP[3,], col=grey(0.25), lwd=2) lines(alphaP[4,], col=grey(0), lwd=2) lines(betaP[1,], ylim=c(0,1), col=grey(0.5),lwd=4, lty=21) lines(betaP[2,], col=grey(0.3),lwd=4, lty=21) lines(betaP[3,], col=grey(0.25),lwd=4, lty=21) lines(betaP[4,], col=grey(0),lwd=4, lty=21) abline(v= which(abs(0.99*(1:50)/50-0.8)==min(abs(0.99*(1:50)/50-0.8))),col="red") abline(h=0.05, col="blue") # We now use the genetic algorithm approach on a sample data set of size 100 dolly = gaga(100, nono=0.5) # time it takes to perform exhaustive screening ini = Sys.time() reference = glmulti(r~a*b*c*f1*f2, data=dolly, plotty=F, report=F, fitfunc=lm, marginality=F, crit=bic, confsetsize=100) fini = Sys.time() fini-ini # run 500 simulations with random parameters rob = list() for (r in 1:500) rob=c(rob,glmulti(r~a*b*c*f1*f2, data=dolly, plotty=F, report=F, fitfunc=lm, marginality=F, crit=bic, confsetsize=100, method="g", popsize=sample(c(10,20,30,40,100,150),1), mutrate=sample(c(10^-3,10^-2,10^-4),1), sexrate=sample(c(0,0.1,0.2),1),imm=sample(c(0,0.2,0.5),1), deltaM=sample(c(0,0.1,0.2),1), deltaB=sample(c(0.1,0.2,0.5),1), conseq=sample(c(2,3,4,5),1))) # Distribution of computation time titou = as.numeric(lapply(rob,function(x) x@params$elapsed)) hist(titou) # I C profiles against the truth (Figure 6) okays = 1:500 par(mfrow=c(1,1)) plot(x=reference@crits,y=rob[[okays[1]]]@crits,type="p",pch=19, col="grey",xlab="IC profile from exhaustive screening", main="", ylab="IC profile from genetic algorithm", cex.axis=1.2, cex.lab=1.2, cex.main=1.2, lwd=2, ylim=c(min(reference@crits),340)) for(i in 2:length(okays)) if (length(rob[[okays[i]]]@crits)==100) points(x=reference@crits,y=rob[[okays[i]]]@crits,pch=19, col="grey") abline(a=0,b=1, col="red") # how often did we find the best model ? ref = as.character(reference@formulas[1]) mamaga = as.numeric(unlist(lapply(aloharawn, function(x) as.character(x@formulas[1])==ref))) sum(mamas)/500 # Convergence dynamics (Figure 7) par(mfrow=c(2,3)) tt1=2 hist(as.numeric(unlist(lapply(rob, function (t) t@params$dynab[min(tt1,length(t@params$dynab))]))), ann=F, xlab="",ylim=c(0,0.3),freq=F,breaks=150+3*(0:49), main="100 gen.", col="grey", cex.axis=1.2, cex.lab=1.2, cex.main=2) abline(v=reference@crits[1], col="red") tt1=50 hist(as.numeric(unlist(lapply(rob, function (t) t@params$dynab[min(tt1,length(t@params$dynab))]))), xlab="Best IC in confidence set", ylab="", main="2500 gen.",ylim=c(0,0.3),freq=F,,breaks=150+3*(0:49),col="grey",cex.axis=1.2, cex.lab=1.2, cex.main=2) abline(v=reference@crits[1], col="red") tt1=100 hist(as.numeric(unlist(lapply(rob, function (t) t@params$dynab[min(tt1,length(t@params$dynab))]))), xlab="", ylab="", main="5000 gen.", ylim=c(0,0.3),freq=F,,breaks=150+3*(0:49),col="grey",cex.axis=1.2, cex.lab=1.2, cex.main=2) abline(v=reference@crits[1], col="red") #tt1=300 #hist(as.numeric(unlist(lapply(rob, function (t) t@params$dynab[min(tt1,length(t@params$dynab))]))), #xlab="", ylab="", main="10000 gen.", ylim=c(0,0.3),freq=F,,breaks=150+3*(0:49),col="grey") tt1=2 hist(as.numeric(unlist(lapply(rob, function (t) t@params$dynam[min(tt1,length(t@params$dynab))]))), xlab="", main="",ylim=c(0,0.06),freq=F,,breaks=150+4.5*(0:49),col="grey",cex.axis=1.2, cex.lab=1.2, cex.main=2) abline(v=mean(reference@crits), col="red") tt1=50 hist(as.numeric(unlist(lapply(rob, function (t) t@params$dynam[min(tt1,length(t@params$dynab))]))), xlab="Mean IC in confidence set", ylab="", main="",ylim=c(0,0.06),freq=F,,breaks=150+4.5*(0:49),col="grey",cex.axis=1.2, cex.lab=1.2, cex.main=2) abline(v=mean(reference@crits), col="red") tt1=100 hist(as.numeric(unlist(lapply(rob, function (t) t@params$dynam[min(tt1,length(t@params$dynab))]))), xlab="", ylab="", main="",ylim=c(0,0.06),freq=F,,breaks=150+4.5*(0:49),col="grey",cex.axis=1.2, cex.lab=1.2, cex.main=2) abline(v=mean(reference@crits), col="red") # Effect of immigration on convergence (Figure 8) # Figure bimodes + immigration mimi = as.numeric(lapply(rob, function(x) mean(x@crits))) immvals = as.numeric(lapply(rob, function(x) x@params$imm)) iilow=which(((immvals==0)*(as.numeric(unlist(lapply(rob, function(x) length(x@crits))))==100))==T) iihigh=which(((immvals==0.5)*(as.numeric(unlist(lapply(rob, function(x) length(x@crits))))==100))==T) par(mfrow = c(1,2)) baby <- hist(mimi, breaks=40, freq=F, right=F,xlab="Mean IC in returned confidence set", main="No immigration",col="gray", ylim=c(0,0.042),plot=F)$breaks hist(mimi[iilow], breaks=baby, freq=F, right=F,xlab="Mean IC in returned confidence set", main="No immigration",col="gray", ylim=c(0,0.042)) abline(v=mean(reference@crits),col="red") text(x=mean(reference@crits)+15, y=0.041,col="red", labels="Target value") hist(mimi[iihigh],breaks=baby, right=F, freq=F,xlab="Mean IC in returned confidence set", main="Immigration (0.5)",col="gray", ylim=c(0,0.13)) abline(v=mean(reference@crits),col="red") text(x=mean(reference@crits)+14, y=0.127,col="red", labels="Target value") # Efficiency of consensus # take random consensus of 2 or 4 objects robby = list() for (i in 1:500) robby=c(robby, consensus(list(rob[[sample(okays,1)]],rob[[sample(okays,1)]]),confsetsize=100)) robby4 = list() for (i in 1:500) robby4=c(robby4, consensus(list(rob[[sample(okays,1)]],rob[[sample(okays,1)]],rob[[sample(okays,1)]],rob[[sample(okays,1)]]),confsetsize=100)) susuall = matrix(nr=500,nc=length(summary(reference)$termweights)) for (ww in 1:500) susuall[ww,] = summary(rob[[ww]])$termweights susuC = matrix(nr=500,nc=length(summary(reference)$termweights)) for (ww in 1:500) susuC[ww,] = summary(robby[[ww]])$termweights susuC4 = matrix(nr=500,nc=length(summary(reference)$termweights)) for (ww in 1:500) susuC4[ww,] = summary(robby4[[ww]])$termweights dimnames(susuC4) <- dimnames(susuC) <- dimnames(susuall) <- list(c(),names(rob[[1]]@codes[,!is.na(rob[[1]]@codes[1,])])) # plots Figure 9 par(mfrow = c(1,3)) boxplot(as.data.frame(susuall),xlab="", ylab="Term importance", main="No consensus",cex.axis=1.2, cex.lab=1.2, cex.main=2, las=2) points(x=1:15,y=summary(reference)$termweights, col="red", pch=19) boxplot(as.data.frame(susuC),xlab="", ylab="", main="Consensus of 2",cex.axis=1.2, cex.lab=1.2, cex.main=2, las=2) points(x=1:15,y=summary(reference)$termweights, col="red", pch=19) boxplot(as.data.frame(susuC4),xlab="", ylab="", main="Consensus of 4",cex.axis=1.2, cex.lab=1.2, cex.main=2, las=2) points(x=1:15,y=summary(reference)$termweights, col="red", pch=19) # We now use the birthwt dataset in package MASS # Section 3.3 ################################ library("MASS") # this prepares the data exactly like Venables1997 attach(birthwt) race <- factor(race,labels=c("white","black","other")) ptd <- factor(ptl>0,labels=c("y","n")) ftv <- factor(ftv) levels(ftv)[-(1:2)] <- "2+" bwt = data.frame(low=factor(low),age,lwt,race,smoke=factor(smoke>0,labels=c("y","n")), ptd, ht=factor(ht>0,labels=c("y","n")),ui=factor(ui>0,labels=c("y","n")),ftv) detach("birthwt") rm(race, ptd,ftv) # there are lots of models: glmulti("low", names(bwt)[-1], data=bwt, family=binomial, method="d", level=1, confsetsize=300) glmulti("low", names(bwt)[-1], data=bwt, family=binomial, method="d", marginality=FALSE) glmulti("low", names(bwt)[-1], data=bwt, family=binomial, method="d", marginality=TRUE) # Exhaustive screening # main effects only test1 <- glmulti(low~., data=bwt, family=binomial, crit=aic, level=1) # Plots Figure 10 par(mfrow= c(2,1)) plot(test1, type="p") plot(test1, type="s") # With interactions: split exhaustive search into several parts (faster) # This was run on 20 nodes on a cluster partobj <- glmulti(low~., data=bwt, family=binomial, chunk=X, chunks=20, plotty=F, report=F, marginality=T, crit=aic, name="exhausting") write(partobj, file="|object") # X in 1.. 20 # write used this way produces raw R object # they can read with function consensus or more directly with .readRDS() # put them together fullobj <- consensus(list.files(pattern="exhausting"), confsetsize = 100) # best models: print(fullobj) # Figure 11: par(mfrow= c(2,1)) plot(fullobj, type="p") plot(fullobj, type="s") # We now run 20 genetic algorithms rob = list() for (i in 1:20) { rez <- glmulti(low~.*., data=bwt, family=binomial, name=paste("ga", i), plotty=FALSE, marginality=T, report=F, method="g", popsize=100, mutrate=0.06, sexrate=0.15, imm=0.2, deltaM=0.01, deltaB=0, conseq=6) rob=c(rob, rez) } # Convergence through time (Figure 12) plot(rob[[1]]@params$dynab, type="p", col="grey",pch=19) for (i in 2:500) points(rob[[i]]@params$dynab, col="grey",pch=19) meanB = matrix(c(rob[[1]]@params$dynab, rep(0, (300-length(rob[[1]]@params$dynab)))), nr=1, nc=300) for (i in 2:500) { meanB = rbind(meanB, c(rob[[i]]@params$dynab, rep(mean(rob[[i]]@params$dynab), (300-length(rob[[i]]@params$dynab))))) } meanB2 = numeric(300) for (i in 1:120) meanB2[i]=mean(meanB[,i]) lines(x=50*(1:300),y=meanB2, col="blue") # make a consensus of them and check results are the same as with exhaustive search coco <- consensus(rob, confsetsize = 100) par(mfrow= c(2,1)) plot(coco, type="p") plot(coco, type="s")