###################################################################### ## SQUAREM: An R Package for Off-the-Shelf Acceleration of EM ## License: GPL-2 | GPL-3 [expanded from: GPL (≥ 2)] ## Replication Material ###################################################################### library("SQUAREM") ## 4 Poisson Mixtures ## log likelihood function poissmix.loglik poissmix.loglik <- function(p, y) { i <- 0:(length(y)-1) loglik <- y*log(p[1]*exp(-p[2])*p[2]^i/exp(lgamma(i+1)) + (1 - p[1])*exp(-p[3])*p[3]^i/exp(lgamma(i+1))) ## returns negative log likelihood of data return(-sum(loglik)) } ## basic EM algorithm EM.poisson.mixture <- function(p, maxiter = 5000, tol = 1e-08, y) { ## y is the data iter <- 1 conv <- FALSE pnew <- rep(NA_real_, 3) while (iter < maxiter) { ## start of one EM step i <- 0:(length(y)-1) zi <- p[1]*exp(-p[2])*p[2]^i / (p[1]*exp(-p[2])*p[2]^i + (1 - p[1])*exp(-p[3])*p[3]^i) pnew[1] <- sum(y*zi)/sum(y) pnew[2] <- sum(y*i*zi)/sum(y*zi) pnew[3] <- sum(y*i*(1-zi))/sum(y*(1-zi)) res <- sqrt(crossprod(pnew - p)) p <- pnew ## end of one EM step if (res < tol) { conv <- TRUE break } iter <- iter + 1 } loglik <- poissmix.loglik(p, y = y) return(list(par = p, value.objfn = loglik, fpevals = iter, convergence = conv)) } ## function poissmix.em argument for fixptfn in squarem() poissmix.em <- function(p, y) { ## The fixed point mapping giving a single iteration of ## the original EM algorithm ## y is the data pnew <- rep(NA_real_, 3) i <- 0:(length(y)-1) zi <- p[1]*exp(-p[2])*p[2]^i/(p[1]*exp(-p[2])*p[2]^i + (1 - p[1])*exp(-p[3])*p[3]^i) pnew[1] <- sum(y*zi)/sum(y) pnew[2] <- sum(y*i*zi)/sum(y*zi) pnew[3] <- sum(y*i*(1-zi))/sum(y*(1-zi)) p <- pnew return(pnew) } ## Compare Squarem and basic EM algorithm poissmix.dat <- data.frame(death = 0:9, freq = c(162, 267, 271, 185, 111, 61, 27, 8, 3, 1)) y <- poissmix.dat$freq p0 <- c(0.3, 1, 5) ## EM standard loop system.time(f0 <- EM.poisson.mixture(p = p0, y = y)) f0 ## EM fpiter system.time(f1 <- fpiter(par = p0, fixptfn = poissmix.em, objfn = poissmix.loglik, control = list(tol = 1.e-08), y = y)) f1 ## Squarem system.time(f2 <- squarem(par = p0, fixptfn = poissmix.em, objfn = poissmix.loglik, control = list(tol = 1.e-08), y = y)) f2 ## randomly generate 5000 starting values and compare set.seed(407) sim <- 5000 tem <- tsqem <- fem <- fsqem <- rep(NA_real_, sim) for (i in 1:sim) { print(i) p0 <- c(runif(1, 0.05, 0.95), runif(2, 0, 20)) tem[i] <- system.time(f1 <- fpiter(par = p0, fixptfn = poissmix.em, objfn = poissmix.loglik, control = list(tol = 1.e-08, maxiter = 10000), y = y))[3] fem[i] <- f1$fpevals tsqem[i] <- system.time(f2 <- squarem(par = p0, fixptfn = poissmix.em, objfn = poissmix.loglik, control = list(tol = 1.e-08, maxiter = 4000), y = y))[3] fsqem[i] <- f2$fpevals print(c(f1$convergence, f2$convergence)) } quantile(tem, probs = c(0.025, 0.5, 0.975)) quantile(tsqem, probs = c(0.025, 0.5, 0.975)) quantile(fem, probs = c(0.025, 0.5, 0.975)) quantile(fsqem, probs = c(0.025, 0.5, 0.975)) ## Compute Error Curve poissmix.dat <- data.frame(death = 0:9, freq = c(162, 267, 271, 185, 111, 61, 27, 8, 3, 1)) y <- poissmix.dat$freq p0 <- c(0.3, 1, 5) true.theta <- squarem(par = p0, fixptfn = poissmix.em, objfn = poissmix.loglik, control = list(tol = 1.e-18), y = y)$par squarem1 <- function(par, fixptfn, objfn, ... , control = list()) { ## par = starting value of parameter vector ## fixptfn = fixed-point iteration F(x) ## for which the solution: F(x*) = x* is sought ## objfn = underlying objective function which is minimized at x* control.default <- list(K = 1, square = TRUE, method = 3, step.min0 = 1, step.max0 = 1, mstep = 4, kr = 1, objfn.inc = 1, tol = 1.e-07, maxiter = 1500, trace = FALSE) namc <- names(control) if (!all(namc %in% names(control.default))) stop("unknown names in control: ", namc[!(namc %in% names(control.default))]) ctrl <- modifyList(control.default, control) ## ## method = 1, 2, or 3, indicating the type of steplength to be used ## A key parameter is `step.min0'. This can be either positive or negative if the eigenvalues of the Jacobian of fixed-point mapping are all positive; ## We set it to +1 for contraction mappings such as EM and MM algorithms ## Must be negative, e.g. -1, if the fixed-point mapping can have negative eiegnvalues ## ## parameter "objfn.inc" dictates the amount of non-monotonicity in the objective function ## setting objfn.inc=0 would enforce monotonicity, whereas objfn.inc=Inf would be a non-monotonic scheme ## The defalut objfn.inc=1 would enforce monotonicity far from solution, but allows for non-monotonicity closer to solution ## method <- ctrl$method maxiter <- ctrl$maxiter tol <- ctrl$tol step.min <- ctrl$step.min0 step.max0 <- ctrl$step.max0 step.max <- ctrl$step.max0 mstep <- ctrl$mstep objfn.inc <- ctrl$objfn.inc trace <- ctrl$trace if (trace) cat("Squarem-1 \n") if (missing(objfn)) stop("\n squarem2 should be used if objective function is not available \n\n") iter <- 1 p <- par lold <- objfn(p, ...) leval <- 1 if (trace) cat("Objective fn: ", lold, "\n") feval <- 0 conv <- TRUE error <- NULL while (feval < maxiter) { extrap <- TRUE p1 <- try(fixptfn(p, ...), silent = TRUE) feval <- feval + 1 if (is(p1, "try-error") || any(is.nan(unlist(p1)))) stop("Error in function evaluation") q1 <- p1 - p sr2 <- crossprod(q1) if (sqrt(sr2) < tol) break p2 <- try(fixptfn(p1, ...), silent = TRUE) feval <- feval + 1 if (is(p2, "try-error") || any(is.nan(unlist(p2)))) stop("Error in function evaluation") q2 <- p2 - p1 sq2 <- sqrt(crossprod(q2)) if (sq2 < tol) break sv2 <- crossprod(q2-q1) srv <- crossprod(q1, q2-q1) alpha <- switch(method, -srv/sv2, -sr2/srv, sqrt(sr2/sv2)) alpha <- max(step.min, min(step.max, alpha)) p.new <- p + 2*alpha*q1 + alpha^2*(q2-q1) if (abs(alpha - 1) > 0.01 ) { p.new <- try(fixptfn(p.new , ...), silent=TRUE) # stabilization step feval <- feval + 1 } if (is(p.new, "try-error") || any(is.nan(p.new))) { p.new <- p2 lnew <- try(objfn(p2, ...), silent = TRUE) leval <- leval + 1 if (alpha == step.max) step.max <- max(step.max0, step.max/mstep) alpha <- 1 extrap <- FALSE } else { if (is.finite(objfn.inc)) { lnew <- try(objfn(p.new, ...), silent = TRUE) leval <- leval + 1 } else lnew <- lold if (is(lnew, "try-error") || is.nan(lnew) || (lnew > lold + objfn.inc)) { p.new <- p2 lnew <- try(objfn(p2, ...), silent = TRUE) leval <- leval + 1 if (alpha == step.max) step.max <- max(step.max0, step.max/mstep) alpha <- 1 extrap <- FALSE } } if (alpha == step.max) step.max <- mstep*step.max if (step.min < 0 & alpha == step.min) step.min <- mstep*step.min p <- p.new if (!is.nan(lnew)) lold <- lnew if (trace) cat("Objective fn: ", lnew, " Extrapolation: ", extrap, " Steplength: ", alpha, "\n") iter <- iter + 1 error <- rbind(error, c(feval, sqrt(crossprod(p - true.theta)))) } if (feval > maxiter) conv <- FALSE if (is.infinite(objfn.inc)) { lold <- objfn(p, ...) leval <- leval + 1 } return(list(par = p, value.objfn = lold, iter = iter, fpevals = feval, objfevals = leval, convergence = conv, error = error)) } fpiter1 <- function(par, fixptfn, objfn = NULL, control = list(), ...) { control.default <- list(tol = 1e-07, maxiter = 5000, trace = FALSE) namc <- names(control) if (!all(namc %in% names(control.default))) stop("unknown names in control: ", namc[!(namc %in% names(control.default))]) ctrl <- modifyList(control.default, control) tol <- ctrl$tol maxiter <- ctrl$maxiter trace <- ctrl$trace if (trace) cat("fpiter \n") iter <- 1 objeval <- 0 conv <- FALSE error <- NULL while (iter < maxiter) { p.new <- fixptfn(par, ...) res <- sqrt(crossprod(p.new - par)) if (res < tol) { conv <- TRUE break } if (trace) { if (!is.null(objfn)) { cat("Iter: ", iter, "Objective fn: ", objfn(par, ...), "\n") objeval <- objeval + 1 } else cat("Iter: ", iter, "Residual: ", res, "\n") } par <- p.new iter <- iter + 1 error <- rbind(error, c(iter, sqrt(crossprod(par - true.theta)))) } loglik.best <- if (!is.null(objfn)) objfn(par, ...) else NA_real_ return(list(par = par, value.objfn = loglik.best, fpevals = iter, objfevals = objeval, convergence = conv, error = error)) } errsq <- squarem1(par = p0, fixptfn = poissmix.em, objfn = poissmix.loglik, control = list(tol = 1.e-08), y = y)$error errem <- fpiter1(par = p0, fixptfn = poissmix.em, objfn = poissmix.loglik, control = list(tol = 1.e-08), y = y)$error err.data <- data.frame(rbind(errsq, errem)) err.data$group <- rep(c("Squarem", "EM"), c(nrow(errsq), nrow(errem))) names(err.data) <- c("feval", "error", "group") library("ggplot2") library("latex2exp") p <- ggplot(err.data[err.data$feval <= 100, ], aes(x = feval, y = error, color = group)) + geom_point() + geom_line() + theme_bw() + xlab("The number of EM Evaluations") + scale_y_log10() + ylab(TeX(paste("Error", "$||\\theta^{(k)} - \\theta^{*}||$", "(log10 scale)"))) + guides(colour = guide_legend(title = "Algorithm")) + theme(plot.title = element_text(size = 20), axis.title = element_text(size = 15), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) p ## 5.1 Interval Censoring library("interval") ## Function to compute alpha matrix and intervals Aintmap <- function(L, R, Lin = NULL, Rin = NULL) { n <- length(L) Lin <- rep(FALSE, n) Rin <- rep(TRUE, n) Lin[L == R] <- TRUE Rin[R == Inf] <- FALSE if(n != length(R)) stop("length of L and R must be the same") LRvalues <- sort(unique(c(0, L, R, Inf))) eps <- min(diff(LRvalues))/2 Le <- L Re <- R Le[!Lin] <- L[!Lin]+eps Re[!Rin] <- R[!Rin]-eps oLR <- order(c(Le, Re+eps/2)) Leq1.Req2 <- c(rep(1, n), rep(2, n)) flag <- c(0, diff(Leq1.Req2[oLR])) R.right.of.L <- (1:(2*n))[flag == 1] intmapR <- c(L, R)[oLR][R.right.of.L] intmapL <- c(L, R)[oLR][R.right.of.L - 1] intmapRin <- c(Lin, Rin)[oLR][R.right.of.L] intmapLin <- c(Lin, Rin)[oLR][R.right.of.L - 1] intmap <- matrix(c(intmapL, intmapR), byrow = TRUE, nrow = 2) attr(intmap, "LRin") <- matrix(c(intmapLin, intmapRin), byrow = TRUE, nrow = 2) k <- dim(intmap)[[2]] Lbracket <- rep("(", k) Lbracket[intmapLin] <- "[" Rbracket <- rep(")", k) Rbracket[intmapRin] <- "]" intname <- paste(Lbracket, intmapL, ", ", intmapR, Rbracket, sep = "") A <- matrix(0, n, k, dimnames = list(1:n, intname)) intmapLe <- intmapL intmapLe[!intmapLin] <- intmapL[!intmapLin]+eps intmapRe <- intmapR intmapRe[!intmapRin] <- intmapR[!intmapRin]-eps for (i in 1:n) { tempint <- Le[i] <= intmapRe & Re[i] >= intmapLe A[i, tempint] <- 1 } if (k==1 & intmap[1, 1] == 0 & intmap[2, 1] == Inf) A[A == 0] <- 1 return(A) } ## A wrapper to EM and Squarem IntervalCensor <- function(Imat, algorithm = c("EM", "Squarem"), control.setting = list()) { ## Imat = a 2-column matrix containing ## the intervals for each individual if (is.list(Imat)) Imat <- as.matrix(Imat) N <- nrow(Imat) ## Use Michael Fay's function to generate `A' matrix A <- Aintmap(Imat[, 1], Imat[, 2]) m <- ncol(A) tA <- t(A) ## EM mapping - a single EM iteration intEM <- function(pvec) { Ap <- pvec*tA pnew <- colMeans(t(Ap)/colSums(Ap)) pnew * (pnew > 0) } ## Function of log-likelihood loglik <- function(pvec) { - sum(log(c(A %*% pvec))) } ## starting values pvec <- rep(1/m, length = m) if (algorithm == "Squarem") { ## SQUAREM acceleration control.sq.default <- list(K = 1, maxiter = 10000, tol = 1.e-08) control <- modifyList(control.sq.default, control.setting) ans <- squarem(par = pvec, fixptfn = intEM, objfn = loglik, control = control) pvec <- ans$par iter <- ans$fpevals } else { control.em.default <- list(tol = 1.e-08, maxiter = 20000) control <- modifyList(control.em.default, control.setting) ans <- fpiter(par = pvec, fixptfn = intEM, objfn = loglik, control = control) pvec <- ans$par iter <- ans$fpevals } return(list(par = round(pvec, 4), loglik = -round(loglik(pvec = pvec), 4), iters = iter, convergence = ans$convergence)) } ## Modified EM-ICM to have same starting values EMICMmac.mod <- function(A, EMstep = TRUE, ICMstep = TRUE, keepiter = FALSE, tol = 1e-07, tolbis = 1e-07, maxiter = 1000) { if (!EMstep && !ICMstep) { print("One of EMstep or ICMstep must be true.") return(NULL) } Meps <- .Machine$double.eps m <- dim(A)[1] n <- dim(A)[2] tA <- t(A) if (m == 1) { ret <- NULL ret$sigma <- 1 ret$weights <- n ret$lastchange <- 0 ret$numiter <- 0 return(ret) } WW <- matrix(0, m, n) for (i in 1:(m - 1)) WW[i, ] <- A[i, ] - A[i + 1, ] WW[m, ] <- A[m, ] pvec <- rep(1/m, length = m) sigma <- cumsum(pvec) numiter <- 0 oldsigma <- rep(-1, m) if (keepiter) iter <- sigma while (sqrt(crossprod(oldsigma - sigma)) > tol && numiter <= maxiter) { oldsigma <- sigma if (EMstep) { pvec <- diff(c(0, sigma)) temp <- sweep(A, 1, pvec, FUN = "*") if (sum(apply(temp, 2, sum) == 0) == 0) { pvec <- apply(sweep(temp, 2, apply(temp, 2, sum), FUN = "/"), 1, sum) sigma <- cumsum(pvec)/sum(pvec) } if (keepiter) iter <- rbind(iter, sigma) } if (ICMstep) { Wps <- 1/(t(WW) %*% sigma) weights <- (abs(WW) %*% Wps^2) increment <- as.vector((WW %*% Wps)/weights) sigma <- sigma + increment sigma[m] <- 1 if (keepiter) iter <- rbind(iter, sigma) temp <- PMGA(sigma[-m], weights[-m]) poolnum <- c(0, cumsum(temp$poolnum)) for (i in 2:length(poolnum)) for (j in (poolnum[i - 1] + 1):poolnum[i]) sigma[j] <- temp$est[i - 1] if (keepiter) iter <- rbind(iter, sigma) temp <- c(0, sigma) pvec <- diff(c(0, oldsigma)) ndir <- diff(c(0, temp[2:(m + 1)])) - pvec pvec <- Bisect(tA, pvec, ndir, Meps, tolbis = 1e-07) sigma <- cumsum(pvec) if (keepiter) iter <- rbind(iter, sigma) } numiter <- numiter + 1 } if (numiter == maxiter) warning("EM/ICM may have failed to converge.") pf <- diff(c(0, sigma)) ret <- list(sigma = sigma, pf = pf, llk = sum(log(t(A) %*% pf)), weights = as.vector(weights), lastchange = sigma - oldsigma, numiter = numiter, eps = tol) if (keepiter) { if (EMstep && ICMstep) dimnames(iter) <- list(c("Seed", rep(c("EM", "Fisher", "PMGA", "Bisect"), numiter)), NULL) else if (EMstep) dimnames(iter) <- list(rep("EM", numiter + 1), NULL) else dimnames(iter) <- list(c("Seed", rep(c("Fisher", "PMGA"), numiter)), NULL) ret$iter <- iter } ret } EMICM.mod <- function (A, EMstep = TRUE, ICMstep = TRUE, keepiter = FALSE, tol = 1e-07, maxiter = 1000) { if (ncol(A) == 2 && all(A[, 2] >= A[, 1])) { ml <- Maclist(A) intmap <- t(MLEintvl(A, ml)) A <- Macmat(ml)$pmat } else intmap <- NULL temp <- EMICMmac.mod(A, EMstep = EMstep, ICMstep = ICMstep, keepiter = keepiter, tol = tol, maxiter = maxiter) if (is.null(temp)) return(NULL) class(temp) <- "icsurv" temp$intmap <- intmap temp } ## real data dat <- c(45, 6, 0, 46, 46, 7, 17, 7, 37, 0, 4, 15, 11, 22, 46, 46, 25, 46, 26, 46, 27, 36, 46, 36, 37, 40, 17, 46, 11, 38, 5, 37, 0, 18, 24, 36, 5, 19, 17, 24, 32, 33, 19, 37, 34, 36, Inf, 10, 7, Inf, Inf, 16, Inf, 14, 44, 8, 11, Inf, 15, Inf, Inf, Inf, 37, Inf, 40, Inf, 34, 44, Inf, 48, Inf, Inf, 25, Inf, 18, Inf, 12, Inf, 5, Inf, Inf, Inf, 11, 35, 25, Inf, Inf, Inf, 26, Inf, Inf, Inf) dat <- data.frame(matrix(dat, 46, 2)) names(dat) <- c("L", "R") dat system.time(ans1 <- IntervalCensor(Imat = dat, algorithm = "EM")) ## Number of EM Steps ans1$iters system.time(ans2 <- IntervalCensor(Imat = dat, algorithm = "Squarem")) ## Number of EM Steps ans2$iters ## The maximum difference in absolute value between EM and Squarem estimated parameters max(abs(ans1$par-ans2$par)) system.time(ans3 <- EMICM.mod(dat, EMstep = TRUE, ICMstep = TRUE, keepiter = FALSE, tol = 1e-08, maxiter = 1000)) ## The maximum difference in absolute value between EM, Squarem and EM-ICM estimated parameters max(abs(ans2$par-ans3$pf)) ## Apply EM, Squarem individually without the wrapper ## Function of one EM evaluation intEM <- function(pvec, A) { tA <- t(A) Ap <- pvec*tA pnew <- colMeans(t(Ap)/colSums(Ap)) pnew * (pnew > 0) } ## Function of negative log-likelihood loglik <- function(pvec, A) { - sum(log(c(A %*% pvec))) } A <- Aintmap(dat[, 1], dat[, 2]) m <- ncol(A) ## starting values pvec <- rep(1/m, length = m) ## EM system.time(ans1 <- fpiter(par = pvec, fixptfn = intEM, objfn = loglik, A = A, control = list(tol = 1e-8))) ## Squarem system.time(ans2 <- squarem(par = pvec, fixptfn = intEM, objfn = loglik, A = A, control = list(tol = 1e-8))) ## simulation example ## data generation gendata <- function(n, mu.nexam = 5) { foo <- matrix(NA_real_, nrow = n, ncol = 3) for (i in 1:n) { st <- rweibull(1, shape = 1, scale = 5) nexam <- rpois(1, mu.nexam) exam <- round(runif(nexam, 0, 10), 1) exam <- c(0, exam, Inf) foo[i, ] <- c(time = st, L = max(exam[st > exam]), R = min(exam[st <= exam])) } return(foo) } ## sample size 200 library("ggplot2") set.seed(350) nsim <- 100 t.sq <- t.emicm <- t.sqemicm <- t.em <- rep(NA_real_, nsim) f.sq <- f.em <- rep(NA_real_, nsim) for (i in 1:nsim) { dat.simul <- gendata(200)[, 2:3] ## EM t.em[i] <- system.time(ans1 <- IntervalCensor(Imat = dat.simul, algorithm = "EM", control.setting = list()))[3] f.em[i] <- ans1$iters ## SQUAREM - interval censoring t.sq[i] <- system.time(ans2 <- IntervalCensor(Imat = dat.simul, algorithm = "Squarem", control.setting = list()))[3] f.sq[i] <- ans2$iters ## EMICM t.emicm[i] <- system.time(ans3 <- EMICM.mod(dat.simul, EMstep = TRUE, ICMstep = TRUE, keepiter = FALSE, tol = 1e-08, maxiter = 1000))[3] } times <- c(t.em, t.sq, t.emicm) method <- rep(c("EM", "SQUAREM", "EMICM"), each = nsim) comp.algor <- data.frame(Algorithm = factor(method, levels = c("EM", "SQUAREM", "EMICM")), time = times) p <- ggplot(comp.algor, aes(x = Algorithm, y = time)) + geom_boxplot(aes(fill = Algorithm)) + scale_y_log10() + xlab("Algorithm") + ylab("CPU running time (log10 scale)") + theme_bw() + guides(colour = guide_legend(title = "Algorithm")) + theme(plot.title = element_text(size = 20), axis.title = element_text(size = 15), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) p mean(f.em); sd(f.em) mean(f.sq); sd(f.sq) ## sample size 2000 library("ggplot2") set.seed(350) nsim <- 100 t.sq <- t.emicm <- t.em <- rep(NA_real_, nsim) f.sq <- f.em <- rep(NA_real_, nsim) for (i in 1:nsim) { dat.simul <- gendata(2000)[, 2:3] ## EM t.em[i] <- system.time(ans1 <- IntervalCensor(Imat = dat.simul, algorithm = "EM", control.setting = list()))[3] f.em[i] <- ans1$iters ## SQUAREM - interval censoring t.sq[i] <- system.time(ans2 <- IntervalCensor(Imat = dat.simul, algorithm = "Squarem", control.setting = list()))[3] f.sq[i] <- ans2$iters ## EMICM t.emicm[i] <- system.time(ans3 <- EMICM.mod(dat.simul, EMstep = TRUE, ICMstep = TRUE, keepiter = FALSE, tol = 1e-08, maxiter = 1000))[3] } times <- c(t.em, t.sq, t.emicm) method <- rep(c("EM", "SQUAREM", "EMICM"), each = nsim) comp.algor <- data.frame(Algorithm = factor(method, levels = c("EM", "SQUAREM", "EMICM")), time = times) p <- ggplot(comp.algor, aes(x = Algorithm, y = time)) + geom_boxplot(aes(fill = Algorithm)) + scale_y_log10() + xlab("Algorithm") + ylab("CPU running time (log10 scale)") + theme_bw() + guides(colour = guide_legend(title = "Algorithm")) + theme(plot.title = element_text(size = 20), axis.title = element_text(size = 15), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) p mean(f.em); sd(f.em) mean(f.sq); sd(f.sq) ## 5.2 Genetics Global Ancestry Estimation Problem ## log likelihood function loglike <- function(param, X, K) { n <- nrow(X); p <- ncol(X) F <- matrix(param[1: (p * K)], p, K) Q <- matrix(param[(p * K + 1): (p * K + n * K)], n, K) loglikelihood <- sum(X * log(Q %*% t(F)) + (2 - X) * log(Q %*% (1-t(F)))) ##returns negative loglikelihood for minimization return(-loglikelihood) } admixture.em <- function(param, X, K) { eps <- 1e-6 n <- nrow(X); p <- ncol(X) ## Initialize the counts. m <- matrix(eps, n, K) n0 <- n1 <- matrix(eps, p, K) ## recover the parameters F, Q F <- matrix(param[1: (p * K)], p, K) Q <- matrix(param[(p * K + 1): (p * K + n * K)], n, K) ## Initialize storage for the posterior probabilities of the hidden ## (phased) genotypes x population indicators for a single sample. r <- array(0, dim = c(p, 4, K, K)) ## Update the expected allele counts and population counts. Repeat ## for each sample. for (i in 1:n) { colnames(r) <- c("00", "01", "10", "11") ## Repeat for each combination of ancestral populations. for (j in 1:K) { for (k in 1:K) { ## Compute the posterior probabilities r[, "00", j, k] <- (X[i, ] == 0) * (1 - F[, j]) * (1 - F[, k]) r[, "01", j, k] <- (X[i, ] == 1) * (1 - F[, j]) * F[, k] r[, "10", j, k] <- (X[i, ] == 1) * F[, j] * (1 - F[, k]) r[, "11", j, k] <- (X[i, ] == 2) * F[, j] * F[, k] r[, , j, k] <- r[, , j, k] * Q[i, j] * Q[i, k] } } ## Normalize the posterior probabilities. dim(r) <- c(p, 4*K^2) r <- r/rowSums(r) ## Add the posterior probabilities to sufficient statistics. dim(r) <- c(p, 4, K, K) colnames(r) <- c("00", "01", "10", "11") m[i, ] <- m[i, ] + apply(r, 3, sum) + apply(r, 4, sum) for (k in 1:K) { n0[, k] <- n0[, k] + rowSums(drop(r[, "00", k, ])) + rowSums(drop(r[, "01", k, ])) + rowSums(drop(r[, "00", , k])) + rowSums(drop(r[, "10", , k])) n1[, k] <- n1[, k] + rowSums(drop(r[, "10", k, ])) + rowSums(drop(r[, "11", k, ])) + rowSums(drop(r[, "01", , k])) + rowSums(drop(r[, "11", , k])) } } F <- n1 / (n0 + n1) Q <- m / rowSums(m) return(c(as.vector(F), as.vector(Q))) } load("geno.sim.RData") set.seed(413) ## starting value p <- 100 n <- 150 K <- 3 F <- matrix(runif(p*K), p, K) Q <- matrix(1/K, n, K) param.start <- c(as.vector(F), as.vector(Q)) ## EM system.time(f1 <- fpiter(par = param.start, fixptfn = admixture.em, objfn = loglike, control = list(tol = 1e-4), X = geno, K = 3)) f1$fpevals ## Squarem system.time(f2 <- squarem(par = param.start, fixptfn = admixture.em, objfn = loglike, control = list(tol = 1e-4, maxiter = 2000), X = geno, K = 3)) f2$fpevals ## 5.3 MM Problem ld <- read.table("lee_data.txt") head(ld, 5) ## log likelihood function binom.loglike <- function(par, Z, y) { zb <- c(Z %*% par) pib <- 1 / (1 + exp(-zb)) return(as.numeric(-t(y) %*% (Z %*% par) - sum(log(1 - pib)))) } ## quadratic majorization uniform bound qmub.update <- function(par, Z, y) { Zmat <- solve(crossprod(Z)) %*% t(Z) zb <- c(Z %*% par) pib <- 1 / (1 + exp(-zb)) ub <- pib - y par <- par - 4 * c(Zmat %*% ub) par } ## quadratic majorization non uniform bound qmvb.update <- function(par, Z, y) { zb <- c(Z %*% par) pib <- 1 / (1 + exp(-zb)) wmat <- diag((2 * pib - 1)/(2 * zb)) ub <- pib - y Zmat <- solve(t(Z) %*% wmat %*% Z) %*% t(Z) par <- par - c(Zmat %*% ub) par } Z <- as.matrix(ld[, 1:7]) y <- ld[, 8] p0 <- rep(10, 7) ## uniform bound system.time(ans1 <- fpiter(par = p0, fixptfn = qmub.update, objfn = binom.loglike, control = list(maxiter = 20000), Z = Z, y = y)) ans1 ## squared uniform bound system.time(ans2 <- squarem(par = p0, fixptfn = qmub.update, objfn = binom.loglike, Z = Z, y = y)) ans2 ## non-uniform bound system.time(ans3 <- fpiter(par = p0, fixptfn = qmvb.update, objfn = binom.loglike, control = list(maxiter = 20000), Z = Z, y = y)) ans3 ## squared non-uniform bound system.time(ans4 <- squarem(par = p0, fixptfn = qmvb.update, objfn = binom.loglike, Z = Z, y = y)) ans4 ## simulation set.seed(412) sim <- 500 tqm1 <- tsqqm1 <- tqm2 <- tsqqm2 <- rep(NA_real_, sim) fqm1 <- fsqqm1 <- fqm2 <- fsqqm2 <- rep(NA_real_, sim) for(i in 1:sim) { p0 <- runif(7, 0, 10) ## QM-uniform bound tqm1[i] <- system.time(ans1 <- fpiter(par = p0, fixptfn = qmub.update, objfn = binom.loglike, control = list(maxiter = 20000), Z = Z, y = y))[3] tsqqm1[i] <- system.time(ans2 <- squarem(par = p0, fixptfn = qmub.update, objfn = binom.loglike, Z = Z, y = y))[3] ## Squared QM-uniform bound fqm1[i] <- ans1$fpevals fsqqm1[i] <- ans2$fpevals ## QM-non-uniform bound tqm2[i] <- system.time(ans3 <- fpiter(par = p0, fixptfn = qmvb.update, objfn = binom.loglike, control = list(maxiter = 20000), Z = Z, y = y))[3] tsqqm2[i] <- system.time(ans4 <- squarem(par = p0, fixptfn = qmvb.update, objfn = binom.loglike, Z = Z, y = y))[3] ## Squared QM-non-uniform bound fqm2[i] <- ans3$fpevals fsqqm2[i] <- ans4$fpevals } times <- c(tqm1, tsqqm1, tqm2, tsqqm2) fevals <- c(fqm1, fsqqm1, fqm2, fsqqm2) method <- rep(c("QM_Unif", "SQUARE QM_Unif", "QM_Non", "SQUARE QM_Non"), each = sim) mean(tqm1); sd(tqm1) mean(tsqqm1); sd(tsqqm1) mean(tqm2); sd(tqm2) mean(tsqqm2); sd(tsqqm2) mean(fqm1); sd(fqm1) mean(fsqqm1); sd(fsqqm1) mean(fqm2); sd(fqm2) mean(fsqqm2); sd(fsqqm2) comp.algorithm <- data.frame(Algorithm = factor(method, levels = c("QM_Unif", "SQUARE QM_Unif", "QM_Non", "SQUARE QM_Non")), time = times, fevals = fevals) p <- ggplot(comp.algorithm, aes(x = Algorithm, y = time)) + geom_boxplot(aes(fill = Algorithm)) + scale_y_log10() + xlab("Algorithm") + ylab("CPU running time(log10 scale)") + theme_bw() + guides(colour = guide_legend(fill = "Algorithm")) + theme(plot.title = element_text(size = 20), axis.title = element_text(size = 15), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) p q <- ggplot(comp.algorithm, aes(x = Algorithm, y = fevals)) + geom_boxplot(aes(fill = Algorithm)) + scale_y_log10() + xlab("Algorithm") + ylab("The number of QM updates(log10 scale)") + theme_bw() + guides(fill = guide_legend(title = "Algorithm")) + theme(plot.title = element_text(size = 20), axis.title = element_text(size = 15), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) q ## Appendix Factor Analysis ## Real data example ## log-likelihood factor.loglik <- function(param, cyy) { ## extract beta matrix and tau2 from param beta.vec <- param[1:36] beta.mat <- matrix(beta.vec, 4, 9) tau2 <- param[37:45] tau2.mat <- diag(tau2) Sig <- tau2.mat + t(beta.mat) %*% beta.mat ## suppose n=145 since this does not impact the parameter estimation loglik <- -145/2 * log(det(Sig)) - 145/2 * sum(diag(solve(Sig, cyy))) ## the negative log-likelihood is returned return(-loglik) } ## one EM update factor.em <- function(param, cyy) { ## extract beta matrix and tau2 from param beta.vec <- param[1:36] beta.mat <- matrix(beta.vec, 4, 9) tau2 <- param[37:45] tau2.mat <- diag(tau2) ## compute delta/Delta inv.quantity <- solve(tau2.mat + t(beta.mat) %*% beta.mat) small.delta <- inv.quantity %*% t(beta.mat) big.delta <- diag(4) - beta.mat %*% inv.quantity %*% t(beta.mat) cyy.inverse <- t(small.delta) %*% cyy %*% small.delta + big.delta cyy.mat <- t(small.delta) %*% cyy ## update betas and taus beta.new <- matrix(0, 4, 9) beta.p1 <- solve(cyy.inverse[1:3, 1:3]) %*% cyy.mat[1:3, 1:4] beta.p2 <- solve(cyy.inverse[c(1, 2, 4), c(1, 2, 4)]) %*% cyy.mat[c(1, 2, 4), 5:9] beta.new[1:3, 1:4] <- beta.p1 beta.new[c(1, 2, 4), 5:9] <- beta.p2 tau.p1 <- diag(cyy)[1:4] - diag(t(cyy.mat[1:3, 1:4]) %*% solve(cyy.inverse[1:3, 1:3]) %*% cyy.mat[1:3, 1:4]) tau.p2 <- diag(cyy)[5:9] - diag(t(cyy.mat[c(1, 2, 4), 5:9]) %*% solve(cyy.inverse[c(1, 2, 4), c(1, 2, 4)]) %*% cyy.mat[c(1, 2, 4), 5:9]) tau.new <- c(tau.p1, tau.p2) param.new <- c(as.numeric(beta.new), tau.new) return(param.new) } ## one ECME update factor.ecme <- function(param, cyy) { n <- 145 ## extract beta matrix and tau2 from param beta.vec <- param[1:36] beta.mat <- matrix(beta.vec, 4, 9) tau2 <- param[37:45] tau2.mat <- diag(tau2) ## compute delta/Delta inv.quantity <- solve(tau2.mat + t(beta.mat) %*% beta.mat) small.delta <- inv.quantity %*% t(beta.mat) big.delta <- diag(4) - beta.mat %*% inv.quantity %*% t(beta.mat) cyy.inverse <- t(small.delta) %*% cyy %*% small.delta + big.delta cyy.mat <- t(small.delta) %*% cyy ## update betas beta.new <- matrix(0, 4, 9) beta.p1 <- solve(cyy.inverse[1:3, 1:3]) %*% cyy.mat[1:3, 1:4] beta.p2 <- solve(cyy.inverse[c(1, 2, 4), c(1, 2, 4)]) %*% cyy.mat[c(1, 2, 4), 5:9] beta.new[1:3, 1:4] <- beta.p1 beta.new[c(1, 2, 4), 5:9] <- beta.p2 ## update taus given betas A <- solve(tau2.mat + t(beta.new) %*% beta.new) sum.B <- A %*% (n * cyy) %*% A gradient <- - tau2/2 * (diag(n*A) - diag(sum.B)) hessian <- (0.5 * (tau2 %*% t(tau2))) * (A * (n * A - 2 * sum.B)) diag(hessian) <- diag(hessian) + gradient U <- log(tau2) U <- U - solve(hessian, gradient) # Newton step tau.new <- exp(U) param.new <- c(as.numeric(beta.new), tau.new) return(param.new) } ## Cyy cyy <- diag(9) cyy[upper.tri(cyy)] <- c(.554, .227, .296, .189, .219, .769, .461, .479, .237, .212, .506, .530, .243, .226, .520, .408, .425, .304, .291, .514, .473, .280, .311, .718, .681, .313, .348, .374, .241, .311, .730, .661, .245, .290, .306, .672) cyy[lower.tri(cyy)] <- t(cyy)[lower.tri(t(cyy))] ## starting value beta.trans <- matrix(c(0.5954912, 0.6449102, 0.7630006, 0.7163828, 0.6175647, 0.6464100, 0.6452737, 0.7868222, 0.7482302, -0.4893347, -0.4408213, 0.5053083, 0.5258722, -0.4714808, -0.4628659, -0.3260013, 0.3690580, 0.4326963, -0.3848925, -0.3555598, -0.0535340, 0.0219100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1931459, 0.4606456, -0.3622682, 0.0630371, 0.0431256), 9, 4) beta.start <- t(beta.trans) tau2.start <- rep(10^(-8), 9) param.start <- c(as.numeric(beta.start), tau2.start) ## EM library("SQUAREM") system.time(f1 <- fpiter(par = param.start, cyy = cyy, fixptfn = factor.em, objfn = factor.loglik, control = list(tol = 10^(-8), maxiter = 20000))) f1$fpevals ## ECME system.time(f2 <- fpiter(par = param.start, cyy = cyy, fixptfn = factor.ecme, objfn = factor.loglik, control = list(tol = 10^(-8), maxiter = 20000))) f2$fpevals ## Squared EM system.time(f3 <- squarem(par = param.start, cyy = cyy, fixptfn = factor.em, objfn = factor.loglik, control = list(tol = 10^(-8)))) f3$fpevals ## Squarem ECME system.time(f4 <- squarem(par = param.start, cyy = cyy, fixptfn = factor.ecme, objfn = factor.loglik, control = list(tol = 10^(-8)))) f4$fpevals ## compare elapsed time and iterations tem <- tsqem <- tecme <- tsqecme <- rep(NA_real_, 100) for (i in 1:100) { tem[i] <- system.time(f1 <- fpiter(par = param.start, cyy = cyy, fixptfn = factor.em, objfn = factor.loglik, control = list(tol = 10^(-8), maxiter = 20000)))[3] tsqem[i] <- system.time(f2 <- squarem(par = param.start, cyy = cyy, fixptfn = factor.em, objfn = factor.loglik, control = list(tol = 10^(-8))))[3] tecme[i] <- system.time(f3 <- fpiter(par = param.start, cyy = cyy, fixptfn = factor.ecme, objfn = factor.loglik, control = list(tol = 10^(-8), maxiter = 10000)))[3] tsqecme[i] <- system.time(f4 <- squarem(par = param.start, cyy = cyy, fixptfn = factor.ecme, objfn = factor.loglik, control = list(tol = 10^(-8))))[3] } mean(tem); sd(tem); f1$fpevals mean(tsqem); sd(tsqem); f2$fpevals mean(tecme); sd(tecme); f3$fpevals mean(tsqecme); sd(tsqecme); f4$fpevals ## Simulation Example library("wakefield") library("BiocParallel") library("ggplot2") simulate.FAEM <- function(seed) { set.seed(seed) ## generate 200*32 score dataframe generate.score <- function(a) { set.seed(a + seed) integer <- ceiling(a/8) if (integer == 1) { return(grade(200, mean = 60, sd = 5, digits = 0)) } else if(integer == 2) { return(grade(200, mean = 70, sd = 7, digits = 0)) } else if(integer == 3) { return(grade(200, mean = 80, sd = 6, digits = 0)) } else { return(grade(200, mean = 90, sd = 8, digits = 0)) } } Y.List <- bplapply(1:32, generate.score) Y.Matrix <- do.call(rbind, Y.List) Y.dat <- data.frame(t(Y.Matrix)) row.names(Y.dat) <- NULL names(Y.dat) <- paste0("Subject", 1:32) ## Compute the factor estimates ## Center the data Y.dat Y.dat <- scale(Y.dat, center = TRUE, scale = TRUE) cyy <- cov(Y.dat) ## EM Iteration factor.em <- function(param, cyy) { ## extract beta matrix and tau2 from param beta.vec <- param[1:128] beta.mat <- matrix(beta.vec, 4, 32) tau2 <- param[129:160] tau2.mat <- diag(tau2) ## compute delta/Delta inv.quantity <- solve(tau2.mat + t(beta.mat) %*% beta.mat) small.delta <- inv.quantity %*% t(beta.mat) big.delta <- diag(4) - beta.mat %*% inv.quantity %*% t(beta.mat) cyy.inverse <- t(small.delta) %*% cyy %*% small.delta + big.delta cyy.mat <- t(small.delta) %*% cyy ## update betas and taus beta.new <- solve(cyy.inverse, cyy.mat) tau.new <- diag(cyy - t(cyy.mat) %*% solve(cyy.inverse, cyy.mat)) param.new <- c(as.numeric(beta.new), tau.new) return(param.new) } ## log-likelihood function factor.loglik <- function(param, cyy) { ## extract beta matrix and tau2 from param beta.vec <- param[1:128] beta.mat <- matrix(beta.vec, 4, 32) tau2 <- param[129:160] tau2.mat <- diag(tau2) Sig <- tau2.mat + t(beta.mat) %*% beta.mat ## n=200 loglik <- -200/2 * log(det(Sig)) - 200/2 * sum(diag(solve(Sig, cyy))) return(-loglik) } ## starting value set.seed(seed + 1000) beta.start <- matrix(runif(4*32, -1, 1), 4, 32) tau2.start <- rep(10^(-8), 32) param.start <- c(as.numeric(beta.start), tau2.start) ## EM t1 <- system.time(f1 <- fpiter(par = param.start, cyy = cyy, fixptfn = factor.em, objfn = factor.loglik, control = list(tol = 10^(-8), maxiter = 200000)))[3] ## Squared EM t2 <- system.time(f2 <- squarem(par = param.start, cyy = cyy, fixptfn = factor.em, objfn = factor.loglik, control = list(tol = 10^(-8), maxiter = 20000)))[3] return(c(f1$fpevals, f2$fpevals, f1$convergence, f2$convergence, t1, t2)) } result.matrix <- matrix(0, 50, 6) for (i in 1:50) { result.matrix[i, ] <- simulate.FAEM(1000 + i) } result.dat <- data.frame(number = rep(1:50, 2), feval = c(result.matrix[, 1], result.matrix[, 2]), time = c(result.matrix[, 5], result.matrix[, 6]), group = rep(c("EM", "SQUAREM"), each = 50), convergence = c(result.matrix[, 3], result.matrix[, 4])) result.dat <- within(result.dat, convergence <- ifelse(convergence == 1, "converge", "notconverge")) ## Distribution Function of Runtime r <- ggplot(result.dat, aes(x = log(time, base = 10), colour = factor(group))) + stat_ecdf() + xlab("CPU running time") + ylab("Cumulative probability") + guides(colour = guide_legend(title = "Algorithm")) + theme_bw() + theme(plot.title = element_text(size = 20), axis.title = element_text(size = 19), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) r ## Distribution Function of Evaluations e <- ggplot(result.dat, aes(x = log(feval, base = 10), colour = factor(group))) + stat_ecdf() + xlab("The number of EM evaluations") + ylab("Cumulative probability") + guides(colour = guide_legend(title = "Algorithm")) + theme_bw() + theme(plot.title = element_text(size = 20), axis.title = element_text(size = 19), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) e ## Scatterplot of running time and iterations sr <- ggplot(data.frame(result.matrix), aes(x = X6, y = X5)) + geom_point(colour = "red") + geom_abline(slope = 10, intercept = 0, colour = "blue") + xlab("CPU running time for Squarem") + ylab("CPU running time for EM") + theme_bw() + annotate("text", label = "y = 10x", x = 5, y = 40, size = 8) + theme(plot.title = element_text(size = 20), axis.title = element_text(size = 19), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) sr se <- ggplot(data.frame(result.matrix), aes(x = X2, y = X1)) + geom_point(colour = "red") + geom_abline(slope = 10, intercept = 0, colour = "blue") + xlab("The fevals for Squarem") + ylab("The fevals for EM") + theme_bw() + annotate("text", label = "y = 10x", x = 8000, y = 50000, size = 8) + theme(plot.title = element_text(size = 20), axis.title = element_text(size = 19), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) se