library("SNPMClust") library("mclust") library("mixtools") ## used to draw ellipses in third panel of figure ## Figure 1 ## Most of the code below is copied from the package source. Certain intermediate ## data are used to illustrate the method. load("GenotypeData.Rdata") ## index of example SNP p <- 1282 indata <- prepdata(buccaltable) priorfrac <- 0.2 uncertcutoff <- 0.01 xm1 <- NA xm2 <- NA xm3 <- NA ym1 <- NA ym2 <- NA ym3 <- NA ranseed <- 1969 R.lowcutoff <- 0.05 snp <- indata$SNP[p] score <- indata$Score[p, ] pdf("fig1.pdf") par(mfrow = c(2, 2), mar = c(5, 4, 2, 2) + 0.1) plot(indata$Theta[p, ], indata$R[p, ], type = "n", xlim = c(0, 1), xlab = "Theta", ylab = "R", las = 1, main = paste("A", paste(rep(" ", 77), collapse = ""), collapse = "")) points(indata$Theta[p, indata$GType[p, ] == "AA" & indata$Score[p, ] > 0.15], indata$R[p, indata$GType[p, ] == "AA" & indata$Score[p, ] > 0.15], pch = 1, col = 4) points(indata$Theta[p, indata$GType[p, ] == "AB" & indata$Score[p, ] > 0.15], indata$R[p, indata$GType[p, ] == "AB" & indata$Score[p, ] > 0.15], pch = 2, col = 3) points(indata$Theta[p, indata$GType[p, ] == "BB" & indata$Score[p, ] > 0.15], indata$R[p, indata$GType[p, ] == "BB" & indata$Score[p, ] > 0.15], pch = 5, col = 6) points(indata$Theta[p, indata$GType[p, ] == "NC" | indata$Score[p, ] < 0.15], indata$R[p, indata$GType[p, ] == "NC" & indata$Score[p, ] < 0.15], pch = 2, col = "gray") text(0.08, 0.69, "AA", cex = 0.9) text(0.55, 0.57, "AB", cex = 0.9) text(1, 0.63, "BB", cex = 0.9) lims <- max(abs(indata$logratio[p, ])) plot(indata$logratio[p, ], indata$R.trans[p, ], xlab = "Normalized log ratio", ylab = "R-transformed", xlim = c(-lims, lims), main = paste("B", paste(rep(" ", 77), collapse = ""), collapse = ""), las = 1) R <- indata$R.trans[p, ] theta <- indata$logratio[p, ] R[indata$R[p, ] < R.lowcutoff] <- NA R[R == "NaN"] <- NA R[is.element(theta, c(-Inf, Inf, "NaN"))] <- NA R[is.na(theta)] <- NA theta[is.na(R)] <- NA priorpoints <- round(length(theta) * priorfrac) ## compute mean and variance of homozygous clusters xtmp <- theta Rtmp <- R keepit <- !is.na(theta) & !is.na(R) xtmp <- theta[keepit] Rtmp <- R[keepit] gscalltmp <- indata$GType[p, ][keepit] lowesstmp <- lowess(xtmp, Rtmp) leftmean <- lowesstmp$y[order(lowesstmp$x)][1] rightmean <- lowesstmp$y[order(lowesstmp$x)][length(lowesstmp$y)] tmpresid <- Rtmp[order(xtmp)] - lowesstmp$y[order(lowesstmp$x)] noisesd <- sd(tmpresid) pseudodata <- generatepriors(xtmp, Rtmp, gscalltmp, priorpoints, xm1 = xm1, xm2 = xm2, xm3 = xm3, ym1 = ym1, ym2 = ym2, ym3 = ym3, ranseed = ranseed) ## add pseudodata xtrans <- theta Rtrans <- R if (is.na(pseudodata)[1]) { snpdat <- data.frame(xtrans[!is.na(xtrans) & !is.na(Rtrans)], Rtrans[!is.na(xtrans) & !is.na(Rtrans)]) } else { snpdat <- data.frame(c(xtrans[!is.na(xtrans) & !is.na(Rtrans)], pseudodata[, 1]), c(Rtrans[!is.na(xtrans) & !is.na(Rtrans)], pseudodata[, 2])) } ## determine number of clusters tmptab <- table(indata$GType[p, ][is.element(indata$GType[p, ], c("AA", "AB", "BB"))]) tmptab <- tmptab[tmptab > 0] numclust <- length(tmptab) if (numclust == 0) { numclust <- 3 tmptab <- table(c("AA", "AB", "BB")) } if (dim(unique(snpdat))[1] > 5) { snpmc <- Mclust(snpdat, G = numclust:numclust, modelNames = c("EEE", "EEV")) classif <- rep(NA, length(xtrans)) uncert <- rep(NA, length(xtrans)) classif[!is.na(xtrans) & !is.na(Rtrans)] <- snpmc$classification[1:length(xtrans[!is.na(xtrans) & !is.na(Rtrans)])] uncert[!is.na(xtrans) & !is.na(Rtrans)] <- snpmc$uncertainty[1:length(xtrans[!is.na(xtrans) & !is.na(Rtrans)])] classif2 <- classif classif2[uncert > uncertcutoff] <- 4 reorder <- rank(snpmc$parameters$mean[1, ]) classif2 <- names(tmptab)[reorder[classif2]] classif2[is.na(classif2)] <- 4 callrate <- (table(classif2)["AA"] + table(classif2)["AB"] + table(classif2)["BB"])/length(classif2) calltxt <- rep("NC", length(classif2)) calltxt[classif2 == "AA"] <- "AA" calltxt[classif2 == "AB"] <- "AB" calltxt[classif2 == "BB"] <- "BB" } else { calltxt <- rep("NC", length(theta)) uncert <- rep(NA, length(theta)) } plot(indata$logratio[p, ], indata$R.trans[p, ], xlab = "Normalized log-ratio", ylab = "R-transformed", xlim = c(-lims, lims), las = 1, main = paste("C", paste(rep(" ", 77), collapse = ""), collapse = "")) points(pseudodata, col = 2, pch = 4) for (i in 1:3) { mu <- snpmc$parameters$mean[, i] sigma <- snpmc$parameters$variance$sigma[, , i] ellipse(mu, sigma, alpha = 0.5) } plot(theta, R, type = "n", main = paste("D", paste(rep(" ", 77), collapse = ""), collapse = ""), xlab = "Normalized log-ratio", ylab = "R-transformed", xlim = c(-lims, lims), las = 1) points(theta[classif2 == "AA"], R[classif2 == "AA"], col = 4, pch = 1) points(theta[classif2 == "AB"], R[classif2 == "AB"], col = 3, pch = 2) points(theta[classif2 == "BB"], R[classif2 == "BB"], col = 6, pch = 5) points(theta[classif2 == "4"], R[classif2 == "4"], pch = 4) dev.off() ## Figure 2 ## Create reference genotypes and save to RefGenos.Rdata load("GenotypeData.Rdata") rm(buccaltable) bloodtable2 <- prepdata(bloodtable) np <- bloodtable2$N * bloodtable2$P calls <- data.frame(SNP = "", SampleID = "", MClustCalls = "", Uncertainty = rep(NA, np), stringsAsFactors = FALSE) for (j in 1:bloodtable2$P) { calls[(1:bloodtable2$N + (j - 1) * bloodtable2$N), ] <- snpmclust(bloodtable2, p = j, uncertcutoff = 1)$calls } GScalls <- as.vector(t(bloodtable[, setdiff(grep("GType", names(bloodtable)), grep("Custom.GType", names(bloodtable)))])) GSscore <- as.vector(t(bloodtable[, setdiff(grep("Score", names(bloodtable)), grep("GenTrain.Score", names(bloodtable)))])) SNP <- rep(" ", length(GSscore)) N <- length(setdiff(grep("GType", names(bloodtable)), grep("Custom.GType", names(bloodtable)))) for (i in 1:dim(bloodtable)[1]) { SNP[(((i - 1) * 94) + 1):(i * 94)] <- bloodtable$Name[i] } SampleID <- rep(sub(".Theta", "", names(bloodtable)[grep("Theta", names(bloodtable))], 1, 12), length(bloodtable$Name)) bloodcalls <- data.frame(SNP, SampleID, GScalls, GSscore, stringsAsFactors = FALSE) calls <- merge(calls, bloodcalls, by = c("SNP", "SampleID")) uncertcut <- 10^(-7) scorecut <- 0.5 sum(calls$Uncertainty < uncertcut, na.rm = TRUE)/length(calls$Uncertainty) sum(calls$GSscore > scorecut, na.rm = TRUE)/length(calls$GSscore) goodcalls <- calls[is.element(calls$MClustCalls, c("AA", "AB", "BB")) & calls$MClustCalls == calls$GScalls & calls$Uncertainty < uncertcut & calls$GSscore > scorecut, ] tabl <- table(goodcalls$SNP) keepsnps <- names(tabl)[tabl == bloodtable2$N] length(keepsnps) goodcalls <- goodcalls[is.element(goodcalls$SNP, keepsnps), ] dim(goodcalls) length(unique(goodcalls$SNP)) table(goodcalls$MClustCalls, goodcalls$GScalls) RefGenos <- data.frame(SNP = as.character(goodcalls$SNP), SampleID = as.character(goodcalls$SampleID), GT = as.character(goodcalls$MClustCalls), stringsAsFactors = FALSE) ## table(table(RefGenos$SNP)) ## table(table(RefGenos$SampleID)) ## table(RefGenos$GT) ## str(RefGenos) ## save(RefGenos, file = "RefGenos.Rdata") ## Create buccal genotype calls and save to BuccalCalls.Rdata load("GenotypeData.Rdata") buccaltable2 <- prepdata(buccaltable) np <- buccaltable2$N * buccaltable2$P calls <- data.frame(SNP = "", SampleID = "", MClustCalls = "", Uncertainty = rep(NA, np), stringsAsFactors = FALSE) for (j in 1:buccaltable2$P) { calls[(1:buccaltable2$N + (j - 1) * buccaltable2$N), ] <- snpmclust(buccaltable2, p = j, uncertcutoff = 1)$calls } buccalcalls <- calls rm(calls) ## save(calls, file = "BuccalCalls.Rdata") ## rm(list = ls()) ## Create bootstrap confidence intervals for figure. This takes a while to run; ## you may wish to test using a lower value of BB. Saves results to ## MedianCutoffCI.Rdata BB <- 400 ## value used for manuscript BB <- 20 ## value used for testing load("GenotypeData.Rdata") GScalls <- as.vector(t(buccaltable[, setdiff(grep("GType", names(buccaltable)), grep("Custom.GType", names(buccaltable)))])) GSscore <- as.vector(t(buccaltable[, setdiff(grep("Score", names(buccaltable)), grep("GenTrain.Score", names(buccaltable)))])) SNP <- rep(" ", length(GSscore)) N <- length(setdiff(grep("GType", names(buccaltable)), grep("Custom.GType", names(buccaltable)))) for (i in 1:dim(buccaltable)[1]) { SNP[(((i - 1) * 94) + 1):(i * 94)] <- buccaltable$Name[i] } SampleID <- rep(substr(names(buccaltable)[grep("Theta", names(buccaltable))], 1, 12), length(buccaltable$Name)) SampleID <- sub(".Theta", "", SampleID) GScalls <- data.frame(SNP, SampleID, GScalls, GSscore, stringsAsFactors = FALSE) ## load("BuccalCalls.Rdata") calls <- merge(buccalcalls, GScalls, by = c("SNP", "SampleID")) ## load("RefGenos.Rdata") xref <- merge(calls, RefGenos, by = c("SNP", "SampleID")) xref$Uncertainty[is.na(xref$Uncertainty)] <- 1 xref$MCconc <- xref$MClustCalls == xref$GT xref$GSscore[is.na(xref$GSscore)] <- 0 xref$GSconc <- xref$GScall == xref$GT getQ <- function(snp, Qcut = 0.5) { tmp <- xref[xref$SNP == snp, "Uncertainty"] return(quantile(tmp, Qcut)) } snps <- unique(xref$SNP) quantval <- sapply(snps, function(x) getQ(x)) tmp <- data.frame(SNP = snps, Q50 = quantval) xref <- merge(xref, tmp, by = "SNP") dim(xref)[1]/dim(tmp)[1] xref$Uncertainty <- apply(cbind(xref$Uncertainty, xref$Q50), 1, max) N <- length(unique(xref$SampleID)) P <- length(unique(xref$SNP)) mcfuns <- list() set.seed(1927) for (jj in 1:BB) { tmpsnp <- as.data.frame(table(sample(unique(xref$SNP), P, replace = TRUE))) names(tmpsnp) <- c("SNP", "snps") tmpout <- merge(xref, tmpsnp, by = "SNP") tmpout$n <- tmpout$snps tmpout$mcwt <- tmpout$MCconc * tmpout$n mc <- aggregate(mcwt ~ Uncertainty, data = tmpout, FUN = "sum") tmp <- aggregate(n ~ Uncertainty, data = tmpout, FUN = "sum") mc <- merge(mc, tmp, by = "Uncertainty") rm(tmp) mc$fp <- 1 - cumsum(mc[, 2])/cumsum(mc[, 3]) mc$cr <- cumsum(mc[, 2])/sum(mc[, 2]) mcline <- approxfun(mc$cr, mc$fp) mcfuns <- append(mcfuns, mcline) } mcx <- unique(mc$cr) mc975 <- sapply(mcx, function(x) quantile(unlist(lapply(mcfuns, function(fun) fun(x))), 0.975, na.rm = TRUE)) mc025 <- sapply(mcx, function(x) quantile(unlist(lapply(mcfuns, function(fun) fun(x))), 0.025, na.rm = TRUE)) ## save(mcx, mc975, mc025, file = "MedianCutoffCI.Rdata") ## rm(list = ls()) ## Merge GenomeStudio, SNPMClust, and reference genotype calls, and plot results. load("GenotypeData.Rdata") GScalls <- as.vector(t(buccaltable[, setdiff(grep("GType", names(buccaltable)), grep("Custom.GType", names(buccaltable)))])) GSscore <- as.vector(t(buccaltable[, setdiff(grep("Score", names(buccaltable)), grep("GenTrain.Score", names(buccaltable)))])) SNP <- rep(" ", length(GSscore)) N <- length(setdiff(grep("GType", names(buccaltable)), grep("Custom.GType", names(buccaltable)))) for (i in 1:dim(buccaltable)[1]) { SNP[(((i - 1) * 94) + 1):(i * 94)] <- buccaltable$Name[i] } SampleID <- rep(substr(names(buccaltable)[grep("Theta", names(buccaltable))], 1, 12), length(buccaltable$Name)) SampleID <- sub(".Theta", "", SampleID) GScalls <- data.frame(SNP, SampleID, GScalls, GSscore, stringsAsFactors = FALSE) ## load("BuccalCalls.Rdata") calls <- merge(buccalcalls, GScalls, by = c("SNP", "SampleID")) ## load("RefGenos.Rdata") xref <- merge(calls, RefGenos, by = c("SNP", "SampleID")) pdf("fig2.pdf") par(mar = c(5, 4, 2, 2) + 0.1) plot(1, 1, type = "n", xlim = c(60, 100), ylim = c(0, 0.8), las = 1, xlab = "Overall call rate (%)", ylab = "False call rate (%)") ## load("MedianCutoffCI.Rdata") mcup <- lowess(mcx, mc975, f = 0.1) mclo <- lowess(mcx, mc025, f = 0.1) polygon(x = c(100 * mclo$x, 100 * mcup$x[length(mcup$x):1]), y = c(100 * mclo$y, 100 * mcup$y[length(mcup$y):1]), col = "lightgray", border = FALSE) getQ <- function(snp, Qcut = 0) { tmp <- xreftmp[xreftmp$SNP == snp, "GSscore"] return(quantile(tmp, Qcut)) } qval <- c(0.1, 0.3, 0.5, 1) txt <- c("No quantile cutoff", "Q=0.50 (GC50)", "Q=0.70 (GC30)", "Q=0.90 (GC10)") lwgt <- c(1, 1, 1, 2) colr <- c(4, 3, 2, 1) snps <- unique(xref$SNP) for (j in 1:4) { xreftmp <- xref xreftmp$Uncertainty[is.na(xreftmp$Uncertainty)] <- 1 xreftmp$MCconc <- xreftmp$MClustCalls == xreftmp$GT xreftmp$GSscore[is.na(xreftmp$GSscore)] <- 0 xreftmp$GSconc <- xreftmp$GScall == xreftmp$GT quantval <- sapply(snps, function(x) getQ(x, qval[j])) tmp <- data.frame(SNP = snps, Q85 = quantval, stringsAsFactors = FALSE) xreftmp <- merge(xreftmp, tmp, by = "SNP") xreftmp$GSscore <- apply(cbind(xreftmp$GSscore, xreftmp$Q85), 1, min) mc <- aggregate(MCconc ~ Uncertainty, data = xreftmp, FUN = "sum") tmp <- aggregate(MCconc ~ Uncertainty, data = xreftmp, FUN = "length") mc <- merge(mc, tmp, by = "Uncertainty") rm(tmp) mc$fp <- 1 - cumsum(mc[, 2])/cumsum(mc[, 3]) mc$cr <- cumsum(mc[, 2])/sum(mc[, 2]) mcline <- approxfun(100 * mc$cr, 100 * mc$fp) gs <- aggregate(GSconc ~ GSscore, data = xreftmp, FUN = "sum") tmp <- aggregate(GSconc ~ GSscore, data = xreftmp, FUN = "length") gs <- merge(gs, tmp, by = "GSscore") rm(tmp) gs <- gs[order(-gs$GSscore), ] gs$fp <- 1 - cumsum(gs[, 2])/cumsum(gs[, 3]) gs$cr <- cumsum(gs[, 2])/sum(gs[, 2]) gsline <- approxfun(100 * gs$cr, 100 * gs$fp) curve(gsline, add = TRUE, col = colr[j], lwd = lwgt[j]) } qval <- c(0.9, 0.7, 0.5, 0) getQ <- function(snp, Qcut = 0) { tmp <- xreftmp[xreftmp$SNP == snp, "Uncertainty"] return(quantile(tmp, Qcut)) } for (j in 1:4) { xreftmp <- xref xreftmp$Uncertainty[is.na(xreftmp$Uncertainty)] <- 1 xreftmp$MCconc <- xreftmp$MClustCalls == xreftmp$GT xreftmp$GSscore[is.na(xreftmp$GSscore)] <- 0 xreftmp$GSconc <- xreftmp$GScall == xreftmp$GT snps <- unique(xreftmp$SNP) quantval <- sapply(snps, function(x) getQ(x, qval[j])) tmp <- data.frame(SNP = snps, Q85 = quantval) xreftmp <- merge(xreftmp, tmp, by = "SNP") xreftmp$Uncertainty <- apply(cbind(xreftmp$Uncertainty, xreftmp$Q85), 1, max) mc <- aggregate(MCconc ~ Uncertainty, data = xreftmp, FUN = "sum") tmp <- aggregate(MCconc ~ Uncertainty, data = xreftmp, FUN = "length") mc <- merge(mc, tmp, by = "Uncertainty") rm(tmp) mc$fp <- 1 - cumsum(mc[, 2])/cumsum(mc[, 3]) mc$cr <- cumsum(mc[, 2])/sum(mc[, 2]) mcline <- approxfun(100 * mc$cr, 100 * mc$fp) gs <- aggregate(GSconc ~ GSscore, data = xreftmp, FUN = "sum") tmp <- aggregate(GSconc ~ GSscore, data = xreftmp, FUN = "length") gs <- merge(gs, tmp, by = "GSscore") rm(tmp) gs <- gs[order(-gs$GSscore), ] gs$fp <- 1 - cumsum(gs[, 2])/cumsum(gs[, 3]) gs$cr <- cumsum(gs[, 2])/sum(gs[, 2]) gsline <- approxfun(gs$cr, gs$fp) curve(mcline, add = TRUE, col = colr[j], lwd = lwgt[j]) } par(lend = 1) legend(60, 0.8, legend = c(txt, "SNPMClust (Q=0.50) 95% CI"), lwd = c(lwgt[4:1], 15), col = c(colr[4:1], "lightgray")) text(70, 0.25, "GenCall") text(85, 0.15, "SNPMClust") dev.off()