################################################################################################# ## Manuscript: Pseudo-Ranks: How to Calculate Them Efficiently in R ## ## Author: Martin Happ, Georg Zimmermann, Edgar Brunner, Arne C. Bathke ## ## Description: This file generates Tables 5, 6, 7, 8 as well as Figures 1, 2, 3 and ## reproduces the example from Section 'Application of pseudo-ranks' ## Note that the evaluation of this script takes very long (several days for the simulations). ## Results (Tables and Figures) are saved in the current working directory. ################################################################################################# # load packages library("compiler") library("data.table") library("pracma") library("pseudorank") library("xtable") library("ggplot2") library("doBy") suppressWarnings(RNGversion("3.5")) # number each algorithm is run for microbenchmark nrun <- 10^3 ## Section 3.3: Examples for the usage of the function 'pseudorank' ###################################################################################### df <- data.frame(data = c(1, 2, 2, 3, 4), group = as.factor(c(1, 1, 2, 2, 3))) pseudorank(df$data, df$group) pseudorank(data ~ group, df) df <- data.frame(data = c(NA, 2, 2, 3, 4), group = as.factor(c(1, 1, 2, 2, 3))) pseudorank(data ~ group, data = df, na.last = TRUE) pseudorank(data ~ group, data = df, na.last = FALSE) pseudorank(data ~ group, data = df, na.last = NA) df <- data.frame(data = c(1, 7, 1, 2, 3, 3, 5.5, 6, 7), group = as.factor( c(1, 1, 1, 2, 2, 3, 3, 3, 3))) pseudorank(df$data, df$group, ties.method = "max") ## Section 5: Example 'Application of pseudo-ranks' ###################################################################################### # artificial data set data("ParadoxicalRanks", package = "pseudorank") dat <- ParadoxicalRanks N <- dim(dat)[1] dat[, "ranks"] <- (rank(dat[, "score"]) - 1/2)/N dat[, "pseudoranks"] <- (pseudorank(score ~ conc, data = dat) - 1/2)/N summaryBy(score + ranks + pseudoranks ~ conc, data = dat, FUN = mean) tab5_result <- quote(print(xtable(summaryBy(score + ranks + pseudoranks ~ conc, data = dat, FUN = mean)), include.rownames = FALSE)) # generate larger samples based on artificial data set.seed(1) n <- c(60, 360, 120) x1 <- sample(subset(dat, dat$conc == 1)$score, n[1], replace = TRUE) x2 <- sample(subset(dat, dat$conc == 2)$score, n[2], replace = TRUE) x3 <- sample(subset(dat, dat$conc == 3)$score, n[3], replace = TRUE) dat2 <- data.frame(score = c(x1, x2, x3), conc = factor(c(rep(1, n[1]), rep(2, n[2]), rep(5, n[3])), ordered = TRUE)) # Hettmansperger-Norton test hettmansperger_norton_test(score ~ conc, data = dat2, pseudoranks = FALSE, alternative = "increasing") hettmansperger_norton_test(score ~ conc, data = dat2, pseudoranks = TRUE, alternative = "increasing") hettmansperger_norton_test(score ~ conc, data = dat, pseudoranks = TRUE)$pHat ## Definition of different pseudo-rank algorithm ###################################################################################### # count-algorithm generalizedRanks_tmp <- function(dat, n, t, method = "unweighted") { a <- length(n) X <- dat[, data] nobs <- length(X) granks <- pr <- rep(0, nobs) # pseudo-ranks if (method == "unweighted") { weights <- rep(1/n[1], n[1]*t) for (i in 2:a) { weights <- c(weights, rep(1/n[i], n[i]*t)) } ff <- function(k) { return(((sign(X[k] - t(X)) + 1)*0.5) %*% (weights)*sum(n)/a + 0.5) } granks <- vapply(1:nobs, FUN = ff, FUN.VALUE = numeric(1L)) } # ranks else if (method == "weighted") { granks <- ranks(dat[, data], ties.method = "average") } else { granks <- ranks(dat[, data], ties.method = "average") } return(granks) } generalizedRanks <- cmpfun(generalizedRanks_tmp) # AB-algorithm AB_tmp <- function(dat, n, d, lambda, kgv) { if (max(lambda) > 1) { len <- dim(dat)[1] prData <- list(dat) z <- levels(dat[, group]) a <- nlevels(dat[, group]) # amplify data to artificially create balanced groups for (i in 1:a) { prData[[i+1]] <- dat[group == z[i]][rep(1:(n[i]*d), each = (lambda[i] - 1)), ] } dat <- rbindlist(prData) dat[, data := (rank(dat[, data], ties.method = "average") - 1/2)*1/(kgv*a*d)] dat <- dat[1:len, ] } else { dat[, data := rank(dat[, data], ties.method = "average")] } # select original observations from amplified data return(dat[, data]) } AB <- cmpfun(AB_tmp) # pairwise alogrithm pairwise_tmp <- function(data, group, n) { group <- factor(group, labels = 1:length(n)) df <- data.frame(data = data, group = group) a <- length(n) N <- sum(n) samples <- split(df, df$group) W <- kronecker(t(rep(1/a, a)), diag(a)) samplesR <- lapply(1:a, function(arg) { helpmat <- rbind(1:a, matrix(1:a, nrow = a, ncol = a)) x1 <- samples[[helpmat[1, arg]]] x1 <- x1$data help <- 0 for (j in 1:a) { x2 <- samples[[helpmat[j+1, arg]]] x2 <- x2$data help <- help + 1/length(x2)*(rank(c(x1, x2))[1:length(x1)] - rank(x1)) } N/a*help + 1/2 }) return(samplesR) } pairwise <- cmpfun(pairwise_tmp) ## Table 6, Figure 1 ############################################################################################## cat("Benchmark 1\n") # vector for group sizes m <- c(10, 100, 1000, 2000, 4000, 6000, 8000, 10000) timeRank <- rep(0, length(m)) timeAB <- rep(0, length(m)) timePRank <- rep(0, length(m)) timePRankCpp <- rep(0, length(m)) timePRankPair <- rep(0, length(m)) timePRankDirect <- rep(0, length(m)) for (l in 1:length(m)) { n <- c(m[l], m[l], 2*m[l]) kgv <- Reduce(Lcm, n) lambda <- kgv/n cat("l: ", l, "/", length(m), "\n") t <- 1 group1p1 = matrix( rnorm(n[[1]], 0, 1), ncol = 1, nrow = n[1]) group2p1 = matrix( rnorm(n[[2]], 0, 1), ncol = 1, nrow = n[2]) group3p1 = matrix( rnorm(n[[3]], 0, 1), ncol = 1, nrow = n[3]) g1p1 = data.frame(data = group1p1, group = 1) g2p1 = data.frame(data = group2p1, group = 2) g3p1 = data.frame(data = group3p1, group = 3) X = as.data.frame(rbind(g1p1, g2p1, g3p1)) X$group <- as.factor(X$group) X <- as.data.table(X) ord <- order(X$data) # perform benchmarks benchPRankCpp <- microbenchmark::microbenchmark(pseudorank(X$data, X$group), times = nrun, unit = "ms") benchRank <- microbenchmark::microbenchmark(rank(X$data, ties.method = "average"), times = nrun, unit = "ms") benchAB <- microbenchmark::microbenchmark(AB(X, n, t, lambda, kgv)*sum(n)*t + 0.5, times = nrun, unit = "ms") benchPair <- microbenchmark::microbenchmark(pairwise(X$data, X$group, n), times = nrun, unit = "ms") benchDirect <- microbenchmark::microbenchmark(generalizedRanks(X, n, t), times = nrun, unit = "ms") timePRankCpp[l] <- summary(benchPRankCpp)$median timeAB[l] <- summary(benchAB)$median timeRank[l] <- summary(benchRank)$median timePRankPair[l] <- summary(benchPair)$median timePRankDirect[l] <- summary(benchDirect)$median } df <- data.frame(dimension = m, Ranks = timeRank, PseudoRankCpp = timePRankCpp, AB = timeAB, PseudoRankPair = timePRankPair, PseudoRankDirect = timePRankDirect) df_three_groups <- data.frame(Method = c(rep("Ranks", length(m)), rep("RECPR", length(m)), rep("AB", length(m)), rep("Pairwise", length(m)), rep("count", length(m))), n = df$dimension) df_three_groups$time <- c(df$Ranks, df$PseudoRanks, df$PseudoRankCpp, df$AB, df$PseudoRankPair , df$PseudoRankDirect) df_three_groups$logtime <- log(df_three_groups$time) p1 <- ggplot(data = df_three_groups, aes(x = n, y = logtime, group = Method, col = Method)) + geom_line(aes(linetype = Method)) + geom_point() + scale_linetype_manual(values = c("solid", "solid", "solid", "solid", "solid")) + xlab("Sample Size n") + ylab("Log Computation Time") + # theme(axis.text = element_text(size = 15), axis.title = element_text(size = 15, face = "bold")) + theme(axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0))) + theme(axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0))) + scale_y_continuous(limits = c(-4, 6)) + theme_bw(base_size = 12) + theme(# Adjust legend position to maximize space, use a vector of proportion # across the plot and up the plot where you want the legend. # You can also use "left", "right", "top", "bottom", for legends on # the side of the plot legend.position = "none", # increase the font size text = element_text(size = 14)) + annotate("text", label = "count", x = 1100, y = 4, color = "orange") + annotate("text", label = "pairwise", x = 3000, y = 3.2, color = "darkgreen") + annotate("text", label = "AB", x = 3000, y = 2, color = "red") + annotate("text", label = "RECPR", x = 2000, y = 3/4, color = "purple") + annotate("text", label = "ranks", x = 3010, y = -0.5, color = "blue") ggsave(file = "three_groups.pdf", plot = p1, width = 6, height = 4) ## Table 7, Figure 2 ################################################################################################# cat("Benchmark 2\n") # vector of group sizes m <- c(10, 100, 1000, 2000, 4000, 6000, 8000, 10000) timeRank <- rep(0, length(m)) timeAB <- rep(0, length(m)) timePRank <- rep(0, length(m)) timePRankCpp <- rep(0, length(m)) timePRankPair <- rep(0, length(m)) timePRankDirect <- rep(0, length(m)) timesIQR <- data.frame(dim = gl(length(m), 4, labels = m), lower = 1:4, upper = 1:4) # Setting n <- c(20, 20, 40) # sample sizes N <- sum(n) alpha <- 0.05 kgv <- Reduce(Lcm, n) lambda <- kgv/n for (l in 1:length(m)) { n <- c(m[l], m[l], m[l], m[l], 2*m[l]) kgv <- Reduce(Lcm, n) lambda <- kgv/n cat("l: ", l, "/", length(m), "\n") t <- 1 group1p1 <- round(matrix( rnorm(n[[1]], 0, 1), ncol = 1, nrow = n[1])) group2p1 <- round(matrix( rnorm(n[[2]], 0, 1), ncol = 1, nrow = n[2])) group3p1 <- round(matrix( rnorm(n[[3]], 0, 1), ncol = 1, nrow = n[3])) group4p1 <- round(matrix( rnorm(n[[4]], 0, 1), ncol = 1, nrow = n[4])) group5p1 <- round(matrix( rnorm(n[[5]], 0, 1), ncol = 1, nrow = n[5])) g1p1 <- data.frame(data = group1p1, group = 1) g2p1 <- data.frame(data = group2p1, group = 2) g3p1 <- data.frame(data = group3p1, group = 3) g4p1 <- data.frame(data = group4p1, group = 4) g5p1 <- data.frame(data = group5p1, group = 5) X <- as.data.frame(rbind(g1p1, g2p1, g3p1, g4p1, g5p1)) X$group <- as.factor(X$group) X <- as.data.table(X) # perform benchmarks benchPRankCpp <- microbenchmark::microbenchmark(pseudorank(X$data, X$group), times = nrun, unit = "ms") benchRank <- microbenchmark::microbenchmark(rank(X$data, ties.method = "average"), times = nrun, unit = "ms") benchAB <- microbenchmark::microbenchmark(AB(X, n, t, lambda, kgv)*sum(n)*t+0.5, times = nrun, unit = "ms") benchPair <- microbenchmark::microbenchmark(pairwise(X$data, X$group, n), times = nrun, unit = "ms") benchDirect <- microbenchmark::microbenchmark(generalizedRanks(X, n, t), times = nrun, unit = "ms") timePRankCpp[l] <- summary(benchPRankCpp)$median timeAB[l] <- summary(benchAB)$median timeRank[l] <- summary(benchRank)$median timePRankPair[l] <- summary(benchPair)$median timePRankDirect[l] <- summary(benchDirect)$median } df5 <- data.frame(dimension = m, Ranks = timeRank, PseudoRankCpp = timePRankCpp, AB = timeAB, PseudoRankPair = timePRankPair, PseudoRankDirect = timePRankDirect) df_five_groups <- data.frame(Method = c(rep("Ranks", length(m)), rep("Stepwise", length(m)), rep("AB", length(m)), rep("Pairwise", length(m)), rep("Direct", length(m))), n = df5$dimension) df_five_groups$time <- c(df5$Ranks, df5$PseudoRanks, df5$PseudoRankCpp, df5$AB, df5$PseudoRankPair, df5$PseudoRankDirect) df_five_groups$logtime <- log(df_five_groups$time) p2 <- ggplot(data = df_five_groups, aes(x = n, y = logtime, group = Method, col = Method)) + geom_line() + geom_point() + xlab("Sample Size n") + ylab("Log Computation Time") + #theme(axis.text = element_text(size = 15), axis.title = element_text(size = 15, face = "bold")) + theme(axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0))) + theme(axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0))) + theme_bw(base_size = 12) + scale_y_continuous(limits = c(-3, 8)) + annotate("text", label = "count", x = 1000, y = 4, color = "orange") + annotate("text", label = "pairwise", x = 3000, y = 4, color = "darkgreen") + annotate("text", label = "AB", x = 3000, y = 2, color = "red") + annotate("text", label = "RECPR", x = 2000, y = 1, color = "purple") + annotate("text", label = "ranks", x = 3010, y = 0, color = "blue") + theme(# Adjust legend position to maximize space, use a vector of proportion # across the plot and up the plot where you want the legend. # You can also use "left", "right", "top", "bottom", for legends on # the side of the plot legend.position = "none", # increase the font size text = element_text(size = 14)) ggsave(file = "five_groups.pdf", plot = p2, width = 6, height = 4) ## Table 8, Figure 3 ################################################################################################ cat("Benchmark 3\n") # group sizes m <- c(10, seq(from = 100, to = 1000, by = 100)) timeRank <- rep(0, length(m)) timeAB <- rep(0, length(m)) timePRank <- rep(0, length(m)) timePRankCpp <- rep(0, length(m)) timePRankPair <- rep(0, length(m)) timePRankR <- rep(0, length(m)) timePRankDirect <- rep(0, length(m)) timesIQR <- data.frame(dim = gl(length(m), 4, labels = m), lower = 1:4, upper = 1:4) # Setting n <- c(20, 20, 40) # sample sizes N <- sum(n) alpha <- 0.05 kgv <- Reduce(Lcm, n) lambda <- kgv/n for (l in 1:length(m)) { n <- c(m[l], m[l], m[l], m[l], m[l], m[l], m[l], m[l], m[l], m[l], m[l], 2*m[l]) kgv <- Reduce(Lcm, n) lambda <- kgv/n cat("l: ", l, "/", length(m), "\n") t <- 1 group1p1 <- round(matrix( rnorm(n[[1]], 0, 1), ncol = 1, nrow = n[1])) group2p1 <- round(matrix( rnorm(n[[2]], 0, 1), ncol = 1, nrow = n[2])) group3p1 <- round(matrix( rnorm(n[[3]], 0, 1), ncol = 1, nrow = n[3])) group4p1 <- round(matrix( rnorm(n[[4]], 0, 1), ncol = 1, nrow = n[4])) group5p1 <- round(matrix( rnorm(n[[5]], 0, 1), ncol = 1, nrow = n[5])) group6p1 <- round(matrix( rnorm(n[[6]], 0, 1), ncol = 1, nrow = n[6])) group7p1 <- round(matrix( rnorm(n[[7]], 0, 1), ncol = 1, nrow = n[7])) group8p1 <- round(matrix( rnorm(n[[8]], 0, 1), ncol = 1, nrow = n[8])) group9p1 <- round(matrix( rnorm(n[[9]], 0, 1), ncol = 1, nrow = n[9])) group10p1 <- round(matrix( rnorm(n[[10]], 0, 1), ncol = 1, nrow = n[10])) group11p1 <- round(matrix( rnorm(n[[11]], 0, 1), ncol = 1, nrow = n[11])) group12p1 <- round(matrix( rnorm(n[[12]], 0, 1), ncol = 1, nrow = n[12])) g1p1 <- data.frame(data = group1p1, group = 1) g2p1 <- data.frame(data = group2p1, group = 2) g3p1 <- data.frame(data = group3p1, group = 3) g4p1 <- data.frame(data = group4p1, group = 4) g5p1 <- data.frame(data = group5p1, group = 5) g6p1 <- data.frame(data = group6p1, group = 6) g7p1 <- data.frame(data = group7p1, group = 7) g8p1 <- data.frame(data = group8p1, group = 8) g9p1 <- data.frame(data = group9p1, group = 9) g10p1 <- data.frame(data = group10p1, group = 10) g11p1 <- data.frame(data = group11p1, group = 11) g12p1 <- data.frame(data = group12p1, group = 12) X <- as.data.frame(rbind(g1p1, g2p1, g3p1, g4p1, g5p1, g6p1, g7p1, g8p1, g9p1, g10p1, g11p1, g12p1)) X$group <- as.factor(X$group) X <- as.data.table(X) # perform benchmarks benchPRankCpp <- microbenchmark::microbenchmark(pseudorank(X$data, X$group), times = nrun, unit = "ms") benchRank <- microbenchmark::microbenchmark(rank(X$data, ties.method = "average"), times = nrun, unit = "ms") benchAB <- microbenchmark::microbenchmark(AB(X, n, t, lambda, kgv)*sum(n)*t+0.5, times = nrun, unit = "ms") benchPair <- microbenchmark::microbenchmark(pairwise(X$data, X$group, n), times = nrun, unit = "ms") benchDirect <- microbenchmark::microbenchmark(generalizedRanks(X, n, t), times = nrun, unit = "ms") timePRankCpp[l] <- summary(benchPRankCpp)$median timeAB[l] <- summary(benchAB)$median timeRank[l] <- summary(benchRank)$median timePRankPair[l] <- summary(benchPair)$median timePRankDirect[l] <- summary(benchDirect)$median } df12 <- data.frame(dimension = m, Ranks = timeRank, PseudoRankCpp = timePRankCpp, AB = timeAB, PseudoRankPair = timePRankPair, PseudoRankDirect = timePRankDirect) df_twelve_groups <- data.frame(Method = c(rep("Ranks", length(m)), rep("RECPR", length(m)), rep("AB", length(m)), rep("Pairwise", length(m)), rep("count", length(m))), n = df12$dimension) df_twelve_groups$time <- c(df12$Ranks, df12$PseudoRanks, df12$PseudoRankCpp, df12$AB, df12$PseudoRankPair, df12$PseudoRankDirect) df_twelve_groups$logtime <- log(df_twelve_groups$time) p3 <- ggplot(data = df_twelve_groups, aes(x = n, y = logtime, group = Method, col = Method)) + geom_line() + geom_point() + xlab("Sample Size n") + ylab("Log Computation Time") + # theme(axis.text = element_text(size = 15), axis.title = element_text(size = 15, face = "bold")) + theme(axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0))) + theme(axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0))) + theme_bw(base_size = 12) + scale_y_continuous(limits = c(-4, 8)) + annotate("text", label = "count", x = 100, y = 4.5, color = "orange") + annotate("text", label = "pairwise", x = 300, y = 4, color = "darkgreen") + annotate("text", label = "AB", x = 300, y = 2, color = "red") + annotate("text", label = "RECPR", x = 200, y = 0, color = "purple") + annotate("text", label = "ranks", x = 301, y = -2, color = "blue") + theme(# Adjust legend position to maximize space, use a vector of proportion # across the plot and up the plot where you want the legend. # You can also use "left", "right", "top", "bottom", for legends on # the side of the plot legend.position = "none", # increase the font size text = element_text(size = 14)) ggsave(file = "twelve_groups.pdf", plot = p3, width = 6, height = 4) ## Save tables in working directory as .tex files #################################################################################### directory <- getwd() fileConn <- file(file.path(directory, "Table_5.tex")) writeLines(eval(tab5_result), fileConn) close(fileConn) fileConn <- file(file.path(directory, "Table_6.tex")) writeLines(print(xtable(df), include.rownames = FALSE), fileConn) close(fileConn) fileConn <- file(file.path(directory, "Table_7.tex")) writeLines(print(xtable(df5), include.rownames = FALSE), fileConn) close(fileConn) fileConn <- file(file.path(directory, "Table_8.tex")) writeLines(print(xtable(df12), include.rownames = FALSE), fileConn) close(fileConn)