#################################################################################################
## Manuscript: Pseudo-Ranks: How to Calculate Them Efficiently in R
##
## Author: Martin Happ, Georg Zimmermann, Edgar Brunner, Arne C. Bathke
##
## Description: This file generates Tables 5, 6, 7, 8 as well as Figures 1, 2, 3 and 
##    reproduces the example from Section 'Application of pseudo-ranks' 
##    Note that the evaluation of this script takes very long (several days for the simulations).
##    Results (Tables and Figures) are saved in the current working directory.
#################################################################################################

# load packages
library("compiler")
library("data.table")
library("pracma")
library("pseudorank")
library("xtable")
library("ggplot2")
library("doBy")

suppressWarnings(RNGversion("3.5"))

# number each algorithm is run for microbenchmark
nrun <- 10^3

## Section 3.3: Examples for the usage of the function 'pseudorank'
######################################################################################

df <- data.frame(data = c(1, 2, 2, 3, 4), group = as.factor(c(1, 1, 2, 2, 3)))
pseudorank(df$data, df$group)
pseudorank(data ~ group, df)

df <- data.frame(data = c(NA, 2, 2, 3, 4), group = as.factor(c(1, 1, 2, 2, 3)))
pseudorank(data ~ group, data = df, na.last = TRUE)
pseudorank(data ~ group, data = df, na.last = FALSE)
pseudorank(data ~ group, data = df, na.last = NA)

df <- data.frame(data = c(1, 7, 1, 2, 3, 3, 5.5, 6, 7),
                 group = as.factor( c(1, 1, 1, 2, 2, 3, 3, 3, 3)))
pseudorank(df$data, df$group, ties.method = "max")

## Section 5: Example 'Application of pseudo-ranks'
######################################################################################

# artificial data set
data("ParadoxicalRanks", package = "pseudorank")
dat <- ParadoxicalRanks
N <- dim(dat)[1]
dat[, "ranks"] <- (rank(dat[, "score"]) - 1/2)/N
dat[, "pseudoranks"] <- (pseudorank(score ~ conc, data = dat) - 1/2)/N

summaryBy(score + ranks + pseudoranks ~ conc, data = dat, FUN = mean)

tab5_result <- quote(print(xtable(summaryBy(score + ranks + pseudoranks ~ conc, data = dat, FUN = mean)), 
                           include.rownames = FALSE))

# generate larger samples based on artificial data
set.seed(1)
n <- c(60, 360, 120)
x1 <- sample(subset(dat, dat$conc ==  1)$score, n[1], replace = TRUE)
x2 <- sample(subset(dat, dat$conc == 2)$score, n[2], replace = TRUE)
x3 <- sample(subset(dat, dat$conc == 3)$score, n[3], replace = TRUE)
dat2 <- data.frame(score = c(x1, x2, x3), conc = factor(c(rep(1, n[1]), rep(2, n[2]), rep(5, n[3])), 
  ordered = TRUE))

# Hettmansperger-Norton test
hettmansperger_norton_test(score ~ conc, data = dat2, pseudoranks = FALSE, 
  alternative = "increasing")
hettmansperger_norton_test(score ~ conc, data = dat2, pseudoranks = TRUE, 
  alternative = "increasing")

hettmansperger_norton_test(score ~ conc, data = dat, pseudoranks = TRUE)$pHat

## Definition of different pseudo-rank algorithm
######################################################################################

# count-algorithm
generalizedRanks_tmp <- function(dat, n, t, method = "unweighted") {
  a <- length(n)
  X <- dat[, data]
  nobs <- length(X)
  granks <- pr <- rep(0, nobs)
  
  # pseudo-ranks
  if (method == "unweighted") {
    weights <- rep(1/n[1], n[1]*t)
    for (i in 2:a) {
      weights <- c(weights, rep(1/n[i], n[i]*t))
    }
    ff <- function(k) {
      return(((sign(X[k] - t(X)) + 1)*0.5) %*% (weights)*sum(n)/a + 0.5)
    }
    granks <- vapply(1:nobs, FUN = ff, FUN.VALUE = numeric(1L))
  }
  # ranks
  else if (method == "weighted") {
    granks <- ranks(dat[, data], ties.method = "average")
  }
  else {
    granks <- ranks(dat[, data], ties.method = "average")
  }
  return(granks)
}

generalizedRanks <- cmpfun(generalizedRanks_tmp)

# AB-algorithm
AB_tmp <- function(dat, n, d, lambda, kgv) {
  if (max(lambda) > 1) {
    len <- dim(dat)[1]
    prData <- list(dat)
    z <- levels(dat[, group])
    a <- nlevels(dat[, group])
    
    # amplify data to artificially create balanced groups
    for (i in 1:a) {
      prData[[i+1]] <- dat[group == z[i]][rep(1:(n[i]*d), each = (lambda[i] - 1)), ]
    }
    dat <- rbindlist(prData)
    dat[, data := (rank(dat[, data], ties.method = "average") - 1/2)*1/(kgv*a*d)]
    dat <- dat[1:len, ]
  } else {
    dat[, data := rank(dat[, data], ties.method = "average")] 
  }
  
  # select original observations from amplified data
  return(dat[, data])
}
AB <- cmpfun(AB_tmp)

# pairwise alogrithm
pairwise_tmp <- function(data, group, n) {
  group <- factor(group, labels = 1:length(n))
  df <- data.frame(data = data, group = group)
  a <- length(n)
  N <- sum(n)
  
  samples <- split(df, df$group)
  W <- kronecker(t(rep(1/a, a)), diag(a))
  samplesR <- lapply(1:a, function(arg) {
    helpmat <- rbind(1:a, matrix(1:a, nrow = a, ncol = a))
    x1 <- samples[[helpmat[1, arg]]]
    x1 <- x1$data
    help <- 0
    for (j in 1:a) {
      x2 <- samples[[helpmat[j+1, arg]]]
      x2 <- x2$data
      help <- help + 1/length(x2)*(rank(c(x1, x2))[1:length(x1)] - rank(x1))
    }
    N/a*help + 1/2
  })
  return(samplesR)
}

pairwise <- cmpfun(pairwise_tmp)

## Table 6, Figure 1
##############################################################################################
cat("Benchmark 1\n")

# vector for group sizes
m <- c(10, 100, 1000, 2000, 4000, 6000, 8000, 10000)

timeRank <- rep(0, length(m))
timeAB <- rep(0, length(m))
timePRank <- rep(0, length(m))
timePRankCpp <- rep(0, length(m))
timePRankPair <- rep(0, length(m))
timePRankDirect <- rep(0, length(m))


for (l in 1:length(m)) {
  n <- c(m[l], m[l], 2*m[l])
  kgv <- Reduce(Lcm, n)
  lambda <- kgv/n
  
  cat("l: ", l, "/", length(m), "\n")
  t <- 1
  group1p1 =  matrix( rnorm(n[[1]], 0, 1), ncol = 1, nrow = n[1])
  group2p1 =  matrix( rnorm(n[[2]], 0, 1), ncol = 1, nrow = n[2])
  group3p1 =  matrix( rnorm(n[[3]], 0, 1), ncol = 1, nrow = n[3])
  
  g1p1 = data.frame(data = group1p1, group = 1)
  g2p1 = data.frame(data = group2p1, group = 2)
  g3p1 = data.frame(data = group3p1, group = 3)
  
  X = as.data.frame(rbind(g1p1, g2p1, g3p1))
  X$group <- as.factor(X$group)
  
  X <- as.data.table(X)
  ord <- order(X$data)
  

  # perform benchmarks
  benchPRankCpp <- microbenchmark::microbenchmark(pseudorank(X$data, X$group), times = nrun, 
    unit = "ms")
  benchRank <- microbenchmark::microbenchmark(rank(X$data, ties.method = "average"), times = nrun, 
    unit = "ms")
  benchAB <- microbenchmark::microbenchmark(AB(X, n, t, lambda, kgv)*sum(n)*t + 0.5, times = nrun, 
    unit = "ms")
  benchPair <- microbenchmark::microbenchmark(pairwise(X$data, X$group, n), times = nrun, 
    unit = "ms")
  benchDirect <- microbenchmark::microbenchmark(generalizedRanks(X, n, t), times = nrun, 
    unit = "ms")
  
  timePRankCpp[l] <- summary(benchPRankCpp)$median
  timeAB[l] <- summary(benchAB)$median
  timeRank[l] <- summary(benchRank)$median
  timePRankPair[l] <- summary(benchPair)$median
  timePRankDirect[l] <- summary(benchDirect)$median

  
}

df <- data.frame(dimension = m, Ranks = timeRank, PseudoRankCpp = timePRankCpp, AB = timeAB, 
                 PseudoRankPair = timePRankPair, PseudoRankDirect = timePRankDirect)

df_three_groups <- data.frame(Method = c(rep("Ranks", length(m)), rep("RECPR", length(m)), 
                                         rep("AB", length(m)), rep("Pairwise", length(m)), 
                                         rep("count", length(m))), n = df$dimension)
df_three_groups$time <- c(df$Ranks, df$PseudoRanks, df$PseudoRankCpp, 
                          df$AB, df$PseudoRankPair , df$PseudoRankDirect)
df_three_groups$logtime <- log(df_three_groups$time)

p1 <- ggplot(data = df_three_groups, aes(x = n, y = logtime, group = Method, col = Method)) +
  geom_line(aes(linetype = Method)) +
  geom_point() +
  scale_linetype_manual(values = c("solid", "solid", "solid", "solid", "solid")) +
  xlab("Sample Size n") +
  ylab("Log Computation Time") +
  # theme(axis.text = element_text(size = 15), axis.title = element_text(size = 15, face = "bold")) +
  theme(axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0))) +
  theme(axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0))) +
  scale_y_continuous(limits = c(-4, 6)) +
  theme_bw(base_size = 12) +
  theme(# Adjust legend position to maximize space, use a vector of proportion
        # across the plot and up the plot where you want the legend. 
        # You can also use "left", "right", "top", "bottom", for legends on 
        # the side of the plot
        legend.position = "none", 
        # increase the font size
        text = element_text(size = 14)) +
  annotate("text", label = "count", x = 1100, y = 4, color = "orange") +
  annotate("text", label = "pairwise", x = 3000, y = 3.2, color = "darkgreen") +
  annotate("text", label = "AB", x = 3000, y = 2, color = "red") +
  annotate("text", label = "RECPR", x = 2000, y = 3/4, color = "purple") +
  annotate("text", label = "ranks", x = 3010, y = -0.5, color = "blue")

ggsave(file = "three_groups.pdf", plot = p1, width = 6, height = 4)

## Table 7, Figure 2
#################################################################################################
cat("Benchmark 2\n")

# vector of group sizes
m <- c(10, 100, 1000, 2000, 4000, 6000, 8000, 10000)

timeRank <- rep(0, length(m))
timeAB <- rep(0, length(m))
timePRank <- rep(0, length(m))
timePRankCpp <- rep(0, length(m))
timePRankPair <- rep(0, length(m))
timePRankDirect <- rep(0, length(m))
timesIQR <- data.frame(dim = gl(length(m), 4, labels = m), lower = 1:4, upper = 1:4)

# Setting
n <- c(20, 20, 40) # sample sizes
N <- sum(n)
alpha <- 0.05
kgv <- Reduce(Lcm, n)
lambda <- kgv/n

for (l in 1:length(m)) {
  n <- c(m[l], m[l], m[l], m[l], 2*m[l])
  kgv <- Reduce(Lcm, n)
  lambda <- kgv/n
  cat("l: ", l, "/", length(m), "\n")
  
  t <- 1
  group1p1 <- round(matrix( rnorm(n[[1]], 0, 1), ncol = 1, nrow = n[1]))
  group2p1 <- round(matrix( rnorm(n[[2]], 0, 1), ncol = 1, nrow = n[2]))
  group3p1 <- round(matrix( rnorm(n[[3]], 0, 1), ncol = 1, nrow = n[3]))
  group4p1 <- round(matrix( rnorm(n[[4]], 0, 1), ncol = 1, nrow = n[4]))
  group5p1 <- round(matrix( rnorm(n[[5]], 0, 1), ncol = 1, nrow = n[5]))
  
  g1p1 <- data.frame(data = group1p1, group = 1)
  g2p1 <- data.frame(data = group2p1, group = 2)
  g3p1 <- data.frame(data = group3p1, group = 3)
  g4p1 <- data.frame(data = group4p1, group = 4)
  g5p1 <- data.frame(data = group5p1, group = 5)
  
  X <- as.data.frame(rbind(g1p1, g2p1, g3p1, g4p1, g5p1))
  X$group <- as.factor(X$group)
  
  X <- as.data.table(X)
  
  # perform benchmarks
  benchPRankCpp <- microbenchmark::microbenchmark(pseudorank(X$data, X$group), times = nrun, 
    unit = "ms")
  benchRank <- microbenchmark::microbenchmark(rank(X$data, ties.method = "average"), times = nrun, 
    unit = "ms")
  benchAB <- microbenchmark::microbenchmark(AB(X, n, t, lambda, kgv)*sum(n)*t+0.5, times = nrun, 
    unit = "ms")
  benchPair <- microbenchmark::microbenchmark(pairwise(X$data, X$group, n), times = nrun, 
    unit = "ms")
  benchDirect <- microbenchmark::microbenchmark(generalizedRanks(X, n, t), times = nrun, 
    unit = "ms")
  
  
  timePRankCpp[l] <- summary(benchPRankCpp)$median
  timeAB[l] <- summary(benchAB)$median
  timeRank[l] <- summary(benchRank)$median
  timePRankPair[l] <- summary(benchPair)$median
  timePRankDirect[l] <- summary(benchDirect)$median
  
  
}

df5 <- data.frame(dimension = m, Ranks = timeRank, PseudoRankCpp = timePRankCpp, AB = timeAB, 
                  PseudoRankPair = timePRankPair, PseudoRankDirect = timePRankDirect)

df_five_groups <- data.frame(Method = c(rep("Ranks", length(m)), rep("Stepwise", length(m)), 
                                        rep("AB", length(m)), rep("Pairwise", length(m)), 
                                        rep("Direct", length(m))), n = df5$dimension)
df_five_groups$time <- c(df5$Ranks, df5$PseudoRanks, df5$PseudoRankCpp, 
                         df5$AB, df5$PseudoRankPair, df5$PseudoRankDirect)
df_five_groups$logtime <- log(df_five_groups$time)

p2 <- ggplot(data = df_five_groups, aes(x = n, y = logtime, group = Method, col = Method)) +
  geom_line() +
  geom_point() +
  xlab("Sample Size n") +
  ylab("Log Computation Time") +
  #theme(axis.text = element_text(size = 15), axis.title = element_text(size = 15, face = "bold")) +
  theme(axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0))) +
  theme(axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0))) +
  theme_bw(base_size = 12) +
  scale_y_continuous(limits = c(-3, 8)) +
  annotate("text", label = "count", x = 1000, y = 4, color = "orange") +
  annotate("text", label = "pairwise", x = 3000, y = 4, color = "darkgreen") +
  annotate("text", label = "AB", x = 3000, y = 2, color = "red") +
  annotate("text", label = "RECPR", x = 2000, y = 1, color = "purple") +
  annotate("text", label = "ranks", x = 3010, y = 0, color = "blue") +
  theme(# Adjust legend position to maximize space, use a vector of proportion
    # across the plot and up the plot where you want the legend. 
    # You can also use "left", "right", "top", "bottom", for legends on 
    # the side of the plot
    legend.position = "none", 
    # increase the font size
    text = element_text(size = 14)) 

ggsave(file = "five_groups.pdf", plot = p2, width = 6, height = 4)


## Table 8, Figure 3
################################################################################################
cat("Benchmark 3\n")

# group sizes
m <- c(10, seq(from = 100, to = 1000, by = 100))

timeRank <- rep(0, length(m))
timeAB <- rep(0, length(m))
timePRank <- rep(0, length(m))
timePRankCpp <- rep(0, length(m))
timePRankPair <- rep(0, length(m))
timePRankR <- rep(0, length(m))
timePRankDirect <- rep(0, length(m))
timesIQR <- data.frame(dim = gl(length(m), 4, labels = m), lower = 1:4, upper = 1:4)

# Setting
n <- c(20, 20, 40) # sample sizes
N <- sum(n)
alpha <- 0.05
kgv <- Reduce(Lcm, n)
lambda <- kgv/n

for (l in 1:length(m)) {  
  n <- c(m[l], m[l], m[l], m[l], m[l], m[l], m[l], m[l], m[l], m[l], m[l], 2*m[l])
  kgv <- Reduce(Lcm, n)
  lambda <- kgv/n
  cat("l: ", l, "/", length(m), "\n")
  
  t <- 1
  group1p1 <- round(matrix( rnorm(n[[1]], 0, 1), ncol = 1, nrow = n[1]))
  group2p1 <- round(matrix( rnorm(n[[2]], 0, 1), ncol = 1, nrow = n[2]))
  group3p1 <- round(matrix( rnorm(n[[3]], 0, 1), ncol = 1, nrow = n[3]))
  group4p1 <- round(matrix( rnorm(n[[4]], 0, 1), ncol = 1, nrow = n[4]))
  group5p1 <- round(matrix( rnorm(n[[5]], 0, 1), ncol = 1, nrow = n[5]))
  group6p1 <- round(matrix( rnorm(n[[6]], 0, 1), ncol = 1, nrow = n[6]))
  group7p1 <- round(matrix( rnorm(n[[7]], 0, 1), ncol = 1, nrow = n[7]))
  group8p1 <- round(matrix( rnorm(n[[8]], 0, 1), ncol = 1, nrow = n[8]))
  group9p1 <- round(matrix( rnorm(n[[9]], 0, 1), ncol = 1, nrow = n[9]))
  group10p1 <- round(matrix( rnorm(n[[10]], 0, 1), ncol = 1, nrow = n[10]))
  group11p1 <- round(matrix( rnorm(n[[11]], 0, 1), ncol = 1, nrow = n[11]))
  group12p1 <- round(matrix( rnorm(n[[12]], 0, 1), ncol = 1, nrow = n[12]))
  
  g1p1 <- data.frame(data = group1p1, group = 1)
  g2p1 <- data.frame(data = group2p1, group = 2)
  g3p1 <- data.frame(data = group3p1, group = 3)
  g4p1 <- data.frame(data = group4p1, group = 4)
  g5p1 <- data.frame(data = group5p1, group = 5)
  g6p1 <- data.frame(data = group6p1, group = 6)
  g7p1 <- data.frame(data = group7p1, group = 7)
  g8p1 <- data.frame(data = group8p1, group = 8)
  g9p1 <- data.frame(data = group9p1, group = 9)
  g10p1 <- data.frame(data = group10p1, group = 10)
  g11p1 <- data.frame(data = group11p1, group = 11)
  g12p1 <- data.frame(data = group12p1, group = 12)
  
  X <- as.data.frame(rbind(g1p1, g2p1, g3p1, g4p1, g5p1, g6p1, g7p1, g8p1, g9p1, g10p1, g11p1, g12p1))
  X$group <- as.factor(X$group)
  
  X <- as.data.table(X) 
  
  # perform benchmarks
  benchPRankCpp <- microbenchmark::microbenchmark(pseudorank(X$data, X$group), times = nrun, 
    unit = "ms")
  benchRank <- microbenchmark::microbenchmark(rank(X$data, ties.method = "average"), times = nrun, 
    unit = "ms")
  benchAB <- microbenchmark::microbenchmark(AB(X, n, t, lambda, kgv)*sum(n)*t+0.5, times = nrun, 
    unit = "ms")
  benchPair <- microbenchmark::microbenchmark(pairwise(X$data, X$group, n), times = nrun, 
    unit = "ms")
  benchDirect <- microbenchmark::microbenchmark(generalizedRanks(X, n, t), times = nrun, 
    unit = "ms")
  
  timePRankCpp[l] <- summary(benchPRankCpp)$median
  timeAB[l] <- summary(benchAB)$median
  timeRank[l] <- summary(benchRank)$median
  timePRankPair[l] <- summary(benchPair)$median
  timePRankDirect[l] <- summary(benchDirect)$median    
}

df12 <- data.frame(dimension = m, Ranks = timeRank, PseudoRankCpp = timePRankCpp, AB = timeAB, 
                   PseudoRankPair = timePRankPair, PseudoRankDirect = timePRankDirect)

df_twelve_groups <- data.frame(Method = c(rep("Ranks", length(m)), rep("RECPR", length(m)), 
                                          rep("AB", length(m)), rep("Pairwise", length(m)), 
                                          rep("count", length(m))), n = df12$dimension)
df_twelve_groups$time <- c(df12$Ranks, df12$PseudoRanks, df12$PseudoRankCpp, df12$AB, 
                           df12$PseudoRankPair, df12$PseudoRankDirect)
df_twelve_groups$logtime <- log(df_twelve_groups$time)

p3 <- ggplot(data = df_twelve_groups, aes(x = n, y = logtime, group = Method, col = Method)) +
  geom_line() +
  geom_point() +
  xlab("Sample Size n") +
  ylab("Log Computation Time") +
  # theme(axis.text = element_text(size = 15), axis.title = element_text(size = 15, face = "bold")) +
  theme(axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0))) +
  theme(axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0))) +
  theme_bw(base_size = 12) +
  scale_y_continuous(limits = c(-4, 8)) +
  annotate("text", label = "count", x = 100, y = 4.5, color = "orange") +
  annotate("text", label = "pairwise", x = 300, y = 4, color = "darkgreen") +
  annotate("text", label = "AB", x = 300, y = 2, color = "red") +
  annotate("text", label = "RECPR", x = 200, y = 0, color = "purple") +
  annotate("text", label = "ranks", x = 301, y = -2, color = "blue") +
  theme(# Adjust legend position to maximize space, use a vector of proportion
    # across the plot and up the plot where you want the legend. 
    # You can also use "left", "right", "top", "bottom", for legends on 
    # the side of the plot
    legend.position = "none", 
    # increase the font size
    text = element_text(size = 14)) 

ggsave(file = "twelve_groups.pdf", plot = p3, width = 6, height = 4)

## Save tables in working directory as .tex files
####################################################################################
directory <- getwd()

fileConn <- file(file.path(directory, "Table_5.tex"))
writeLines(eval(tab5_result), fileConn)
close(fileConn)

fileConn <- file(file.path(directory, "Table_6.tex"))
writeLines(print(xtable(df), include.rownames = FALSE), fileConn)
close(fileConn)

fileConn <- file(file.path(directory, "Table_7.tex"))
writeLines(print(xtable(df5), include.rownames = FALSE), fileConn)
close(fileConn)

fileConn <- file(file.path(directory, "Table_8.tex"))
writeLines(print(xtable(df12), include.rownames = FALSE), fileConn)
close(fileConn)