## Install necessary packages
# install.packages(c("mdendro", "cluster", "dendextend", "ape"))


## 2. Agglomerative hierarchical clustering algorithms -------------------------

## 2.1. Pair-group algorithm ---------------------------------------------------

## Genetic profiles of 51 grapevine cultivars. The distance between two 
## cultivars is defined as one minus the fraction of shared alleles, and this 
## definition is used to calculate distance matrix 'd'.
dt <- read.csv("grapevine.csv")
n <- nrow(dt)
cols <- 2:ncol(dt)
nc <- length(cols)
m <- matrix(0.0, nrow = n, ncol = n,
            dimnames = list(dt$Name, dt$Name))
for (i in 1:(n-1)) {
    for (j in (i+1):n) {
        m[i, j] <- 1 - sum(dt[i, cols] == dt[j, cols]) / nc
        m[j, i] <- m[i, j]
    }
}
d <- as.dist(m)

## The main characteristic of this kind of data is that the number of different 
## distances is very small.
length(unique(d))

## Calculate different pair-group dendrograms for random permutations of the 
## same grapevine cultivars dataset, and plot the sorted values of their 
## cophenetic correlation coefficients.
library("mdendro")
set.seed(666)
nreps <- 1000
cors <- vector(length = nreps)
lnks <- list()
for (r in 1:nreps) {
    ord <- sample(attr(d, "Size"))
    d2 <- as.dist(as.matrix(d)[ord, ord])
    lnks[[r]] <- linkage(d2, group = "pair", digits = 3)
    cors[r] <- lnks[[r]]$cor
}
plot(sort(cors), ylab = "Cophenetic correlation coefficient")


## 2.2. Variable-group algorithm -----------------------------------------------

## Matrix of pairwise distances between four individuals in a toy example.
d <- as.dist(matrix(c(0, 2, 4, 7,
                      2, 0, 2, 5,
                      4, 2, 0, 3,
                      7, 5, 3, 0), nrow = 4))

## Using the pair-group algorithm, we can obtain three different binary 
## dendrograms depending on the order of rows and columns in the toy distance 
## matrix. Using the variable-group algorithm, we obtain a unique 
## multidendrogram.
par(mfrow = c(1, 4))
lnk1 <- linkage(d, group = "pair")
plot(lnk1, main = "dendrogram 1")
d2 <- as.dist(as.matrix(d)[c(2, 3, 4, 1), c(2, 3, 4, 1)])
lnk2 <- linkage(d2, group = "pair")
plot(lnk2, main = "dendrogram 2")
d3 <- as.dist(as.matrix(d)[c(4, 1, 2, 3), c(4, 1, 2, 3)])
lnk3 <- linkage(d3, group = "pair")
plot(lnk3, main = "dendrogram 3")
lnk4 <- linkage(d, group = "variable")
plot(lnk4, main = "multidendrogram")

## In the following example, exchanging soils 15 and 20 yields two different 
## binary dendrograms.
soils <- 1 - as.dist(read.csv("soils.csv", header = FALSE))
labs <- paste(1:23)
brown_soils <- c(1, 2, 6, 12, 13)
labs[brown_soils] <- paste("B-", labs[brown_soils], sep="")
attr(soils, "Labels") <- labs
lnk <- linkage(soils, method = "complete", group = "variable")
lnk1 <- linkage(soils, method = "complete", group = "pair")
s <- 1:23
s[c(15, 20)] <- c(20, 15)
soils2 <- as.dist(as.matrix(soils)[s, s])
lnk2 <- linkage(soils2, method = "complete", group = "pair")
par(mfrow = c(3, 1), mar = c(2, 4, 2, 0))
plot(lnk, col.rng = "pink", main = "multidendrogram")
plot(lnk1, main = "dendrogram 1")
plot(lnk2, main = "dendrogram 2")

## With function linkage(), you can construct both pair-group dendrograms and 
## variable-group multidendrograms. Additionally, you can select the resolution,
## i.e., the number of significant digits.
par(mfrow = c(3, 1))
cars <- dist(scale(mtcars))
cars1 <- round(cars, digits = 1)
nodePar <- list(cex = 0, lab.cex = 0.7)
lnk1 <- linkage(cars1, method = "complete", group = "pair")
plot(lnk1, main = "dendrogram", nodePar = nodePar)
lnk2 <- linkage(cars1, method = "complete", group = "variable")
plot(lnk2, col.rng = "pink", main = "multidendrogram", nodePar = nodePar)
lnk3 <- linkage(cars, method = "complete", group = "variable", digits = 1)
plot(lnk3, col.rng = NULL, main = "multidendrogram (no ranges)",
     nodePar = nodePar)


## 3. Linkage methods ----------------------------------------------------------

## 3.1. Common linkage methods -------------------------------------------------

## Observe the differences between the most common linkage methods on the 
## UScitiesD dataset.
par(mfrow = c(2, 3))
methods <- c("single", "complete", "arithmetic", "ward", "centroid")
for (m in methods) {
  lnk <- linkage(UScitiesD, method = m)
  plot(lnk, cex = 0.4, main = m)
}


## 3.2. Descriptive measures ---------------------------------------------------

## Summary of the dendrogram obtained when we use function linkage() to 
## calculate the "complete" linkage of the UScitiesD dataset.
lnk <- linkage(UScitiesD, method = "complete")
summary(lnk)

## Calculate a variable-group multidendrogram and a pair-group dendrogram for 
## the same data.
cars <- round(dist(scale(mtcars)), digits = 1)
lnk1 <- linkage(cars, method = "complete", group = "variable")
lnk2 <- linkage(cars, method = "complete", group = "pair")

## Now, we apply a random permutation to data.
set.seed(1234)
ord <- sample(attr(cars, "Size"))
carsp <- as.dist(as.matrix(cars)[ord, ord])
lnk1p <- linkage(carsp, method = "complete", group = "variable")
lnk2p <- linkage(carsp, method = "complete", group = "pair")

## Check that the original and the permuted cophenetic correlation coefficients 
## are identical for variable-group multidendrograms ...
c(lnk1$cor, lnk1p$cor)

## ... and they are different for pair-group dendrograms.
c(lnk2$cor, lnk2p$cor)


## 3.3. Parametric linkage methods ---------------------------------------------

## Some examples of the "flexible" and "versatile" linkage methods on the 
## UScitiesD dataset.
par(mfrow = c(2, 3))
vals <- c(-0.8, 0.0, 0.8)
for (v in vals) {
  lnk <- linkage(UScitiesD, method = "flexible", par.method = v)
  plot(lnk, cex = 0.4, main = sprintf("flexible (%.1f)", v))
}
vals <- c(-10.0, 0.0, 10.0)
for (v in vals) {
  lnk <- linkage(UScitiesD, method = "versatile", par.method = v)
  plot(lnk, cex = 0.4, main = sprintf("versatile (%.1f)", v))
}


## Beta-flexible linkage -------------------------------------------------------

## Obtain descriptive measures for the "flexible" linkage method on the 
## UScitiesD dataset.
par(mfrow = c(1, 5))
measures <- c("cor", "sdr", "ac", "cc", "tb")
vals <- seq(from = -1, to = +1, by = 0.1)
for (m in measures)
  descplot(UScitiesD, method = "flexible",
           measure = m, par.method = vals,
           type = "o",  main = m, col = "blue")


## Versatile linkage -----------------------------------------------------------

## Obtain descriptive measures for the "versatile" linkage method on the 
## UScitiesD dataset.
par(mfrow = c(1, 5))
measures <- c("cor", "sdr", "ac", "cc", "tb")
vals <- c(-Inf, (-20:+20), +Inf)
for (m in measures)
  descplot(UScitiesD, method = "versatile",
           measure = m, par.method = vals,
           type = "o",  main = m, col = "blue")

## Plot different dendrograms obtained as we increase the "versatile" linkage 
## parameter, indicating the corresponding named methods.
d <- as.dist(matrix(c( 0,  7, 16, 12,
                       7,  0,  9, 19,
                      16,  9,  0, 12,
                      12, 19, 12, 0), nrow = 4))
par(mfrow = c(2, 3), mar = c(2,2,2,2))
vals <- c(-Inf, -1, 0, 1, Inf)
names <- c("single", "harmonic", "geometric", "arithmetic", "complete")
titles <- sprintf("versatile (%.1f) = %s", vals, names)
for (i in 1:length(vals)) {
  lnk <- linkage(d, method = "versatile", par.method = vals[i], digits = 2)
  plot(lnk, ylim = c(0, 20), cex = 0.6, main = titles[i])
}


## 4. Comparison with other packages -------------------------------------------

## Comparison of "complete" linkage on the UScitiesD dataset, using functions 
## linkage(), hclust() and agnes().
lnk <- mdendro::linkage(UScitiesD, method = "complete")
hcl <- stats::hclust(UScitiesD, method = "complete")
agn <- cluster::agnes(UScitiesD, method = "complete")
par(mar = c(6, 4, 4, 0), mfrow = c(1, 3))
plot(lnk)
plot(hcl, main = "")
plot(agn, which.plots = 2, main = "")

## Comparison between the cophenetic matrix available as component 'coph' of the
## returned 'linkage' object, and those obtained using functions hclust() and 
## agnes().
hcl.coph <- cophenetic(hcl)
all(lnk$coph == hcl.coph)
agn.coph <- cophenetic(agn)
all(lnk$coph == agn.coph)

## Comparison of the cophenetic correlation coefficients and the agglomerative 
## coefficients.
hcl.cor <- cor(UScitiesD, hcl.coph)
all.equal(lnk$cor, hcl.cor)
all.equal(lnk$ac, agn$ac)

## Plot a dendrogram with ties drawing a range rectangle for tied distances, 
## hiding it, and plotting the result returned by function as.dendrogram().
cars <- round(dist(scale(mtcars)), digits = 1)
lnk <- linkage(cars, method = "complete", group = "variable")
par(mfrow = c(3, 1), mar = c(1, 4, 2, 0))
plot(lnk, col.rng = "pink", main = "with range", leaflab = "none")
plot(lnk, col.rng = NULL, main = "without range", leaflab = "none")
plot(as.dendrogram(lnk), main = "as.dendrogram()", leaflab = "none")

## Function to compare the computational efficiency of functions agnes(), 
## linkage() and hclust().
time.test <- function(sizes, filename, nreps=20L, agnes.limit=5000L) {
  nsizes <- length(sizes)
  # methods
  lnk.methods <- c("single", "complete", "arithmetic", "ward")
  hcl.methods <- c("single", "complete", "average", "ward.D2")
  agn.methods <- c("single", "complete", "average", "ward")
  nmethods <- length(lnk.methods)
  # data frame to store times
  times <- data.frame(size = sizes, linkage = rep(0, nsizes), 
                      hclust = rep(0, nsizes), agnes = rep(0, nsizes))
  set.seed(666)
  for (i in 1:nsizes) {
    n <- sizes[i]
    print(n)
    for (r in 1:nreps) {
      # random distance matrix
      m <- matrix(nrow=n, ncol=n)
      m[lower.tri(m)] <- runif((n-1)*n/2)
      d <- as.dist(m)
      for (j in 1:nmethods) {
        # linkage
        start <- Sys.time()
        lnk <- mdendro::linkage(d, method=lnk.methods[j])
        end <- Sys.time()
        times$linkage[i] <- times$linkage[i] + difftime(end,start,units="secs")
        # hclust
        start <- Sys.time()
        hcl <- stats::hclust(d, method=hcl.methods[j])
        end <- Sys.time()
        times$hclust[i] <- times$hclust[i] + difftime(end,start,units="secs")
        # agnes
        if (n <= agnes.limit) {
          start <- Sys.time()
          agn <- cluster::agnes(d, method=agn.methods[j])
          end <- Sys.time()
          times$agnes[i] <- times$agnes[i] + difftime(end,start,units="secs")
        }
      }
    }
    # Calculate mean times dividing by the number of methods and repetitions
    times$linkage[i] <- times$linkage[i] / (nmethods * nreps)
    times$hclust[i] <- times$hclust[i] / (nmethods * nreps)
    if (n <= agnes.limit) {
      times$agnes[i] <- times$agnes[i] / (nmethods * nreps)
    } else {
      times$agnes[i] <- NA
    }
  }
  # Save results
  write.table(times, file=filename, quote=FALSE, sep=",", row.names=FALSE)
}

## The comparison of the computational efficiency of functions agnes(), 
## linkage() and hclust() is very time-consuming. In case you want to run it, 
## please uncomment the following two instructions.
# time.test(sizes=seq(from=100L, to=1000L, by=100L), 
#          filename="times-linear2.csv")
# time.test(sizes=round(exp(seq(from=log(100), to=log(20000), length.out=14L))),
#          filename="times-log2.csv")

## Otherwise, the comparison can be started from the runtimes previously saved, 
## both in linear scale (dt1) and in log-log scale (dt2).
dt1 <- read.csv("times-linear.csv")
dt2 <- read.csv("times-log.csv")
par(mfrow = c(2, 1), mar = c(5, 4, 0.5, 2))
plot(x = dt1$size, y = dt1$agnes, type = "l", col = 2, 
     xlab = "Size", ylab = "Time (s)")
lines(x = dt1$size, y = dt1$linkage, type = "l", col = 3)
lines(x = dt1$size, y = dt1$hclust, type = "l", col = 4)
legend(x = "topleft", legend = c("agnes", "linkage", "hclust"), col = 2:4, 
       lty = "solid")
options(scipen = 10)
plot(x = dt2$size, y = dt2$agnes, type = "l", col = 2, 
     xlab = "Size", ylab = "Time (s)", log = "xy")
lines(x = dt2$size, y = dt2$linkage, type = "l", col = 3)
lines(x = dt2$size, y = dt2$hclust, type = "l", col = 4)
legend(x = "topleft", legend = c("agnes", "linkage", "hclust"), col = 2:4, 
       lty = "solid")

## Convert objects of class 'linkage' using function as.dendrogram(), to take 
## advantage of other dendrogram plotting packages, such as dendextend and ape.
par(mar = c(5, 0, 4, 0), mfrow = c(1, 2))
cars <- round(dist(scale(mtcars)), digits = 1)
lnk <- linkage(cars, method = "complete")
lnk.dend <- as.dendrogram(lnk)
plot(dendextend::set(lnk.dend, "branches_k_color", k = 4),
     main = "dendextend package",
     nodePar = list(cex = 0.4, lab.cex = 0.5))
lnk.hcl <- as.hclust(lnk)
pal4 <- c("red", "forestgreen", "purple", "orange")
clu4 <- cutree(lnk.hcl, k = 4)
plot(ape::as.phylo(lnk.hcl),
     type = "fan",
     main = "ape package",
     tip.color = pal4[clu4],
     cex = 0.4)

## Use function linkage() to plot heatmaps containing multidendrograms.
heatmap(scale(mtcars), hclustfun = linkage)

## Construct a dendrogram directly from a matrix of nonnegative correlations, 
## without having to convert them to distances.
par(mar = c(5, 4, 4, 2) + 0.1, mfrow = c(1, 1))
sim <- as.dist(Harman23.cor$cov)
lnk <- linkage(sim, type.prox = "sim")
plot(lnk)
