## ---------------------------------------------------------------------------
## Replication code accompanying the paper "Hyperspectral Data Analysis 
## in R: The hsdar Package" by Lukas W. Lehnert, Hanna Meyer, Wolfgang A. 
## Obermeier, Brenner Silva, Bianca Regeling and Joerg Bendix
## ---------------------------------------------------------------------------

## Check if installation of required packages is needed.
if (!require("hsdar")) {
    install.packages("hsdar")
    library("hsdar")
}

## Set random seed
seed <- 1104
set.seed(seed)

## Example how to create an object of class `Speclib' from offline data.
data("spectral_data", package = "hsdar")
reflectance <- spectra(spectral_data)
class(reflectance)
wv <- wavelength(spectral_data)
class(wv)
spec_lib <- speclib(reflectance, wv)
class(spec_lib)


## Example how to obtain spectral data from an internet library (may take some
## time depending on the internet connection).
avl <- USGS_get_available_files()
grass_spectra <- USGS_retrieve_files(avl = avl, pattern = "grass-fescue")

##-------------------------------------
## Working with large raster files
##-------------------------------------
## Create an example hyperspectral image using PROSAIL based on a photograph 
## of an oak leaf published on wikipedia

## Download example file from wikipedia
url <- "https://upload.wikimedia.org/wikipedia/commons/thumb/7/72/Quercus_ilex_rotundifolia_leaf.jpg/800px-Quercus_ilex_rotundifolia_leaf.jpg"
download.file(url, "exmpl_img.jpg")

## Read file and coerce green band to matrix
img <- brick("exmpl_img.jpg")
img <- as.matrix(img[[2]])

## Define subset of image to process (to reduce computational time)
xsub <- 50:200
ysub <- 150:300

## Create list of parameters for PROSAIL
parameterList <- matrix(ncol = 8, nrow = length(xsub)*length(ysub))
j <- 0
for (k in ysub)
{
  for (i in xsub)
  {
    j <- j + 1
    if (img[k, i] > 220) 
    {
      ## Create soil background spectra beside the leaf
      parameterList[j, ] <- c(1, 40, 8, 0, 0.01, 0.009, 0, 0)
    } else {
      ## Modify chlorophyll and caroteniod content according to green band values
      parameterList[j, ] <- c(1,
                              35 + 20/(img[k, i] + 1),
                              5 + 40/(img[k, i] + 1),
                              0, 0.01, 0.009, 0, 1)
    }
  }
  cat(round(j/(length(xsub)*length(ysub))*100, 2), "\n")
}
parameterList <- as.data.frame(parameterList)
names(parameterList) <- c("N", "Cab", "Car", "Cbrown",
                          "Cw", "Cm", "psoil", "LAI")

## Run PROSAIL
spec <- PROSAIL(parameterList = parameterList)

## Create hyperspectral brick
spec_brick <- brick(raster(img[ysub, xsub]), values = TRUE, nl = nbands(spec)) 
spec_brick <- brick(spec_brick, nl = nbands(spec))
spec_brick <- setValues(spec_brick, 
                        aperm(array(t(spectra(spec)),
                                    dim = c(nbands(spec), ncol(spec_brick), nrow(spec_brick))), 
                              c(3, 2, 1)))

## Write brick to disk (used again later to demonstrate the cubePlot function)
writeRaster(spec_brick, "exmpl_cube.tif")

## Apply Savitzky-Golay Filter to spectra and write filtered data to disk
infile <- "exmpl_cube.tif"
outfile <- "exmpl_cube_filtered.tif"

ra <- speclib(infile, 400:2500)
tr <- blockSize(ra)

res <- writeStart(ra, outfile, nl = nbands(ra), format = "GTiff")
for (i in 1:tr$n) {
  v1 <- getValuesBlock(ra, row=tr$row[i], nrows=tr$nrows[i])
  v2 <- noiseFiltering(v1, method = "sgolay", n = 25)
  res <- writeValues(res, v2, tr$row[i])
}
res <- writeStop(res)


##-------------------------------------
## Figure 2: Effect of noise reduction
##-------------------------------------
## 

## As an example, the 10th spectrum in the `Speclib' object is
## taken. Since the effect of filtering can be better shown, if not
## the entire spectrum (1400 bands) is shown, we cut the spectrum
## additionally to the red edge part, where a high information content
## is available for vegetation remote sensing.
spectral_data_1 <- spectral_data[10,400:550]

## The noise in the example dataset shipped with the package is
## already removed because otherwise the examples in the package would
## have always the noise-reduction as a first step. Thus, the data is
## artificially affected by random noise to show the effect of
## filtering in this example. The artificial noise is created using
## normally distributed random numbers.
spectra(spectral_data_1) <- spectra(spectral_data_1) + spectra(spectral_data_1) * 
                                    matrix(rnorm(nbands(spectral_data_1)*nspectra(spectral_data_1),
                                           mean = 0, sd = .01),
                                    ncol = nbands(spectral_data_1))
                                    
## In the following, several filters are applied to the noisy spectral
## data.
sgolay <- noiseFiltering(spectral_data_1, method = "sgolay", n = 25)
lowess <- noiseFiltering(spectral_data_1, method = "lowess", f = .1)
meanflt <- noiseFiltering(spectral_data_1, method = "mean", p = 5)
spline <- noiseFiltering(spectral_data_1, method = "spline", 
                         n = round(nbands(spectral_data_1)/10, 0))

## In the following, the plot is created. Please note that the
## reflectance values resulting from the lowess-, mean- and
## spline-filters are added with 10%, 20% and 30%, respectively. This
## allows a better comparison because the curves are not overlaying.
par(mar = c(4, 4, 0.1, 0.1))
plot(spectral_data_1, ylim = c(10, 90))
plot(sgolay, new = FALSE, col = "red")
text(labels = "Savitzky-Golay-Filter", x = wavelength(spectral_data_1)[nbands(spectral_data_1)],
     y = spectra(spectral_data_1)[1, nbands(spectral_data_1)] + 2, adj = c(1, 0))

spectra(spectral_data_1) <- spectra(spectral_data_1) + 10
spectra(lowess) <- spectra(lowess) + 10
plot(spectral_data_1, new = FALSE)
plot(lowess, new = FALSE, col = "red")
text(labels = "Lowess-Filter", x = wavelength(spectral_data_1)[nbands(spectral_data_1)],
     y = spectra(spectral_data_1)[1, nbands(spectral_data_1)] + 2, adj = c(1, 0))

spectra(spectral_data_1) <- spectra(spectral_data_1) + 10
spectra(meanflt) <- spectra(meanflt) + 20
plot(spectral_data_1, new = FALSE)
plot(meanflt, new = FALSE, col = "red")
text(labels = "Mean-Filter", x = wavelength(spectral_data_1)[nbands(spectral_data_1)],
     y = spectra(spectral_data_1)[1, nbands(spectral_data_1)] + 2, adj = c(1, 0))

spectra(spectral_data_1) <- spectra(spectral_data_1) + 10
spectra(spline) <- spectra(spline) + 30
plot(spectral_data_1, new = FALSE)
plot(spline, new = FALSE, col = "red")
text(labels = "Spline-Filter", x = wavelength(spectral_data_1)[nbands(spectral_data_1)],
     y = spectra(spectral_data_1)[1, nbands(spectral_data_1)] + 2, adj = c(1, 0))

##-------------------------------------
## Figure 3b: Hyperspectral cube
##-------------------------------------
## The hyperspectral dataset used for this illustration is by far too
## large to be included in an R package. Therefore, we can only
## describe, how such a plot can be created in hsdar. Instead, we will
## use the simulated spectral data from the oak leaf as described above.

## Create an object of class `Speclib'.
img_c <- speclib(infile, 400:2500)

## Plot the cube. Please note that the package "rgl" is required which will be
## installed first, if necessary.
## Check if installation of required packages is needed.
if (!require("rgl")) {
    install.packages("rgl")
    library("rgl")
}

## For unknown reasons, the plot may only appear after the second call
## on some systems. We will only plot the bands between 400 and 1000 nm.
## Color for the spectral information is chosen out of the rainbow-palette.
cubePlot(img_c[, 1:599], sidecol = colorRamp(palette(rainbow(6)[5:1])), z_interpolate = TRUE)


##-------------------------------------
## Case study: Chlorophyll
##-------------------------------------

## Load the example dataset and apply a noise reduction filter. The
## filter would not be necessary in this example because the data is
## already filtered. However, for most datasets, this is a really
## critical step which is why we include it here.
data("spectral_data", package = "hsdar")
spectral_data <- noiseFiltering(spectral_data, method = "sgolay", p = 15)

## Convert the chlorophyll measurements stored in the SI from
## SPAD-values into mg.
SI(spectral_data)$chlorophyll <- 
  (117.1 * SI(spectral_data)$chlorophyll) /
  (148.84 - SI(spectral_data)$chlorophyll)

## Cut the data to the spectral range, where chlorophyll absorbs light.
spectral_data <- spectral_data[, wavelength(spectral_data) >= 310 & 
  wavelength(spectral_data) <= 1000]

## Transform reflectance values into band depth applying a segmented upper hull
## continuum removal.  
spec_bd <- transformSpeclib(spectral_data, method = "sh", out = "bd")

## Select the chlorophyll absorption features at 460 nm and 670 nm for further
## processing.
featureSpace <- specfeat(spec_bd, c(460, 670))

## Calculate all parameters from both selected features such as area, distance
## to Gauss curve etc.
featureSpace <- feature_properties(featureSpace)

## Set response and additional predictor variables for random forest
## model.
featureSpace <- setResponse(featureSpace, "chlorophyll")
featureSpace <- setPredictor(featureSpace,
  names(SI(featureSpace))[5:ncol(SI(featureSpace))])

## Define training and cross validation for random forest model tuning.
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, 
  savePredictions = "final")

## Train random forest model.
rfe_trained <- train(featureSpace, trControl = ctrl, method = "rf")

## Extract root mean square error from cross validation.
estimates <- tapply(rfe_trained$pred$pred[order(rfe_trained$pred$rowIndex)], 
                    INDEX = rfe_trained$pred$rowIndex[order(rfe_trained$pred$rowIndex)], mean)
measured <- SI(featureSpace)$chlorophyll
r_squared <- round(summary(lm(estimates ~ measured))$r.squared, 2)
rmse_val <- round(sqrt(mean((estimates - measured)^2, na.rm = TRUE)),2)


##-------------------------------------
## Case study: Chlorophyll: Figure 4
##-------------------------------------
## 
 
par(mfrow = c(1, 2), mar = c(4, 4, 0.2, 0.2))

## Plot the raw spectra and the gray boxes.
plot(spectral_data)
polygon(c(315, 530, 530, 315, 315), c(0, 0, 30, 30, 0), col = "lightgrey", lty = "blank")
polygon(c(570, 700, 700, 570, 570), c(0, 0, 30, 30, 0), col = "lightgrey", lty = "blank")
plot(spectral_data, new = FALSE)

## Calculate the mean + 1 sd spectrum out of all available ones in the
## `Speclib' object.
me <- apply(spectral_data, 1, "mean")
spectra(me) <- spectra(me) + spectra(apply(spectral_data, 1, "sd"))

## Construct the convex and the upper segmented hull.
me_ch <- transformSpeclib(me, method = "ch", out = "raw")
me_sh <- transformSpeclib(me, method = "sh", out = "raw")

## Plot both hulls.
plot(me_ch, ispec = 1, new = FALSE, numeratepoints = FALSE,
     hull.style = list(lty = "dotted", col = "darkred", lwd = 2),
     points.style = NULL, type = "n")
plot(me_sh, ispec = 1, new = FALSE, numeratepoints = FALSE,
     hull.style = list(lty = "dotted", col = "darkblue", lwd = 2),
     points.style = NULL, type = "n")

## Plot all band depth spectra.
for (i in 1:nspectra(spec_bd))
  plot(spec_bd, FUN = i, new = i == 1, ylim = c(0, 1))

  
##-------------------------------------
## Case study: Chlorophyll: Figure 5
##-------------------------------------
## 
  
## Small helper function to pretty format a, b, c ... of subfigures in
## the corner of the plot.
includeLetter <- function(
  letter,                  ## Text to print to plot. If integer value is passed, 
                           ## the i-th letter in alphabet is printed
  position = "upperleft",  ## Position of text (May be one of "upperleft", 
                           ## "upperright", "lowerleft" or "lowerright")
  xdis = .05,              ## Distance from left/right border of plot in percent
  ydis = NA,               ## Distance from upper/lower border of plot in percent
                           ## If NA, ydis is calculated relatively to xdis regarding 
                           ## width/height of the plot
  ...                      ## Further arguements passed to function text
) {
  t <- if (is.numeric(letter)) letters[letter]
       else letter
  
  p <- par()$usr
  
  if (is.na(ydis)) {
    f <- par()$fin
    ydis <- xdis*f[1]/f[2]
  }
  x <- switch(position,
              upperleft  = c(p[1]+(p[2]-p[1])*xdis, p[4]-(p[4]-p[3])*ydis),
              upperright = c(p[2]-(p[2]-p[1])*xdis, p[4]-(p[4]-p[3])*ydis),
              lowerleft  = c(p[1]+(p[2]-p[1])*xdis, p[3]+(p[4]-p[3])*ydis),
              lowerright = c(p[2]-(p[2]-p[1])*xdis, p[3]+(p[4]-p[3])*ydis))
  
  text(x[1], x[2], t, ...)
  
  invisible(x)
}

## Reset plot interface.
par(mfrow = c(1, 1))

## Plot the scatterplot.
lim <- range(c(range(estimates), range(measured)))
plot(estimates ~ measured, ylim = lim, xlim = lim, pch  = 19,
     xlab = expression(paste("Measured chlorophyll content (", 
                             mu, plain(g)~plain(cm)^{-2}, ")", 
                             sep = "")),
     ylab = "")
lines(lim, lim, lty = "dotted")
includeLetter(substitute(atop(RMSE == x, R^2 == y), 
                         list(x = rmse_val, 
                              y = r_squared)),
              xdis = 0.15, ydis = 0.1)
title(ylab = expression(paste("Estimated chlorophyll content (", 
                              mu, plain(g)~plain(cm)^{-2}, ")", 
                              sep = "")),
      line = 2.5)

##-------------------------------------
## Case study: Chlorophyll: Table 2
##-------------------------------------
##
## Load package xtable (install if missing).
if (!require("xtable")) {
    install.packages("xtable")
    library("xtable")
}

## Define header of the table.
header <- paste("\\toprule\n ", 
                paste("ID",
                      "\\multicolumn{2}{c}{Area}",
                      "\\multicolumn{2}{c}{Width}",
                      "\\multicolumn{2}{c}{Max.~Band}",
                      "\\multicolumn{4}{c}{Dist.~to Gauss Curve}\\\\\n ",
                      collapse = "", sep = "&"),
                paste("&&&",
                      "&\\multicolumn{2}{c}{Width}",
                      "\\multicolumn{2}{c}{$f_{450}$}",
                      "\\multicolumn{2}{c}{$f_{700}$}\\\\\n ",
                      sep = "&"),
                paste("\\cmidrule(r){2-3}\\cmidrule(r){4-5}\\",
                      "cmidrule(r){6-7}\\cmidrule(r){8-9}\\",
                      "cmidrule(r){10-11}\n ", sep = ""),
                paste("&$f_{450}$", "$f_{700}$",
                      "$f_{450}$", "$f_{700}$",
                      "$f_{450}$", "$f_{700}$",
                      "left", "right", 
                      "left", "right\n",
                      sep = "&"),
                "\\\\", "\\midrule \n", sep = "")

## Select columns that are displayed (the format of the journal does
## not allow to print all columns).
tmp <- SI(featureSpace)[, 4:ncol(SI(featureSpace))]
row.names(tmp) <- idSpeclib(featureSpace)
tmp <- cbind(data.frame(ID = 1:nrow(tmp)), 
             tmp[, c(1:4, 5, 10, 6, 11, 7, 12, 15, 16, 8, 9, 13, 14)[c(1:2, 9:16)]])

## Print the table.
print(xtable(tmp, digits = c(0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2)), 
      floating = FALSE,
      size = "tiny", 
      include.rownames = FALSE, 
      include.colnames = FALSE, 
      caption.placement = "top", 
      hline.after = NULL, 
      add.to.row = list(pos = list(-1, nrow(tmp)), 
                        command = c(header, "\\bottomrule \n")))

##-------------------------------------
## Case study: Cancer
##-------------------------------------
##

## Load dataset.
data("cancer_spectra", package = "hsdar")

## Plot spectra from cancerous and non-cancerous parts separately.
plot(subset(cancer_spectra, infected == 1), ylim = c(0, 400), 
     col = "darkred")
plot(subset(cancer_spectra, infected == 0), new = FALSE)

## Convert response variable to factor.
SI(cancer_spectra)$infected <- as.factor(SI(cancer_spectra)$infected)

## Calculate all possible normalized ratio indices.
nri_data <- nri(cancer_spectra, recursive = TRUE)

## In general, if many bands and/or many observations are available in
## the dataset, the following process would take a long time. In this
## case one could easily parallelize the execution of the following code
## by uncommenting and running:
# n_cores <- 2
# 
# # For Linux/Mac OS
# library("doMC")
# registerDoMC(n_cores)
# 
# # For Windows:
# library("doMPI")
# cl <- startMPIcluster(count = n_cores)
# registerDoMPI(cl)

## Fitting Generalized Linear Models between each index and the
## decision if the respective part of the tissue is cancerous or not.
glm_models <- glm.nri(infected ~ nri_data, preddata = cancer_spectra, 
  family = binomial)

## Plot the result of the glms in a 2D-correlogram (Figure 7).
par(mar = c(5.1, 4.1, 4.1, 5))

## Define colors
cols <- c(rgb(82, 0, 0, maxColorValue = 255), 
          rgb(179, 179, 179, maxColorValue = 255), 
          rgb(0, 0, 82, maxColorValue = 255))

## Get symmetric ranges of values (required for a symmetric colorrange).
range <- max(abs(as.array(glm_models@multivariate$z.value)[, , 2]), na.rm = TRUE)
range <- c(-range, range)

## Plot the lower triang (z-values).
plot(glm_models, coefficient = "z.value", legend = "outer",
     range = range, colspace = "rgb", col = cols)
## Plot the upper triang (p-values).
plot(glm_models, coefficient = "p.value", uppertriang = TRUE, 
     colspace = "rgb", col = cols[c(1, 3, 2)], zlog = TRUE)

## Plot the diagonal line and the texts to the legends.
lines(c(300, 1705), c(300, 1705))
mtext("p-value", 3, 3.2)
mtext("z-value", 4, 3)

## Get worst and best models.
n_best <- nri_best_performance(glm_models, n = 1, coefficient = "z.value", 
                     predictor = 2, abs = TRUE, findMax = TRUE, family = binomial)
n_worst <- nri_best_performance(glm_models, n = 1, coefficient = "z.value", 
                     predictor = 2, abs = TRUE, findMax = FALSE, family = binomial)

## Mark worst and best models in the plot.
mark_nri_best_performance(n_best, glm_models, border = "white", lwd = 2)
mark_nri_best_performance(n_worst, glm_models, border = "black",
                          lwd = 0.5, density = 20)
mark_nri_best_performance(n_worst, glm_models, border = "black",
                          lwd = 0.5, density = 20, angle = 45+90)
mark_nri_best_performance(n_worst, glm_models, border = "black", lwd = 2)

mark_nri_best_performance(n_best, glm_models, border = "white",
                          lwd = 2, uppertriang = TRUE)
mark_nri_best_performance(n_worst, glm_models, border = "black", 
                          lwd = 0.5, uppertriang = TRUE, density = 20)
mark_nri_best_performance(n_worst, glm_models, border = "black", 
                          lwd = 0.5, uppertriang = TRUE, density = 20, angle = 45+90)
mark_nri_best_performance(n_worst, glm_models, border = "black", 
                          lwd = 2, uppertriang = TRUE)
## End of Figure creation.


## Since we want to use a multivariate approach to classify the
## spectra into cancerous and non-cancerous (in this case using SVM
## and neuronal networks), we have to define the response and
## additional predictor variables. Please note that all nri-values are
## used as predictors in default.
nri_data <- setResponse(nri_data, "infected")
nri_data <- setPredictor(nri_data, "stage")

## Perform a recursive feature selection to reduce the number of
## predictors and to exclude those which are heavily correlated.
sel_feat <- rfe(nri_data, cutoff = 0.9)

## Define training and cross validation for SVM and nnet model tuning.
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, 
  savePredictions = "final")    

## Train both models.
set.seed(seed)
rfe_trained_svm  <- train(sel_feat, trControl = ctrl, 
  importance = TRUE, method = "svmRadial")
set.seed(seed)
rfe_trained_nnet <- train(sel_feat, trControl = ctrl, 
  importance = TRUE, method = "nnet", seed = 1104)

##-------------------------------------
## Case study: Table 3, Error Matrix
##-------------------------------------
##

## Construct a confusion matrix.
em_svm  <- confusionMatrix(rfe_trained_svm$pred$pred,
                           reference = rfe_trained_svm$pred$obs)$table

## Print the table.
em <- as.data.frame(matrix(em_svm/ctrl$repeats, ncol = 2))
row.names(em) <- c("Infected", "not Infected")
print(xtable(em), 
      floating = FALSE, 
      include.colnames = FALSE, 
      hline.after = NULL, 
      add.to.row = list(pos = list(-1, 2), 
                        command = c(paste("\\toprule \n", 
                                          "& Infected & not Infected ", 
                                          "\\\\\\midrule \n"), 
                                    "\\bottomrule \n")))

## Calculate overall accuracy and percentage of false classified spectra.
(oa_svm <- round((1- (em_svm[2, 1] + em_svm[1, 2])/sum(em_svm))*100, 2))
fc_svm <- em_svm[2, 1] + em_svm[1, 2]

## Same as above but for the nnet-model.
em_nnet <- confusionMatrix(rfe_trained_nnet$pred$pred,
                           reference = rfe_trained_nnet$pred$obs)$table
em <- as.data.frame(matrix(em_nnet/ctrl$repeats, ncol = 2))
row.names(em) <- c("Infected", "not Infected")
print(xtable(em), 
      floating = FALSE, 
      include.colnames = FALSE, 
      hline.after = NULL, 
      add.to.row = list(pos = list(-1, 2), 
                        command = c(paste("\\toprule \n", 
                                          "& Infected & not Infected ", 
                                          "\\\\\\midrule \n"), 
                                    "\\bottomrule \n")))
(oa_nnet <- round((1- (em_nnet[2, 1] + em_nnet[1, 2])/sum(em_nnet))*100, 2))
fc_nnet <- em_nnet[2, 1] + em_nnet[1, 2]