## --------------------------------------------------------------------------- ## Replication code accompanying the paper "Hyperspectral Data Analysis ## in R: The hsdar Package" by Lukas W. Lehnert, Hanna Meyer, Wolfgang A. ## Obermeier, Brenner Silva, Bianca Regeling and Joerg Bendix ## --------------------------------------------------------------------------- ## Check if installation of required packages is needed. if (!require("hsdar")) { install.packages("hsdar") library("hsdar") } ## Set random seed seed <- 1104 set.seed(seed) ## Example how to create an object of class `Speclib' from offline data. data("spectral_data", package = "hsdar") reflectance <- spectra(spectral_data) class(reflectance) wv <- wavelength(spectral_data) class(wv) spec_lib <- speclib(reflectance, wv) class(spec_lib) ## Example how to obtain spectral data from an internet library (may take some ## time depending on the internet connection). avl <- USGS_get_available_files() grass_spectra <- USGS_retrieve_files(avl = avl, pattern = "grass-fescue") ##------------------------------------- ## Working with large raster files ##------------------------------------- ## Create an example hyperspectral image using PROSAIL based on a photograph ## of an oak leaf published on wikipedia ## Download example file from wikipedia url <- "https://upload.wikimedia.org/wikipedia/commons/thumb/7/72/Quercus_ilex_rotundifolia_leaf.jpg/800px-Quercus_ilex_rotundifolia_leaf.jpg" download.file(url, "exmpl_img.jpg") ## Read file and coerce green band to matrix img <- brick("exmpl_img.jpg") img <- as.matrix(img[[2]]) ## Define subset of image to process (to reduce computational time) xsub <- 50:200 ysub <- 150:300 ## Create list of parameters for PROSAIL parameterList <- matrix(ncol = 8, nrow = length(xsub)*length(ysub)) j <- 0 for (k in ysub) { for (i in xsub) { j <- j + 1 if (img[k, i] > 220) { ## Create soil background spectra beside the leaf parameterList[j, ] <- c(1, 40, 8, 0, 0.01, 0.009, 0, 0) } else { ## Modify chlorophyll and caroteniod content according to green band values parameterList[j, ] <- c(1, 35 + 20/(img[k, i] + 1), 5 + 40/(img[k, i] + 1), 0, 0.01, 0.009, 0, 1) } } cat(round(j/(length(xsub)*length(ysub))*100, 2), "\n") } parameterList <- as.data.frame(parameterList) names(parameterList) <- c("N", "Cab", "Car", "Cbrown", "Cw", "Cm", "psoil", "LAI") ## Run PROSAIL spec <- PROSAIL(parameterList = parameterList) ## Create hyperspectral brick spec_brick <- brick(raster(img[ysub, xsub]), values = TRUE, nl = nbands(spec)) spec_brick <- brick(spec_brick, nl = nbands(spec)) spec_brick <- setValues(spec_brick, aperm(array(t(spectra(spec)), dim = c(nbands(spec), ncol(spec_brick), nrow(spec_brick))), c(3, 2, 1))) ## Write brick to disk (used again later to demonstrate the cubePlot function) writeRaster(spec_brick, "exmpl_cube.tif") ## Apply Savitzky-Golay Filter to spectra and write filtered data to disk infile <- "exmpl_cube.tif" outfile <- "exmpl_cube_filtered.tif" ra <- speclib(infile, 400:2500) tr <- blockSize(ra) res <- writeStart(ra, outfile, nl = nbands(ra), format = "GTiff") for (i in 1:tr$n) { v1 <- getValuesBlock(ra, row=tr$row[i], nrows=tr$nrows[i]) v2 <- noiseFiltering(v1, method = "sgolay", n = 25) res <- writeValues(res, v2, tr$row[i]) } res <- writeStop(res) ##------------------------------------- ## Figure 2: Effect of noise reduction ##------------------------------------- ## ## As an example, the 10th spectrum in the `Speclib' object is ## taken. Since the effect of filtering can be better shown, if not ## the entire spectrum (1400 bands) is shown, we cut the spectrum ## additionally to the red edge part, where a high information content ## is available for vegetation remote sensing. spectral_data_1 <- spectral_data[10,400:550] ## The noise in the example dataset shipped with the package is ## already removed because otherwise the examples in the package would ## have always the noise-reduction as a first step. Thus, the data is ## artificially affected by random noise to show the effect of ## filtering in this example. The artificial noise is created using ## normally distributed random numbers. spectra(spectral_data_1) <- spectra(spectral_data_1) + spectra(spectral_data_1) * matrix(rnorm(nbands(spectral_data_1)*nspectra(spectral_data_1), mean = 0, sd = .01), ncol = nbands(spectral_data_1)) ## In the following, several filters are applied to the noisy spectral ## data. sgolay <- noiseFiltering(spectral_data_1, method = "sgolay", n = 25) lowess <- noiseFiltering(spectral_data_1, method = "lowess", f = .1) meanflt <- noiseFiltering(spectral_data_1, method = "mean", p = 5) spline <- noiseFiltering(spectral_data_1, method = "spline", n = round(nbands(spectral_data_1)/10, 0)) ## In the following, the plot is created. Please note that the ## reflectance values resulting from the lowess-, mean- and ## spline-filters are added with 10%, 20% and 30%, respectively. This ## allows a better comparison because the curves are not overlaying. par(mar = c(4, 4, 0.1, 0.1)) plot(spectral_data_1, ylim = c(10, 90)) plot(sgolay, new = FALSE, col = "red") text(labels = "Savitzky-Golay-Filter", x = wavelength(spectral_data_1)[nbands(spectral_data_1)], y = spectra(spectral_data_1)[1, nbands(spectral_data_1)] + 2, adj = c(1, 0)) spectra(spectral_data_1) <- spectra(spectral_data_1) + 10 spectra(lowess) <- spectra(lowess) + 10 plot(spectral_data_1, new = FALSE) plot(lowess, new = FALSE, col = "red") text(labels = "Lowess-Filter", x = wavelength(spectral_data_1)[nbands(spectral_data_1)], y = spectra(spectral_data_1)[1, nbands(spectral_data_1)] + 2, adj = c(1, 0)) spectra(spectral_data_1) <- spectra(spectral_data_1) + 10 spectra(meanflt) <- spectra(meanflt) + 20 plot(spectral_data_1, new = FALSE) plot(meanflt, new = FALSE, col = "red") text(labels = "Mean-Filter", x = wavelength(spectral_data_1)[nbands(spectral_data_1)], y = spectra(spectral_data_1)[1, nbands(spectral_data_1)] + 2, adj = c(1, 0)) spectra(spectral_data_1) <- spectra(spectral_data_1) + 10 spectra(spline) <- spectra(spline) + 30 plot(spectral_data_1, new = FALSE) plot(spline, new = FALSE, col = "red") text(labels = "Spline-Filter", x = wavelength(spectral_data_1)[nbands(spectral_data_1)], y = spectra(spectral_data_1)[1, nbands(spectral_data_1)] + 2, adj = c(1, 0)) ##------------------------------------- ## Figure 3b: Hyperspectral cube ##------------------------------------- ## The hyperspectral dataset used for this illustration is by far too ## large to be included in an R package. Therefore, we can only ## describe, how such a plot can be created in hsdar. Instead, we will ## use the simulated spectral data from the oak leaf as described above. ## Create an object of class `Speclib'. img_c <- speclib(infile, 400:2500) ## Plot the cube. Please note that the package "rgl" is required which will be ## installed first, if necessary. ## Check if installation of required packages is needed. if (!require("rgl")) { install.packages("rgl") library("rgl") } ## For unknown reasons, the plot may only appear after the second call ## on some systems. We will only plot the bands between 400 and 1000 nm. ## Color for the spectral information is chosen out of the rainbow-palette. cubePlot(img_c[, 1:599], sidecol = colorRamp(palette(rainbow(6)[5:1])), z_interpolate = TRUE) ##------------------------------------- ## Case study: Chlorophyll ##------------------------------------- ## Load the example dataset and apply a noise reduction filter. The ## filter would not be necessary in this example because the data is ## already filtered. However, for most datasets, this is a really ## critical step which is why we include it here. data("spectral_data", package = "hsdar") spectral_data <- noiseFiltering(spectral_data, method = "sgolay", p = 15) ## Convert the chlorophyll measurements stored in the SI from ## SPAD-values into mg. SI(spectral_data)$chlorophyll <- (117.1 * SI(spectral_data)$chlorophyll) / (148.84 - SI(spectral_data)$chlorophyll) ## Cut the data to the spectral range, where chlorophyll absorbs light. spectral_data <- spectral_data[, wavelength(spectral_data) >= 310 & wavelength(spectral_data) <= 1000] ## Transform reflectance values into band depth applying a segmented upper hull ## continuum removal. spec_bd <- transformSpeclib(spectral_data, method = "sh", out = "bd") ## Select the chlorophyll absorption features at 460 nm and 670 nm for further ## processing. featureSpace <- specfeat(spec_bd, c(460, 670)) ## Calculate all parameters from both selected features such as area, distance ## to Gauss curve etc. featureSpace <- feature_properties(featureSpace) ## Set response and additional predictor variables for random forest ## model. featureSpace <- setResponse(featureSpace, "chlorophyll") featureSpace <- setPredictor(featureSpace, names(SI(featureSpace))[5:ncol(SI(featureSpace))]) ## Define training and cross validation for random forest model tuning. ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, savePredictions = "final") ## Train random forest model. rfe_trained <- train(featureSpace, trControl = ctrl, method = "rf") ## Extract root mean square error from cross validation. estimates <- tapply(rfe_trained$pred$pred[order(rfe_trained$pred$rowIndex)], INDEX = rfe_trained$pred$rowIndex[order(rfe_trained$pred$rowIndex)], mean) measured <- SI(featureSpace)$chlorophyll r_squared <- round(summary(lm(estimates ~ measured))$r.squared, 2) rmse_val <- round(sqrt(mean((estimates - measured)^2, na.rm = TRUE)),2) ##------------------------------------- ## Case study: Chlorophyll: Figure 4 ##------------------------------------- ## par(mfrow = c(1, 2), mar = c(4, 4, 0.2, 0.2)) ## Plot the raw spectra and the gray boxes. plot(spectral_data) polygon(c(315, 530, 530, 315, 315), c(0, 0, 30, 30, 0), col = "lightgrey", lty = "blank") polygon(c(570, 700, 700, 570, 570), c(0, 0, 30, 30, 0), col = "lightgrey", lty = "blank") plot(spectral_data, new = FALSE) ## Calculate the mean + 1 sd spectrum out of all available ones in the ## `Speclib' object. me <- apply(spectral_data, 1, "mean") spectra(me) <- spectra(me) + spectra(apply(spectral_data, 1, "sd")) ## Construct the convex and the upper segmented hull. me_ch <- transformSpeclib(me, method = "ch", out = "raw") me_sh <- transformSpeclib(me, method = "sh", out = "raw") ## Plot both hulls. plot(me_ch, ispec = 1, new = FALSE, numeratepoints = FALSE, hull.style = list(lty = "dotted", col = "darkred", lwd = 2), points.style = NULL, type = "n") plot(me_sh, ispec = 1, new = FALSE, numeratepoints = FALSE, hull.style = list(lty = "dotted", col = "darkblue", lwd = 2), points.style = NULL, type = "n") ## Plot all band depth spectra. for (i in 1:nspectra(spec_bd)) plot(spec_bd, FUN = i, new = i == 1, ylim = c(0, 1)) ##------------------------------------- ## Case study: Chlorophyll: Figure 5 ##------------------------------------- ## ## Small helper function to pretty format a, b, c ... of subfigures in ## the corner of the plot. includeLetter <- function( letter, ## Text to print to plot. If integer value is passed, ## the i-th letter in alphabet is printed position = "upperleft", ## Position of text (May be one of "upperleft", ## "upperright", "lowerleft" or "lowerright") xdis = .05, ## Distance from left/right border of plot in percent ydis = NA, ## Distance from upper/lower border of plot in percent ## If NA, ydis is calculated relatively to xdis regarding ## width/height of the plot ... ## Further arguements passed to function text ) { t <- if (is.numeric(letter)) letters[letter] else letter p <- par()$usr if (is.na(ydis)) { f <- par()$fin ydis <- xdis*f[1]/f[2] } x <- switch(position, upperleft = c(p[1]+(p[2]-p[1])*xdis, p[4]-(p[4]-p[3])*ydis), upperright = c(p[2]-(p[2]-p[1])*xdis, p[4]-(p[4]-p[3])*ydis), lowerleft = c(p[1]+(p[2]-p[1])*xdis, p[3]+(p[4]-p[3])*ydis), lowerright = c(p[2]-(p[2]-p[1])*xdis, p[3]+(p[4]-p[3])*ydis)) text(x[1], x[2], t, ...) invisible(x) } ## Reset plot interface. par(mfrow = c(1, 1)) ## Plot the scatterplot. lim <- range(c(range(estimates), range(measured))) plot(estimates ~ measured, ylim = lim, xlim = lim, pch = 19, xlab = expression(paste("Measured chlorophyll content (", mu, plain(g)~plain(cm)^{-2}, ")", sep = "")), ylab = "") lines(lim, lim, lty = "dotted") includeLetter(substitute(atop(RMSE == x, R^2 == y), list(x = rmse_val, y = r_squared)), xdis = 0.15, ydis = 0.1) title(ylab = expression(paste("Estimated chlorophyll content (", mu, plain(g)~plain(cm)^{-2}, ")", sep = "")), line = 2.5) ##------------------------------------- ## Case study: Chlorophyll: Table 2 ##------------------------------------- ## ## Load package xtable (install if missing). if (!require("xtable")) { install.packages("xtable") library("xtable") } ## Define header of the table. header <- paste("\\toprule\n ", paste("ID", "\\multicolumn{2}{c}{Area}", "\\multicolumn{2}{c}{Width}", "\\multicolumn{2}{c}{Max.~Band}", "\\multicolumn{4}{c}{Dist.~to Gauss Curve}\\\\\n ", collapse = "", sep = "&"), paste("&&&", "&\\multicolumn{2}{c}{Width}", "\\multicolumn{2}{c}{$f_{450}$}", "\\multicolumn{2}{c}{$f_{700}$}\\\\\n ", sep = "&"), paste("\\cmidrule(r){2-3}\\cmidrule(r){4-5}\\", "cmidrule(r){6-7}\\cmidrule(r){8-9}\\", "cmidrule(r){10-11}\n ", sep = ""), paste("&$f_{450}$", "$f_{700}$", "$f_{450}$", "$f_{700}$", "$f_{450}$", "$f_{700}$", "left", "right", "left", "right\n", sep = "&"), "\\\\", "\\midrule \n", sep = "") ## Select columns that are displayed (the format of the journal does ## not allow to print all columns). tmp <- SI(featureSpace)[, 4:ncol(SI(featureSpace))] row.names(tmp) <- idSpeclib(featureSpace) tmp <- cbind(data.frame(ID = 1:nrow(tmp)), tmp[, c(1:4, 5, 10, 6, 11, 7, 12, 15, 16, 8, 9, 13, 14)[c(1:2, 9:16)]]) ## Print the table. print(xtable(tmp, digits = c(0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2)), floating = FALSE, size = "tiny", include.rownames = FALSE, include.colnames = FALSE, caption.placement = "top", hline.after = NULL, add.to.row = list(pos = list(-1, nrow(tmp)), command = c(header, "\\bottomrule \n"))) ##------------------------------------- ## Case study: Cancer ##------------------------------------- ## ## Load dataset. data("cancer_spectra", package = "hsdar") ## Plot spectra from cancerous and non-cancerous parts separately. plot(subset(cancer_spectra, infected == 1), ylim = c(0, 400), col = "darkred") plot(subset(cancer_spectra, infected == 0), new = FALSE) ## Convert response variable to factor. SI(cancer_spectra)$infected <- as.factor(SI(cancer_spectra)$infected) ## Calculate all possible normalized ratio indices. nri_data <- nri(cancer_spectra, recursive = TRUE) ## In general, if many bands and/or many observations are available in ## the dataset, the following process would take a long time. In this ## case one could easily parallelize the execution of the following code ## by uncommenting and running: # n_cores <- 2 # # # For Linux/Mac OS # library("doMC") # registerDoMC(n_cores) # # # For Windows: # library("doMPI") # cl <- startMPIcluster(count = n_cores) # registerDoMPI(cl) ## Fitting Generalized Linear Models between each index and the ## decision if the respective part of the tissue is cancerous or not. glm_models <- glm.nri(infected ~ nri_data, preddata = cancer_spectra, family = binomial) ## Plot the result of the glms in a 2D-correlogram (Figure 7). par(mar = c(5.1, 4.1, 4.1, 5)) ## Define colors cols <- c(rgb(82, 0, 0, maxColorValue = 255), rgb(179, 179, 179, maxColorValue = 255), rgb(0, 0, 82, maxColorValue = 255)) ## Get symmetric ranges of values (required for a symmetric colorrange). range <- max(abs(as.array(glm_models@multivariate$z.value)[, , 2]), na.rm = TRUE) range <- c(-range, range) ## Plot the lower triang (z-values). plot(glm_models, coefficient = "z.value", legend = "outer", range = range, colspace = "rgb", col = cols) ## Plot the upper triang (p-values). plot(glm_models, coefficient = "p.value", uppertriang = TRUE, colspace = "rgb", col = cols[c(1, 3, 2)], zlog = TRUE) ## Plot the diagonal line and the texts to the legends. lines(c(300, 1705), c(300, 1705)) mtext("p-value", 3, 3.2) mtext("z-value", 4, 3) ## Get worst and best models. n_best <- nri_best_performance(glm_models, n = 1, coefficient = "z.value", predictor = 2, abs = TRUE, findMax = TRUE, family = binomial) n_worst <- nri_best_performance(glm_models, n = 1, coefficient = "z.value", predictor = 2, abs = TRUE, findMax = FALSE, family = binomial) ## Mark worst and best models in the plot. mark_nri_best_performance(n_best, glm_models, border = "white", lwd = 2) mark_nri_best_performance(n_worst, glm_models, border = "black", lwd = 0.5, density = 20) mark_nri_best_performance(n_worst, glm_models, border = "black", lwd = 0.5, density = 20, angle = 45+90) mark_nri_best_performance(n_worst, glm_models, border = "black", lwd = 2) mark_nri_best_performance(n_best, glm_models, border = "white", lwd = 2, uppertriang = TRUE) mark_nri_best_performance(n_worst, glm_models, border = "black", lwd = 0.5, uppertriang = TRUE, density = 20) mark_nri_best_performance(n_worst, glm_models, border = "black", lwd = 0.5, uppertriang = TRUE, density = 20, angle = 45+90) mark_nri_best_performance(n_worst, glm_models, border = "black", lwd = 2, uppertriang = TRUE) ## End of Figure creation. ## Since we want to use a multivariate approach to classify the ## spectra into cancerous and non-cancerous (in this case using SVM ## and neuronal networks), we have to define the response and ## additional predictor variables. Please note that all nri-values are ## used as predictors in default. nri_data <- setResponse(nri_data, "infected") nri_data <- setPredictor(nri_data, "stage") ## Perform a recursive feature selection to reduce the number of ## predictors and to exclude those which are heavily correlated. sel_feat <- rfe(nri_data, cutoff = 0.9) ## Define training and cross validation for SVM and nnet model tuning. ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, savePredictions = "final") ## Train both models. set.seed(seed) rfe_trained_svm <- train(sel_feat, trControl = ctrl, importance = TRUE, method = "svmRadial") set.seed(seed) rfe_trained_nnet <- train(sel_feat, trControl = ctrl, importance = TRUE, method = "nnet", seed = 1104) ##------------------------------------- ## Case study: Table 3, Error Matrix ##------------------------------------- ## ## Construct a confusion matrix. em_svm <- confusionMatrix(rfe_trained_svm$pred$pred, reference = rfe_trained_svm$pred$obs)$table ## Print the table. em <- as.data.frame(matrix(em_svm/ctrl$repeats, ncol = 2)) row.names(em) <- c("Infected", "not Infected") print(xtable(em), floating = FALSE, include.colnames = FALSE, hline.after = NULL, add.to.row = list(pos = list(-1, 2), command = c(paste("\\toprule \n", "& Infected & not Infected ", "\\\\\\midrule \n"), "\\bottomrule \n"))) ## Calculate overall accuracy and percentage of false classified spectra. (oa_svm <- round((1- (em_svm[2, 1] + em_svm[1, 2])/sum(em_svm))*100, 2)) fc_svm <- em_svm[2, 1] + em_svm[1, 2] ## Same as above but for the nnet-model. em_nnet <- confusionMatrix(rfe_trained_nnet$pred$pred, reference = rfe_trained_nnet$pred$obs)$table em <- as.data.frame(matrix(em_nnet/ctrl$repeats, ncol = 2)) row.names(em) <- c("Infected", "not Infected") print(xtable(em), floating = FALSE, include.colnames = FALSE, hline.after = NULL, add.to.row = list(pos = list(-1, 2), command = c(paste("\\toprule \n", "& Infected & not Infected ", "\\\\\\midrule \n"), "\\bottomrule \n"))) (oa_nnet <- round((1- (em_nnet[2, 1] + em_nnet[1, 2])/sum(em_nnet))*100, 2)) fc_nnet <- em_nnet[2, 1] + em_nnet[1, 2]