#------------ Reproduction Script for JSS submission "Fast Kernel Smoothing in R
#---------------------- with Applications to Projection Pursuit"

# The entire script can be run by selecting all (ctrl + A) and either
# clicking the "Run" button, or pressing ctrl + Enter You will be prompted
# for input from time-to-time.  The numerical results presented in the paper
# and referenced herein were obtained by running the script on RStudio and
# in Windows.  Slightly different outputs have been observed when running in
# Linux, despite careful setting of random seeds throughout.


run_reproduction_script <- function(Interactive = FALSE) {

  # ------------------------------------ Section 1:

  # Code from page 4: Install and load the package and check help files.  No
  # output produce, but help page for the package should be brought up.

  cat("To begin, we must first install the package, if not already on the system. We then \n
      bring up the first page of the package documentation. The code will not produce any \n
      output, but some notifications associated with the installation and loading of the \n
      package may be printed to the console. Press [Enter] to continue. \n \n")
  if (Interactive) readline()

  if (system.file(package = "FKSUM") == "") install.packages("FKSUM")
  library("FKSUM")

  # help("FKSUM")

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }
  # ------------------------------------ Section 2:

  # Code from page 6, near bottom.  The following code produces the plots in
  # Figure 1.


  cat("Next we execute the code near the bottom of page 6 in the submission, which produces \n
      Figure 1. The plots show four of the kernels used in the implementation, along \n
      with the Gaussian kernel (dotted lines) for comparison. Press [Enter] to continue. \n \n")
  if (Interactive) readline()

  op <- par(no.readonly = TRUE)
  
  par(mfrow = c(1, 4), mar = rep(2, 4))

  for (k in 1:4) {
     # plot smooth kernels with coefficients proportional to 1/k! using function
     # plot_kernel() from the package
     plot_kernel(1 / factorial(0:k), ylim = c(0, .5))
     # add Gaussian kernel for comparison
     lines(seq(-4, 4, length = 500), dnorm(seq(-4, 4, length = 500)), lty = 2)
  }

  
  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  par(op)

  # Example: Kernel Regression

  cat("We now move on to the problem of kernel regression, and the example \n
      beginning at the bottom of page 7 in the submission. As before, we \n
      we begin by sampling the data and then plotting the estimates \n
      for a range of bandwidth values. The following, using the code \n
      beginning near the bottom of page 8 in the submission, will \n
      produce the left plot in Figure 2, seen at the top of page 9 \n
      in the submission. Press [Enter] to continue. \n \n")
  if (Interactive) readline()

  # Code from page 8.  The following produces the plots in Figure 2.

  set.seed(1)
  n <- 5000
  # simulate the values of the covariate
  x <- rbeta(n, 2, 2) * 10
  # compute exact functional values and the corresponding
  # error modified responses
  fx <- 3 * sin(2 * x) + 10 * (x > 5) * (x - 5)
  y <- fx + rt(n, 3) + (rgamma(n, 2, 2) - 1) * ((x - 5)^2 + 3)
  # set bandwidth parameters and plot the estimates
  hs <- seq(.05, .25, length = 5)
  xeval <- seq(0, 10, length = 1000)
  
  op <- par(no.readonly = TRUE)
  par(mfrow = c(1, 2))
  for (h in c(0.25, 0.05)) {
    plot(x, y, xlab = "x", ylab = "y")
    fhat <- fk_sum(x, y, h, x_eval = xeval) / fk_sum(x, rep(1, n),
                                                     h, x_eval = xeval)
    lines(xeval, fhat, col = 2, lwd = 2)
    lines(xeval, 3 * sin(2 * xeval) + 10 * (xeval > 5) * (xeval - 5),
          col = 3, lwd = 2)
  }
  
  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  par(op)
  

  cat("The following will \n
      implement the the code snippet from the middle of page \n
      9 in the submission, which produces Figue 3 \n
      in the submission. Press [Enter] to continue. \n \n")

  if (Interactive) readline()

  op <- par(no.readonly = TRUE)
  
  par(mfrow = c(1, 2))
  plot(fk_regression(x, y, type = "NW"))
  lines(xeval, 3 * sin(2 * xeval) + 10 * (xeval > 5) * (xeval - 5),
           col = 3, lwd = 2)
  plot(fk_regression(x, y, type = "NW", h = "cv"))
  lines(xeval, 3 * sin(2 * xeval) + 10 * (xeval > 5) * (xeval - 5),
           col = 3, lwd = 2)

  par(op)
  
  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  # ------------------------------------ Section 3:

  # ------- section 3.1:

  # Example: Simulation

  cat("We now move on to Section 3, and the simulation example described \n
      on page 14 in the submission. The simulation compares \n
      multiple implementations of Independent Component Analysis (ICA) \n
      available through CRAN. First the packages associated with the \n
      other implementations must be loaded. The following will not \n
      produce any output, but messages associated with the installation \n
      and loading of these packages may be printed to the console. \n
      Press [Enter] to continue. \n \n")
  if (Interactive) readline()

  # Code from page 14, bottom.  The following installs and loads other
  # packages for performing independent component analysis.  It produces no
  # output, but may print messages produced by the loading of these packages

  if (system.file(package = "fastICA") == "") install.packages("fastICA")
  if (system.file(package = "ProDenICA") == "") install.packages("ProDenICA")
  if (system.file(package = "PearsonICA") == "") install.packages("PearsonICA")
  if (system.file(package = "JADE") == "") install.packages("JADE")

  library("fastICA")
  library("ProDenICA")
  library("PearsonICA")
  library("JADE")


  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("We can now perform the simulation study. \n
      We first perform a single run and plot the solution. Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  # The following performs a single run of the experiment using only the
  # fk_ICA function, and plots the resulting solution.  The code is found at
  # the bottom of page 14, and continuing on page 15 in the submission.  The
  # code produces Figure 4 in the submission.

  set.seed(1)
  X <- matrix(0, 2000, 4)
  densities <- sample(letters[1:18], 4)
  for (i in 1:4) X[,i] <- rjordan(densities[i], 2000)
  R <- mixmat(4)
  Xx <- X %*% R
  model <- fk_ICA(Xx, 4)

  op <- par(no.readonly = TRUE)
  par(mfrow = c(4, 3), mar = c(0, 0, 3, 0))
  for (i in 1:4) {
    plot(fk_density(Xx[,i]), main = paste("Mixed component", i),
         xaxt = "n", yaxt = "n")
    plot(fk_density(model$S[,i]), main = paste("Estimated component", i),
         xaxt = "n", yaxt = "n")
    plot(fk_density(X[,i]), main = paste("True component", i),
         xaxt = "n", yaxt = "n")
  }

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  par(op)
  

  cat("We can now continue the simulation study. We first define settings for the \n
      simulation, and the define objects used to store the results. Next the \n
      simulation will be run. It will likely take \n
      a few minutes to run all 20 replications which are set up. While \n
      the following will not produce output, various Warning and Error \n
      messages may be printed to the console. These are caused by the \n
      implementation in the package ProDenICA. Press [Enter] to continue. \n \n")
  if (Interactive) readline()

  # The following performs a simulation study comparing the running times
  # and accuracy of different methods for ICA.  This will take about a
  # minute and a half, depending on the system being used.  It produces no
  # output, but may produce warnings.  These arise from the function
  # ProDenICA, in the package of the same name.

  amari_all <- list(fk = c(), fk_bin = c(), fast = c(), Pearson = c(), ProDen = c(), Jade = c())
  t_all <- list(fk = c(), fk_bin = c(), fast = c(), Pearson = c(), ProDen = c(), Jade = c())


  for (rep in 1:20) {
    set.seed(rep)
    # matrix X contains the independent sources signals
    X <- matrix(0, 2000, 4)
    # the components have marginal distributions from a
    # sample of the densities from Bach and Jordan
    densities <- sample(letters[1:18], 4)
    for (i in 1:4) X[,i] <- rjordan(densities[i], 2000)
    # R represents the mixing matrix, and Xx the mixed components
    # given to the algorithms
    R <- mixmat(4)
    Xx <- X %*% R
    # fit the different models and store their running times and
    # performance. ProDenICA is wrapped in try() since the function
    # occasional produces errors.

    t_all$fk[rep] <- system.time(model <- fk_ICA(Xx, 4))[1]
    amari_all$fk[rep] <- amari(model$K %*% model$W, solve(R))
    t_all$fk_bin[rep] <- system.time(model <- fk_ICA(Xx, 4, nbin = 2000))[1]
    amari_all$fk_bin[rep] <- amari(model$K %*% model$W, solve(R))
    t_all$fast[rep] <- system.time(model <- fastICA(Xx, 4))[1]
    amari_all$fast[rep] <- amari(model$K %*% model$W, solve(R))
    t_all$ProDen[rep] <- system.time(model <- try(suppressWarnings(ProDenICA(Xx, 4, whiten = TRUE))))[1]
    if (inherits(model, "try-error")) {
      t_all$ProDen[rep] <- NA
      amari_all$ProDen[rep] <- NA
    }
    else amari_all$ProDen[rep] <- amari(model$whitener %*% model$W, solve(R))
    t_all$Pearson[rep] <- system.time(model <- PearsonICA(Xx, 4))[1]
    amari_all$Pearson[rep] <- amari(model$W, solve(R))
    t_all$Jade[rep] <- system.time(model <- JADE(Xx))[1]
    amari_all$Jade[rep] <- amari(t(model$W), solve(R))
  }


  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("Next we plot the results of the simulations as boxplots. The following \n
      will produce Figure 5 in the submission. It should be noted that despite carefully
      controlling the random seeds, differences in the performance of ProDen and Perason
      have been observed, which we believe to be a result of random number generation outside
      R. Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  
  op <- par(no.readonly = TRUE)
  
  boxplot(amari_all)
  abline(h = median(amari_all$fk))
  points(1:6, unlist(lapply(amari_all, mean)), col = 2, lwd = 6)


  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  par(op)
  
  # Example: The Cocktail Party Problem

  cat("We now move on to a real data example, beginning \n
      at the bottom of page 16 in the submission. Here three sound \n
      signals are first augmented by an additional white-noise \n
      signal, and then given a random linear transformation (mixing). \n
      The random elements (the noise signal and mixing) are repeated \n
      twenty times, and the results (as before) stored for the different \n
      methods. We first install and load some more packages which are \n
      necessary for this example. We then load the three sound signals. \n
      The following will not produce any output, but may, as before, \n
      print messages to the console associated with the installation \n
      and loading of the packages. Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  # Code from page.  The following installs and loads a package required for
  # this example.  It then loads data from this packages.  It produces no
  # output, but may produce messages

  if (system.file(package = "tuneR") == "") install.packages("tuneR")
  library("tuneR")
  S1 <- readWave(system.file("datafiles/source5.wav", package = "JADE"))
  S2 <- readWave(system.file("datafiles/source7.wav", package = "JADE"))
  S3 <- readWave(system.file("datafiles/source9.wav", package = "JADE"))


  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("We now run the experiment, much as the previous simulation example, \n
      using the code in the main part of page 17. in the submission \n
      This will likely again take a few minutes. \n
      While again the code will not produce output, warnings and error messages \n
      could arise from the ProDenICA package. Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  # Code from page 17, middle.  The following repeatedly adds a noise
  # component to these data, and then mixes them with a random matrix.  It
  # then estimates the unmixing matrix, as in the previous simulation.  It
  # will take about two and a half minutes.  It may produce warnings, again
  # arising from ProDenICA.

  # this example is very similar to the previous one, and all code can be interpreted
  # exactly as above, except the setting of the independent source signals
  amari_all <- list(fk = c(), fk_bin = c(), fast = c(),
                    Pearson = c(), ProDen = c(), Jade = c())
  t_all <- list(fk = c(), fk_bin = c(), fast = c(),
                   Pearson = c(), ProDen = c(), Jade = c())
  for (rep in 1:20) {
      set.seed(rep)
      NOISE <- noise("white", duration = 50000)
      X <- cbind(S1@left, S2@left, S3@left, NOISE@left)
      R <- mixmat(4)
      Xx <- X %*% R
      t_all$fk[rep] <- system.time(model <- fk_ICA(Xx, 4))[1]
      amari_all$fk[rep] <- amari(model$K %*% model$W, solve(R))
      t_all$fk_bin[rep] <- system.time(model <- fk_ICA(Xx, 4, nbin = 5000))[1]
      amari_all$fk_bin[rep] <- amari(model$K %*% model$W, solve(R))
      t_all$fast[rep] <- system.time(model <- fastICA(Xx, 4))[1]
      amari_all$fast[rep] <- amari(model$K %*% model$W, solve(R))
      t_all$ProDen[rep] <- system.time(model <- suppressWarnings(ProDenICA(Xx, 4, whiten = TRUE)))[1]
      amari_all$ProDen[rep] <- amari(model$whitener %*% model$W, solve(R))
      t_all$Pearson[rep] <- system.time(model <- PearsonICA(Xx, 4))[1]
      amari_all$Pearson[rep] <- amari(model$W, solve(R))
      t_all$Jade[rep] <- system.time(model <- JADE(Xx))[1]
      amari_all$Jade[rep] <- amari(t(model$W), solve(R))
  }


  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("Again, we plot the accuracy as boxplots. \n
      The following will produce Figure 6 in the submission. As before \n
      we have observed differences in some of the competing methods desipte \n
      controlling random seeds. The general performance should, however, be \n
      similar. Press [Enter] to continue.")
  if (Interactive) readline()
  boxplot(amari_all)
  abline(h = median(amari_all$fk))
  points(1:6, unlist(lapply(amari_all, mean)), col = 2, lwd = 6)

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }


  # ------- section 3.2:

  cat("The next problem we consider is the estimation of low-density hyperplanes \n
      (Section 3.2 in the submission). As before, we first conduct a simulation \n
      study comparing the implementation in the package with the existing \n
      method. We first install and load the relevant package. As always, this \n
      produces no output but may result in messages being printed to the console. \n
      Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  # Code from page 20, upper middle.  The following will install the package
  # "PPCI".  The code produces no output, but may produce messages from the
  # installation of the package.

  if (system.file(package = "PPCI") == "") install.packages("PPCI")
  library("PPCI")

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("We now conduct the simulation study described on page 20, which will take a few minutes.\n
      Press [Enter] to continue.")
  if (Interactive) readline()
  n_dat <- 2000
  n_dim <- 10
  n_rep <- 100
  t_fk <- numeric(n_rep)
  t_pp <- numeric(n_rep)
  dens_int_fk <- numeric(n_rep)
  dens_int_pp <- numeric(n_rep)
  s_ratio_fk <- numeric(n_rep)
  s_ratio_pp <- numeric(n_rep)


  for (rep in 1:n_rep) {
    set.seed(rep)
    n_comp <- rpois(1, 4) + 2
    # to test the performance of the implementations we generate
    # clustered data from a Gaussian mixture. mu contais
    # the component means while sds contains the diagonals
    # of the component covariances. ps stored the mixing
    # proportions
    mu <- matrix(runif(n_dim * n_comp), n_comp, n_dim)
    sds <- matrix(rexp(n_dim * n_comp), n_dim, n_comp) / 7
    ps <- runif(n_comp) + .1
    ps <- ps / sum(ps)
    # the matrix I is the cluster indicator matrix which
    # allows for easy definition of the clustered data set
    # by generating the mean (M) and residual (R) components
    # separately and then adding them
    I <- t(rmultinom(n_dat, 1, ps))
    M <- I %*% mu
    R <- matrix(rnorm(n_dat * n_dim), n_dat, n_dim) * (I %*% t(sds))
    X <- M + R
    # the models are now fit and the performances stored
    t_fk[rep] <- system.time(model <- fk_mdh(X))[1]
    # the density on the hyperplane is the density of the
    # projected random variable on the optimal projection
    # model$v, evaluated at the split point model$b.
    dens_int_fk[rep] <- sum(ps * dnorm(model$b, mu %*% model$v,
                                       model$v %*% (sds * model$v)))
    # similarly the induced partition corresponds to the sides
    # of model$b on which the data lie, when projected onto model$v
    s_ratio_fk[rep] <- success_ratio(X %*% model$v < model$b[1],
                                     apply(I, 1, which.max))
    t_pp[rep] <- system.time(model <- mdh(X))[1]
    dens_int_pp[rep] <- sum(ps * dnorm(model$b, mu %*% model$v,
                                       model$v %*% (sds * model$v)))
    s_ratio_pp[rep] <- success_ratio(X %*% model$v < model$b[1],
                                     apply(I, 1, which.max))
  }

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("We now compare the performance of the two implementations based on \n
      three criteria. The first is based on the running time of the methods. \n
      The results will vary, as always, based on the implementation. However \n
      you should find that the method from the package (t_fk) runs roughly four \n
      times faster than the existing method (t_pp). The results are shown in Table 1 in the \n
      submission. Press [Enter] to continue.\n \n")
  if (Interactive) readline()
  print(colMeans(cbind(t_fk, t_pp)))

  print(colMeans(cbind(dens_int_fk, dens_int_pp)))

  print(colMeans(cbind(s_ratio_fk, s_ratio_pp)))

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }


  # Example: Digit Recognition

  cat("We now move on to a real-world data sets associated with \n
      digit recognition. The following applies both impementations \n
      to the optical recognition of handwritten digits data set. \n
      This will produce four outputs. The first is the running time \n
      of the method in the package, and will differ depending on the \n
      system being used. The second is the resulting success ratio, \n
      which should be 0.9299176. The code is found in the lower part \n
      of page 20 in the submission. Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  # Code beginning at the bottom og page 20.  The following loads the
  # optidigits data set, and then obtains minimum density hyperplanes using
  # the two implementations.

  data("optidigits", package = "PPCI")

  print(system.time(fk_sol <- fk_mdh(optidigits$x)))

  # The output of the above will differ depending on the system

  print(success_ratio(optidigits$x %*% fk_sol$v < fk_sol$b[1], optidigits$c))

  # The output of the above should be
  # [1] 0.9299176

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("The next two outputs are the same criteria for the existing method. The \n
      success ratio (second output) should be 0.9040637. Press \n
      [Enter] to continue. \n \n")
  if (Interactive) readline()
  print(system.time(pp_sol <- mdh(optidigits$x)))

  # The output of the above will differ depending on the system

  print(success_ratio(optidigits$x %*% pp_sol$v < pp_sol$b[1], optidigits$c))

  # The output of the above should be
  # [1] 0.9040637

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("We now plot the resulting hyperplane solutions, using the code \n
      from the middle of page 21 in the submission.  This will produce
      Figure 7.  Press [Enter] to continue.  \n \n")
  if (Interactive) readline()
  # Code from page 21.  The following can be used to produce Figure 7 in the
  # paper.

  plot(fk_sol)
  fk_sol$v <- pp_sol$v
  fk_sol$b <- pp_sol$b
  plot(fk_sol)

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }


  # ------- section 3.3:

  cat("The final method presented in the submission is that \n
      of projection pursuit regression, found in Section 3.3. \n
      The first pieces of code, found on pages 23 and 26, define the objectve function \n
      and its gradient. Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  # Code from page 23.  The following defines the function for computing the
  # projection index for projection pursuit regression

  phi_ppr <- function(w, X, r, h, beta) {
    n <- nrow(X)
    # compute projected covariates
    p <- X %*% w / sqrt(sum(w^2))
    # compute numerator and denominator terms
    # for fitted values
    Sr <- fk_sum(p, r, h, beta = beta) - beta[1] * r
    S1 <- fk_sum(p, rep(1, n), h, beta = beta) - beta[1]
    # buffer small values in the denominator for stability
    S1[S1 < 1e-20] <- 1e-20
    r_hat <- Sr / S1
    # compute sum-square error for the projection
    sum((r - r_hat)^2)
  }


  # Code from page 26.  The following defines a function which can be used
  # to compute the gradient of the projection index for projection pursuit
  # regression.

  dphi_ppr <- function(w, X, r, h, beta) {
    n <- nrow(X)
    # the first steps are almost exactly as in
    # the function for evaluating the objective/
    # projection index
    nw <- sqrt(sum(w^2))
    p <- X %*% w / nw
    S1 <- fk_sum(p, rep(1, n), h, beta = beta, type = "both")
    S1[, 1] <- S1[, 1] - beta[1]
    S1[S1[, 1] < 1e-20, 1] <- 1e-20
    Sr <- fk_sum(p, r, h, beta = beta, type = "both")
    Sr[, 1] <- Sr[, 1] - beta[1] * r
    r_hat <- Sr[, 1] / S1[, 1]
    # the expression for the partial derivatives of
    # the objective w.r.t. the projected covariates
    # has a complex form, described in the paper,
    # and so is broken into three components T1, T2 and T3
    T1 <- fk_sum(p, r_hat * (r_hat - r) / S1[, 1], h,
                 beta = beta, type = "dksum")
    T2 <- r * fk_sum(p, (r_hat - r) / S1[, 1], h,
                     beta = beta, type = "dksum")
    T3 <- (r_hat - r) / S1[, 1] * (r_hat * S1[, 2] - Sr[, 2])
    # dphi_dp is the vector of partial derivatives mentioned above
    dphi_dp <- (T1 - T2 + T3) * 2 / h
    # dp_dw is the matrix of partial derivatives of the projected
    # points w.r.t. the elements of the projection vector
    dp_dw <- (X / nw - p %*% t(w) / nw^2)
    # the gradient of the objective is the product of these
    # as a result of the chain rule
    c(t(dphi_dp) %*% dp_dw)
  }

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("We now verify the correctness of the gradient function by comparing its \n
      output with that of a numerically approximated gradient based on \n
      finite differences. First we simulate some data on which to test \n
      this correctness, using the code at the bottom of page 26, and \n
      continuing on page 27. Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  # Code beginning at the bottom og page 26.  The following simulates pairs
  # of covariates and response values for non-linear multivariate regression

  set.seed(1)
  n_dat <- 1000
  n_dim <- 10
  # simulate matrix of covariates
  X <- matrix(rnorm(n_dat * n_dim), n_dat, n_dim) %*%
    matrix(2 * runif(n_dim^2) - 1, n_dim, n_dim)

  # simulate random "true" projection vectors
  wtrue1 <- rnorm(n_dim)
  wtrue2 <- rnorm(n_dim)

  # define response a non-linear function of the two sets of projected
  # covariates, along the two projection vectors
  y <- (X %*% wtrue1 > 1) * (X %*% wtrue1 - 1) + tanh(X %*% wtrue2 / 2) *
    (X %*% wtrue1) + (X %*% (wtrue1 - wtrue2) / 5)^2 + rnorm(n_dat)


  # Code from page 37, middle.  The following computes the analytical and
  # numerically approximated gradients evaluated at a randomly generated
  # projection vector.

  w <- rnorm(n_dim)
  h <- runif(1)
  beta <- c(0.25, 0.25)
  # to compute the finite differences, we add and subtract 1e-5 from each
  # element in the vector in turn. This can easily be achieved using the
  # rows of the matrix I x 1e-5, where I is the identity matrix
  Eh <- diag(n_dim) * 1e-5
  # dphi_approx is the approximated gradient
  dphi_approx <- apply(Eh, 1, function(eh) (phi_ppr(w + eh, X, y, h, beta)
                        - phi_ppr(w - eh, X, y, h, beta)) / 2e-5)
  # dphi is the exact gradient provided our gradient function is correct
  dphi <- dphi_ppr(w, X, y, h, beta)

  # we check how close these are, in an absolute and relative sense

  print(max(abs(dphi / dphi_approx - 1)))

  # The output of the above should be
  # [1] 7.004954e-10

  print(max(abs(dphi - dphi_approx)))

  # The output of the above should be
  # [1] 1.47976e-07


  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("We now define a function which will perform the projection pursuit, using \n
      the code starting at the bottom og page 27. We then apply this to the \n
      simulated data we generated before, and plot the results (code from middle 'n
      of page 28). This will produce Figure 8. Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  # Code from page 27.  The following defines a function to perform
  # projection pursuit for regression.

  ppr_nw <- function(X, r, w = NULL) {
    n <- nrow(X)
    d <- ncol(X)
    # to illustrate the capability of finding a good solution from a poor
    # initialisation, we allow for a user-specified initialisation. 
    # Otherwise we use a linear model with a small ridge penalty to
    # initialise.
    if (is.null(w)) w <- solve(t(X) %*% X + .01 * diag(d)) %*% t(X) %*% r
    # a simple oversmoothing heuristic for the bandiwdth to be used for
    # projection pursuit
    h <- sqrt(eigen(cov(X))$values[1]) / n^.2
    # we use the base function optim(), provided with the objective and
    # gradient we defined above, to perform the actual optimisation
    w <- optim(w, phi_ppr, dphi_ppr, X, r, h, c(.25, .25),
               method = "L-BFGS-B")$par
    # optim will not enforce the unit norm condition during optimisation, so
    # we do so after termination
    w <- w / sqrt(sum(w^2))
    # we now compute the final bandwidth for deteriming the fitted model and
    # for prediction using cross-validation
    loo_sse <- function(h) phi_ppr(w, X, r, h, c(.25, .25))
    h <- optimise(loo_sse, c(h/50, h))$minimum
    list(w = w, h = h)
  }

  # Code from page 28.  The following obtains the optimal projection for
  # regression, and plots the resulting fit.

  mu <- mean(y)
  r <- y - mu

  w_opt <- ppr_nw(X, r, w = w)

  p <- X %*% w_opt$w
  S1 <- fk_sum(p, rep(1, n_dat), w_opt$h)
  Sr <- fk_sum(p, r, w_opt$h)
  fitted <- mu + Sr / S1

  # The following can be used to produce Figure 8 from the paper.

  op <- par(no.readonly = TRUE)
  
  par(mfrow = c(1, 3))
  plot(X %*% w, y, main = "Response against initial projection",
       xlab = "optimal projection", ylab = "y")
  plot(X %*% w_opt$w, y, main = "Response against optimal projection",
       xlab = "optimal projection", ylab = "y")
  points(X %*% w_opt$w, fitted, col = 2)
  plot(fitted, y, main = "Response against fitted values",
       xlab = "fitted", ylab = "y")
  abline(0, 1, lwd = 2, col = 2)

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  par(op)

  # Example: Simulation


  cat("As in the previous two cases, we begin with a simulation study. \n
      The example begins at the bottom of page 28 in the submission. \n
      The first part of this example uses a lower dimensional example. \n
      The following will simulate 50 data sets in the \n
      same manner as we did above to test the gradient. It then fits \n
      PPR models using the functions defined above and the existing \n
      implementation from R's base stats package. As always, we store \n
      performance criteria, including running time and accuracy of the \n
      ouputs. Press [Enter] to continue. This will take about a minute. \n \n")
  if (Interactive) readline()

  n_rep <- 50
  t_stats <- numeric(n_rep)
  t_nw <- numeric(n_rep)
  R2_stats <- numeric(n_rep)
  R2_nw <- numeric(n_rep)
  for (rep in 1:n_rep) {
    set.seed(rep)
    # the simulation of the data is as before.
    # Considerable variation in the difficulty
    # of the problems will come primarily from
    # the relative orientations of the two "true"
    # projection vectors, and the corresponding
    # projections of the covariates.
    X <- matrix(rnorm(n_dat * n_dim), n_dat, n_dim) %*%
      matrix(2 * runif(n_dim * n_dim) - 1, n_dim, n_dim)
    wtrue1 <- rnorm(n_dim)
    wtrue2 <- rnorm(n_dim)
    y <- (X %*% wtrue1 > 1) * (X %*% wtrue1 - 1) + tanh(X %*% wtrue2 / 2) *
      (X %*% wtrue1) + (X %*% (wtrue1 - wtrue2) / 5)^2 + rnorm(n_dat)
    t_stats[rep] <- system.time(model <- ppr(X[1:(n_dat / 2),],
                                             y[1:(n_dat / 2)], nterms = 1))[1]
    yhat <- predict(model, X[(n_dat / 2 + 1):n_dat,])
    R2_stats[rep] <- 1 - mean((yhat - y[(n_dat / 2 + 1):n_dat])^2) /
      var(y[(n_dat / 2 + 1):n_dat])
    t_nw[rep] <- system.time(model <- ppr_nw(X[1:(n_dat / 2),],
                                             y[1:(n_dat / 2)] - mean(y[1:(n_dat / 2)])))[1]
    # the prediction for the test data using the method we constructed above
    # is done explicitly, using the fitted values from the training data for
    # smoothing along the optimal projection.
    p <- X[1:(n_dat / 2),] %*% model$w
    ptest <- X[(n_dat / 2 + 1):n_dat,] %*% model$w
    S1 <- fk_sum(p, rep(1, n_dat / 2), model$h, x_eval = ptest)
    Sr <- fk_sum(p, y[1:(n_dat / 2)] - mean(y[1:(n_dat / 2)]), model$h,
                 x_eval = ptest)
    yhat <- mean(y[1:(n_dat / 2)]) + Sr / S1
    R2_nw[rep] <- 1 - mean((yhat - y[(n_dat / 2 + 1):n_dat])^2) /
      var(y[(n_dat / 2 + 1):n_dat])
  }

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("We can now compare the results. The following prints the \n
      average running times of the methods, and the values will \n
      differ depending on the system being used. You should see \n
      that the existing method runs much faster on these data \n
      than does the method in the package (t_stats << t_nw). \n
      Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  print(colMeans(cbind(t_stats, t_nw)))

  # The output of the above will differ depending on the system being used.

  cat("The other performance measure is the estimated coefficient \n
      of determination based on a test set from the same distributon \n
      as the training data. The values should be \n \n
      R2_stats R2_nw \n
      0.6948070 0.6493921 \n \n
      Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  print(colMeans(cbind(R2_stats, R2_nw)))

  # The output of the above should be
  # R2_stats R2_nw
  # 0.6948070 0.6493921

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("We now do the same as above, but for a much larger problem, \n
      both in terms of number of data and number of dimensions/covariates. \n
      Although we now do fewer repetitions, this \n
      will take longer than the last example, likely a few minutes. \n
      Press [Enter] to continue. \n \n")
  if (Interactive) readline()


  n_dat <- 5000
  n_dim <- 200
  n_rep <- 20
  t_stats <- numeric(n_rep)
  t_nw <- numeric(n_rep)
  R2_stats <- numeric(n_rep)
  R2_nw <- numeric(n_rep)
  for (rep in 1:n_rep) {
    set.seed(rep)
    X <- matrix(rnorm(n_dat * n_dim), n_dat, n_dim) %*%
      matrix(2 * runif(n_dim * n_dim) - 1, n_dim, n_dim)
    wtrue1 <- rnorm(n_dim)
    wtrue2 <- rnorm(n_dim)
    y <- (X %*% wtrue1 > 1) * (X %*% wtrue1 - 1) + tanh(X %*% wtrue2 / 2) *
      (X %*% wtrue1) + (X %*% (wtrue1 - wtrue2) / 5)^2 + rnorm(n_dat)
    t_stats[rep] <- system.time(model <- ppr(X[1:(n_dat / 2),],
                                             y[1:(n_dat / 2)], nterms = 1))[1]
    yhat <- predict(model, X[(n_dat / 2 + 1):n_dat,])
    R2_stats[rep] <- 1 - mean((yhat - y[(n_dat / 2 + 1):n_dat])^2) /
      var(y[(n_dat / 2 + 1):n_dat])
    t_nw[rep] <- system.time(model <- ppr_nw(X[1:(n_dat / 2),],
                                             y[1:(n_dat / 2)] - mean(y[1:(n_dat / 2)])))[1]
    p <- X[1:(n_dat / 2),] %*% model$w
    ptest <- X[(n_dat / 2 + 1):n_dat,] %*% model$w
    S1 <- fk_sum(p, rep(1, n_dat / 2), model$h, x_eval = ptest)
    Sr <- fk_sum(p, y[1:(n_dat / 2)] - mean(y[1:(n_dat / 2)]), model$h,
                 x_eval = ptest)
    yhat <- mean(y[1:(n_dat / 2)]) + Sr / S1
    R2_nw[rep] <- 1 - mean((yhat - y[(n_dat / 2 + 1):n_dat])^2) /
      var(y[(n_dat / 2 + 1):n_dat])
  }

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("As before, we now investigate the running time and the estimated \n
      coefficient of determination. The running times should show that now \n
      the method in the package is slightly faster than the existing \n
      implementation (t_nw < t_stats). We have observed slight differences \n
      in the average coefficients of determination on different systems, \n
      but should be roughly \n \n
      R2_stats R2_nw \n
      -0.7454252  0.7890950 \n \n
      Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  print(colMeans(cbind(t_stats, t_nw)))

  # The output of the above will differ depending on the system being used.

  print(colMeans(cbind(R2_stats, R2_nw)))

  # The output of the above should be roughly
  # R2_stats R2_nw
  # -0.8266645  0.7890161


  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  # Example: Baseball Salaries

  cat("The final example in the submission begins in the \n
      lower half of page 29. We begin by loading a relevant \n
      package and defining the data set on which we apply the \n
      two methods. The following will not produce any output \n
      but messages associated with the installation and loading \n
      of the package may be displayed. Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  # Code from page 29, near bottom. The following loads the "ISLR" package
  # and sets up the data for the example.

  if (system.file(package = "ISLR") == "") install.packages("ISLR")
  library("ISLR")

  X <- as.matrix(Hitters[! is.na(Hitters[, 19]), c(1:13, 16:18)])
  y <- Hitters[! is.na(Hitters[, 19]), 19]

  if (Interactive) {
    cat("Press [Enter] to continue. \n \n")
    readline()
  }

  cat("In the following we repeatedly split the data into training and test \n
      sets, and fit models with the implementation we defined above and the \n
      existing implementation. We then consider the results using the estimated \n
      coefficient of determination. The following will run this experiment, and \n
      print these average coefficients. Because we also now include models with \n
      two components, for values will be printed, which should be \n \n
      R2_stats_1 R2_stats_2 R2_nw_1 R2_nw_2 \n
      0.3350620  0.2966666  0.3720452  0.4322518  \n \n
      Press [Enter] to continue. \n \n")
  if (Interactive) readline()
  # Code from main part of page 30.  The following repeatedly splits the
  # data into training and test sets, and fits regression models using the
  # two implementations, as before.  It then computes the mean estimated
  # coefficients of determination.

  n_rep <- 50
  R2_stats_1 <- numeric(n_rep)
  R2_stats_2 <- numeric(n_rep)
  R2_nw_1 <- numeric(n_rep)
  R2_nw_2 <- numeric(n_rep)

  for (rep in 1:n_rep) {
    set.seed(rep)

    # sample training indices
    sm <- sample(1:nrow(X), floor(.7 * nrow(X)))
    # fit model using implementation in stats package
    model <- ppr(X[sm,], y[sm], nterms = 1)
    # compute coefficient of determination on test set
    yhat <- predict(model, X[-sm,])
    R2_stats_1[rep] <- 1 - mean((yhat - y[-sm])^2) / var(y[-sm])
    # do as above but for a two-component model
    model <- ppr(X[sm,], y[sm], nterms = 2)
    yhat <- predict(model, X[-sm,])
    R2_stats_2[rep] <- 1 - mean((yhat - y[-sm])^2) / var(y[-sm])
    # for the implementation we constructed in the paper, we explicitly
    # perform the forward step-wise fitting, beginning with the first
    # component
    component_1 <- ppr_nw(X[sm,], y[sm])
    # we now obtain the fitted values on the test set and compute R^2
    p <- X %*% component_1$w
    S1 <- fk_sum(p[sm], rep(1, length(sm)), component_1$h, x_eval = p)
    Sy <- fk_sum(p[sm], y[sm], component_1$h, x_eval = p)
    fitted <- Sy[sm] / S1[sm]
    yhat <- Sy[-sm] / S1[-sm]
    R2_nw_1[rep] <- 1 - mean((yhat - y[-sm])^2) / var(y[-sm])
    # we now fit the second component using the partial residuals
    # and similarly produce fitted values for the test cases and
    # compute R^2
    component_2 <- ppr_nw(X[sm,], y[sm] - fitted)
    p <- X %*% component_2$w
    S1 <- fk_sum(p[sm], rep(1, length(sm)), component_2$h, x_eval = p[-sm])
    Sy <- fk_sum(p[sm], y[sm] - fitted, component_2$h, x_eval = p[-sm])
    yhat <- yhat + Sy / S1
    R2_nw_2[rep] <- 1 - mean((yhat - y[-sm])^2) / var(y[-sm])
  }

  ### we now print the accuracy of the different models

  print(colMeans(cbind(R2_stats_1, R2_stats_2, R2_nw_1, R2_nw_2)))

  # The output of the above should be
  # R2_stats_1 R2_stats_2    R2_nw_1    R2_nw_2
  # 0.3350620  0.2966666  0.3720452  0.4322518

  cat("This concludes all examples and code snippets from the submission.")
}


run_reproduction_script(TRUE)