# Averaged gene expressions for regression
library(methods)
library(Matrix)

avg <- function(X, y, hc = NULL, method = "average", dist = c("euclidian", "correlation"), K = 10, s = c("lambda.1se","lambda.min"))
{
  
  # Dimension
  n = nrow(X)
  p = ncol(X)
  
  
  temps = rep(NA,3)
  
  if(is.null(hc))
  {
    t1 = proc.time()
    dist = match.arg(dist)
    s = match.arg(s)
        
    ##### clustering part
    require(fastcluster)
    D = matrix()
    if(dist == "euclidian")
    {
      X2 <- scale(X, center = TRUE, scale = FALSE)
      X2 = scale(X2, center = FALSE, scale = sqrt(colSums(X2^2)/n))
      
      D = dist(t(X))
    }else{
      
      # correlation based distance
      D = as.dist(1 - abs(cor(X)))  
    }
    
    
    hc = fastcluster::hclust(D, method = method)
    t2 = proc.time()
    
    temps[1] = t2[3] - t1[3]
  }

  foldcv <- rep(1:K, ceiling(n/K))[1:n]
  foldcv = foldcv[sample(n)]
  
  
  t1 = proc.time()

  ##### lasso part
  require(glmnet)

  res = c()
  cverror = rep(Inf, p)
  beta = c()
  cvlim = Inf
  for(lv in 2:p)
  {
    # partition
    group = cutree(hc, lv)
    

      Xbar = do.call(cbind, tapply(1:p, group, function(x){rowMeans(X[,x,drop=FALSE])}))
      
      rescv = cv.glmnet(Xbar, y, foldid = foldcv, nfold = K)

    
    
    indcv = which.min(rescv$cvm)
    cverror[lv] = rescv$cvm[indcv]
    
    if(cverror[lv]<cvlim)
    {
      res = rescv$glmnet.fit
      cvlim = cverror[lv]
      beta  = coef(rescv, s = s)
    }
    
  }# fin for level
  
  t2 = proc.time() 
  lvmin = which.min(cverror)
  
  temps[2] = t2[3] - t1[3] 
  
  return(list(group = cutree(hc,lvmin), res = res, beta = beta[-1], intercept=beta[1], cverror = cverror, time = temps))
}
