/***********************************************************************

 SGNMANOVA_C.SAS

 Date:          16 May 2006 
 Written by:    Jaakko Nevalainen

 Written for:   SAS version 8.2 on Windows XP

 Description:   Macro performs a spatial sign MANOVA in a c-sample case
                for p-dimensional data.

 Input dataset: N x (p+1) data set. First column of the data set is 
                assumed to be the sample identifier coded sequentially 
                from 1 to c. There should be no missing values nor extra 
                variables. The number of observations should be greater 
                than the number of variables.

 Parameters:    dataset - name of the input data set
                nsamples - number of samples
                eps - precision for estimates (default is 1E-6)
                nperm - number of permutations for the permutation test 
                        (1000)
                nboot - number of bootstrap samples (500)
                
 Output:        Shape matrix estimate - V (p x p matrix)

                Estimates for location - mu (c x p matrix of row vectors) 
                Standard error of mus by bootstrapping - se_bs_mu (c x p)
                  and by large sample approximation - se_as_mu (c x p)
                Covariance matrices of mus by bootstrapping - cov_bs 
                  (cp x (p+1))
                  and by large sample approximation - cov_as (cp x (p+1)).
                  First column identifies the sample.

                Estimates for differences - delta 
                                            (c x p matrix of row vectors)
                Standard error of deltas by bootstrapping - se_bs_delta 
                                                           (c x p)
                  and by large sample approximation - se_as_delta 
                                                      (c x p)
 
                Value of the test statistic - Q2
                p-value based on limiting chi2-distribution - p_as
                p-value based on permutation distribution - p_perm
                Standard error of p_perm - se_p
                
 Example calls: 

 %include "c:\my sas files\sgnmanova_c.sas";
 %sgnmanova_c(mydata, 3);
 %sgnmanova_c(mydata, 3, eps=1E-9, nperm=10000, nboot=1000);

***********************************************************************/

%macro sgnmanova_c (dataset, nsamples, eps=1E-6, nperm=1000, nboot=500);

proc iml; 
  use &dataset;
  read all into data;
  p = ncol(data) - 1;

***********************************************************************;
*** Module ESTIMATE_C for estimation of V, mus and deltas           ***;
***********************************************************************;

  start estimate_c( _X_, eps, nsamples);
    p  = ncol(_X_) - 1;
    nobs = nrow (_X_);

    *** Starting values ***;
    *** Vectors of marginal medians and individual sample sizes ***;
    do j = 1 to nsamples;

      *** jth sample ***;
      _Xsample_ = ( _X_[loc(_X_[,1] = j),] ) [, (2:(p+1))]; 

      if j = 1 then do;
        _N_ = nrow(_Xsample_);
        mu = median(_Xsample_);
      end; 
      else do;
        _N_ = _N_ // nrow(_Xsample_);
        mu = mu // median(_Xsample_);
      end;
    end;
    
    *** p x p identity matrix ***;
    V = I(p);

    stop = 0;
    do until ( stop = 1 ); 

      *** Stack mus to a nobs x p matrix by their sample sizes ***;
      do j = 1 to nsamples;
        if j = 1 then _mu_ = J(_N_[1],1,1) * mu[1,];
        else _mu_ = _mu_ // (J(_N_[j],1,1) * mu[j,]);
      end;

      *** Define symmetric matrix A such that V = A*A ***;
      call eigen(evalues, evectors, V);
      A =  evectors * sqrt(diag(evalues)) * evectors`; 

      *** Standardized observations ***;
      Z = (inv(A) * ( _X_[,2:p+1] - _mu_ )`)`;

      *** Radius r = ||z|| ***;
      r = sqrt(Z[,##]);

      *** Direction vectors u = z / ||z|| ***;
      if r[1] > 0 then U = Z[1,] / r[1];
      if r[1] < 1E-9 then U = 0 * J(1,p);
      do i = 2 to nobs;
        if i > 1 & r[i] > 0    then U = U // ( Z[i,] / r[i]);
        if i > 1 & r[i] < 1E-9 then U = U // 0 * J(1,p);
      end;       

      *** Iteration step for V ***;
      V0 = V;
      V = A * (  U` * U ) * A;

      *** Scale V such that tr(V) = p ***;
      V = p / trace(V) * V;

      *** Iteration step for mu (c x p matrix)***;
      mu0 = mu;
      ndone = 0;
      do j = 1 to nsamples;

        * distances of the jth sample *;
        _r_ = r[(ndone+1):(ndone+_N_[j])];   

        * directions of the jth sample *;
        _U_ = U[(ndone+1):(ndone+_N_[j]),];  

        *** Protection against stopping iteration at a data point ***;
        if min(_r_) < 1E-9 then do;
          k = ncol(loc(_r_ < 1E-9));
          _r_[loc(_r_ < 1E-9)] = .;

          *** Vardi & Zhang weight ***;
		  _Usum_ = _U_[+,];
          _w_ = k / sqrt( _Usum_  * ( _Usum_ )`);  
        end;
        else _w_ = 0;

        *** Iteration step for the first sample mu ***;
        if j = 1 then 
          mu = max(0,1-_w_) * 
              ( mu0[1,] + ( 1 / ( (1/_r_)[:] )* A * _U_[:,]` )` ) 
             + min(1,_w_) * mu0[1,];

        *** Iteration step for the jth sample mu ***;
        else mu = mu // 
                  max(0,1-_w_) * 
                  ( mu0[j,] + ( 1 / ( (1/_r_)[:] )* A * _U_[:,]` )` ) 
                + min(1, _w_) * mu0[1,];

        ndone = ndone + _N_[j];
      end;

      *** Stopping criteria w.r.t. mu and V***;
      if trace( ( mu - mu0 )` * ( mu - mu0 ) ) < eps & 
         trace( ( V - V0 )` * ( V - V0 ) ) < eps 
         then stop = 1;
    end;    
    return(mu // V);
  finish estimate_c;

***********************************************************************;
*** Module ESTIMATE_1 for estimation of V, joint mu                 ***;
***********************************************************************;
  start estimate_1( _X_, eps);
    nobs = nrow(_X_);
    p  = ncol(_X_);

    *** Starting values ***;
    *** Vector of marginal medians ***;
    mu = median(_X_); 

    *** p x p identity matrix;
    V = I(p);

    stop = 0;
    do until ( stop = 1 ); 

      *** Define symmetric matrix A such that V = A*A ***;
      call eigen(evalues, evectors, V);
      A =  evectors * sqrt(diag(evalues)) * evectors`; 

      *** Standardized observations ***;
      Z = (inv(A) * ( _X_ - J(nobs,1,1) * mu )`)`;

      *** Radius r = ||z|| ***;
      r = sqrt(Z[,##]);

      *** Direction vectors u = z / ||z|| ***;
      if r[1] > 0 then U = Z[1,] / r[1];
      if r[1] < 1E-9 then U = 0 * J(1,p);
      do i = 2 to nobs;
        if i > 1 & r[i] > 0    then U = U // ( Z[i,] / r[i]);
        if i > 1 & r[i] < 1E-9 then U = U // 0 * J(1,p);
      end;       

      *** Iteration step for V ***;
      V0 = V;
      V = A * (  U` * U ) * A;

      *** Scale V such that tr(V) = p ***;
      V = p / trace(V) * V;

      *** Protection against stopping iteration at a data point ***;
      if min(r) < 1E-9 then do;
        k = ncol(loc(r < 1E-9));
        r[loc(r < 1E-9)] = .;
        w = k / sqrt( U[+,] * ( U[+,] )`); *** Vardi & Zhang weight ***;
      end;
      else w = 0;

      *** Iteration step for mu ***;
      mu0 = mu;
      mu =  max(0, 1-w) * ( mu + ( 1 / ( (1/r)[:] ) * A * U[:,]` )` ) 
          + min(1, w) * mu;

      *** Stopping criteria w.r.t. mu and V***;
      if trace( ( mu - mu0 )` * ( mu - mu0 ) ) < eps & 
         trace( ( V - V0 )` * ( V - V0 ) ) < eps 
         then stop = 1;
    end;
    return(mu // V);
  finish estimate_1;

***********************************************************************;
*** Module TEST for estimation of mu, V under the null hypothesis,  ***;
*** and for the calculation of the test statistic.                  ***;
***********************************************************************;
  start test_c( _X_, eps, nperm);
    p = ncol(_X_) - 1;
    nobs = nrow(_X_);

    *** Null case mu and V ***;
    theta0 = estimate_1(_X_[,2:p+1], eps);
    _V_ = theta0[2:(p+1),];
    _mu_ = theta0[1,];

    *** Define symmetric matrix A such that V = A*A ***;
    call eigen(evalues, evectors, _V_);
    A =  evectors * sqrt(diag(evalues)) * evectors`; 

    *** Null case standardized observations ***;
    _Z_ = (( inv(A) * ( _X_[,2:(p+1)] - J(nobs,1,1)*_mu_ )` )`); 

    *** Null case length ||z|| ***;
    _r_ = sqrt(_Z_[,##]);

    *** Null case direction z / ||z|| ***;
    if _r_[1] > 0 then _U_ = _Z_[1,] / _r_[1];
    if _r_[1] < 1E-9 then _U_ = 0 * J(1,p);
    do i = 2 to nobs;
      if i > 1 & _r_[i] > 0    then _U_ = _U_ // ( _Z_[i,] / _r_[i]);
      if i > 1 & _r_[i] < 1E-9 then _U_ = _U_ // 0 * J(1,p);
    end; 

    *** Add sample identification ***;
    _U_ = _X_[,1]||_U_; 

    *** p-value based on the limiting distribution ***;
    do j = 1 to &nsamples;
      *** jth sample ***;
      _Usample_ = ( _U_[loc(_U_[,1] = j),] ) [, 2:(p+1)];
      if j = 1 then 
        Qj = nrow(_Usample_) * (_Usample_[:,])*(_Usample_[:,])`;
      else 
        Qj = Qj // (nrow(_Usample_) * 
                   ((_Usample_[:,])*(_Usample_[:,])`));	  
    end;
    Q2 = p * Qj[+];
    p_as = 1 - probchi(Q2, (p * (&nsamples - 1)));

    *** p-value based on permutations of the standardized signs ***;
    do i = 1 to nperm;

      *** Permute the elements of the sample assignment vector ***;
      *** First column = sample identificator ***;
      _U1_ = _U_[,1];  
      do until ( nrow(_U1_) = 1 );
        *** kth observation selected randomly***;
        k = int(ranuni(8371231)*nrow(_U1_)) + 1;  

        *** gammavec is the new (random) sample assignment vector ***;
        if nrow(_U1_) = nobs then gammavec = _U1_[k];
        else gammavec = gammavec // _U1_[k];
		
        if k = 1 then do;
          if nrow(_U1_ > 1) then _U1_ = _U1_[2:(nrow(_U1_))];
          else goto finalize;
        end;
        else if ( k < nrow(_U1_))      
          then _U1_ = _U1_[1:(k-1)]//_U1_[(k+1):(nrow(_U1_))]; 
        else if k = nrow(_U1_) 
          then _U1_ = _U1_[1:(nrow(_U1_)-1)];
      end;
      finalize:
      gammavec = gammavec // _U1_;
      _U_ = gammavec||_U_[,2:(p+1)];

      do j = 1 to &nsamples;
        *** jth sample ***;
        _Usample_ = ( _U_[loc(_U_[,1] = j),] ) [, 2:(p+1)]; 

        if j = 1 then 
          Qj = nrow(_Usample_) * ((_Usample_[:,])*(_Usample_[:,])`);
        else 
          Qj = Qj // (nrow(_Usample_) * 
                     ((_Usample_[:,])*(_Usample_[:,])`));
      end;
      
      Q2_p = p * Qj[+];

      *** Store values of the test statistic***;
      if i = 1 then Q2_dist = Q2_p;
      else Q2_dist = Q2_dist // Q2_p;
    end;

    *** Estimated p-value based on permutated distribution of Q2 ***;
    p_perm = 1 - (( J(nperm, 1, 1) * Q2 ) >= Q2_dist)[:];

    *** Its precision from binomial distribution ***; 
    se_p = sqrt(p_perm * ( 1 - p_perm ) / nperm); 

    return(Q2||p_as||p_perm||se_p);
  finish test_c;

***********************************************************************;
*** Module BOOTSTRAP for estimation of accuracy                     ***;
***********************************************************************;
  start bootstrap(_X_, _theta_, eps, nboot, nsamples);
    p = ncol(_X_) - 1;
    _V_ = _theta_[(nsamples+1):(nsamples+p),];
    
    *** Define symmetric matrix A such that V = A*A ***;
    call eigen(evalues, evectors, _V_);
    A =  evectors * sqrt(diag(evalues)) * evectors`; 

    *** Standardized observations (iid) ***;
    do j = 1 to nsamples;
      _Xsample_ = (_X_[loc(_X_[,1] = j),])[,(2:(p+1))];
      if j = 1 
        then _Z_=(inv(A)*(_Xsample_ -
                          repeat(_theta_[j,], nrow(_Xsample_),1))`)`;
      else 
        _Z_=_Z_//(inv(A)*(_Xsample_ - 
                          repeat(_theta_[j,], nrow(_Xsample_),1))`)`;
    end;
    
    *** Generate bootstrap samples and the corresponding estimates ***;
    do i = 1 to nboot;
      _nobs_ = nrow(_Z_);

      *** Generate a bootstrap sample ***; 
      do k = 1 to _nobs_;
        obs = int(_nobs_ * ranuni(9388427)) + 1;
        if k = 1 then _Zboot_ = _Z_[obs,];
        else _Zboot_ = _Zboot_//_Z_[obs,];
      end;

      *** Add a dummy sample identification ***;
      _Zboot_ = _X_[,1] || _Zboot_;

      *** Return to original measurement scale by sample ***;
      do j = 1 to nsamples;
        *** jth sample ***;	 
        _Zsample_ = ( _Zboot_[loc(_Zboot_[,1] = j),] );
        _nobs_ = nrow(_Zsample_);

        _Ysample_ = _Zsample_[,1] || 
                   ( repeat(_theta_[j,], _nobs_, 1) + 
                     _Zsample_[,2:(p+1)] * A );  
        
        if j = 1 then _Yboot_ = _Ysample_;
        else _Yboot_ = _Yboot_ // _Ysample_;
      end;

      *** Estimate parameters ***;
      _bs_ = estimate_c(_Yboot_, eps, nsamples);

      *** Store location estimates with sample identification ***;
      if i = 1 then bs = (1:nsamples)` || _bs_[(1:nsamples ),];
      else bs = bs // ((1:nsamples)`||_bs_[(1:nsamples),]);
    end;
   
    *** Covariance matrix estimates by sample ***;
    do j = 1 to nsamples;
      bssample  = ( bs[loc(bs[,1] = j),] )[,(2:(p+1))];
      _cov_j_ = 1 / (nboot - 1) * ( bssample - 
                                    J(nboot,1,1)* bssample[:,] )` * 
                    ( bssample - J(nboot,1,1)* bssample[:,] );
      if j = 1 then _cov_ = J(p,1,j) || _cov_j_;
      else _cov_ = _cov_ // ( J(p,1,j) ||_cov_j_ );
    end;
    return(_cov_);
  finish bootstrap;

***********************************************************************;
*** Module ASYMPTOTIC for estimation of accuracy                    ***;
***********************************************************************;
  start asymptotic(_X_, _theta_, eps, nsamples);
    nobs = nrow(_X_);
    p  = ncol(_X_) - 1;
    _V_ = _theta_[(nsamples+1):(nsamples+p),];

    *** Define symmetric matrix A such that V = A*A ***;
    call eigen(evalues, evectors, _V_);
    A =  evectors * sqrt(diag(evalues)) * evectors`; 

    *** Standardized observations (iid) ***;
    do j = 1 to nsamples;
      _Xsample_ = (_X_[loc(_X_[,1] = j),])[,(2:(p+1))];
      if j = 1 then 
        _Z_=(inv(A)*(_Xsample_ - 
                     repeat(_theta_[j,], nrow(_Xsample_),1))`)`;
      else 
        _Z_=_Z_// (inv(A)*(_Xsample_ - 
                           repeat(_theta_[j,], nrow(_Xsample_),1))`)`;
    end;

    *** Mean of inverse lengths ***;
    _r_ = sqrt(_Z_[,##]);
    E = (_r_##(-1))[:];

    *** Asymptotic covariance matrix in the spherical case ***;
    B = p/(p-1)**2 * E**(-2) * I(p);

    *** Re-transformation back to the original scale ***;
    do j = 1 to nsamples;
      _nobs_ = nrow((_X_[loc(_X_[,1] = j),]));
      _cov_j_ = 1/_nobs_ * A * B * A;
      if j = 1 then _cov_ = J(p,1,j) || _cov_j_;
      else _cov_ = _cov_ // ( J(p,1,j) ||_cov_j_ );
    end;	
    return(_cov_);
  finish asymptotic;

***********************************************************************;
*** End of modules - master code executes the modules and prints    ***;
*** the results                                                     ***;
***********************************************************************;

  *** Location and shape matrix estimates ***;
  theta = estimate_c(data, &eps, &nsamples);

  *** Sample differences with respect to sample 1 ***;
  delta = J(1, p, 0);
  do j = 2 to &nsamples;
    delta = delta // (theta[j,] - theta[1,]);
  end;

  mu = theta[(1:&nsamples),];
  V = theta[(&nsamples+1):(&nsamples+p),];

  *** Multivariate multisample spatial sign test ***;
  testres = test_c(data, &eps, &nperm);
  Q2 = testres[1];
  p_as = testres[2];
  p_perm = testres[3];
  se_p = testres[4];

  *** Bootstrap estimates of covariance matrices ***;
  cov_bs = bootstrap(data, theta, &eps, &nboot, &nsamples);

  *** Bootstrap standard error estimates ***;
  se_bs_mu = (sqrt(diag(cov_bs[(1:p),(2:(p+1))])))[+,];
  se_bs_mu1 = se_bs_mu;
  do j = 2 to &nsamples;
    se_bs_mu = se_bs_mu // 
              (sqrt(diag(cov_bs[(p*(j-1)+1):(p*j),(2:(p+1))])))[+,];
  end;
  se_bs_delta = J(1,p,.) // 
               sqrt(se_bs_mu[2:&nsamples,]##2 + 
               repeat(se_bs_mu1,&nsamples-1,1)##2);

  *** Asymptotic covariance matrices ***;
  cov_as = asymptotic(data, theta, &eps, &nsamples);

  *** Asymptotic standard error estimates ***;
  se_as_mu = (sqrt(diag(cov_as[(1:p),(2:(p+1))])))[+,];
  se_as_mu1 = se_as_mu;
  do j = 2 to &nsamples;
    se_as_mu = se_as_mu // 
              (sqrt(diag(cov_as[(p*(j-1)+1):(p*j),(2:(p+1))])))[+,];
  end;
  se_as_delta = J(1,p,.) // 
                sqrt(se_as_mu[2:&nsamples,]##2 + 
                repeat(se_as_mu1,&nsamples-1,1)##2);

  print "Scatter matrix estimate:           "         V,,,
        "Location and shift estimates:      "         mu delta ,,,
        "Location standard error estimates: "         se_bs_mu 
                                                      se_as_mu,,,
        "Shift standard error estimates:    "         se_bs_delta 
                                                      se_as_delta /
        "Large sample covariance matrix estimate: "   cov_as,,, 
        "Bootstrap estimate of covariance matrix: "   cov_bs,,,
        "Value of the test statistic:               " Q2,, 
        "p-value (large sample approximation):      " p_as ,,  
        "p-value (conditionally distribution-free): " p_perm se_p 
                                        "(" &nperm "permutations)"; 
quit;

%mend sgnmanova_c;