/*********************************************************************** SGNMANOVA_C.SAS Date: 16 May 2006 Written by: Jaakko Nevalainen Written for: SAS version 8.2 on Windows XP Description: Macro performs a spatial sign MANOVA in a c-sample case for p-dimensional data. Input dataset: N x (p+1) data set. First column of the data set is assumed to be the sample identifier coded sequentially from 1 to c. There should be no missing values nor extra variables. The number of observations should be greater than the number of variables. Parameters: dataset - name of the input data set nsamples - number of samples eps - precision for estimates (default is 1E-6) nperm - number of permutations for the permutation test (1000) nboot - number of bootstrap samples (500) Output: Shape matrix estimate - V (p x p matrix) Estimates for location - mu (c x p matrix of row vectors) Standard error of mus by bootstrapping - se_bs_mu (c x p) and by large sample approximation - se_as_mu (c x p) Covariance matrices of mus by bootstrapping - cov_bs (cp x (p+1)) and by large sample approximation - cov_as (cp x (p+1)). First column identifies the sample. Estimates for differences - delta (c x p matrix of row vectors) Standard error of deltas by bootstrapping - se_bs_delta (c x p) and by large sample approximation - se_as_delta (c x p) Value of the test statistic - Q2 p-value based on limiting chi2-distribution - p_as p-value based on permutation distribution - p_perm Standard error of p_perm - se_p Example calls: %include "c:\my sas files\sgnmanova_c.sas"; %sgnmanova_c(mydata, 3); %sgnmanova_c(mydata, 3, eps=1E-9, nperm=10000, nboot=1000); ***********************************************************************/ %macro sgnmanova_c (dataset, nsamples, eps=1E-6, nperm=1000, nboot=500); proc iml; use &dataset; read all into data; p = ncol(data) - 1; ***********************************************************************; *** Module ESTIMATE_C for estimation of V, mus and deltas ***; ***********************************************************************; start estimate_c( _X_, eps, nsamples); p = ncol(_X_) - 1; nobs = nrow (_X_); *** Starting values ***; *** Vectors of marginal medians and individual sample sizes ***; do j = 1 to nsamples; *** jth sample ***; _Xsample_ = ( _X_[loc(_X_[,1] = j),] ) [, (2:(p+1))]; if j = 1 then do; _N_ = nrow(_Xsample_); mu = median(_Xsample_); end; else do; _N_ = _N_ // nrow(_Xsample_); mu = mu // median(_Xsample_); end; end; *** p x p identity matrix ***; V = I(p); stop = 0; do until ( stop = 1 ); *** Stack mus to a nobs x p matrix by their sample sizes ***; do j = 1 to nsamples; if j = 1 then _mu_ = J(_N_[1],1,1) * mu[1,]; else _mu_ = _mu_ // (J(_N_[j],1,1) * mu[j,]); end; *** Define symmetric matrix A such that V = A*A ***; call eigen(evalues, evectors, V); A = evectors * sqrt(diag(evalues)) * evectors`; *** Standardized observations ***; Z = (inv(A) * ( _X_[,2:p+1] - _mu_ )`)`; *** Radius r = ||z|| ***; r = sqrt(Z[,##]); *** Direction vectors u = z / ||z|| ***; if r[1] > 0 then U = Z[1,] / r[1]; if r[1] < 1E-9 then U = 0 * J(1,p); do i = 2 to nobs; if i > 1 & r[i] > 0 then U = U // ( Z[i,] / r[i]); if i > 1 & r[i] < 1E-9 then U = U // 0 * J(1,p); end; *** Iteration step for V ***; V0 = V; V = A * ( U` * U ) * A; *** Scale V such that tr(V) = p ***; V = p / trace(V) * V; *** Iteration step for mu (c x p matrix)***; mu0 = mu; ndone = 0; do j = 1 to nsamples; * distances of the jth sample *; _r_ = r[(ndone+1):(ndone+_N_[j])]; * directions of the jth sample *; _U_ = U[(ndone+1):(ndone+_N_[j]),]; *** Protection against stopping iteration at a data point ***; if min(_r_) < 1E-9 then do; k = ncol(loc(_r_ < 1E-9)); _r_[loc(_r_ < 1E-9)] = .; *** Vardi & Zhang weight ***; _Usum_ = _U_[+,]; _w_ = k / sqrt( _Usum_ * ( _Usum_ )`); end; else _w_ = 0; *** Iteration step for the first sample mu ***; if j = 1 then mu = max(0,1-_w_) * ( mu0[1,] + ( 1 / ( (1/_r_)[:] )* A * _U_[:,]` )` ) + min(1,_w_) * mu0[1,]; *** Iteration step for the jth sample mu ***; else mu = mu // max(0,1-_w_) * ( mu0[j,] + ( 1 / ( (1/_r_)[:] )* A * _U_[:,]` )` ) + min(1, _w_) * mu0[1,]; ndone = ndone + _N_[j]; end; *** Stopping criteria w.r.t. mu and V***; if trace( ( mu - mu0 )` * ( mu - mu0 ) ) < eps & trace( ( V - V0 )` * ( V - V0 ) ) < eps then stop = 1; end; return(mu // V); finish estimate_c; ***********************************************************************; *** Module ESTIMATE_1 for estimation of V, joint mu ***; ***********************************************************************; start estimate_1( _X_, eps); nobs = nrow(_X_); p = ncol(_X_); *** Starting values ***; *** Vector of marginal medians ***; mu = median(_X_); *** p x p identity matrix; V = I(p); stop = 0; do until ( stop = 1 ); *** Define symmetric matrix A such that V = A*A ***; call eigen(evalues, evectors, V); A = evectors * sqrt(diag(evalues)) * evectors`; *** Standardized observations ***; Z = (inv(A) * ( _X_ - J(nobs,1,1) * mu )`)`; *** Radius r = ||z|| ***; r = sqrt(Z[,##]); *** Direction vectors u = z / ||z|| ***; if r[1] > 0 then U = Z[1,] / r[1]; if r[1] < 1E-9 then U = 0 * J(1,p); do i = 2 to nobs; if i > 1 & r[i] > 0 then U = U // ( Z[i,] / r[i]); if i > 1 & r[i] < 1E-9 then U = U // 0 * J(1,p); end; *** Iteration step for V ***; V0 = V; V = A * ( U` * U ) * A; *** Scale V such that tr(V) = p ***; V = p / trace(V) * V; *** Protection against stopping iteration at a data point ***; if min(r) < 1E-9 then do; k = ncol(loc(r < 1E-9)); r[loc(r < 1E-9)] = .; w = k / sqrt( U[+,] * ( U[+,] )`); *** Vardi & Zhang weight ***; end; else w = 0; *** Iteration step for mu ***; mu0 = mu; mu = max(0, 1-w) * ( mu + ( 1 / ( (1/r)[:] ) * A * U[:,]` )` ) + min(1, w) * mu; *** Stopping criteria w.r.t. mu and V***; if trace( ( mu - mu0 )` * ( mu - mu0 ) ) < eps & trace( ( V - V0 )` * ( V - V0 ) ) < eps then stop = 1; end; return(mu // V); finish estimate_1; ***********************************************************************; *** Module TEST for estimation of mu, V under the null hypothesis, ***; *** and for the calculation of the test statistic. ***; ***********************************************************************; start test_c( _X_, eps, nperm); p = ncol(_X_) - 1; nobs = nrow(_X_); *** Null case mu and V ***; theta0 = estimate_1(_X_[,2:p+1], eps); _V_ = theta0[2:(p+1),]; _mu_ = theta0[1,]; *** Define symmetric matrix A such that V = A*A ***; call eigen(evalues, evectors, _V_); A = evectors * sqrt(diag(evalues)) * evectors`; *** Null case standardized observations ***; _Z_ = (( inv(A) * ( _X_[,2:(p+1)] - J(nobs,1,1)*_mu_ )` )`); *** Null case length ||z|| ***; _r_ = sqrt(_Z_[,##]); *** Null case direction z / ||z|| ***; if _r_[1] > 0 then _U_ = _Z_[1,] / _r_[1]; if _r_[1] < 1E-9 then _U_ = 0 * J(1,p); do i = 2 to nobs; if i > 1 & _r_[i] > 0 then _U_ = _U_ // ( _Z_[i,] / _r_[i]); if i > 1 & _r_[i] < 1E-9 then _U_ = _U_ // 0 * J(1,p); end; *** Add sample identification ***; _U_ = _X_[,1]||_U_; *** p-value based on the limiting distribution ***; do j = 1 to &nsamples; *** jth sample ***; _Usample_ = ( _U_[loc(_U_[,1] = j),] ) [, 2:(p+1)]; if j = 1 then Qj = nrow(_Usample_) * (_Usample_[:,])*(_Usample_[:,])`; else Qj = Qj // (nrow(_Usample_) * ((_Usample_[:,])*(_Usample_[:,])`)); end; Q2 = p * Qj[+]; p_as = 1 - probchi(Q2, (p * (&nsamples - 1))); *** p-value based on permutations of the standardized signs ***; do i = 1 to nperm; *** Permute the elements of the sample assignment vector ***; *** First column = sample identificator ***; _U1_ = _U_[,1]; do until ( nrow(_U1_) = 1 ); *** kth observation selected randomly***; k = int(ranuni(8371231)*nrow(_U1_)) + 1; *** gammavec is the new (random) sample assignment vector ***; if nrow(_U1_) = nobs then gammavec = _U1_[k]; else gammavec = gammavec // _U1_[k]; if k = 1 then do; if nrow(_U1_ > 1) then _U1_ = _U1_[2:(nrow(_U1_))]; else goto finalize; end; else if ( k < nrow(_U1_)) then _U1_ = _U1_[1:(k-1)]//_U1_[(k+1):(nrow(_U1_))]; else if k = nrow(_U1_) then _U1_ = _U1_[1:(nrow(_U1_)-1)]; end; finalize: gammavec = gammavec // _U1_; _U_ = gammavec||_U_[,2:(p+1)]; do j = 1 to &nsamples; *** jth sample ***; _Usample_ = ( _U_[loc(_U_[,1] = j),] ) [, 2:(p+1)]; if j = 1 then Qj = nrow(_Usample_) * ((_Usample_[:,])*(_Usample_[:,])`); else Qj = Qj // (nrow(_Usample_) * ((_Usample_[:,])*(_Usample_[:,])`)); end; Q2_p = p * Qj[+]; *** Store values of the test statistic***; if i = 1 then Q2_dist = Q2_p; else Q2_dist = Q2_dist // Q2_p; end; *** Estimated p-value based on permutated distribution of Q2 ***; p_perm = 1 - (( J(nperm, 1, 1) * Q2 ) >= Q2_dist)[:]; *** Its precision from binomial distribution ***; se_p = sqrt(p_perm * ( 1 - p_perm ) / nperm); return(Q2||p_as||p_perm||se_p); finish test_c; ***********************************************************************; *** Module BOOTSTRAP for estimation of accuracy ***; ***********************************************************************; start bootstrap(_X_, _theta_, eps, nboot, nsamples); p = ncol(_X_) - 1; _V_ = _theta_[(nsamples+1):(nsamples+p),]; *** Define symmetric matrix A such that V = A*A ***; call eigen(evalues, evectors, _V_); A = evectors * sqrt(diag(evalues)) * evectors`; *** Standardized observations (iid) ***; do j = 1 to nsamples; _Xsample_ = (_X_[loc(_X_[,1] = j),])[,(2:(p+1))]; if j = 1 then _Z_=(inv(A)*(_Xsample_ - repeat(_theta_[j,], nrow(_Xsample_),1))`)`; else _Z_=_Z_//(inv(A)*(_Xsample_ - repeat(_theta_[j,], nrow(_Xsample_),1))`)`; end; *** Generate bootstrap samples and the corresponding estimates ***; do i = 1 to nboot; _nobs_ = nrow(_Z_); *** Generate a bootstrap sample ***; do k = 1 to _nobs_; obs = int(_nobs_ * ranuni(9388427)) + 1; if k = 1 then _Zboot_ = _Z_[obs,]; else _Zboot_ = _Zboot_//_Z_[obs,]; end; *** Add a dummy sample identification ***; _Zboot_ = _X_[,1] || _Zboot_; *** Return to original measurement scale by sample ***; do j = 1 to nsamples; *** jth sample ***; _Zsample_ = ( _Zboot_[loc(_Zboot_[,1] = j),] ); _nobs_ = nrow(_Zsample_); _Ysample_ = _Zsample_[,1] || ( repeat(_theta_[j,], _nobs_, 1) + _Zsample_[,2:(p+1)] * A ); if j = 1 then _Yboot_ = _Ysample_; else _Yboot_ = _Yboot_ // _Ysample_; end; *** Estimate parameters ***; _bs_ = estimate_c(_Yboot_, eps, nsamples); *** Store location estimates with sample identification ***; if i = 1 then bs = (1:nsamples)` || _bs_[(1:nsamples ),]; else bs = bs // ((1:nsamples)`||_bs_[(1:nsamples),]); end; *** Covariance matrix estimates by sample ***; do j = 1 to nsamples; bssample = ( bs[loc(bs[,1] = j),] )[,(2:(p+1))]; _cov_j_ = 1 / (nboot - 1) * ( bssample - J(nboot,1,1)* bssample[:,] )` * ( bssample - J(nboot,1,1)* bssample[:,] ); if j = 1 then _cov_ = J(p,1,j) || _cov_j_; else _cov_ = _cov_ // ( J(p,1,j) ||_cov_j_ ); end; return(_cov_); finish bootstrap; ***********************************************************************; *** Module ASYMPTOTIC for estimation of accuracy ***; ***********************************************************************; start asymptotic(_X_, _theta_, eps, nsamples); nobs = nrow(_X_); p = ncol(_X_) - 1; _V_ = _theta_[(nsamples+1):(nsamples+p),]; *** Define symmetric matrix A such that V = A*A ***; call eigen(evalues, evectors, _V_); A = evectors * sqrt(diag(evalues)) * evectors`; *** Standardized observations (iid) ***; do j = 1 to nsamples; _Xsample_ = (_X_[loc(_X_[,1] = j),])[,(2:(p+1))]; if j = 1 then _Z_=(inv(A)*(_Xsample_ - repeat(_theta_[j,], nrow(_Xsample_),1))`)`; else _Z_=_Z_// (inv(A)*(_Xsample_ - repeat(_theta_[j,], nrow(_Xsample_),1))`)`; end; *** Mean of inverse lengths ***; _r_ = sqrt(_Z_[,##]); E = (_r_##(-1))[:]; *** Asymptotic covariance matrix in the spherical case ***; B = p/(p-1)**2 * E**(-2) * I(p); *** Re-transformation back to the original scale ***; do j = 1 to nsamples; _nobs_ = nrow((_X_[loc(_X_[,1] = j),])); _cov_j_ = 1/_nobs_ * A * B * A; if j = 1 then _cov_ = J(p,1,j) || _cov_j_; else _cov_ = _cov_ // ( J(p,1,j) ||_cov_j_ ); end; return(_cov_); finish asymptotic; ***********************************************************************; *** End of modules - master code executes the modules and prints ***; *** the results ***; ***********************************************************************; *** Location and shape matrix estimates ***; theta = estimate_c(data, &eps, &nsamples); *** Sample differences with respect to sample 1 ***; delta = J(1, p, 0); do j = 2 to &nsamples; delta = delta // (theta[j,] - theta[1,]); end; mu = theta[(1:&nsamples),]; V = theta[(&nsamples+1):(&nsamples+p),]; *** Multivariate multisample spatial sign test ***; testres = test_c(data, &eps, &nperm); Q2 = testres[1]; p_as = testres[2]; p_perm = testres[3]; se_p = testres[4]; *** Bootstrap estimates of covariance matrices ***; cov_bs = bootstrap(data, theta, &eps, &nboot, &nsamples); *** Bootstrap standard error estimates ***; se_bs_mu = (sqrt(diag(cov_bs[(1:p),(2:(p+1))])))[+,]; se_bs_mu1 = se_bs_mu; do j = 2 to &nsamples; se_bs_mu = se_bs_mu // (sqrt(diag(cov_bs[(p*(j-1)+1):(p*j),(2:(p+1))])))[+,]; end; se_bs_delta = J(1,p,.) // sqrt(se_bs_mu[2:&nsamples,]##2 + repeat(se_bs_mu1,&nsamples-1,1)##2); *** Asymptotic covariance matrices ***; cov_as = asymptotic(data, theta, &eps, &nsamples); *** Asymptotic standard error estimates ***; se_as_mu = (sqrt(diag(cov_as[(1:p),(2:(p+1))])))[+,]; se_as_mu1 = se_as_mu; do j = 2 to &nsamples; se_as_mu = se_as_mu // (sqrt(diag(cov_as[(p*(j-1)+1):(p*j),(2:(p+1))])))[+,]; end; se_as_delta = J(1,p,.) // sqrt(se_as_mu[2:&nsamples,]##2 + repeat(se_as_mu1,&nsamples-1,1)##2); print "Scatter matrix estimate: " V,,, "Location and shift estimates: " mu delta ,,, "Location standard error estimates: " se_bs_mu se_as_mu,,, "Shift standard error estimates: " se_bs_delta se_as_delta / "Large sample covariance matrix estimate: " cov_as,,, "Bootstrap estimate of covariance matrix: " cov_bs,,, "Value of the test statistic: " Q2,, "p-value (large sample approximation): " p_as ,, "p-value (conditionally distribution-free): " p_perm se_p "(" &nperm "permutations)"; quit; %mend sgnmanova_c;