############################################################## # This file contains the R functions for the paper: # # "Two-Sample Tests for High Dimensional Means with Thresholding # and Data Transformation" # by Song Xi Chen, Jun Li and Ping-Shou Zhong # # Questions and comments, please contact # csx@gsm.pku.edu.cn, junli@math.kent.edu, pszhong@stt.msu.edu ########################################################### ########################################################## # function to obtain parametric bootstrap copy of # Maximum thresholding test statistic # input: p--dimension of random vector # mat--covariance matrix of random vector # gam--related with mat by relationship gam=mat^{1/2} # size1(2)--sample size of random sample 1(2) # output: bootstrap copy of Maximum thresholding statistic # based on generated samples: sam1 and sam2 # inside the code: # random sample 1 (2) with size=size1 (size2) and dimension=p # is generated by the multivariate model: # X_{ij}=\Sigma_i^{1/2}Z_{ij}+\mu_i # where Z_{ij} are iid p dimensional random vectors such that # E(Z_{ij})=0, Var(Z_{ij})=I_p (p by p identity matrix) ################################################################ resampling_method<-function(p,mat,gam,size1,size2){ mu1<-rep(0,p);mu2<-rep(0,p);eta<-0.05 Z_mat1<-matrix(rnorm(size1*p,0, 1),ncol=p) Z_mat2<-matrix(rnorm(size2*p,0, 1),ncol=p) sam1<-(Z_mat1)%*% t(gam)+matrix(mu1,ncol=p,nrow=size1,byrow=T) sam2<-(Z_mat2)%*% t(gam)+matrix(mu2,ncol=p,nrow=size2,byrow=T) test_stat<-max_threshold(sam1,sam2,mat,size1,size2) return(test_stat) } ######################################################## # function to obtain Maximum thresholding test statistic # defined by eqn (3.2) without data tranformation or # eqn (4.9) with data transformation in the paper # input: sample1(2)--two random samples # sig_mat--either covariance matrix or inverse # covariance matrix depending on # if test statistic is (3.2) or (4.9) # n1(n2)--sample size # output: Maximum thresholding statistic (3.2) or (4.9) ######################################################## max_threshold<-function(sample1,sample2,sig_mat,n1,n2){ a_f<-sqrt(2*log(log(p))) b_f<-2*log(log(p))+0.5*log(log(log(p)))-0.5*log(4*3.1416/(1-0.05)^2) eta<-0.05 T_orig<-(colMeans(sample1)-colMeans(sample2))^2/((1/n1+1/n2)*diag(sig_mat)) s_level<-T_orig[sign(T_orig)>=0.01 & T_orig<=2*(1-eta)*log(p)] s_m<-matrix(s_level,length(s_level),p,byrow=F) T_m<-matrix((T_orig-1),length(s_level),p,byrow=T) T_m[sign(T_m+1-s_m)==-1]<-0 thr<-rowSums(T_m) mean_thr<-2*sqrt(s_level)*dnorm(sqrt(s_level))*p sd_thr<-sqrt(p*(2*((sqrt(s_level))^3+sqrt(s_level))*dnorm(sqrt(s_level))+4-4*pnorm(sqrt(s_level)))-mean_thr^2/p) max_threshold<-max((thr-mean_thr)/sd_thr)*a_f-b_f return(max_threshold) } ############################################################## # function to obtain the inverse of covariance matrix # input: sample1(2)--two random samples # band--banding width parameter chosen in the cholesky # decomposition to estimate Omega defined by (4.6) # in the paper # output: cholesky decomposition estimate of Omega ############################################################## chole<-function(sample1,sample2,band){ n1<-dim(sample1)[1];n2<-dim(sample2)[1] sample_cen<-sample1-(n1/n2)^0.5*sample2[1:n1,]+(1/(n1*n2)^0.5)*colSums(sample2[1:n1,])-(1/n2)*colSums(sample2) d<-NULL; d[1]<-1/(n1)*sum((sample_cen[,1])^2) a<-matrix(0,ncol=p,nrow=p) #estimate a_i and d_i^2 using least square regression for(j in 2:p){ X_mat<-sample_cen[,(max(j-band,1)):(j-1)] beta_hat<-solve(t(X_mat)%*%X_mat)%*%t(X_mat)%*%sample_cen[,j] a[j,(max(j-band,1)):(j-1)]<-beta_hat d[j]<-1/(n1)*sum((sample_cen[,j]-sample_cen%*%a[j,])^2) } ma<-t(diag(1,p)-a)%*%solve(diag(d,p))%*%(diag(1,p)-a) inv_hat<-((n1+n2)/n2)*ma return(inv_hat) } ############################################################## # function to obtain covariance matrix which is called by the # function banding_choice to estimate the optimal banding # parameter # input: sa--random sample # band--banding parameter # output: cholesky decomposition estimating the covariance matrix ############################################################## chole_sigma<-function(sa,band){ n<-dim(sa)[1];p<-dim(sa)[2] d<-NULL; d[1]<-var(sa[,1]) a<-matrix(0,ncol=p,nrow=p) #estimate a_i and d_i^2 using least square regression for(j in 2:p){ if(band>=1){ X_mat<-sa[,(max(j-band,1)):(j-1)] beta_hat<-qr.solve(X_mat, sa[,j], tol=1e-200) a[j,(max(j-band,1)):(j-1)]<-beta_hat d[j]<-var(sa[,j]-sa%*%a[j,]) }else{ d[j]<-var(sa[,j]) } } mat_hat<-solve(diag(1,p)-a)%*%(diag(d,p))%*%solve(t(diag(1,p)-a)) return(mat_hat) } ############################################################### # function to find optimal banding parameter by minimizing (5.1) # in the paper # input: x--random sample # k.vec--initial choice of banding parameter # output: optimal banding parameter defined by (5.1) ############################################################### banding_choice<-function(x, k.vec=NULL){ n<-dim(x)[1]; p<-dim(x)[2] n.tr<-round(n*2/3);n.va<-n-n.tr if(is.null(k.vec)) { k.vec<-0:min(c(n.tr-2, p-1)) } n.split<-50 disc<-array(0, c(length(k.vec), n.split)) for(i in 1:n.split){ id<-sample(n); id1<-id[1:n.tr]; id2<-id[(n.tr+1):n] x.tr<-scale( x[id1,,drop=FALSE], scale=FALSE) s.va<-cov(x[id2,,drop=FALSE])*(n.va-1)/n.va for(j in 1:length(k.vec)) { sigma_hat<-chole_sigma(x.tr,k.vec[j]) disc[j,i]<-sum((sigma_hat-s.va)^2) } } risk<-apply(disc, 1, sum) optim.k<-k.vec[which.min(risk)] return(optim.k) } ################################################################ # one example to obtain empirical sizes or powers of maximum # thresholding test ################################################################ library(mvtnorm) p<-200 # the dimension of multivariate random vector k<-1 # optimal banding parameter n1<-30; n2<-40 # two sample sizes ##################################################### # two covariance matrices following AR(1) model rho1<-0.6; rho2<-0.6 times<-1:p; H<- abs(outer(times, times, "-")) sigma1<-rho1^H; sigma2<-rho2^H ####################################################### # inverse of linear combination of two covariance matrices # used to transform data to enhance signal strength sigma_inv<-solve((n2/(n1+n2))*sigma1+(n1/(n1+n2))*sigma2) ######################################################## s_beta<-0.4 # sparsity parameter used to control the number of # signals mm<-round(p^(1-s_beta)) # number of nonzero signals mu1<-rep(0,p) # without loss of generality, assume mu1=0 post<-sort(sample(1:p,mm,replace=F))# randomly select the location # of non-zero coordiantes of mu2 r<-0 # signal strength of mu2 # when r=0, the simulation study is for empirical size # otherwise, it is for empirical power mu2<-rep(0,p) mu2[post]<-sqrt(2*r*log(p)*(1/n1+1/n2)) ################################################################ test_thr1<-NULL test_thr2<-NULL p_thr1<-p_thr2<-NULL gam<-eigen(sigma1,symmetric=T)$vectors%*%diag(sqrt(eigen(sigma1,symmetric=T)$values),p) n<-100 # the number of the simulations for (i in 1:n){ ####################################################### # generate samples by the factor model ####################################################### Z_mat1<-matrix(rnorm(n1*p,0, 1),ncol=p) Z_mat2<-matrix(rnorm(n2*p,0, 1),ncol=p) sam1<-(Z_mat1)%*% t(gam)+matrix(mu1,ncol=p,nrow=n1,byrow=T) sam2<-(Z_mat2)%*% t(gam)+matrix(mu2,ncol=p,nrow=n2,byrow=T) ####################################################### sigma_inv_est<-chole(sam1,sam2,k)# estimate Omega by cholesky gam_inv_est<-eigen(sigma_inv_est,symmetric=T)$vectors%*%diag(sqrt(eigen(sigma_inv_est,symmetric=T)$values),p) sam1_tran_est<-sam1%*%sigma_inv_est # transform data by Omega sam2_tran_est<-sam2%*%sigma_inv_est sigma_e<-solve(sigma_inv_est) gam_e<-eigen(sigma_e,symmetric=T)$vectors%*%diag(sqrt(eigen(sigma_e,symmetric=T)$values),p) ############################################################## # bootstrap copies of maximum thresholding test statistics without # and with data transformation ############################################################## iter<-100 stat.thr1<-stat.thr2<-NULL for(j in 1:iter){ stat.thr1[j]<-resampling_method(p,sigma_e,gam_e,100,100) stat.thr2[j]<-resampling_method(p,sigma_inv_est,gam_inv_est,100,100) } ############################################################### # obtain p-values of maximum thresholding tests # ############################################################# test_thr1<-max_threshold(sam1,sam2,sigma1,n1,n2) test_thr2<-max_threshold(sam1_tran_est,sam2_tran_est,sigma_inv_est,n1,n2) p_thr1[i]<-length(which(sort(stat.thr1)>=test_thr1))/iter p_thr2[i]<-length(which(sort(stat.thr2)>=test_thr2))/iter } ############################################################# # power_thres1: empirical size or power of maximum thresholding # test without data transformation # power_thres2: empirical size or power of maximum thresholding # test with data transformation ############################################################# power_thres1<-length(which(p_thr1<=0.05))/n power_thres2<-length(which(p_thr2<=0.05))/n power<-c(power_thres1,power_thres2)