################################################################
# the proposed method to identify non-null dimensions
# input:
# (1) sam1; sam2: two data matrices from which we identify the
#     dimensions where the two population mean vectors differ;
#     each data matrix must be an ni by p matrix:
#     rows index observations, columns index variables
# (2) omega: a p by p matrix used to transform the original data;
#     omega can be estimated by Cholesky decomposition if it is
#     bandable, or by glasso if it is sparse
# (3) s1; s2: two threshold levels;
#     s1 is used in the thresholding step, chosen from (0,beta);
#     s2 is used in estimating tuning parameters in the excising
#     step, chosen from (beta,min(diag(omega))*r)
# (4) alpha: nominal significance level for FDR
# (5) n: an integer that is a function of the sample sizes,
#     e.g. n=n1*n2/(n1+n2) when identifying dimensions where the
#     two population mean vectors differ
# output: the estimated difference between the two mean vectors
################################################################
DATE<-function(sam1,sam2,omega,s1,s2,alpha,n){
  p<-dim(sam1)[2]
  lambda1<-2*s1*log(p) # threshold level in function "thresholding"
  lambda2<-2*s2*log(p) # threshold used to estimate tuning parameters
  #####################################################
  # step 1: transform the original data
  #####################################################
  omega.reg<-omega
  omega.reg[abs(omega.reg)<(log(p)/n)]<-0 # regularizing
  sam1_t<-sam1%*%omega.reg
  sam2_t<-sam2%*%omega.reg
  #####################################################
  # step 2: threshold all dimensions
  #####################################################
  date.thr<-thresholding(sam1_t,sam2_t,omega.reg,lambda1,n)
  ############################################################
  # step 3: excise fake signals from all coordinates that
  # survived "thresholding"
  ############################################################
  tuning<-est_tuning(sam1,sam2,omega.reg,lambda2,alpha,n)
  date<-signal_recovery(date.thr,sam1_t,sam2_t,omega.reg,tuning[1],tuning[2],n)
  return(date)
}
###############################################################
# function "thresholding" used to remove null dimensions
# input:
# (1) sam1; sam2: transformed samples
# (2) mat: matrix used to transform the original data
# (3) t: threshold level
# (4) n: an integer that is a function of the sample sizes
# output: "survival", a vector whose component equals 1 if the
#   dimension survived thresholding and 0 otherwise
###############################################################
thresholding<-function(sam1,sam2,mat,t,n){
  p<-dim(sam1)[2] # dimension of data
  stat<-n*(colMeans(sam2)-colMeans(sam1))^2/diag(mat)
  survival<-rep(0,p)
  survival[stat>=t]<-1 # 0: screened out; 1: survived
  return(survival)
}
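###############################################################
# a minimal illustration (not part of the procedure) of
# "thresholding" on toy data with identity covariance; the
# toy.* names, sizes and threshold are arbitrary choices, and
# the lines are commented out so sourcing this file is unaffected
###############################################################
# toy.p<-10; toy.n1<-50; toy.n2<-50; toy.n<-toy.n1*toy.n2/(toy.n1+toy.n2)
# toy.sam1<-matrix(rnorm(toy.n1*toy.p),ncol=toy.p)
# toy.sam2<-matrix(rnorm(toy.n2*toy.p),ncol=toy.p)
# toy.sam2[,1]<-toy.sam2[,1]+1 # plant one true mean difference
# thresholding(toy.sam1,toy.sam2,diag(1,toy.p),2*0.3*log(toy.p),toy.n)
# # dimension 1 should survive; some null dimensions may also
# # survive, which is what the later excising step removes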
###############################################################
# function "signal_recovery" used to clean coordinates that
# survived function "thresholding".
# Within this function, another function, PMLE, is called to
# remove fake signals.
# First allocate all survivors of function "thresholding" into
# self-connected graphs guided by the regularized transformation
# matrix; then, for each self-connected graph, use function
# PMLE to remove fake signals
# input:
# (1) sv: vector that is the output of function "thresholding"
# (2) sam1; sam2: transformed samples
# (3) mat.reg: regularized transformation matrix
# (4) t1; t2: two tuning parameters used in the penalized likelihood
# (5) n: an integer that is a function of the sample sizes
# output: signal.date, the signals recovered by the proposed method
#############################################################
signal_recovery<-function(sv,sam1,sam2,mat.reg,t1,t2,n){
  p<-dim(sam1)[2]
  loc_nonzero<-which(sv==1)
  mat.new<-mat.reg[loc_nonzero,loc_nonzero]
  s1<-colMeans(sam1)[loc_nonzero]; s2<-colMeans(sam2)[loc_nonzero]
  num_nonzero<-length(loc_nonzero)
  cleaned.sv<-rep(0,num_nonzero)
  all.com<-rep(1,num_nonzero) # 1: not yet assigned to a cluster
  while(sum(all.com)>0){
    k<-which(all.com==1)[1]
    # grow the self-connected cluster containing coordinate k
    size1<-k
    size2<-which(abs(mat.new[,k])!=0)
    while(!identical(size1,size2)){
      size1<-size2
      index<-rowSums(abs(mat.new[,size1,drop=FALSE]))
      size2<-which(index!=0)
    }
    if(length(size1)>20){
      stop("computation intensive because the cluster is too big")
    }
    # clean fake signals from each self-connected cluster
    cleaned.sv[size1]<-PMLE(mat.new[size1,size1],s1[size1],s2[size1],t1,t2,n)
    all.com[size1]<-0
  }
  signal.date<-rep(0,p)
  signal.date[loc_nonzero]<-cleaned.sv
  return(signal.date)
}
############################################################
# function "PMLE" used to remove fake signals by minimizing a
# penalized likelihood function; it enumerates all sign
# patterns in {-1,0,1}^m for a cluster of size m
# output: the cleaned coordinates of each cluster
#############################################################
PMLE<-function(mat,sam.m1,sam.m2,t1,t2,n){
  num_sig<-length(sam.m1)
  value<-c(0,rep(0,num_sig)) # dummy first column, dropped below
  for(k in 0:(3^num_sig-1)){
    mu_ups<-VectorizeBase(k,3,num_sig)-1 # sign pattern in {-1,0,1}^m
    mu_mag<-mu_ups*t2 # candidate nonzero mu's
    numbs<-sum(abs(mu_ups))
    resid<-(sam.m2-sam.m1)-mat%*%mu_mag
    value<-cbind(value,c(n*t(resid)%*%solve(mat)%*%resid+t1*numbs,mu_ups))
  }
  value<-value[,-1] # drop the dummy first column
  # take the first minimizer in case of ties
  cleaned<-value[2:(num_sig+1),which.min(value[1,])]
  return(cleaned)
}
##################################################################
# express the number i in the given base as a vector of digits
##################################################################
VectorizeBase<-function(i,base,length){
  vector<-rep(0,length)
  for(j in length:1){
    vector[j]<-i-base*floor(i/base)
    i<-(i-vector[j])/base
  }
  return(vector)
}
###############################################################
# function to estimate the tuning parameters
# sam1; sam2: original samples
# mat: estimated precision matrix
# t: threshold
# alpha: the nominal level of FDR
# n: an integer that is a function of the sample sizes
###############################################################
est_tuning<-function(sam1,sam2,mat,t,alpha,n){
  p<-dim(sam1)[2]
  stat<-n*(colMeans(sam2%*%mat)-colMeans(sam1%*%mat))^2/diag(mat)
  survival<-rep(0,p)
  survival[stat>=t]<-1
  beta_hat<-(-1)*log(sum(survival)/p)/log(p)
  r_hat<-(sum(((stat-1)/diag(mat))*survival))/(2*p^(1-beta_hat)*log(p))
  a_min<-min(diag(mat))
  Lamb<-(sqrt(a_min*r_hat)-sqrt(beta_hat))^2
  Eta.f<-function(x){
    value<-(1-pnorm(((beta_hat-a_min*r_hat-Lamb)*log(p)-x/2)/(sqrt(2*a_min*r_hat*log(p)))))/
      (2-2*pnorm(((beta_hat+a_min*r_hat-Lamb)*log(p)-x/2)/(sqrt(2*a_min*r_hat*log(p)))))-
      (1/alpha-1)*(p-p^(1-beta_hat))/p^(1-beta_hat)
    return(value)
  }
  Eta<-uniroot(Eta.f,c(-10,0),tol=0.01)$root
  tuning1<-2*(beta_hat-Lamb)*log(p)-Eta
  tuning2<-sqrt(2*r_hat*log(p)/n)
  tuning<-c(tuning1,tuning2)
  return(tuning)
}
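##################################################################
# quick sanity check (illustrative only, commented out):
# VectorizeBase writes i in the given base, so PMLE's loop over
# k=0,...,3^m-1 visits every sign pattern in {-1,0,1}^m once
##################################################################
# VectorizeBase(5,3,3)   # c(0,1,2), since 5 = 0*9 + 1*3 + 2*1
# VectorizeBase(5,3,3)-1 # the corresponding sign pattern c(-1,0,1)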
##############################################################
##############################################################
##############################################################
# one example to check the mFDR control of the DATE procedure
# p: dimension; n1 and n2: sample sizes
# beta: sparsity parameter controlling the number of differences
# r: magnitude of the difference between the two mean vectors
# s1; s2: two threshold levels;
#   s1 used in the thresholding step, chosen from (0,beta);
#   s2 used in estimating tuning parameters in the excising
#   step, chosen from (beta,min(diag(omega))*r)
# alpha: nominal significance level for FDR
##############################################################
# the following functions are used in the example
##############################################################
# function to obtain the inverse of the covariance matrix
##############################################################
chole<-function(sample,band,dim,s_size){
  sample_cen<-sample-matrix(colMeans(sample),ncol=dim,nrow=s_size,byrow=T)
  d<-NULL; d[1]<-var(sample_cen[,1])
  a<-matrix(0,ncol=dim,nrow=dim)
  # estimate a_j and d_j^2 using least-squares regression
  for(j in 2:dim){
    X_mat<-sample_cen[,(max(j-band,1)):(j-1)]
    beta_hat<-solve(t(X_mat)%*%X_mat)%*%t(X_mat)%*%sample_cen[,j]
    a[j,(max(j-band,1)):(j-1)]<-beta_hat
    d[j]<-1/(s_size)*sum((sample_cen[,j]-sample_cen%*%a[j,])^2)
  }
  inv_hat<-t(diag(1,dim)-a)%*%solve(diag(d,dim))%*%(diag(1,dim)-a)
  return(inv_hat)
}
##############################################################
# function to count false positives, false negatives and
# true positives
#############################################################
FD_FN<-function(est.sig,true.sig){
  if(length(which(est.sig==1))==0){
    FD<-0; TP<-0
  }else{
    FD<-length(which(est.sig==1 & true.sig==0))
    TP<-length(which(est.sig==1 & true.sig==1))
  }
  if(length(which(est.sig==0))==0){
    FN<-0
  }else{
    FN<-length(which(est.sig==0 & true.sig==1))
  }
  two.error<-c(FD,FN,TP)
  return(two.error)
}
###############################################################
# function to obtain the covariance matrix
# "1": AR(1) model; "2": block diagonal matrix;
# "3": triplex matrix; "4": sparse matrix
###############################################################
mat.make<-function(p,rho,type){
  if(type=="1"){ # (1) AR(1) model
    times<-1:p
    H<-abs(outer(times,times,"-"))
    sigma<-rho^H
  }
  if(type=="2"){ # (2) block diagonal matrix
    matr1<-diag(1,p,p)
    matr2<-cbind(rep(0,p-1),diag(c(rep(c(0.6,0),p-2),0.6),p-1,p-1))
    matr3<-rbind(matr2,rep(0,p))
    sigma<-matr1+matr3+t(matr3)
  }
  if(type=="3"){ # (3) triplex matrix
    matr1<-diag(1,p,p)
    matr2<-cbind(rep(0,p-1),diag(0.5,p-1,p-1))
    matr3<-rbind(matr2,rep(0,p))
    matr4<-cbind(matrix(0,p-2,2),diag(0.2,p-2,p-2))
    matr5<-rbind(matr4,matrix(0,2,p))
    sigma<-matr1+matr3+matr5+t(matr3)+t(matr5)
  }
  if(type=="4"){ # (4) sparse matrix
    mat1<-matrix(0,p,p)
    for(i in 1:p){
      mat1[i,sample(c(1:p),1)]<-runif(1,1,2)*sample(c(-1,1),1)
    }
    mat2<-mat1%*%t(mat1)
    mat3<-mat2+1*diag(1,p,p)
    sigma<-diag(diag(mat3)^(-0.5),p,p)%*%mat3%*%diag(diag(mat3)^(-0.5),p,p)
  }
  return(sigma)
}
library(glasso)
p<-500; beta<-0.6; r<-0.8; s1<-0.3; s2<-0.9; alpha<-0.05
############################################################
# generate covariance matrices using function "mat.make"
############################################################
rho<-0.6; cov.type<-"1"
sigma1<-mat.make(p,rho,cov.type)
sigma2<-sigma1
gam<-eigen(sigma1,symmetric=T)$vectors%*%diag(sqrt(eigen(sigma1,symmetric=T)$values),p)
n1<-60; n2<-60 # the sample sizes
n<-n1*n2/(n1+n2)
mm<-floor(p^(1-beta)) # number of dimensions where the two means differ
replic<-30 # number of replications
count.FDR<-count.FNR<-NULL
count.TP<-NULL
rej<-n.rej<-NULL
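############################################################
# sanity check (illustrative addition): "gam" is a square root
# of sigma1, so Z%*%t(gam) below has covariance sigma1; the
# tolerance is an arbitrary choice
############################################################
stopifnot(max(abs(gam%*%t(gam)-sigma1))<1e-6)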
#############################################################
# loop to get the empirical mFDR, mFNR and the mean number of
# true positives
############################################################
for(i in 1:replic){
  # generate two random samples
  mu1<-rep(0,p); mu2<-rep(0,p)
  signal.sign<-sample(c(-1,1),mm,replace=T)
  signal.mag<-runif(mm,min=(sqrt(r*log(p)*(1/n1+1/n2))),max=(sqrt(3*r*log(p)*(1/n1+1/n2))))
  post<-sample(1:p,mm,replace=F); post<-sort(post)
  mu2[post]<-signal.sign*signal.mag
  Z_mat1<-matrix(rnorm(n1*p,0,1),ncol=p)
  Z_mat2<-matrix(rnorm(n2*p,0,1),ncol=p)
  sample1<-Z_mat1%*%t(gam)+matrix(mu1,ncol=p,nrow=n1,byrow=T)
  sample2<-Z_mat2%*%t(gam)+matrix(mu2,ncol=p,nrow=n2,byrow=T)
  #############################################################
  # different methods to estimate the precision matrix:
  # if bandable, use function "chole"; if sparse, use "glasso"
  #############################################################
  sample<-rbind(sample1,sample2)
  omega_hat<-chole(sample,1,p,(n1+n2)) # for bandable structure
  #omega_hat<-glasso(var(sample),rho=0.1)$wi # for sparse structure
  omega_reg<-omega_hat
  omega_reg[abs(omega_reg)<(log(p)/n)]<-0 # regularizing
  #############################################################
  # implement function "DATE" to estimate the difference between
  # the two mean vectors
  #############################################################
  signal<-DATE(sample1,sample2,omega_reg,s1,s2,alpha,n)
  errors<-FD_FN(abs(sign(signal)),abs(sign(mu2))) # compute once, reuse below
  count.FDR[i]<-errors[1]
  count.FNR[i]<-errors[2]
  count.TP[i]<-errors[3]
  rej[i]<-length(which(abs(sign(signal))==1))
  n.rej[i]<-length(which(abs(sign(signal))==0))
}
FDP<-count.FDR/rej; FDP[is.na(FDP)]<-0
mean.FDR<-mean(count.FDR)/mean(rej)
sd.FDR<-sd(FDP)
mean.FNR<-mean(count.FNR)/mean(n.rej)
sd.FNR<-sd(count.FNR/n.rej)
mean.TP<-mean(count.TP)
sd.TP<-sd(count.TP)
mean.FDR
mean.FNR
mean.TP
sd.FDR
sd.FNR
sd.TP
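##############################################################
# optional reporting sketch (not part of the original output):
# collect the summaries in one table; the name "results" is an
# arbitrary choice, and the lines are left commented out
##############################################################
# results<-data.frame(mFDR=mean.FDR,mFNR=mean.FNR,TP=mean.TP,
#                     sd.mFDR=sd.FDR,sd.mFNR=sd.FNR,sd.TP=sd.TP)
# round(results,3)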