# script for re-analysis of Dave et al data # R. Tibshirani 2005 #read in clinical data from Dave et al website d<- matrix(scan("FL_Clinical_Data.txt",sep="\t",what=""),ncol=13, byrow=TRUE) collabs.clin=d[1,] sample.num.clin<-d[-1,1] traintest.clin<-as.numeric(d[-1,2]) tim<-as.numeric(d[-1,6]) status<-as.numeric(d[-1,7]) stage<-as.numeric(d[-1,12]) ps=as.numeric(d[-1,11]) ipi.gp=as.numeric(d[-1,13]) ir1=as.numeric(d[-1,3]) ir2=as.numeric(d[-1,4]) model.value=as.numeric(d[-1,5]) #read in expression data from Dave et al Website dd<- scan("FLdata.txt",sep="\t",what="") ddd<-matrix(dd,byrow=TRUE,ncol=195) genenames<- ddd[-1,4] geneid=ddd[-1,1] collabs.exp=ddd[1,] sample.num.exp=collabs.exp[-(1:4)] # match clinical data to expression data # everything is kept in clin data order x<-matrix(as.numeric(as.character(ddd[-1,-(1:4)])),ncol=191) o=match(paste("FL",sample.num.clin,sep=""),sample.num.exp) xc=x[,o] om<-!is.na(status) xcomp=xc[,traintest.clin==1] xcc=xc[,om] status<-status[om] tim<-tim[om] traintest.clin=traintest.clin[om] stage=stage[om] ipi.gp=ipi.gp[om] ps=ps[om] ir1=ir1[om] ir2=ir2[om] model.value=model.value[om] sample.num.clin=sample.num.clin[om] xtr<-xcc[,traintest.clin==1] tim.tr<-tim[traintest.clin==1] icens.tr<-status[traintest.clin==1] sample.num.clin.train=sample.num.clin[traintest.clin==1] ir1.tr=ir1[traintest.clin==1] ir2.tr=ir2[traintest.clin==1] model.value.tr=model.value[traintest.clin==1] xte<-xcc[,traintest.clin==2] tim.te<-tim[traintest.clin==2] icens.te<-status[traintest.clin==2] ir1.te=ir1[traintest.clin==2] ir2.te=ir2[traintest.clin==2] model.value.te=model.value[traintest.clin==2] sample.num.clin.test=sample.num.clin[traintest.clin==2] library(survival) d3<-list(x=xtr,y=tim.tr,censoring.status=icens.tr, featurenames=genenames, survival=T) d3.test<- list(x=xte,y=tim.te,censoring.status=icens.te, survival=T) # compute wald tests for each gene wald=rep(NA,nrow(d3$x)) wald.te=rep(NA,nrow(d3$x)) for(i in 1:nrow(d3$x)){ cat(i,fill=T) junk=coxph(Surv(d3$y,d3$cens)~d3$x[i,]) wald[i]=sign(junk$coef)*sqrt(junk$wald) junk2=coxph(Surv(d3.test$y,d3.test$cens)~d3.test$x[i,]) wald.te[i]=sign(junk2$coef)*sqrt(junk2$wald) } # try to repeat analysis of Dave et al # filter genes oo=abs(wald)>1.645 o33=apply(d3$x,1,median) > 6 x1=d3$x[oo& wald>0 &o33,] x2=d3$x[oo& wald<0 &o33, ] x1.test=d3.test$x[oo& wald>0 & o33,] x2.test=d3.test$x[oo& wald<0 & o33,] genenames1=genenames[oo& wald>0 &o33] genenames2=genenames[oo& wald<0 &o33] x1c<-t(scale(t(x1),center=TRUE,scale=TRUE))/sqrt(ncol(x1)-1) ## extract positive gene clusters from Eisen's Cluster program applied # to this data options(expressions=3000) # dave.gtr is the gtr file from Cluster, provided by George Wright # dave.ctr contains first two columns of the Cluster ctr file, provided by George Wright foo=matrix(scan("dave.gtr",sep="\t",what="" ),ncol=4,byrow=T) g1=matrix(scan("dave.ctr",sep="\t",what="" ),ncol=2,byrow=T) p=nrow(foo) foo[,1]=1:p for(i in 1:p){ for(j in 2:3){ if(substring(foo[i,j],1,4)=="NODE"){ nn=nchar(foo[i,j]) foo[i,j] =substring(foo[i,j],5,nn-1) }}} for(i in 1:p){ for(j in 2:3){ if(substring(foo[i,j],1,4)=="GENE"){ foo[i,j]=-(1+as.numeric(substring(foo[i,j],5,nchar(foo[i,j])-1))) }}} k1=list(merge=matrix(as.numeric(foo[,2:3]),ncol=2)) rho1=as.numeric(foo[,4]) list1=vector("list",300) ii=0 for(i in 1:nrow(k1$merge)){ cat(i,fill=T) if(k1$merge[i,1]>0 | k1$merge[i,2]>0 ){ j1=my.descendants(k1$merge,i,1) j2=my.descendants(k1$merge,i,2) jj=c(j1,j2) if(length(jj)>24 & length(jj)<51 ){ if(rho1[i] > .5){ ii=ii+1 list1[[ii]]=jj }}}} list1=list1[1:ii] ############# same for other direction options(expressions=3000) foo=matrix(scan("h2.txt",sep="\t",what="" ),ncol=4,byrow=T) g2=matrix(scan("geneid2",sep="\t",what="" ),ncol=2,byrow=T) p=nrow(foo) foo[,1]=1:p for(i in 1:p){ for(j in 2:3){ if(substring(foo[i,j],1,4)=="NODE"){ nn=nchar(foo[i,j]) foo[i,j] =substring(foo[i,j],5,nn-1) }}} for(i in 1:p){ for(j in 2:3){ if(substring(foo[i,j],1,4)=="GENE"){ foo[i,j]=-(1+as.numeric(substring(foo[i,j],5,nchar(foo[i,j])-1))) }}} k2=list(merge=matrix(as.numeric(foo[,2:3]),ncol=2)) rho=as.numeric(foo[,4]) list2=vector("list",300) ii=0 for(i in 1:nrow(k2$merge)){ cat(i,fill=T) if(k2$merge[i,1]>0 | k2$merge[i,2]>0 ){ j1=my.descendants(k2$merge,i,1) j2=my.descendants(k2$merge,i,2) jj=c(j1,j2) if(length(jj)>24 & length(jj)<51 ){ if(rho[i] > .5){ ii=ii+1 list2[[ii]]=jj }}}} list2=list2[1:ii] ## # form expression data matrix, in order GENE0x, GENE1x etc # o1=match(g1[,2],geneid) oo=paste("GENE",as.character(0:(nrow(g1)-1)),"X",sep="") ooo=match(oo,g1[,1]) x1=d3$x[o1,] x1=x1[ooo,] x1.test=d3.test$x[o1,] x1.test=x1.test[ooo,] geneid1=geneid[o1][ooo] o2=match(g2[,2],geneid) oo=paste("GENE",as.character(0:(nrow(g2)-1)),"X",sep="") ooo=match(oo,g2[,1]) o2=match(g2[,2],geneid) x2=d3$x[o2,] x2=x2[ooo,] x2.test=d3.test$x[o2,] x2.test=x2.test[ooo,] geneid2=geneid[o2][ooo] # now compute means of each cluster and fit Cox model to every pair library(survival) # version 1: try all possible pairs xall=rbind(x1,x2) xtest.all=rbind(x1.test, x2.test) listall=c(list1,list2) for(i in (length(list1)+1): length(listall)){ listall[[i]]= listall[[i]]+ nrow(x1) } nn=length(listall) lik=matrix(NA,ncol=3,nrow=nn*(nn-1)/2) likt=matrix(NA,ncol=3,nrow=nn*(nn-1)/2) ii=0 for(i in 1:(nn-1)){ for(j in (i+1):nn){ cat(c(i,j),fill=T) z1=apply(xall[listall[[i]],],2,mean) z2=apply(xall[listall[[j]],],2,mean) ii=ii+1 d=coxph(Surv(d3$y,d3$cens)~z1+z2)$log lik[ii,]=c(i,j,2*(d[2]-d[1])) z1t=apply(xtest.all[listall[[i]],],2,mean) z2t=apply(xtest.all[listall[[j]],],2,mean) d2=coxph(Surv(d3.test$y,d3.test$cens)~z1t+z2t)$log likt[ii,]=c(i,j,2*(d2[2]-d2[1])) }} # version 2: pick one positive and one negative cluster xall=rbind(x1,x2) xtest.all=rbind(x1.test, x2.test) n1=length(list1) n2=length(list2) lik=matrix(NA,ncol=3,nrow=n1*n2) likt=matrix(NA,ncol=3,nrow=n1*n2) ulik1=rep(NA,n1) ulik1t=rep(NA,n1) ulik2=rep(NA,n2) ulik2t=rep(NA,n2) for(i in 1:n1){ z1=apply(x1[list1[[i]],],2,mean) z1t=apply(x1.test[list1[[i]],],2,mean) d2=coxph(Surv(d3$y,d3$cens)~z1)$log ulik1[i]=2*(d2[2]-d2[1]) d2=coxph(Surv(d3.test$y,d3.test$cens)~z1t)$log ulik1t[i]=2*(d2[2]-d2[1]) } for(i in 1:n2){ z2=apply(x2[list2[[i]],],2,mean) z2t=apply(x2.test[list2[[i]],],2,mean) d2=coxph(Surv(d3$y,d3$cens)~z2)$log ulik2[i]=2*(d2[2]-d2[1]) d2=coxph(Surv(d3.test$y,d3.test$cens)~z2t)$log ulik2t[i]=2*(d2[2]-d2[1]) } ii=0 for(i in 1:n1){ for(j in 1:n2){ cat(c(i,j),fill=T) z1=apply(x1[list1[[i]],],2,mean) z2=apply(x2[list2[[j]],],2,mean) ii=ii+1 d=coxph(Surv(d3$y,d3$cens)~z1+z2)$log lik[ii,]=c(i,j,2*(d[2]-d[1])) z1t=apply(x1.test[list1[[i]],],2,mean) z2t=apply(x2.test[list2[[j]],],2,mean) d2=coxph(Surv(d3.test$y,d3.test$cens)~z1t+z2t)$log likt[ii,]=c(i,j,2*(d2[2]-d2[1])) }} # make a plot of train vs test likelihood ratio scores postscript(file="fig1.ps",width=5,height=5) plot(1-pchisq(lik[,3],2),1-pchisq(likt[,3],2),xlab="train set p-value", ylab="test set p-value", log="xy") abline(h=.05,lty=2) dev.off() # make univariate plots postscript(file="fig2.ps",width=8,height=4) par(mfrow=c(1,2)) plot(1-pchisq(ulik1,1), 1-pchisq(ulik1t,1), xlab="training pval", ylab="test pval", log="xy") title("poor prognosis genes") points((1-pchisq(ulik1[70],1)), (1-pchisq(ulik1t[70],1)), col=2 ) plot(1-pchisq(ulik2,1), 1-pchisq(ulik2t,1), xlab="training pval", ylab="test pval", log="xy") title("good prognosis genes") points((1-pchisq(ulik2[67],1)), (1-pchisq(ulik2t[67],1)), col=2 ) dev.off()