% Use the principal components of the filtered genes to select the initial % motif pool. load filtered1500.mat Y = filtered1500_Y-repmat(mean(filtered1500_Y,2),1,10); [pc, score, latent, tsquare] = princomp(Y); %perc_exp = 100*latent/sum(latent); %for i=1:10 % subplot(5,2,i); % hold off % plot(pc(1:5,i)); % hold on % plot(pc(6:10,i),':'); % label = sprintf('pc %d, explains %2.1f%% of var.', i, perc_exp(i)); % ylim([-1 1]) % xlabel(label); %end % Strategy: do stepwise filtering. For each principal component, % 1. Add top M genes by regression pvalue. % 2. Add in best predictor, now add top r=floor(M/2) genes by pvalue. % 3. Add in best predictor, now add top r=floor(M/4) genes by pvalue. % ... repeat until we have 2M genes, or r=0. % Typically, use different values of M for different principal components. % The idea is to get rid of some redundancy in the top motifs by this % greedy approach. A better idea is to use the stochastic search by % Tadesse and Vanucci to narrow down on this set. load 'len6fuzzymotifs.mat' motifcountfile = 'filtered1500_Len6FuzzyMotifCounts.txt'; initialmotifpool=len6fuzzymotifs; minpc=1; maxpc=1; M=[150]; pcmotifs = cell(1,maxpc-minpc+1); nmotifs = length(initialmotifpool); ngenes = length(filtered1500_geneID); tdf=ngenes-1; for pcind = minpc:maxpc Y=score(:,pcind); % center Y around mean. Y = Y-mean(Y); pcind2=pcind-minpc+1; r=M(pcind2); while r>=1 %simply checking r>=1 will ensure that the motifpool <2M. fprintf('principal component %d: r=%d\n', pcind, r); fid = fopen(motifcountfile); l = fgetl(fid); pval = zeros(1,nmotifs); minpval = Inf; minpvalX = []; xpool = []; % contains the X's already added to the model. minpvalmotif = ''; for i=2:nmotifs motifname=textscan(fid,'%s\t',1); X=textscan(fid,'%d\t',ngenes); X=double(X{1}); % center X around mean X = X-mean(X); % project X onto complementary space of xpool if length(xpool)>0 X = X-xpool*(xpool'*xpool)\(xpool'*X); end % do simple linear regression and calculate p-value. XtX=X'*X; if XtX==0 fprintf('i=%d: XtX=%d, %s\n', i, XtX,motifname{1}{1}); end betahat = (X'*Y)/XtX; resid = Y-X*betahat; rss = resid'*resid/ngenes; sigmabhat = sqrt(rss/XtX); t = betahat/sigmabhat; pval(i) = 1-tcdf(t,tdf); if pval(i)