function [pcmotifs] = buildInitialMotifPoolFromMultipleCountFiles(Y,motifcountfile,initialmotifpool,pc,M,iterate) [ngenes ncond]=size(Y); % Use the principal components of the filtered genes to select the initial % motif pool. [pcomp, score, latent, tsquare] = princomp(Y); %perc_exp = 100*latent/sum(latent); %for i=1:10 % subplot(5,2,i); % hold off % plot(pc(1:5,i)); % hold on % plot(pc(6:10,i),':'); % label = sprintf('pc %d, explains %2.1f%% of var.', i, perc_exp(i)); % ylim([-1 1]) % xlabel(label); %end % Strategy: do stepwise filtering. For each principal component, % 1. Add top M genes by regression pvalue. % 2. Add in best predictor, now add top r=floor(M/2) genes by pvalue. % 3. Add in best predictor, now add top r=floor(M/4) genes by pvalue. % ... repeat until we have 2M genes, or r=0. % Typically, use different values of M for different principal components. % The idea is to get rid of some redundancy in the top motifs by this % greedy approach. A better idea is to use the stochastic search by % Tadesse and Vanucci to narrow down on this set. nfiles=length(motifcountfile); pcmotifs = cell(1,length(pc)); combinedinitialmotifpool = {}; for k=1:nfiles combinedinitialmotifpool= {combinedinitialmotifpool{:} initialmotifpool{k}{:}}; end totalnmotifs=length(combinedinitialmotifpool); tdf=ngenes-1; for pcind = 1:length(pc) Y=score(:,pc(pcind)); % center Y around mean. Y = Y-mean(Y); r=M(pcind); ind=0; %indexes the size of the array of variables that were added to the model (not to the pool). xpool = []; % contains the X's already added to the model. while r>=1 %simply checking r>=1 will ensure that the motifpool <2M. ind=ind+1; fprintf('principal component %d: r=%d\n', pc(pcind), r); pval = zeros(1,totalnmotifs); minpval = Inf; minpvalX = []; minpvalmotif = ''; motifind=0; for k=1:nfiles fid = fopen(motifcountfile{k}); l = fgetl(fid); nmotifs = length(initialmotifpool{k}); for i=1:nmotifs % i=i+1; motifind = motifind+1; motifname=textscan(fid,'%s\t',1); X=textscan(fid,'%d\t',ngenes); X=double(X{1}); % center X around mean X = X-mean(X); % project X onto complementary space of xpool if length(xpool)>0 X = X-xpool*inv(xpool'*xpool)*(xpool'*X); end % do simple linear regression and calculate p-value. XtX=X'*X; betahat = (X'*Y)/XtX; resid = Y-X*betahat; rss = resid'*resid/ngenes; sigmabhat = sqrt(rss/XtX); t = abs(betahat)/sigmabhat; pval(motifind) = 2*(1-tcdf(t,tdf)); if pval(motifind)