% ---------------------------------------------------------% % LOAD DATA: the filtered, processed data are stored in % % cellcycle_testcontrol_2.mat. % % ---------------------------------------------------------% filteredfn='yeastdata/cellcycle_testcontrol_2' % This file contains: % Y: the matrix of Yg values (expression vector for each gene). % geneID: the cell array of gene names. % promoter: the cell array of promoter sequences for each gene. % sampletimes: the times the samples were taken. load( [filteredfn '.mat']) motiflen=[5]; % we want to look at length 5,6,7 motifs. promoterlen=700; % length of promoter sequence. fuzzypos=[]; % which positions within a motif can be fuzzy? currently, none. pc=[1 2 3]; % the principal components to keep. M=[100 100 100]; % size of dictionary for each principal component. maxdepth=3; % maximum depth of interactions allowed. [ngenes ncond] = size(Y); Y= Y-repmat(mean(Y,2),1,ncond); Y=Y ./ repmat(sqrt(var(Y)),ngenes,1); % --- plot out the principal components. [pcomp, score, latent, tsquare] = princomp(Y); perc_exp = 100*latent/sum(latent); for i=1:18 hold off subplot(6,3,i); label = sprintf('pc %d, explains %2.1f%% of var.', i, perc_exp(i)); plotCellcycleTimeseries(sampletimes, pcomp(:,i),label, '','',[-1 1]); end % --- plot the percentage of variance explained. subplot(1,1,1) bar(perc_exp); ylabel('% Variance Explained') xlabel('Principal component number') xlim([0 length(perc_exp)]) set(gca,'XTick',[1:18]); % print -dpng 'cellcycle_testcontrol2_screeplot.png' % ---------------------------------------------------------% % STEP A: BUILD DICTIONARY % % ---------------------------------------------------------% t0=clock; initialmotifpool=cell(1,length(motiflen)); motifcountfile=cell(1,length(motiflen)); for i=1:length(motiflen) initialmotifpool{i} = enumerateMotifs(motiflen(i),'fuzzy',fuzzypos); % --- the "motif count files" store the count of motif i in promoter of % gene j. These are files containing large matrices of integers. motifcountfile{i}=sprintf('yeastdata/cellcycle_testcontrol_2_imputed_700upstream_len%dnonfuzzy_motifcounts.txt',motiflen(i)); end % The function called below does the work. [pcmotifs] = buildInitialMotifPoolFromMultipleCountFiles(Y,motifcountfile,initialmotifpool,pc,M,false) % Save results for future reference. save(sprintf('%s_len%s_pc%s_motifs.mat',filteredfn,num2str(motiflen,'%d'),num2str(pc,'%d'))); % Building the dictionary can sometimes take a while. If a dictionary is % pre-built, can skip the previous steps and simply load the file: %load(sprintf('%s_len%s_pc%s_motifs.mat',filteredfn,num2str(motiflen,'%d'),num2str(pc,'%d'))); % Take the union of motifs found for each length. words={}; %words is the pool of motifs to choose from. for i=1:length(pcmotifs) words=union(words, pcmotifs{i}); end fprintf('--- Building initial dictionary took %d seconds\n', etime(clock,t0)); t0=clock; % Find the count, loc, and reverseLoc of words in the dictionary % in promoters of filtered genes. This is important to speed up future % model building process. ns = length(words); counts = zeros(ngenes,ns); loc = cell(ngenes,ns); rloc = cell(ngenes,ns); fprintf('Building counts and location vectors...\n'); t0=clock; for j = 1:ns word = words{j}; fprintf('---------- j=%d, ',j);tic for i=1:ngenes [counts(i,j), loc{i,j}, rloc{i,j}] = motifCount(word, promoter{i}, false); end toc; end % Put the special word `TSS START' in the dictionary. words = {'TSS START' words{:}} counts = [ones(ngenes,1) counts]; tssloc=cell(ngenes,1); tssrloc=cell(ngenes,1); for i=1:ngenes tssloc{i}=[promoterlen]; tssrloc{i}=[1]; end temp=reshape({tssloc{:} loc{:}},ngenes,ns+1); loc=temp; temp=reshape({tssrloc{:} rloc{:}},ngenes,ns+1); rloc=temp; ns=length(words); fprintf('--- Building count vector took %d seconds\n', etime(clock,t0)); % ---------------------------------------------------------% % STEP B: FIT MODEL % % ---------------------------------------------------------% [pcomp, score, latent, tsquare] = princomp(Y); y = score(:,pc); mvweights=latent(pc).^2; fprintf('Starting lapem on %d motifs...', ns); t0=clock; w = ones(1,ngenes); intervals = [30 100 400]; modelInteractions=true; dumpfilename=sprintf('%s_pc%s_dump.mat',filteredfn,num2str(pc,'%d')); [beta,SE,PVAL,inpe,xin,xinInd,pepool,xpool] = lapem5(y,counts,loc,rloc,words,intervals, 'dumpfile', dumpfilename, 'pthreshold',0.01,'maxin',40,'maxdepth',maxdepth,'mvweights',mvweights); fprintf('--- lapem took %d seconds\n', etime(clock,t0)); % save results to file. savefilename=sprintf('%s_pc%s_len%s_depth%d_save.mat',filteredfn,num2str(pc,'%d'),num2str(motiflen,'%d'),maxdepth); save(savefilename, '*'); % ---------------------------------------------------------% % STEP C: PRUNE MODEL % % ---------------------------------------------------------% %lambda=[NaN,2,3,5,10] lambda=NaN; [inmodel,gcv,history,RSS,DOF,inmodelHistory] = backwardsStepwisePrune(xin,inpe,y,lambda,'mvweights',mvweights); %inthismodel=inmodelHistory{30}; inthismodel=inmodel; fprintf('After pruning, %d variables left.\n',sum(inmodel)); for i=1:length(inpe) if(inthismodel(i+1)) fprintf('%s\n',char(inpe{i})); end end prunedFileName=sprintf('%s_pc%s_depth%d_pruned.mat',filteredfn,num2str(pc,'%d'),maxdepth); save(prunedFileName, 'inmodel','gcv','RSS','inmodelHistory','xin','inpe','y','pc','geneID','maxdepth'); plot([40:-1:1],gcv) [mingcv,minix]=min(gcv); line([40-minix 40-minix],ylim,'Color','r') for i=1:length(inpe) if(inmodel(i+1)) fprintf('%s\n',char(inpe{i})); end end % -- SUDAR: you don't need to worry about the code below for now. % % % ---- Produce a table of motifs. % load(savefilename) % % load(prunedFileName) % xinfinal=xin; % numPruneSteps = length(inmodelHistory); % lastinModel = inmodelHistory{numPruneSteps}; % prunedIndex = zeros(1,numPruneSteps-1); % for i=1:numPruneSteps-2 % currinModel = inmodelHistory{numPruneSteps-i}; % prunedIndex(i) = find(currinModel-lastinModel); % lastinModel = currinModel; % end % prunedIndex(numPruneSteps-1) = find(ones(1,numPruneSteps)-lastinModel); % % % get the master 800 list. % fid = fopen('cellcycle_Spellman800.txt'); % temp= textscan(fid, '%s', 'delimiter', '\t'); % fclose(fid); % master800_IDs = temp{1}; % % numGenesWithMotif = zeros(1,length(prunedIndex)); % numGenesInSpellman800 = zeros(1,length(prunedIndex)); % fid=fopen('motiflist.txt','w'); % fprintf(fid,'Motif\tn\tm\n'); % for i=1:length(prunedIndex) % motifind = prunedIndex(i)-1; % motifstr = inpe{motifind}; % geneids = find(xinfinal(:,motifind+1)>0); % genesWithMotif=geneID(geneids); % numGenesWithMotif(i) = length(genesWithMotif); % promotersWithMotif=promoter(geneids); % % which genes with motif are in master list? % inmaster = indexXinY(genesWithMotif,master800_IDs); % numGenesInSpellman800(i) = sum(inmaster~=0); % fprintf(fid,'%s \t %d \t %d \n',char(motifstr),numGenesWithMotif(i),numGenesInSpellman800(i)); % end % fclose(fid); % % % % ---- PLOT THE PROFILES FOR EACH MOTIF ----- % % finalind=[1:40]; % xinfinal = xin(:,finalind); % inpefinal = inpe(finalind(2:length(finalind))-1); % % beta= (xinfinal'*xinfinal)\(xinfinal')*Y; % [dummy,M]=size(xin); % [dummy,T]=size(Y); % beta=zeros(M-1,T); % for i=1:M-1 % temp= (xinfinal(:,[1 i+1])'*xinfinal(:,[1 i+1]))\(xinfinal(:,[1 i+1])')*Y; % beta(i,:)=temp(2,:); % end % % cellcycletp = [14 28 49 63 77 91 112] % % [M T] = size(beta); % % maxY = max(beta(1:M,:), [], 2); % minY = min(beta(1:M,:), [], 2); % maxRange = max(maxY-minY); % yRange = maxRange+0.1; % % for i=1:M % m=i+1; % numpts = sum(xinfinal(:,m)~=0); % plotnum = mod(i,8); % if plotnum==0 % plotnum=8; % end % subplot(4,2,plotnum); % plot(sampletimes, beta(i,:),'b'); % xlabel(sprintf('%d. %s, %d',i, char(inpefinal{i}),numpts)); % hold off % xlim([min(sampletimes) max(sampletimes)]); % % ylim([minY(i)-0.1 minY(i)+yRange]); % % set(gca,'XTick',cellcycletp); % if (mod(i,8)==0) % fname = sprintf('%s_pc%s_depth%d_marginal_%d.ps',filteredfn,num2str(pc,'%d'),maxdepth,i/8 ); % print('-depsc2', fname) % subplot(1,1,1); % end % end % % if(mod(M-1,16)~=0) % fname = sprintf('%s_pc%s_depth%d_marginal_%d.ps',filteredfn,num2str(pc,'%d'),maxdepth,ceil((M-1)/8) ); % print('-depsc2', fname); % end % % % ---- ANALYZE THE GENES FOR A SINGLE PROMOTER ELEMENT ---- % % % get gene list. % motifind=25; % fprintf('Analyzing element %s.\n',char(inpe{motifind})); % geneids = find(xinfinal(:,motifind+1)>0); % genesWithMotif=geneID(geneids); % promotersWithMotif=promoter(geneids); % % % get the master 800 list. % fid = fopen('cellcycle_Spellman800.txt'); % temp= textscan(fid, '%s', 'delimiter', '\t'); % fclose(fid); % master800_IDs = temp{1}; % % % which genes with motif are in master list? % inmaster = indexXinY(genesWithMotif,master800_IDs); % numinmaster = sum(inmaster~=0); % % % % ----- Plot the effect curve ----- % subplot(1,1,1) % hold off % mainlabel=sprintf('%s',char(inpefinal{motifind})); % plotCellcycleTimeseries(sampletimes, beta(motifind,:), 'time (minutes)', 'effect',mainlabel, [min(beta(motifind,:))-0.1,max(beta(motifind,:))+0.1]); % % fname = sprintf('%s_effects.png',char(inpefinal{motifind},'forfile',true) ); % print('-depsc2', fname) % % % -------- Plot the occurrences upstream for these motifs, and get flanking % % sequence. --------------------------------------------------------------- % % % pe=inpe{motifind}; % nflank=10; % plotPromoters=false; % matrixfn_prefix=sprintf('pe%d_',motifind); % flankingfn_prefix=sprintf('pe%d_',motifind); % switch depth(pe) % case 1 % [motifs, flankingseqs, matrices] = analyzeFlanking_depth1(pe, promotersWithMotif, genesWithMotif,nflank,promoterlen,plotPromoters,matrixfn_prefix,flankingfn_prefix); % case 2 % [motifs, flankingseqs, matrices] = analyzeFlanking_depth2(pe, promotersWithMotif, genesWithMotif,nflank,promoterlen,plotPromoters,matrixfn_prefix,flankingfn_prefix); % case 3 % [motifs, flankingseqs, matrices] = analyzeFlanking_depth3(pe, promotersWithMotif, genesWithMotif,nflank,promoterlen,plotPromoters,matrixfn_prefix,flankingfn_prefix); % otherwise % fprintf('Promoter element has depth greater than 3, do nothing.\n'); % end % % % % find id in annotations. % annoids = indexXinY(genesWithMotif,ORF); % % if length(geneids) <= 56 % % % plot the profiles of these genes. % % for i=1:length(geneids) % % plotnum = mod(i,8); % % if plotnum==0 % % plotnum=8; % % end % % subplot(4,2,plotnum) % % hold off % % plot(sampletimes,Y(geneids(i),:)) % % hold on % % plot(sampletimes,Y(geneids(i),:),'.') % % set(gca,'XTick',cellcycletp); % % % % titlestr = genesWithMotif{i}; % % if annoids(i)~=0 % % titlestr = [titlestr ' ' YPD{annoids(i)}]; % % end % % title(titlestr) % % if (mod(i,8)==0) % % fname = sprintf('genes_with_%s_profiles_%d.ps',char(inpe{motifind},'forfile',true),i/8 ); % % print('-depsc2', fname) % % subplot(1,1,1); % % end % % end % % % % if(mod(length(geneids),8)~=0) % % fname = sprintf('genes_with_%s_profiles_%d.ps',char(inpe{motifind},'forfile',true),ceil((length(geneids)-1)/8)); % % print('-depsc2', fname); % % end % % end % % % % fid=fopen(sprintf('genes_with_%s_annotations.txt',char(inpe{motifind},'forfile',true)),'w'); % % fprintf(fid,'ORF\tYPD\tSGD\tProcess\tFunction\tPeak\tDescription\n'); % % for i=1:length(geneids) % % fprintf(fid,'%s\t',genesWithMotif{i}); % % if annoids(i) ~=0 % % fprintf(fid,'%s\t%s\t%s\t%s\t%s\t%s',YPD{annoids(i)},SGD{annoids(i)},Process{annoids(i)},Function{annoids(i)},Peak{annoids(i)},Description{annoids(i)}); % % end % % fprintf(fid,'\n'); % % end % % fclose(fid); % % % % --- Plot out the profile for this motif.