% % ---- GET PROMOTER DATA -------- [Header, Sequence] = fastaread('yeastdata/utr5_sc_1000.fasta'); Npromoters = length(Header); % get the gene ids promoterID = cell(1,Npromoters); for i=1:Npromoters promoterID{i} = Header{i}(1:7); end [promoterID,IX] = sort(promoterID); % promoterID{IX(i)} = promoterIDsorted{i} Sequence = Sequence(IX); SequenceLen=1000; % ---- GET MICROARRAY DATA ------- cellcycle=importdata('yeastdata/cellcycle_combined_imputed.txt') [G,N]=size(cellcycle.data); geneID=cellcycle.textdata(2:1+G,1); sampleID=cellcycle.textdata(1,2:1+N); cellcycle.alpha = cellcycle.data(:,7:24); ststr = sampleID(7:24); sampletimes=length(ststr); for i=1:length(ststr) temp=ststr{i}; % fprintf('%s\n',temp); [st ed extents match tokens names]=regexp(temp,'\d+'); temp2=temp(st(1):ed(1)); % fprintf('%d\n',str2num(temp2)); sampletimes(i)=str2num(temp2); end [G,N2]=size(cellcycle.alpha); % NEED TO TAKE OUT NaN VALUES!!! % ---- find all rows in data with NaN and take row out --- keeprow= ones(G,1); numnans = zeros(1,G); for i=1:G numnans(i) = sum(isnan(cellcycle.alpha(i,:))); if numnans(i)>0 keeprow(i)=0; end end alpha_keep=cellcycle.alpha(find(keeprow),:); geneID_keep=geneID(find(keeprow)); % ---- LINK THE DATA ------- indexInPromoterID = indexXinY(geneID_keep, promoterID); indices = find(indexInPromoterID); % get all the indexes of indexInPromoterID that correspond to non-zero elements. % promoterIDsorted{indexInPromoterID(indices(i))} is guaranteed to be equal % to locusIDsorted{indices(i)} for each i=1,...,length(indices). % Y are the gene expression values. Y = zeros(length(indices), N2); for i=1:length(indices) Y(i,:) = alpha_keep(indices(i),:); end % promoter and promoterComp are the promoter sequences. promoter = Sequence(indexInPromoterID(indices)); geneID = promoterID(indexInPromoterID(indices)); PROMOTER_LEN=700; originalLen=zeros(1,length(promoter)); for i=1:length(promoter) originalLen(i)=length(promoter{i}); promoter{i} = promoter{i}(max(originalLen(i)-PROMOTER_LEN+1,1):originalLen(i)); end save 'cellcycle_alpha.mat' geneID promoter Y sampletimes % ---- count the number of occurrences of each motif of length motiflen. % CAUTION: THIS IS A VERY LENGTHY PROCESS AND CAN TAKE TWO DAYS! load 'cellcycle_alpha.mat' for motiflen=5:7 fprintf('Processing motiflen=%d...\n',motiflen); fuzzypos=[]; outfn=sprintf('cellcycle_imputed_700upstream_len%dnonfuzzy_motifcounts.txt',motiflen); tic [motifs]=fuzzyMotifFunction(geneID,promoter,motiflen,fuzzypos,outfn); toc end % ---- Look at principal components ---- [pc, score, latent, tsquare] = princomp(Y); perc_exp = 100*latent/sum(latent); for i=1:18 subplot(6,3,i); plot(sampletimes,pc(:,i),'.'); label = sprintf('pc %d, explains %2.1f%% of var.', i, perc_exp(i)); ylim([-1 1]) xlabel(label); end [G,T]=size(Y); Y_normed=Y ./ repmat(sqrt(var(Y)),G,1); [pc, score, latent, tsquare] = princomp(Y_normed); perc_exp = 100*latent/sum(latent); for i=1:18 subplot(6,3,i); hold off plot(sampletimes,pc(:,i),'.'); hold on plot(sampletimes,pc(:,i)); label = sprintf('pc %d, explains %2.1f%% of var.', i, perc_exp(i)); ylim([-1 1]) xlim([sampletimes(1),sampletimes(length(sampletimes))]) xlabel(label); end subplot(1,1,1) plot(sampletimes,pc(:,1),'b') hold on plot(sampletimes,pc(:,2),'r') plot(sampletimes,pc(:,3),'g') plot(sampletimes,pc(:,10),'k') % ---- choose the pc's that are useful. ---- pcset=[1 2 3]; pcset=[1:17]; pcset=[1]; pcset=[2]; pcset=[3]; motiflen=7; % ---- cluster analysis ------ k=10; Y_clust = kmeans(Y,k); cellcycletp = [14 28 49 63 77 91 112] for i=1:k subplot(ceil(k/2),2,i) plot(sampletimes,mean(Y(find(Y_clust==i),:))); title(sprintf('Cluster %d: %d genes',i,sum(Y_clust==i))); set(gca,'XTick',cellcycletp); end % plot each cluster. ind=9; subplot(1,1,1); ingenes=find(Y_clust==ind); for i=1:length(ingenes) plot(sampletimes,Y(ingenes(i),:),'m'); hold on end title(sprintf('Cluster %d: %d genes',ind,sum(Y_clust==ind))); hold off set(gca,'XTick',cellcycletp); % ----------------------------------------------------------- % Create a test and control gene list for motif search % % ----------------------------------------------------------- load 'cellcycle_alpha.mat' fid = fopen('cellcycle_Spellman800.txt'); temp= textscan(fid, '%s', 'delimiter', '\t'); fclose(fid); test_IDs = temp{1}; test_IDs = sort(test_IDs); testinds = indexXinY(test_IDs, geneID); % take out genes that don't appear in master list. test_IDs = test_IDs(find(testinds)); testinds = indexXinY(test_IDs, geneID); % now all of testinds should be >0. numtest = length(test_IDs); % ---- look at these genes. ---- for i=1:length(testinds) plot(sampletimes,Y(testinds(i),:)); hold on end k=6; test_clust = kmeans(Y(testinds,:),6); ind=1; ingenes=find(test_clust==ind); for i=1:length(ingenes) plot(sampletimes,Y(testinds(ingenes(i)),:),'m'); hold on end title(sprintf('Cluster %d: %d genes',ind,sum(test_clust==ind))); hold off cellcycletp = [14 28 49 63 77 91 112] for i=1:k subplot(k,1,i) plot(sampletimes,mean(Y(testinds(find(test_clust==i)),:))); title(sprintf('Cluster %d: %d genes',i,sum(test_clust==i))); set(gca,'XTick',cellcycletp); end % ---- make a control list, adhoc, same number as test list ---- numcontrols = numtest; k=10; Y_clust = kmeans(Y,k); % inspect the clusters, find the least interesting one. for i=1:k subplot(ceil(k/2),2,i) plot(sampletimes,mean(Y(find(Y_clust==i),:))); title(sprintf('Cluster %d: %d genes',i,sum(Y_clust==i))); set(gca,'XTick',cellcycletp); end % cluster 3 is the winner, it has more than numcontrol genes. clusterinds =[8]; ingenes=[]; for i=1:length(clusterinds) ingenes=[ingenes find(Y_clust==clusterinds(i))']; end subplot(1,1,1); for i=1:length(ingenes) plot(sampletimes,Y(ingenes(i),:),'m'); hold on end title(sprintf('Ingenes %d',sum(ingenes))); hold off diffinds=setdiff(ingenes,testinds); control_inds = diffinds(randsample(length(diffinds),numcontrols)); control_IDs = geneID(control_inds); % look at the control exp profiles. hold off for i=1:numcontrols plot(sampletimes,Y(control_inds(i),:),'m'); hold on end cellcycletp = [14 28 49 63 77 91 112] % output gene list to file. fid=fopen('cellcycle_testcontrol_2.txt','w'); for i=1:numtest fprintf(fid,'%s\n',test_IDs{i}); end for i=1:numcontrols fprintf(fid,'%s\n',control_IDs{i}); end fclose(fid); % ---- load gene list from file, create filtered.mat ---- fid=fopen('cellcycle_testcontrol_2.txt'); temp = textscan(fid,'%s\n'); filteredIDs = temp{1}; fclose(fid); load 'cellcycle_alpha.mat' inds=indexXinY(filteredIDs, geneID); Y = Y(inds,:); promoter=promoter(inds); geneID=geneID(inds); save 'cellcycle_testcontrol_2.mat' Y promoter geneID sampletimes ---- do the motif count files ---- filteredfn = 'cellcycle_testcontrol_2'; load([filteredfn '.mat']); motifCountfn='cellcycle_imputed_700upstream_len5nonfuzzy_motifcounts.txt'; load len5nonfuzzymotifs.mat; nmotifs=length(len5nonfuzzymotifs); outputfn = [filteredfn '_imputed_700upstream_len5nonfuzzy_motifcounts.txt']; geneIDprefix='Y'; geneIDlen=7; counts=getMotifCountsForGeneListFromMasterFile(motifCountfn, geneID,nmotifs, outputfn,geneIDprefix,geneIDlen); % ---- Get the Master 800 annotations. ---- fid=fopen('YeastMaster800Annotation.txt'); temp=textscan(fid,'%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s','delimiter','\t'); fclose(fid); Process=temp{1}(2:800); Function=temp{2}(2:800); Peak=temp{3}(2:800); PhaseOrder=temp{4}(2:800); ClusterOrder=temp{5}(2:800); ORF=temp{6}(2:800); YPD=temp{7}(2:800); SGD=temp{8}(2:800); Geomean=temp{9}(2:800); Absolute=temp{10}(2:800); Deletion=temp{11}(2:800); Known=temp{12}(2:800); Description=temp{13}(2:800); AggregateScore=temp{14}(2:800); Phase=temp{15}(2:800); save 'YeastMaster800Annotation.mat' Process Function Peak PhaseOrder ClusterOrder ... ORF YPD SGD Geomean Absolute Deletion Known Description AggregateScore Phase