% % ---- GET PROMOTER DATA -------- [Header, Sequence] = fastaread('TAIR6_upstream_1000_20051108'); Npromoters = length(Header); % get the gene ids promoterID = cell(1,Npromoters); for i=1:Npromoters [v1,v2,v3,v4] = regexp(Header{i}, 'AT\w+'); promoterID{i} = v4{1}; end [promoterIDsorted,IX] = sort(promoterID); % promoterID{IX(i)} = promoterIDsorted{i} SequenceSorted = Sequence(IX); SequenceSortedReverse = cell(1,Npromoters); for i=1:Npromoters SequenceSortedReverse{i} = seqcomplement(SequenceSorted{i}); end HeaderIX = Header(IX); % ---- Get Expression Data ----------------- fid = fopen('RMA-april-04_altered.txt'); headers = textscan(fid, '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n', 1,'delimiter', '\t'); N=22747; locusIDs = cell(1,N); data = zeros(N, 48); for i=1:N temp = textscan(fid, '%s\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%n\t%s\n',1); locusIDs{i} = temp{50}; for j=1:48 data(i,j) = temp{j+1}; end end fclose(fid); for i=1:N locusIDs{i} = locusIDs{i}{1}; end % ---- Get list of filtered gene ids ---------- filteredIDs = load('filtered5000.txt'); filteredIDs = filteredIDs(find(filteredIDs<=N)); filteredData = data(filteredIDs, :); filteredGnames = locusIDs(filteredIDs); [filteredGnamesSorted,filteredIX] = sort(filteredGnames); filteredDataSorted = filteredData(filteredIX,:); % first index is time. % second index is gene number. % third index is Col (1) or eds (2). % fourth index is repeat number. filteredRMA(:,:,1,1) = filteredDataSorted(:, 1:6)'; filteredRMA(:,:,2,1) = filteredDataSorted(:, 7:12)'; filteredRMA(:,:,1,2) = filteredDataSorted(:, 13:18)'; filteredRMA(:,:,2,2) = filteredDataSorted(:, 19:24)'; filteredRMA(:,:,1,3) = filteredDataSorted(:, 25:30)'; filteredRMA(:,:,2,3) = filteredDataSorted(:, 31:36)'; filteredRMA(:,:,1,4) = filteredDataSorted(:, 37:42)'; filteredRMA(:,:,2,4) = filteredDataSorted(:, 43:48)'; avgfilteredRMA(:,:,1) = (filteredRMA(:,:,1,1)+filteredRMA(:,:,1,2)+filteredRMA(:,:,1,3)+filteredRMA(:,:,1,4))/4; avgfilteredRMA(:,:,2) = (filteredRMA(:,:,2,1)+filteredRMA(:,:,2,2)+filteredRMA(:,:,2,3)+filteredRMA(:,:,2,4))/4; sampletimes = [0 6 24 72 120 168]; % ---- Link the two datasets ---------- Nfiltered = length(filteredIDs); counterP = 1; indexInPromoterID = zeros(1,Nfiltered); %indexInPromoterID(i) holds the index in promoterID of the locusIDsorted{i} for counterM = 1:Nfiltered while (counterP<=Npromoters && mystrcmpi(filteredGnamesSorted{counterM}, promoterIDsorted{counterP})>0) counterP = counterP+1; end if counterP>Npromoters for i=counterM:Nfiltered indexInPromoterID(i) = 0; end break; end if mystrcmpi(filteredGnamesSorted{counterM}, promoterIDsorted{counterP})== 0 indexInPromoterID(counterM) = counterP; else indexInPromoterID(counterM) = 0; end end filtered_indices = find(indexInPromoterID); % get all the indexes of indexInPromoterID that correspond to non-zero elements. % promoterIDsorted{indexInPromoterID(indices(i))} is guaranteed to be equal % to locusIDsorted{indices(i)} for each i=1,...,length(indices). N_final = length(filtered_indices); % Y are the gene expression values. % FIRST 5 are COL SECOND 5 are EDS filteredY = zeros(length(filtered_indices), 10); for i=1:length(filtered_indices) filteredY(i,1:5) = avgfilteredRMA([1 3 4 5 6],filtered_indices(i),1); filteredY(i,6:10) = avgfilteredRMA([1 3 4 5 6],filtered_indices(i),2); end % promoter and promoterReverse are the promoter sequences. promoter = SequenceSorted(indexInPromoterID(filtered_indices)); promoterReverse = SequenceSortedReverse(indexInPromoterID(filtered_indices)); geneID = promoterIDsorted(indexInPromoterID(filtered_indices)); HeaderIXsorted = HeaderIX(indexInPromoterID(filtered_indices)); % NOTE promoterReverse is actually the complementary strand. % ---- orient the promoter sequences from -1, -2,... -1000 ------ for i=1:N_final if length(regexp(HeaderIXsorted{i},'BACKWARD'))>0 promoter{i} = seqreverse(promoter{i}); promoterReverse{i} = seqreverse(promoterReverse{i}); end end save 'filteredData.mat' filteredY filtered_indices promoter promoterReverse geneID % ---- PCA --------------------- timeaxis = [0 1 3 5 7] fY_genecentered = filteredY - repmat(mean(filteredY, 2), 1, 10); [pc, score, latent, tsquare] = princomp(fY_genecentered); perc_exp = 100*latent/sum(latent); for i=1:10 subplot(5,2,i); hold off plot(timeaxis, pc(1:5,i),'-o'); hold on plot(timeaxis, pc(6:10,i),':+'); label = sprintf('pc %d, explains %2.1f%% of var.', i, perc_exp(i)); ylim([-1 1]) xlim([0 7]) xlabel(label); set(gca,'XTick',[0 1 3 5 7]); end plotmatrix(score(:,1:9)) title('Pair-wise Principle Component Plot') [H,AX,BigAx,P,PAx] = PLOTMATRIX(score(:,1:4)); % among filtered genes the score on pc1 is bi-modal! subplot(4,1,1) hist(score(:,1), 100) h = findobj(gca,'Type','patch'); set(h, 'FaceColor', 'c', 'EdgeColor', 'b') subplot(4,1,2) hist(score(:,2), 100) h = findobj(gca,'Type','patch'); set(h, 'FaceColor', 'c', 'EdgeColor', 'b') subplot(4,1,3) hist(score(:,3), 100) h = findobj(gca,'Type','patch'); set(h, 'FaceColor', 'c', 'EdgeColor', 'b') subplot(4,1,4) hist(score(:,4), 100) h = findobj(gca,'Type','patch'); set(h, 'FaceColor', 'c', 'EdgeColor', 'b') n = length(filteredY); pc1 = zeros(2,n); pc1(1,:) = score(:,1)>2; pc1(2,:) = score(:,1)<-2; pc2 = zeros(3,n); pc2(1,:) = score(:,2)>1; pc2(3,:) = score(:,2)<-1; pc2(2,:) = ~(pc2(1,:) | pc2(3,:)); pc3 = zeros(2,n); pc3(1,:) = score(:,3)>1; pc3(3,:) = score(:,3)<-1; pc3(2,:) = ~(pc3(1,:) | pc3(3,:)); pc4 = zeros(2,n); pc4(1,:) = score(:,4)>1; pc4(3,:) = score(:,4)<-1; pc4(2,:) = ~(pc4(1,:) | pc4(3,:)); pcsetinds = zeros(length(filteredY),2,3,3,3); pcsets = cell(2,3,3,3); for i=1:2 for j=1:3 for k=1:3 for l=1:3 pcsetinds(:,i,j,k,l) = pc1(i,:) & pc2(j,:) & pc3(k,:) & pc4(l,:); pcsets{i,j,k,l} = filteredY(find(pcsetinds(:,i,j,k,l)),:); end end end end i=2; j=1;k=2;l=2; [setsize, temp] = size(pcsets{i,j,k,l}); for c=1:setsize plotnum = mod(c,8); if plotnum==0 plotnum=8; end subplot(4,2,plotnum); plot(timeaxis, pcsets{i,j,k,l}(c, 1:5), '-'); hold on plot(timeaxis, pcsets{i,j,k,l}(c, 6:10), '--'); hold off xlim([0 7]); set(gca,'XTick',[0 1 3 5 7]); if (mod(c,8)==0) fname = sprintf('set%d%d%d%d_%d.ps',i,j,k,l,c/8 ); print('-depsc2', fname) subplot(1,1,1); end end if(mod(setsize,16)~=0) fname =sprintf('set%d%d%d%d_%d.ps',i,j,k,l,ceil(setsize/8)); print('-depsc2', fname); end % --------------------------------------------- % plot the promoter regions of these genes indices = find(pcsetinds(:,i,j,k,l)) words = {'TGAC' 'CAGT' 'TTTTCAG' 'GACTTTT' 'GAGTGAG' 'CCACCC' 'AAGTCAA' 'ACACA'}; count = zeros(length(words), length(indices)); loc = cell(length(words), length(indices)); reverseLoc = cell(length(words), length(indices)); for j=1:length(words) w = words{j} for i=1:size(indices) [count(j,i), loc{j,i}, reverseLoc{j,i}] = motifCount(w, promoter{indices(i)}, promoterReverse{indices(i)}); end end colors = [1 0 0; 0.5 0.5 0; 0 0.5 0.5; 0 1 0; 0 0 1; 0.8 0.8 0; 0.8 0 0.8; 1 1 0] symbols = ['>' '<' '>' '<' 'd' 'd' 'd' 'd'] h = zeros(1,length(words)) hold off for j=1:length(words) for k=1:length(indices) temp =plotMotifLoc(loc{j,k}, reverseLoc{j,k}, 2*k, '<', colors(j,:)) if length(temp)>0 h(j) = temp; end hold on end end legend(h, words, 'Location', 'South', 'Orientation', 'horizontal') ylim([0 2*length(indices)+1]) % where are these genes on the pair-wise scatterplot of coefficients of pc1 % and pc2? subplot(1,1,1) scatter(score(:,1), score(:,2)) hold on scatter(score(find(pcsetinds(:,2,1,2,2)),1), score(find(pcsetinds(:,2,1,2,2)),2),'r'); scatter(score(find(pcsetinds(:,2,3,2,2)),1), score(find(pcsetinds(:,2,3,2,2)),2),'g'); % i don't believe the result! % what is going wrong? tempa = score(:,1)<=-2; tempb= score(:,2)>=1; temp = tempa & tempb; temp2 = find(temp>0); length(temp2) c=0; c=c+1; hold off plot(timeaxis, filteredY(temp2(c), 1:5), '-'); hold on plot(timeaxis, filteredY(temp2(c), 6:10), '--'); % Separate principal components for the wild-type and mutant. [pcwt, scorewt, latentwt, tsquarewt] = princomp(fY_genecentered(:,1:5)); perc_expwt = 100*latentwt/sum(latentwt); for i=1:5 subplot(5,2,i); hold off plot(timeaxis, pcwt(:,i),'-o'); hold on label = sprintf('pc %d, explains %2.1f%% of var.', i, perc_exp(i)); ylim([-1 1]) xlim([0 7]) xlabel(label); set(gca,'XTick',[0 1 3 5 7]); end [pcmt, scoremt, latentmt, tsquaremt] = princomp(fY_genecentered(:,1:5)); perc_expmt = 100*latentmt/sum(latentmt); for i=1:5 subplot(5,2,i); hold off plot(timeaxis, pcmt(:,i),'-o'); hold on label = sprintf('pc %d, explains %2.1f%% of var.', i, perc_exp(i)); ylim([-1 1]) xlim([0 7]) xlabel(label); set(gca,'XTick',[0 1 3 5 7]); end subplot(1,1,1); scatter(scorewt(:,1), scoremt(:,2))