load linkedData.mat; % -- settings --- load len6nonfuzzymotifs.mat; fuzzypos=[]; motiflen=6; motiflist = len6nonfuzzymotifs; promoterlist = promoterWUTR; fn=sprintf('linkedData_len%dnonfuzzy_motifcounts.txt', motiflen); fid = fopen(fn,'w'); ngenes = length(geneID); nmotifs = length(motiflist); okayoverlap=motiflen-1; siz = 4*ones(1,motiflen); grandtable = uint8(zeros(nmotifs,ngenes)); for i=1:ngenes %for i=1:105 fprintf('i= %d out of %d...',i, ngenes); tic if rem(i,100)==0 save(sprintf('fuzzyMotifScript%d.mat',motiflen), '*'); end seq=promoterlist{i}; % **** seqc=seqcomplement(seq); promoterlen = length(seq); seqn = zeros(1,promoterlen); seqcn = zeros(1,promoterlen); % motiftable contains counts of each 4^motiflen word in both the template % and coding strand. A word in the coding strand is counted if it % it is not the same with any word in the last 'okayoverlay' positions % in either the coding or reverse template strand, or if that word has % not been counted in 'okayoverlay' times. Same rule applies to the % word in the reverse template strand, with the additional check that % it is not palindromic. motiftable = zeros(siz); prevmotif = cell(1,okayoverlap); for j=1:okayoverlap prevmotif{j} = ''; end prevmotifc = cell(1,okayoverlap); for j=1:okayoverlap prevmotifc{j} = ''; end for k=1:promoterlen seqn(k) = mynt2int(seq(k)); seqcn(k) = mynt2int(seqc(k)); end havenotcounted = zeros(siz); % when was the last time this motif was counted? % a motif is not counted if it overlaps with the % previous motiflen-1 motifs, or if this motif % has not been counted in the last motiflen-1 % positions. for k=1:(promoterlen-(motiflen-1)) currmotif = seq(k:(k+motiflen-1)); currmotifc = seqc((k+motiflen-1):(-1):k); %reverse complement word s= seqn(k:(k+motiflen-1)); sc= seqcn((k+motiflen-1):(-1):k); if (max(s)<=4 && min(s)>0) linearind = mysub2ind(siz,s); linearindc = mysub2ind(siz,sc); else % there are fuzzy characters in s (discovered for 1423rd gene % in top 5000 paired T2 list). fprintf('fuzzy %d.',k); linearind=-1; linearindc=-1; end % with every advance of k the matrix havenotcounted gets % incremented by 1. Later, if the current word is counted, then % havenotcounted(linearind) is set to 0. If the current reverse % complement word is counted, then havenotcounted(linearindc) is % set to 0. havenotcounted= havenotcounted+1; % did currmotif (coding strand word) appear in the last okayoverlap positions? overlapself= false; for j=1:okayoverlap if strcmp(prevmotif{j},currmotif)==1 overlapself = true; end end % did currmotif occur in the last okayoverlap positions in % complementary strand? overlapselfc=false; for j=1:okayoverlap if strcmp(prevmotifc{j},currmotif)==1 overlapselfc = true; end end % if it _does_not_ overlap with self, OR we haven't counted the % word in okayoverlap positions, count the word. if linearind>0 if ~(overlapself || overlapselfc) || (havenotcounted(linearind) >=okayoverlap) motiftable(linearind) = motiftable(linearind)+1; havenotcounted(linearind)=0; end end % if currmotif (coding strand word) overlapped with itself, then % the template strand also overlapped its a same word in its own % strand. Therefore, no need to compare with itself. Also, if % coding strand word overlapped with a previous word in reverse template % strand, then the template strand overlapped with a previous word % in coding strand. Thus, only need to check whether the word is % palindromic. if ~overlapself % if here, then currmotifc is sure not to have overlapped with % any previous words, in its own strand or in the reverse % complement. if strcmp(currmotif, currmotifc)==1 %is this word palindromic? overlapself = true; end end if linearindc>0 if ~overlapself || (havenotcounted(linearindc) >=okayoverlap) motiftable(linearindc) = motiftable(linearindc)+1; havenotcounted(linearindc)=0; end end % update prevmotif, prevmotifc. for j=1:(okayoverlap-1) prevmotif{j} = prevmotif{j+1}; prevmotifc{j} = prevmotifc{j+1}; end prevmotif{okayoverlap} = currmotif; prevmotifc{okayoverlap} = currmotifc; end count = zeros(1,nmotifs); % this is not yet fully optimized. it's better to % loop over all entries in motiftable (4^p) rather % than all entries in motiflist. s=zeros(1,motiflen); for j=1:nmotifs % ---- Tabulate all instances of this motif --- word = motiflist{j}; if isfuzzy(word,fuzzypos) motifset = reduceFuzzyMotif(word,true); for k=1:length(motifset) linearind = mysub2ind(siz,motifset{k}); count(j) = count(j)+ motiftable(linearind); end else for m=1:motiflen s(m) = mynt2int(word(m)); end linearind = mysub2ind(siz,s); count(j) = motiftable(linearind); % simply the count in the single motiftable entry. end end grandtable(:,i) = uint8(count); toc end % print results to file fprintf(fid, 'Locus ID:\t'); for i=1:ngenes fprintf(fid, '%s\t',geneID{i}); end fprintf(fid,'\n'); for j=1:nmotifs fprintf(fid,'%s\t',motiflist{j}); fprintf(fid,'%d\t',grandtable(j,:)); fprintf(fid,'\n'); end fclose(fid); % for debugging %for j=1:nmotifs % fprintf('%s\t%d\n',motiflist{j},count(j)); %end % for debugging %j=1; %words = {'AATT'}; %symbols ={'>' }; %colors = {[1 0 0]}; %count, loc, reverseLoc] = motifCount(words{1}, promoterlist{j}, seqcomplement(promoterlist{j})); %ht = plotMotifLoc(loc, reverseLoc, 1, symbols{1}, colors{1}); % darn! can't read in the file one motif at a time. will have to turn it % into motif by gene storage. %fid = fopen('Len6FuzzyMotifCounts.txt'); %fgetl(fid); %grandtable = uint8(zeros(nmotifs,ngenes)); %for i=1:ngenes % c=textscan(fid,'%s\t',1); % a=textscan(fid,'%d\t',nmotifs); % a = uint8(a{1}); % grandtable(:,i) = a; %end %fclose(fid); % %fid = fopen('Len6FuzzyMotifCounts.txt','w'); %fprintf(fid, 'Locus ID:\t'); %for i=1:ngenes % fprintf(fid, '%s\t',filtered1500_geneID{i}); %end %fprintf(fid,'\n'); %for j=1:nmotifs % fprintf(fid,'%s\t',motiflist{j}); % fprintf(fid,'%d\t',grandtable(j,:)); % fprintf(fid,'\n'); %end %fclose(fid);