function [count, loc, comploc] = motifCount(motif, seq, countOverlap) % Inputs: % motif: a string (e.g. 'TTGAC') for the motif. % seq: a string for the promoter sequence, 5'-3' coding. % countOverlap: true if overlapping motifs are counted twice. % false if we want to discount overlapping. % Returns: % count: number of times 'motif' appears in 'seq'. % loc: locations of the motif in seq. should have % seq(loc(i):loc(i+len-1))==motif. % comploc: locations of m'=seqreverse(seqcomplement(motif)) in the % complementary sequence. comploc(i) is the _start_ position of m' in % the 3'-5' direction in complementary sequence. % % To try it out: % seq='AAAATTTTCCCCGGGGAAAATTTTCCCCGGGG'; % [c,loc,rloc]=motifCount('TTCC',seq,false); % Or for degenerate motif: % [c,loc,rloc]=motifCount('YCCS',seq,false) % % Also, as an independent check, can use the function: % seqshowwords(seq,'YCCS'); % seqshowwords(seqcomplement(seq),'SCCY'); seqlen = length(seq); if strcmp(motif, 'TSS START')==1 count=1; loc=[seqlen]; comploc=[seqlen]; return end motifset = reduceFuzzyMotif(motif,false); nm=length(motifset); seqrc=seqreverse(seqcomplement(seq)); loc=[]; comploc=[]; for i=1:nm loc= [loc strfind(seq, motifset{i})]; comploc= [comploc strfind(seqrc, motifset{i})]; end loc=sort(loc); comploc = seqlen+1-comploc; comploc = sort(comploc); codingOccupancy = zeros(1,seqlen); % =1 if those indices of the coding strand belongs to a _counted_ motif., 0 if otherwise. count=0; len=length(motif); if countOverlap count= length(loc)+length(comploc); else if length(loc)<=1 count =count+length(loc); if length(loc)==1 codingOccupancy(loc(1):(loc(1)+len-1))=1; end else count=count+1; %count the first one. codingOccupancy(loc(1):(loc(1)+len-1))=1; for i=1:length(loc)-1 if loc(i+1)-loc(i)>=len count = count+1; %for each next one, count if no overlap. codingOccupancy(loc(i+1):(loc(i+1)+len-1))=1; end end end if length(comploc)<=1 if length(comploc)==1 % check that it does not overlap with complementary strand. if sum(codingOccupancy(comploc(1):(-1):(comploc(1)-len+1)))==0 count =count+1; end end else if sum(codingOccupancy(comploc(1):(-1):(comploc(1)-len+1)))==0 count =count+1; end for i=1:(length(comploc)-1) if comploc(i+1)-comploc(i)>=len if sum(codingOccupancy(comploc(i+1):(-1):(comploc(i+1)-len+1)))==0 count =count+1; end end end end end comploc = comploc-len+1;