function [B,SE,PVAL,inpe,xin,xinInd,pepool,xpool] = lapem5(y,counts,loc,rloc,motifs,intervals,varargin) % THIS PERFORMS STEP B. % % CHANGE: allows distance from TSS as an automatic interaction. % TO CALL THIS, the first element if counts, loc, rloc, and motifs must be % TSS. % % LAPEM: Locally Adaptive Promoter Element Modeling % --- Arguments: % y GxT, where G is number of genes, T is dimension of response % loc loc{i,j} locations of motif{j} in sense strand of promoter of y(i) % rloc rloc{i,j} locations of motif{j} in antisense strand of promoter of y(i) % counts counts of motif(j) in the promoter of y(i), may not equal to % length(loc{i,j})+length(rloc{i,j}) because we are not counting % overlaps. % motifs motifs{j} is a string describing motif j % w w(i) is the weight for y(i) % steps a vector of cutoff values for distance btw promoter elements % dumpfile a file to dump intermediate variables into. % --- Returns: [G T] = size(y); M = length(motifs); nintervals = length(intervals); nargin okargs = {'xinInd' 'xin' 'pepool' 'xpool' 'df' 'dumpfile' 'pthreshold' 'maxin' 'maxdepth' 'mvweights'}; defaults = {[1] ones(G,1) {} [] [] '' 0.0001 20 Inf ones(1,T) }; [eid,emsg,xinInd, xin,pepool,xpool,df,dumpfile,pthreshold,maxin,maxdepth,mvweights] = mygetargs(okargs,defaults,varargin{:}); eid emsg if length(pepool) == 0 || length(xpool)==0 || length(df)==0 pepool = cell(1,M); % pepool is the pool of promoter elements to choose from. xpool = zeros(G,M); % xpool is the pool of x (predictor) values corresponding to pepool. df = zeros(M,1); % df(i) is the marginal df pepool(i) adds to model. % construct the initial pepool, xpool. for i=1:M pepool{i} = promoterElement(i,motifs{i},{},'',0); df(i) = 1; for j=1:G % initially the xpool contains only count of each motif. xpool(j,i) = counts(j,i); end end end %occupancy = motifOccupancy(motifs,loc,rloc) % ----- NEED TO DEAL WITH CASE OF XIN PRESPECIFIED. B=inv(xin'*xin)*xin'*y; yhat = xin*B; res = y-yhat; temprss=0; for j=1:T temprss =temprss+mvweights(j)*res(:,j)'*res(:,j); end rss = temprss; cumdf = sum(df(xinInd(2:length(xinInd)))); numin = length(xinInd); [numin maxin] countthresh = 5; % Neglect all promoter elements that appear in less than 5 genes. ixRSS2=1; newpe = pepool{ixRSS2}; newd = d(newpe); n = length(pepool); while true if depth(newpe)M || sum(xinInd(1:numin-1)==i)==0) % if both i, ixRSS2 are simple motifs, and i is in model, then (i,ixRSS2) is in pool already. for j=1:nintervals if intervals(j)>=newd petemp = promoterElement(pepool{i},char(pepool{i}),pepool{ixRSS2},char(pepool{ixRSS2}),intervals(j)); for g=1:G % TODO: add field `numWords' to promoterElement, % use evaluatePromoter2.m for promoters with >2 % elements, otherwise use evaluatePromoter.m. %tempx(g)=evaluatePromoter2(petemp,occupancy(g,:)); tempx(g)=evaluatePromoter(petemp,loc(g,:),rloc(g,:)); end if sum(tempx)>=countthresh counter = counter+1; pepool{counter} = petemp; xpool(:,counter) = tempx; df(counter) = 1; end end end end if i==2 unittime = etime(clock,t0); fprintf('-- Updating pool with %s, estimated time till completion: %.1f minutes for %d words.\n',char(newpe), (M-i)*unittime/60, M-i); % fprintf('.'); else % fprintf('.'); end end toc % truncate to their actual size. pepool = pepool(1:counter); xpool = xpool(:,1:counter); df = df(1:counter); end if length(dumpfile)>0 save(dumpfile, 'pepool', 'xpool', 'xinInd','y','motifs'); end if (numin>=maxin) fprintf('exit: numin=%d, maxin=%d\n',numin,maxin); break; end tic fprintf('Calculating regression coefficients for xpool...'); % xpool2 = xpool - xin*inv(xin'*xin)*xin'*xpool; xpool2 = xpool - xin*((xin'*xin)\(xin'))*xpool; y2 = y-yhat; n = length(pepool); B2 = zeros(n,T); rss2 = ones(n,1)*rss; ninmodel = length(xinInd)-1; for i=setdiff([1:n],xinInd) % regress y on the residual xpool2 iscollinear = false; for j=1:ninmodel r = corr([xin(:,j+1) xpool(:,i)]); if r(2,1)==1 iscollinear = true; end end if iscollinear B2(i,:) = 0; else B2(i,:) = (xpool2(:,i)'*y2)/(xpool2(:,i)'*xpool2(:,i)); end temp=y2 - xpool2(:,i)*B2(i,:); temprss = 0; for j=1:T temprss =temprss+mvweights(j)*temp(:,j)'*temp(:,j); end rss2(i) = temprss; end toc [minRSS2, ixRSS2]=min(rss2); % NOTE: with this weighted sum of RSS across dimensions, the F and % pvalue needs to be redefined, right now they don't make sense. F = ((rss-rss2)./(T*df))./(rss2./(T*(G-(cumdf+df)-1))); pval = 1-fcdf(F, T*df, T*(G-(cumdf+df)-1)); [minpval, ix] = min(pval); % stop stepwise regression if pvalue is too large. if (minpval > pthreshold) fprintf('exit: minpval=%f, pthreshold=%d\n',minpval,pthreshold); break; end if sum(xinInd==ixRSS2)>0 fprintf('exit: %s will be added to model when it is already in model.\n', char(pepool{ixRSS2}) ); break; end % choose the promoter element with maximum F threshold, %NOTE THIS ONLY % WORKS IF df IS THE SAME FOR ALL. % add it to the model... xinInd = [xinInd ixRSS2]; xin = [xin xpool(:,ixRSS2)]; % B = inv(xin'*xin)*xin'*y; B = (xin'*xin)\(xin')*y; yhat = xin*B; res = y-yhat; temprss=0; for j=1:T temprss =temprss+mvweights(j)*res(:,j)'*res(:,j); end rss = temprss; cumdf = cumdf +df(ixRSS2); numin = length(xinInd); fprintf('Iteration %d: %s added to model, pvalue = %1.2e, RSS=%1.2e.\n', numin-1, char(pepool{ixRSS2}),pval(ixRSS2),rss2(ixRSS2)); newpe = pepool{ixRSS2}; newd = d(newpe); end inpe = pepool(xinInd(2:length(xinInd))); % calculate SE and PVAL for B invxtx = inv(xin'*xin); sigmahat = sqrt(rss/(G-cumdf-1)); SE = sqrt(diag(invxtx))*sigmahat; Z = abs(B./repmat(SE,1,T)); PVAL = 1-tcdf(Z, G-cumdf-1);