#Examples starting with peak data, no batches #example with batches appears near bottom of this file ### READ IN data;##################### # The spectra are read in and put into a square matrix # the first spectra provides the common set of mz's. For the rest, # interpolate to this common set library("ppc",lib.loc="mylib") temp<-ppc.read.peaks.nobatch("data.peaks.nobatch.short/control") peaklist.tr<-temp$peaklist f1<-temp$filenames logmz<-temp$logmz temp2<-ppc.read.peaks.nobatch("data.peaks.nobatch.short/diseased") f2<-temp2$filenames peaklist.tr<-c(peaklist.tr,temp2$peaklist) logmz<-sort(c(logmz,temp2$logmz)) # class labels are derived from filenames ytr<- c(rep("control",length(f1)), rep("diseased",length(f2))) # grab the sample labels from the filenames sample.labels<- c(ppc.remove.beforeslash.and.suffix(f1), ppc.remove.beforeslash.and.suffix(f2)) #glossary of user parameters # # stn=1 min signal to noise ratio for a peak # minht=0 min height for a peak # span= 0.005 window size (# proportion of m/z sites) for peak detection running window # # smoothing.span=.05 fractional window size for supersmoother estimate of background # peak.gap=.005 peak width on log scale # mz.min=0- min m/z value to consider # mz.max=infinity- max m/z value to consider # nsplits=10 number of splits considered in optimal split point seach # fix.at.one=FALSE should only splits of the form "no peak vs peak" be considered? # nperms=20 number of permutations in estimation of FDR # recluster=FALSE ignore user.parms <- list(stn=1, minht=0, span=0.005, smoothing.span=.05, nsplits=10, fix.at.one=F, peak.gap=.005, recluster=F, mz.min=3000, mz.max=100000, nperms=20) data <- list(mz=exp(logmz), logmz=logmz, peaklist=peaklist.tr, ytr=as.factor(ytr), sample.labels=sample.labels) data<-ppc.subset.and.reshape.peakdata(data, user.parms) centroid.fit<- ppc.make.centroid.list(data, user.parms) peak.fit <- ppc.predict.peaks(centroid.fit, data) split.fit <- ppc.find.splits(centroid.fit, peak.fit, data, user.parms) ppc.fit <- ppc.predict(centroid.fit, split.fit, data$logmz, data$peaklist) foo <- ppc.error(data$ytr, ppc.fit$yhat) foo2 <- ppc.cv(ppc.fit, data, user.parms) info <- ppc.peak.summary(centroid.fit, peak.fit, data, user.parms, split.fit) ppc.plot.hist(peak.fit, ppc.fit, centroid.fit,split.fit, data) foo3<-ppc.fdr(data, centroid.fit, peak.fit ,split.fit ,ppc.fit, user.parms) ppc.plotfdr(foo3) ## test set prediction, starting from an external directory (folder) # here i'm just using the "control" directory as an example temp<-ppc.read.peaks.nobatch("data.peaks.nobatch.short/control") data.test<- list(peaklist=temp$peaklist,logmz=temp$logmz,mz=exp(temp$logmz)) data.test<-ppc.subset.and.reshape.peakdata(data.test, user.parms) threshold<-1 ppc.fit <- ppc.predict1(centroid.fit, split.fit, data.test$logmz, data.test$peaklist, threshold) ######### example starting with peak data, AND batches############### batches<-system("ls data.peaks.batch/control",intern=T) temp<- ppc.read.peaks.batch("data.peaks.hatch/control", batches) peaklist.tr<- temp$peaklist f1<-temp$filenames logmz<-temp$logmz temp2<- ppc.read.peaks.batch("data.peaks.batch", batches) peaklist.tr<- c(peaklist.tr, temp2$peaklist) f2<-temp2$filenames logmz<-sort(unique(c(logmz,temp2$logmz))) ytr<- c(rep("control",length(f1)), rep("diseased",length(f2))) batch.labels<-c(sort(rep(batches, length(f1)/length(batches))), sort(rep(batches, length(f2)/length(batches)))) patient.labels<- c(ppc.remove.beforeslash.and.suffix(f1), ppc.remove.beforeslash.and.suffix(f2)) sample.labels<- paste(batch.labels,".",patient.labels,sep="") data<-list(peaklist=peaklist.tr,ytr=ytr,sample.labels=sample.labels, batch.labels=batch.labels, patient.labels=patient.labels, logmz=logmz, mz=exp(logmz), batches=batches) user.parms <- list(stn=1, minht=0, span=0.005, smoothing.span=.05, nsplits=10, fix.at.one=F, peak.gap=.005, recluster=F, mz.min=3000, mz.max=50000, nperms=20) data<-ppc.subset.and.reshape.peakdata(data, user.parms) # NOTE: don't need to do these 2 steps! we already have the peaks ##peaklist.tr <- ppc.make.peaklist(data, user.parms) ##data$peaklist<-peaklist.tr centroid.fit<- ppc.make.centroid.list(data, user.parms) peak.fit <- ppc.predict.peaks(centroid.fit, data) split.fit <- ppc.find.splits(centroid.fit, peak.fit, data, user.parms) ppc.fit <- ppc.predict(centroid.fit, split.fit, data$logmz, data$peaklist) foo <- ppc.error(data$ytr, ppc.fit$yhat) threshold<- .25 ppc.fit1<- ppc.predict1(centroid.fit, split.fit, data$logmz, data$peaklist, threshold) foo2 <- ppc.cv(ppc.fit, peaklist.tr, user.parms) info <- ppc.peak.summary(centroid.fit, peak.fit, data, user.parms, split.fit) ppc.plot.hist(peak.fit, ppc.fit, split.fit, data, nsites=25) ## test set prediction, starting from an external directory (folder) temp<- ppc.read.peaks.batch("data.peaks.batch/control", batches) logmz<-temp$logmz ftest<-temp$filenames batch.labels.test<-c(sort(rep(batches, length(ftest)/length(batches)))) patient.labels.test <- ppc.remove.beforeslash.and.suffix(ftest) sample.labels.test<- paste(batch.labels.test,".",patient.labels.test,sep="") data.test<- list(peaklist=temp$peaklist,logmz=temp$logmz,mz=exp(temp$logmz), sample.labels=sample.labels.test, batch.labels=batch.labels.test, patient.labels=patient.labels.test) data.test<-ppc.subset.and.reshape.peakdata(data.test, user.parms) threshold<-0 ppc.fit <- ppc.predict1(centroid.fit, split.fit, data.test$logmz, data.test$peaklist, threshold)