# script for ppc example in R # raw data case ### READ IN data;##################### # xtr is raw spectra. # The spectra are read in and put into a square matrix # the first spectra provides the common set of mz's. For the rest, # interpolate to this common set library("ppc",lib.loc="mylib") temp<- ppc.read.raw.nobatch("data.raw.nobatch/control") xtr<-temp$xtr mz<-as.numeric(temp$mz) f1<-temp$filenames temp2<-ppc.read.raw.nobatch("data.raw.nobatch/diseased",mz=mz) xtr<-cbind(xtr,temp2$xtr) f2<-temp2$filenames # class labels are derived from filenames ytr<- c(rep("control",length(f1)), rep("diseased",length(f2))) # grab the sample labels from the filenames sample.labels<- c(ppc.remove.beforeslash.and.suffix(f1), ppc.remove.beforeslash.and.suffix(f2)) #END OF DATA READ ### # create lists. data <- list(mz=mz, logmz=log(mz) ,xtr=xtr,ytr=as.factor(ytr), sample.labels=sample.labels) #glossary of user parameters # # stn=1 min signal to noise ratio for a peak # minht=0 min height for a peak # span= 0.005 window size (# proportion of m/z sites) for peak detection running window # # # smoothing.span=.05 fractional window size for supersmoother estimate of background # peak.gap=.005 peak width on log scale # mz.min=0- min m/z value to consider # mz.max=infinity- max m/z value to consider # nsplits=10 number of splits considered in optimal split point seach # fix.at.one=FALSE should only splits of the form "no peak vs peak" be considered? # nperms=20 number of permutations in estimation of FDR # recluster=FALSE ignore user.parms <- list(stn=1, minht=0, span=.005, smoothing.span=.05, nsplits=10, fix.at.one=F, peak.gap=.005, recluster=F, mz.min=15, mz.max=1500, nperms=20) # subset data if requested, and reshape when there are batches data<- ppc.subset.and.reshape(data, user.parms) #extract peaks and put them in a list peaklist.tr <- ppc.make.peaklist(data, user.parms) # add peaklist to data object data$peaklist<- peaklist.tr # NOTE: if data is given as peaks, rather than raw spectra, we simply read in #peaklist.tr, and proceed from here on down #cluster peaks centroid.fit<- ppc.make.centroid.list(data, user.parms) #look for these peaks in the individual spectra peak.fit <- ppc.predict.peaks(centroid.fit, data) #find best probability split points for each peak split.fit <- ppc.find.splits(centroid.fit, peak.fit, data, user.parms) #do nearest shrunken centroid predictions for a test set ppc.fit <- ppc.predict(centroid.fit, split.fit, data$logmz, data$peaklist) # make predictions for a single threshold threshold<- .25 ppc.fit1<- ppc.predict1(centroid.fit, split.fit, data$logmz, data$peaklist, threshold) # compute training error and confusion matrices foo <- ppc.error(data$ytr, ppc.fit$yhat) # cross-validation foo2 <- ppc.cv(ppc.fit, data, user.parms) # note: in a number of function calls (eg in the call to ppc.cv), I pass the # entire data object to keep the code tidy. # This object includes the big matrix xtr, which is often not used # (eg in ppc.cv). If this slows things down, we could just pass the data that # is used. # summarize peaks in a worksheet # be sure to first recreate data list above, making sure it includes # sample labels # please put the result into a Excel spreadsheet, and makes the NAs into #blanks info <- ppc.peak.summary(centroid.fit, peak.fit, data, user.parms, split.fit) # FDR analysis foo3<-ppc.fdr(data, centroid.fit, peak.fit, split.fit, ppc.fit, user.parms) ppc.plotfdr(foo3) #make histogram plot: ppc.plot.hist(peak.fit, ppc.fit, centroid.fit, split.fit, data) ##test set prediction, from an external file temp<- ppc.read.raw.nobatch("data.raw.nobatch/control") xtr.test<-temp$xtr mz<-temp$mz f1<-temp$filenames sample.labels.test <- ppc.remove.suffix(f1) data.test <- list(mz=mz, logmz=log(mz) ,xtr=xtr.test, sample.labels=sample.labels.test) data.test<- ppc.subset.and.reshape(data.test, user.parms) peaklist.test<- ppc.make.peaklist(data.test, user.parms) threshold<- 0 ppc.fit1<- ppc.predict1(centroid.fit, split.fit, data$logmz, peaklist.test,threshold) ##### example with batches################################### # The spectra are read in and put into a square matrix # the first spectra provides the common set of mz's. For the rest, # interpolate to this common set #READ IN DATA###################### # read in batch names first batches<-system("ls data.raw.batch/control",intern=T) temp<-ppc.read.raw.batch("data.raw.batch/control",batches) xtr<-temp$xtr mz<-temp$mz f1<-temp$filenames temp2<-ppc.read.raw.batch("data.raw.batch/diseased",batches, mz=mz) f2<-temp2$filenames xtr<- cbind(xtr,temp2$xtr) ytr<- c(rep("control",length(f1)), rep("diseased",length(f2))) # form batch labels , one per sample. We assume that every patient appears in #every batch # sample labels are a concatenation of the batch labels and file names batch.labels<-c(sort(rep(batches, length(f1)/length(batches))), sort(rep(batches, length(f2)/length(batches)))) patient.labels<- c(ppc.remove.beforeslash.and.suffix(f1), ppc.remove.beforeslash.and.suffix(f2)) sample.labels<- paste(batch.labels,".",patient.labels,sep="") data <- list(mz=mz, xtr=xtr, ytr=as.factor(ytr), sample.labels=sample.labels, patient.labels=patient.labels, batch.labels=batch.labels, batches=batches) # release space rm(xtr) ##################################### #set user parameters user.parms <- list(stn=1, minht=0, span=101, smoothing.span=.05, nsplits=10, fix.at.one=F, peak.gap=.005, recluster=F, mz.min=3000, nperms=20) # subset data if requested, and reshape when there are batches data<- ppc.subset.and.reshape(data, user.parms) #extract peaks and put them in a list peaklist.tr <- ppc.make.peaklist(data, user.parms) #add peaklist to data object data$peaklist<-peaklist.tr #cluster peaks centroid.fit<- ppc.make.centroid.list(data, user.parms) #look for these peaks in the individual spectra peak.fit <- ppc.predict.peaks(centroid.fit, data) #find best probability split points for each peak split.fit <- ppc.find.splits(centroid.fit, peak.fit, data, user.parms) #do nearest shrunken centroid predictions for a test set # (see below for an example starting with an external file) ppc.fit <- ppc.predict(centroid.fit, split.fit, data$logmz, data$peaklist) # compute errors foo <- ppc.error(data$ytr, ppc.fit$yhat) #do nearest shrunken centroid predictions for a test set, # for a single threshold threshold<- .25 ppc.fit1<- ppc.predict1(centroid.fit, split.fit, data$logmz, data$peaklist.tr, threshold) # cross-validation foo2 <- ppc.cv(ppc.fit, peaklist.tr, user.parms) # summarize peaks in a worksheet info <- ppc.peak.summary(centroid.fit, peak.fit, data, user.parms, split.fit) #make histogram plot: ppc.plot.hist(peak.fit, ppc.fit, split.fit, data, nsites=25) ####test set prediction from an external folder (directory) batches<-system("ls data.raw.batch/control",intern=T) temp<-ppc.read.raw.batch("data.raw.batch/control",batches) xtr.test<-temp$xtr mz<-temp$mz ftest<-temp$filenames batch.labels.test<-sort(rep(batches, length(ftest)/length(batches))) patient.labels.test<- ppc.remove.beforeslash.and.suffix(ftest) sample.labels.test<- paste(batch.labels,".",patient.labels,sep="") data.test <- list(mz=mz, xtr=xtr.test, sample.labels=sample.labels.test, patient.labels.test=patient.labels.test, batch.labels=batch.labels.test) data.test<- ppc.subset.and.reshape(data.test, user.parms) peaklist.test<- ppc.make.peaklist(data.test, user.parms) threshold<- 0 ppc.fit1<- ppc.predict1(centroid.fit, split.fit, data$logmz, peaklist.test,threshold)