@path.r # # R procedures for PathSeeker # # Friedman & Popescu (2004) # # (4/02/05) # # Coded and copyright (2005) by Jerome H. Friedman # # psversion=function() { print(" R/PathSeeker 1.0 (4/02/05).") print(" Copyright (2005) by Jerome H. Friedman.") invisible() } path <- function(x,y,mode='regres',obsweights='equal',varweights='equal', xmiss=9.9e35,huber=0.8,start=0,end=1,numval=5,nfold=3,constraints='none', spectra='none',fast='no', maxstep=100000,kfreq=100,delnu=0.01,convfac=1.1, modsel='a_roc',impl='auto',quiet=F,data.check=T) { oldpath <- getwd(); setwd(pshome) if (!is.character(mode)) stop("mode must be of type character.") if (mode != "regres" && mode != "class") stop(paste('mode must equal either "regres" for regression,', 'or "class" for classification.')) if (!is.character(modsel)) stop("modsel must be of type character.") if (modsel != "ramp" && modsel != "a_roc") stop(paste('modsel must equal either "ramp",','or "a_roc".')) if (!is.character(impl)) stop("impl must be of type character.") if (impl != "update" && impl != "brute" && impl != "auto") stop(' invalid value for impl.') if(mode(x)!="numeric") stop(" x must be numeric.") if(!is.matrix(x)) stop(" x must be a matrix.") if(mode(y)!="numeric") stop(" y must be numeric.") if(length(y)!= nrow(x)) stop (" x and y are of unequal length.") if (!is.logical(data.check)) { data.check=T warning(" data.check must be logical (T/F) - T substituted.") } if (data.check) { if(any(is.na(x))) { x[is.na(x)] <- xmiss; warning(" x contains NA's - xmiss substituted.") } if(any(is.na(y))) stop(" y contains NA's.") if(mode=='class') { if(any(y!=-1 && y!=1)) stop( 'y must be +1 or -1 for classification') } if(!missing(xmiss)) xmiss <- parchk("xmiss",xmiss,2*max(x[x!=xmiss]),9.9e35,9.9e35) } zz=file(paste(pshome,'/x-data',sep=''),'wb') writeBin(as.real(as.vector(x)),zz,size=4); close(zz) zz=file(paste(pshome,'/y-data',sep=''),'wb') writeBin(as.real(y),zz,size=4); close(zz) if(obsweights=='equal') { obstmp='equal'} else { if(mode(obsweights)!="numeric") stop(" obsweights must be numeric in this context.") if(length(obsweights)!= nrow(x)) stop (" x and obsweights are of unequal length.") if (data.check) { if(any(is.na(obsweights))) { obsweights[is.na(obsweights)] <- 0; warning(" obsweights contains NA's - zeros substituted.") } if(any(obsweights<0)) { obsweights[obsweights<0] <- 0; warning(" obsweights contains negative numbers - zeros substituted.") } } obstmp='obsweights' zz=file(paste(pshome,'/obsweights',sep=''),'wb') writeBin(as.real(obsweights),zz,size=4); close(zz) } if(varweights=='equal' || varweights=='std') { vartmp=varweights} else { if(mode(varweights)!="numeric") stop(" varweights must be numeric in this context.") if(length(varweights)!= ncol(x)) stop (" x and varweights are of unequal length.") if (data.check) { if(any(is.na(varweights))) { varweights[is.na(varweights)] <- 0; warning(" varweights contains NA's - zeros substituted.") } if(any(varweights<0)) { varweights[varweights<0] <- 0; warning(" varweights contains negative numbers - zeros substituted.") } } vartmp='varweights' zz=file(paste(pshome,'/varweights',sep=''),'wb') writeBin(as.real(varweights),zz,size=4); close(zz) } if(constraints=='none' || constraints=='all') { constrtmp=constraints} else { if(mode(constraints)!="numeric") stop(" constraints must be numeric in this context.") if(any(constraints<1) || any(constraints>ncol(x))) stop(" some constraint specifications invalid.") constrtmp='constraints' zz=file(paste(pshome,'/constraints',sep=''),'wb') writeBin(as.integer(c(length(constraints),constraints)),zz,size=4); close(zz) } huber <- parchk("huber",huber,0.5,5.0,0.8) start <- parchk("start",start,0.0,end,0.0) end <- parchk("end",end,start,2.0,1.0) numval <- parchk("numval",numval,1,100,5) maxstep <- parchk("maxstep",maxstep,kfreq+1,100000000,100000) kfreq <- parchk("kfreq",kfreq,1,1000000000,100) delnu <- parchk("delnu",delnu,1.e-10,1,0.01) convfac <- parchk("convfac",convfac,1.01,10.0,1.1) parm.tr <- c( "@mode=" %&% mode, "@model_file=model.pth","@coeffs_file=coeffs.pth", "@nvar=" %&% ncol(x), "@nobs=" %&% nrow(x), "@format=binary", "@pred_data=x-data", "@response_data=y-data", "@org=by_var", "@missing=" %&% xmiss, "@obs_weights=" %&% obstmp, "@var_weights=" %&% vartmp, "@quantile=0.0", "@constraints=" %&% constrtmp, "@nfold=" %&% nfold, "@start=" %&% start, "@end=" %&% end, "@numval=" %&% numval, "@alpha=" %&% huber, "@modsel=" %&% modsel, "@delnu=" %&% delnu, "@maxstep=" %&% maxstep, "@kfreq=" %&% kfreq, "@convfac=" %&% convfac, "@fast=" %&% fast, "@impl=" %&% impl ) if(is.character(spectra)) {parm.tr=c(parm.tr,"@numspect=0")} else { if(mode(spectra)!="numeric") stop(" spectra must be numeric in this context.") if(any(spectra<1) || any(spectra>ncol(x))) stop(" some spectra specifications invalid.") if(is.matrix(spectra)) { if(ncol(spectra)!=2) stop(" spectra must have two columns.") numspect=nrow(spectra) } else if(is.vector(spectra)) { if(length(spectra)!=2) stop(" spectra must have two entries.") numspect=1; spectra=matrix(spectra,nrow=1) } else { stop(" spectra must be a matrix or vector in this context.")} parm.tr=c(parm.tr,"@numspect=" %&% numspect) for (i in 1:numspect) { parm.tr=c(parm.tr,paste('@',spectra[i,1],spectra[i,2])) } } write(rep(0,(ncol(x)+1)),file='coeffs.pth') write(parm.tr, file='parm.txt') if (platform=='windows') { system('cmd.exe /c PS_train.exe < parm.txt > psout',minimized=T) } else { system('rm psout') system("./PS_train.exe < parm.txt > psout") } coeff=scan('coeffs.pth',quiet=T) if(!quiet) catfile('psout') setwd(oldpath) coeff } parchk <- function (ax,x,lx,ux,df) { if(x < lx || x > ux) { df warning(paste(" invalid value for",ax,"- default (",df,") used.")) } else { x} } read.file=function(file='psout') { wd=getwd(); setwd(pshome) if (platform=='windows') { system('cmd.exe /c readfile.exe',input=file, show.output.on.console=T,minimized=T); } else { system('/usr/X11R6/bin/xterm -e ./readfile.exe', input=file,show.output.on.console=T,minimized=T) } setwd(wd) } path.help <- function(fun="path.help") { if (!is.character(fun)) stop("argument must be of type character.") if (fun!="path" && fun!="path.pred" && fun!="path.help" && fun!="path.robust" && fun!="psversion") { stop(paste(fun,"is not an PathSeeker procedure.")) } wd=getwd(); setwd(pshome) if (platform=='windows'){system(paste('cmd.exe /k type ',fun,'.txt',sep=''), wait=F)} else { system(paste('/usr/X11R6/bin/xterm -sb -sl 500 -hold -T ',fun, ' -rightbar -e cat ',fun,'.txt&',sep=''))} setwd(wd) invisible() } catfile <- function(fun="psout") { if (!is.character(fun)) stop("argument must be of type character.") wd=getwd(); setwd(pshome) if (platform=='windows'){system(paste('cmd.exe /k type ',fun,sep=''), wait=F)} else { system(paste('/usr/X11R6/bin/xterm -sb -sl 500 -hold -T ',fun, ' -rightbar -e cat ',fun,'&',sep=''))} setwd(wd) invisible() } # # This is an infix operator version of paste(), with no spaces between # elements pasted together. # Posted by David Brahm to r-help mailing list, 9/25/01 "%&%" <- function(a, b) paste(a, b, sep="") path.pred=function(xp,a,x=NULL,xmiss=9.9e35) { if(mode(xp)!="numeric") stop(" xp must be numeric.") if(is.matrix(xp)) { p=ncol(xp)} else if(is.vector(xp)) { p=length(xp)} else { stop(' xp must be a vector or matrix')} if(mode(a)!="numeric") stop(" a must be numeric.") if(!is.vector(a)) stop(' a must be a vector') if(length(a)!=(p+1)) stop(' xp and a inconsistent') if(any(is.na(xp))) { xp[is.na(xp)] <- xmiss} if(!missing(x)) { if(mode(x)!="numeric") stop(" x must be numeric.") if(!is.matrix(x)) stop(" x must be a matrix.") if(ncol(x)!=p) stop(' xp and x inconsistent') if(!is.null(attr(x,'quants'))) { q=attr(x,'quants') for (j in 1:p) { xt=xp[xp[,j]!=xmiss,j]; xt=pmax(q[j,1],pmin(q[j,2],xt)) xp[xp[,j]!=xmiss,j]=xt } } } if(any(xp==xmiss)) { if(!missing(x)) { for (j in 1:p) { if(!is.null(attr(x,'means'))) { xp[xp[,j]==xmiss,j]=attr(x,'means')[j] } else { xp[xp[,j]==xmiss,j]=mean(x[x[,j]!=xmiss,j],na.rm=T)} } } else {stop(' missing values in xp and x not specified')} } xp%*%a[1:p]+a[p+1] } path.robust=function(x,qntl=0.025,xmiss=9.9e35) { if(mode(x)!="numeric") stop(" x must be numeric.") if(is.matrix(x)) { p=ncol(x)} else if(is.vector(x)) { p=length(x)} else { stop(' x must be a vector or matrix')} if(mode(qntl)!="numeric") stop(" qntl must be numeric.") if(qntl<0 || qntl >=0.5) stop(' invalid value for qntl') if(qntl>0) q=matrix(nrow=p,ncol=2); ave=rep(0,p) for (j in 1:p) { xt=x[,j] xt=xt[!is.na(xt) && xt!=xmiss] if(qntl>0) { q[j,]=quantile(xt,c(qntl,1-qntl)); xt=pmax(q[j,1],pmin(q[j,2],xt)) x[!is.na(x[,j]) && x[,j]!=xmiss,j]=xt } ave[j]=mean(xt) } if(qntl>0) attr(x,'quants')=q; attr(x,'means')=ave invisible(x) } @README.txt Instructions for installing PathSeeker in R Version 1.0 (4/02/05) Copyright 2005 by Jerome H. Friedman WINDOWS NT/XP/2000 (1) Enter R: Double click icon, or Start -> Programs -> R -> R 2.0.1 (2) Enter the following commands at the R command prompt ">": > platform="windows" > pshome="PSDIR" > source("PSDIR\\path.r") Here PSDIR is the full path name (omitting drive letter) of the directory containing the downloaded R_PathSeeker installation files, using double back slashes (\\) in place of back slashes (\). LINUX (1) Enter R: Type "R" at the system command prompt. This must be done from a different directory than that containing the PathSeeker installation files. (2) Enter the following commands at the R command prompt ">": > platform="linux" > pshome="PSDIR" > source("PSDIR/path.r") Here PSDIR is the full path name of the directory containing the downloaded PathSeeker installation files. NOTE: PSDIR should never be attached by the user as an R working data directory. The R/PathSeeker interface is now installed. These commands need not be repeated when R is entered again at future times, provided the 'yes' option is selected at the 'save worksave image' prompt on exit. For an overview and guide to documentation, enter the command: > path.help() @path.help.txt Help procedure for PathSeeker 1.0 functions (4/2/05) Description: This function provides a rudimentary help facility for the R/PathSeeker interface. It can be invoked from the R command line at any time to see documentation concerning any of the functions listed below. Usage: path.help (fun="path.help") Argument: fun = "name" of function. R/PathSeeker function names: "path", "path.robust", "path.pred", "path.help" Example: path.help("path") Reference: Friedman, J. H. and Popescu, B. E. (2004). Gradient directed regularization for linear regression and classification. http://www-stat.stanford.edu/~jhf/ftp/path.pdf Remarks: For use as an introduction/overview, the help files should be read in the above listed order. Familiarity with the above reference is assumed. In Windows, the appearance of help windows can be customized (size, font, layout, foreground/background colors, etc, by selecting the button in the upper left corner of the window and choosing "properties". After customization, be sure to select the "Save properties for future windows with same title" button before exiting, so that the changes will pertain to all future PathSeeker help windows. @path.txt PathSeeker Regression/Classification Description: This function implements regularized linear regression and classification procedures described in Friedman and Popescu 2004 http://www.stat.stanford.edu/~jhf/ftp/path.pdf Usage: a <- path( x, y, mode="regres", obsweights="equal", varweights="equal", xmiss=9.9e35, huber=0.8, start=0, end=1, numval=5, nfold=3, constraints="none", spectra="none", fast="no", maxstep=100000, kfreq=100, delnu=0.01, convfac=1.1, modsel="a_roc", impl="auto", quiet=F, data.check=T) { Required arguments: x = input predictor data matrix Rows are observations and columns are variables. Must be numeric and type matrix. y = input response values. Must be numeric and type vector. For classification (mode="class", see below) values must only be +1 or -1 Optional arguments: mode = regression /classification flag mode="regres" => regression. The outcome or response variable is regarded as numerically valued and the resulting linear model is used to predict the value of the response. mode="class" => binary classification. The resulting linear model produces a numeric score whose absolute value reflects confidence that its sign is the same as that of the response. The sign of this score can be used for prediction. obsweights= observation weights obsweights="equal" => all observations have equal weight. Otherwise, this input is interpreted as a numeric vector containing a weight for each observation. varweights = predictor variable influence weights varweights="equal" => all predictor variables are to have equal (a priori) influence varweights="std" => influence proportionial to training data standard deviations Otherwise, this input is interpreted as a numeric vector containing an (a priori) influence weight for each input variable. xmiss = predictor variable missing value flag. Must be numeric and larger than any predictor variable value. huber = fraction of observations subject to squared-error loss for Huber loss criterion (huber >= 1.0 => ordinary squared-error loss). (regression, mode="regres", only; ignored for mode="class") Gradient threshold optimization control: start = beginning gradient threshold value end = ending gradient threshold value numval = number of equally spaced evaluation points including (start,end) nfold = number of cross-validation iterations (folds) nfold > 1 => ordinary multifold cross-validation nfold < -1 => use one-fold with test set size = nobs/abs(nfold) (faster, but less accurate for small training samples) constraints = identities of variables whose coefficients are constrained to be non negative constraints="none" => no such variables constraints="all" => all coefficients are constrained Otherwise, this input is interpreted as a numeric vector containing the identities of the variables spectra = contiguous sets of variables and corresponding smoothing parameters defining separate spectra subject to smoothness constraints on their coefficients spectra="none" => no smoothness constraints on any coefficients Otherwise, this input is interpreted as a vector or a matrix with two columns. A vector (length=2) is considered as a matrix with one row. Each row specifies one spectrum (contiguous variable set) subject to smoothing: variables spectra[j,1]:(spectra[j+1,1]-1) comprise jth spectrum variables spectra[nrow(spectra):nrow(x),1] comprise last spectrum variables 1:(spectra[1,1]-1) are not subject to a smoothness constraint spectra[j,2] = smoothing parameters for each corresponding spectra 2*spectra[j,2]+1 = number of variables in gradient smoother span for jth spectrum (See Friedman & Popescu 2004 Section 6.1) fast = fast approximation flag (for large data sets): fast = "no" => no effect Otherwise use the input (fast) value as the gradient threshold, and one-fold cross-validation with test set size = nobs/abs(nfold) to estimate only number of path steps. Model estimates are from the learning data partition only. Iteration control: maxstep = maximum number of threshold gradient descent iterations (Increase if estimated optimal number of steps is too large.) kfreq = frequency for recomputing gradient, and checking test risk (gradient and/or test risk are re-evaluated every kfreq_th iteration). delnu =maximum factor (step size) scaling gradient for incrementing selected coefficients at each step. (Decrease if estimated optimal number of steps is too small.) convfac = convergence factor for early stopping. Iterations stop when error > convfac*(minimum error so far). modsel = classification model selection flag modsel="ramp" => use ramp risk for model selection modsel="a_roc" => use area under ROC curve (classification, mode="class", only. ignored for mode="regres") impl = algorithm implementation flag impl="update" => use fast updating algorithm (Friedman & Popescu 2004, Sections 5.1-5.3) impl="brute" => use standard (brute force) algorithm (Friedman & Popescu 2004, Section 5.4) impl="auto" => automatic choice based on problem properties quiet = output flag quiet=TRUE/FALSE => do/don't print summary output in separate window data.check = data check flag data.check=TRUE/FALSE => do/don't check input data for consistancy Output: vector of coefficient values. The last entry is the intercept value. Examples: a=path(x,y) # full regression a=path(x,y,mode="class") # full classification a=path(x,y,start=0,end=0,numval=1) # ~ ridge(PLS) regression a=path(x,y,start=1,end=1,numval=1) # ~ lasso regression a=path(x,y,mode="class",start=0,end=0,numval=1) # ~ ridge classification a=path(x,y,mode="class",start=1,end=1,numval=1) # ~ lasso classification @path.robust.txt Predictor Robustification Description: Robustify input predictor data matrix for linear regression by Winsorizing the data variables Useage: xr <- path.robust( x, qntl=0.025, xmiss=9.9e35) Required argument: x = input predictor data matrix Rows are observations and columns are variables. Must be numeric and type matrix. Optional arguments: qntl = Winsorizing quantile on each input variable xmiss = predictor variable missing value flag. Must be numeric and larger than any predictor variable value. Output: xr = robustified predictor variable data matrix attr(xr,'means') = the respective variable mean values for the robustified data matrix xr. attr(xr,'quants') = the respective lower and upper quantile values for each original variable. Note: this procedure can be called with qntl=0.0 to only calculate variable mean values to be used for faster prediction with missing values in path.pred Example: xr=path.robust(x) a=path(xr,y) yhat=path.pred(xp,a,xr) @path.pred.txt PathSeeker Prediction Description: Predict response values from PathSeeker solutions Usage: yhat <- path.pred( xp, a, x=NULL, xmiss=9.9e35) Required arguments: xp = data matrix of observations to be predicted Rows are observations and columns are variables. Must be numeric and type matrix or vector. a = output from path procedure Optional arguments: x = input data matrix to path procedure (REQUIRED if xp contains missing values and/or x is the output of path.robust) xmiss = predictor variable missing value flag. Must be numeric and larger than any predictor variable value. Output: vector of predicted response values. For classification this is a numeric score whose absolute value reflects confidence that its sign is the same as that of the response. The sign of this score can be used for prediction. Examples: a=path(x,y) yhat=path.pred(xp,a) yhat=path.pred(xp,a,x) @psversion.txt Prints date and version number of PathSeeker installation Usage: psversion () Arguments: none. @psout This is a dummy psout for initialization @@