# # R procedures for GPS # # Friedman 2008 # # (10/17/08) # # Coded and copyright (2008) by Jerome H. Friedman # # gpsversion=function() { cat(' R/GPS 1.0 (10/17/08)\n') cat(' Copyright (2008) by Jerome H. Friedman\n') invisible() } gpsbridge <- function(x,y,mode='regres',pens=c(0.0,0.1,0.2,0.5,1.0,1.5), nfold=-5,std='yes',sel='no',modsel=2,costs=c(1.0,1.0),maxin=ncol(x), obswts='equal',varpens='equal',xmiss=9.9e35,stepsize=0.01,maxstep=10000, qntl=0.025,impl='update',speed=1,quiet=F,data.check=T) { if (!is.character(mode)) stop("mode must be of type character.") if (mode != "regres" && mode != "class") stop(paste('mode must equal either "regres" for regression,', 'or "class" for classification.')) if (!is.character(impl)) stop("impl must be of type character.") if (impl != "update" && impl != "naive") stop('impl must equal "update" or "naive".') if (!is.character(std)) stop("std must be of type character.") if (std != "yes" && std != "no") stop('std must equal "yes" or "no".') if (!is.character(sel)) stop("sel must be of type character.") if (sel != "yes" && sel != "no") stop('sel must equal "yes" or "no".') if (any(pens < 0.0) || any(pens > 2.0)) stop("all pens values must be >= 0.0 and <= 2.0") if(mode(x)!="numeric") stop(" x must be numeric.") if(!is.matrix(x)) stop(" x must be a matrix.") if(mode(y)!="numeric") stop(" y must be numeric.") if (mode=='regres') { if (!is.vector(y)) stop("y must be a vector for regression.") lny=length(y) } else { if(!is.matrix(y)) stop(" y must be a matrix for classification.") if (ncol(y) != 2) stop(" y must have two columns for classification.") lny=nrow(y) } if(lny != nrow(x)) stop (" x and y are of unequal length.") if (!is.logical(data.check)) { data.check=T warning(" data.check must be logical (T/F) - T substituted.") } if (data.check) { if(any(is.na(x))) { x[is.na(x)] <- xmiss; warning(" x contains NA's - xmiss substituted.") } if(any(is.na(y))) stop(" y contains NA's.") if(mode=='class') { if(any(y <0)) stop( 'y must be non negative for classification') } if(!missing(xmiss)) xmiss <- parchk("xmiss",xmiss,2*max(x[x!=xmiss]),9.9e35,9.9e35) } zz=file(paste(gpshome,'/pred.dat',sep=''),'wb') writeBin(as.real(as.vector(x)),zz,size=4); close(zz) zz=file(paste(gpshome,'/response.dat',sep=''),'wb') writeBin(as.real(as.vector(y)),zz,size=4); close(zz) if (mode=='regres') { if(is.character(obswts)) { if (obswts != 'equal') stop('obswts must be "equal" in this context.') obstmp='equal' } else { if(!is.vector(obswts)) stop("obswts must be a vector in this context.") if(mode(obswts)!="numeric") stop(" obswts must be numeric in this context.") if(length(obswts)!= nrow(x)) stop (" x and obswts are of unequal size.") if (data.check) { if(any(is.na(obswts))) { obswts[is.na(obswts)] <- 0; warning(" obswts contains NA's - zeros substituted.") } if(any(obswts<0)) { obswts[obswts<0] <- 0; warning(" obswts contains negative numbers - zeros substituted.") } } obstmp='obsweights' zz=file(paste(gpshome,'/obsweights',sep=''),'wb') writeBin(as.real(obswts),zz,size=4); close(zz) } } else { obstmp='equal'} if(is.character(varpens)) { if (varpens != 'equal') stop('varpens must be "equal" in this context.') vartmp='equal' } else { if(!is.vector(varpens)) stop("varpens must be a vector in this context.") if(mode(varpens)!="numeric") stop(" varpens must be numeric in this context.") if(length(varpens)!= ncol(x)) stop (" x and varpens are of unequal length.") if (data.check) { if(any(is.na(varpens))) { varpens[is.na(varpens)] <- 9.9e30; warning(" varpens contains NA's - 9.9e30 substituted.") } if(any(varpens<0)) { varpens[varpens<0] <- 0; warning(" varpens contains negative numbers - zeros substituted.") } } vartmp='varpens' zz=file(paste(gpshome,'/varpens',sep=''),'wb') writeBin(as.real(varpens),zz,size=4); close(zz) } nfold=parchk("nfold",nfold,-nrow(x),nrow(x),-5) modsel <- parchk("modsel",modsel,1,2,2); costs[1] <- parchk("costs[1]",costs[1],1.0e-7,1.0e6,1.0) costs[2] <- parchk("costs[2]",costs[2],1.0e-7,1.0e6,1.0) maxin=parchk("maxin",maxin,1,ncol(x),ncol(x)) maxstep <- parchk("maxstep",maxstep,1,100000000,10000) stepsize <- parchk("stepsize",stepsize,1.e-7,1.0,0.01) qntl <- parchk("qntl",qntl,0.0,0.4,0.025) speed <- parchk("speed",speed,1,100,1) cpens=paste(c(as.character(pens)),collapse=',') ccosts=paste(c(as.character(costs)),collapse=',') parm.tr <- c( "@mode=" %&% mode, "@nvar=" %&% ncol(x), "@nobs=" %&% nrow(x), "@format=binary", "@org=by_var", "@missing=" %&% xmiss, "@obsweights=" %&% obstmp, "@varpens=" %&% vartmp, "@quantile=" %&% qntl, "@nfold=" %&% nfold, "@modsel=" %&% modsel, "@stepsize=" %&% stepsize, paste("@penalties=",cpens,sep=''), paste("@costs=",ccosts,sep=''), "@maxvarin=" %&% maxin, "@maxstep=" %&% maxstep, "@speed=" %&% speed, "@standardize=" %&% std, "@selector=" %&% sel, "@implementation=" %&% impl ) wd <- getwd(); setwd(gpshome) write(parm.tr, file='parm.txt') write(rep(0,(ncol(x)+1)),file='coeffs.gps') if (platform=='windows') { system('cmd.exe /c GPStrain.exe < parm.txt > gpsout', intern=T,invisible=T, minimized=T) } else { system('rm gpsout') system("./GPStrain.exe < parm.txt > gpsout") } gpsout=readLines('gpsout') setwd(wd) if(!quiet) { t=as.character(Sys.time()); cat(t,'\n'); printgps(gpsout)} if (any(gpsout == 'GPS estimated optimal:')) { inputs=get.inputs(x,mode,pens,nfold,std,sel,modsel,costs,maxin, obswts,varpens,xmiss,stepsize,maxstep,qntl,impl,speed) invisible(gpsmodel(gpsout,inputs)) } else { model[[1]]=gpsout; model[[2]]=Sys.time() cat('GPS errors: no model produced\n'); invisible() } } parchk <- function (ax,x,lx,ux,df) { if(x < lx || x > ux) { df warning(paste(" invalid value for",ax,"- default (",df,") used.")) } else { x} } # # This is an infix operator version of paste(), with no spaces between # elements pasted together. # Posted by David Brahm to r-help mailing list, 9/25/01 "%&%" <- function(a, b) paste(a, b, sep="") # gpspred=function(model, xp, xmiss=9.9e35, quiet=T) { checkmodel(model) if(mode(xp)!="numeric") stop(" xp must be numeric.") if(is.matrix(xp)) { nobs=nrow(xp)} else if(is.vector(xp)) { nobs=1} else { stop(' xp must be a vector or matrix')} if(any(is.na(xp))) { xp[is.na(xp)] <- xmiss} parm.tr <- c("@nobs=" %&% nobs, "@missing=" %&% xmiss, "@data_format=binary", "@org=by_var", "@pred_format=binary") zz=file(paste(gpshome,'/test.dat',sep=''),'wb') writeBin(as.real(as.vector(xp)),zz,size=4); close(zz) zz=file(gpshome%&%'/model.gps','wb'); l2=length(model[[2]]) writeBin(as.integer(model[[2]][1:l2]),zz,size=4); close(zz) wd=getwd(); setwd(gpshome) write(parm.tr, file='parm.txt') if (platform=='windows') { system('cmd.exe /c GPSpred.exe < parm.txt > gpsout', intern=T,invisible=T, minimized=T) } else { system('rm gpsout') system("./GPSpred.exe < parm.txt > gpsout") } if(!quiet) printgps(readLines('gpsout')) setwd(wd) zz=file(paste(gpshome,'/pred.gps',sep=''),'rb') preds=readBin(zz, numeric(), nobs, size=4); close(zz) preds } gpspaths=function(model, vars=1:10, ord='entry', first=T, col='rainbow', title='GPS paths', cex=0.9) { checkmodel(model) if (ord != 'entry' && ord!= 'org') stop('ord must equal "entry" or "org".') itp=model[[3]][2]; ni=model[[3]][3]; nstp=model[[3]][4]; nin=model[[3]][5] if (ord == 'entry') { if (!any(vars <= nin)) stop('all specified vars have zero coefficients at all path points.') m=model[[3]][6:(nin+5)]; m=m[vars[vars<=nin]]; } else { m=vars[vars<=ni]} z=model[[4]][2:(nstp+1)]; a=model[[5]][2:(ni*nstp+1)]; a=matrix(a,nrow=ni,ncol=nstp) gpspathplot(itp,z,a,m,title,first,col,cex) invisible() } gpspathplot=function(itp, z, a, m, title, first, col, cex) { lm=length(m); lz=length(z); rge=z[lz]-z[1] if (first) { if (itp==1) { plot(z,a[m[1],],type='l',xlab='Explainable risk', ylab='Coefficient values',xlim=c(z[1],z[lz]+0.05*rge), ylim=c(min(a),max(a)),col='black') } else { plot(z,a[m[1],],type='l',xlab='Variables entered', ylab='Coefficient values',xlim=c(z[1],z[lz]+0.05*rge), ylim=c(min(a),max(a)),log='x',col='black') } title(title) } if (col == 'rainbow') { rain=rainbow(lm)[sample(1:lm,lm)] for (i in 1:lm) { lines(z,a[m[i],],col=rain[i])} } else { for (i in 1:lm) { lines(z,a[m[i],],col=col)}} lines(z,rep(0,lz),col='black') for (i in 1:lm) { text(z[lz]+0.04*rge,a[m[i],lz],m[i],col='black',cex=cex)} invisible() } gpssoln=function(model, vars=1:20, ord='entry') { checkmodel(model) if (ord != 'entry' && ord!= 'org') stop('ord must equal "entry" or "org".') ni=model[[3]][3]; nin=model[[3]][5] if (ord == 'entry') { if (!any(vars <= nin)) stop('all specified vars have zero coefficients at all path points.') m=model[[3]][6:(nin+5)]; m=m[vars[vars<=nin]]; } else { m=vars[vars<=ni]} coeffs=model[[6]][m+1] intercept=model[[6]][ni+2] list(order=m, coeffs=coeffs, intercept=intercept) } gpsmodel=function(gpsout,inputs) { model=list() model[[1]]=gpsout zz=file(gpshome%&%'/model.gps','rb') it=readBin(zz,integer(),size=4,n=3); close(zz) nvar=it[2]; nin=it[3] zz=file(gpshome%&%'/model.gps','rb') model[[2]]=readBin(zz,integer(),size=4,n=3*(nvar+1)+2*nin+1); close(zz) zz=file(gpshome%&%'/cpaths.gps','rb') it=readBin(zz,integer(),size=4,n=5); close(zz) nvar=it[3]; nstp=it[4]; nin=it[5] zz=file(gpshome%&%'/cpaths.gps','rb') model[[3]]=readBin(zz,integer(),size=4,n=nin+5); close(zz) zz=file(gpshome%&%'/zpaths.gps','rb') model[[4]]=readBin(zz,numeric(),size=4,n=nstp+1); close(zz) zz=file(gpshome%&%'/apaths.gps','rb') model[[5]]=readBin(zz,numeric(),size=4,n=nvar*nstp+1); close(zz) zz=file(gpshome%&%'/coeffs.gps','rb') model[[6]]=readBin(zz,numeric(),size=4,n=nvar+2); close(zz) model[[7]]=Sys.time() model[[8]]=inputs[[1]] model[[9]]=inputs[[2]] invisible(model) } checkmodel=function(model) { if (!is.list(model)) stop('model not from GPS.') if(!is.character(model[[1]])) stop('model not from GPS.') if(length(model) == 2) { cat(as.character(model[[2]]),'\n'); printgps(model[[1]]) stop('GPS errors: no model produced') } else if (length(model) != 9) { stop('model not from GPS.')} invisible() } gpsmodel.info=function(model, inputs=T) { checkmodel(model) cat(as.character(model[[7]]),'\n') if(inputs) print.inputs(model[[8]],model[[9]]) printgps(model[[1]]) invisible() } get.inputs <- function(x,mode,pens,nfold,std,sel,modsel,costs,maxin, obswts,varpens,xmiss,stepsize,maxstep,qntl,impl,speed) { parms=rep(' ',18) parms[1]=as.character(nrow(x)); parms[2]=as.character(ncol(x)) parms[3]=mode; parms[4]=as.character(nfold); parms[5]=std; parms[6]=sel parms[7]=as.character(modsel); parms[8]=as.character(costs[1]) parms[9]=as.character(costs[2]); parms[10]=as.character(maxin) if (obswts == 'equal') { parms[11]='equal'} else { parms[11]='input'} if (varpens == 'equal') { parms[12]='equal'} else { parms[12]='input'} parms[13]=as.character(xmiss); parms[14]=as.character(stepsize) parms[15]=as.character(maxstep); parms[16]=as.character(qntl) parms[17]=impl; parms[18]=as.character(speed) cpens=as.character(pens) list(parms,cpens) } print.inputs=function(parms,cpens) { cat('\n') cat('inputs:\n') cat(' nobs =',parms[1],' nvar =',parms[2],' mode =',parms[3],'\n') cat(' pens =',cpens,'\n') cat(' nfold =',parms[4],' std =',parms[5],' sel =',parms[6], ' modsel =',parms[7],'\n') if (parms[3]=='class' && as.integer(parms[7])==1) { cat(' costs =',parms[8],parms[9],'\n') } cat(' maxin =',parms[10],' obswts =',parms[11],' varpens =', parms[12],'\n') cat(' xmiss =',parms[13],' stepsize =',parms[14],' maxstep =', parms[15],'\n') if (parms[3]=='regres') { cat(' qntl =',parms[16],' impl =',parms[17], ' speed =', parms[18],'\n') } else { cat(' qntl =',parms[16],' speed =',parms[18],'\n') } invisible() } printgps=function(gpsout) { for (i in 1:length(gpsout)) { cat(gpsout[i],'\n')} }