c c test driver program to exercise mstcpr c real x(5,200),deg(200,2) integer mst(7,200) common /answer/ signif(7) call genpts(200,5,x) do 1 i=1,100 1 x(1,i)=x(1,i)+1.0 call mstcpr(5,200,100,x,4,deg,mst) write(6,'('' '')') do 2 i=1,7 2 write (6,3) i,signif(i) 3 format(' signif('i1,') =' g13.5) stop end c c program implementing Friedman-Rafsky two-sample tests. c c Friedman, J. H. and Rafsky, L. C. (1979). Multivariate analogs of c the Wald-Wolfowitz and Smirnov two-sample tests. Annals of c Statistics, 7, 697-717 (June). c c version of 10/17/79. c c coded and copyright (c) by: c c Jerome H. Friedman c c Department of Statistics c and c Stanford Linear Accelerator Center c Stanford University c c c call mstcpr(p,n,n1,data,ntree,deg,mst) c c integer p,n,n1,ntree c c real data(p,n),deg(n,2) c c integer mst(max0(5,ntree+3),n) c c p = dimension. c n = cardinality of pooled sample. c n1 = cardinality of first sample. c ntree = number of mst's comprising graph. c data = pooled sample data matrix. c first n1 rows correspond to first sample. c rows n1+1 through n correspond to second sample. c deg = scratch storage for mstcpr. upon return c deg(1,1) - deg(n,1) contain degree of each node (point) c in the first mst. c deg(1,2) - deg(n,2) contain degree of each node in c the ntree-th mst. c mst = scratch storage for mstcpr. upon return c mst(k,1) - mst(k,n-1) = kth mst (1.le.k.le.ntree). c c labeled commons: c c common /answer/ signif(ltree+3) c c signif(1) = significance sample 1 (degree 1) test. c signif(2) = significance runs test 1st mst. c signif(3) = significance runs test 1st and 2nd mst. c signif(k+1) = significance runs test union(1-k) mst. c signif(ntree+1) = significance runs test union(1-ntree) mst. c signif(ntree+2) = significance "standard" multivariate c smirnov test. c signif(ntree+3) = significance "radial" multivariate c smirnov test. c c common /print/ iprint c c iprint = print flag for program. c = 0 : no printed output. c .ne. 0: printed output. c default value = 1. c block data common /print/ iprint data iprint /1/ end subroutine primk (m,n,points,ftree,ltree,mst,ui,ji,nit) integer m,n,ftree,ltree real points(m,n),ui(n) integer mst(ltree,n),ji(n),nit(n) do 100 ktree=ftree,ltree ktrem1=ktree-1 nitp=n-1 kp=n do 10 i=1,nitp nit(i)=i ui(i)=1.0e73 10 continue 20 do 80 i=1,nitp ni=nit(i) if (ktrem1.le.0) go to 40 do 30 itree=1,ktrem1 if (mst(itree,ni).eq.kp) go to 60 if (mst(itree,kp).eq.ni) go to 60 30 continue 40 d=0.0 do 50 i001=1,m d=d+(points(i001,ni)-points(i001,kp))**2 50 continue go to 70 60 d=1.0e73 70 if (ui(i).le.d) go to 80 ui(i)=d ji(i)=kp 80 continue uk=ui(1) do 90 i=1,nitp if (ui(i).gt.uk) go to 90 uk=ui(i) k=i 90 continue kp=nit(k) mst(ktree,kp)=ji(k) ui(k)=ui(nitp) nit(k)=nit(nitp) ji(k)=ji(nitp) nitp=nitp-1 if (nitp.ne.0) go to 20 100 continue return end subroutine mstcpr (d,n,n1,x,lt,deg,istor) integer d,n,n1,ft,lt real x(d,n),deg(n,2) integer istor(1) integer ttn,ftn,node real*8 peqk,plek common /answer/ sig(4) /print/ iprint real*8 fn1,fn2,fn,p,var1,covar1,covar2,tqshr,var ft=1 nm1=n-1 ttn=max0(lt*nm1,n+n) ftn=ttn+n+n call primk (d,n,x,ft,lt,istor,deg,istor(ttn+1),istor(ftn+1)) do 10 j=1,nm1 deg(j,1)=1.0 10 continue deg(n,1)=0.0 do 20 j=1,nm1 deg(istor(lt*(j-1)+1),1)=deg(istor(lt*(j-1)+1),1)+1.0 20 continue do 30 j=1,n deg(j,2)=deg(j,1) 30 continue runs=1.0 nd1=0 nd11=nd1 n1p1=n1+1 fn=n fn1=n1 fn2=n-n1 p=2.0*fn1*fn2/(fn*(fn-1.0)) var1=p*(1.0-p) covar1=0.5*p-p**2 covar2=2.0*p*(fn1-1.0)*(fn2-1.0)/((fn-2.0)*(fn-3.0))-p**2 do 150 k=1,lt if (k.eq.1) go to 50 do 40 j=1,nm1 deg(j,2)=deg(j,2)+1.0 deg(istor(lt*(j-1)+k),2)=deg(istor(lt*(j-1)+k),2)+1.0 40 continue go to 70 50 do 60 j=1,n if (deg(j,1).ne.1) go to 60 nd1=nd1+1 if (j.le.n1) nd11=nd11+1 60 continue 70 do 80 j=1,n1 if (istor(lt*(j-1)+k).gt.n1) runs=runs+1.0 80 continue do 90 j=n1p1,nm1 if (istor(lt*(j-1)+k).le.n1) runs=runs+1.0 90 continue tqshr=0.0 do 100 j=1,n if (deg(j,2).gt.1.0) tqshr=tqshr+deg(j,2)*(deg(j,2)-1.0) 100 continue if (k.ne.1) go to 120 p11=fn1/fn e11=p11*nd1 v11=e11*(1.0-p11)*(fn-nd1)/(fn-1.0) std11=(nd11-e11)/sqrt(v11) call mdhyp (min0(nd11,nd1-nd11),nd1,n,n1,peqk,plek,ier) sig(1)=2.0*plek-peqk if (iprint.eq.0) go to 120 write (6,110) nd1,nd11,e11,std11,sig(1) 110 format (/,' ',i5,' deg=1 nodes. class 1 =',i5,' expect =',f5.1, 1/,' ',g13.5,' stds with hypergeometric significance ',g13.5) 120 if (k.lt.ft) go to 150 edges=k*nm1 eruns=p*edges+1.0 var=var1*edges+covar1*tqshr+covar2*(edges*(edges-1.0)-tqshr) std=(runs-eruns)/dsqrt(var) sig(k+1)=0.5*(1.0+erf(std/1.41421)) if (iprint.eq.0) go to 150 qshr=0.5*tqshr/(n-2) write (6,130) k,qshr 130 format (/,' ',i2,'th mst. (number of mst edges that share no 1des)/(n-2) = ',g13.5) write (6,140) runs,eruns,std,sig(k+1) 140 format (/,' runs =',f5.0,' expect =',f5.0,/,' ',g13.5, 1' stds with normal significance ',g13.5) 150 continue if (ft.gt.1) return istor(ftn+1)=0 do 160 j=2,n istor(ftn+j)=istor(ftn+j-1)+deg(j-1,1) 160 continue do 170 j=1,nm1 istor(ftn+j)=istor(ftn+j)+1 istor(ttn+istor(ftn+j))=istor(lt*(j-1)+1) node=istor(lt*(j-1)+1) istor(ftn+node)=istor(ftn+node)+1 istor(ttn+istor(ftn+node))=j 170 continue istor(ftn+1)=0 do 180 j=2,n istor(ftn+j)=istor(ftn+j-1)+deg(j-1,1) 180 continue call smrnov (x,d,n,n1,deg,istor(ftn+1),istor(ttn+1),istor,deg(1,2) 1,lt+2) return end subroutine smrnov (x,d,n,n1,deg,pntr,auux,stack,depth,place) integer d,n1,n,place,maxdph,enddia,center,diaend real x(d,1),deg(1),depth(1) integer pntr(1),auux(1),stack(2,1) maxdph=n+1 call maxtag (1,deg,pntr,auux,stack,n,depth) enddia=diaend(1,deg,pntr,auux,depth,maxdph) call maxtag (enddia,deg,pntr,auux,stack,n,depth) call stdsmr (enddia,deg,pntr,auux,depth,n1,n,stack,place) maxdph=depth(enddia)/2.0+1.0 center=diaend(enddia,deg,pntr,auux,depth,maxdph) call radsmr (x,d,n,n1,center,deg,pntr,auux,stack,depth,place+1) return end subroutine stdsmr (root,deg,pntr,auux,depth,n1,n,stack,place) common /answer/ sig(4) /print/ iprint integer root,n1,n,node,stkptr,parent,next,son,place real sum,depth(1),deg(1),distks integer pntr(1),auux(1),stack(1) f1=-1.0/n1 f2=1.0/(n-n1) node=root parent=0 stkptr=parent deg(node)=-deg(node) sum=f1 if (node.gt.n1) sum=f2 distks=abs(sum) 10 jdeg=abs(deg(node)) next=0 dphmin=9.9e73 do 20 i=1,jdeg son=auux(pntr(node)+i) if (deg(son).le.0) go to 20 if (depth(son).ge.dphmin) go to 20 dphmin=depth(son) next=son 20 continue if (next.ne.0) go to 30 if (stkptr.eq.0) go to 60 node=stack(stkptr) parent=0 if (stkptr.gt.1) parent=stack(stkptr-1) stkptr=stkptr-1 go to 10 30 stkptr=stkptr+1 parent=node stack(stkptr)=parent node=next if (node.le.n1) go to 40 sum=sum+f2 go to 50 40 sum=sum+f1 50 distks=amax1(distks,abs(sum)) deg(node)=-deg(node) go to 10 60 call ksnull (distks,max0(n1,n-n1),min0(n1,n-n1),p1side,p2side,ier) if (iprint.eq.0) go to 80 write (6,70) distks,p2side 70 format (/,' mst standard kolmogorov-smirnov distance = ',g13.5, 1/,' with significance = ',g13.5) 80 sig(place)=p2side return end subroutine radsmr (x,d,n,n1,start,deg,pntr,auux,stack,depth,place) common /answer/ sig(4) /print/ iprint integer d,start,n1,n,node,stkptr,parent,next,istart,place real x(d,n),depth(1),deg(1),distks,f1,f2 integer pntr(1),auux(1),stack(n,1) node=start parent=0 istart=1 stkptr=0 10 jdeg=abs(deg(node)) i=istart go to 30 20 i=i+1 30 if ((i).gt.(jdeg)) go to 40 next=auux(pntr(node)+i) if (next.ne.parent) go to 50 go to 20 40 depth(node)=stkptr if (stkptr.eq.0) go to 60 node=stack(stkptr,1) parent=0 if (stkptr.gt.1) parent=stack(stkptr-1,1) istart=stack(stkptr,2) stkptr=stkptr-1 go to 10 50 stkptr=stkptr+1 stack(stkptr,1)=node stack(stkptr,2)=i+1 parent=node node=next istart=1 go to 10 60 do 70 i=1,n stack(i,1)=i 70 continue call sort (depth,stack,1,n) m=0 80 m=m+1 m0=m 90 if (depth(m).ne.depth(m+1)) go to 100 m=m+1 if (m.ge.n) go to 100 go to 90 100 if (m.le.m0) go to 130 do 120 j=m0,m kj=stack(j,1) depth(j)=0 do 110 i=1,d depth(j)=depth(j)+(x(i,kj)-x(i,start))**2 110 continue 120 continue call sort (depth,stack,m0,m) 130 if (m.ge.n) go to 140 go to 80 140 distks=0.0 sum=distks f1=-1.0/n1 f2=1.0/(n-n1) do 170 j=1,n if (stack(j,1).gt.n1) go to 150 sum=sum+f1 go to 160 150 sum=sum+f2 160 distks=amax1(distks,abs(sum)) 170 continue call ksnull (distks,max0(n1,n-n1),min0(n1,n-n1),p1side,p2side,ier) if (iprint.eq.0) go to 190 write (6,180) distks,p2side 180 format (/,' mst radial kolmogorov-smirnov distance = ',g13.5, 1/,' with significance = ',g13.5) 190 sig(place)=p2side return end subroutine maxtag (start,deg,pntr,auux,stack,n,count) integer start,n,node,stkptr,parent,next,istart real count(1),deg(1),maxcnt integer pntr(1),auux(1),stack(2,1) do 10 i=1,n count(i)=0.0 10 continue node=start parent=0 istart=1 stkptr=0 20 jdeg=abs(deg(node)) i=istart go to 40 30 i=i+1 40 if ((i).gt.(jdeg)) go to 50 next=auux(pntr(node)+i) if (next.ne.parent) go to 90 go to 30 50 maxcnt=0.0 i=1 go to 70 60 i=i+1 70 if ((i).gt.(jdeg)) go to 80 next=auux(pntr(node)+i) if (next.ne.parent) maxcnt=amax1(maxcnt,count(next)) go to 60 80 count(node)=maxcnt+1.0 if (stkptr.eq.0) return node=stack(1,stkptr) parent=0 if (stkptr.gt.1) parent=stack(1,stkptr-1) istart=stack(2,stkptr) stkptr=stkptr-1 go to 20 90 stkptr=stkptr+1 stack(1,stkptr)=node stack(2,stkptr)=i+1 parent=node node=next istart=1 go to 20 end integer function diaend(start,deg,pntr,auux,count,maxdph) integer start,node,parent,next,nexts,depth,maxdph real count(1),deg(1),maxcnt integer pntr(1),auux(1) node=start parent=0 depth=0 10 maxcnt=0.0 depth=depth+1 jdeg=abs(deg(node)) i=1 go to 30 20 i=i+1 30 if ((i).gt.(jdeg)) go to 40 next=auux(pntr(node)+i) if (next.eq.parent) go to 20 if (count(next).le.maxcnt) go to 20 maxcnt=count(next) nexts=next go to 20 40 if ((maxcnt.ne.0.0).and.(depth.lt.maxdph)) go to 50 diaend=node return 50 parent=node node=nexts go to 10 end subroutine ksnull (pdif1,n,m,pdif5,pdif6,ier) ier=0 fn=n fm=m fnd=1./fn fmd=1./fm c smirnov continuity correction cc=sqrt(fnd)*.6666667 nl50=1 nl100=1 c set warnings, choose approximation c probabilities if (n.lt.50) nl50=0 if (n.lt.100) nl100=0 if (fm*fnd.lt..1) go to 50 if (n.eq.m*(n/m)) go to 40 c adjust smirnov continuity correcti cc=.6*cc if (nl100.eq.0) go to 30 c calculate the statistic z 10 pdif4=pdif1*sqrt((fn*fm)/(fn+fm))+cc c obtain probabilities 20 call mdsmr (pdif4,pdif5,pdif6) pdif5=1.-pdif5 pdif6=1.-pdif6 return c set error indicators 30 ier=37 if (nl50.eq.0) ier=36 go to 10 40 if (nl100.eq.0) ier=35 go to 10 50 if (nl50.ne.0) go to 70 ier=33 c dm 60 pdif4=pdif1*sqrt(fm) go to 20 70 if (m.ge.80) go to 80 ier=34 go to 60 80 if (nl100.ne.0) go to 10 go to 60 end subroutine mdsmr (x,p1,p2) x1=-x*x p1=exp(x1+x1) if (x.ge..22) go to 10 p2=0.0 go to 40 10 x8=8.*x1 if (x.gt.0.8) go to 20 p2=(2.506628/x)*exp(9.869604/x8) go to 40 20 if (x.le.3.15) go to 30 p2=1.0 p1=1.0 go to 50 30 x8=p1*p1 x8=x8*x8 p2=p1*x8*x8-x8+p1 p2=1.0-p2-p2 40 p1=1.0-p1 50 return end subroutine sort (v,a,ii,jj) c c puts into a the permutation vector which sorts v into c increasing order. only elements from ii to jj are considered. c arrays iu(k) and il(k) permit sorting up to 2**(k+1)-1 elements c c this is a modification of cacm algorithm #347 by r. c. singleton, c which is a modified hoare quicksort. c dimension a(jj),v(1),iu(20),il(20) integer t,tt integer a real v m=1 i=ii j=jj 10 if (i.ge.j) go to 80 20 k=i ij=(j+i)/2 t=a(ij) vt=v(ij) if (v(i).le.vt) go to 30 a(ij)=a(i) a(i)=t t=a(ij) v(ij)=v(i) v(i)=vt vt=v(ij) 30 l=j if (v(j).ge.vt) go to 50 a(ij)=a(j) a(j)=t t=a(ij) v(ij)=v(j) v(j)=vt vt=v(ij) if (v(i).le.vt) go to 50 a(ij)=a(i) a(i)=t t=a(ij) v(ij)=v(i) v(i)=vt vt=v(ij) go to 50 40 a(l)=a(k) a(k)=tt v(l)=v(k) v(k)=vtt 50 l=l-1 if (v(l).gt.vt) go to 50 tt=a(l) vtt=v(l) 60 k=k+1 if (v(k).lt.vt) go to 60 if (k.le.l) go to 40 if (l-i.le.j-k) go to 70 il(m)=i iu(m)=l i=k m=m+1 go to 90 70 il(m)=k iu(m)=j j=l m=m+1 go to 90 80 m=m-1 if (m.eq.0) return i=il(m) j=iu(m) 90 if (j-i.ge.11) go to 20 if (i.eq.ii) go to 10 i=i-1 100 i=i+1 if (i.eq.j) go to 80 t=a(i+1) vt=v(i+1) if (v(i).le.vt) go to 100 k=i 110 a(k+1)=a(k) v(k+1)=v(k) k=k-1 if (vt.lt.v(k)) go to 110 a(k+1)=t v(k+1)=vt go to 100 end subroutine genpts(nvect,ndim,y) dimension y(1),u(2) c n=nvect*ndim do 1000 j=1,n,2 200 continue call ranums(u,2) v1=2.*u(1)-1. v2=2.*u(2)-1. s=v1*v1+v2*v2 if(s.ge.1.) go to 200 s=sqrt(-2.*alog(s)/s) y(j)=v1*s if((j+1).le.n)y(j+1)=v2*s 1000 continue return end c c subroutine ranums(x,n) real x(n) c generate n uniform random numbers in the interval (0.0, 1.0). c urand -- a portable random number generator, by michael a. c malcolm and cleve b. moler, stanford computer science report c stan-cs-73-334, january 1974. c note that the statistical properties of this routine have not c been extensively verified on a large number of architectures. c this is a uniform random number generator based on theory and c suggestions given in d.e. knuth (1969), vol 2. integer ia, ic, itwo, iy, m2, m double precision halfm, datan, dsqrt data m2 / 0 /, itwo / 2 /, iy /123456789/ if (m2 .ne. 0) go to 20 c if first entry, compute machine integer work length m = 1 10 m2 = m m = itwo * m2 if (m .gt. m2) go to 10 halfm = m2 c compute multiplier and increment for linear congruential method ia = 8 * idint(halfm*datan(1.0d0)/8.0d0) + 5 ic = 2 * idint(halfm*(0.5d0-dsqrt(3.0d0)/6.0d0)) + 1 c s is the scale factor for converting to floating point s = 0.5 / halfm c now compute next random number 20 do 30 i=1,n iy = iy * ia + ic c the following statement is for computers where the word length for c addition is greater than that for mulitplication if (iy/2 .gt. m2) iy = (iy - m2) - m2 c the following statement is for computers where integer overflow c affects the sign bit if (iy .lt. 0) iy = (iy + m2) + m2 30 x(i) = float(iy) * s return end function erf(z) double precision a(6),b data a /.0705230784d0,.0422820123d0,.0092705272d0, * .0001520143d0,.0002765672d0,.0000430638d0/ x=abs(z) b=1.0d0+x*(a(1)+x*(a(2)+x*(a(3)+x*(a(4)+x*(a(5)+x*a(6)))))) x=1.0d0-1.0d0/b**16 erf=sign(x,z) return end subroutine mdhyp (k,n,l,nd,peqk,plek,ier) real*8 eps,p,sp,u,a,b,a1,b1,aa,bb,annd,anxd,aj real*8 peqk,plek data eps/1.0d-78/ if ((k.ge.0).and.(k.le.n)) go to 10 ier=129 go to 150 c check range of n 10 if ((n.ge.1).and.(n.le.l)) go to 20 ier=130 go to 150 c check range of nd 20 if ((nd.ge.0).and.(nd.le.l)) go to 30 ier=131 go to 150 30 if (n.le.32767) go to 40 ier=132 go to 150 c initialization for recursive formula 40 ier=0 p=0.0d0 sp=0.0d0 if (k.gt.nd) go to 130 if ((n-k).gt.(l-nd)) go to 140 p=1.0d0 nnd=min0(n,nd) nxd=max0(n,nd) b=l annd=nnd anxd=nxd c choose more efficient direction for c computation of probabilities if ((k*(l+2)).gt.((nd+1)*(n+1))) go to 50 c initialization for calculation of c p(x .le. k) iflag=0 aa=anxd-annd bb=b-anxd-annd k0=n+nd-l if (k0.lt.0) k0=0 aj=k0 a1=annd-aj b1=aj+1.0d0 mm1=k-k0 a=b-anxd+aj mm=nnd-k0 go to 60 c initialization for calculation of c p(x .gt. k) 50 iflag=1 aa=b-anxd-annd bb=anxd-annd a1=annd b1=1.0d0 mm1=nnd-k mm=l-nxd a=b-annd if (nnd.ge.mm) go to 60 mm=nnd a=anxd 60 icnt=0 if (mm.eq.0) go to 90 c loop for peqk do 80 m=1,mm u=a/b c detect and correct for potential c underflow if (u.ge.eps/p) go to 70 p=p/eps icnt=icnt+1 70 p=p*u a=a-1.0d0 b=b-1.0d0 80 continue 90 if (mm1.eq.0) go to 120 c loop for plek do 110 m=1,mm1 if (icnt.eq.0) sp=sp+p u=a1*(a1+aa)/(b1*(b1+bb)) p=u*p if (p.lt.1.0d0) go to 100 c inverse underflow adjustment p=p*eps icnt=icnt-1 100 a1=a1-1.0d0 b1=b1+1.0d0 110 continue 120 if (icnt.ne.0) p=0.0d0 if (iflag.ne.0) go to 130 c p(x .le. k)=p(x .eq. k)+p(x .lt. k) sp=sp+p go to 140 130 sp=1.0d0-sp 140 peqk=p plek=sp go to 150 150 return end