R version 2.9.2 (2009-08-24)
Copyright (C) 2009 The R Foundation for Statistical Computing
ISBN 3-900051-07-0

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

>
>
>
> # package for trees (rpart is used below to fit and summarise them)
>
> library(rpart)
>
> # package including data from Elements of Statistical Learning
>
> library(ElemStatLearn)
>
> data(spam)
>
> # (disabled) recode the response as a 0-1 outcome; the line below is commented out,
> # so the labels 'spam' / 'email' are used as they are in the rest of the session
> #spam$spam = ifelse(spam$spam=="spam",1,0)
> # row indices of the spam messages and of the non-spam ('email') messages
> spam.sub = c(1:nrow(spam))[spam$spam == 'spam']
> nospam.sub = c(1:nrow(spam))[spam$spam == 'email']
>
> # use 2/3 for training, 1/3 for test, sampled separately within each class;
> # no set.seed() call appears in this session, so the split changes from run to run
> train.spam = sample(spam.sub,floor(length(spam.sub)*2/3))
> train.email = sample(nospam.sub,floor(length(nospam.sub)*2/3))
> train = c(train.spam,train.email)
> train.set = spam[train,]
> test.set = spam[-train,]
> # classification tree for spam on all other columns, Gini index as splitting criterion
> rpart.spam = rpart(spam ~ ., data=train.set, method="class",
+ parms=list(split="gini"))
>
> # take a look at the decision rule (CP table, per-node details, then the tree listing)
>
> print(summary(rpart.spam))
Call:
rpart(formula = spam ~ ., data = train.set, method = "class",
parms = list(split = "gini"))
n= 3066

CP nsplit rel error xerror xstd
1 0.49006623 0 1.0000000 1.0000000 0.02239769
2 0.13907285 1 0.5099338 0.5173841 0.01846592
3 0.04387417 2 0.3708609 0.3774834 0.01630984
4 0.04139073 3 0.3269868 0.3667219 0.01611566
5 0.03145695 4 0.2855960 0.3220199 0.01525619
6 0.01000000 5 0.2541391 0.2756623 0.01426228

Node number 1: 3066 observations, complexity param=0.4900662
predicted class=email expected loss=0.3939987
class counts: 1858 1208
probabilities: 0.606 0.394
left son=2 (2260 obs) right son=3 (806 obs)
Primary splits:
A.53 < 0.0395 to the left, improve=489.7838, (0 missing)
A.52 < 0.0785 to the left, improve=444.7535, (0 missing)
A.7 < 0.01 to the left, improve=403.4466, (0 missing)
A.21 < 0.415 to the left, improve=366.4197, (0 missing)
A.16 < 0.04 to the left, improve=364.0717, (0 missing)
Surrogate splits:
A.24 < 0.045 to the left, agree=0.830, adj=0.355, (0 split)
A.23 < 0.045 to the left, agree=0.829, adj=0.351, (0 split)
A.20 < 0.015 to the left, agree=0.788, adj=0.194, (0 split)
A.11 < 0.035 to the left, agree=0.787, adj=0.191, (0 split)
A.9 < 0.165 to the left, agree=0.786, adj=0.187, (0 split)

Node number 2: 2260 observations, complexity param=0.1390728
predicted class=email expected loss=0.2252212
class counts: 1751 509
probabilities: 0.775 0.225
left son=4 (2052 obs) right son=5 (208 obs)
Primary splits:
A.7 < 0.065 to the left, improve=211.00080, (0 missing)
A.16 < 0.135 to the left, improve=173.46560, (0 missing)
A.52 < 0.086 to the left, improve=166.08940, (0 missing)
A.55 < 3.748 to the left, improve=105.39160, (0 missing)
A.21 < 0.425 to the left, improve= 97.78883, (0 missing)
Surrogate splits:
A.56 < 131.5 to the left, agree=0.913, adj=0.058, (0 split)
A.4 < 6.105 to the left, agree=0.909, adj=0.014, (0 split)
A.17 < 4.07 to the left, agree=0.909, adj=0.014, (0 split)
A.54 < 0.889 to the left, agree=0.909, adj=0.014, (0 split)
A.20 < 1.09 to the left, agree=0.909, adj=0.010, (0 split)

Node number 3: 806 observations, complexity param=0.04387417
predicted class=spam expected loss=0.1327543
class counts: 107 699
probabilities: 0.133 0.867
left son=6 (67 obs) right son=7 (739 obs)
Primary splits:
A.25 < 0.36 to the right, improve=85.03161, (0 missing)
A.26 < 0.09 to the right, improve=39.89185, (0 missing)
A.37 < 0.025 to the right, improve=35.45837, (0 missing)
A.52 < 0.0775 to the left, improve=35.07888, (0 missing)
A.27 < 0.21 to the right, improve=28.96989, (0 missing)
Surrogate splits:
A.26 < 0.265 to the right, agree=0.948, adj=0.373, (0 split)
A.31 < 0.05 to the right, agree=0.934, adj=0.209, (0 split)
A.28 < 0.02 to the right, agree=0.933, adj=0.194, (0 split)
A.30 < 0.05 to the right, agree=0.931, adj=0.164, (0 split)
A.27 < 0.225 to the right, agree=0.929, adj=0.149, (0 split)

Node number 4: 2052 observations, complexity param=0.04139073
predicted class=email expected loss=0.1564327
class counts: 1731 321
probabilities: 0.844 0.156
left son=8 (1848 obs) right son=9 (204 obs)
Primary splits:
A.52 < 0.5085 to the left, improve=98.42923, (0 missing)
A.16 < 0.2 to the left, improve=98.40579, (0 missing)
A.55 < 3.701 to the left, improve=51.29531, (0 missing)
A.21 < 0.865 to the left, improve=42.92236, (0 missing)
A.23 < 0.28 to the left, improve=37.82468, (0 missing)
Surrogate splits:
A.23 < 0.615 to the left, agree=0.904, adj=0.034, (0 split)
A.22 < 12.01 to the left, agree=0.902, adj=0.010, (0 split)
A.24 < 3.305 to the left, agree=0.902, adj=0.010, (0 split)
A.13 < 1.83 to the left, agree=0.901, adj=0.005, (0 split)
A.21 < 8.345 to the left, agree=0.901, adj=0.005, (0 split)

Node number 5: 208 observations
predicted class=spam expected loss=0.09615385
class counts: 20 188
probabilities: 0.096 0.904

Node number 6: 67 observations
predicted class=email expected loss=0.1044776
class counts: 60 7
probabilities: 0.896 0.104

Node number 7: 739 observations
predicted class=spam expected loss=0.06359946
class counts: 47 692
probabilities: 0.064 0.936

Node number 8: 1848 observations
predicted class=email expected loss=0.1049784
class counts: 1654 194
probabilities: 0.895 0.105

Node number 9: 204 observations, complexity param=0.03145695
predicted class=spam expected loss=0.377451
class counts: 77 127
probabilities: 0.377 0.623
left son=18 (100 obs) right son=19 (104 obs)
Primary splits:
A.57 < 63.5 to the left, improve=38.32332, (0 missing)
A.56 < 10.5 to the left, improve=35.46062, (0 missing)
A.16 < 0.105 to the left, improve=26.36195, (0 missing)
A.55 < 2.6405 to the left, improve=25.78796, (0 missing)
A.45 < 0.585 to the right, improve=13.69954, (0 missing)
Surrogate splits:
A.56 < 12.5 to the left, agree=0.887, adj=0.77, (0 split)
A.55 < 2.6695 to the left, agree=0.794, adj=0.58, (0 split)
A.21 < 0.23 to the left, agree=0.735, adj=0.46, (0 split)
A.16 < 0.105 to the left, agree=0.716, adj=0.42, (0 split)
A.3 < 0.06 to the left, agree=0.691, adj=0.37, (0 split)

Node number 18: 100 observations
predicted class=email expected loss=0.31
class counts: 69 31
probabilities: 0.690 0.310

Node number 19: 104 observations
predicted class=spam expected loss=0.07692308
class counts: 8 96
probabilities: 0.077 0.923

n= 3066

node), split, n, loss, yval, (yprob)
* denotes terminal node

1) root 3066 1208 email (0.60600130 0.39399870)
2) A.53< 0.0395 2260 509 email (0.77477876 0.22522124)
4) A.7< 0.065 2052 321 email (0.84356725 0.15643275)
8) A.52< 0.5085 1848 194 email (0.89502165 0.10497835) *
9) A.52>=0.5085 204 77 spam (0.37745098 0.62254902)
18) A.57< 63.5 100 31 email (0.69000000 0.31000000) *
19) A.57>=63.5 104 8 spam (0.07692308 0.92307692) *
5) A.7>=0.065 208 20 spam (0.09615385 0.90384615) *
3) A.53>=0.0395 806 107 spam (0.13275434 0.86724566)
6) A.25>=0.36 67 7 email (0.89552239 0.10447761) *
7) A.25< 0.36 739 47 spam (0.06359946 0.93640054) *
>
>

>
>
> # visualize it (gets difficult for bigger trees)
> # NOTE(review): filename='' is presumably used so that post() draws on the
> # current graphics device instead of writing a file -- confirm in the rpart docs
> post(rpart.spam, filename='')
>
>

>


> # predict the labels for the test set
> # predicted label = name of the column of predict.spam holding the row-wise maximum
> predict.spam = predict(rpart.spam, test.set)
> plabels.spam = colnames(predict.spam)[apply(predict.spam, 1, which.max)]
>
> # compute the various measures of accuracy
>
> classification.summary = function(plabels, tlabels) {
+
+ # Summarise how well predicted spam/email labels match the true labels.
+ #
+ # plabels: predicted labels, each 'spam' or 'email'
+ # tlabels: true labels, in the same order and of the same length as plabels
+ #
+ # Returns a list: accuracy (A), the counts TP, FP, TN and FN, the 2x2
+ # confusion matrix (C), sensitivity (sens) and specificity (spec).
+ #
+ # All counts are taken from the plabels argument, so the result does not
+ # depend on any variable that happens to exist in the calling session.
+
+ # guard against silent recycling when the two label vectors differ in length
+
+ stopifnot(length(plabels) == length(tlabels))
+
+ # true positives: things we labelled spam that are spam
+
+ TP = sum(plabels == 'spam' & tlabels == 'spam')
+
+ # false positives: things we labelled spam that are email
+
+ FP = sum(plabels == 'spam' & tlabels == 'email')
+
+ # true negatives: things we labelled email that are email
+
+ TN = sum(plabels == 'email' & tlabels == 'email')
+
+ # false negatives: things we labelled email that are spam
+
+ FN = sum(plabels == 'email' & tlabels == 'spam')
+
+ # accuracy: share of all messages that were labelled correctly
+
+ A = (TP+TN) / (TP+TN+FP+FN)
+
+ # sensitivity: share of the true spam that we labelled spam
+
+ sens = TP / (TP+FN)
+
+ # specificity: share of the true email that we labelled email
+
+ spec = TN / (TN+FP)
+
+ # precision: share of the messages we labelled spam that really are spam
+ # (computed for reference only; it is not part of the returned list)
+
+ prec = TP / (TP+FP)
+
+ # confusion matrix; matrix() fills column by column, so rows are the
+ # truth and columns are the prediction
+
+ C = matrix(c(TP,FP,FN,TN),2,2)
+ colnames(C) = c('predicted spam', 'predicted email')
+ rownames(C) = c('truly spam', 'truly email')
+
+ return(list(A=A,TP=TP,FP=FP,TN=TN,FN=FN,C=C,sens=sens,spec=spec))
+ }
> # compare the predicted test-set labels with the true labels and show all measures
> s = classification.summary(plabels.spam, test.set$spam)
> print(s)
$A
[1] 0.8879479

$TP
[1] 481

$FP
[1] 48

$TN
[1] 882

$FN
[1] 124

$C
predicted spam predicted email
truly spam 481 124
truly email 48 882

$sens
[1] 0.7950413

$spec
[1] 0.9483871

>
>
>

>
>
> # you can control some aspects of the tree building process
> # with rpart.control
> #
> # cp=0.00001 is far below the 0.01 at which the CP table of the first
> # tree stops, so many more splits are kept; xval=20 is passed on as the
> # cross-validation setting of rpart.control
>
> rpart.spam.deeper = rpart(spam ~ ., data=train.set, method="class",
+ parms=list(split="gini"), control=rpart.control(cp=0.00001, xval=20))
>
> # plot the tree that was just fitted with the relaxed settings
>
> post(rpart.spam.deeper, filename='')
>
>

>


> # let's look at the stability of the tree
>

>
>
> # resample 1: draw a fresh 2/3 - 1/3 split (overwrites train.set and test.set)
> train.spam = sample(spam.sub,floor(length(spam.sub)*2/3))
> train.email = sample(nospam.sub,floor(length(nospam.sub)*2/3))
> train = c(train.spam,train.email)
> train.set = spam[train,]
> test.set = spam[-train,]
> # refit with the same settings (overwrites rpart.spam) and plot for comparison
> rpart.spam = rpart(spam ~ ., data=train.set, method="class",
+ parms=list(split="gini"))
> post(rpart.spam, filename='')
>
>

>


>
> # resample 2: draw a fresh 2/3 - 1/3 split (overwrites train.set and test.set)
> train.spam = sample(spam.sub,floor(length(spam.sub)*2/3))
> train.email = sample(nospam.sub,floor(length(nospam.sub)*2/3))
> train = c(train.spam,train.email)
> train.set = spam[train,]
> test.set = spam[-train,]
> # refit with the same settings (overwrites rpart.spam) and plot for comparison
> rpart.spam = rpart(spam ~ ., data=train.set, method="class",
+ parms=list(split="gini"))
> post(rpart.spam, filename='')
>
>

>


>
> # resample 3: draw a fresh 2/3 - 1/3 split (overwrites train.set and test.set)
> train.spam = sample(spam.sub,floor(length(spam.sub)*2/3))
> train.email = sample(nospam.sub,floor(length(nospam.sub)*2/3))
> train = c(train.spam,train.email)
> train.set = spam[train,]
> test.set = spam[-train,]
> # refit with the same settings (overwrites rpart.spam) and plot for comparison
> rpart.spam = rpart(spam ~ ., data=train.set, method="class",
+ parms=list(split="gini"))
> post(rpart.spam, filename='')
>
>

>


>
> # resample 4: draw a fresh 2/3 - 1/3 split (overwrites train.set and test.set)
> train.spam = sample(spam.sub,floor(length(spam.sub)*2/3))
> train.email = sample(nospam.sub,floor(length(nospam.sub)*2/3))
> train = c(train.spam,train.email)
> train.set = spam[train,]
> test.set = spam[-train,]
> # refit with the same settings (overwrites rpart.spam) and plot for comparison
> rpart.spam = rpart(spam ~ ., data=train.set, method="class",
+ parms=list(split="gini"))
> post(rpart.spam, filename='')
>
>

>


>
> # resample 5: draw a fresh 2/3 - 1/3 split (overwrites train.set and test.set)
> train.spam = sample(spam.sub,floor(length(spam.sub)*2/3))
> train.email = sample(nospam.sub,floor(length(nospam.sub)*2/3))
> train = c(train.spam,train.email)
> train.set = spam[train,]
> test.set = spam[-train,]
> # refit with the same settings (overwrites rpart.spam) and plot for comparison
> rpart.spam = rpart(spam ~ ., data=train.set, method="class",
+ parms=list(split="gini"))
> post(rpart.spam, filename='')
>
>

>


> # ROC curve for the most recent fit: sweep a cut-off over the 'spam' column of the predictions
> predict.spam = predict(rpart.spam, test.set)
> l = sort(unique(predict.spam[,'spam']))
> sens = c()
> spec = c()
> # one (sens, spec) pair per distinct cut-off; the loop overwrites the session-level plabels.spam and s
> for (ll in l) {
+ plabels.spam = rep('email', nrow(predict.spam))
+ plabels.spam[(predict.spam[,'spam'] >= ll)] = 'spam'
+ s = classification.summary(plabels.spam, test.set$spam)
+ sens = c(sens, s$sens)
+ spec = c(spec, s$spec)
+ }
> # add the two end points of the curve: (sens=1, spec=0) and (sens=0, spec=1)
> sens = c(1,sens,0)
> spec = c(0,spec,1)
> # red line: sens against 1-spec; dashed blue line: the diagonal with intercept 0 and slope 1
> plot(1-spec, sens, type='l', col='red', lwd=2)
> abline(0,1,lwd=2, lty=2, col='blue')
>
>

>


>
> proc.time()
user system elapsed
30.653 0.200 31.161
R script