-
Notifications
You must be signed in to change notification settings - Fork 113
/
Part6_CredScoring_DecisionTrees.R
155 lines (134 loc) · 5.06 KB
/
Part6_CredScoring_DecisionTrees.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
######################################################
# Part 5: DECISION TREES
# Apply Decision Trees Analysis
#
#
######################################################
# load package FactoMineR and ggplot2
require(ggplot2)
require(rpart)
# read cleaned data set
dd = read.csv("CleanCreditScoring.csv", header=TRUE, stringsAsFactors=TRUE)
# Let's obtain a decision tree with the function 'rpart'
# using all the variables (both continuous and categorical)
ct = rpart(Status ~ ., data=dd)
# let's see how the output looks like
ct
# how to read the output?
# "node), split, n, loss, yval, (yprob)"
# node): indicates the node number
# split: indicates the split criterion
# n: indicates the number of individuals in the groupe
# loss: indicates the the number of individuals misclassified
# yval: indicates the predicted value
# (yprob): indicates the probability of belonging to each class
# it's much easier to read a tree with a graphic
plot(ct, margin=0.05, compress=TRUE, main="Decision Tree")
text(ct, use.n=TRUE, pretty=1, all=TRUE, cex=0.7)
# one of the goals is to obtain a tree in which
# the nodes are as much homogenous as possible,
# but also a tree with good prediction ability
# In order to improve our decision tree, we need to have
# 1) a train (aka learning) dataset
# 2) a test dataset
# let's keep 2/3 of the data for learning, and 1/3 for testing
n = nrow(dd)
learn = sample(1:n, size=round(0.67 * n))
nlearn = length(learn)
ntest = n - nlearn
# selection of model by crossvalidation
# first we need a maximal tree with low value of cp
# and quiprobability of classes
ct1 = rpart(Status ~ ., data = dd[learn,], method="class",
parms = list(prior=c(0.50, 0.50), split='gini'),
control = rpart.control(cp=0.001, xval=10, maxdepth=15))
# check results of the complexity parameter table
ct1$cptable
# we can use the function 'plotcp' to see the results
# the 'best' tree is the one with the lowest xerror
plotcp(ct1, las=2, cex.axis=0.8)
# what is the minimum XERROR?
min(ct1$cptable[,4])
min.xe = which(ct1$cptable[,4] == min(ct1$cptable[,4]))
# the optimal tree corresponds to a cp=0.003
ct1$cptable[min.xe,]
# Now that we know that the 'best' tree has cp=0.03
# we can plugin that information in the parameters
ct2 = rpart(Status ~ .,
data = dd[learn,],
parms = list(prior=c(0.50, 0.50), split='gini'),
control = rpart.control(cp=0.00285, xval=0, maxdepth=15))
# plot
par(mar = c(1,1,2,0.5))
plot(ct2, margin=0.05, compress=TRUE, main="Decision Tree")
text(ct2, use.n=TRUE, pretty=1, all=TRUE, cex=0.5)
summary(ct2)
# calculate error rate in the learning sample
# (this will give a matrix)
ct2.learn = predict(ct2, data=ddtot[learn,])
# create a vector with predicted status
ct2.learnp = rep("", nlearn)
ct2.learnp[ct2.learn[,1] < 0.5] = "pred_neg"
ct2.learnp[ct2.learn[,1] >= 0.5] = "pred_pos"
# let's make a table
status_learn = table(dd$Status[learn], ct2.learnp)
# classification error
100 * sum(diag(status_learn)) / nlearn
# calculate error rate in the testing sample
# (this will give a matrix)
ct2.test = predict(ct2, newdata=ddtot[-learn,])
# create a vector with predicted status
ct2.testp = rep("", ntest)
ct2.testp[ct2.test[,1] < 0.5] = "pred_neg"
ct2.testp[ct2.test[,1] >= 0.5] = "pred_pos"
# let's make a table
status_test = table(dd$Status[-learn], ct2.testp)
# classification error
100 * sum(diag(status_test)) / ntest
# we'll repeat the same but changing the cp=0.002
ct3 = rpart(Status ~ .,
data = dd[learn,],
parms = list(prior=c(0.50, 0.50), split='gini'),
control = rpart.control(cp=0.002, xval=0, maxdepth=15))
par(mar = c(1,1,2,0.5))
plot(ct3, margin=0.05, compress=TRUE, main="Decision Tree")
text(ct3, use.n=TRUE, all=TRUE, cex=0.5)
# calculate error rate in the learning sample
# (this will give a matrix)
ct3.learn = predict(ct3, data=ddtot[learn,])
# create a vector with predicted status
ct3.learnp = rep("", nlearn)
ct3.learnp[ct3.learn[,1] < 0.5] = "pred_neg"
ct3.learnp[ct3.learn[,1] >= 0.5] = "pred_pos"
# let's make a table
table(dd$Status[learn], ct3.learnp)
# classification error
100 * sum(diag(table(dd$Status[learn], ct3.learnp))) / nlearn
# calculate error rate in the testing sample
# (this will give a matrix)
ct3.test = predict(ct3, newdata=ddtot[-learn,])
# create a vector with predicted status
ct3.testp = rep("", ntest)
ct3.testp[ct3.test[,1] < 0.5] = "pred_neg"
ct3.testp[ct3.test[,1] >= 0.5] = "pred_pos"
# let's make a table
table(dd$Status[-learn], ct3.testp)
# classification error
100 * sum(diag(table(dd$Status[-learn], ct3.testp))) / ntest
# concentration curve
# the positive predictions on the test sample
pred.test = ct2.test[,1]
# the number of individuals in each value
totn = table(-pred.test) / ntest
ac_totn = 100 * cumsim(as.numeric(totn))
# ranking the predictions
rank_pred.test = rank(pred.test)
# how many positive are in each leave?
Status.test = dd$Status[-learn]
table(Status.test)
npos = table(Status.test)[1]
tapply(Status.test == "good", ran_pred.test, sum)
ac_true.pos = 100 * cumsum(rev(as.numeric(tabla))) / npos
# ROC curve
nneg = ntest - npos
ac_fals.pos = 100 * cumsum(rev())