-
Notifications
You must be signed in to change notification settings - Fork 1
/
PreModWorkshop1_logistic.R
239 lines (175 loc) · 6.84 KB
/
PreModWorkshop1_logistic.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#####################################
# Predictive modeling in R I
# Logistic regression
library(effects) # plotting effects
library(tidyverse)
library(caret)
library(gridExtra) # arrange ggplots
library(ISLR2) # dataset in Introduction to Statistical Learning, Second Edition
library(rsample) # split dataset
data("Default") # load the dataset in ISLR2
?Default # credit card default data
# split the data randomly into training and test
set.seed(1234) # ensure we get the same split
split = initial_split(Default, prop = .7) # 70% training, 30% testing
train = training(split)
test = testing(split)
# fit logistic regression models
mod_log <- glm(default ~ student + balance + income, family = "binomial",
data = train)
summary(mod_log)
# interaction model
mod_inter <- glm(default ~ student*balance*income, family = "binomial",
data = train)
summary(mod_inter)
# do backward selection with model that has all pairwise combination
mod_step <- step(mod_inter, direction = "backward")
# backward selection using AIC as the metric
# AIC: scores models in terms of potential predictive performance
# small AICs are better than bigger ones
summary(mod_step)
# model selected by backward stepwise selection
# has variables student, balance, and income
# interaction: student and income
###
# Regularized logistic regression
reg_logistic <- train(
default ~ student*balance*income,
train,
preProc = c("center", "scale", "zv"),
method = "glmnet",
family = "binomial", # for logistic model, add this parameter; for OLS, remove it
trControl = trainControl(
method = "cv",
number = 10
)
tuneLength = 10
)
# find coefficients
coef(reg_logistic$finalModel, reg_logistic$finalModel$lambdaOpt)
# Four models
# mod_log: logistic model with additive effects
# mod_inter: logisitc model with interactions
# mod_step: backward selection outupt with logistic model
# reg_logistic:regularized logistic model
# compare the predictive perofrmance of the models
# in the test set
test = test %>% mutate(pred_log = predict(mod_log, newdata = test, type = "response"),
pred_inter = predict(mod_inter, newdata = test, type = "response"),
pred_step = predict(mod_step, newdata = test, type= "response"),
pred_reg = predict(reg_logistic, newdata = test, type = "prob")$Yes)
# reg_logistic is fitted using the 'train' function and is different
# predict(reg_logistic, newdata = test, type = "prob") %>% select(Yes)
head(test)
# find boxplot of prob predictions versus observed outcomes
ggplot(test) +
aes(x = default, y = pred_log) +
geom_boxplot()
# the predicted probabilities of default
# are higher for those who defaulted
# than for those who didn't
ggplot(test) +
aes(x = default, y = pred_inter) +
geom_boxplot()
ggplot(test) +
aes(x = default, y = pred_step) +
geom_boxplot()
ggplot(test) +
aes(x = default, y = pred_reg) +
geom_boxplot()
# find grouped by means and sd of prob predictions versus observed outcomes
# additive model
test %>% group_by(default) %>% summarize(avg_pred = mean(pred_add),
sd_pred = sd(pred_add))
# avg predicted prob. default for those who do not default: .0221
# avg predicted prob. default for those who default: .330
test %>% group_by(default) %>% summarize(avg_pred = mean(pred_log),
sd_pred = sd(pred_log))
test %>% group_by(default) %>% summarize(avg_pred = mean(pred_inter),
sd_pred = sd(pred_inter))
test %>% group_by(default) %>% summarize(avg_pred = mean(pred_step),
sd_pred = sd(pred_step))
test %>% group_by(default) %>% summarize(avg_pred = mean(pred_reg),
sd_pred = sd(pred_reg))
# find confusion matrix for threshold .5
# compare observed outcomes against predicted outcomes
# so far we got probabilities of default
# we don't have predicted outcomes
# outcomes: "Yes" or "No"
# in the absence of any prior information, .5 is a reasonable threshold
# find these predicted outcomes given the
# predicted probabilities of default
out_log = factor(ifelse(test$pred_log > 0.5, "Yes", "No"))
out_inter = factor(ifelse(test$pred_inter > 0.5, "Yes", "No"))
out_step = factor(ifelse(test$pred_step > 0.5, "Yes", "No"))
out_reg = factor(ifelse(test$pred_reg > 0.5, "Yes", "No"))
# I can compare my predictions against the outcomes
# ?confusionMatrix()
confusionMatrix(out_log, test$default, positive = "Yes")
confusionMatrix(out_inter, test$default, positive = "Yes")
confusionMatrix(out_reg, test$default, positive = "Yes")
# conclusion: these models have similar performance
### Tips:
# in backward selection, by default it uses AIC for selecting models
# if you use BIC, can do
# k = log(n); n is the sample size
###################################################
# Practice: using the Boston housing data in ISLR2
# A data set containing housing values in 506 suburbs of Boston.
?Boston
data(Boston)
glimpse(Boston)
# chas should be converted into factor (encoded as categorical variable)
Boston$chas <- as.factor(Boston$chas)
# Outcome variable
Boston <- Boston %>%
mutate(highcrim = as.factor(ifelse(crim > median(crim), "yes", "no")))
# Exclude "crim"
Boston <- Boston %>% select(-crim)
#######################################################
# Task: predict if a suburb has above median crime rate
# set seed
set.seed(1234)
split <- initial_split(Boston, prop = .7)
train <- training(split)
test <- testing(split)
# fit a logistic model
mod_log <- glm(highcrim ~., data = train, family = 'binomial')
summary(mod_log)
# regularized logistic regression
reg_logistic <- train(
highcrim ~.,
train,
preProc = c("center", "scale", "zv"),
method = "glmnet",
family = "binomial",
trControl = trainControl(
method = "cv",
number = 10
)
)
reg_logistic$bestTune
# find variable importance
vip(reg_logistic)
# predict
test = test %>%
mutate(
pred_reg = predict(reg_logistic, newdata = test, type = "prob")$yes,
pred_log = predict(mod_log, newdata = test, type = "response"))
# find boxplot of prob predictions versus observed outcomes
ggplot(test) +
aes(x = highcrim, y = pred_reg) +
geom_boxplot()
ggplot(test) +
aes(x = highcrim, y = pred_log) +
geom_boxplot()
# convert predicted probabilities to predicted outcomes
out_reg <- factor(ifelse(test$pred_reg > 0.5, "yes", "no"))
out_log <- factor(ifelse(test$pred_log > 0.5, "yes", "no"))
# compare predicted outcomes ('yes' or 'no')
# against observed outcomes via a confusion matrix
confusionMatrix(out_reg, test$highcrim, positive = "yes")
confusionMatrix(out_log, test$highcrim, positive = "yes")
# regularized rgression doesn't do well:
# all predictors are independent
# when we have moderate number of predictors and most of them are useful