-
Notifications
You must be signed in to change notification settings - Fork 0
/
.Rhistory
300 lines (300 loc) · 12 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
# Step 1: Import the data ----
path <- 'https://raw.githubusercontent.com/guru99-edu/R-Programming/master/titanic_data.csv'
titanic <- read.csv(path)
str(titanic)
head(titanic)
tail(titanic)

# The data is not shuffled, and the train/test split further down simply takes
# the leading rows for training. Build a random permutation of the row indices
# and reorder the data with it so that split is not biased by the file order.
set.seed(678) # fixed seed: the same permutation is produced on every run
shuffle_index <- sample(seq_len(nrow(titanic)))
head(shuffle_index)

# Apply the permutation. Without this step the index is computed but never
# used, and the data stays in its original order.
titanic <- titanic[shuffle_index, ]
head(titanic)
# Step 2: Clean the dataset ----
library(dplyr)
str(titanic)

# Drop home.dest, cabin, name, x and ticket, then recode pclass and survived
# as labelled factors.
clean_titanic <- titanic %>%
  select(-home.dest, -cabin, -name, -x, -ticket) %>%
  mutate(
    pclass = factor(pclass, levels = c(1, 2, 3), labels = c("Upper", "Middle", "Lower"))
  ) %>%
  mutate(
    survived = factor(survived, levels = c(0, 1), labels = c("No", "Yes"))
  )
str(clean_titanic)
head(clean_titanic)
tail(clean_titanic)

# Count, column by column, how many entries are the "?" placeholder.
dirty <- vapply(
  seq_len(ncol(clean_titanic)),
  function(col_idx) sum(clean_titanic[, col_idx] == "?"),
  FUN.VALUE = integer(1)
)
dirty

# Keep only the rows in which age, fare and embarked are all different from "?".
clean_titanic <- clean_titanic %>%
  filter(age != "?" & fare != "?" & embarked != "?")
str(clean_titanic)
head(clean_titanic)
tail(clean_titanic)
#' Split a dataset into a leading training part and a trailing test part
#'
#' The first `size` share of the rows (in their current order) forms the
#' training set and the remaining rows form the test set. No shuffling is
#' done here, so `data` should already be in random order.
#'
#' @param data A data frame (or matrix) to split by rows.
#' @param size Share of rows assigned to the training set, between 0 and 1.
#' @param train If `TRUE` return the training rows, otherwise the test rows.
#' @return The selected rows of `data`.
create_train_test <- function(data, size = 0.8, train = TRUE) {
  stopifnot(
    is.numeric(size), length(size) == 1L, !is.na(size),
    size >= 0, size <= 1
  )
  n_row <- nrow(data)
  # Whole number of training rows. The small tolerance keeps the count equal
  # to what `1:(size * n_row)` produced for products that land a hair below
  # an integer because of floating-point rounding.
  n_train <- floor(size * n_row + 1e-7)
  # A logical mask stays correct when n_train is 0: `1:0` would select row 1,
  # and `data[-integer(0), ]` would select no rows at all.
  in_train <- seq_len(n_row) <= n_train
  if (train) {
    data[in_train, ]
  } else {
    data[!in_train, ]
  }
}
# The same helper returns the training rows for train = TRUE and the test rows
# for train = FALSE.
data_train <- create_train_test(data = clean_titanic, size = 0.8, train = TRUE)
data_test <- create_train_test(data = clean_titanic, size = 0.8, train = FALSE)
dim(data_train)
dim(data_test)
# Compare the class shares of `survived` in both parts with prop.table() on
# top of table(); similar shares suggest the split is reasonably random.
prop.table(table(data_train[["survived"]]))
prop.table(table(data_test[["survived"]]))
#n = nrow(clean_titanic)
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#prop.table(table(train_set$survived))
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#prop.table(table(train_set$survived))
#prop.table(table(test_set$survived))
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#prop.table(table(train_set$survived))
#prop.table(table(test_set$survived))
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#prop.table(table(train_set$survived))
#prop.table(table(test_set$survived))
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#prop.table(table(train_set$survived))
#prop.table(table(test_set$survived))
# Step 4: Build the DT model ----
library(rpart) # build decision tree
library(rpart.plot) # draw decision tree

str(data_train)
# age and fare held "?" placeholders before cleaning, so they are not numeric
# yet. Convert them in both sets BEFORE fitting: a tree fitted on the
# unconverted columns produced the overly compact plot.
data_train$fare <- as.numeric(as.character(data_train$fare))
data_train$age <- as.numeric(as.character(data_train$age))
data_test$fare <- as.numeric(as.character(data_test$fare))
data_test$age <- as.numeric(as.character(data_test$age))

fit <- rpart(survived ~ ., data = data_train, method = "class") # dot means all other columns
fit # the printed tree is hard to read, so plot it as well
rpart.plot(fit, extra = 106) # 106 is used for binary prediction

# Step 5: Make prediction ----
# Drop the response by name rather than by position so the call keeps working
# if the column order changes.
predict_unseen <- predict(
  fit,
  data_test[names(data_test) != "survived"],
  type = "class"
)
predict_unseen

# Step 6: Measure performance ----
# Confusion matrix: actual outcome in rows, predicted outcome in columns.
table_mat <- table(data_test$survived, predict_unseen)
table_mat
# Inspect some misclassified passengers next to the first test rows.
head(data_test[which(predict_unseen != data_test$survived), ])
head(data_test)
# Accuracy = correctly classified (diagonal) / all classified.
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
print(paste('Accuracy for test', accuracy_Test))
### Random Forest
library(randomForest)

# Fit a forest of `survived` on all other columns of the training set.
titanic_RF <- randomForest(survived ~ ., data = data_train)

# Classify the test rows and cross-tabulate actual (rows) against predicted
# (columns) outcomes.
predict_RF <- predict(object = titanic_RF, newdata = data_test)
table_RF <- table(data_test$survived, predict_RF)
table_RF

# Share of test rows on the diagonal, i.e. classified correctly.
print(
  paste(
    "Accuracy for randon forest test",
    sum(diag(table_RF)) / sum(table_RF)
  )
)
# Install the plotting and forest packages only when they are missing, and do
# it before any library() call, instead of re-installing them on every run.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}

# Step 4: Build the DT model ----
library(rpart) # build decision tree
library(rpart.plot) # draw decision tree

str(data_train)
# Make sure age and fare are numeric in both sets before fitting; the
# conversion leaves columns that are already numeric unchanged.
data_train$fare <- as.numeric(as.character(data_train$fare))
data_train$age <- as.numeric(as.character(data_train$age))
data_test$fare <- as.numeric(as.character(data_test$fare))
data_test$age <- as.numeric(as.character(data_test$age))

fit <- rpart(survived ~ ., data = data_train, method = "class") # dot means all other columns
fit # the printed tree is hard to read, so plot it as well
rpart.plot(fit, extra = 106) # 106 is used for binary prediction

# Step 5: Make prediction ----
# Drop the response by name rather than by position.
predict_unseen <- predict(
  fit,
  data_test[names(data_test) != "survived"],
  type = "class"
)
predict_unseen

# Step 6: Measure performance ----
# Confusion matrix: actual outcome in rows, predicted outcome in columns.
table_mat <- table(data_test$survived, predict_unseen)
table_mat
head(data_test[which(predict_unseen != data_test$survived), ])
head(data_test)
# Accuracy = correctly classified (diagonal) / all classified.
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
print(paste('Accuracy for test', accuracy_Test))

### Random Forest
library(randomForest)
titanic_RF <- randomForest(survived ~ ., data = data_train)
## Load the iris data. data() returns the dataset *name* as a string, so the
## data frame itself is assigned to df in a separate step.
data(iris)
df <- iris
head(iris) ## see the structure
set.seed(123)
## Draw row indices for 90% of the rows; these rows form the training set.
ran <- sample(seq_len(nrow(iris)), 0.9 * nrow(iris))
summary(iris)
## Min-max normalization.
## Rescales a vector x so that its minimum value becomes zero and its maximum
## value becomes one, by subtracting the minimum from each value and dividing
## by the range of x.
##   x      numeric vector to rescale
##   na.rm  if TRUE, missing values are ignored when finding the minimum and
##          maximum (they stay NA in the result); the default FALSE makes any
##          NA in x turn the whole result into NA
nor <- function(x, na.rm = FALSE) {
  lo <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lo
  # A constant vector has zero range; return zeros instead of 0 / 0 = NaN.
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}
## Run normalization on the first 4 columns of the dataset because they are
## the predictors.
iris_norm <- as.data.frame(lapply(iris[, 1:4], nor))
summary(iris_norm)
## Extract the training and testing predictors using the sampled row indices.
iris_train <- iris_norm[ran, ]
iris_test <- iris_norm[-ran, ]
## The 5th column holds the labels: the training labels are passed to knn()
## as the 'cl' (label or target) argument, the test labels are kept to
## measure the accuracy.
iris_target_category <- iris[ran, 5]
iris_test_category <- iris[-ran, 5]
## Load the package class, which provides knn() (see ?knn for its arguments).
library(class)
## Run the knn function with its default number of neighbours.
pr <- knn(iris_train, iris_test, cl = iris_target_category)
## Show the predictions.
pr
## Create the confusion matrix: predictions in rows, true labels in columns.
tab <- table(pr, iris_test_category)
print(tab)
## Accuracy in percent: each diagonal cell of the confusion matrix x (a correct
## prediction count) is divided by the total number of predictions, and the
## resulting shares are summed.
accuracy <- function(x) {
  total <- sum(rowSums(x))
  shares <- diag(x) / total
  sum(shares) * 100
}
accuracy(tab)
# Alternative check: mean(iris_test_category == pr)
plot(pr)
# Same plot of the predicted classes, drawn in red.
plot(pr, col = "red")
# Rule of thumb for choosing k: the square root of the number of training rows.
sqrt(nrow(iris_train))
# Let 12 neighbours take part in the majority vote.
pr_12 <- knn(
  train = iris_train,
  test = iris_test,
  cl = iris_target_category,
  k = 12
)
pr_12
tab_12 <- table(pr_12, iris_test_category)
accuracy(tab_12)
##--------------------------------
## caret provides confusionMatrix(). Install it only when it is missing, and
## before library() is called, so the script neither fails on a fresh machine
## nor re-installs the package on every run.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret)
## Confusion matrix and summary statistics for the knn predictions against the
## true test labels.
result <- confusionMatrix(pr, iris_test_category)
result
#Ref https://towardsdatascience.com/k-nearest-neighbors-algorithm-with-examples-in-r-simply-explained-knn-1f2c88da405c
## ggplot2 is loaded because the diamonds dataset lives in that package.
library(ggplot2)
## Load the data.
data("diamonds")
## Store it as a data frame.
dia <- data.frame(diamonds)
set.seed(123)
## Sample row indices amounting to 80% of the total number of rows.
n_dia_rows <- nrow(dia)
ran <- sample(seq_len(n_dia_rows), 0.8 * n_dia_rows)
rm(n_dia_rows)
## Min-max normalization: rescales x so that its minimum becomes 0 and its
## maximum becomes 1.
##   x      numeric vector to rescale
##   na.rm  if TRUE, missing values are ignored when finding the minimum and
##          maximum (they stay NA in the result); the default FALSE makes any
##          NA in x turn the whole result into NA
nor <- function(x, na.rm = FALSE) {
  lo <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lo
  # A constant vector has zero range; return zeros instead of 0 / 0 = NaN.
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}
## Apply the normalization to the predictor columns (positions 1 and 5 to 10).
dia_nor <- as.data.frame(lapply(dia[, c(1, 5:10)], nor))
## Training and test predictors, split by the sampled row indices.
dia_train <- dia_nor[ran, ]
dia_test <- dia_nor[-ran, ]
str(dia)
## The 2nd column is what we want to predict: its training rows are the labels
## given to knn(), its test rows are the actual values the predictions are
## compared with. The intent is to turn the ordered factor into a normal
## factor; as.factor() returns an existing factor unchanged, so the ordering
## is dropped explicitly with factor(..., ordered = FALSE) while keeping the
## full set of levels.
dia_target <- factor(dia[ran, 2], levels = levels(dia[, 2]), ordered = FALSE)
test_target <- factor(dia[-ran, 2], levels = levels(dia[, 2]), ordered = FALSE)
## Run the knn function.
library(class)
# prob = TRUE makes knn() also record the proportion of votes for the winning
# class of each prediction.
pr <- knn(dia_train, dia_test, cl = dia_target, k = 20, prob = TRUE)
# The vote proportions are stored in the "prob" attribute of the predictions.
prob <- attr(pr, "prob")
# Examine the first several predictions.
head(pr)
# Examine the proportion of votes for the winning class.
head(prob)
## Create the confusion matrix: predictions in rows, actual values in columns.
tb <- table(pr, test_target)
## Accuracy in percent: correct predictions (the diagonal) divided by the total
## number of predictions.
accuracy <- function(x) {
  sum(diag(x) / (sum(rowSums(x)))) * 100
}
accuracy(tb)
plot(pr)