# .Rhistory - saved R console history: Titanic decision tree / random forest, then kNN on iris and diamonds

# Step 1: Import the data
# Read the Titanic CSV directly from its GitHub URL.
path <- "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/titanic_data.csv"
titanic <- read.csv(file = path)
# Inspect the column types, then the first and the last rows.
str(titanic)
head(titanic)
tail(titanic)
# The rows are not shuffled, and the train/test split further down cuts the
# data by row position, so the rows must be put in random order first.
# Draw a random permutation of the row indices 1..nrow(titanic).
set.seed(678) # fixed seed: the same permutation is drawn on every run
shuffle_index <- sample(seq_len(nrow(titanic)))
head(shuffle_index)
# Reorder the data with the permutation; drawing the index alone leaves the
# rows in their original order.
titanic <- titanic[shuffle_index, ]
head(titanic)
# Step 2: Clean the dataset
library(dplyr)
str(titanic)
# First drop the columns home.dest, cabin, name, x and ticket.
clean_titanic <- select(titanic, -c(home.dest, cabin, name, x, ticket))
# Then turn pclass and survived into labelled factors.
clean_titanic <- mutate(
  clean_titanic,
  pclass = factor(
    pclass,
    levels = c(1, 2, 3),
    labels = c("Upper", "Middle", "Lower")
  ),
  survived = factor(survived, levels = c(0, 1), labels = c("No", "Yes"))
)
# Check the result: structure, first rows, last rows.
str(clean_titanic)
head(clean_titanic)
tail(clean_titanic)
# Count, for every column, the number of rows whose value is "?".
# vapply() over the columns keeps the column names on the result and always
# yields an integer vector; na.rm = TRUE keeps a single NA in a column from
# turning that column's count into NA.
dirty <- vapply(
  clean_titanic,
  function(column) sum(column == "?", na.rm = TRUE),
  integer(1)
)
dirty
# Remove every row that has "?" in age, fare or embarked.
clean_titanic <- filter(
  clean_titanic,
  age != "?" & fare != "?" & embarked != "?"
)
# Check the result: structure, first rows, last rows.
str(clean_titanic)
head(clean_titanic)
tail(clean_titanic)
# Split `data` by row position into a leading training part and a trailing
# test part. Rows are taken in their current order (no shuffling happens here).
#
# data  : data frame (or matrix) to split.
# size  : fraction of the rows that goes to the training part. A fraction that
#         yields no whole row gives an empty training part; a fraction above 1
#         puts every row in the training part.
# train : if TRUE the training rows are returned, otherwise the test rows.
#
# Returns the selected rows of `data`, always with its columns intact
# (drop = FALSE), so one-column input is not collapsed to a vector.
create_train_test <- function(data, size = 0.8, train = TRUE) {
  n_row <- nrow(data)
  # Whole number of training rows. The small tolerance keeps products such as
  # 0.29 * 100 (stored as 28.999999999999996) from losing a row to floor().
  n_train <- floor(size * n_row + sqrt(.Machine$double.eps))
  # A logical mask stays correct when n_train is 0; negative indexing with an
  # empty index vector would select no rows at all for the test part.
  in_train <- seq_len(n_row) <= n_train
  if (train == TRUE) {
    data[in_train, , drop = FALSE]
  } else {
    data[!in_train, , drop = FALSE]
  }
}
# Build both partitions with the same 0.8 fraction: train = TRUE gives the
# training rows, train = FALSE gives the test rows.
data_train <- create_train_test(data = clean_titanic, size = 0.8, train = TRUE)
data_test <- create_train_test(data = clean_titanic, size = 0.8, train = FALSE)
dim(data_train)
dim(data_test)
# Verify the randomization: prop.table() on table() shows the share of each
# survived class in the two partitions.
prop.table(table(data_train[["survived"]]))
prop.table(table(data_test[["survived"]]))
#n = nrow(clean_titanic)
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#prop.table(table(train_set$survived))
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#prop.table(table(train_set$survived))
#prop.table(table(test_set$survived))
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#prop.table(table(train_set$survived))
#prop.table(table(test_set$survived))
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#prop.table(table(train_set$survived))
#prop.table(table(test_set$survived))
#n = nrow(clean_titanic)
#trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
#train_set = clean_titanic[trainIndex ,]
#test_set = clean_titanic[-trainIndex ,]
#prop.table(table(train_set$survived))
#prop.table(table(test_set$survived))
# Step 4: Build the DT model
library(rpart)      # fits the decision tree
library(rpart.plot) # draws the decision tree
# First fit: survived against all other columns (the dot in the formula).
fit <- rpart(survived ~ ., data = data_train, method = "class")
fit # the printed tree is hard to read, so it is plotted instead
#rpart.plot(fit, extra = 106, fallen.leaves = T, cex = 1)
rpart.plot(fit, extra = 106) # 106 is used for binary prediction
# The plot comes out very compact; check the column types.
str(data_train)
# Convert age and fare to numbers in both the training and the test set.
# NOTE(review): presumably they were read as text/factor because of the "?"
# entries - confirm against the str() output above.
data_train[["fare"]] <- as.numeric(as.character(data_train[["fare"]]))
data_train[["age"]] <- as.numeric(as.character(data_train[["age"]]))
data_test[["fare"]] <- as.numeric(as.character(data_test[["fare"]]))
data_test[["age"]] <- as.numeric(as.character(data_test[["age"]]))
# Second fit, now on the converted columns.
fit <- rpart(survived ~ ., data = data_train, method = "class")
rpart.plot(fit, extra = 106)
# Step 5: Make prediction
# Predict the class of every test row; column 2 is left out of the new data.
predict_unseen <- predict(fit, data_test[-2], type = "class")
predict_unseen
# Step 6: Measure performance
# Cross-tabulate the recorded survived value (rows) against the predicted
# class (columns).
table_mat <- table(data_test$survived, predict_unseen)
table_mat
# First few test rows whose prediction differs from the recorded value,
# followed by the first few test rows for comparison.
head(data_test[which(predict_unseen != data_test$survived), ])
head(data_test)
# Accuracy: the diagonal of the table (agreeing cases) over all cases.
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
print(paste("Accuracy for test", accuracy_Test))
### Random Forest
library(randomForest)
# Growing the forest involves random sampling; fix the seed (same value as
# used for the shuffle) so the reported accuracy is reproducible.
set.seed(678)
titanic_RF <- randomForest(survived ~ ., data = data_train)
predict_RF <- predict(titanic_RF, data_test)
# Recorded survived value (rows) against the forest's prediction (columns).
table_RF <- table(data_test$survived, predict_RF)
table_RF
# Accuracy: diagonal of the table over all test cases.
print(paste("Accuracy for random forest test", sum(diag(table_RF)) / sum(table_RF)))
# Step 4: Build the DT model
# Install rpart.plot only when it is missing, and before it is loaded; an
# unconditional install.packages() downloads the package on every run.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
library(rpart) # build decision tree
library(rpart.plot) # draw decision tree
fit <- rpart(survived ~ ., data = data_train, method = "class") # dot means all columns
fit # the printed tree is hard to read, so it is plotted instead
#rpart.plot(fit, extra = 106, fallen.leaves = T, cex = 1)
rpart.plot(fit, extra = 106) # 106 is used for binary prediction
# Step 4: Build the DT model
library(rpart)      # fits the decision tree
library(rpart.plot) # draws the decision tree
# Fit survived against all other columns (the dot in the formula).
fit <- rpart(survived ~ ., data = data_train, method = "class")
fit # the printed tree is hard to read, so it is plotted instead
#rpart.plot(fit, extra = 106, fallen.leaves = T, cex = 1)
rpart.plot(fit, extra = 106) # 106 is used for binary prediction
# The plot comes out very compact; check the column types.
str(data_train)
# Convert age and fare to numbers in both the training and the test set.
data_train[["fare"]] <- as.numeric(as.character(data_train[["fare"]]))
data_train[["age"]] <- as.numeric(as.character(data_train[["age"]]))
data_test[["fare"]] <- as.numeric(as.character(data_test[["fare"]]))
data_test[["age"]] <- as.numeric(as.character(data_test[["age"]]))
# Fit again on the converted columns.
fit <- rpart(survived ~ ., data = data_train, method = "class")
rpart.plot(fit, extra = 106)
# Step 5: Make prediction
# Predict the class of every test row; column 2 is left out of the new data.
predict_unseen <- predict(fit, data_test[-2], type = "class")
predict_unseen
# Step 6: Measure performance
# Cross-tabulate the recorded survived value (rows) against the predicted
# class (columns).
table_mat <- table(data_test$survived, predict_unseen)
table_mat
# First few test rows whose prediction differs from the recorded value,
# followed by the first few test rows for comparison.
head(data_test[which(predict_unseen != data_test$survived), ])
head(data_test)
# Accuracy: the diagonal of the table (agreeing cases) over all cases.
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
print(paste("Accuracy for test", accuracy_Test))
### Random Forest
# Install randomForest only when it is missing, and before it is loaded; an
# unconditional install.packages() downloads the package on every run.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)
# Growing the forest involves random sampling; fix the seed so the fit is
# reproducible.
set.seed(678)
titanic_RF <- randomForest(survived ~ ., data = data_train)
## Load the iris data set. data() returns only the data set's name (the string
## "iris"), so the data frame itself is copied into df separately.
data(iris)
df <- iris
head(iris) ## see the structure
set.seed(123)
## Draw, at random and without replacement, 90% of the row indices of the
## data set; these rows form the training set.
ran <- sample(seq_len(nrow(iris)), 0.9 * nrow(iris))
summary(iris)
## Min-max normalization.
## Rescales a numeric vector x so that its minimum becomes 0 and its maximum
## becomes 1: the minimum is subtracted from every value and the result is
## divided by the range (max - min) of x.
##
## x : numeric vector.
## Returns a vector of the same length as x. A vector whose values are all
## equal has range 0; it is mapped to all zeros instead of dividing by zero.
## An empty vector is returned unchanged. NA values in x propagate to the
## whole result, as min()/max() return NA.
nor <- function(x) {
  if (length(x) == 0L) {
    return(x)
  }
  lowest <- min(x)
  span <- max(x) - lowest
  # isTRUE() keeps an NA span (x contains NA) on the ordinary division path.
  if (isTRUE(span == 0)) {
    return(x - lowest)
  }
  (x - lowest) / span
}
## Normalize the first 4 columns of the data set, which are the predictors.
iris_norm <- as.data.frame(lapply(iris[, 1:4], nor))
summary(iris_norm)
## Training set: the sampled rows.
iris_train <- iris_norm[ran, ]
## Testing set: all remaining rows.
iris_test <- iris_norm[-ran, ]
## 5th column for the training rows; passed to knn() as 'cl' (label/target).
iris_target_category <- iris[ran, 5]
## 5th column for the testing rows; used to measure the accuracy.
iris_test_category <- iris[-ran, 5]
## Load the package 'class' and open the help page of knn().
library(class)
?knn
## Normalize the first 4 columns of the data set, which are the predictors.
iris_norm <- as.data.frame(lapply(iris[, 1:4], nor))
summary(iris_norm)
## Training set: the sampled rows.
iris_train <- iris_norm[ran, ]
## Testing set: all remaining rows.
iris_test <- iris_norm[-ran, ]
## 5th column for the training rows; passed to knn() as 'cl' (label/target).
iris_target_category <- iris[ran, 5]
## 5th column for the testing rows; used to measure the accuracy.
iris_test_category <- iris[-ran, 5]
## Load the package 'class' and open the help page of knn().
library(class)
?knn
## Classify the test rows with knn(); no k is passed, so knn()'s default is used.
pr <- knn(train = iris_train, test = iris_test, cl = iris_target_category)
# Show the predictions.
pr
## Confusion matrix: predictions (rows) against the actual categories (columns).
tab <- table(pr, iris_test_category)
print(tab)
## Accuracy of a confusion matrix, in percent: the diagonal entries of x (the
## correct predictions) as a share of the total number of predictions.
accuracy <- function(x) {
  n_predictions <- sum(rowSums(x))
  correct <- diag(x)
  sum(correct / n_predictions) * 100
}
# Percentage of test rows that were classified correctly.
accuracy(tab)
#mean(iris_test_category == pr)
# Plot the predictions, first with default colours ...
plot(pr)
#mean(iris_test_category == pr)
# ... then in red.
plot(pr, col = "red")
# Finding a value for k: take the square root of the number of training rows.
sqrt(nrow(iris_train))
# Use 12 neighbours for the majority vote.
pr_12 <- knn(
  train = iris_train,
  test = iris_test,
  cl = iris_target_category,
  k = 12
)
pr_12
# Confusion matrix and accuracy for the k = 12 predictions.
tab_12 <- table(pr_12, iris_test_category)
accuracy(tab_12)
##--------------------------------
# Install caret only when it is missing, and before it is loaded; an
# unconditional install.packages() downloads the package on every run.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret)
# Confusion matrix of the knn predictions (pr) against the actual categories
# of the test rows.
result <- confusionMatrix(pr, iris_test_category)
result
#Ref https://towardsdatascience.com/k-nearest-neighbors-algorithm-with-examples-in-r-simply-explained-knn-1f2c88da405c
## The diamonds data set is part of the ggplot2 package.
library(ggplot2)
## Load the data ...
data(diamonds)
## ... and keep it as a plain data frame.
dia <- data.frame(diamonds)
set.seed(123)
## Draw, at random, 80% of the row indices of the data set.
ran <- sample(x = 1:nrow(dia), size = 0.8 * nrow(dia))
## Min-max normalization.
## Rescales a numeric vector x so that its minimum becomes 0 and its maximum
## becomes 1: the minimum is subtracted from every value and the result is
## divided by the range (max - min) of x.
##
## x : numeric vector.
## Returns a vector of the same length as x. A vector whose values are all
## equal has range 0; it is mapped to all zeros instead of dividing by zero.
## An empty vector is returned unchanged. NA values in x propagate to the
## whole result, as min()/max() return NA.
nor <- function(x) {
  if (length(x) == 0L) {
    return(x)
  }
  lowest <- min(x)
  span <- max(x) - lowest
  # isTRUE() keeps an NA span (x contains NA) on the ordinary division path.
  if (isTRUE(span == 0)) {
    return(x - lowest)
  }
  (x - lowest) / span
}
## Apply the normalization to columns 1 and 5 to 10 of the data set.
dia_nor <- as.data.frame(lapply(dia[, c(1, 5, 6, 7, 8, 9, 10)], nor))
## Training data set: the sampled rows.
dia_train <- dia_nor[ran, ]
## Test data set: all remaining rows.
dia_test <- dia_nor[-ran, ]
str(dia)
## 2nd column for the training rows: the value to be predicted for the test
## rows. factor(..., ordered = FALSE) turns an ordered factor into a normal
## one; as.factor() returns a factor unchanged, ordering included.
dia_target <- factor(dia[ran, 2], ordered = FALSE)
## Actual values of the 2nd column for the test rows, to compare with the
## predictions; converted to a normal factor in the same way.
test_target <- factor(dia[-ran, 2], ordered = FALSE)
## Run knn() from the package 'class'.
library(class)
# prob = TRUE makes knn() also compute the proportion of votes for the
# winning class.
pr <- knn(
  train = dia_train,
  test = dia_test,
  cl = dia_target,
  k = 20,
  prob = TRUE
)
# The vote proportions are stored in the "prob" attribute of the predicted
# classes; attr() extracts them.
prob <- attr(pr, "prob")
# Examine the first several predictions.
head(pr)
# Examine the proportion of votes for the winning class.
head(prob)
## Confusion matrix: predictions (rows) against the actual values (columns).
tb <- table(pr, test_target)
## Accuracy of a confusion matrix, in percent: the diagonal entries of x (the
## correct predictions) as a share of the total number of predictions.
accuracy <- function(x) {
  n_predictions <- sum(rowSums(x))
  correct <- diag(x)
  sum(correct / n_predictions) * 100
}
# Percentage of test rows that were classified correctly.
accuracy(tb)
# Plot the predictions.
plot(pr)