forked from romario076/Forest-Cover-Type-Prediction-Kaggle
-
Notifications
You must be signed in to change notification settings - Fork 0
/
code.R
116 lines (94 loc) · 4.18 KB
/
code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
library(randomForest)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(caret)
library(tree)
# Load the training data and collapse the one-hot soil / wilderness indicator
# columns into two integer-coded columns.
forest <- read.csv("train.csv")
head(forest)
forest$Id <- NULL

# With Id removed the script relies on this layout: columns 1-10 are kept
# as-is, 11-14 are the wilderness-area indicators, 15-54 are the soil-type
# indicators and 55 is Cover_Type.
soil <- forest[, 15:54]
area <- forest[, 11:14]
forest <- forest[, -(11:54)]

# Each row must have exactly one indicator set in each group; fail loudly
# instead of silently producing a list or a wrong code.
soil_hits <- as.matrix(soil) == 1
area_hits <- as.matrix(area) == 1
stopifnot(
  all(rowSums(soil_hits) == 1),
  all(rowSums(area_hits) == 1)
)

# Encode each group by the position of its indicator column. The previous
# factor(..., labels = 1:38) numbered the soil types by their rank among the
# values present in this file, which does not match the 1:40 encoding used
# for test.csv whenever some soil type is absent from the training data.
forest$Soil_Type <- max.col(soil_hits, ties.method = "first")
forest$Wilderness_Area <- max.col(area_hits, ties.method = "first")

# Final column order: the 10 original columns, Soil_Type, Wilderness_Area,
# then the response Cover_Type last.
forest <- forest[, c(1:10, 12, 13, 11)]
head(forest)

# Keep a copy of the full training frame for the final model.
forestTrain <- forest
# Exploratory plots ----

# Box plots of every column except columns 7-9 and 11-13. The wide bottom
# margin leaves room for the vertical (las = 3) axis labels. par() used to be
# passed as a stray positional argument to boxplot(); it is now set explicitly
# and restored afterwards so the margin does not leak into later plots.
old_par <- par(mar = c(15, 4, 2, 2))
boxplot(
  forest[, -c(7, 8, 9, 11, 12, 13)],
  las = 3,
  col = "darkseagreen4",
  main = "General"
)
par(old_par)

# Density of four variables, coloured and filled by cover type.
theme_set(theme_gray(base_size = 20))
g1 <- ggplot(
  forest,
  aes(Elevation, color = factor(Cover_Type), fill = factor(Cover_Type))
) +
  geom_density(alpha = 0.2)
g2 <- ggplot(
  forest,
  aes(Aspect, color = factor(Cover_Type), fill = factor(Cover_Type))
) +
  geom_density(alpha = 0.2)
g3 <- ggplot(
  forest,
  aes(
    Horizontal_Distance_To_Roadways,
    color = factor(Cover_Type),
    fill = factor(Cover_Type)
  )
) +
  geom_density(alpha = 0.2)
g4 <- ggplot(
  forest,
  aes(
    Horizontal_Distance_To_Fire_Points,
    color = factor(Cover_Type),
    fill = factor(Cover_Type)
  )
) +
  geom_density(alpha = 0.2)
grid.arrange(g1, g2, g3, g4, ncol = 2, nrow = 2)
# Work on a random subsample of the training data (rows whose uniform draw
# exceeds 0.8) to keep model fitting fast.
set.seed(1)
forest1 <- forest[runif(nrow(forest)) > 0.8, ]
forest1$Id <- NULL

# Remove columns with zero variance.
# The old code indexed forest1[, -56], a leftover column position that is a
# no-op on this 13-column frame. The response is now protected by name, and a
# column whose variance is NA is treated as constant instead of producing an
# NA index.
sub <- vapply(forest1, function(col) isTRUE(var(col) != 0), logical(1))
sub["Cover_Type"] <- TRUE
forestSub <- forest1[, sub]
n <- dim(forestSub)

# Hold-out split of the subsample: draws above 0.2 go to train, the rest to
# test. The seed is reset so the split is reproducible on its own.
set.seed(1)
split <- runif(nrow(forestSub)) > 0.2
train <- forestSub[split, ]
test <- forestSub[!split, ]
# Single classification tree ----
# Copies of the hold-out split with abbreviated column names so the labels
# drawn on the tree plot stay short.
short_names <- c(
  "Elevation", "Aspect", "Slope", "H_D_To_Hydro", "V_D_To_Hydro",
  "H_D_To_Roads", "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
  "H_D_To_Fire_Points", "Soil_Type", "Wilderness_Area", "Cover_Type"
)
train1 <- setNames(train, short_names)
test1 <- setNames(test, short_names)

# Fit the tree with the response converted to a factor, then draw it.
tree.forests <- tree(factor(Cover_Type) ~ ., data = train1)
plot(tree.forests)
text(tree.forests, cex = 1.1)

# Predict classes for the hold-out rows (response column left out) and pair
# each prediction with the observed cover type.
tree.prediction <- predict(
  tree.forests,
  test1[, names(test1) != "Cover_Type"],
  type = "class"
)
sa <- data.frame(cover = test[[13]], pred = tree.prediction)
# Correlation matrix ----
# Columns 7-9 and 13 are left out. The working frame used to be called `cor`,
# which masked stats::cor() in the global environment; it now has its own
# name.
cor_vars <- forest[, -c(7, 8, 9, 13)]
names(cor_vars) <- c(
  "Elevation", "Aspect", "Slope", "H_D_To_Hydro", "V_D_To_Hydro",
  "H_D_To_Roads", "H_D_To_Fire_Points", "Soil_Type", "Wilderness_Area"
)

# Correlation between variables, plotted as numbers.
m <- cor(cor_vars)
corrplot(m, method = "number", tl.cex = 1.2, mar = c(2, 2, 2, 2))
# Random forest on the hold-out split ----
# Fit on `train` and predict the rows of `test`.
rf <- randomForest(
  factor(Cover_Type) ~ .,
  data = train,
  mtry = 12,
  ntree = 1000
)
predictions <- predict(rf, test)

# Observed vs. predicted cover type, side by side.
pred <- data.frame(Cover_Type = test$Cover_Type, Prediction = predictions)
rownames(pred) <- NULL
head(pred, 15)

# confusionMatrix() needs both arguments to be factors with the same levels;
# test$Cover_Type is a plain integer column, so convert it using the levels
# of the predictions.
observed <- factor(test$Cover_Type, levels = levels(predictions))
cm1 <- confusionMatrix(predictions, observed)
cm1$table
cm1$overall["Accuracy"]
# Next step: after evaluating the model on the hold-out split, move on to the
# general case and prepare the competition test set for prediction.
forestTest <- read.csv("test.csv")
forestTest$Id <- NULL

# With Id removed, columns 11-14 are the wilderness-area indicators and 15-54
# the soil-type indicators. The global `forest` is intentionally NOT
# reassigned here (it used to be overwritten with test-set columns) so that
# it keeps referring to the training data.
soil <- forestTest[, 15:54]
area <- forestTest[, 11:14]

# Each row must have exactly one indicator set in each group.
soil_hits <- as.matrix(soil) == 1
area_hits <- as.matrix(area) == 1
stopifnot(
  all(rowSums(soil_hits) == 1),
  all(rowSums(area_hits) == 1)
)

# Encode each group by the position of its indicator column. Unlike
# factor(..., labels = 1:40), this does not require every soil type to occur
# in the file and never renumbers the codes.
forestTest$Soil_Type <- max.col(soil_hits, ties.method = "first")
forestTest$Wilderness_Area <- max.col(area_hits, ties.method = "first")

# Keep the 10 original columns plus Wilderness_Area (56) and Soil_Type (55).
forestTest <- forestTest[, c(1:10, 56, 55)]
head(forestTest)

# No zero-variance filtering on the test set: dropping a column here would
# remove a predictor the fitted model needs at prediction time.
TestSub <- forestTest
n <- dim(TestSub)
# Final model ----
# Use the full training frame saved earlier as `forestTrain` as training set.
forestTest$Id <- NULL
forestTrain$Id <- NULL

# The response is converted with factor() inside the formula, so
# forestTrain$Cover_Type can stay an integer column.
# The argument was previously misspelled `ntrees`, which randomForest()
# silently absorbs through `...`, so only the default number of trees was
# grown. `ntree` is the documented name.
rf1 <- randomForest(
  factor(Cover_Type) ~ .,
  data = forestTrain,
  mtry = 12,
  ntree = 1000,
  importance = TRUE
)
predictions <- predict(rf1, TestSub)
# Save results ----
# (A stray Markdown code fence that made the script unparseable was removed
# from the line above this header.)

# The Id column was dropped from the test frame, so take the ids from the
# sample submission file and pair them with the predictions.
id <- read.csv("sampleSubmission.csv")
result <- data.frame(Id = id$Id, Cover_Type = predictions)
head(result, 20)
#write.csv(result, "Submission7.csv", row.names=FALSE)

# Variable importance of the final model. rf1 is the forest fitted with
# importance = TRUE; the hold-out model `rf` was fitted without it.
Importance <- rf1
varImpPlot(Importance, col = "darkblue", pch = 19)