forked from trasapong/R
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hClustering.R
64 lines (61 loc) · 1.72 KB
/
hClustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# https://github.com/WinVector/zmPDSwR/tree/master/Protein
protein <- read.table("https://raw.githubusercontent.com/trasapong/R/main/protein.txt", sep="\t", header=TRUE)
head(protein)
summary(protein)
vars.to.use <- colnames(protein)[-1]
vars.to.use
pmatrix <- scale(protein[,vars.to.use])
pmatrix
summary(pmatrix)
# keep some attr for later use
pcenter <- attr(pmatrix, "scaled:center")
pcenter
pscale <- attr(pmatrix, "scaled:scale")
pscale
d <- dist(pmatrix, method = "euclidean")
d
print(d,digits=2)
# note: name method name change
pfit <- hclust(d, method = "ward.D2")
pfit
plot(pfit, labels = protein$Country)
rect.hclust(pfit, k=5)
plot(pfit, labels = protein$Country)
rect.hclust(pfit, k=4)
plot(pfit, labels = protein$Country)
rect.hclust(pfit, k=3)
groups <- cutree(pfit, k=5)
groups
groups2 <- cutree(pfit, k=4)
groups2
print_clusters <- function(labels,k) {
for(i in 1:k) {
print(paste("cluster",i))
print(protein[labels==i,c("Country","RedMeat","Fish","Fr.Veg")])
}
}
print_clusters(groups, 5)
library(ggplot2)
princ <- prcomp(pmatrix)
princ
summary(princ)
princ$x
first2 <- princ$x[,1:2]
first2
first2.plus <- cbind(as.data.frame(first2),
cluster=as.factor(groups),
country=protein$Country)
first2.plus
ggplot(first2.plus, aes(x=PC1,y=PC2)) +
geom_point(aes(shape=cluster)) +
geom_text(aes(label=country),
hjust=0, vjust=1)
library(fpc)
kbest.p <- 5
cboot.hclust <- clusterboot(pmatrix,clustermethod = hclustCBI,
method = "ward.D2", k=kbest.p)
summary(cboot.hclust$result)
groups <- cboot.hclust$result$partition
print_clusters(groups, kbest.p)
cboot.hclust$bootmean
cboot.hclust$bootbrd