-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmsclustering.py
133 lines (108 loc) · 4.46 KB
/
kmsclustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import csv
import sys
import os
import random
import math
# K-means clustering over a space-delimited data file.
#
# Usage: kmsclustering.py <data file name> <k>
#   <data file name>  file inside the "data" directory next to this script
#                     (for example: wine)
#   <k>               number of clusters to build
#
# The first line of the data file is treated as a header.  Every column except
# the last one is used as a numeric feature.  Progress is printed to stdout and
# a log is written to "kmeanssampleoutput.txt" next to this script.


def parse_arguments(argv):
    """Return ``(data_file_name, k)`` parsed from the command-line list *argv*.

    Raises:
        SystemExit: if an argument is missing, or k is not a positive integer.
    """
    try:
        data_file_name = argv[1]
        k_text = argv[2]
    except IndexError:
        # Both arguments are required; the old message mentioned only one.
        raise SystemExit(
            f"Usage: {argv[0]} <data file name, e.g. wine> <number of clusters k>"
        ) from None
    try:
        k = int(k_text)
    except ValueError:
        raise SystemExit(
            f"The number of clusters must be an integer, got {k_text!r}"
        ) from None
    if k < 1:
        raise SystemExit(f"The number of clusters must be at least 1, got {k}")
    return data_file_name, k


def load_instances(path):
    """Read the space-delimited file at *path*.

    Returns:
        ``(header, data_instances)`` where *header* is the first row and
        *data_instances* is the list of remaining rows (lists of strings).

    Raises:
        SystemExit: if the file contains no header line.
    """
    with open(path, newline='') as data_csv:
        csvreader = csv.reader(data_csv, delimiter=' ')
        header = next(csvreader, None)
        if header is None:
            raise SystemExit(f"Data file {path} is empty")
        # csv.reader yields [] for blank lines; they carry no instance.
        data_instances = [row for row in csvreader if row]
    return header, data_instances


def squared_feature_ranges(data_instances, number_of_features):
    """Return ``(max - min) ** 2`` for each of the first *number_of_features* columns.

    The values are used as divisors when distances are normalised, so a
    feature whose range is zero (same value in every row) is reported as 1.0
    instead of 0.0 to avoid a division by zero.
    """
    ranges = []
    for n in range(number_of_features):
        column = [float(row[n]) for row in data_instances]
        squared_range = (max(column) - min(column)) ** 2
        ranges.append(squared_range if squared_range else 1.0)
    return ranges


def assign_clusters(data_instances, centroid, r, number_of_features):
    """Group every instance under its nearest centroid.

    The distance is the Euclidean distance with each squared feature
    difference divided by the matching entry of *r*.  Ties go to the centroid
    with the lowest index.

    Returns:
        A list with one list of instances per centroid (possibly empty).
    """
    clusters = [[] for _ in centroid]
    for instance in data_instances:
        distances = []
        for candidate in centroid:
            d = 0
            # Explicit left-to-right accumulation keeps the float results stable.
            for i in range(number_of_features):
                d = d + (float(instance[i]) - float(candidate[i])) ** 2 / r[i]
            distances.append(math.sqrt(d))
        cluster_label = distances.index(min(distances))
        clusters[cluster_label].append(instance)
    return clusters


def compute_means(clusters, centroid, number_of_features):
    """Return the per-feature mean of every cluster, rounded to 2 decimals.

    An empty cluster has no mean; it keeps its previous centroid from
    *centroid* (converted to floats) instead of dividing by zero.
    """
    new_means = []
    for members, previous in zip(clusters, centroid):
        if not members:
            new_means.append([float(previous[n]) for n in range(number_of_features)])
            continue
        mean = []
        for n in range(number_of_features):
            total = 0
            for instance in members:
                total = total + float(instance[n])
            mean.append(round(total / len(members), 2))
        new_means.append(mean)
    return new_means


def count_changed_values(new_means, centroid, number_of_features, outputlines):
    """Log every new/old centroid value pair and return how many differ."""
    changed = 0
    for i in range(len(centroid)):
        for n in range(number_of_features):
            print('new means:', new_means[i][n], 'old means:', centroid[i][n])
            outputlines.append('new means:' + str(new_means[i][n]) + 'old means:' + str(centroid[i][n]))
            if float(new_means[i][n]) != float(centroid[i][n]):
                changed += 1
    return changed


def main(argv=None):
    """Run k-means for the data file and k given in *argv* (default ``sys.argv``)."""
    if argv is None:
        argv = sys.argv
    arg1, k = parse_arguments(argv)

    outputlines = []
    file_dir = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    data_dir = file_dir + '/data/'

    # Load the data; the last column is not a feature.
    header, data_instances = load_instances(data_dir + str(arg1))
    data_rows = len(data_instances)
    number_of_features = len(header) - 1
    if data_rows == 0:
        raise SystemExit("The data file contains no data rows")
    if k > data_rows:
        raise SystemExit(
            f"The number of clusters ({k}) cannot exceed the number of data rows ({data_rows})"
        )

    r = squared_feature_ranges(data_instances, number_of_features)

    # Start from k randomly picked instances.
    centroid = random.sample(data_instances, k)
    print(centroid)
    outputlines.append(str(centroid))

    # Reassign instances and recompute the means until the means stop changing.
    loop_counter = 0
    while True:
        print('Loop run no.', loop_counter)
        outputlines.append('Loop run no.' + str(loop_counter))

        clusters = assign_clusters(data_instances, centroid, r, number_of_features)
        new_means = compute_means(clusters, centroid, number_of_features)
        changed = count_changed_values(new_means, centroid, number_of_features, outputlines)

        print('New centroid:', new_means)
        outputlines.append('New centroid:' + str(new_means))
        print('Old centroid:', centroid)
        outputlines.append('Old centroid:' + str(centroid))

        if changed == 0:
            break
        centroid = new_means
        loop_counter += 1

    for n, members in enumerate(clusters):
        for instance in members:
            instance.append(n)  # add the predicted cluster at the end of each instance
        z = len(members)
        print(z, ' instance(s) belong to cluster ', n + 1)
        outputlines.append(str(z) + ' instance(s) belong to cluster ' + str(n + 1))

    # Output the log to kmeanssampleoutput.txt
    with open(file_dir + '/kmeanssampleoutput.txt', 'w') as f:
        f.write('\n'.join(outputlines))


if __name__ == "__main__":
    main()