-
Notifications
You must be signed in to change notification settings - Fork 0
/
knnclassifier.py
111 lines (94 loc) · 3.95 KB
/
knnclassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import csv
import sys
import os
from genericpath import exists
import math
#################
# Input arguments
#################
# Positional command line arguments:
#   argv[1] - training data file name (opened from the data directory below)
#   argv[2] - test data file name (opened from the data directory below)
#   argv[3] - k, the number of nearest neighbours that take part in the vote
try:
    arg1 = sys.argv[1]
    arg2 = sys.argv[2]
    arg3 = sys.argv[3]
except IndexError:
    raise SystemExit(f"Usage: {sys.argv[0]} <argument missing - check readme.txt>")

# Validate k up front. A non-numeric value would otherwise end in a raw
# traceback, and k <= 0 would select no neighbours at all and only fail much
# later with an unrelated-looking error.
try:
    k = int(arg3)
except ValueError:
    raise SystemExit(f"{sys.argv[0]}: k must be a positive integer, got {arg3!r}") from None
if k < 1:
    raise SystemExit(f"{sys.argv[0]}: k must be a positive integer, got {k}")

# Resolve the directory that contains this script so the data files are found
# regardless of the directory the script is launched from.
file_dir = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
data_dir = file_dir + '/data/'

# Report lines accumulated during the run.
outputlines = []
######################################
# Load the training data into a list
######################################
training_path = data_dir + str(arg1)
with open(training_path, newline='') as training_data_csv:
    training_reader = csv.reader(training_data_csv, delimiter=' ')
    # The first line is the column header; each remaining line is one
    # instance, kept as a list of strings.
    header = next(training_reader)
    training_instances = list(training_reader)

training_rows = len(training_instances)
# Every column but the last is counted as a feature.
number_of_features = len(header) - 1
#######################################
# Load the test data into a list
#######################################
test_path = data_dir + str(arg2)
with open(test_path, newline='') as test_data_csv:
    test_reader = csv.reader(test_data_csv, delimiter=' ')
    # Consume the header line (this rebinds the module-level `header`);
    # each remaining line is one instance, kept as a list of strings.
    header = next(test_reader)
    test_instances = list(test_reader)

test_rows = len(test_instances)
########################################################
# Squared range of each feature, stored in r
########################################################
# r[n] holds (max - min) ** 2 of feature n over the training instances.
# The distance calculation divides each squared feature difference by r[n],
# so that features measured on large scales do not dominate the distance.
if not training_instances:
    raise SystemExit("Training data contains no instances - cannot compute feature ranges")

r = []
for n in range(number_of_features):
    # Parse the column once, then let min()/max() scan it.
    column = [float(row[n]) for row in training_instances]
    spread = max(column) - min(column)
    # A feature that is constant across the training set has a range of 0,
    # and dividing by it would raise ZeroDivisionError. Use a divisor of 1
    # instead, so the feature is compared unscaled.
    r.append(spread ** 2 if spread else 1.0)
#######################################
# Calculating the distances - main algorithm of k-Nearest Neighbour
#######################################
# For every test instance:
#   1. compute its range-normalised Euclidean distance to every training row,
#   2. collect the class labels of the k closest training rows,
#   3. predict the label that occurs most often among them,
#   4. compare the prediction with the label stored in the test row.
# In both data sets the class label is the column right after the features.

# Parse the training features once, instead of re-parsing the same strings
# for every (test row, training row) pair.
training_features = [
    [float(row[i]) for i in range(number_of_features)]
    for row in training_instances
]

corrected_prediction = 0
for n in range(test_rows):
    test_features = [float(test_instances[n][i]) for i in range(number_of_features)]
    labelled_class = test_instances[n][number_of_features]

    # Distance from this test instance to every training row.
    distances = []
    for train_row in training_features:
        d = 0
        for i in range(number_of_features):
            # Squared difference, scaled by the feature's squared range.
            d = d + (test_features[i] - train_row[i]) ** 2 / r[i]
        distances.append(math.sqrt(d))

    # Indices of the k nearest training rows. Sorting the indices guarantees
    # each row is chosen at most once, even when a distance is exactly 0 or
    # k exceeds the number of training rows. sorted() is stable, so rows at
    # equal distance keep their original order.
    selected_row_index = sorted(range(len(distances)), key=distances.__getitem__)[:k]
    predicted_class = [
        training_instances[row_index][number_of_features]
        for row_index in selected_row_index
    ]

    # Vote for the class: count how often each label occurs among the
    # neighbours.
    voter = {}
    for label in predicted_class:
        voter[label] = voter.get(label, 0) + 1
    # max() over the dict alone would compare the label strings themselves.
    # key=voter.get makes it return the label with the highest count. On a
    # tie the label inserted first wins, i.e. the one belonging to the
    # nearest of the tied neighbours.
    result = max(voter, key=voter.get)

    # Track correct predictions for the accuracy figure.
    if labelled_class == result:
        corrected_prediction = corrected_prediction + 1

    print("Test instance", n + 1, " labelled class: ", labelled_class, ", predicted class: ", result)
    outputlines.append(
        f"Test instance {n + 1} labelled class: {labelled_class}, predicted class: {result}"
    )

# Percentage of test instances whose prediction matched their label. With no
# test instances there is nothing to divide by, so report 0.0.
accuracy = round(corrected_prediction / test_rows * 100, 2) if test_rows else 0.0
print("The accuracy of the prediction is: ", accuracy, "%")
outputlines.append("The accuracy of the prediction is: " + str(accuracy) + "%")
# Write the collected report lines, one per line, to knnsampleoutput.txt in
# the script's own directory.
output_path = file_dir + '/knnsampleoutput.txt'
report_text = '\n'.join(outputlines)
with open(output_path, 'w') as report_file:
    report_file.write(report_text)