This repository has been archived by the owner on Mar 23, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathK Nearest Neighbours - In Parallel.py
134 lines (103 loc) · 4.17 KB
/
K Nearest Neighbours - In Parallel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#================================================================================================================
#----------------------------------------------------------------------------------------------------------------
# K NEAREST NEIGHBOURS
#----------------------------------------------------------------------------------------------------------------
#================================================================================================================
# Details of the implementation, with an accompanying tutorial, are at: http://madhugnadig.com/articles/machine-learning/parallel-processing/2017/02/10/implementing-k-nearest-neighbours-in-parallel-from-scratch.html
import math
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import random
from collections import Counter
from sklearn import preprocessing
from itertools import repeat
import multiprocessing as mp
import time
# Select matplotlib's 'ggplot' style sheet; this runs once, at import time.
# NOTE(review): no plotting calls are visible in this file - confirm the style is still needed.
plt.style.use('ggplot')
class CustomKNN:
    """K-nearest-neighbours classifier whose evaluation step can run in a process pool.

    Training and test data are dicts that map a class label to the list of
    feature vectors belonging to that class.

    The three counters below are running totals: every call to ``test`` adds
    to them rather than resetting them.
    """

    def __init__(self):
        # Number of scored test points whose predicted class was correct.
        self.accurate_predictions = 0
        # Number of test points scored so far.
        self.total_predictions = 0
        # Percentage (0-100) of correct predictions, refreshed by ``test``.
        self.accuracy = 0.0

    def predict(self, training_data, to_predict, k=3):
        """Classify one feature vector by majority vote of its k nearest points.

        Args:
            training_data: dict mapping class label -> list of feature vectors.
            to_predict: the feature vector to classify.
            k: number of nearest neighbours that vote.

        Returns:
            ``(predicted_label, to_predict)``; the input vector is handed back
            unchanged next to the label. Returns ``None`` (after printing a
            message) when ``k`` is not larger than the number of classes in
            ``training_data``.
        """
        # len(training_data) is the number of classes (dict keys), not the
        # number of training points.
        if len(training_data) >= k:
            print("K cannot be smaller than the total voting groups(ie. number of training data points)")
            return None

        # Convert the query point once instead of once per training point.
        target = np.array(to_predict)

        distances = []
        for group in training_data:
            for features in training_data[group]:
                # Euclidean distance between the training point and the query.
                euclidean_distance = np.linalg.norm(np.array(features) - target)
                distances.append([euclidean_distance, group])

        # Labels of the k closest training points; ties in distance fall back
        # to comparing the labels themselves.
        nearest_labels = [label for _, label in sorted(distances)[:k]]
        result = Counter(nearest_labels).most_common(1)[0][0]
        return result, to_predict

    def test(self, test_set, training_set, k=3, processes=8):
        """Predict every point in ``test_set`` and update the accuracy counters.

        Args:
            test_set: dict mapping true class label -> list of feature vectors.
            training_set: dict mapping class label -> list of feature vectors.
            k: number of neighbours passed to ``predict``.
            processes: worker-process count for the pool. ``None`` lets
                ``multiprocessing`` pick; a value <= 1 runs the predictions
                in the current process without creating a pool.

        Raises:
            ValueError: if ``k`` is not larger than the number of classes in
                ``training_set`` (``predict`` would refuse every point).

        Side effects:
            Adds to ``accurate_predictions`` / ``total_predictions``, sets
            ``accuracy`` and prints it.
        """
        if len(training_set) >= k:
            raise ValueError(
                "k (%r) must be larger than the number of classes in the "
                "training set (%d)" % (k, len(training_set))
            )

        predictions = {}
        if processes is not None and processes <= 1:
            # A single worker gains nothing from a pool; run in-process.
            for group in test_set:
                predictions[group] = [
                    self.predict(training_set, point, k)
                    for point in test_set[group]
                ]
        else:
            # 'Parallelization' happens here. The context manager shuts the
            # workers down even if a prediction raises.
            with mp.Pool(processes=processes) as pool:
                for group in test_set:
                    predictions[group] = pool.starmap(
                        self.predict,
                        zip(repeat(training_set), test_set[group], repeat(k)),
                    )

        # Calculating accuracy. Results come back in input order, so each
        # test point is scored exactly once, duplicates included.
        for group in test_set:
            for predicted_label, _point in predictions[group]:
                self.total_predictions += 1
                # Accurate -> predicted class equals the original class.
                if predicted_label == group:
                    self.accurate_predictions += 1

        if self.total_predictions:
            self.accuracy = 100 * (self.accurate_predictions / self.total_predictions)
        else:
            # Nothing was scored; avoid dividing by zero.
            self.accuracy = 0.0
        print("\nAcurracy :", str(self.accuracy) + "%")
def mod_data(df):
    """Recode the dataset's textual markers as numbers, in place.

    Every cell equal to one of the markers below is overwritten with its
    numeric code; ``df`` itself is mutated and nothing is returned.
    """
    # (marker, code) pairs, applied one after another in this order.
    substitutions = (
        ('?', -999999),
        ('yes', 4),
        ('no', 2),
        ('notpresent', 4),
        ('present', 2),
        ('abnormal', 4),
        ('normal', 2),
        ('poor', 4),
        ('good', 2),
        ('ckd', 4),
        ('notckd', 2),
    )
    for marker, code in substitutions:
        df.replace(marker, code, inplace=True)
def main():
    """Load the kidney-disease CSV, split it 90/10 and time a KNN evaluation.

    Reads ``data/chronic_kidney_disease.csv`` relative to the current working
    directory, recodes it with ``mod_data``, shuffles the rows, holds out the
    last 10% as the test set, runs ``CustomKNN.test`` and prints the elapsed
    wall-clock time.
    """
    import os  # local import: only needed to build a portable path

    #Load the dataset
    # os.path.join keeps the path valid on POSIX as well as on Windows.
    df = pd.read_csv(os.path.join(".", "data", "chronic_kidney_disease.csv"))
    mod_data(df)
    dataset = df.astype(float).values.tolist()

    #Normalize the data
    # NOTE(review): `dataset` was built above from the un-scaled frame and the
    # scaled frame is not read again in this function, so the model runs on
    # un-normalized values. Scaling every column would also rescale the class
    # labels (last column) - confirm the intended behaviour before wiring in.
    x = df.values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled) #Replace df with normalized values

    #Shuffle the dataset
    random.shuffle(dataset)

    #10% of the available data will be used for testing
    test_size = 0.1

    #The keys of the dict are the classes that the data is classified into
    training_set = {2: [], 4: []}
    test_set = {2: [], 4: []}

    #Split data into training and test for cross validation
    # An explicit split index avoids the `[:-0]` pitfall: with fewer than ten
    # rows the test count is 0, and negative-zero slicing would leave the
    # training data empty and put every row in the test data.
    split_at = len(dataset) - int(test_size * len(dataset))
    training_data = dataset[:split_at]
    test_data = dataset[split_at:]

    #Insert data into the training set
    for record in training_data:
        # The last element is the class; everything before it is the features.
        training_set[record[-1]].append(record[:-1])

    #Insert data into the test set
    for record in test_data:
        test_set[record[-1]].append(record[:-1])

    # time.clock() was removed in Python 3.8; perf_counter() measures elapsed
    # wall-clock time.
    s = time.perf_counter()
    knn = CustomKNN()
    knn.test(test_set, training_set)
    e = time.perf_counter()
    print("Exec Time: ", e-s)
# Run the experiment only when executed as a script. The guard also keeps
# multiprocessing workers that re-import this module from re-running main().
if __name__ == "__main__":
    main()