-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster_sports.py
160 lines (134 loc) · 5.53 KB
/
cluster_sports.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""
cluster_sports.py - clusters the sports listed in sports_rankings.csv with
Lloyd's algorithm, using the dimensions and number of clusters chosen by the user
"""
import csv
import random
import math
from constants import ALL_DIMENSIONS
def get_initial_means(sports_data, num_clusters):
    """
    Pick num_clusters distinct entries of sports_data at random to serve as
    the starting means.

    sports_data maps each sport to its list of values. The returned list
    holds the value lists of the sampled sports (the very objects stored in
    sports_data, not copies).

    Raises ValueError (from random.sample) when num_clusters is negative or
    larger than the number of sports.
    """
    # Sampling the keys themselves already draws without replacement, so no
    # separate list of indices is needed.
    chosen_sports = random.sample(list(sports_data), num_clusters)
    return [sports_data[sport] for sport in chosen_sports]
def distance(point1, point2):
    """
    Return the Euclidean distance between two points, each given as a
    sequence of numbers of the same length.
    """
    # A length mismatch is treated as a programming error (note that assert
    # statements are skipped when Python runs with -O).
    assert len(point1) == len(point2)
    squared_diffs = ((a - b) ** 2 for a, b in zip(point1, point2))
    return math.sqrt(sum(squared_diffs))
def get_closest_mean(point, means):
    """
    Return the index within means of the mean that lies nearest to point.

    When several means are equally near, the lowest index wins. An empty
    means list raises ValueError.
    """
    # min() keeps the first of equal candidates, which matches looking up the
    # first occurrence of the smallest distance.
    return min(range(len(means)), key=lambda idx: distance(point, means[idx]))
def calculate_means(sports_dimensions_data, clusters, num_dimensions):
    """
    Compute the per-dimension mean of every cluster.

    sports_dimensions_data maps each sport to its list of values, clusters is
    a list of lists of sports, and num_dimensions is how many leading values
    of each sport are averaged.

    Returns one mean (a list of num_dimensions floats) per cluster, in the
    same order as clusters. An empty cluster has no average; instead of
    failing with ZeroDivisionError it gets a mean of all-infinity, i.e. a
    placeholder that is never the nearest mean to any finite point.
    """
    means = []
    for cluster in clusters:
        if not cluster:
            # Empty clusters arise when two means coincide and every tie is
            # resolved in favour of the other one.
            means.append([float("inf")] * num_dimensions)
            continue
        cluster_size = len(cluster)
        means.append([
            sum(sports_dimensions_data[sport][i] for sport in cluster) / cluster_size
            for i in range(num_dimensions)
        ])
    return means
def lloyds_algorithm(means, sports_dimensions_data,
                     num_clusters, max_iterations=100):
    """
    Cluster the sports by repeatedly assigning each one to its nearest mean
    and then recomputing the means.

    means is the list of starting means, sports_dimensions_data maps each
    sport to its list of values, and num_clusters is the number of clusters
    to build. Iteration stops as soon as a full pass changes no assignment,
    or after max_iterations mean updates.

    Returns [clusters, means], where clusters[i] is the list of sports
    assigned to means[i].
    """
    # Take the dimensionality from the means themselves instead of relying on
    # a module-level variable being defined by the caller.
    num_dimensions = len(means[0]) if means else 0
    sport_to_cluster = {}
    count = 0
    while True:
        # Reset on every pass: the first pass always changes every assignment,
        # so a flag that is only ever set would hide convergence forever.
        clusters_changed = False
        clusters = [[] for _ in range(num_clusters)]
        for sport, point in sports_dimensions_data.items():
            closest_mean = get_closest_mean(point, means)
            if sport_to_cluster.get(sport) != closest_mean:
                clusters_changed = True
            clusters[closest_mean].append(sport)
            sport_to_cluster[sport] = closest_mean
        if count >= max_iterations or not clusters_changed:
            return [clusters, means]
        # Only clusters that received sports can be averaged; an empty cluster
        # keeps the mean it had before.
        occupied = [index for index, cluster in enumerate(clusters) if cluster]
        updated = calculate_means(sports_dimensions_data,
                                  [clusters[index] for index in occupied],
                                  num_dimensions)
        means = list(means)  # do not modify the caller's list in place
        for index, mean in zip(occupied, updated):
            means[index] = mean
        count += 1
def validate_dimensions(dimensions_input):
    """
    Parse and validate the user's choice of dimensions.

    dimensions_input is either the word 'all' (any letter case, surrounding
    whitespace ignored) or a comma-separated list of integers between 0 and 9.
    Blank entries, such as the one left by a trailing comma, are ignored.

    Returns a (valid, dimensions, message) tuple. On success valid is True,
    dimensions is the list of chosen integers in the order entered, and
    message is 'Success'. On failure valid is False, dimensions is [], and
    message says what to correct.
    """
    if dimensions_input.strip().lower() == 'all':
        # A list, like every other return path of this function.
        return (True, list(range(10)), 'Success')
    dimensions_list = []
    for token in dimensions_input.split(','):
        token = token.strip()
        if not token:
            # Skipping blanks lets empty input reach the "at least one" message
            # below instead of being reported as a non-integer.
            continue
        try:
            dim = int(token)
        except ValueError:
            return (False, [], 'Only enter integers!')
        if dim < 0 or dim > 9:
            return (False, [], 'Only enter integers between 0 and 9!')
        dimensions_list.append(dim)
    if not dimensions_list:
        return (False, [], 'Enter at least one integer between 0 and 9!')
    return (True, dimensions_list, 'Success')
def prompt_dimensions():
    """
    Show the available dimensions, then keep asking the user which ones to
    cluster on until validate_dimensions accepts the answer.

    Returns the chosen dimension numbers with duplicates removed.
    """
    print("You can cluster sports based on these 10 dimensions:")
    print("----------------------------------------------------")
    for i in range(10):
        print(f'{ALL_DIMENSIONS[i][1]} - {ALL_DIMENSIONS[i][0]} - ({i})')
    print("----------------------------------------------------\n")

    prompt = ("Enter a comma-separated list of the numbers "
              "for each dimension you want to cluster on (Example: 4, 7, 2, 3): ")
    feedback = None
    while True:
        # feedback holds the error from the previous attempt, if there was one
        if feedback:
            print(feedback)
        accepted, dimensions, feedback = validate_dimensions(input(prompt))
        if accepted:
            break

    dimensions = list(set(dimensions))
    chosen_names = [ALL_DIMENSIONS[dim][0] for dim in dimensions]
    print('Chosen dimensions: ' + ', '.join(chosen_names))
    return dimensions
def validate_num_clusters(num_clusters_input, min_clusters=2, max_clusters=59):
    """
    Parse and validate the requested number of clusters.

    num_clusters_input is the text typed by the user. min_clusters and
    max_clusters are the inclusive bounds; their defaults are the 2 and 59
    that the prompt and the error message advertise.

    Returns a (valid, num_clusters, message) tuple: (True, value, None) when
    the text is an integer within the bounds, otherwise (False, None, message)
    with a message that states the accepted range.
    """
    message = f'Enter integer between {min_clusters} and {max_clusters}!'
    try:
        num_clusters = int(num_clusters_input)
    except ValueError:
        return False, None, message
    # The range was promised to the user but never enforced; out-of-range
    # values would only fail later, during sampling or clustering.
    if not min_clusters <= num_clusters <= max_clusters:
        return False, None, message
    return True, num_clusters, None
def prompt_num_clusters():
    """
    Keep asking the user how many clusters to build until
    validate_num_clusters accepts the answer, then return that number.
    """
    prompt = ("How many clusters do you want to build? "
              "Enter integer value between 2 and 59: ")
    feedback = None
    while True:
        # feedback holds the error from the previous attempt, if there was one
        if feedback:
            print(feedback)
        accepted, num_clusters, feedback = validate_num_clusters(input(prompt))
        if accepted:
            return num_clusters
def get_sports_data_from_csv():
    """
    Load sports_rankings.csv from the current working directory.

    The file is read as tab-separated text: the first line is a header and is
    skipped, and in every following row the first field is the sport and the
    remaining fields are numbers. Blank lines are ignored.

    Returns a dict mapping each sport to its list of values as floats.
    Raises ValueError when a value field cannot be converted to float.
    """
    sports_data = {}
    # The with-block closes the file even when a row fails to parse;
    # newline='' is the mode the csv module asks for.
    with open('sports_rankings.csv', newline='') as sports_csv:
        csv_reader = csv.reader(sports_csv, delimiter='\t')
        next(csv_reader, None)  # skip the header; tolerate an empty file
        for line in csv_reader:
            if not line:
                continue  # blank line, e.g. at the end of the file
            sports_data[line[0]] = [float(value) for value in line[1:]]
    return sports_data
def get_required_dimensions(sports_data, dimensions):
    """
    Reduce every sport's values to the requested dimensions.

    Returns a new dict mapping each sport to the list of its values at the
    positions given in dimensions, in that order.
    """
    return {
        sport: [values[dim] for dim in dimensions]
        for sport, values in sports_data.items()
    }
# ask the user what to cluster on, then load the data and keep only those dimensions
dimensions = prompt_dimensions()
num_clusters = prompt_num_clusters()
all_sports_data = get_sports_data_from_csv()
sports_dimensions_data = get_required_dimensions(all_sports_data, dimensions)

# cluster, starting from randomly sampled sports as the initial means
initial_means = get_initial_means(sports_dimensions_data, num_clusters)
clusters, means = lloyds_algorithm(
    initial_means,
    sports_dimensions_data,
    num_clusters,
    max_iterations=1000,
)

# show each cluster: its number, the dimension names, its rounded mean, its sports
dimension_names = [ALL_DIMENSIONS[dim][0] for dim in dimensions]
dimension_text = ', '.join(dimension_names)
for i, cluster in enumerate(clusters):
    print(f"Cluster # {i}")
    print(dimension_text)
    rounded_mean = [round(mean, 2) for mean in means[i]]
    print(f"{rounded_mean} - out of 10")
    for sport in cluster:
        print("\t" + sport)