-
Notifications
You must be signed in to change notification settings - Fork 1
/
Medical_data.py
117 lines (92 loc) · 3.04 KB
/
Medical_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import numpy as np
import urllib.request
def download_url(url, save_as):
    """Download the resource at ``url`` and store its raw bytes in ``save_as``.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        save_as: Path of the local file to create (or overwrite).

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If the destination file cannot be written.
    """
    # Context managers close the response and the output file even when
    # reading or writing fails part-way through.
    with urllib.request.urlopen(url) as response:
        data = response.read()
    # The destination is only opened once the download succeeded, so a failed
    # download does not leave an empty file behind.
    with open(save_as, 'wb') as out:
        out.write(data)
def read_binary_file(file):
    """Read ``file`` in binary mode and return its contents decoded as UTF-16.

    Args:
        file: Path of the file to read.

    Returns:
        The decoded text as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid UTF-16.
    """
    # ``with`` closes the handle; the previous version left it open.
    with open(file, 'rb') as f:
        block = f.read()
    return block.decode('utf-16')
def split_text_in_lines(text):
    """Cut ``text`` at every carriage-return + line-feed pair.

    A trailing separator yields a final empty string, and bare line feeds
    are not treated as separators.
    """
    crlf = '\r\n'
    pieces = text.split(crlf)
    return pieces
def split_by_tabs(line):
    """Return the tab-separated fields of ``line`` as a list of strings."""
    tab = '\t'
    fields = line.split(tab)
    return fields
def parse_double(field):
    """Convert a numeric string to ``float``, accepting ',' as decimal mark.

    Every comma in ``field`` is turned into a dot before conversion.
    """
    return float(field.replace(',', '.'))
def parse_boolean(field):
    """Map the exact string 'yes' to 1.0 and any other value to 0.0."""
    if field == 'yes':
        return 1.
    return 0.
# NOTE(review): the default below reads a module-level ``diagnosis_data`` that
# is not defined in the visible part of this file -- confirm it is assigned
# before this ``def`` statement runs, otherwise it raises NameError.
def read_np_array(file=diagnosis_data):
    """Parse a tab-separated UTF-16 text file into a float32 matrix.

    Each non-empty line becomes one row. The first field is parsed with
    ``parse_double`` and every following field with ``parse_boolean``.

    Args:
        file: Path of the file to read; defaults to ``diagnosis_data``.

    Returns:
        A ``numpy.ndarray`` of dtype ``float32`` with one row per non-empty
        line.
    """
    text = read_binary_file(file)
    rows = []
    for line in split_text_in_lines(text):
        if line == '':
            continue
        # No further stripping is needed: split_text_in_lines already
        # consumed every '\r\n' separator.
        fields = split_by_tabs(line)
        rows.append([
            parse_double(field) if column == 0 else parse_boolean(field)
            for column, field in enumerate(fields)
        ])
    return np.array(rows, dtype=np.float32)
def get_random_indexes(n):
    """Return the integers ``0 .. n-1`` in random order.

    Args:
        n: Number of indexes to shuffle.

    Returns:
        A list of ``n`` distinct Python ints; empty when ``n`` is zero or
        negative.
    """
    if n <= 0:
        # np.random.permutation rejects negative sizes; keep returning an
        # empty list for them as the hand-written loop did.
        return []
    # One library call replaces the repeated list.pop(r), which was O(n^2),
    # and matches the shuffling already used in get_medical_data.
    return np.random.permutation(n).tolist()
def get_indexes_for_2_datasets(n, training=80):
    """Randomly split the indexes ``0 .. n-1`` into two disjoint lists.

    Args:
        n: Total number of samples.
        training: Percentage of the samples that goes into the first list.

    Returns:
        A tuple ``(train_indexes, test_indexes)``; the first list holds
        ``int(training * n / 100)`` shuffled indexes, the second the rest.
    """
    indexes = get_random_indexes(n)
    # Multiply before dividing: ``training / 100. * n`` rounds the fraction
    # first, so 29 % of 100 came out as 28.999... and was truncated to 28.
    train = int(training * n / 100.)
    return indexes[:train], indexes[train:]
# Module-level setup, executed on import: load the dataset from
# read_np_array's default file and split its rows at random into a training
# and a test subset (get_indexes_for_2_datasets defaults to 80 % training).
matrix = read_np_array()
# Unpacking requires the matrix to be two-dimensional (rows x columns).
n_samples, n_dimensions = matrix.shape
train_indexes, test_indexes = get_indexes_for_2_datasets(n_samples)
# The index lists select whole rows of the matrix.
train_data = matrix[train_indexes]
test_data = matrix[test_indexes]
def print_dataset(name, data):
    """Print a one-line header with ``name`` and ``data.shape``, then ``data``."""
    header = 'Dataset {}. Shape: {}'.format(name, data.shape)
    print(header)
    print(data)
def get_medical_data(n_clients, test_size=50):
    """Shuffle the dataset and split it into per-client shards plus a test set.

    Features are read from the module-level name ``input`` and targets from
    ``output1`` through their ``.numpy()`` method. A column of ones is
    appended to the features to emulate an intercept.

    Args:
        n_clients: Number of equally sized training shards to produce.
        test_size: Number of rows held out for testing (default 50).

    Returns:
        A tuple ``(X, y, X_test, y_test)`` where ``X`` and ``y`` are lists of
        ``n_clients`` arrays and ``X_test`` / ``y_test`` are the holdout set.

    Raises:
        ValueError: If ``test_size`` exceeds the number of rows.
        ZeroDivisionError: If ``n_clients`` is zero.
    """
    print("Loading data")
    # NOTE(review): ``output1`` and ``input`` are not defined in the visible
    # part of this file (``input`` would otherwise resolve to the builtin) --
    # confirm they are module-level tensors assigned before this call.
    y = output1.numpy()
    X = input.numpy()
    print(X)
    print(y)

    # Add constant to emulate intercept
    X = np.c_[X, np.ones(X.shape[0])]

    # The features are already preprocessed
    # Shuffle
    perm = np.random.permutation(X.shape[0])
    X, y = X[perm, :], y[perm]

    # Select test rows at random. ``replace=False`` is required: the default
    # samples with replacement, which can repeat rows in the test set and
    # leave fewer than ``test_size`` rows excluded from training.
    test_idx = np.random.choice(X.shape[0], size=test_size, replace=False)
    train_idx = np.ones(X.shape[0], dtype=bool)
    train_idx[test_idx] = False
    print(train_idx)
    X_test, y_test = X[test_idx, :], y[test_idx]
    X_train, y_train = X[train_idx, :], y[train_idx]

    # Split train among multiple clients as consecutive, equally sized
    # slices; rows left over after the last full slice are not assigned.
    step = X_train.shape[0] // n_clients
    X_clients = [X_train[step * c: step * (c + 1), :] for c in range(n_clients)]
    y_clients = [y_train[step * c: step * (c + 1)] for c in range(n_clients)]
    return X_clients, y_clients, X_test, y_test