-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
101 lines (85 loc) · 4.26 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
The main Idea and code of this was copied from towardsdatascience.com
"""
import pickle
import numpy as np
def load_cfar10_batch(cifar10_dataset_folder_path, batch_id):
    """Load one CIFAR-10 training batch and return (features, labels).

    argument
        - cifar10_dataset_folder_path: folder holding the extracted batches
        - batch_id: 1-based index of the training batch to load
    return
        - features: numpy array of shape (N, 32, 32, 3)
        - labels: list of integer class ids
    """
    batch_path = cifar10_dataset_folder_path + '/data_batch_' + str(batch_id)
    # CIFAR-10 batches were pickled under Python 2; 'latin1' decodes them.
    with open(batch_path, mode='rb') as file:
        batch = pickle.load(file, encoding='latin1')
    raw = batch['data']
    # Flat (N, 3072) rows -> (N, 3, 32, 32), then channels-last (N, 32, 32, 3).
    features = raw.reshape((len(raw), 3, 32, 32)).transpose(0, 2, 3, 1)
    return features, batch['labels']
#-----------------------------------------------------------------------------------------------------------------------
def load_label_names():
    """Return the ten CIFAR-10 class names, indexed by integer label id."""
    return [
        'airplane', 'automobile', 'bird', 'cat', 'deer',
        'dog', 'frog', 'horse', 'ship', 'truck',
    ]
#-----------------------------------------------------------------------------------------------------------------------
def normalize(x):
    """
    Min-max normalize image data to the [0, 1] range.

    argument
        - x: input image data in numpy array [32, 32, 3]
    return
        - normalized x (same shape, values in [0, 1])
    """
    min_val = np.min(x)
    max_val = np.max(x)
    span = max_val - min_val
    # Guard against division by zero for constant-valued input: the original
    # code produced an all-NaN array; map such input to all zeros instead.
    if span == 0:
        return np.zeros_like(x, dtype=float)
    return (x - min_val) / span
#-----------------------------------------------------------------------------------------------------------------------
def one_hot_encode(x):
    """
    Identity pass-through for labels.

    One-hot encoding is intentionally disabled here: the downstream training
    code uses a softmax cross-entropy loss that consumes integer class labels
    directly. The original implementation placed `return x` before its
    docstring and encoding loop, leaving them as unreachable dead code; this
    version preserves that (identity) behavior with the dead code removed.

    argument
        - x: a list of labels
    return
        - x, unchanged
    """
    return x
#-----------------------------------------------------------------------------------------------------------------------
def _preprocess_and_save(normalize, one_hot_encode, features, labels, filename):
    """Apply the given transforms to one data split and pickle it to disk.

    argument
        - normalize: callable applied to the feature array
        - one_hot_encode: callable applied to the labels (currently identity,
          since softmax cross-entropy does not need one-hot labels)
        - features, labels: the split's data
        - filename: destination pickle path
    """
    features = normalize(features)
    labels = one_hot_encode(labels)
    # Context manager closes the handle deterministically; the original
    # `pickle.dump(..., open(filename, 'wb'))` leaked it on error paths and
    # relied on CPython refcounting to close it at all.
    with open(filename, 'wb') as out:
        pickle.dump((features, labels), out)
#-----------------------------------------------------------------------------------------------------------------------
def preprocess_and_save_data(cifar10_dataset_folder_path, normalize, one_hot_encode):
    """Preprocess every CIFAR-10 batch and save the splits as pickle files.

    For each of the five training batches, the first 90% is normalized,
    encoded, and written to 'preprocess_batch_<i>.p'. The last 10% of every
    batch is pooled into one validation set ('preprocess_validation.p').
    The official test batch is processed and saved to 'preprocess_test.p'.

    argument
        - cifar10_dataset_folder_path: folder holding the extracted batches
        - normalize: feature transform passed through to _preprocess_and_save
        - one_hot_encode: label transform passed through to _preprocess_and_save
    """
    n_batches = 5
    valid_features, valid_labels = [], []

    for batch_i in range(1, n_batches + 1):
        features, labels = load_cfar10_batch(cifar10_dataset_folder_path, batch_i)

        # The tail 10% of each batch is held out for validation.
        split = int(len(features) * 0.1)

        # Save the leading 90% of this batch as its own training file.
        _preprocess_and_save(
            normalize, one_hot_encode,
            features[:-split], labels[:-split],
            'preprocess_batch_' + str(batch_i) + '.p')

        # Pool the held-out tail of every batch into one validation set.
        valid_features.extend(features[-split:])
        valid_labels.extend(labels[-split:])

    _preprocess_and_save(
        normalize, one_hot_encode,
        np.array(valid_features), np.array(valid_labels),
        'preprocess_validation.p')

    # Load the official test batch ('latin1' decodes the Python-2 pickle).
    with open(cifar10_dataset_folder_path + '/test_batch', mode='rb') as file:
        batch = pickle.load(file, encoding='latin1')

    # Same flat-row -> channels-last reshape as the training batches.
    test_features = batch['data'].reshape(
        (len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
    test_labels = batch['labels']

    _preprocess_and_save(
        normalize, one_hot_encode,
        np.array(test_features), np.array(test_labels),
        'preprocess_test.p')
#-----------------------------------------------------------------------------------------------------------------------
preprocess_and_save_data(cifar10_dataset_folder_path='data',normalize=normalize,one_hot_encode=one_hot_encode)