# Build and train PredNet on UCSD sequences.
# This script can be run in isolation, but is really meant to be part of
# an aml pipeline.
import os
import numpy as np
import argparse
import matplotlib.pyplot as plt
import keras
from keras.layers import Input, Dense, Flatten
from keras.layers import TimeDistributed
from keras.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
    CSVLogger,
    Callback,
)
from keras.optimizers import Adam
from models.prednet.prednet import PredNet
from models.prednet.data_utils import SequenceGenerator
import urllib.request
from azureml.core import Run
from azureml.core import VERSION
print("defining str2bool")
def str2bool(v):
    """
    Convert a string representation of a boolean into a boolean.
    """
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    elif v.lower() in ("no", "false", "f", "n", "0"):
        return False
    else:
        raise argparse.ArgumentTypeError("Boolean value expected.")
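# A quick sanity check of the mapping (illustrative values):
#   str2bool("yes") -> True, str2bool("0") -> False,
#   str2bool("maybe") -> argparse.ArgumentTypeError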
print("defining args")
parser = argparse.ArgumentParser(description="Process input arguments")
parser.add_argument(
    "--preprocessed_data",
    default="./data/preprocessed/",
    type=str,
    dest="preprocessed_data",
    help="data folder mounting point",
)
parser.add_argument(
    "--learning_rate",
    default=1e-3,
    help="learning rate",
    type=float,
    required=False,
)
parser.add_argument(
    "--lr_decay",
    default=1e-9,
    help="learning rate decay",
    type=float,
    required=False,
)
parser.add_argument(
    "--stack_sizes",
    dest="stack_sizes_arg",
    default="48, 96, 192",
    help="Stack sizes of hidden layers",
    type=str,
    required=False,
)
parser.add_argument(
    "--filter_sizes",
    dest="filter_sizes",
    default="3, 3, 3",
    help="Filter sizes for the three convolutional layers (A, A_hat, R)",
    type=str,
    required=False,
)
parser.add_argument(
    "--layer_loss_weights",
    dest="layer_loss_weights_arg",
    default="1.0, 0.",
    help="Layer loss weights",
    type=str,
    required=False,
)
parser.add_argument(
    "--remote_execution",
    dest="remote_execution",
    action="store_true",
    help="remote execution (AML compute)",
    required=False,
)
parser.add_argument(
    "--batch_size",
    dest="batch_size",
    default=4,
    help="Batch size",
    type=int,
    required=False,
)
parser.add_argument(
    "--freeze_layers",
    dest="freeze_layers",
    default="0, 1, 2, 3",
    help="comma separated list of layers to freeze",
    type=str,
    required=False,
)
parser.add_argument(
    "--fine_tuning",
    dest="fine_tuning",
    default="False",
    help="use the benchmark model and perform fine tuning",
    type=str,
    required=False,
)
parser.add_argument(
    "--dataset",
    dest="dataset",
    default="UCSDped1",
    help="the dataset that we are using",
    type=str,
    required=False,
)
parser.add_argument(
    "--output_mode",
    dest="output_mode",
    default="error",
    help="which output_mode to use (prediction, error)",
    type=str,
    required=False,
)
parser.add_argument(
    "--samples_per_epoch",
    dest="samples_per_epoch",
    default=10,
    help="n samples per epoch",
    type=int,
    required=False,
)
parser.add_argument(
    "--nb_epoch",
    dest="nb_epoch",
    default=150,
    help="max number of epochs",
    type=int,
    required=False,
)
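# Example invocation for a local run (paths and values are illustrative,
# matching the defaults defined above):
#   python train.py --preprocessed_data ./data/preprocessed/ \
#       --dataset UCSDped1 --batch_size 4 --nb_epoch 150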
print("parsing args")
args = parser.parse_args()
nb_epoch = args.nb_epoch
samples_per_epoch = args.samples_per_epoch
learning_rate = args.learning_rate
lr_decay = args.lr_decay
stack_sizes_arg = tuple(map(int, args.stack_sizes_arg.split(",")))
layer_loss_weights_arg = tuple(
    map(float, args.layer_loss_weights_arg.split(","))
)
filter_sizes = tuple(map(int, args.filter_sizes.split(",")))
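# With the defaults above these parse to stack_sizes_arg == (48, 96, 192),
# layer_loss_weights_arg == (1.0, 0.0), and filter_sizes == (3, 3, 3)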
remote_execution = args.remote_execution
preprocessed_data = os.path.join(
    args.preprocessed_data,
    args.dataset,
)
batch_size = args.batch_size
fine_tuning = str2bool(args.fine_tuning)
if len(args.freeze_layers) > 0 and fine_tuning:
    freeze_layers = tuple(map(int, args.freeze_layers.split(",")))
else:
    freeze_layers = []
dataset = args.dataset
output_mode = args.output_mode
print("training dataset is stored here:", preprocessed_data)
# Normally we would expect the data to be passed in via a PipelineData object
# from the previous pipeline step. This branch instead downloads the data.
if "coursematerial" in preprocessed_data:
    preprocessed_data = os.path.join(os.getcwd(), "data")
    os.makedirs("data", exist_ok=True)
    urllib.request.urlretrieve(
        os.path.join(args.preprocessed_data, "X_train.hkl"),
        filename=os.path.join(preprocessed_data, "X_train.hkl"),
    )
    urllib.request.urlretrieve(
        os.path.join(args.preprocessed_data, "X_val.hkl"),
        filename=os.path.join(preprocessed_data, "X_val.hkl"),
    )
    urllib.request.urlretrieve(
        os.path.join(args.preprocessed_data, "sources_train.hkl"),
        filename=os.path.join(preprocessed_data, "sources_train.hkl"),
    )
    urllib.request.urlretrieve(
        os.path.join(args.preprocessed_data, "sources_val.hkl"),
        filename=os.path.join(preprocessed_data, "sources_val.hkl"),
    )
# create a ./outputs folder in the compute target
# files saved in the "./outputs" folder are automatically uploaded into
# run history
model_dir = 'outputs/model'
os.makedirs(model_dir, exist_ok=True)
# initiate logging if we are running remotely
if remote_execution:
    print("Running on remote compute target:", remote_execution)
    print("azureml.core.VERSION", VERSION)
    # start an Azure ML run
    run = Run.get_context()
    run.log("learning_rate", learning_rate)
    run.log("lr_decay", lr_decay)
    run.log("stack_sizes", args.stack_sizes_arg)
    run.log("layer_loss_weights_arg", args.layer_loss_weights_arg)
    run.log("filter_sizes", args.filter_sizes)
    run.log("preprocessed_data", args.preprocessed_data)
    run.log("dataset", args.dataset)
    run.log("batch_size", batch_size)
    run.log("freeze_layers", args.freeze_layers)
    run.log("fine_tuning", args.fine_tuning)
# model parameters
A_filt_size, Ahat_filt_size, R_filt_size = filter_sizes
# format of video frames (size of input layer)
n_channels, im_height, im_width = (3, 152, 232)
# settings for sampling the video data
nt = 10 # number of timesteps used for sequences in training
N_seq_val = 15 # number of sequences to use for validation
# settings for training and optimization
optimizer_type = "adam"
min_delta = 1e-4
patience = 10
save_model = True # if weights will be saved
return_sequences = True
# define location for saving model and weights
weights_file = os.path.join(model_dir, "weights.hdf5")
json_file = os.path.join(model_dir, "model.json")
# Load data and source files
X_train_file = os.path.join(preprocessed_data, "X_train.hkl")
train_sources = os.path.join(preprocessed_data, "sources_train.hkl")
X_val_file = os.path.join(preprocessed_data, "X_val.hkl")
y_val_file = os.path.join(preprocessed_data, "y_val.hkl")
val_sources = os.path.join(preprocessed_data, "sources_val.hkl")
X_test_file = os.path.join(preprocessed_data, "X_test.hkl")
y_test_file = os.path.join(preprocessed_data, "y_test.hkl")
test_sources = os.path.join(preprocessed_data, "sources_test.hkl")
if fine_tuning:
    print("Performing transfer learning.")
    from azureml.core import Workspace
    from azureml.core.authentication import ServicePrincipalAuthentication
    from azureml.core.model import Model
    from azureml.exceptions._azureml_exception import ModelNotFoundException

    run = Run.get_context()
    ws = run.experiment.workspace
    keyvault = ws.get_default_keyvault()
    tenant_id = keyvault.get_secret("tenantId")
    svc_pr_id = keyvault.get_secret("servicePrincipalId")
    svc_pr_passwd = keyvault.get_secret("servicePrincipalPassword")
    svc_pr = ServicePrincipalAuthentication(
        tenant_id=tenant_id,
        service_principal_id=svc_pr_id,
        service_principal_password=svc_pr_passwd,
    )
    ws = Workspace(ws.subscription_id, ws.resource_group, ws.name, auth=svc_pr)
    try:
        model_root = Model.get_model_path("prednet_" + dataset, _workspace=ws)
        print("model_root:", model_root)
    except ModelNotFoundException:
        print(
            "Didn't find a model for this dataset (%s). "
            "Looking for a model for UCSDped1 (where it all started), "
            "because transfer learning was requested." % dataset
        )
        try:
            model_root = Model.get_model_path(
                "prednet_UCSDped1", _workspace=ws
            )
            print("model_root:", model_root)
        except ModelNotFoundException as e:
            print(
                "Warning! No model found in the registry, cannot perform "
                "transfer learning!"
            )
            print(e)
            fine_tuning = False
# if fine_tuning:
#     # load model from json file
#     # todo, this is going to the real one
#     json_file = open(os.path.join(model_root, "model.json"), "r")
#     model_json = json_file.read()
#     json_file.close()
#     # initialize model
#     trained_model = model_from_json(
#         model_json, custom_objects={"PredNet": PredNet}
#     )
#     trained_model.output_mode = output_mode
#     trained_model.return_sequences = return_sequences
#     # load weights into new model
#     trained_model.load_weights(os.path.join(model_root, "weights.hdf5"))
#     # retrieve configuration of prednet.
#     # all prednet layers are weirdly stored in the second layer of the
#     # overall trained_model
#     layer_config = trained_model.layers[1].get_config()
#     # set output_mode to error, just to be safe
#     layer_config["output_mode"] = output_mode
#     # create instance of prednet model with the above weights and our
#     # layer_config
#     prednet = PredNet(
#         weights=trained_model.layers[1].get_weights(), **layer_config
#     )
#     # determine shape of input and define tensor for input
#     input_shape = list(trained_model.layers[0].batch_input_shape)[1:]
#     inputs = Input(shape=tuple(input_shape))
#     # get number of layers and expected length of video sequences
#     nb_layers = len(trained_model.layers) - 1
#     nt = input_shape[0]
# else:
# Set model characteristics according to our settings
# 4 layer architecture, with 3 input channels (rgb), and 48, 96, 192 units
# in the deep layers
stack_sizes = (n_channels,) + stack_sizes_arg
nb_layers = len(stack_sizes) # number of layers
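# e.g. with the default stack sizes this gives stack_sizes == (3, 48, 96, 192)
# and nb_layers == 4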
input_shape = (
    (n_channels, im_height, im_width)
    if keras.backend.image_data_format() == "channels_first"
    else (im_height, im_width, n_channels)
)
# number of channels in the representation modules
R_stack_sizes = stack_sizes
# length is len(stack_sizes) - 1; here, targets for layers 2-4 are
# computed by 3x3 convolutions of the errors from the layer below
A_filt_sizes = (A_filt_size,) * (nb_layers - 1)
# convolutions of the representation layer for computing the predictions
# in each layer
Ahat_filt_sizes = (Ahat_filt_size,) * nb_layers
# filter sizes for the representation modules
R_filt_sizes = (R_filt_size,) * nb_layers
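# e.g. with the default filter size of 3 and nb_layers == 4:
# A_filt_sizes == (3, 3, 3) and Ahat_filt_sizes == R_filt_sizes == (3, 3, 3, 3)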
# build the model (see prednet.py)
prednet = PredNet(
    stack_sizes,
    R_stack_sizes,
    A_filt_sizes,
    Ahat_filt_sizes,
    R_filt_sizes,
    output_mode=output_mode,
    return_sequences=return_sequences,
)
# define tf tensor for inputs
inputs = Input(shape=(nt,) + input_shape)
# Set up how much the error in each video frame in a sequence is taken
# into account
# equally weight all timesteps except the first (done in next line)
time_loss_weights = 1.0 / (nt - 1) * np.ones((nt, 1))
# we obviously don't blame the model for not being able to predict the
# first video frame in a sequence
time_loss_weights[0] = 0
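# e.g. with nt == 10: time_loss_weights == [[0.0], [1/9], ..., [1/9]], so the
# nine remaining frames are weighted equally and the weights sum to 1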
# Set up how much the error in each layer is taken into account during
# training
# weighting for each layer in final loss; "L_0" model: [1, 0, 0, 0],
# "L_all": [1, 0.1, 0.1, 0.1]
layer_loss_weights = np.array(
    [layer_loss_weights_arg[0]]
    + [layer_loss_weights_arg[1]] * (nb_layers - 1)
)
layer_loss_weights = np.expand_dims(layer_loss_weights, 1)
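# with the default "1.0, 0." and nb_layers == 4 this is the "L_0" weighting:
# layer_loss_weights == [[1.0], [0.0], [0.0], [0.0]], shape (nb_layers, 1)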
# errors will be (batch_size, nt, nb_layers)
errors = prednet(inputs)
errors_by_time = TimeDistributed(
    Dense(1, trainable=False),
    weights=[layer_loss_weights, np.zeros(1)],
    trainable=False,
)(errors)  # calculate weighted error by layer
errors_by_time = Flatten()(errors_by_time)  # will be (batch_size, nt)
final_errors = Dense(
    1, weights=[time_loss_weights, np.zeros(1)], trainable=False
)(errors_by_time)  # weight errors by time
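# Shape walk-through: errors is (batch_size, nt, nb_layers), the frozen
# Dense + Flatten reduce it to (batch_size, nt), and the final frozen Dense
# yields (batch_size, 1), i.e. one scalar loss input per sequence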
loss_type = "mean_absolute_error"
# define optimizer
optimizer = Adam(lr=learning_rate, decay=lr_decay)
# put it all together
model = keras.models.Model(inputs=inputs, outputs=final_errors)
model.compile(loss=loss_type, optimizer=optimizer)
if fine_tuning:
    model.load_weights(
        os.path.join(model_root, model_dir, "weights.hdf5"),
        by_name=True,
        skip_mismatch=True,
    )
# add early stopping
callbacks = [
    EarlyStopping(
        monitor="val_loss",
        min_delta=min_delta,
        patience=patience,
        verbose=0,
        mode="min",
    )
]
if save_model:
    callbacks.append(
        ModelCheckpoint(
            filepath=weights_file, monitor="val_loss", save_best_only=True
        )
    )
# log training results to a file
callbacks.append(
    CSVLogger(
        filename="train.log",
        separator=",",
        append=False,
    )
)
# log progress to AML workspace
if remote_execution:
    class LogRunMetrics(Callback):
        # callback invoked at the end of every epoch
        def on_epoch_end(self, epoch, log):
            # logging the same metric name repeatedly creates a list
            # in the AML run history
            run.log("val_loss", log["val_loss"])
            run.log("loss", log["loss"])

    callbacks.append(LogRunMetrics())
# if any layers are to be frozen, this is where it happens
if len(freeze_layers) > 0:
    print("freezing layers:", freeze_layers)
    for freeze_layer in freeze_layers:
        # iterate over the conv modules at each depth
        for c in ["i", "f", "c", "o", "a", "ahat"]:
            try:
                model.layers[1].conv_layers[c][freeze_layer].trainable = False
            except IndexError:
                if c == "a":  # the deepest layer doesn't have an A layer
                    pass
                else:
                    print(
                        "You seem to be interested in freezing a layer that "
                        "is deeper than the deepest layer of the model. "
                        "What's that supposed to do for you?"
                    )
                    raise
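# Note (based on the PredNet implementation in models/prednet/prednet.py):
# "i", "f", "c", "o" index the ConvLSTM gate convolutions of the
# representation module, while "a" and "ahat" are the target and prediction
# convolutions; the deepest layer has no "a" convolution, hence the
# IndexError handling above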
# object for generating video sequences for training and validation
train_generator = SequenceGenerator(
    X_train_file, train_sources, nt, batch_size=batch_size, shuffle=True
)
val_generator = SequenceGenerator(
    X_val_file, val_sources, nt, batch_size=batch_size, N_seq=N_seq_val
)
# train the model
history = model.fit(
    x=train_generator,
    steps_per_epoch=samples_per_epoch // batch_size,
    epochs=nb_epoch,
    callbacks=callbacks,
    validation_data=val_generator,
    validation_steps=N_seq_val // batch_size,
)
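# with the defaults this trains for steps_per_epoch == 10 // 4 == 2 and
# validates for validation_steps == 15 // 4 == 3 batches per epoch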
if remote_execution:
    # after training is complete, we log the final loss on the validation set
    run.log("final_val_loss", history.history["val_loss"][-1])
# create a figure of how the loss changed over the course of training for
# the validation and training sets
plt.figure(figsize=(6, 3))
plt.title("({} epochs)".format(nb_epoch), fontsize=14)
plt.plot(
    history.history["val_loss"], "b-", label="Validation Loss", lw=4, alpha=0.5
)
plt.plot(history.history["loss"], "g-", label="Train Loss", lw=4, alpha=0.5)
plt.legend(fontsize=12)
plt.grid(True)
# log this plot to the aml workspace so we can see it in the azure portal
if remote_execution:
    run.log_image("Validation Loss", plot=plt)
else:
    plt.savefig("val_log.png")
plt.close()
# serialize NN architecture to JSON
model_json = model.to_json()
# save model JSON
with open(os.path.join(model_dir, "model.json"), "w") as f:
    f.write(model_json)