This repository has been archived by the owner on Jul 5, 2021. It is now read-only.

Adding TensorFlow summary metrics support for TensorBoard #166

Open
wants to merge 1 commit into base: master
2 changes: 2 additions & 0 deletions .gitignore
@@ -13,3 +13,5 @@ loss_vs_epochs.png
colour_codes.csv
CamVid_train.tfrecords
tfrecords.py
.idea
**/events.out.tfevents.*
18 changes: 16 additions & 2 deletions docs/README.md
@@ -155,19 +155,22 @@ Wall,64, 192, 0
Then you can simply run `train.py`! Check out the optional command line arguments:

```
usage: train.py [-h] [--num_epochs NUM_EPOCHS]
usage: train.py [-h] [--num_epochs NUM_EPOCHS] [--epoch_start_i EPOCH_START_I]
[--checkpoint_step CHECKPOINT_STEP]
[--validation_step VALIDATION_STEP] [--image IMAGE]
[--continue_training CONTINUE_TRAINING] [--dataset DATASET]
[--crop_height CROP_HEIGHT] [--crop_width CROP_WIDTH]
[--batch_size BATCH_SIZE] [--num_val_images NUM_VAL_IMAGES]
[--h_flip H_FLIP] [--v_flip V_FLIP] [--brightness BRIGHTNESS]
[--rotation ROTATION] [--model MODEL] [--frontend FRONTEND]
[--train_dir TRAIN_DIR]

optional arguments:
-h, --help show this help message and exit
--num_epochs NUM_EPOCHS
Number of epochs to train for
--epoch_start_i EPOCH_START_I
Start counting epochs from this number
--checkpoint_step CHECKPOINT_STEP
How often to save checkpoints (epochs)
--validation_step VALIDATION_STEP
@@ -201,9 +204,20 @@ optional arguments:
supported models
--frontend FRONTEND The frontend you are using. See frontend_builder.py
for supported models
--train_dir TRAIN_DIR
The directory in which training artifacts will be
stored.

```
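
For example, an invocation that exercises the new `--train_dir` flag might look like this (the dataset and hyper-parameter values below are illustrative; adjust them to your setup):

```
python train.py --model FC-DenseNet56 --dataset CamVid --num_epochs 100 --batch_size 1 --train_dir train
```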

### TensorBoard
You can visualize several training metrics using TensorBoard, as shown below:

![Alt text](images/tensorboard-metrics.jpg?raw=true "Training Metrics")

You can also visualize the network's outputs on part of the validation dataset across epochs. For each image, TensorBoard shows input | prediction | ground truth side by side.

![Alt text](images/tensorboard-images.jpg?raw=true "Validation Outputs")
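
Since `train.py` writes its event files to `<train_dir>/train` (via `tf.summary.FileWriter`), you can point TensorBoard at that directory. With the default `--train_dir` of `train`, a typical launch looks like:

```
tensorboard --logdir train/train
```

TensorBoard then serves the dashboards shown above at `http://localhost:6006` by default.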

## Results

Binary file added docs/images/tensorboard-images.jpg
Binary file added docs/images/tensorboard-metrics.jpg
26 changes: 26 additions & 0 deletions requirements.txt
@@ -0,0 +1,26 @@
absl-py==0.6.1
astor==0.7.1
certifi==2018.8.24
cycler==0.10.0
gast==0.2.0
grpcio==1.16.1
h5py==2.8.0
Keras-Applications==1.0.6
Keras-Preprocessing==1.0.5
kiwisolver==1.0.1
Markdown==3.0.1
matplotlib==3.0.2
numpy==1.15.4
opencv-python==3.4.4.19
Pillow==5.3.0
protobuf==3.6.1
pyparsing==2.3.0
python-dateutil==2.7.5
scikit-learn==0.20.1
scipy==1.1.0
six==1.11.0
sklearn==0.0
tensorboard==1.12.0
tensorflow-gpu==1.12.0
termcolor==1.1.0
Werkzeug==0.14.1
105 changes: 89 additions & 16 deletions train.py
@@ -45,6 +45,7 @@ def str2bool(v):
parser.add_argument('--rotation', type=float, default=None, help='Whether to randomly rotate the image for data augmentation. Specifies the max rotation angle in degrees.')
parser.add_argument('--model', type=str, default="FC-DenseNet56", help='The model you are using. See model_builder.py for supported models')
parser.add_argument('--frontend', type=str, default="ResNet101", help='The frontend you are using. See frontend_builder.py for supported models')
parser.add_argument('--train_dir', type=str, default="train", help='The directory in which training artifacts will be stored.')
args = parser.parse_args()


@@ -96,7 +97,8 @@ def data_augmentation(input_image, output_image):

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=network, labels=net_output))

opt = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.995).minimize(loss, var_list=[var for var in tf.trainable_variables()])
global_step = tf.Variable(0, name='global_step', trainable=False)
opt = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.995).minimize(loss, global_step=global_step, var_list=[var for var in tf.trainable_variables()])

saver=tf.train.Saver(max_to_keep=1000)
sess.run(tf.global_variables_initializer())
@@ -109,7 +111,7 @@ def data_augmentation(input_image, output_image):
init_fn(sess)

# Load a previous checkpoint if desired
model_checkpoint_name = "checkpoints/latest_model_" + args.model + "_" + args.dataset + ".ckpt"
model_checkpoint_name = args.train_dir + "/checkpoints/latest_model_" + args.model + "_" + args.dataset + ".ckpt"
if args.continue_training:
print('Loaded latest model checkpoint')
saver.restore(sess, model_checkpoint_name)
@@ -149,6 +151,52 @@ def data_augmentation(input_image, output_image):
random.seed(16)
val_indices=random.sample(range(0,len(val_input_names)),num_vals)


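# Lightweight wrappers that pair a feed placeholder with its summary op, so
# epoch-level values computed in Python can be written to TensorBoard.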
class ImageSummary:
def __init__(self, placeholder):
self.summary = tf.summary.image(placeholder.op.name, placeholder)
self.tensor = placeholder


class ScalarSummary:
def __init__(self, placeholder):
self.summary = tf.summary.scalar(placeholder.op.name, placeholder)
self.tensor = placeholder

summary_train_loss = ScalarSummary(tf.placeholder(tf.float32, name='epoch_train_loss'))
train_summaries = tf.summary.merge([summary_train_loss.summary])

summary_class_accuracies = []
with tf.name_scope('epoch_per_class_metrics'):
for class_name in class_names_list:
tensor = tf.placeholder(tf.float32, name='epoch_{}_class_accuracy_avg_score'.format(class_name))
summary_class_accuracies.append(ScalarSummary(tensor))

summary_epoch_avg_accuracy = ScalarSummary(tf.placeholder(tf.float32, name='epoch_avg_accuracy'))
summary_avg_precision = ScalarSummary(tf.placeholder(tf.float32, name='epoch_avg_precision'))
summary_avg_recall = ScalarSummary(tf.placeholder(tf.float32, name='epoch_avg_recall'))
summary_avg_f1 = ScalarSummary(tf.placeholder(tf.float32, name='epoch_avg_f1'))
summary_avg_iou = ScalarSummary(tf.placeholder(tf.float32, name='epoch_avg_iou'))

summaries = [
summary_epoch_avg_accuracy,
summary_avg_precision,
summary_avg_recall,
summary_avg_f1,
summary_avg_iou,
*summary_class_accuracies
]

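# One image summary per sampled validation image; each receives an
# input | prediction | ground truth strip during validation.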
validation_output_images_summaries = dict()
for index in val_indices:
image_name = utils.filepath_to_name(val_input_names[index])
summary = ImageSummary(tf.placeholder(tf.float32, shape=[None, None, None, 3], name=image_name))
validation_output_images_summaries[image_name] = summary

validation_summaries = tf.summary.merge([s.summary for s in summaries])

train_writer = tf.summary.FileWriter(args.train_dir + '/train', sess.graph)

# Do the training here
for epoch in range(args.epoch_start_i, args.num_epochs):

@@ -194,7 +242,7 @@ def data_augmentation(input_image, output_image):
output_image_batch = np.squeeze(np.stack(output_image_batch, axis=1))

# Do the training
_,current=sess.run([opt,loss],feed_dict={net_input:input_image_batch,net_output:output_image_batch})
_, current = sess.run([opt,loss],feed_dict={net_input:input_image_batch,net_output:output_image_batch})
current_losses.append(current)
cnt = cnt + args.batch_size
if cnt % 20 == 0:
@@ -205,22 +253,27 @@ def data_augmentation(input_image, output_image):
mean_loss = np.mean(current_losses)
avg_loss_per_epoch.append(mean_loss)

summary = sess.run(train_summaries, feed_dict={
summary_train_loss.tensor: mean_loss
})
train_writer.add_summary(summary, epoch)

# Create directories if needed
if not os.path.isdir("%s/%04d"%("checkpoints",epoch)):
os.makedirs("%s/%04d"%("checkpoints",epoch))
if not os.path.isdir("%s/%s/%04d" % (args.train_dir, "checkpoints",epoch)):
os.makedirs("%s/%s/%04d" % (args.train_dir, "checkpoints",epoch))

# Save latest checkpoint to same file name
print("Saving latest checkpoint")
saver.save(sess,model_checkpoint_name)

if val_indices != 0 and epoch % args.checkpoint_step == 0:
print("Saving checkpoint for this epoch")
saver.save(sess,"%s/%04d/model.ckpt"%("checkpoints",epoch))
saver.save(sess,"%s/%s/%04d/model.ckpt" % (args.train_dir, "checkpoints",epoch))


if epoch % args.validation_step == 0:
print("Performing validation")
target=open("%s/%04d/val_scores.csv"%("checkpoints",epoch),'w')
target=open("%s/%s/%04d/val_scores.csv" % (args.train_dir, "checkpoints",epoch),'w')
target.write("val_name, avg_accuracy, precision, recall, f1 score, mean iou, %s\n" % (class_names_string))


@@ -234,16 +287,15 @@ def data_augmentation(input_image, output_image):

# Do the validation on a small set of validation images
for ind in val_indices:

input_image = np.expand_dims(np.float32(utils.load_image(val_input_names[ind])[:args.crop_height, :args.crop_width]),axis=0)/255.0
input_image_rgb = np.float32(utils.load_image(val_input_names[ind])[:args.crop_height, :args.crop_width])
input_image = np.expand_dims(input_image_rgb,axis=0)/255.0
gt = utils.load_image(val_output_names[ind])[:args.crop_height, :args.crop_width]
gt = helpers.reverse_one_hot(helpers.one_hot_it(gt, label_values))

# st = time.time()

output_image = sess.run(network,feed_dict={net_input:input_image})


output_image = np.array(output_image[0,:,:,:])
output_image = helpers.reverse_one_hot(output_image)
out_vis_image = helpers.colour_code_segmentation(output_image, label_values)
@@ -265,10 +317,18 @@ def data_augmentation(input_image, output_image):

gt = helpers.colour_code_segmentation(gt, label_values)

image_summary = validation_output_images_summaries[file_name]
input_pred_golden_side_by_side = np.concatenate((input_image_rgb, out_vis_image, gt), axis=1)
summary = sess.run(image_summary.summary, feed_dict={
net_input: input_image,
image_summary.tensor: np.expand_dims(input_pred_golden_side_by_side, axis=0),
})
train_writer.add_summary(summary, epoch)

file_name = os.path.basename(val_input_names[ind])
file_name = os.path.splitext(file_name)[0]
cv2.imwrite("%s/%04d/%s_pred.png"%("checkpoints",epoch, file_name),cv2.cvtColor(np.uint8(out_vis_image), cv2.COLOR_RGB2BGR))
cv2.imwrite("%s/%04d/%s_gt.png"%("checkpoints",epoch, file_name),cv2.cvtColor(np.uint8(gt), cv2.COLOR_RGB2BGR))
cv2.imwrite("%s/%s/%04d/%s_pred.png"%(args.train_dir, "checkpoints",epoch, file_name),cv2.cvtColor(np.uint8(out_vis_image), cv2.COLOR_RGB2BGR))
cv2.imwrite("%s/%s/%04d/%s_gt.png"%(args.train_dir, "checkpoints",epoch, file_name),cv2.cvtColor(np.uint8(gt), cv2.COLOR_RGB2BGR))


target.close()
@@ -291,6 +351,19 @@ def data_augmentation(input_image, output_image):
print("Validation F1 score = ", avg_f1)
print("Validation IoU score = ", avg_iou)

feed_dict = {
summary_epoch_avg_accuracy.tensor: avg_score,
summary_avg_precision.tensor: avg_precision,
summary_avg_recall.tensor: avg_recall,
summary_avg_iou.tensor: avg_iou,
summary_avg_f1.tensor: avg_f1
}
for index, summary in enumerate(summary_class_accuracies):
feed_dict[summary.tensor] = class_avg_scores[index]

summary = sess.run(validation_summaries, feed_dict)
train_writer.add_summary(summary, epoch)

epoch_time=time.time()-epoch_st
remain_time=epoch_time*(args.num_epochs-1-epoch)
m, s = divmod(remain_time, 60)
@@ -305,7 +378,7 @@ def data_augmentation(input_image, output_image):

fig1, ax1 = plt.subplots(figsize=(11, 8))

ax1.plot(range(epoch+1), avg_scores_per_epoch)
ax1.plot(range((epoch + 1 - len(avg_scores_per_epoch)), epoch + 1), avg_scores_per_epoch)
ax1.set_title("Average validation accuracy vs epochs")
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Avg. val. accuracy")
@@ -317,7 +390,7 @@ def data_augmentation(input_image, output_image):

fig2, ax2 = plt.subplots(figsize=(11, 8))

ax2.plot(range(epoch+1), avg_loss_per_epoch)
ax2.plot(range((epoch + 1 - len(avg_loss_per_epoch)), epoch + 1), avg_loss_per_epoch)
ax2.set_title("Average loss vs epochs")
ax2.set_xlabel("Epoch")
ax2.set_ylabel("Current loss")
@@ -328,12 +401,12 @@ def data_augmentation(input_image, output_image):

fig3, ax3 = plt.subplots(figsize=(11, 8))

ax3.plot(range(epoch+1), avg_iou_per_epoch)
ax3.plot(range((epoch + 1 - len(avg_iou_per_epoch)), epoch + 1), avg_iou_per_epoch)
ax3.set_title("Average IoU vs epochs")
ax3.set_xlabel("Epoch")
ax3.set_ylabel("Current IoU")

plt.savefig('iou_vs_epochs.png')


train_writer.close()