From cb663df0750e172665a62276aae0221a8096b6e9 Mon Sep 17 00:00:00 2001 From: tanzhenyu Date: Tue, 16 Jun 2020 15:48:52 -0700 Subject: [PATCH 1/4] Add BoxEncoder for SSD and FasterRCNN. --- kerascv/layers/ssd_box_encoder.py | 94 ++++++++++++++++++++ tests/kerascv/layers/ssd_box_encoder_test.py | 35 ++++++++ 2 files changed, 129 insertions(+) create mode 100644 kerascv/layers/ssd_box_encoder.py create mode 100644 tests/kerascv/layers/ssd_box_encoder_test.py diff --git a/kerascv/layers/ssd_box_encoder.py b/kerascv/layers/ssd_box_encoder.py new file mode 100644 index 0000000000..bd68c79652 --- /dev/null +++ b/kerascv/layers/ssd_box_encoder.py @@ -0,0 +1,94 @@ +import tensorflow as tf + + +class SSDBoxEncoder(tf.keras.layers.Layer): + """Defines a SSDBoxEncoder that converts encodes the ground_truth_boxes using anchors. + + Mathematically, the encoding is: + $ \hat{cx_gt} = (cx_gt - cx_a) / width_a + $ \hat{cy_gt} = (cy_gt - cy_a) / height_a + $ \hat{width_gt} = log(width_gt / width_a) + $ \hat{height_gt} = log(height_gt / height_a) + + where cx, cy, width, height represents center of width, center of height, width, height respectively, + and subscript `gt` represents ground truth box, `a` represents anchor. + + The `boxes` must have the same shape as `anchors`, this is typically the result of assigning + `ground_truth_boxes` to anchors based on a certain matching strategy (argmax, bipartite) + + # Attributes: + variances: The 1-D scaling factor with 4 floats. This is used to represent the variance of + [y_center, x_center, height, width] in Gaussian distribution when labeling the ground truth boxes. + During encoding, the result will be divided by the variances. During decoding, the result will be + multiplied by the variances. Defaults to `None` where no variance is applied. + The SSD paper uses [.1, .1, .2, .2]. 
+ invert: Boolean to indicate whether the layer should encode the `boxes`, i.e., convert from + [y_min, x_min, y_max, x_max] format to [ty, tx, h, w] format, if True, or the other way around, + if False. Defaults to 'False'. + + # References + [Wei Liu et al., 2015](https://arxiv.org/abs/1512.02325) + """ + + def __init__(self, variances=None, invert=False, name=None, **kwargs): + self.variances = variances + self.invert = invert + super(SSDBoxEncoder, self).__init__(name=name, **kwargs) + + def call(self, boxes, anchors): + def corner_to_centroids(box_tensor): + box_tensor = tf.cast(box_tensor, tf.float32) + y_min, x_min, y_max, x_max = tf.split( + box_tensor, num_or_size_splits=4, axis=-1 + ) + height = y_max - y_min + width = x_max - x_min + cy = y_min + 0.5 * height + cx = x_min + 0.5 * width + return ( + cy, + cx, + height + tf.keras.backend.epsilon(), + width + tf.keras.backend.epsilon(), + ) + + cy_a, cx_a, height_a, width_a = corner_to_centroids(anchors) + + if not self.invert: + cy_gt, cx_gt, height_gt, width_gt = corner_to_centroids(boxes) + ty = (cy_gt - cy_a) / height_a + tx = (cx_gt - cx_a) / width_a + th = tf.math.log(height_gt / height_a) + tw = tf.math.log(width_gt / width_a) + + if self.variances is not None: + ty = ty / tf.cast(self.variances[0], dtype=ty.dtype) + tx = tx / tf.cast(self.variances[1], dtype=tx.dtype) + th = th / tf.cast(self.variances[2], dtype=th.dtype) + tw = tw / tf.cast(self.variances[3], dtype=tw.dtype) + + return tf.concat([ty, tx, th, tw], axis=-1) + + else: + ty, tx, th, tw = tf.split(boxes, num_or_size_splits=4, axis=-1) + if self.variances is not None: + ty = ty * tf.cast(self.variances[0], dtype=ty.dtype) + tx = tx * tf.cast(self.variances[1], dtype=tx.dtype) + th = th * tf.cast(self.variances[2], dtype=th.dtype) + tw = tw * tf.cast(self.variances[3], dtype=tw.dtype) + + height_gt = tf.math.exp(th) * height_a + width_gt = tf.math.exp(tw) * width_a + cy_gt = ty * height_a + cy_a + cx_gt = tx * width_a + cx_a + y_min_gt = 
cy_gt - 0.5 * height_gt + y_max_gt = cy_gt + 0.5 * height_gt + x_min_gt = cx_gt - 0.5 * width_gt + x_max_gt = cx_gt + 0.5 * width_gt + + return tf.concat([y_min_gt, x_min_gt, y_max_gt, x_max_gt], axis=-1) + + def get_config(self): + config = {"variances": self.variances, "invert": self.invert} + base_config = super(SSDBoxEncoder, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/tests/kerascv/layers/ssd_box_encoder_test.py b/tests/kerascv/layers/ssd_box_encoder_test.py new file mode 100644 index 0000000000..652958ba9d --- /dev/null +++ b/tests/kerascv/layers/ssd_box_encoder_test.py @@ -0,0 +1,35 @@ +import numpy as np +from kerascv.layers.ssd_box_encoder import SSDBoxEncoder + + +def test_encode_decode_variance(): + gt_boxes = np.asarray([[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]], np.float32) + anchors = np.array([[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]], np.float32) + encode_layer = SSDBoxEncoder(variances=[0.5, 1 / 3, 0.25, 0.2]) + encoded_gt_boxes = encode_layer(gt_boxes, anchors) + expected_out = np.asarray( + [ + [-1.0, -1.25, -1.62186, -0.911608], + [-0.166667, -0.666667, -2.772588, -5.493062], + ] + ) + np.testing.assert_allclose(expected_out, encoded_gt_boxes, rtol=1e-06, atol=1e-6) + + decode_layer = SSDBoxEncoder(variances=[0.5, 1 / 3, 0.25, 0.2], invert=True) + decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors) + np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6) + + +def test_encode_decode_no_variance(): + gt_boxes = np.asarray([[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]], np.float32) + anchors = np.array([[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]], np.float32) + encode_layer = SSDBoxEncoder() + encoded_gt_boxes = encode_layer(gt_boxes, anchors) + expected_out = np.asarray( + [[-0.5, -0.41666, -0.40546, -0.18232], [-0.08333, -0.22222, -0.69314, -1.0986]] + ) + np.testing.assert_allclose(expected_out, encoded_gt_boxes, rtol=1e-05, atol=1e-5) + + 
decode_layer = SSDBoxEncoder(invert=True) + decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors) + np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6) From 37a99933aae25295fe68e2e588ad1dca44b49c6b Mon Sep 17 00:00:00 2001 From: tanzhenyu Date: Tue, 16 Jun 2020 15:50:39 -0700 Subject: [PATCH 2/4] [Object_Detection] Rename BoxEncoder to BoxCoder. --- .../layers/{ssd_box_encoder.py => ssd_box_coder.py} | 6 +++--- .../{ssd_box_encoder_test.py => ssd_box_coder_test.py} | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) rename kerascv/layers/{ssd_box_encoder.py => ssd_box_coder.py} (95%) rename tests/kerascv/layers/{ssd_box_encoder_test.py => ssd_box_coder_test.py} (81%) diff --git a/kerascv/layers/ssd_box_encoder.py b/kerascv/layers/ssd_box_coder.py similarity index 95% rename from kerascv/layers/ssd_box_encoder.py rename to kerascv/layers/ssd_box_coder.py index bd68c79652..28738db57c 100644 --- a/kerascv/layers/ssd_box_encoder.py +++ b/kerascv/layers/ssd_box_coder.py @@ -1,7 +1,7 @@ import tensorflow as tf -class SSDBoxEncoder(tf.keras.layers.Layer): +class SSDBoxCoder(tf.keras.layers.Layer): """Defines a SSDBoxEncoder that converts encodes the ground_truth_boxes using anchors. 
Mathematically, the encoding is: @@ -33,7 +33,7 @@ class SSDBoxEncoder(tf.keras.layers.Layer): def __init__(self, variances=None, invert=False, name=None, **kwargs): self.variances = variances self.invert = invert - super(SSDBoxEncoder, self).__init__(name=name, **kwargs) + super(SSDBoxCoder, self).__init__(name=name, **kwargs) def call(self, boxes, anchors): def corner_to_centroids(box_tensor): @@ -90,5 +90,5 @@ def corner_to_centroids(box_tensor): def get_config(self): config = {"variances": self.variances, "invert": self.invert} - base_config = super(SSDBoxEncoder, self).get_config() + base_config = super(SSDBoxCoder, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/tests/kerascv/layers/ssd_box_encoder_test.py b/tests/kerascv/layers/ssd_box_coder_test.py similarity index 81% rename from tests/kerascv/layers/ssd_box_encoder_test.py rename to tests/kerascv/layers/ssd_box_coder_test.py index 652958ba9d..9423c1416a 100644 --- a/tests/kerascv/layers/ssd_box_encoder_test.py +++ b/tests/kerascv/layers/ssd_box_coder_test.py @@ -1,11 +1,11 @@ import numpy as np -from kerascv.layers.ssd_box_encoder import SSDBoxEncoder +from kerascv.layers.ssd_box_coder import SSDBoxCoder def test_encode_decode_variance(): gt_boxes = np.asarray([[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]], np.float32) anchors = np.array([[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]], np.float32) - encode_layer = SSDBoxEncoder(variances=[0.5, 1 / 3, 0.25, 0.2]) + encode_layer = SSDBoxCoder(variances=[0.5, 1 / 3, 0.25, 0.2]) encoded_gt_boxes = encode_layer(gt_boxes, anchors) expected_out = np.asarray( [ @@ -15,7 +15,7 @@ def test_encode_decode_variance(): ) np.testing.assert_allclose(expected_out, encoded_gt_boxes, rtol=1e-06, atol=1e-6) - decode_layer = SSDBoxEncoder(variances=[0.5, 1 / 3, 0.25, 0.2], invert=True) + decode_layer = SSDBoxCoder(variances=[0.5, 1 / 3, 0.25, 0.2], invert=True) decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors) 
np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6) @@ -23,13 +23,13 @@ def test_encode_decode_variance(): def test_encode_decode_no_variance(): gt_boxes = np.asarray([[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]], np.float32) anchors = np.array([[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]], np.float32) - encode_layer = SSDBoxEncoder() + encode_layer = SSDBoxCoder() encoded_gt_boxes = encode_layer(gt_boxes, anchors) expected_out = np.asarray( [[-0.5, -0.41666, -0.40546, -0.18232], [-0.08333, -0.22222, -0.69314, -1.0986]] ) np.testing.assert_allclose(expected_out, encoded_gt_boxes, rtol=1e-05, atol=1e-5) - decode_layer = SSDBoxEncoder(invert=True) + decode_layer = SSDBoxCoder(invert=True) decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors) np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6) From c7bdc8f9f0f7a806e625b0498ab1fb1c79724a76 Mon Sep 17 00:00:00 2001 From: tanzhenyu Date: Tue, 16 Jun 2020 16:19:18 -0700 Subject: [PATCH 3/4] Additional comment in docstring. --- kerascv/layers/ssd_box_coder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kerascv/layers/ssd_box_coder.py b/kerascv/layers/ssd_box_coder.py index 28738db57c..2a058dfe35 100644 --- a/kerascv/layers/ssd_box_coder.py +++ b/kerascv/layers/ssd_box_coder.py @@ -19,8 +19,8 @@ class SSDBoxCoder(tf.keras.layers.Layer): # Attributes: variances: The 1-D scaling factor with 4 floats. This is used to represent the variance of [y_center, x_center, height, width] in Gaussian distribution when labeling the ground truth boxes. - During encoding, the result will be divided by the variances. During decoding, the result will be - multiplied by the variances. Defaults to `None` where no variance is applied. + During encoding, the result will be divided, i.e., normalized by the variances. During decoding, the result + will be multiplied, i.e., denormalized by the variances. Defaults to `None` where no variance is applied. 
The SSD paper uses [.1, .1, .2, .2]. invert: Boolean to indicate whether the layer should encode the `boxes`, i.e., convert from [y_min, x_min, y_max, x_max] format to [ty, tx, h, w] format, if True, or the other way around, From fb7463d1099544bd9c4b436878d0868bebeb2140 Mon Sep 17 00:00:00 2001 From: tanzhenyu Date: Fri, 19 Jun 2020 15:32:33 -0700 Subject: [PATCH 4/4] [Object_Detection] Address comments for SSDBoxCoder. --- kerascv/layers/ssd_box_coder.py | 93 ++++++++++++++-------- tests/kerascv/layers/ssd_box_coder_test.py | 17 +++- 2 files changed, 77 insertions(+), 33 deletions(-) diff --git a/kerascv/layers/ssd_box_coder.py b/kerascv/layers/ssd_box_coder.py index 2a058dfe35..3adb3f8de6 100644 --- a/kerascv/layers/ssd_box_coder.py +++ b/kerascv/layers/ssd_box_coder.py @@ -2,36 +2,63 @@ class SSDBoxCoder(tf.keras.layers.Layer): - """Defines a SSDBoxEncoder that converts encodes the ground_truth_boxes using anchors. + """Defines a SSDBoxCoder that converts ground_truth_boxes using anchors. - Mathematically, the encoding is: - $ \hat{cx_gt} = (cx_gt - cx_a) / width_a - $ \hat{cy_gt} = (cy_gt - cy_a) / height_a - $ \hat{width_gt} = log(width_gt / width_a) - $ \hat{height_gt} = log(height_gt / height_a) + Mathematically, the encoding result is: + ty = (cy_gt - cy_a) / height_a + tx = (cx_gt - cx_a) / width_a + th = log(height_gt / height_a) + tw = log(width_gt / width_a) - where cx, cy, width, height represents center of width, center of height, width, height respectively, - and subscript `gt` represents ground truth box, `a` represents anchor. + where cx, cy, width, height represents center of width, center of height, + width, height respectively, and subscript `gt` represents ground truth box, + `a` represents anchor. 
- The `boxes` must have the same shape as `anchors`, this is typically the result of assigning - `ground_truth_boxes` to anchors based on a certain matching strategy (argmax, bipartite) + The `boxes` must have the same shape as `anchors`, this is typically the result + of assigning `ground_truth_boxes` to anchors based on a certain matching + strategy (argmax, bipartite) # Attributes: - variances: The 1-D scaling factor with 4 floats. This is used to represent the variance of - [y_center, x_center, height, width] in Gaussian distribution when labeling the ground truth boxes. - During encoding, the result will be divided, i.e., normalized by the variances. During decoding, the result - will be multiplied, i.e., denormalized by the variances. Defaults to `None` where no variance is applied. - The SSD paper uses [.1, .1, .2, .2]. - invert: Boolean to indicate whether the layer should encode the `boxes`, i.e., convert from - [y_min, x_min, y_max, x_max] format to [ty, tx, h, w] format, if True, or the other way around, - if False. Defaults to 'False'. + center_variances: The 1-D scaling factor with 2 floats. This is used to + represent the variance of center of height and center of width in + Gaussian distribution when labeling the ground truth boxes. + During encoding, the result [ty, tx] will be divided, i.e., normalized + by the variances. During decoding, the result will be multiplied, i.e., + denormalized by the variances. Defaults to `None` where no variance is + applied. The SSD paper uses [.1, .1]. + size_variances: The 1-D scaling factor with 2 floats. This is used to + represent the variance of height and width in Gaussian distribution when + labeling the ground truth boxes. During encoding, the result [th, tw] + will be divided, i.e., normalized by the variances. During decoding, the + result will be multiplied, i.e., denormalized by the variances. Defaults + to `None` where no variance is applied. The SSD paper uses [.2, .2]. 
+ invert: Boolean to indicate whether the layer should decode the `boxes`, + i.e., convert from [ty, tx, th, tw] format to [y_min, x_min, y_max, x_max] + format, if True, or the other way around, if False. Defaults to 'False'. # References [Wei Liu et al., 2015](https://arxiv.org/abs/1512.02325) """ - def __init__(self, variances=None, invert=False, name=None, **kwargs): - self.variances = variances + def __init__( + self, + center_variances=None, + size_variances=None, + invert=False, + name=None, + **kwargs + ): + if center_variances is not None and size_variances is not None: + self.center_variances = center_variances + self.size_variances = size_variances + elif center_variances is not None or size_variances is not None: + raise ValueError( + "`center_variances` and `size_variances` should both be None or " + "tuple of floats, got {}, {}".format(center_variances, size_variances) + ) + else: + self.center_variances = None + self.size_variances = None self.invert = invert super(SSDBoxCoder, self).__init__(name=name, **kwargs) @@ -61,21 +88,21 @@ def corner_to_centroids(box_tensor): th = tf.math.log(height_gt / height_a) tw = tf.math.log(width_gt / width_a) - if self.variances is not None: - ty = ty / tf.cast(self.variances[0], dtype=ty.dtype) - tx = tx / tf.cast(self.variances[1], dtype=tx.dtype) - th = th / tf.cast(self.variances[2], dtype=th.dtype) - tw = tw / tf.cast(self.variances[3], dtype=tw.dtype) + if self.center_variances is not None: + ty = ty / tf.cast(self.center_variances[0], dtype=ty.dtype) + tx = tx / tf.cast(self.center_variances[1], dtype=tx.dtype) + th = th / tf.cast(self.size_variances[0], dtype=th.dtype) + tw = tw / tf.cast(self.size_variances[1], dtype=tw.dtype) return tf.concat([ty, tx, th, tw], axis=-1) else: ty, tx, th, tw = tf.split(boxes, num_or_size_splits=4, axis=-1) - if self.variances is not None: - ty = ty * tf.cast(self.variances[0], dtype=ty.dtype) - tx = tx * tf.cast(self.variances[1], dtype=tx.dtype) - th = th * 
tf.cast(self.variances[2], dtype=th.dtype) - tw = tw * tf.cast(self.variances[3], dtype=tw.dtype) + if self.center_variances is not None: + ty = ty * tf.cast(self.center_variances[0], dtype=ty.dtype) + tx = tx * tf.cast(self.center_variances[1], dtype=tx.dtype) + th = th * tf.cast(self.size_variances[0], dtype=th.dtype) + tw = tw * tf.cast(self.size_variances[1], dtype=tw.dtype) height_gt = tf.math.exp(th) * height_a width_gt = tf.math.exp(tw) * width_a @@ -89,6 +116,10 @@ def corner_to_centroids(box_tensor): return tf.concat([y_min_gt, x_min_gt, y_max_gt, x_max_gt], axis=-1) def get_config(self): - config = {"variances": self.variances, "invert": self.invert} + config = { + "center_variances": self.center_variances, + "size_variances": self.size_variances, + "invert": self.invert, + } base_config = super(SSDBoxCoder, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/tests/kerascv/layers/ssd_box_coder_test.py b/tests/kerascv/layers/ssd_box_coder_test.py index 9423c1416a..cb9d03b768 100644 --- a/tests/kerascv/layers/ssd_box_coder_test.py +++ b/tests/kerascv/layers/ssd_box_coder_test.py @@ -5,7 +5,9 @@ def test_encode_decode_variance(): gt_boxes = np.asarray([[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]], np.float32) anchors = np.array([[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]], np.float32) - encode_layer = SSDBoxCoder(variances=[0.5, 1 / 3, 0.25, 0.2]) + encode_layer = SSDBoxCoder( + center_variances=[0.5, 1 / 3], size_variances=[0.25, 0.2] + ) encoded_gt_boxes = encode_layer(gt_boxes, anchors) expected_out = np.asarray( [ @@ -15,7 +17,9 @@ def test_encode_decode_variance(): ) np.testing.assert_allclose(expected_out, encoded_gt_boxes, rtol=1e-06, atol=1e-6) - decode_layer = SSDBoxCoder(variances=[0.5, 1 / 3, 0.25, 0.2], invert=True) + decode_layer = SSDBoxCoder( + center_variances=[0.5, 1 / 3], size_variances=[0.25, 0.2], invert=True + ) decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors) 
np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6) @@ -33,3 +37,12 @@ def test_encode_decode_no_variance(): decode_layer = SSDBoxCoder(invert=True) decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors) np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6) + + +def test_config_with_custom_name(): + layer = SSDBoxCoder( + center_variances=[0.1, 0.1], size_variances=[0.2, 0.2], name="box_coder" + ) + config = layer.get_config() + layer_1 = SSDBoxCoder.from_config(config) + np.testing.assert_equal(layer_1.name, layer.name)