From cb663df0750e172665a62276aae0221a8096b6e9 Mon Sep 17 00:00:00 2001
From: tanzhenyu <tanzheny@google.com>
Date: Tue, 16 Jun 2020 15:48:52 -0700
Subject: [PATCH 1/4] Add BoxEncoder for SSD and FasterRCNN.

---
 kerascv/layers/ssd_box_encoder.py            | 94 ++++++++++++++++++++
 tests/kerascv/layers/ssd_box_encoder_test.py | 35 ++++++++
 2 files changed, 129 insertions(+)
 create mode 100644 kerascv/layers/ssd_box_encoder.py
 create mode 100644 tests/kerascv/layers/ssd_box_encoder_test.py

diff --git a/kerascv/layers/ssd_box_encoder.py b/kerascv/layers/ssd_box_encoder.py
new file mode 100644
index 0000000000..bd68c79652
--- /dev/null
+++ b/kerascv/layers/ssd_box_encoder.py
@@ -0,0 +1,94 @@
+import tensorflow as tf
+
+
+class SSDBoxEncoder(tf.keras.layers.Layer):
+    """Defines a SSDBoxEncoder that converts encodes the ground_truth_boxes using anchors.
+
+    Mathematically, the encoding is:
+        $ \hat{cx_gt} = (cx_gt - cx_a) / width_a
+        $ \hat{cy_gt} = (cy_gt - cy_a) / height_a
+        $ \hat{width_gt} = log(width_gt / width_a)
+        $ \hat{height_gt} = log(height_gt / height_a)
+
+    where cx, cy, width, height represents center of width, center of height, width, height respectively,
+    and subscript `gt` represents ground truth box, `a` represents anchor.
+
+    The `boxes` must have the same shape as `anchors`, this is typically the result of assigning
+    `ground_truth_boxes` to anchors based on a certain matching strategy (argmax, bipartite)
+
+    # Attributes:
+        variances: The 1-D scaling factor with 4 floats. This is used to represent the variance of
+            [y_center, x_center, height, width] in Gaussian distribution when labeling the ground truth boxes.
+            During encoding, the result will be divided by the variances. During decoding, the result will be
+            multiplied by the variances. Defaults to `None` where no variance is applied.
+            The SSD paper uses [.1, .1, .2, .2].
+        invert: Boolean to indicate whether the layer should encode the `boxes`, i.e., convert from
+            [y_min, x_min, y_max, x_max] format to [ty, tx, h, w] format, if True, or the other way around,
+            if False. Defaults to 'False'.
+
+    # References
+        [Wei Liu et al., 2015](https://arxiv.org/abs/1512.02325)
+    """
+
+    def __init__(self, variances=None, invert=False, name=None, **kwargs):
+        self.variances = variances
+        self.invert = invert
+        super(SSDBoxEncoder, self).__init__(name=name, **kwargs)
+
+    def call(self, boxes, anchors):
+        def corner_to_centroids(box_tensor):
+            box_tensor = tf.cast(box_tensor, tf.float32)
+            y_min, x_min, y_max, x_max = tf.split(
+                box_tensor, num_or_size_splits=4, axis=-1
+            )
+            height = y_max - y_min
+            width = x_max - x_min
+            cy = y_min + 0.5 * height
+            cx = x_min + 0.5 * width
+            return (
+                cy,
+                cx,
+                height + tf.keras.backend.epsilon(),
+                width + tf.keras.backend.epsilon(),
+            )
+
+        cy_a, cx_a, height_a, width_a = corner_to_centroids(anchors)
+
+        if not self.invert:
+            cy_gt, cx_gt, height_gt, width_gt = corner_to_centroids(boxes)
+            ty = (cy_gt - cy_a) / height_a
+            tx = (cx_gt - cx_a) / width_a
+            th = tf.math.log(height_gt / height_a)
+            tw = tf.math.log(width_gt / width_a)
+
+            if self.variances is not None:
+                ty = ty / tf.cast(self.variances[0], dtype=ty.dtype)
+                tx = tx / tf.cast(self.variances[1], dtype=tx.dtype)
+                th = th / tf.cast(self.variances[2], dtype=th.dtype)
+                tw = tw / tf.cast(self.variances[3], dtype=tw.dtype)
+
+            return tf.concat([ty, tx, th, tw], axis=-1)
+
+        else:
+            ty, tx, th, tw = tf.split(boxes, num_or_size_splits=4, axis=-1)
+            if self.variances is not None:
+                ty = ty * tf.cast(self.variances[0], dtype=ty.dtype)
+                tx = tx * tf.cast(self.variances[1], dtype=tx.dtype)
+                th = th * tf.cast(self.variances[2], dtype=th.dtype)
+                tw = tw * tf.cast(self.variances[3], dtype=tw.dtype)
+
+            height_gt = tf.math.exp(th) * height_a
+            width_gt = tf.math.exp(tw) * width_a
+            cy_gt = ty * height_a + cy_a
+            cx_gt = tx * width_a + cx_a
+            y_min_gt = cy_gt - 0.5 * height_gt
+            y_max_gt = cy_gt + 0.5 * height_gt
+            x_min_gt = cx_gt - 0.5 * width_gt
+            x_max_gt = cx_gt + 0.5 * width_gt
+
+            return tf.concat([y_min_gt, x_min_gt, y_max_gt, x_max_gt], axis=-1)
+
+    def get_config(self):
+        config = {"variances": self.variances, "invert": self.invert}
+        base_config = super(SSDBoxEncoder, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
diff --git a/tests/kerascv/layers/ssd_box_encoder_test.py b/tests/kerascv/layers/ssd_box_encoder_test.py
new file mode 100644
index 0000000000..652958ba9d
--- /dev/null
+++ b/tests/kerascv/layers/ssd_box_encoder_test.py
@@ -0,0 +1,35 @@
+import numpy as np
+from kerascv.layers.ssd_box_encoder import SSDBoxEncoder
+
+
+def test_encode_decode_variance():
+    gt_boxes = np.asarray([[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]], np.float32)
+    anchors = np.array([[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]], np.float32)
+    encode_layer = SSDBoxEncoder(variances=[0.5, 1 / 3, 0.25, 0.2])
+    encoded_gt_boxes = encode_layer(gt_boxes, anchors)
+    expected_out = np.asarray(
+        [
+            [-1.0, -1.25, -1.62186, -0.911608],
+            [-0.166667, -0.666667, -2.772588, -5.493062],
+        ]
+    )
+    np.testing.assert_allclose(expected_out, encoded_gt_boxes, rtol=1e-06, atol=1e-6)
+
+    decode_layer = SSDBoxEncoder(variances=[0.5, 1 / 3, 0.25, 0.2], invert=True)
+    decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors)
+    np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6)
+
+
+def test_encode_decode_no_variance():
+    gt_boxes = np.asarray([[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]], np.float32)
+    anchors = np.array([[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]], np.float32)
+    encode_layer = SSDBoxEncoder()
+    encoded_gt_boxes = encode_layer(gt_boxes, anchors)
+    expected_out = np.asarray(
+        [[-0.5, -0.41666, -0.40546, -0.18232], [-0.08333, -0.22222, -0.69314, -1.0986]]
+    )
+    np.testing.assert_allclose(expected_out, encoded_gt_boxes, rtol=1e-05, atol=1e-5)
+
+    decode_layer = SSDBoxEncoder(invert=True)
+    decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors)
+    np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6)

From 37a99933aae25295fe68e2e588ad1dca44b49c6b Mon Sep 17 00:00:00 2001
From: tanzhenyu <tanzheny@google.com>
Date: Tue, 16 Jun 2020 15:50:39 -0700
Subject: [PATCH 2/4] [Object_Detection] Rename BoxEncoder to BoxCoder.

---
 .../layers/{ssd_box_encoder.py => ssd_box_coder.py}    |  6 +++---
 .../{ssd_box_encoder_test.py => ssd_box_coder_test.py} | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)
 rename kerascv/layers/{ssd_box_encoder.py => ssd_box_coder.py} (95%)
 rename tests/kerascv/layers/{ssd_box_encoder_test.py => ssd_box_coder_test.py} (81%)

diff --git a/kerascv/layers/ssd_box_encoder.py b/kerascv/layers/ssd_box_coder.py
similarity index 95%
rename from kerascv/layers/ssd_box_encoder.py
rename to kerascv/layers/ssd_box_coder.py
index bd68c79652..28738db57c 100644
--- a/kerascv/layers/ssd_box_encoder.py
+++ b/kerascv/layers/ssd_box_coder.py
@@ -1,7 +1,7 @@
 import tensorflow as tf
 
 
-class SSDBoxEncoder(tf.keras.layers.Layer):
+class SSDBoxCoder(tf.keras.layers.Layer):
     """Defines a SSDBoxEncoder that converts encodes the ground_truth_boxes using anchors.
 
     Mathematically, the encoding is:
@@ -33,7 +33,7 @@ class SSDBoxEncoder(tf.keras.layers.Layer):
     def __init__(self, variances=None, invert=False, name=None, **kwargs):
         self.variances = variances
         self.invert = invert
-        super(SSDBoxEncoder, self).__init__(name=name, **kwargs)
+        super(SSDBoxCoder, self).__init__(name=name, **kwargs)
 
     def call(self, boxes, anchors):
         def corner_to_centroids(box_tensor):
@@ -90,5 +90,5 @@ def corner_to_centroids(box_tensor):
 
     def get_config(self):
         config = {"variances": self.variances, "invert": self.invert}
-        base_config = super(SSDBoxEncoder, self).get_config()
+        base_config = super(SSDBoxCoder, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
diff --git a/tests/kerascv/layers/ssd_box_encoder_test.py b/tests/kerascv/layers/ssd_box_coder_test.py
similarity index 81%
rename from tests/kerascv/layers/ssd_box_encoder_test.py
rename to tests/kerascv/layers/ssd_box_coder_test.py
index 652958ba9d..9423c1416a 100644
--- a/tests/kerascv/layers/ssd_box_encoder_test.py
+++ b/tests/kerascv/layers/ssd_box_coder_test.py
@@ -1,11 +1,11 @@
 import numpy as np
-from kerascv.layers.ssd_box_encoder import SSDBoxEncoder
+from kerascv.layers.ssd_box_coder import SSDBoxCoder
 
 
 def test_encode_decode_variance():
     gt_boxes = np.asarray([[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]], np.float32)
     anchors = np.array([[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]], np.float32)
-    encode_layer = SSDBoxEncoder(variances=[0.5, 1 / 3, 0.25, 0.2])
+    encode_layer = SSDBoxCoder(variances=[0.5, 1 / 3, 0.25, 0.2])
     encoded_gt_boxes = encode_layer(gt_boxes, anchors)
     expected_out = np.asarray(
         [
@@ -15,7 +15,7 @@ def test_encode_decode_variance():
     )
     np.testing.assert_allclose(expected_out, encoded_gt_boxes, rtol=1e-06, atol=1e-6)
 
-    decode_layer = SSDBoxEncoder(variances=[0.5, 1 / 3, 0.25, 0.2], invert=True)
+    decode_layer = SSDBoxCoder(variances=[0.5, 1 / 3, 0.25, 0.2], invert=True)
     decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors)
     np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6)
 
@@ -23,13 +23,13 @@ def test_encode_decode_variance():
 def test_encode_decode_no_variance():
     gt_boxes = np.asarray([[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]], np.float32)
     anchors = np.array([[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]], np.float32)
-    encode_layer = SSDBoxEncoder()
+    encode_layer = SSDBoxCoder()
     encoded_gt_boxes = encode_layer(gt_boxes, anchors)
     expected_out = np.asarray(
         [[-0.5, -0.41666, -0.40546, -0.18232], [-0.08333, -0.22222, -0.69314, -1.0986]]
     )
     np.testing.assert_allclose(expected_out, encoded_gt_boxes, rtol=1e-05, atol=1e-5)
 
-    decode_layer = SSDBoxEncoder(invert=True)
+    decode_layer = SSDBoxCoder(invert=True)
     decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors)
     np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6)

From c7bdc8f9f0f7a806e625b0498ab1fb1c79724a76 Mon Sep 17 00:00:00 2001
From: tanzhenyu <tanzheny@google.com>
Date: Tue, 16 Jun 2020 16:19:18 -0700
Subject: [PATCH 3/4] Additional comment in docstring.

---
 kerascv/layers/ssd_box_coder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kerascv/layers/ssd_box_coder.py b/kerascv/layers/ssd_box_coder.py
index 28738db57c..2a058dfe35 100644
--- a/kerascv/layers/ssd_box_coder.py
+++ b/kerascv/layers/ssd_box_coder.py
@@ -19,8 +19,8 @@ class SSDBoxCoder(tf.keras.layers.Layer):
     # Attributes:
         variances: The 1-D scaling factor with 4 floats. This is used to represent the variance of
             [y_center, x_center, height, width] in Gaussian distribution when labeling the ground truth boxes.
-            During encoding, the result will be divided by the variances. During decoding, the result will be
-            multiplied by the variances. Defaults to `None` where no variance is applied.
+            During encoding, the result will be divided, i.e., normalized by the variances. During decoding, the result
+            will be multiplied, i.e., denormalized by the variances. Defaults to `None` where no variance is applied.
             The SSD paper uses [.1, .1, .2, .2].
         invert: Boolean to indicate whether the layer should encode the `boxes`, i.e., convert from
             [y_min, x_min, y_max, x_max] format to [ty, tx, h, w] format, if True, or the other way around,

From fb7463d1099544bd9c4b436878d0868bebeb2140 Mon Sep 17 00:00:00 2001
From: tanzhenyu <tanzheny@google.com>
Date: Fri, 19 Jun 2020 15:32:33 -0700
Subject: [PATCH 4/4] [Object_Detection] Address comments for SSDBoxCoder.

---
 kerascv/layers/ssd_box_coder.py            | 93 ++++++++++++++--------
 tests/kerascv/layers/ssd_box_coder_test.py | 17 +++-
 2 files changed, 77 insertions(+), 33 deletions(-)

diff --git a/kerascv/layers/ssd_box_coder.py b/kerascv/layers/ssd_box_coder.py
index 2a058dfe35..3adb3f8de6 100644
--- a/kerascv/layers/ssd_box_coder.py
+++ b/kerascv/layers/ssd_box_coder.py
@@ -2,36 +2,63 @@
 
 
 class SSDBoxCoder(tf.keras.layers.Layer):
-    """Defines a SSDBoxEncoder that converts encodes the ground_truth_boxes using anchors.
+    """Defines an SSDBoxCoder that encodes or decodes boxes relative to anchors.
 
-    Mathematically, the encoding is:
-        $ \hat{cx_gt} = (cx_gt - cx_a) / width_a
-        $ \hat{cy_gt} = (cy_gt - cy_a) / height_a
-        $ \hat{width_gt} = log(width_gt / width_a)
-        $ \hat{height_gt} = log(height_gt / height_a)
+    Mathematically, the encoding result is:
+        ty = (cy_gt - cy_a) / height_a
+        tx = (cx_gt - cx_a) / width_a
+        th = log(height_gt / height_a)
+        tw = log(width_gt / width_a)
 
-    where cx, cy, width, height represents center of width, center of height, width, height respectively,
-    and subscript `gt` represents ground truth box, `a` represents anchor.
+    where cy, cx, height, width represent the center y, center x, height and
+    width of a box respectively, subscript `gt` denotes the ground truth box,
+    and subscript `a` denotes the anchor.
 
-    The `boxes` must have the same shape as `anchors`, this is typically the result of assigning
-    `ground_truth_boxes` to anchors based on a certain matching strategy (argmax, bipartite)
+    The `boxes` must have the same shape as `anchors`; this is typically the result
+    of assigning `ground_truth_boxes` to anchors based on a certain matching
+    strategy (argmax, bipartite).
 
     # Attributes:
-        variances: The 1-D scaling factor with 4 floats. This is used to represent the variance of
-            [y_center, x_center, height, width] in Gaussian distribution when labeling the ground truth boxes.
-            During encoding, the result will be divided, i.e., normalized by the variances. During decoding, the result
-            will be multiplied, i.e., denormalized by the variances. Defaults to `None` where no variance is applied.
-            The SSD paper uses [.1, .1, .2, .2].
-        invert: Boolean to indicate whether the layer should encode the `boxes`, i.e., convert from
-            [y_min, x_min, y_max, x_max] format to [ty, tx, h, w] format, if True, or the other way around,
-            if False. Defaults to 'False'.
+        center_variances: The 1-D scaling factor with 2 floats. This is used to
+            represent the variance of center of height and center of width in
+            Gaussian distribution when labeling the ground truth boxes.
+            During encoding, the result [ty, tx] will be divided, i.e., normalized
+            by the variances. During decoding, the result will be multiplied, i.e.,
+            denormalized by the variances. Defaults to `None` where no variance is
+            applied. The SSD paper uses [.1, .1].
+        size_variances: The 1-D scaling factor with 2 floats. This is used to
+            represent the variance of height and width in Gaussian distribution when
+            labeling the ground truth boxes. During encoding, the result [th, tw]
+            will be divided, i.e., normalized by the variances. During decoding, the
+            result will be multiplied, i.e., denormalized by the variances. Defaults
+            to `None` where no variance is applied. The SSD paper uses [.2, .2].
+        invert: Boolean. If False, the layer encodes `boxes`, i.e., converts from
+            [y_min, x_min, y_max, x_max] format to [ty, tx, th, tw] format; if True,
+            it decodes, i.e., converts the other way around. Defaults to `False`.
 
     # References
         [Wei Liu et al., 2015](https://arxiv.org/abs/1512.02325)
     """
 
-    def __init__(self, variances=None, invert=False, name=None, **kwargs):
-        self.variances = variances
+    def __init__(
+        self,
+        center_variances=None,
+        size_variances=None,
+        invert=False,
+        name=None,
+        **kwargs
+    ):
+        if center_variances is not None and size_variances is not None:
+            self.center_variances = center_variances
+            self.size_variances = size_variances
+        elif center_variances is not None or size_variances is not None:
+            raise ValueError(
+                "`center_variances` and `size_variances` should both be None or "
+                "tuple of floats, got {}, {}".format(center_variances, size_variances)
+            )
+        else:
+            self.center_variances = None
+            self.size_variances = None
         self.invert = invert
         super(SSDBoxCoder, self).__init__(name=name, **kwargs)
 
@@ -61,21 +88,21 @@ def corner_to_centroids(box_tensor):
             th = tf.math.log(height_gt / height_a)
             tw = tf.math.log(width_gt / width_a)
 
-            if self.variances is not None:
-                ty = ty / tf.cast(self.variances[0], dtype=ty.dtype)
-                tx = tx / tf.cast(self.variances[1], dtype=tx.dtype)
-                th = th / tf.cast(self.variances[2], dtype=th.dtype)
-                tw = tw / tf.cast(self.variances[3], dtype=tw.dtype)
+            if self.center_variances is not None:
+                ty = ty / tf.cast(self.center_variances[0], dtype=ty.dtype)
+                tx = tx / tf.cast(self.center_variances[1], dtype=tx.dtype)
+                th = th / tf.cast(self.size_variances[0], dtype=th.dtype)
+                tw = tw / tf.cast(self.size_variances[1], dtype=tw.dtype)
 
             return tf.concat([ty, tx, th, tw], axis=-1)
 
         else:
             ty, tx, th, tw = tf.split(boxes, num_or_size_splits=4, axis=-1)
-            if self.variances is not None:
-                ty = ty * tf.cast(self.variances[0], dtype=ty.dtype)
-                tx = tx * tf.cast(self.variances[1], dtype=tx.dtype)
-                th = th * tf.cast(self.variances[2], dtype=th.dtype)
-                tw = tw * tf.cast(self.variances[3], dtype=tw.dtype)
+            if self.center_variances is not None:
+                ty = ty * tf.cast(self.center_variances[0], dtype=ty.dtype)
+                tx = tx * tf.cast(self.center_variances[1], dtype=tx.dtype)
+                th = th * tf.cast(self.size_variances[0], dtype=th.dtype)
+                tw = tw * tf.cast(self.size_variances[1], dtype=tw.dtype)
 
             height_gt = tf.math.exp(th) * height_a
             width_gt = tf.math.exp(tw) * width_a
@@ -89,6 +116,10 @@ def corner_to_centroids(box_tensor):
             return tf.concat([y_min_gt, x_min_gt, y_max_gt, x_max_gt], axis=-1)
 
     def get_config(self):
-        config = {"variances": self.variances, "invert": self.invert}
+        config = {
+            "center_variances": self.center_variances,
+            "size_variances": self.size_variances,
+            "invert": self.invert,
+        }
         base_config = super(SSDBoxCoder, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
diff --git a/tests/kerascv/layers/ssd_box_coder_test.py b/tests/kerascv/layers/ssd_box_coder_test.py
index 9423c1416a..cb9d03b768 100644
--- a/tests/kerascv/layers/ssd_box_coder_test.py
+++ b/tests/kerascv/layers/ssd_box_coder_test.py
@@ -5,7 +5,9 @@
 def test_encode_decode_variance():
     gt_boxes = np.asarray([[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]], np.float32)
     anchors = np.array([[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]], np.float32)
-    encode_layer = SSDBoxCoder(variances=[0.5, 1 / 3, 0.25, 0.2])
+    encode_layer = SSDBoxCoder(
+        center_variances=[0.5, 1 / 3], size_variances=[0.25, 0.2]
+    )
     encoded_gt_boxes = encode_layer(gt_boxes, anchors)
     expected_out = np.asarray(
         [
@@ -15,7 +17,9 @@ def test_encode_decode_variance():
     )
     np.testing.assert_allclose(expected_out, encoded_gt_boxes, rtol=1e-06, atol=1e-6)
 
-    decode_layer = SSDBoxCoder(variances=[0.5, 1 / 3, 0.25, 0.2], invert=True)
+    decode_layer = SSDBoxCoder(
+        center_variances=[0.5, 1 / 3], size_variances=[0.25, 0.2], invert=True
+    )
     decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors)
     np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6)
 
@@ -33,3 +37,12 @@ def test_encode_decode_no_variance():
     decode_layer = SSDBoxCoder(invert=True)
     decoded_gt_boxes = decode_layer(encoded_gt_boxes, anchors)
     np.testing.assert_allclose(gt_boxes, decoded_gt_boxes, rtol=1e-6, atol=1e-6)
+
+
+def test_config_with_custom_name():
+    layer = SSDBoxCoder(
+        center_variances=[0.1, 0.1], size_variances=[0.2, 0.2], name="box_coder"
+    )
+    config = layer.get_config()
+    layer_1 = SSDBoxCoder.from_config(config)
+    np.testing.assert_equal(layer_1.name, layer.name)