LLK Test Coverage - MathFid and DestAcc in reduce API (#12696)

* #0: Add MathFid and DestAcc sweep to reduce API
  1. Added AVG pool to the reduce tests.
  2. Added a Math Fidelity sweep to all reduce tests.
  3. Added an FP32 DEST accumulation sweep; it is skipped for scalar pool, which does not support it.
  4. The Fp32_dst_acc sweep is skipped for GS.
tenstorrent · Sep 20, 2024 · 5bb0b10 · 5bb0b10
1 parent cee4c7e
commit 5bb0b10
Show file tree

Hide file tree

Showing 6 changed files with 353 additions and 146 deletions.
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp
@@ -34,8 +34,14 @@ void MAIN {
             acquire_dst(tt::DstMode::Half);
             for(uint32_t ht = 0; ht < Ht; ++ht) {
                 cb_wait_front(tt::CB::c_in0, onetile);
+#if (MATH_ONLY == 1)
+                UNPACK(( llk_unpack_AB(tt::CB::c_in0, tt::CB::c_in2, 0, 0) ));
+                // REDUCE_OP is expected to come from add_define
+                reduce_tile_math(reduce_dst_idx);
+#elif (MATH_ONLY == 0)
                 // REDUCE_OP is expected to come from add_define
                 reduce_tile(tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx);
+#endif
                 cb_pop_front(tt::CB::c_in0, onetile);
             }
 

diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp
@@ -34,8 +34,14 @@ void MAIN {
             // in this case we just sequentially add to accumulator all the W-tiles in a row
             for(uint32_t wt = 0; wt < Wt; ++wt) {
                 cb_wait_front(tt::CB::c_in0, onetile);
-                // REDUCE_OP/DIM is expected to come from add_define
+#if (MATH_ONLY == 1)
+                UNPACK(( llk_unpack_AB(tt::CB::c_in0, tt::CB::c_in2, 0, 0) ));
+                // REDUCE_OP is expected to come from add_define
+                reduce_tile_math(reduce_dst_idx);
+#elif (MATH_ONLY == 0)
+                // REDUCE_OP is expected to come from add_define
                 reduce_tile(tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx);
+#endif
                 cb_pop_front(tt::CB::c_in0, onetile);
             }
         }

diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp
@@ -137,7 +137,7 @@ std::vector<uint16_t> gold_transpose_wh(const std::vector<uint16_t> &src_vec, co
 // input shape.x is assumed to have the full number of elements in bfloat16
 // src_vec is expected to be untilized
 // result is also untilized
-std::vector<uint16_t> gold_reduce_h(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, bool red_max, bool zeropad) {
+std::vector<uint16_t> gold_reduce_h(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
     vector<uint32_t> shape_dst{shape[0], shape[1], 1, shape[3]};
     TT_FATAL(shape[2] > 0, "Error");
     if (zeropad)
@@ -150,10 +150,11 @@ std::vector<uint16_t> gold_reduce_h(const std::vector<uint16_t> &src_vec, const
     for (int n = 0; n < shape[0]; n++)
     for (int c = 0; c < shape[1]; c++)
     for (int w = 0; w < shape[3]; w++) {
-        float sum = red_max ? -std::numeric_limits<float>::max() : 0.0f;
+        // red_type : {SUM, AVG, MAX}; i.e. {0, 1, 2};
+        float sum = (red_type == 2) ? -std::numeric_limits<float>::max() : 0.0f;
         for (int h = 0; h < shape[2]; h++) {
             auto offs = addr.offs(n, c, h, w);
-            if (red_max)
+            if (red_type == 2)
                 sum = fmaxf(bfloat16(src_vec[offs]).to_float(), sum);
             else
                 sum += bfloat16(src_vec[offs]).to_float();
@@ -165,7 +166,7 @@ std::vector<uint16_t> gold_reduce_h(const std::vector<uint16_t> &src_vec, const
     return reduced;
 };
 
-std::vector<uint16_t> gold_reduce_w(const vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, bool red_max, bool zeropad) {
+std::vector<uint16_t> gold_reduce_w(const vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
     vector<uint32_t> shape_dst{shape[0], shape[1], shape[2], 1};
     if (zeropad)
         shape_dst[3] = 32;
@@ -177,22 +178,22 @@ std::vector<uint16_t> gold_reduce_w(const vector<uint16_t> &src_vec, const std::
     for (int n = 0; n < shape[0]; n++)
     for (int c = 0; c < shape[1]; c++)
     for (int h = 0; h < shape[2]; h++) {
-        float sum = red_max ? -std::numeric_limits<float>::max() : 0.0f;
+        // red_type : {SUM, AVG, MAX}; i.e. {0, 1, 2};
+        float sum = (red_type == 2) ? -std::numeric_limits<float>::max() : 0.0f;
         for (int w = 0; w < shape[3]; w++) {
             auto offs = addr.offs(n, c, h, w);
-            if (red_max)
+            if (red_type == 2)
                 sum = fmaxf(bfloat16(src_vec[offs]).to_float(), sum);
             else
                 sum += bfloat16(src_vec[offs]).to_float();
         }
         auto dest_offs = addr_dst.offs(n, c, h, 0);
         reduced[dest_offs] = bfloat16(sum*scaler).to_uint16();
     }
-
     return reduced;
 }
 
-std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, bool red_max, bool zeropad) {
+std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
     vector<uint32_t> shape_dst{shape[0], shape[1], 1, 1};
     if (zeropad) {
         shape_dst[2] = 32;
@@ -205,17 +206,17 @@ std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const
     std::fill(reduced.begin(), reduced.end(), 0);
     for (int n = 0; n < shape[0]; n++)
     for (int c = 0; c < shape[1]; c++) {
-        float sum = red_max ? -std::numeric_limits<float>::max() : 0.0f;
+        // red_type : {SUM, AVG, MAX}; i.e. {0, 1, 2};
+        float sum = (red_type == 2) ? -std::numeric_limits<float>::max() : 0.0f;
         for (int h = 0; h < shape[2]; h++) {
-        for (int w = 0; w < shape[3]; w++) {
-            auto offs = addr.offs(n, c, h, w);
-            if (red_max)
-                sum = fmaxf(bfloat16(src_vec[offs]).to_float(), sum);
-            else {
-                sum += bfloat16(src_vec[offs]).to_float();
-                //sum = bfloat16(sum).to_float();
+            for (int w = 0; w < shape[3]; w++) {
+                auto offs = addr.offs(n, c, h, w);
+                if (red_type == 2)
+                    sum = fmaxf(bfloat16(src_vec[offs]).to_float(), sum);
+                else
+                    sum += bfloat16(src_vec[offs]).to_float();
             }
-        }}
+        }
         auto dest_offs = addr_dst.offs(n, c, 0, 0);
         reduced[dest_offs] = bfloat16(sum*scaler).to_uint16();
     }

diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp
@@ -34,17 +34,20 @@ std::vector<uint16_t> gold_transpose_wh(const std::vector<uint16_t> &src_vec, co
 // input shape.x is assumed to have the full number of elements in bfloat16
 // src_vec is expected to be untilized
 // result is also untilized
-std::vector<uint16_t> gold_reduce_h(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, bool red_max = false, bool zeropad = true);
+// red_type : {SUM, AVG, MAX}; i.e. {0, 1, 2};
+std::vector<uint16_t> gold_reduce_h(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type = 0, bool zeropad = true);
 
 // input shape.x is assumed to have the full number of elements in bfloat16
 // src_vec is expected to be untilized
 // result is also untilized
-std::vector<uint16_t> gold_reduce_w(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, bool red_max = false, bool zeropad = true);
+// red_type : {SUM, AVG, MAX}; i.e. {0, 1, 2};
+std::vector<uint16_t> gold_reduce_w(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type = 0, bool zeropad = true);
 
 // input shape.x is assumed to have the full number of elements in bfloat16
 // src_vec is expected to be untilized
 // result is also untilized
-std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, bool red_max = false, bool zeropad = true);
+// red_type : {SUM, AVG, MAX}; i.e. {0, 1, 2};
+std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type = 0, bool zeropad = true);
 
 // Takes untilized src0_vec and tilized src1_vec
 // returns tilized result of eltwise addition