Implement MlasLoadPartialFloat16x4
yihonglyu committed Jul 27, 2024
1 parent 647d225 commit 26d31a0
Showing 4 changed files with 138 additions and 31 deletions.
3 changes: 2 additions & 1 deletion onnxruntime/core/mlas/lib/dwconv.cpp
@@ -72,7 +72,8 @@ MlasConvDepthwiseKernel(
}

if (c > 0) {
MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]);
MLAS_FLOAT16X4 Accumulator =
Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
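Here `c` is the tail count of remaining channels (fewer than four) left over after the vectorized channel loop, so the old full 4-lane `MlasLoadFloat16x4` could read up to three fp16 values past the end of the bias buffer. A minimal sketch of the pattern being fixed (illustrative only; everything except the two MLAS load helpers is hypothetical, not the actual depthwise kernel):

  // Hypothetical driver showing why the bias tail needs a partial load.
  void AccumulateBiasSketch(const _mlas_fp16_* Bias, size_t Channels)
  {
      size_t ChannelOffset = 0;
      size_t c = Channels;
      for (; c >= 4; c -= 4, ChannelOffset += 4) {
          // Full 4-lane load is in-bounds here.
          MLAS_FLOAT16X4 Accumulator = MlasLoadFloat16x4(&Bias[ChannelOffset]);
          // ... accumulate kernel taps and store four outputs ...
          (void)Accumulator;
      }
      if (c > 0) {
          // A full load here would read past the end of Bias whenever
          // Channels % 4 != 0; the partial load zero-fills the missing lanes.
          MLAS_FLOAT16X4 Accumulator = MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c);
          // ... accumulate kernel taps and store the c valid outputs ...
          (void)Accumulator;
      }
  }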
17 changes: 17 additions & 0 deletions onnxruntime/core/mlas/lib/fp16_common.h
@@ -64,6 +64,23 @@ MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); }

MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len)
{
    MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4();
    if ((len & 1) != 0) {
        // Load the odd trailing element into lane 0; it is moved up below.
        Vector = vreinterpret_f16_u16(vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0));
    }
    if ((len & 2) != 0) {
        // Shift the element loaded above into lane 2, then fill lanes 0-1
        // with a single 32-bit load of the first two elements.
        Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0));
        Vector = vreinterpret_f16_f32(
            vld1_lane_f32(reinterpret_cast<const float*>(Buffer), vreinterpret_f32_f16(Vector), 0)
        );
    }
    return Vector;
}

MLAS_FORCEINLINE
void
MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector)
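For reference, a scalar reading of the new helper (an equivalence sketch, not MLAS code): the `len & 1` branch drops the odd trailing element into lane 0; the `len & 2` branch then duplicates that 32-bit half up to lanes 2-3 (so for len == 3 the trailing element lands in lane 2) and overwrites lanes 0-1 with a single 32-bit load of the first two elements. For len in 0..3 the result is the first len elements followed by zeros:

  // Scalar equivalent of MlasLoadPartialFloat16x4 (illustrative sketch).
  void MlasLoadPartialFloat16x4Scalar(const _mlas_fp16_* Buffer, size_t len, _mlas_fp16_ Out[4])
  {
      for (size_t i = 0; i < 4; ++i) {
          Out[i] = (i < len) ? Buffer[i] : static_cast<_mlas_fp16_>(0);
      }
  }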
65 changes: 52 additions & 13 deletions onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
@@ -714,7 +714,31 @@ TEST(ConvFp16Test, Conv2D_group) {
TestConvFp16Op(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true);
}

TEST(ConvFp16Test, Depthwise2D_Bias) {
TEST(ConvFp16Test, Depthwise2D_Bias_Group1_Issue18992) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
1, // group
vector<int64_t>{1, 1}, // kernel_shape
vector<int64_t>{0, 0, 0, 0}, // pads
vector<int64_t>{1, 1}, // strides
{} // excluded EPs
};

vector<MLFloat16> X = {MLFloat16(1.0f)};
vector<int64_t> X_shape = {1, 1, 1, 1};
vector<MLFloat16> W = {MLFloat16(0.5f)};
vector<int64_t> W_shape = {1, 1, 1, 1};
vector<MLFloat16> B = {MLFloat16(0.5f)};
vector<int64_t> B_shape = {1};
vector<int64_t> Y_shape = {1, 1, 1, 1};
auto expected_vals = {MLFloat16(1.0f)};

TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}
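This minimal case exercises the tail path with a single channel (c = 1): the expected output is 1.0 * 0.5 + 0.5 = 1.0, and with a one-element bias buffer a full 4-lane load of B would read out of bounds, which appears to be the regression from issue 18992 that the test name references.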

TEST(ConvFp16Test, Depthwise2D_Bias_Group2) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
@@ -752,11 +776,11 @@ TEST(ConvFp16Test, Depthwise2D_Bias) {
TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}

TEST(ConvFp16Test, Depthwise2D_Bias_Complex) {
TEST(ConvFp16Test, Depthwise2D_Bias_Group15) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
13, // group
15, // group
vector<int64_t>{2, 2}, // kernel_shape
vector<int64_t>{0, 0, 0, 0}, // pads
vector<int64_t>{1, 1}, // strides
@@ -815,8 +839,15 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) {
// C = 12
MLFloat16(48.0f), MLFloat16(49.0f),
MLFloat16(50.0f), MLFloat16(51.0f),

// C = 13
MLFloat16(52.0f), MLFloat16(53.0f),
MLFloat16(54.0f), MLFloat16(55.0f),

// C = 14
MLFloat16(56.0f), MLFloat16(57.0f),
MLFloat16(58.0f), MLFloat16(59.0f)};
vector<int64_t> X_shape = {1, 15, 2, 2};
vector<MLFloat16> W = {
// M = 0
MLFloat16(0.0f), MLFloat16(1.0f),
@@ -869,8 +900,15 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) {
// M = 12
MLFloat16(48.0f), MLFloat16(49.0f),
MLFloat16(50.0f), MLFloat16(51.0f),

// M = 13
MLFloat16(52.0f), MLFloat16(53.0f),
MLFloat16(54.0f), MLFloat16(55.0f),

// M = 14
MLFloat16(56.0f), MLFloat16(57.0f),
MLFloat16(58.0f), MLFloat16(59.0f)};
vector<int64_t> W_shape = {15, 1, 2, 2};
vector<MLFloat16> B = {
MLFloat16(1.0f),
MLFloat16(2.0f),
@@ -885,9 +923,10 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) {
MLFloat16(11.0f),
MLFloat16(12.0f),
MLFloat16(13.0f),
MLFloat16(14.0f),
MLFloat16(15.0f)};
vector<int64_t> B_shape = {15};
vector<int64_t> Y_shape = {1, 15, 1, 1};
auto expected_vals = {
MLFloat16(15.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0
MLFloat16(128.0f),
@@ -901,12 +940,12 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) {
MLFloat16(5640.0f),
MLFloat16(6905.0f),
MLFloat16(8298.0f),
MLFloat16(9819.0f),  // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0
MLFloat16(11468.0f), // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 14.0
MLFloat16(13245.0f) // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 15.0
};

TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);

// NNAPI/CoreML EP requires weight to be an initializer
TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}
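The expected values above follow a closed form per channel, since X and W hold the same ramp and B[c] = c + 1: Y[c] = (4c)^2 + (4c+1)^2 + (4c+2)^2 + (4c+3)^2 + (c + 1). A quick cross-check sketch, separate from the test itself:

  // Regenerate the Depthwise2D_Bias_Group15 expected values (cross-check sketch).
  #include <cstdio>

  int main()
  {
      for (int c = 0; c < 15; ++c) {
          float y = 0.0f;
          for (int k = 0; k < 4; ++k) {
              const float v = 4.0f * c + k;  // X and W share this ramp value
              y += v * v;                    // depthwise 2x2 kernel, one channel each
          }
          y += c + 1.0f;                     // bias B[c] = c + 1
          std::printf("C = %2d -> %7.1f\n", c, y);  // 15.0, 128.0, ..., 13245.0
      }
      return 0;
  }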

84 changes: 67 additions & 17 deletions onnxruntime/test/providers/cpu/nn/conv_op_test.cc
@@ -647,7 +647,31 @@ TEST(ConvTest, Conv2D_group) {
TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true);
}

TEST(ConvTest, Depthwise2D_Bias) {
TEST(ConvTest, Depthwise2D_Bias_Group1_Issue18992) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
1, // group
vector<int64_t>{1, 1}, // kernel_shape
vector<int64_t>{0, 0, 0, 0}, // pads
vector<int64_t>{1, 1}, // strides
{} // excluded EPs
};

vector<float> X = {1.0f};
vector<int64_t> X_shape = {1, 1, 1, 1};
vector<float> W = {0.5f};
vector<int64_t> W_shape = {1, 1, 1, 1};
vector<float> B = {0.5f};
vector<int64_t> B_shape = {1};
vector<int64_t> Y_shape = {1, 1, 1, 1};
auto expected_vals = {1.0f};

TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}

TEST(ConvTest, Depthwise2D_Bias_Group2) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
@@ -658,26 +682,38 @@ TEST(ConvTest, Depthwise2D_Bias) {
{} // excluded EPs
};

vector<float> X = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f};
vector<float> X = {
0.0f, 1.0f, 2.0f,
3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f,

9.0f, 10.0f, 11.0f,
12.0f, 13.0f, 14.0f,
15.0f, 16.0f, 17.0f};
vector<int64_t> X_shape = {1, 2, 3, 3};
vector<float> W = {1.0f, 2.0f};
vector<int64_t> W_shape = {2, 1, 1, 1};
vector<float> B = {1.0f, -1.0f};
vector<int64_t> B_shape = {2};
vector<int64_t> Y_shape = {1, 2, 3, 3};
auto expected_vals = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 17.0f, 19.0f, 21.0f, 23.0f, 25.0f, 27.0f, 29.0f, 31.0f, 33.0f};
auto expected_vals = {
1.0f, 2.0f, 3.0f,
4.0f, 5.0f, 6.0f,
7.0f, 8.0f, 9.0f,

17.0f, 19.0f, 21.0f,
23.0f, 25.0f, 27.0f,
29.0f, 31.0f, 33.0f};

TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);

// NNAPI/CoreML EP requires weight to be an initializer
TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}

TEST(ConvTest, Depthwise2D_Bias_Complex) {
TEST(ConvTest, Depthwise2D_Bias_Group15) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
13, // group
15, // group
vector<int64_t>{2, 2}, // kernel_shape
vector<int64_t>{0, 0, 0, 0}, // pads
vector<int64_t>{1, 1}, // strides
@@ -736,8 +772,15 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) {
// C = 12
48.0f, 49.0f,
50.0f, 51.0f,

// C = 13
52.0f, 53.0f,
54.0f, 55.0f,

// C = 14
56.0f, 57.0f,
58.0f, 59.0f};
vector<int64_t> X_shape = {1, 15, 2, 2};
vector<float> W = {
// M = 0
0.0f, 1.0f,
@@ -790,11 +833,18 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) {
// M = 12
48.0f, 49.0f,
50.0f, 51.0f,

// M = 13
52.0f, 53.0f,
54.0f, 55.0f,

// M = 14
56.0f, 57.0f,
58.0f, 59.0f};
vector<int64_t> W_shape = {15, 1, 2, 2};
vector<float> B = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
vector<int64_t> B_shape = {15};
vector<int64_t> Y_shape = {1, 15, 1, 1};
auto expected_vals = {
15.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0
128.0f,
@@ -808,12 +858,12 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) {
5640.0f,
6905.0f,
8298.0f,
9819.0f,  // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0
11468.0f, // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 14.0
13245.0f // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 15.0
};

TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);

// NNAPI/CoreML EP requires weight to be an initializer
TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}

