Implement MlasLoadPartialFloat16x4
yihonglyu committed Jul 27, 2024
1 parent 647d225 commit 26d31a0
Showing 4 changed files with 138 additions and 31 deletions.
3 changes: 2 additions & 1 deletion onnxruntime/core/mlas/lib/dwconv.cpp
@@ -72,7 +72,8 @@ MlasConvDepthwiseKernel(
}

if (c > 0) {
MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]);
MLAS_FLOAT16X4 Accumulator =
Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
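Here `c` is the tail count of remaining channels (fewer than four) left over after the vectorized channel loop, so the old full 4-lane `MlasLoadFloat16x4` could read up to three fp16 values past the end of the bias buffer. A minimal sketch of the pattern being fixed (illustrative only; everything except the two MLAS load helpers is hypothetical, not the actual depthwise kernel):

  // Hypothetical driver showing why the bias tail needs a partial load.
  void AccumulateBiasSketch(const _mlas_fp16_* Bias, size_t Channels)
  {
      size_t ChannelOffset = 0;
      size_t c = Channels;
      for (; c >= 4; c -= 4, ChannelOffset += 4) {
          // Full 4-lane load is in-bounds here.
          MLAS_FLOAT16X4 Accumulator = MlasLoadFloat16x4(&Bias[ChannelOffset]);
          // ... accumulate kernel taps and store four outputs ...
          (void)Accumulator;
      }
      if (c > 0) {
          // A full load here would read past the end of Bias whenever
          // Channels % 4 != 0; the partial load zero-fills the missing lanes.
          MLAS_FLOAT16X4 Accumulator = MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c);
          // ... accumulate kernel taps and store the c valid outputs ...
          (void)Accumulator;
      }
  }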
17 changes: 17 additions & 0 deletions onnxruntime/core/mlas/lib/fp16_common.h
@@ -64,6 +64,23 @@ MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); }

MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len)
{
    MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4();
    if ((len & 1) != 0) {
        // Load the odd trailing element into lane 0; it is moved up below.
        Vector = vreinterpret_f16_u16(vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0));
    }
    if ((len & 2) != 0) {
        // Shift the element loaded above into lane 2, then fill lanes 0-1
        // with a single 32-bit load of the first two elements.
        Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0));
        Vector = vreinterpret_f16_f32(
            vld1_lane_f32(reinterpret_cast<const float*>(Buffer), vreinterpret_f32_f16(Vector), 0)
        );
    }
    return Vector;
}

MLAS_FORCEINLINE
void
MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector)
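For reference, a scalar reading of the new helper (an equivalence sketch, not MLAS code): the `len & 1` branch drops the odd trailing element into lane 0; the `len & 2` branch then duplicates that 32-bit half up to lanes 2-3 (so for len == 3 the trailing element lands in lane 2) and overwrites lanes 0-1 with a single 32-bit load of the first two elements. For len in 0..3 the result is the first len elements followed by zeros:

  // Scalar equivalent of MlasLoadPartialFloat16x4 (illustrative sketch).
  void MlasLoadPartialFloat16x4Scalar(const _mlas_fp16_* Buffer, size_t len, _mlas_fp16_ Out[4])
  {
      for (size_t i = 0; i < 4; ++i) {
          Out[i] = (i < len) ? Buffer[i] : static_cast<_mlas_fp16_>(0);
      }
  }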
65 changes: 52 additions & 13 deletions onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
@@ -714,7 +714,31 @@ TEST(ConvFp16Test, Conv2D_group) {
TestConvFp16Op(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true);
}

TEST(ConvFp16Test, Depthwise2D_Bias) {
TEST(ConvFp16Test, Depthwise2D_Bias_Group1_Issue18992) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
1, // group
vector<int64_t>{1, 1}, // kernel_shape
vector<int64_t>{0, 0, 0, 0}, // pads
vector<int64_t>{1, 1}, // strides
{} // excluded EPs
};

vector<MLFloat16> X = {MLFloat16(1.0f)};
vector<int64_t> X_shape = {1, 1, 1, 1};
vector<MLFloat16> W = {MLFloat16(0.5f)};
vector<int64_t> W_shape = {1, 1, 1, 1};
vector<MLFloat16> B = {MLFloat16(0.5f)};
vector<int64_t> B_shape = {1};
vector<int64_t> Y_shape = {1, 1, 1, 1};
auto expected_vals = {MLFloat16(1.0f)};

TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}
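This minimal case exercises the tail path with a single channel (c = 1): the expected output is 1.0 * 0.5 + 0.5 = 1.0, and with a one-element bias buffer a full 4-lane load of B would read out of bounds, which appears to be the regression from issue 18992 that the test name references.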

TEST(ConvFp16Test, Depthwise2D_Bias_Group2) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
@@ -752,11 +776,11 @@ TEST(ConvFp16Test, Depthwise2D_Bias) {
TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}

TEST(ConvFp16Test, Depthwise2D_Bias_Complex) {
TEST(ConvFp16Test, Depthwise2D_Bias_Group15) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
13, // group
15, // group
vector<int64_t>{2, 2}, // kernel_shape
vector<int64_t>{0, 0, 0, 0}, // pads
vector<int64_t>{1, 1}, // strides
@@ -815,8 +839,15 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) {
// C = 12
MLFloat16(48.0f), MLFloat16(49.0f),
MLFloat16(50.0f), MLFloat16(51.0f),

// C = 13
MLFloat16(52.0f), MLFloat16(53.0f),
MLFloat16(54.0f), MLFloat16(55.0f),

// C = 14
MLFloat16(56.0f), MLFloat16(57.0f),
MLFloat16(58.0f), MLFloat16(59.0f)};
vector<int64_t> X_shape = {1, 15, 2, 2};
vector<MLFloat16> W = {
// M = 0
MLFloat16(0.0f), MLFloat16(1.0f),
@@ -869,8 +900,15 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) {
// M = 12
MLFloat16(48.0f), MLFloat16(49.0f),
MLFloat16(50.0f), MLFloat16(51.0f),

// M = 13
MLFloat16(52.0f), MLFloat16(53.0f),
MLFloat16(54.0f), MLFloat16(55.0f),

// M = 14
MLFloat16(56.0f), MLFloat16(57.0f),
MLFloat16(58.0f), MLFloat16(59.0f)};
vector<int64_t> W_shape = {15, 1, 2, 2};
vector<MLFloat16> B = {
MLFloat16(1.0f),
MLFloat16(2.0f),
@@ -885,9 +923,10 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) {
MLFloat16(11.0f),
MLFloat16(12.0f),
MLFloat16(13.0f),
MLFloat16(14.0f),
MLFloat16(15.0f)};
vector<int64_t> B_shape = {15};
vector<int64_t> Y_shape = {1, 15, 1, 1};
auto expected_vals = {
MLFloat16(15.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0
MLFloat16(128.0f),
@@ -901,12 +940,12 @@ TEST(ConvFp16Test, Depthwise2D_Bias_Complex) {
MLFloat16(5640.0f),
MLFloat16(6905.0f),
MLFloat16(8298.0f),
MLFloat16(9819.0f),  // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0
MLFloat16(11468.0f), // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 14.0
MLFloat16(13245.0f) // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 15.0
};

TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);

// NNAPI/CoreML EP requires weight to be an initializer
TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}
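The expected values above follow a closed form per channel, since X and W hold the same ramp and B[c] = c + 1: Y[c] = (4c)^2 + (4c+1)^2 + (4c+2)^2 + (4c+3)^2 + (c + 1). A quick cross-check sketch, separate from the test itself:

  // Regenerate the Depthwise2D_Bias_Group15 expected values (cross-check sketch).
  #include <cstdio>

  int main()
  {
      for (int c = 0; c < 15; ++c) {
          float y = 0.0f;
          for (int k = 0; k < 4; ++k) {
              const float v = 4.0f * c + k;  // X and W share this ramp value
              y += v * v;                    // depthwise 2x2 kernel, one channel each
          }
          y += c + 1.0f;                     // bias B[c] = c + 1
          std::printf("C = %2d -> %7.1f\n", c, y);  // 15.0, 128.0, ..., 13245.0
      }
      return 0;
  }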

84 changes: 67 additions & 17 deletions onnxruntime/test/providers/cpu/nn/conv_op_test.cc
@@ -647,7 +647,31 @@ TEST(ConvTest, Conv2D_group) {
TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true);
}

TEST(ConvTest, Depthwise2D_Bias) {
TEST(ConvTest, Depthwise2D_Bias_Group1_Issue18992) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
1, // group
vector<int64_t>{1, 1}, // kernel_shape
vector<int64_t>{0, 0, 0, 0}, // pads
vector<int64_t>{1, 1}, // strides
{} // excluded EPs
};

vector<float> X = {1.0f};
vector<int64_t> X_shape = {1, 1, 1, 1};
vector<float> W = {0.5f};
vector<int64_t> W_shape = {1, 1, 1, 1};
vector<float> B = {0.5f};
vector<int64_t> B_shape = {1};
vector<int64_t> Y_shape = {1, 1, 1, 1};
auto expected_vals = {1.0f};

TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}

TEST(ConvTest, Depthwise2D_Bias_Group2) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
@@ -658,26 +682,38 @@ TEST(ConvTest, Depthwise2D_Bias) {
{} // excluded EPs
};

vector<float> X = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f};
vector<float> X = {
0.0f, 1.0f, 2.0f,
3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f,

9.0f, 10.0f, 11.0f,
12.0f, 13.0f, 14.0f,
15.0f, 16.0f, 17.0f};
vector<int64_t> X_shape = {1, 2, 3, 3};
vector<float> W = {1.0f, 2.0f};
vector<int64_t> W_shape = {2, 1, 1, 1};
vector<float> B = {1.0f, -1.0f};
vector<int64_t> B_shape = {2};
vector<int64_t> Y_shape = {1, 2, 3, 3};
auto expected_vals = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 17.0f, 19.0f, 21.0f, 23.0f, 25.0f, 27.0f, 29.0f, 31.0f, 33.0f};
auto expected_vals = {
1.0f, 2.0f, 3.0f,
4.0f, 5.0f, 6.0f,
7.0f, 8.0f, 9.0f,

17.0f, 19.0f, 21.0f,
23.0f, 25.0f, 27.0f,
29.0f, 31.0f, 33.0f};

TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);

// NNAPI/CoreML EP requires weight to be an initializer
TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}

TEST(ConvTest, Depthwise2D_Bias_Complex) {
TEST(ConvTest, Depthwise2D_Bias_Group15) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
vector<int64_t>{1, 1}, // dilations
13, // group
15, // group
vector<int64_t>{2, 2}, // kernel_shape
vector<int64_t>{0, 0, 0, 0}, // pads
vector<int64_t>{1, 1}, // strides
@@ -736,8 +772,15 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) {
// C = 12
48.0f, 49.0f,
50.0f, 51.0f,

// C = 13
52.0f, 53.0f,
54.0f, 55.0f,

// C = 14
56.0f, 57.0f,
58.0f, 59.0f};
vector<int64_t> X_shape = {1, 15, 2, 2};
vector<float> W = {
// M = 0
0.0f, 1.0f,
@@ -790,11 +833,18 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) {
// M = 12
48.0f, 49.0f,
50.0f, 51.0f,

// M = 13
52.0f, 53.0f,
54.0f, 55.0f,

// M = 14
56.0f, 57.0f,
58.0f, 59.0f};
vector<int64_t> W_shape = {15, 1, 2, 2};
vector<float> B = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
vector<int64_t> B_shape = {15};
vector<int64_t> Y_shape = {1, 15, 1, 1};
auto expected_vals = {
15.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 1.0
128.0f,
@@ -808,12 +858,12 @@ TEST(ConvTest, Depthwise2D_Bias_Complex) {
5640.0f,
6905.0f,
8298.0f,
9819.0f,  // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 13.0
11468.0f, // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 14.0
13245.0f // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 15.0
};

TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);

// NNAPI/CoreML EP requires weight to be an initializer
TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true);
}

