[xtensa] improved conversion ops and covered them with tests
Aelphy committed Sep 20, 2023
1 parent 9f044b5 commit 2c367b9
Showing 2 changed files with 119 additions and 43 deletions.
142 changes: 104 additions & 38 deletions src/CodeGen_Xtensa_vectors.template.cpp
@@ -2187,17 +2187,26 @@ convert<native_vector_i16_x2, native_vector_i8>(const native_vector_i8 &src) {

template<>
HALIDE_ALWAYS_INLINE native_vector_u16_x2 convert<native_vector_u16_x2, native_vector_u8>(const native_vector_u8 &src) {
-    xb_vec2Nx24 wide = src * native_vector_u8(1);
-    return native_vector_u16_x2(native_vector_u16_x2::from_native_vector,
-                                xb_vecNx16_rtor_xb_vecNx16U(IVP_CVT16U2NX24L(wide)),
-                                xb_vecNx16_rtor_xb_vecNx16U(IVP_CVT16U2NX24H(wide)));
+    return native_vector_u16_x2(
+        native_vector_u16_x2::from_native_vector,
+        IVP_MOVNX16_FROM2NX8U(
+            IVP_SEL2NX8UI(
+                native_vector_u8(0), src, IVP_SELI_8B_INTERLEAVE_1_LO)),
+        IVP_MOVNX16_FROM2NX8U(
+            IVP_SEL2NX8UI(
+                native_vector_u8(0), src, IVP_SELI_8B_INTERLEAVE_1_HI)));
}
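The new path computes the same zero-extension as the old multiply-by-one route through the 24-bit accumulator: it interleaves the source bytes with a zero vector and reinterprets each byte pair as one 16-bit lane. A scalar model of one lane (a sketch assuming a little-endian layout; the helper name is illustrative):

    #include <cstdint>
    #include <cstring>

    // Sketch: interleave a data byte with a zero byte, then reinterpret the
    // pair as one 16-bit lane. On a little-endian target this is exactly a
    // zero-extension, i.e. the result equals (uint16_t)value.
    static inline uint16_t widen_u8_lane(uint8_t value) {
        const uint8_t pair[2] = {value, 0};      // SELI_8B_INTERLEAVE_1: data byte, zero byte
        uint16_t lane;
        std::memcpy(&lane, pair, sizeof(lane));  // IVP_MOVNX16_FROM2NX8U: bitcast to 16-bit lane
        return lane;
    }

The signed variant below reuses the same interleave: every u8 value fits in int16, so the zero-extended bit pattern is also the correct signed widening.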

template<>
HALIDE_ALWAYS_INLINE native_vector_i16_x2 convert<native_vector_i16_x2, native_vector_u8>(const native_vector_u8 &src) {
-    xb_vec2Nx24 wide = src * native_vector_u8(1);
-    return native_vector_i16_x2(native_vector_i16_x2::from_native_vector,
-                                IVP_CVT16S2NX24L(wide), IVP_CVT16S2NX24H(wide));
+    return native_vector_i16_x2(
+        native_vector_i16_x2::from_native_vector,
+        IVP_MOVNX16_FROM2NX8(
+            IVP_SEL2NX8UI(
+                native_vector_u8(0), src, IVP_SELI_8B_INTERLEAVE_1_LO)),
+        IVP_MOVNX16_FROM2NX8(
+            IVP_SEL2NX8UI(
+                native_vector_u8(0), src, IVP_SELI_8B_INTERLEAVE_1_HI)));
}

template<>
@@ -2215,15 +2224,18 @@ HALIDE_ALWAYS_INLINE native_vector_u16_x2 convert<native_vector_u16_x2, native_v

template<>
HALIDE_ALWAYS_INLINE native_vector_i8 convert<native_vector_i8, native_vector_i16_x2>(const native_vector_i16_x2 &src) {
-    xb_vec2Nx24 wide = IVP_CVT24S2NX16(src.native_vector[1], src.native_vector[0]);
-    return IVP_PACKL2NX24(wide);
+    return IVP_SEL2NX8I(
+        IVP_MOV2NX8_FROMNX16(src.native_vector[1]),
+        IVP_MOV2NX8_FROMNX16(src.native_vector[0]),
+        IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
}

template<>
HALIDE_ALWAYS_INLINE native_vector_i8
convert<native_vector_i8, native_vector_u16_x2>(const native_vector_u16_x2 &src) {
-    xb_vec2Nx24 wide = IVP_CVT24S2NX16(src.native_vector[1], src.native_vector[0]);
-    return IVP_PACKL2NX24(wide);
+    return IVP_SEL2NX8I(IVP_MOV2NX8U_FROMNX16(src.native_vector[1]),
+                        IVP_MOV2NX8U_FROMNX16(src.native_vector[0]),
+                        IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
}
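Narrowing is the mirror image: SELI_8B_EXTRACT_1_OF_2_OFF_0 keeps every second byte starting at offset 0, which on a little-endian target is the low byte of each 16-bit lane, i.e. truncation modulo 256 — the same result the old PACKL path produced, without the 24-bit intermediate. A per-lane sketch (illustrative helper name):

    #include <cstdint>
    #include <cstring>

    // Sketch: view a 16-bit lane as two bytes and keep byte 0 (the low byte
    // on little-endian), so the result equals (int8_t)(lane & 0xFF).
    static inline int8_t narrow_i16_lane(int16_t lane) {
        uint8_t bytes[2];
        std::memcpy(bytes, &lane, sizeof(lane));  // IVP_MOV2NX8_FROMNX16: bitcast to bytes
        return (int8_t)bytes[0];                  // EXTRACT_1_OF_2, offset 0
    }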

template<>
@@ -2235,22 +2247,36 @@ HALIDE_ALWAYS_INLINE native_vector_u8 convert<native_vector_u8, native_vector_i1

template<>
HALIDE_ALWAYS_INLINE native_vector_i8 convert<native_vector_i8, native_vector_i32_x4>(const native_vector_i32_x4 &src) {
-    xb_vec2Nx24 wide = IVP_CVT24UNX32L(src.native_vector[1], src.native_vector[0]);
-    IVP_CVT24UNX32H(wide, src.native_vector[3], src.native_vector[2]);
-    return IVP_PACKL2NX24(wide);
+    native_vector_i8 x = IVP_SEL2NX8I(
+        IVP_MOV2NX8_FROMNX16(
+            IVP_SELNX16I(
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[3]),
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[2]),
+                IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0)),
+        IVP_MOV2NX8_FROMNX16(
+            IVP_SELNX16I(
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[1]),
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[0]),
+                IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0)),
+        IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
+    return x;
}

template<>
HALIDE_ALWAYS_INLINE native_vector_i8
convert<native_vector_i8, native_vector_u32_x4>(const native_vector_u32_x4 &src) {
-    xb_vec2Nx24 wide = IVP_CVT24UNX32L(src.native_vector[1], src.native_vector[0]);
-    IVP_CVT24UNX32H(wide, src.native_vector[3], src.native_vector[2]);
-    return IVP_PACKL2NX24(wide);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE native_mask_i8 convert<native_mask_i8, native_vector_u8>(const native_vector_u8 &src) {
-    return IVP_GTU2NX8U(src, 0);
+    return IVP_SEL2NX8I(
+        IVP_MOV2NX8_FROMNX16(
+            IVP_SELNX16I(
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[3]),
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[2]),
+                IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0)),
+        IVP_MOV2NX8_FROMNX16(
+            IVP_SELNX16I(
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[1]),
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[0]),
+                IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0)),
+        IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
}
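The 32-to-8-bit narrowings chain two of these extract steps: the 16-bit-element extract keeps the low half of each 32-bit lane, and the 8-bit-element extract then keeps the low byte of that. A sketch of the composition (illustrative helper name):

    #include <cstdint>

    // Sketch: two-stage truncation, 32 -> 16 -> 8 bits, equivalent to
    // keeping the low byte of each 32-bit lane.
    static inline int8_t narrow_i32_lane(int32_t lane) {
        const int16_t half = (int16_t)(lane & 0xFFFF);  // 16B extract, offset 0
        return (int8_t)(half & 0xFF);                   // 8B extract, offset 0
    }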

template<>
@@ -2265,9 +2291,18 @@ HALIDE_ALWAYS_INLINE native_vector_u8 convert<native_vector_u8, native_mask_i8>(

template<>
HALIDE_ALWAYS_INLINE native_vector_u8 convert<native_vector_u8, native_vector_i32_x4>(const native_vector_i32_x4 &src) {
-    xb_vec2Nx24 wide = IVP_CVT24UNX32L(src.native_vector[1], src.native_vector[0]);
-    IVP_CVT24UNX32H(wide, src.native_vector[3], src.native_vector[2]);
-    return xb_vec2Nx8_rtor_xb_vec2Nx8U(IVP_PACKL2NX24(wide));
+    return IVP_SEL2NX8UI(
+        IVP_MOV2NX8U_FROMNX16(
+            IVP_SELNX16I(
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[3]),
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[2]),
+                IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0)),
+        IVP_MOV2NX8U_FROMNX16(
+            IVP_SELNX16I(
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[1]),
+                IVP_MOVNX16_FROMN_2X32(src.native_vector[0]),
+                IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0)),
+        IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
}

template<>
@@ -2368,16 +2403,44 @@ HALIDE_ALWAYS_INLINE native_vector_i32 convert<native_vector_i32, native_mask_i3

template<>
HALIDE_ALWAYS_INLINE native_vector_i32_x4 convert<native_vector_i32_x4, native_vector_u8>(const native_vector_u8 &src) {
-    xb_vec2Nx24 wide = src * native_vector_u8(1);
-    return native_vector_i32_x4(native_vector_i32_x4::from_native_vector, IVP_CVT32S2NX24LL(wide), IVP_CVT32S2NX24LH(wide),
-                                IVP_CVT32S2NX24HL(wide), IVP_CVT32S2NX24HH(wide));
+    native_vector_i16 a = IVP_MOVNX16_FROM2NX8U(
+        IVP_SEL2NX8UI(
+            native_vector_u8(0), src, IVP_SELI_8B_INTERLEAVE_1_HI));
+    native_vector_i16 b = IVP_MOVNX16_FROM2NX8U(
+        IVP_SEL2NX8UI(
+            native_vector_u8(0), src, IVP_SELI_8B_INTERLEAVE_1_LO));
+
+    return native_vector_i32_x4(
+        native_vector_i32_x4::from_native_vector,
+        IVP_MOVN_2X32_FROMNX16(
+            IVP_SELNX16UI(native_vector_i16(0), b, IVP_SELI_16B_INTERLEAVE_1_LO)),
+        IVP_MOVN_2X32_FROMNX16(
+            IVP_SELNX16UI(native_vector_i16(0), b, IVP_SELI_16B_INTERLEAVE_1_HI)),
+        IVP_MOVN_2X32_FROMNX16(
+            IVP_SELNX16UI(native_vector_i16(0), a, IVP_SELI_16B_INTERLEAVE_1_LO)),
+        IVP_MOVN_2X32_FROMNX16(
+            IVP_SELNX16UI(native_vector_i16(0), a, IVP_SELI_16B_INTERLEAVE_1_HI)));
}

template<>
HALIDE_ALWAYS_INLINE native_vector_u32_x4 convert<native_vector_u32_x4, native_vector_u8>(const native_vector_u8 &src) {
-    xb_vec2Nx24 wide = src * native_vector_u8(1);
-    return native_vector_u32_x4(native_vector_u32_x4::from_native_vector, IVP_CVT32S2NX24LL(wide), IVP_CVT32S2NX24LH(wide),
-                                IVP_CVT32S2NX24HL(wide), IVP_CVT32S2NX24HH(wide));
+    native_vector_i16 a = IVP_MOVNX16_FROM2NX8U(
+        IVP_SEL2NX8UI(
+            native_vector_u8(0), src, IVP_SELI_8B_INTERLEAVE_1_HI));
+    native_vector_i16 b = IVP_MOVNX16_FROM2NX8U(
+        IVP_SEL2NX8UI(
+            native_vector_u8(0), src, IVP_SELI_8B_INTERLEAVE_1_LO));
+
+    return native_vector_u32_x4(
+        native_vector_u32_x4::from_native_vector,
+        IVP_MOVN_2X32U_FROMNX16(
+            IVP_SELNX16UI(native_vector_i16(0), b, IVP_SELI_16B_INTERLEAVE_1_LO)),
+        IVP_MOVN_2X32U_FROMNX16(
+            IVP_SELNX16UI(native_vector_i16(0), b, IVP_SELI_16B_INTERLEAVE_1_HI)),
+        IVP_MOVN_2X32U_FROMNX16(
+            IVP_SELNX16UI(native_vector_i16(0), a, IVP_SELI_16B_INTERLEAVE_1_LO)),
+        IVP_MOVN_2X32U_FROMNX16(
+            IVP_SELNX16UI(native_vector_i16(0), a, IVP_SELI_16B_INTERLEAVE_1_HI)));
}
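In both widenings above, b holds the zero-extended low half of the byte vector and a the high half, so the four result vectors come out in order b-LO, b-HI, a-LO, a-HI, i.e. in source lane order. Per lane, the two interleave stages compose to a plain zero-extension; a sketch (illustrative helper name):

    #include <cstdint>

    // Sketch: u8 -> u16 -> u32, each stage an interleave with zero, so the
    // 32-bit result lane equals the original byte value.
    static inline uint32_t widen_u8_to_u32_lane(uint8_t value) {
        const uint16_t lane16 = value;   // 8B interleave with zero
        const uint32_t lane32 = lane16;  // 16B interleave with zero
        return lane32;
    }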

template<>
@@ -2455,10 +2518,11 @@ HALIDE_ALWAYS_INLINE native_vector_i32_x2 convert<native_vector_i32_x2, native_v

template<>
HALIDE_ALWAYS_INLINE native_vector_u32_x2 convert<native_vector_u32_x2, native_vector_u16>(const native_vector_u16 &src) {
-    xb_vec2Nx24 wide = IVP_CVT24U2NX16(0, xb_vecNx16U_rtor_xb_vecNx16(src));
     return native_vector_u32_x2(native_vector_u32_x2::from_native_vector,
-                                xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_CVT32S2NX24LL(wide)),
-                                xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_CVT32S2NX24LH(wide)));
+                                IVP_MOVN_2X32_FROMNX16(
+                                    IVP_SELNX16UI(native_vector_u16(0), src, IVP_SELI_16B_INTERLEAVE_1_LO)),
+                                IVP_MOVN_2X32_FROMNX16(
+                                    IVP_SELNX16UI(native_vector_u16(0), src, IVP_SELI_16B_INTERLEAVE_1_HI)));
}

template<>
@@ -2663,13 +2727,15 @@ HALIDE_ALWAYS_INLINE native_vector_u32 halide_xtensa_convert_u16_high_u32(const
}

HALIDE_ALWAYS_INLINE native_vector_u16 halide_xtensa_convert_i32_u16(const native_vector_i32 &src0, const native_vector_i32 &src1) {
-    xb_vecNx48 wide = IVP_CVT48SNX32(src1, src0);
-    return xb_vecNx16_rtor_xb_vecNx16U(IVP_PACKLNX48(wide));
+    return IVP_SELNX16UI(IVP_MOVNX16_FROMN_2X32(src1),
+                         IVP_MOVNX16_FROMN_2X32(src0),
+                         IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
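Here the operand order of IVP_SELNX16UI places src0's truncated lanes in the low half of the result and src1's in the high half, so the pair behaves like one concatenated vector narrowed lane by lane. A scalar model (a sketch; the helper name and pointer interface are illustrative):

    #include <cstdint>

    // Sketch: concatenate src0 then src1 and keep the low 16 bits of each
    // 32-bit lane; n is the lane count of one native i32 vector.
    static inline void concat_narrow_i32_to_u16(const int32_t *src0, const int32_t *src1,
                                                uint16_t *dst, int n) {
        for (int i = 0; i < n; ++i) {
            dst[i] = (uint16_t)(src0[i] & 0xFFFF);      // low halves of src0 first
            dst[n + i] = (uint16_t)(src1[i] & 0xFFFF);  // then low halves of src1
        }
    }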

HALIDE_ALWAYS_INLINE native_vector_i8 halide_xtensa_convert_concat_i16_to_i8(const native_vector_i16 &a, const native_vector_i16 &b) {
-    xb_vec2Nx24 wide = IVP_CVT24S2NX16(b, a);
-    return IVP_PACKL2NX24(wide);
+    return IVP_SEL2NX8I(IVP_MOV2NX8_FROMNX16(b),
+                        IVP_MOV2NX8_FROMNX16(a),
+                        IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
}

HALIDE_ALWAYS_INLINE native_vector_u8 halide_xtensa_sat_narrow_u8(const native_vector_i16_x2 &a) {
20 changes: 15 additions & 5 deletions test/correctness/simd_op_check_xtensa.cpp
@@ -111,21 +111,31 @@ class SimdOpCheckXtensa : public SimdOpCheckTest {
// some of these could overflow that limit. (Omitting the spaces is
// a bit of a band-aid here; a better solution would probably be
// to allow arbitrary names that don't match, but for now, this will do.)
// TODO(vksnk): float16 doesn't seem to be supported well by cstubs library.
// https://github.com/halide/Halide/issues/7858
// check("convert<float16x32_t,float32x32_t>", vector_width / 2, f16(f32_1));
// check("convert<float32x32_t,float16x32_t>", vector_width / 2, f32(f16_1));
check("convert<float32x32_t,int16x32_t>", vector_width / 2, f32(i16_1));
check("convert<float32x32_t,uint16x32_t>", vector_width / 2, f32(u16_1));
check("convert<uint32x32_t,uint16x32_t>", vector_width / 2, u32(u16_1));
check("convert<int32x32_t,uint16x32_t>", vector_width / 2, i32(u16_1));
check("convert<int32x32_t,int16x32_t>", vector_width / 2, i32(i16_1));
check("convert<int8x64_t,int16x64_t>", vector_width / 2, i8(i16_1) + i8(i16_2));
check("convert<int8x64_t,uint16x64_t>", vector_width / 2, i8(u16_1) + i8(u16_2));
check("convert<int8x64_t,int32x64_t>", vector_width, i8(i32_1));
check("convert<int8x64_t,uint32x64_t>", vector_width, i8(u32_1));
check("convert<uint8x64_t,int32x64_t>", vector_width, u8(u32_1));
check("convert<int16x64_t,uint8x64_t>", vector_width, i16(u8_1));
check("convert<uint16x64_t,uint8x64_t>", vector_width, u16(u8_1));
check("convert<int32x64_t,uint8x64_t>", vector_width, i32(u8_1));
check("convert<int32x32_t,int16x32_t>", vector_width / 2, i32(i16_1));
check("convert<int32x32_t,uint16x32_t>", vector_width / 2, i32(u16_1));
check("convert<uint32x64_t,uint8x64_t>", vector_width, u32(u8_1));
check("convert<uint32x32_t,uint16x32_t>", vector_width / 2, u32(u16_1));
check("store_narrowing<int32x16_t,int16_t,16>", vector_width / 4, i16(i32_1));
check("store_narrowing<uint32x16_t,uint16_t,16>", vector_width / 4, u16(u32_1));
check("store_narrowing<int16x32_t,int8_t,32>", vector_width / 2, i8(i16_1));
check("store_narrowing<int16x32_t,uint8_t,32>", vector_width / 2, u8(i16_1));
check("store_narrowing<uint16x32_t,uint8_t,32>", vector_width / 2, u8(u16_1));
check("halide_xtensa_sat_narrow_u8", vector_width, u8_sat(i16_1 + i16_2));
check("halide_xtensa_convert_concat_i16_to_i8", vector_width, i8(i16_1 + i16_2));
check("halide_xtensa_convert_concat_i32_to_u16", vector_width, u16(i32_1 + i32_2));
check("halide_xtensa_convert_i32_u16", vector_width / 2, u16(i32_1 + i32_2));

// Averaging instructions.
check("IVP_AVGUNX16", vector_width / 2, u16((u32(u16_1) + u32(u16_2)) / 2));
