diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index b196285f6bd0..35fd2b7a78d7 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -582,8 +582,12 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ
                 wrap_topi_schedule(interleaved_schedule),
                 name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
             )
+    # Non-quantized cases
     elif data.dtype in ["float32", "float16"]:
-        # Non-quantized cases
+        # The SME schedule for float16->float32 prearranges the two matrices to be multiplied
+        # using the ARM_SME_BLOCK2_2SVLx1SVL_FP16_TRANSPOSE_INTERLEAVE intrinsic which expects
+        # the reduction axis K as the second dimension of the matrix (i.e. shape = (_, K)).
+        # This means that the flattened weights matrix B needs to be transposed to (N, K).
         if (
             target.features.has_sme
             and kernel.dtype == "float16"
diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index ed398f80e6ef..2476cb92b915 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -172,6 +172,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         KH, KW, IC, OC = get_const_tuple(kernel.shape)
         K = KH * KW * IC
         N = OC
+        # The SME schedule for float16->float32 prearranges the two matrices to be multiplied
+        # using the ARM_SME_BLOCK2_2SVLx1SVL_FP16_TRANSPOSE_INTERLEAVE intrinsic which expects
+        # the reduction axis K as the second dimension of the matrix (i.e. shape = (_, K)).
+        # This means that the flattened weights matrix B needs to be transposed to (N, K).
         transposed_kernel_expr = relay.transpose(inputs[1], axes=[3, 0, 1, 2])
         transposed_flattened_kernel_expr = relay.reshape(transposed_kernel_expr, newshape=(N, K))
         new_kernel_expr = transposed_flattened_kernel_expr
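
The sketch below is not part of the patch; it is a minimal NumPy illustration of why the `relay.transpose(inputs[1], axes=[3, 0, 1, 2])` followed by `relay.reshape(..., newshape=(N, K))` in `_alter_conv2d_layout` yields a weights matrix B whose second dimension is the reduction axis K, matching an im2col'd input of shape (M, K). It assumes a single NHWC image, an HWIO kernel, stride 1, and no padding; all variable names here are illustrative only.

```python
# Illustrative sketch (not part of the patch): B = transpose + flatten of the HWIO
# kernel, so that both GEMM operands carry the reduction axis K as their last dim.
import numpy as np

KH, KW, IC, OC = 3, 3, 8, 16          # kernel layout HWIO
H, W = 5, 5                           # spatial dims of one NHWC image
K, N = KH * KW * IC, OC

kernel = np.random.rand(KH, KW, IC, OC).astype("float32")
data = np.random.rand(H, W, IC).astype("float32")

# Weights matrix as produced by the patch: move OC to the front, then flatten to (N, K).
B = kernel.transpose(3, 0, 1, 2).reshape(N, K)

# im2col: each output pixel becomes a row of length K = KH*KW*IC (same flattening order).
OH, OW = H - KH + 1, W - KW + 1
A = np.stack(
    [data[i : i + KH, j : j + KW, :].reshape(K) for i in range(OH) for j in range(OW)]
)                                     # shape (M, K) with M = OH*OW

# GEMM with the reduction axis K as the second dimension of both operands.
out = A @ B.T                         # shape (M, N)

# Cross-check against a direct contraction over the kernel window.
ref = np.einsum("hwio,mhwi->mo", kernel, A.reshape(-1, KH, KW, IC))
assert np.allclose(out, ref, atol=1e-4)
```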