Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Conv2D benchmark failed with DoubleTiling methods #431

Open
pzread opened this issue Apr 5, 2022 · 0 comments
Open

Conv2D benchmark failed with DoubleTiling methods #431

pzread opened this issue Apr 5, 2022 · 0 comments

Comments

@pzread
Copy link

pzread commented Apr 5, 2022

When testing python.examples.conv.conv_2d_bench, I got an error with DoubleTiling methods: error: replacement operation is already associated with another key, while the runs with SingleTiling methods succeeded.

From the stack trace, I think the problem is in the "linalg-interp-transforms" pass, and I can reproduce it with the following IR dump:

Run: mlir-proto-opt -linalg-interp-transforms tmp.mlir with the tmp.mlir below:

func @conv_2d_nhwc_hwcf_main(%arg0: tensor<8x18x18x32xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg1: tensor<3x3x32x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg2: tensor<8x16x16x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = true}) -> tensor<8x16x16x64xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
  %0 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<8x18x18x32xf32>, tensor<3x3x32x64xf32>) outs(%arg2 : tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
  return %0 : tensor<8x16x16x64xf32>
}
func private @nano_time() -> i64 attributes {llvm.emit_c_interface}
func public @main(%arg0: tensor<8x18x18x32xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg1: tensor<3x3x32x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg2: tensor<8x16x16x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = true}, %arg3: memref<?xi64>) -> tensor<8x16x16x64xf32> attributes {llvm.emit_c_interface} {
  %c0 = arith.constant 0 : index
  %0 = memref.dim %arg3, %c0 : memref<?xi64>
  %c1 = arith.constant 1 : index
  %1 = scf.for %arg4 = %c0 to %0 step %c1 iter_args(%arg5 = %arg2) -> (tensor<8x16x16x64xf32>) {
    %2 = call @nano_time() : () -> i64
    %3 = call @conv_2d_nhwc_hwcf_main(%arg0, %arg1, %arg5) : (tensor<8x18x18x32xf32>, tensor<3x3x32x64xf32>, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
    %4 = call @nano_time() : () -> i64
    %5 = arith.subi %4, %2 : i64
    memref.store %5, %arg3[%arg4] : memref<?xi64>
    scf.yield %3 : tensor<8x16x16x64xf32>
  }
  return %1 : tensor<8x16x16x64xf32>
}
iree_linalg_transform.sequence {
  %0 = match @match_linalg_conv_2d_nhwc_hwcf_in_conv_2d_nhwc_hwcf_main
  %tiled_linalg_op, %loops:7 = tile %0 {interchange = [], sizes = [1, 32, 32, 32, 1, 3, 64]}
  %1 = peel_loop %loops#0
  %2 = peel_loop %loops#1
  %3 = peel_loop %loops#2
  %4 = peel_loop %loops#3
  %5 = peel_loop %loops#4
  %6 = peel_loop %loops#5
  %7 = peel_loop %loops#6
  %8 = match @match_linalg_conv_2d_nhwc_hwcf_in_conv_2d_nhwc_hwcf_main
  %tiled_linalg_op_0, %loops_1:7 = tile %8 {interchange = [], sizes = [1, 1, 8, 32, 1, 1, 8]}
  %9 = peel_loop %loops_1#0
  %10 = peel_loop %loops_1#1
  %11 = peel_loop %loops_1#2
  %12 = peel_loop %loops_1#3
  %13 = peel_loop %loops_1#4
  %14 = peel_loop %loops_1#5
  %15 = peel_loop %loops_1#6
  decompose
  vectorize {vectorize_padding = true}
  bufferize
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2, 3], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2, 3, 4], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2, 3, 4, 5], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2, 3, 4, 5, 6], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2, 3, 4, 5, 6, 7], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_to_llvm {enable_amx = false, enable_arm_neon = false, enable_arm_sve = false, enable_async = false, enable_index_optimizations = false, enable_x86vector = false, reassociate_fp_reductions = false}
}
pdl.pattern @match_linalg_conv_2d_nhwc_hwcf_in_conv_2d_nhwc_hwcf_main : benefit(1) {
  %0 = operands
  %1 = types
  %2 = operation "linalg.conv_2d_nhwc_hwcf"(%0 : !pdl.range<value>)  -> (%1 : !pdl.range<type>)
  %3 = attribute @conv_2d_nhwc_hwcf_main
  apply_native_constraint "nestedInFunc"(%2, %3 : !pdl.operation, !pdl.attribute)
  rewrite %2 with "iree_linalg_transform.apply"
}

Error message:

tmp.mlir:2:8: error: replacement operation is already associated with another key
  %0 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<8x18x18x32xf32>, tensor<3x3x32x64xf32>) outs(%arg2 : tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
       ^
tmp.mlir:2:8: note: see current operation: %8 = "scf.for"(%6, %5, %1, %arg4) ({
^bb0(%arg5: index, %arg6: tensor<8x16x16x64xf32>):
  %10 = "scf.for"(%6, %3, %1, %arg6) ({
  ^bb0(%arg7: index, %arg8: tensor<8x16x16x64xf32>):
    %11 = "scf.for"(%6, %2, %0, %arg8) ({
    ^bb0(%arg9: index, %arg10: tensor<8x16x16x64xf32>):
      %12 = "scf.for"(%6, %2, %2, %arg10) ({
      ^bb0(%arg11: index, %arg12: tensor<8x16x16x64xf32>):
        %13 = "scf.for"(%6, %1, %3, %arg12) ({
        ^bb0(%arg13: index, %arg14: tensor<8x16x16x64xf32>):
          %14 = "affine.apply"(%6, %arg9) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
          %15 = "affine.min"(%6, %arg9) {map = affine_map<(d0, d1) -> (-d0 - d1 + 18, 32)>} : (index, index) -> index
          %16 = "affine.apply"(%arg5, %arg11) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
          %17 = "affine.min"(%arg5, %arg11) {map = affine_map<(d0, d1) -> (-d0 - d1 + 18, 34)>} : (index, index) -> index
          %18 = "affine.min"(%arg13) {map = affine_map<(d0) -> (-d0 + 32, 64)>} : (index) -> index
          %19 = "tensor.extract_slice"(%arg0, %arg3, %14, %16, %arg13, %15, %17, %18) {operand_segment_sizes = dense<[1, 4, 3, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808], static_sizes = [1, -1, -1, -1], static_strides = [1, 1, 1, 1]} : (tensor<8x18x18x32xf32>, index, index, index, index, index, index, index) -> tensor<1x?x?x?xf32>
          %20 = "tensor.extract_slice"(%arg1, %arg9, %arg11, %arg13, %arg7, %18) {operand_segment_sizes = dense<[1, 4, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808], static_sizes = [1, 3, -1, 32], static_strides = [1, 1, 1, 1]} : (tensor<3x3x32x64xf32>, index, index, index, index, index) -> tensor<1x3x?x32xf32>
          %21 = "affine.min"(%6) {map = affine_map<(d0) -> (-d0 + 16, 32)>} : (index) -> index
          %22 = "affine.min"(%arg5) {map = affine_map<(d0) -> (-d0 + 16, 32)>} : (index) -> index
          %23 = "tensor.extract_slice"(%arg14, %arg3, %6, %arg5, %arg7, %21, %22) {operand_segment_sizes = dense<[1, 4, 2, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808], static_sizes = [1, -1, -1, 32], static_strides = [1, 1, 1, 1]} : (tensor<8x16x16x64xf32>, index, index, index, index, index, index) -> tensor<1x?x?x32xf32>
          %24 = "linalg.conv_2d_nhwc_hwcf"(%19, %20, %23) ({
          ^bb0(%arg15: f32, %arg16: f32, %arg17: f32):
            %26 = "arith.mulf"(%arg15, %arg16) : (f32, f32) -> f32
            %27 = "arith.addf"(%arg17, %26) : (f32, f32) -> f32
            "linalg.yield"(%27) : (f32) -> ()
          }) {dilations = dense<1> : tensor<2xi64>, iree_linalg_transform.matched, linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>, strides = dense<1> : tensor<2xi64>} : (tensor<1x?x?x?xf32>, tensor<1x3x?x32xf32>, tensor<1x?x?x32xf32>) -> tensor<1x?x?x32xf32>
          %25 = "tensor.insert_slice"(%24, %arg14, %arg3, %6, %arg5, %arg7, %21, %22) {operand_segment_sizes = dense<[1, 1, 4, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808], static_sizes = [1, -1, -1, 32], static_strides = [1, 1, 1, 1]} : (tensor<1x?x?x32xf32>, tensor<8x16x16x64xf32>, index, index, index, index, index, index) -> tensor<8x16x16x64xf32>
          "scf.yield"(%25) : (tensor<8x16x16x64xf32>) -> ()
        }) : (index, index, index, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
        "scf.yield"(%13) : (tensor<8x16x16x64xf32>) -> ()
      }) : (index, index, index, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
      "scf.yield"(%12) : (tensor<8x16x16x64xf32>) -> ()
    }) : (index, index, index, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
    "scf.yield"(%11) : (tensor<8x16x16x64xf32>) -> ()
  }) : (index, index, index, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
  "scf.yield"(%10) : (tensor<8x16x16x64xf32>) -> ()
}) : (index, index, index, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
tmp.mlir:2:8: note: replacing this operation
tmp.mlir:22:32: note: old key
  %tiled_linalg_op, %loops:7 = tile %0 {interchange = [], sizes = [1, 32, 32, 32, 1, 3, 64]}
                               ^
tmp.mlir:22:32: note: new key
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant