Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Conv2D benchmark failed with DoubleTiling methods #431

Open
pzread opened this issue Apr 5, 2022 · 0 comments
Open

Conv2D benchmark failed with DoubleTiling methods #431

pzread opened this issue Apr 5, 2022 · 0 comments

Comments

@pzread
Copy link

pzread commented Apr 5, 2022

When testing python.examples.conv.conv_2d_bench, I got an error with DoubleTiling methods: error: replacement operation is already associated with another key, while the runs with SingleTiling methods succeeded.

From the stack trace, I think the problem is in the "linalg-interp-transforms" pass, and I can reproduce it with the following IR dump:

Run: mlir-proto-opt -linalg-interp-transforms tmp.mlir with the tmp.mlir below:

func @conv_2d_nhwc_hwcf_main(%arg0: tensor<8x18x18x32xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg1: tensor<3x3x32x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg2: tensor<8x16x16x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = true}) -> tensor<8x16x16x64xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
  %0 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<8x18x18x32xf32>, tensor<3x3x32x64xf32>) outs(%arg2 : tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
  return %0 : tensor<8x16x16x64xf32>
}
func private @nano_time() -> i64 attributes {llvm.emit_c_interface}
func public @main(%arg0: tensor<8x18x18x32xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg1: tensor<3x3x32x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg2: tensor<8x16x16x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = true}, %arg3: memref<?xi64>) -> tensor<8x16x16x64xf32> attributes {llvm.emit_c_interface} {
  %c0 = arith.constant 0 : index
  %0 = memref.dim %arg3, %c0 : memref<?xi64>
  %c1 = arith.constant 1 : index
  %1 = scf.for %arg4 = %c0 to %0 step %c1 iter_args(%arg5 = %arg2) -> (tensor<8x16x16x64xf32>) {
    %2 = call @nano_time() : () -> i64
    %3 = call @conv_2d_nhwc_hwcf_main(%arg0, %arg1, %arg5) : (tensor<8x18x18x32xf32>, tensor<3x3x32x64xf32>, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
    %4 = call @nano_time() : () -> i64
    %5 = arith.subi %4, %2 : i64
    memref.store %5, %arg3[%arg4] : memref<?xi64>
    scf.yield %3 : tensor<8x16x16x64xf32>
  }
  return %1 : tensor<8x16x16x64xf32>
}
iree_linalg_transform.sequence {
  %0 = match @match_linalg_conv_2d_nhwc_hwcf_in_conv_2d_nhwc_hwcf_main
  %tiled_linalg_op, %loops:7 = tile %0 {interchange = [], sizes = [1, 32, 32, 32, 1, 3, 64]}
  %1 = peel_loop %loops#0
  %2 = peel_loop %loops#1
  %3 = peel_loop %loops#2
  %4 = peel_loop %loops#3
  %5 = peel_loop %loops#4
  %6 = peel_loop %loops#5
  %7 = peel_loop %loops#6
  %8 = match @match_linalg_conv_2d_nhwc_hwcf_in_conv_2d_nhwc_hwcf_main
  %tiled_linalg_op_0, %loops_1:7 = tile %8 {interchange = [], sizes = [1, 1, 8, 32, 1, 1, 8]}
  %9 = peel_loop %loops_1#0
  %10 = peel_loop %loops_1#1
  %11 = peel_loop %loops_1#2
  %12 = peel_loop %loops_1#3
  %13 = peel_loop %loops_1#4
  %14 = peel_loop %loops_1#5
  %15 = peel_loop %loops_1#6
  decompose
  vectorize {vectorize_padding = true}
  bufferize
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2, 3], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2, 3, 4], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2, 3, 4, 5], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2, 3, 4, 5, 6], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_vectors {contraction_lowering = "outerproduct", multireduction_lowering = "innerparallel", split_transfers = "linalg-copy", stages = [1, 2, 3, 4, 5, 6, 7], transpose_avx2_lowering = false, transpose_lowering = "eltwise", unroll_vector_transfers = true}
  lower_to_llvm {enable_amx = false, enable_arm_neon = false, enable_arm_sve = false, enable_async = false, enable_index_optimizations = false, enable_x86vector = false, reassociate_fp_reductions = false}
}
pdl.pattern @match_linalg_conv_2d_nhwc_hwcf_in_conv_2d_nhwc_hwcf_main : benefit(1) {
  %0 = operands
  %1 = types
  %2 = operation "linalg.conv_2d_nhwc_hwcf"(%0 : !pdl.range<value>)  -> (%1 : !pdl.range<type>)
  %3 = attribute @conv_2d_nhwc_hwcf_main
  apply_native_constraint "nestedInFunc"(%2, %3 : !pdl.operation, !pdl.attribute)
  rewrite %2 with "iree_linalg_transform.apply"
}

Error message:

tmp.mlir:2:8: error: replacement operation is already associated with another key
  %0 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<8x18x18x32xf32>, tensor<3x3x32x64xf32>) outs(%arg2 : tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
       ^
tmp.mlir:2:8: note: see current operation: %8 = "scf.for"(%6, %5, %1, %arg4) ({
^bb0(%arg5: index, %arg6: tensor<8x16x16x64xf32>):
  %10 = "scf.for"(%6, %3, %1, %arg6) ({
  ^bb0(%arg7: index, %arg8: tensor<8x16x16x64xf32>):
    %11 = "scf.for"(%6, %2, %0, %arg8) ({
    ^bb0(%arg9: index, %arg10: tensor<8x16x16x64xf32>):
      %12 = "scf.for"(%6, %2, %2, %arg10) ({
      ^bb0(%arg11: index, %arg12: tensor<8x16x16x64xf32>):
        %13 = "scf.for"(%6, %1, %3, %arg12) ({
        ^bb0(%arg13: index, %arg14: tensor<8x16x16x64xf32>):
          %14 = "affine.apply"(%6, %arg9) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
          %15 = "affine.min"(%6, %arg9) {map = affine_map<(d0, d1) -> (-d0 - d1 + 18, 32)>} : (index, index) -> index
          %16 = "affine.apply"(%arg5, %arg11) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
          %17 = "affine.min"(%arg5, %arg11) {map = affine_map<(d0, d1) -> (-d0 - d1 + 18, 34)>} : (index, index) -> index
          %18 = "affine.min"(%arg13) {map = affine_map<(d0) -> (-d0 + 32, 64)>} : (index) -> index
          %19 = "tensor.extract_slice"(%arg0, %arg3, %14, %16, %arg13, %15, %17, %18) {operand_segment_sizes = dense<[1, 4, 3, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808], static_sizes = [1, -1, -1, -1], static_strides = [1, 1, 1, 1]} : (tensor<8x18x18x32xf32>, index, index, index, index, index, index, index) -> tensor<1x?x?x?xf32>
          %20 = "tensor.extract_slice"(%arg1, %arg9, %arg11, %arg13, %arg7, %18) {operand_segment_sizes = dense<[1, 4, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808], static_sizes = [1, 3, -1, 32], static_strides = [1, 1, 1, 1]} : (tensor<3x3x32x64xf32>, index, index, index, index, index) -> tensor<1x3x?x32xf32>
          %21 = "affine.min"(%6) {map = affine_map<(d0) -> (-d0 + 16, 32)>} : (index) -> index
          %22 = "affine.min"(%arg5) {map = affine_map<(d0) -> (-d0 + 16, 32)>} : (index) -> index
          %23 = "tensor.extract_slice"(%arg14, %arg3, %6, %arg5, %arg7, %21, %22) {operand_segment_sizes = dense<[1, 4, 2, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808], static_sizes = [1, -1, -1, 32], static_strides = [1, 1, 1, 1]} : (tensor<8x16x16x64xf32>, index, index, index, index, index, index) -> tensor<1x?x?x32xf32>
          %24 = "linalg.conv_2d_nhwc_hwcf"(%19, %20, %23) ({
          ^bb0(%arg15: f32, %arg16: f32, %arg17: f32):
            %26 = "arith.mulf"(%arg15, %arg16) : (f32, f32) -> f32
            %27 = "arith.addf"(%arg17, %26) : (f32, f32) -> f32
            "linalg.yield"(%27) : (f32) -> ()
          }) {dilations = dense<1> : tensor<2xi64>, iree_linalg_transform.matched, linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>], operand_segment_sizes = dense<[2, 1]> : vector<2xi32>, strides = dense<1> : tensor<2xi64>} : (tensor<1x?x?x?xf32>, tensor<1x3x?x32xf32>, tensor<1x?x?x32xf32>) -> tensor<1x?x?x32xf32>
          %25 = "tensor.insert_slice"(%24, %arg14, %arg3, %6, %arg5, %arg7, %21, %22) {operand_segment_sizes = dense<[1, 1, 4, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808], static_sizes = [1, -1, -1, 32], static_strides = [1, 1, 1, 1]} : (tensor<1x?x?x32xf32>, tensor<8x16x16x64xf32>, index, index, index, index, index, index) -> tensor<8x16x16x64xf32>
          "scf.yield"(%25) : (tensor<8x16x16x64xf32>) -> ()
        }) : (index, index, index, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
        "scf.yield"(%13) : (tensor<8x16x16x64xf32>) -> ()
      }) : (index, index, index, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
      "scf.yield"(%12) : (tensor<8x16x16x64xf32>) -> ()
    }) : (index, index, index, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
    "scf.yield"(%11) : (tensor<8x16x16x64xf32>) -> ()
  }) : (index, index, index, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
  "scf.yield"(%10) : (tensor<8x16x16x64xf32>) -> ()
}) : (index, index, index, tensor<8x16x16x64xf32>) -> tensor<8x16x16x64xf32>
tmp.mlir:2:8: note: replacing this operation
tmp.mlir:22:32: note: old key
  %tiled_linalg_op, %loops:7 = tile %0 {interchange = [], sizes = [1, 32, 32, 32, 1, 3, 64]}
                               ^
tmp.mlir:22:32: note: new key
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant