Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

torch.aten.add to linalg #571

Closed
Tracked by #347
saienduri opened this issue Mar 28, 2024 · 3 comments
Closed
Tracked by #347

torch.aten.add to linalg #571

saienduri opened this issue Mar 28, 2024 · 3 comments
Assignees

Comments

@saienduri
Copy link
Contributor

saienduri commented Mar 28, 2024

We are facing the following issues with the Add op in these models (opt-1.3b, opt-125M, opt-350m, whisper-base, whisper-medium, whisper-small). Repro instructions can be found here:

opt-1.3b.default.pytorch.torch.mlir:695:12: error: 'linalg.generic' op inferred input/output operand #1 has shape's dimension #1 to be 32, but found 1
    %537 = torch.aten.add.Tensor %536, %473, %int1_139 : !torch.vtensor<[1,32,8,8],f32>, !torch.vtensor<[?,?,8,8],f32>, !torch.int -> !torch.vtensor<[?,32,8,8],f32>
           ^
opt-1.3b.default.pytorch.torch.mlir:695:12: note: see current operation: 
%1183 = "linalg.generic"(%1181, %1079, %1182) <{indexing_maps = [affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
  %5053 = "arith.addf"(%arg1, %arg2) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
  "linalg.yield"(%5053) : (f32) -> ()
}) : (tensor<1x32x8x8xf32>, tensor<1x1x8x8xf32>, tensor<1x32x8x8xf32>) -> tensor<1x32x8x8xf32>
opt-125M.default.pytorch.torch.mlir:455:12: error: 'linalg.generic' op inferred input/output operand #1 has shape's dimension #1 to be 12, but found 1
    %345 = torch.aten.add.Tensor %344, %281, %int1_91 : !torch.vtensor<[1,12,8,8],f32>, !torch.vtensor<[?,?,8,8],f32>, !torch.int -> !torch.vtensor<[?,12,8,8],f32>
           ^
opt-125M.default.pytorch.torch.mlir:455:12: note: see current operation: 
%751 = "linalg.generic"(%749, %647, %750) <{indexing_maps = [affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
  %2665 = "arith.addf"(%arg1, %arg2) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
  "linalg.yield"(%2665) : (f32) -> ()
}) : (tensor<1x12x8x8xf32>, tensor<1x1x8x8xf32>, tensor<1x12x8x8xf32>) -> tensor<1x12x8x8xf32>
opt-350m.default.pytorch.torch.mlir:691:12: error: 'linalg.generic' op inferred input/output operand #1 has shape's dimension #1 to be 16, but found 1
    %537 = torch.aten.add.Tensor %536, %473, %int1_138 : !torch.vtensor<[1,16,8,8],f32>, !torch.vtensor<[?,?,8,8],f32>, !torch.int -> !torch.vtensor<[?,16,8,8],f32>
           ^
opt-350m.default.pytorch.torch.mlir:691:12: note: see current operation: 
%1165 = "linalg.generic"(%1163, %1082, %1164) <{indexing_maps = [affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
  %5042 = "arith.addf"(%arg1, %arg2) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
  "linalg.yield"(%5042) : (f32) -> ()
}) : (tensor<1x16x8x8xf32>, tensor<1x1x8x8xf32>, tensor<1x16x8x8xf32>) -> tensor<1x16x8x8xf32>
whisper-base.default.pytorch.torch.mlir:238:12: error: 'linalg.generic' op inferred input/output operand #1 has shape's dimension #1 to be 8, but found 1
    %174 = torch.aten.add.Tensor %173, %138, %int1_49 : !torch.vtensor<[1,8,10,10],f32>, !torch.vtensor<[?,?,10,10],f32>, !torch.int -> !torch.vtensor<[?,8,10,10],f32>
           ^
whisper-base.default.pytorch.torch.mlir:238:12: note: see current operation: 
%397 = "linalg.generic"(%395, %318, %396) <{indexing_maps = [affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
  %1371 = "arith.addf"(%arg1, %arg2) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
  "linalg.yield"(%1371) : (f32) -> ()
}) : (tensor<1x8x10x10xf32>, tensor<1x1x10x10xf32>, tensor<1x8x10x10xf32>) -> tensor<1x8x10x10xf32>
whisper-medium.default.pytorch.torch.mlir:508:12: error: 'linalg.generic' op inferred input/output operand #1 has shape's dimension #1 to be 16, but found 1
    %444 = torch.aten.add.Tensor %443, %408, %int1_49 : !torch.vtensor<[1,16,10,10],f32>, !torch.vtensor<[?,?,10,10],f32>, !torch.int -> !torch.vtensor<[?,16,10,10],f32>
           ^
whisper-medium.default.pytorch.torch.mlir:508:12: note: see current operation: 
%1207 = "linalg.generic"(%1205, %1128, %1206) <{indexing_maps = [affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
  %5223 = "arith.addf"(%arg1, %arg2) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
  "linalg.yield"(%5223) : (f32) -> ()
}) : (tensor<1x16x10x10xf32>, tensor<1x1x10x10xf32>, tensor<1x16x10x10xf32>) -> tensor<1x16x10x10xf32>
whisper-small.default.pytorch.torch.mlir:328:12: error: 'linalg.generic' op inferred input/output operand #1 has shape's dimension #1 to be 12, but found 1
    %264 = torch.aten.add.Tensor %263, %228, %int1_49 : !torch.vtensor<[1,12,10,10],f32>, !torch.vtensor<[?,?,10,10],f32>, !torch.int -> !torch.vtensor<[?,12,10,10],f32>
           ^
whisper-small.default.pytorch.torch.mlir:328:12: note: see current operation: 
%667 = "linalg.generic"(%665, %588, %666) <{indexing_maps = [affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
  %2655 = "arith.addf"(%arg1, %arg2) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
  "linalg.yield"(%2655) : (f32) -> ()
}) : (tensor<1x12x10x10xf32>, tensor<1x1x10x10xf32>, tensor<1x12x10x10xf32>) -> tensor<1x12x10x10xf32>
@ScottTodd
Copy link
Member

Can you add more context to the issue title for this? "Add" isn't very descriptive :P

@saienduri saienduri assigned saienduri and unassigned saienduri Apr 2, 2024
@saienduri saienduri changed the title Add torch.aten.add onnx model lowering Apr 2, 2024
@saienduri saienduri assigned saienduri and unassigned saienduri Apr 2, 2024
@schnkmwt
Copy link

schnkmwt commented Apr 3, 2024

If the tests above are run with the --torchtolinalg switch, the tests do not fail, as torch-mlir is able to lower torch to linalg successfully. The issue shows up if the lowering is done as:
torch-onnx -> torch (using torch-mlir-opt) + torch -> linalg -> bin (using iree-compile)
but it does not show up and the tests work if done as:
torch-onnx -> torch -> linalg (using torch-mlir-opt) + linalg -> bin (using iree-compile)

@AmosLewis AmosLewis changed the title torch.aten.add onnx model lowering torch.aten.add to linalg Apr 19, 2024
@AmosLewis AmosLewis assigned saienduri and schnkmwt and unassigned saienduri Apr 22, 2024
@AmosLewis AmosLewis assigned AmosLewis and unassigned schnkmwt May 7, 2024
@AmosLewis
Copy link
Contributor

AmosLewis commented May 14, 2024

#684

failed to translate executables
opt-1.3b.default.pytorch.torch.mlir:383:12: error: 'iree_linalg_ext.scan' op expected type of operand #1 ('tensor<1x8xi64>') to match type of corresponding result ('tensor<1x?xi64>')
    %364 = torch.aten.cumsum %4, %int1, %none : !torch.vtensor<[1,8],si64>, !torch.int, !torch.none -> !torch.vtensor<[1,8],si64>
           ^
opt-1.3b.default.pytorch.torch.mlir:383:12: note: called from
    %364 = torch.aten.cumsum %4, %int1, %none : !torch.vtensor<[1,8],si64>, !torch.int, !torch.none -> !torch.vtensor<[1,8],si64>
           ^
opt-1.3b.default.pytorch.torch.mlir:383:12: note: see current operation: 
%12:2 = "iree_linalg_ext.scan"(%7, %9, %11) <{dimension = 1 : i64, inclusive = true, operandSegmentSizes = array<i32: 1, 2>}> ({
^bb0(%arg0: i64, %arg1: i64):
  %13 = "arith.addi"(%arg0, %arg1) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
  "iree_linalg_ext.yield"(%13) : (i64) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 0]]>} : (tensor<1x?xi64>, tensor<1x8xi64>, tensor<1xi64>) -> (tensor<1x?xi64>, tensor<1xi64>)
    %364 = torch.aten.cumsum %4, %int1, %none : !torch.vtensor<[1,8],si64>, !torch.int, !torch.none -> !torch.vtensor<[1,8],si64>
           ^
opt-1.3b.default.pytorch.torch.mlir:383:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver3", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,-avx512ifma,+xsave,+sse4.2,-avx512pf,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,-avx512vpopcntdq,+cmov,-avx512vp2intersect,-avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,-avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,-avx512f,-amx-bf16,-avx512bf16,-avx512vnni,-push2pop2,+cx8,-avx512bw,+sse3,-pku,+fsgsbase,+clzero,-mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,-wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,-avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,-avx512vbmi2,-prefetchi,+rdpid,-fma4,-avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,-avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 32 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
    %364 = torch.aten.cumsum %4, %int1, %none : !torch.vtensor<[1,8],si64>, !torch.int, !torch.none -> !torch.vtensor<[1,8],si64>
           ^
opt-1.3b.default.pytorch.torch.mlir:383:12: note: called from
    %364 = torch.aten.cumsum %4, %int1, %none : !torch.vtensor<[1,8],si64>, !torch.int, !torch.none -> !torch.vtensor<[1,8],si64>
           ^
opt-1.3b.default.pytorch.torch.mlir:383:12: note: see current operation: 
"hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg0: !hal.device):
    %0 = "arith.constant"() <{value = 1 : index}> : () -> index
    "hal.return"(%0, %0, %0) : (index, index, index) -> ()
  }) {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "jit_eval_2_dispatch_0_scan_1x8xi64"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "jit_eval_2_dispatch_0_scan_1x8xi64"}> ({
      %0 = "arith.constant"() <{value = 8 : index}> : () -> index
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 0 : index}> : () -> index
      %3 = "arith.constant"() <{value = 64 : index}> : () -> index
      %4 = "hal.interface.binding.subspan"(%2) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<readonly:tensor<1x8xi64>>
      %5 = "hal.interface.binding.subspan"(%2) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<writeonly:tensor<1x8xi64>>
      %6 = "hal.interface.binding.subspan"(%3) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<writeonly:tensor<1xi64>>
      %7 = "flow.dispatch.tensor.load"(%4, %2, %0) <{operandSegmentSizes = array<i32: 1, 0, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, -9223372036854775808>, static_strides = array<i64: 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<1x8xi64>>, index, index) -> tensor<1x?xi64>
      %8 = "tensor.empty"() : () -> tensor<1x8xi64>
      %9 = "linalg.fill"(%1, %8) <{operandSegmentSizes = array<i32: 1, 1>}> ({
      ^bb0(%arg0: i64, %arg1: i64):
        "linalg.yield"(%arg0) : (i64) -> ()
      }) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 0], [0, 0], [0, 0], [0, 0]]>} : (i64, tensor<1x8xi64>) -> tensor<1x8xi64>
      %10 = "tensor.empty"() : () -> tensor<1xi64>
      %11 = "linalg.fill"(%1, %10) <{operandSegmentSizes = array<i32: 1, 1>}> ({
      ^bb0(%arg0: i64, %arg1: i64):
        "linalg.yield"(%arg0) : (i64) -> ()
      }) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1], [0], [0], [0]]>} : (i64, tensor<1xi64>) -> tensor<1xi64>
      %12:2 = "iree_linalg_ext.scan"(%7, %9, %11) <{dimension = 1 : i64, inclusive = true, operandSegmentSizes = array<i32: 1, 2>}> ({
      ^bb0(%arg0: i64, %arg1: i64):
        %13 = "arith.addi"(%arg0, %arg1) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
        "iree_linalg_ext.yield"(%13) : (i64) -> ()
      }) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 0]]>} : (tensor<1x?xi64>, tensor<1x8xi64>, tensor<1xi64>) -> (tensor<1x?xi64>, tensor<1xi64>)
      "flow.dispatch.tensor.store"(%12#0, %5, %2, %0) <{operandSegmentSizes = array<i32: 1, 1, 0, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, -9223372036854775808>, static_strides = array<i64: 1, 1>}> : (tensor<1x?xi64>, !flow.dispatch.tensor<writeonly:tensor<1x8xi64>>, index, index) -> ()
      "flow.dispatch.tensor.store"(%12#1, %6, %2) <{operandSegmentSizes = array<i32: 1, 1, 0, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 1>, static_strides = array<i64: 1>}> : (tensor<1xi64>, !flow.dispatch.tensor<writeonly:tensor<1xi64>>, index) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<CPUDefault>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver3", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,-avx512ifma,+xsave,+sse4.2,-avx512pf,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,-avx512vpopcntdq,+cmov,-avx512vp2intersect,-avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,-avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,-avx512f,-amx-bf16,-avx512bf16,-avx512vnni,-push2pop2,+cx8,-avx512bw,+sse3,-pku,+fsgsbase,+clzero,-mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,-wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,-avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,-avx512vbmi2,-prefetchi,+rdpid,-fma4,-avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,-avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 32 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
    %364 = torch.aten.cumsum %4, %int1, %none : !torch.vtensor<[1,8],si64>, !torch.int, !torch.none -> !torch.vtensor<[1,8],si64>
           ^

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

4 participants