'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes #875

pdhirajkumarprasad opened this issue Nov 4, 2024 · 1 comment

@pdhirajkumarprasad

For the given IR:

module {
  func.func @main_graph(%arg2: !torch.vtensor<[?,384,?],f32>) -> !torch.vtensor<[?,384,?],f32>  attributes {torch.onnx_meta.ir_version = 6 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "2.1.0"} {
    %1 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0.0> : tensor<384x384x3xf32>} : () -> !torch.vtensor<[384,384,3],f32> 
    %2 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0.0> : tensor<384xf32>} : () -> !torch.vtensor<[384],f32> 
    %3 = torch.operator "onnx.Conv"(%arg2, %1, %2) {torch.onnx.dilations = [1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [3 : si64], torch.onnx.pads = [1 : si64, 1 : si64], torch.onnx.strides = [2 : si64]} : (!torch.vtensor<[?,384,?],f32>, !torch.vtensor<[384,384,3],f32>, !torch.vtensor<[384],f32>) -> !torch.vtensor<[?,384,?],f32> 
    return %3 : !torch.vtensor<[?,384,?],f32>
  }
}
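
For context, here is a minimal hypothetical PyTorch export sketch (an assumption, not taken from the issue; the module name, file name, and concrete sizes are made up) that produces an equivalent onnx.Conv with 384 input/output channels, kernel_shape [3], strides [2], and pads [1, 1]:

# Hypothetical reproducer sketch (assumed, not from the issue): a 1-D convolution
# matching the attributes on the onnx.Conv above.
import torch

class ConvRepro(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv1d(384, 384, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        return self.conv(x)

model = ConvRepro().eval()
# Concrete batch/length chosen arbitrarily; both are exported as dynamic axes,
# mirroring the ?x384x? shapes in the IR above.
x = torch.randn(2, 384, 100)
torch.onnx.export(
    model, x, "model.onnx",
    input_names=["input"], output_names=["output"],
    dynamic_axes={"input": {0: "batch", 2: "len"}, "output": {0: "batch", 2: "len"}},
)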

the compiler reports the following error:

 error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes

Command: iree-compile --iree-hal-target-backends=llvm-cpu -o abc.vmfb model.torch_onnx.mlir

@vinayakdsci

Compilation fails on this dispatch:
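
(The dispatch source below was apparently dumped to ./debug_875/, judging by the path in the error output further down; one way to obtain such per-dispatch dumps, assuming a recent iree-compile, is the --iree-hal-dump-executable-sources-to=<dir> flag.)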

hal.executable public @main_graph$async_dispatch_1 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "icelake-server", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,+rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,-avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,-pku,-nf,+fsgsbase,-clzero,-mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,-wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,-rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,-shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,-sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @main_graph$async_dispatch_1_elementwise_Dx384xD_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_graph$async_dispatch_1_elementwise_Dx384xD_f32() {
        %c0 = arith.constant 0 : index
        %c32_i64 = arith.constant 32 : i64
        %cst = arith.constant dense<0.000000e+00> : tensor<384x384x3xf32>
        %cst_0 = arith.constant 0.000000e+00 : f32
        %0 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
        %1 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
        %2 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
        %3 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
        %4 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
        %5 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
        %6 = arith.extui %0 : i32 to i64
        %7 = arith.extui %1 : i32 to i64
        %8 = arith.shli %7, %c32_i64 : i64
        %9 = arith.ori %6, %8 : i64
        %10 = arith.index_castui %9 : i64 to index
        %11 = arith.extui %2 : i32 to i64
        %12 = arith.extui %3 : i32 to i64
        %13 = arith.shli %12, %c32_i64 : i64
        %14 = arith.ori %11, %13 : i64
        %15 = arith.index_castui %14 : i64 to index
        %16 = arith.extui %4 : i32 to i64
        %17 = arith.extui %5 : i32 to i64
        %18 = arith.shli %17, %c32_i64 : i64
        %19 = arith.ori %16, %18 : i64
        %20 = arith.index_castui %19 : i64 to index
        %21:3 = util.assume.int 
            %10<umin = 0, umax = 9007199254740991>, 
            %15<umin = 2, umax = 9007199254740993>, 
            %20<umin = 0, umax = 4503599627370496>
          : index, index, index
        %22 = flow.dispatch.workload.ordinal %21#0, 0 : index
        %23 = flow.dispatch.workload.ordinal %21#1, 1 : index
        %24 = flow.dispatch.workload.ordinal %21#2, 2 : index
        %25 = hal.interface.binding.subspan layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x384x?xf32>>{%22, %23}
        %26 = hal.interface.binding.subspan layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x384x?xf32>>{%22, %24}
        %27 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0], sizes = [%22, 384, %23], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x384x?xf32>>{%22, %23} -> tensor<?x384x?xf32>
        %28 = tensor.empty(%22, %24) : tensor<?x384x?xf32>
        %29 = linalg.fill ins(%cst_0 : f32) outs(%28 : tensor<?x384x?xf32>) -> tensor<?x384x?xf32>
        %30 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<2> : vector<1xi64>} ins(%27, %cst : tensor<?x384x?xf32>, tensor<384x384x3xf32>) outs(%29 : tensor<?x384x?xf32>) -> tensor<?x384x?xf32>
        %31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%30 : tensor<?x384x?xf32>) outs(%28 : tensor<?x384x?xf32>) {
        ^bb0(%in: f32, %out: f32):
          %32 = arith.addf %in, %cst_0 : f32
          linalg.yield %32 : f32
        } -> tensor<?x384x?xf32>
        flow.dispatch.tensor.store %31, %26, offsets = [0, 0, 0], sizes = [%22, 384, %24], strides = [1, 1, 1] : tensor<?x384x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x384x?xf32>>{%22, %24}
        return
      }
    }
  }
}

The error message on this dispatch is

./debug_875/module_main_graph$async_dispatch_1.mlir:9:6: error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes
      func.func @main_graph$async_dispatch_1_elementwise_Dx384xD_f32() {
     ^
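
For reference (an observation, not stated in the thread): the filter constant %cst = dense<0.000000e+00> : tensor<384x384x3xf32> is materialized inside the dispatch rather than passed in through a binding. At 4 bytes per f32 element that constant is 384 × 384 × 3 × 4 = 1,769,472 bytes, and the reported 524288 bytes (512 KiB) is already 16× the 32768-byte (32 KiB) per-function stack limit, which suggests a large tile of this constant or an intermediate convolution buffer is being placed on the stack.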
