'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes #875

pdhirajkumarprasad opened this issue Nov 4, 2024 · 1 comment

@pdhirajkumarprasad

For the given IR:

module {
  func.func @main_graph(%arg2: !torch.vtensor<[?,384,?],f32>) -> !torch.vtensor<[?,384,?],f32>  attributes {torch.onnx_meta.ir_version = 6 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "2.1.0"} {
    %1 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0.0> : tensor<384x384x3xf32>} : () -> !torch.vtensor<[384,384,3],f32> 
    %2 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0.0> : tensor<384xf32>} : () -> !torch.vtensor<[384],f32> 
    %3 = torch.operator "onnx.Conv"(%arg2, %1, %2) {torch.onnx.dilations = [1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [3 : si64], torch.onnx.pads = [1 : si64, 1 : si64], torch.onnx.strides = [2 : si64]} : (!torch.vtensor<[?,384,?],f32>, !torch.vtensor<[384,384,3],f32>, !torch.vtensor<[384],f32>) -> !torch.vtensor<[?,384,?],f32> 
    return %3 : !torch.vtensor<[?,384,?],f32>
  }
}
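
For context, here is a minimal hypothetical PyTorch export sketch (an assumption, not taken from the issue; the module name, file name, and concrete sizes are made up) that produces an equivalent onnx.Conv with 384 input/output channels, kernel_shape [3], strides [2], and pads [1, 1]:

# Hypothetical reproducer sketch (assumed, not from the issue): a 1-D convolution
# matching the attributes on the onnx.Conv above.
import torch

class ConvRepro(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv1d(384, 384, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        return self.conv(x)

model = ConvRepro().eval()
# Concrete batch/length chosen arbitrarily; both are exported as dynamic axes,
# mirroring the ?x384x? shapes in the IR above.
x = torch.randn(2, 384, 100)
torch.onnx.export(
    model, x, "model.onnx",
    input_names=["input"], output_names=["output"],
    dynamic_axes={"input": {0: "batch", 2: "len"}, "output": {0: "batch", 2: "len"}},
)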

the compiler reports the following error:

 error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes

Command: iree-compile --iree-hal-target-backends=llvm-cpu -o abc.vmfb model.torch_onnx.mlir

@vinayakdsci

Compilation fails on this dispatch:
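
(The dispatch source below was apparently dumped to ./debug_875/, judging by the path in the error output further down; one way to obtain such per-dispatch dumps, assuming a recent iree-compile, is the --iree-hal-dump-executable-sources-to=<dir> flag.)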

hal.executable public @main_graph$async_dispatch_1 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "icelake-server", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,+rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,-avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,-pku,-nf,+fsgsbase,-clzero,-mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,-wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,-rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,-shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,-sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @main_graph$async_dispatch_1_elementwise_Dx384xD_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_graph$async_dispatch_1_elementwise_Dx384xD_f32() {
        %c0 = arith.constant 0 : index
        %c32_i64 = arith.constant 32 : i64
        %cst = arith.constant dense<0.000000e+00> : tensor<384x384x3xf32>
        %cst_0 = arith.constant 0.000000e+00 : f32
        %0 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
        %1 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
        %2 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
        %3 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
        %4 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
        %5 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
        %6 = arith.extui %0 : i32 to i64
        %7 = arith.extui %1 : i32 to i64
        %8 = arith.shli %7, %c32_i64 : i64
        %9 = arith.ori %6, %8 : i64
        %10 = arith.index_castui %9 : i64 to index
        %11 = arith.extui %2 : i32 to i64
        %12 = arith.extui %3 : i32 to i64
        %13 = arith.shli %12, %c32_i64 : i64
        %14 = arith.ori %11, %13 : i64
        %15 = arith.index_castui %14 : i64 to index
        %16 = arith.extui %4 : i32 to i64
        %17 = arith.extui %5 : i32 to i64
        %18 = arith.shli %17, %c32_i64 : i64
        %19 = arith.ori %16, %18 : i64
        %20 = arith.index_castui %19 : i64 to index
        %21:3 = util.assume.int 
            %10<umin = 0, umax = 9007199254740991>, 
            %15<umin = 2, umax = 9007199254740993>, 
            %20<umin = 0, umax = 4503599627370496>
          : index, index, index
        %22 = flow.dispatch.workload.ordinal %21#0, 0 : index
        %23 = flow.dispatch.workload.ordinal %21#1, 1 : index
        %24 = flow.dispatch.workload.ordinal %21#2, 2 : index
        %25 = hal.interface.binding.subspan layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x384x?xf32>>{%22, %23}
        %26 = hal.interface.binding.subspan layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x384x?xf32>>{%22, %24}
        %27 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0], sizes = [%22, 384, %23], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x384x?xf32>>{%22, %23} -> tensor<?x384x?xf32>
        %28 = tensor.empty(%22, %24) : tensor<?x384x?xf32>
        %29 = linalg.fill ins(%cst_0 : f32) outs(%28 : tensor<?x384x?xf32>) -> tensor<?x384x?xf32>
        %30 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<2> : vector<1xi64>} ins(%27, %cst : tensor<?x384x?xf32>, tensor<384x384x3xf32>) outs(%29 : tensor<?x384x?xf32>) -> tensor<?x384x?xf32>
        %31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%30 : tensor<?x384x?xf32>) outs(%28 : tensor<?x384x?xf32>) {
        ^bb0(%in: f32, %out: f32):
          %32 = arith.addf %in, %cst_0 : f32
          linalg.yield %32 : f32
        } -> tensor<?x384x?xf32>
        flow.dispatch.tensor.store %31, %26, offsets = [0, 0, 0], sizes = [%22, 384, %24], strides = [1, 1, 1] : tensor<?x384x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x384x?xf32>>{%22, %24}
        return
      }
    }
  }
}

The error message on this dispatch is

./debug_875/module_main_graph$async_dispatch_1.mlir:9:6: error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes
      func.func @main_graph$async_dispatch_1_elementwise_Dx384xD_f32() {
     ^
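
For reference (an observation, not stated in the thread): the filter constant %cst = dense<0.000000e+00> : tensor<384x384x3xf32> is materialized inside the dispatch rather than passed in through a binding. At 4 bytes per f32 element that constant is 384 × 384 × 3 × 4 = 1,769,472 bytes, and the reported 524288 bytes (512 KiB) is already 16× the 32768-byte (32 KiB) per-function stack limit, which suggests a large tile of this constant or an intermediate convolution buffer is being placed on the stack.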
