'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes #875
Comments
Compilation fails on this dispatch:

```mlir
hal.executable public @main_graph$async_dispatch_1 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "icelake-server", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,+rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,-avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,-pku,-nf,+fsgsbase,-clzero,-mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,-wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,-rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,-shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,-sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @main_graph$async_dispatch_1_elementwise_Dx384xD_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_1_elementwise_Dx384xD_f32() {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant dense<0.000000e+00> : tensor<384x384x3xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = arith.extui %0 : i32 to i64
%7 = arith.extui %1 : i32 to i64
%8 = arith.shli %7, %c32_i64 : i64
%9 = arith.ori %6, %8 : i64
%10 = arith.index_castui %9 : i64 to index
%11 = arith.extui %2 : i32 to i64
%12 = arith.extui %3 : i32 to i64
%13 = arith.shli %12, %c32_i64 : i64
%14 = arith.ori %11, %13 : i64
%15 = arith.index_castui %14 : i64 to index
%16 = arith.extui %4 : i32 to i64
%17 = arith.extui %5 : i32 to i64
%18 = arith.shli %17, %c32_i64 : i64
%19 = arith.ori %16, %18 : i64
%20 = arith.index_castui %19 : i64 to index
%21:3 = util.assume.int
%10<umin = 0, umax = 9007199254740991>,
%15<umin = 2, umax = 9007199254740993>,
%20<umin = 0, umax = 4503599627370496>
: index, index, index
%22 = flow.dispatch.workload.ordinal %21#0, 0 : index
%23 = flow.dispatch.workload.ordinal %21#1, 1 : index
%24 = flow.dispatch.workload.ordinal %21#2, 2 : index
%25 = hal.interface.binding.subspan layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x384x?xf32>>{%22, %23}
%26 = hal.interface.binding.subspan layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x384x?xf32>>{%22, %24}
%27 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0], sizes = [%22, 384, %23], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x384x?xf32>>{%22, %23} -> tensor<?x384x?xf32>
%28 = tensor.empty(%22, %24) : tensor<?x384x?xf32>
%29 = linalg.fill ins(%cst_0 : f32) outs(%28 : tensor<?x384x?xf32>) -> tensor<?x384x?xf32>
%30 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<2> : vector<1xi64>} ins(%27, %cst : tensor<?x384x?xf32>, tensor<384x384x3xf32>) outs(%29 : tensor<?x384x?xf32>) -> tensor<?x384x?xf32>
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%30 : tensor<?x384x?xf32>) outs(%28 : tensor<?x384x?xf32>) {
^bb0(%in: f32, %out: f32):
%32 = arith.addf %in, %cst_0 : f32
linalg.yield %32 : f32
} -> tensor<?x384x?xf32>
flow.dispatch.tensor.store %31, %26, offsets = [0, 0, 0], sizes = [%22, 384, %24], strides = [1, 1, 1] : tensor<?x384x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x384x?xf32>>{%22, %24}
return
}
}
}
}
```

The error message on this dispatch is:

```
'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes
```
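For context, a quick back-of-the-envelope size check (my own arithmetic, not compiler output; the error message does not say which buffer the rejected 524288-byte stack allocation actually backs, so tying it to a specific value in the IR is only a guess):

```python
# Back-of-the-envelope sizes for this dispatch (illustrative only).
F32_BYTES = 4

# The inlined conv filter constant: dense<0.0> : tensor<384x384x3xf32>
filter_bytes = 384 * 384 * 3 * F32_BYTES
print(filter_bytes)                    # 1769472 bytes (~1.69 MiB)

# Limit reported by the pass vs. the allocation it rejected.
limit_bytes = 32768
reported_bytes = 524288
print(reported_bytes // limit_bytes)   # 16 -> 16x over the per-function limit
```

So whatever intermediate the CPU backend tries to place on the stack here is 16x over the 32 KiB per-function limit, and the dispatch also carries a ~1.7 MiB inlined filter constant for the `linalg.conv_1d_ncw_fcw`.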
For the given IR, getting the error above with the following command:

```
iree-compile --iree-hal-target-backends=llvm-cpu -o abc.vmfb model.torch_onnx.mlir
```