
[regression]: Increase in iree-compile memory for > 100X #18869

Open
pdhirajkumarprasad opened this issue Oct 22, 2024 · 5 comments
Labels
bug 🐞 Something isn't working codegen Shared code generation infrastructure and dialects regression Marks regression of feature, compatibility or performance
@pdhirajkumarprasad

What happened?

For the given IR:

#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0, d1, d2) -> (0, 0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1, 0)>
#map5 = affine_map<(d0, d1, d2) -> ()>
#map6 = affine_map<(d0, d1, d2) -> (d1, d2)>
module {
  ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
  func.func @torch_jit(%arg0: tensor<1x3x384x384xf32>, %arg1: tensor<?x?x288xf32>, %arg2: tensor<1x1x288xf32>, %arg3: tensor<3xi64>) -> tensor<?x?x1152xf32> {
    %cst = arith.constant dense<[false, true, true]> : tensor<3xi1>
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant 2.880000e+02 : f32
    %cst_2 = arith.constant dense<1> : tensor<3xi64>
    %c0_i64 = arith.constant 0 : i64
    %cst_3 = arith.constant dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided__> : tensor<1x1x288xf32>
    %cst_4 = arith.constant dense<[1, -1, -1]> : tensor<3xi64>
    %cst_5 = arith.constant dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided___1> : tensor<288xf32>
    %cst_6 = arith.constant dense<2.000000e+00> : tensor<f32>
    %cst_7 = arith.constant dense<9.99999997E-7> : tensor<f32>
    %cst_8 = arith.constant dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided___2> : tensor<288xf32>
    %cst_9 = arith.constant dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided___3> : tensor<288xf32>
    %cst_10 = arith.constant dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided___4> : tensor<288x1152xf32>
    %cst_11 = arith.constant dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided___5> : tensor<1152xf32>
    %c288_i64 = arith.constant 288 : i64
    %0 = tensor.empty() : tensor<3xi64>
    %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%cst, %cst_2, %cst_4 : tensor<3xi1>, tensor<3xi64>, tensor<3xi64>) outs(%0 : tensor<3xi64>) {
    ^bb0(%in: i1, %in_19: i64, %in_20: i64, %out: i64):
      %40 = arith.select %in, %in_19, %in_20 : i64
      linalg.yield %40 : i64
    } -> tensor<3xi64>
    %extracted_slice = tensor.extract_slice %1[0] [1] [1] : tensor<3xi64> to tensor<1xi64>
    %collapsed = tensor.collapse_shape %extracted_slice [] : tensor<1xi64> into tensor<i64>
    %extracted = tensor.extract %collapsed[] : tensor<i64>
    %extracted_slice_12 = tensor.extract_slice %1[1] [1] [1] : tensor<3xi64> to tensor<1xi64>
    %collapsed_13 = tensor.collapse_shape %extracted_slice_12 [] : tensor<1xi64> into tensor<i64>
    %extracted_14 = tensor.extract %collapsed_13[] : tensor<i64>
    %extracted_slice_15 = tensor.extract_slice %1[2] [1] [1] : tensor<3xi64> to tensor<1xi64>
    %collapsed_16 = tensor.collapse_shape %extracted_slice_15 [] : tensor<1xi64> into tensor<i64>
    %extracted_17 = tensor.extract %collapsed_16[] : tensor<i64>
    %2 = arith.cmpi sle, %extracted_17, %c288_i64 : i64
    cf.assert %2, "onnx.Expand input has a dim that is not statically 1; expected this dim >= dim provided shape."
    %3 = arith.cmpi slt, %extracted, %c0_i64 : i64
    %4 = arith.index_cast %extracted : i64 to index
    %5 = arith.select %3, %c1, %4 : index
    %6 = arith.cmpi slt, %extracted_14, %c0_i64 : i64
    %7 = arith.index_cast %extracted_14 : i64 to index
    %8 = arith.select %6, %c1, %7 : index
    %9 = tensor.empty(%5, %8) : tensor<?x?x288xf32>
    %10 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_3 : tensor<1x1x288xf32>) outs(%9 : tensor<?x?x288xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    } -> tensor<?x?x288xf32>
    %11 = tensor.empty() : tensor<1x1x288xf32>
    %12 = linalg.generic {indexing_maps = [#map3, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_5, %arg2 : tensor<288xf32>, tensor<1x1x288xf32>) outs(%11 : tensor<1x1x288xf32>) {
    ^bb0(%in: f32, %in_19: f32, %out: f32):
      %40 = arith.mulf %in, %in_19 : f32
      linalg.yield %40 : f32
    } -> tensor<1x1x288xf32>
    %13 = linalg.generic {indexing_maps = [#map2, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%10, %12 : tensor<?x?x288xf32>, tensor<1x1x288xf32>) outs(%9 : tensor<?x?x288xf32>) {
    ^bb0(%in: f32, %in_19: f32, %out: f32):
      %40 = arith.addf %in, %in_19 : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x288xf32>
    %14 = tensor.empty(%5, %8) : tensor<?x?x1xf32>
    %15 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<?x?x1xf32>) -> tensor<?x?x1xf32>
    %16 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%13 : tensor<?x?x288xf32>) outs(%15 : tensor<?x?x1xf32>) {
    ^bb0(%in: f32, %out: f32):
      %40 = arith.addf %in, %out : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x1xf32>
    %17 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%16 : tensor<?x?x1xf32>) outs(%14 : tensor<?x?x1xf32>) {
    ^bb0(%in: f32, %out: f32):
      %40 = arith.divf %in, %cst_1 : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x1xf32>
    %18 = linalg.generic {indexing_maps = [#map2, #map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%13, %17 : tensor<?x?x288xf32>, tensor<?x?x1xf32>) outs(%9 : tensor<?x?x288xf32>) {
    ^bb0(%in: f32, %in_19: f32, %out: f32):
      %40 = arith.subf %in, %in_19 : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x288xf32>
    %dim = tensor.dim %arg1, %c0 : tensor<?x?x288xf32>
    %dim_18 = tensor.dim %arg1, %c1 : tensor<?x?x288xf32>
    %19 = tensor.empty(%dim, %dim_18) : tensor<?x?x288xf32>
    %20 = linalg.generic {indexing_maps = [#map2, #map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg1, %cst_6 : tensor<?x?x288xf32>, tensor<f32>) outs(%19 : tensor<?x?x288xf32>) {
    ^bb0(%in: f32, %in_19: f32, %out: f32):
      %40 = math.powf %in, %in_19 : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x288xf32>
    %21 = tensor.empty(%dim, %dim_18) : tensor<?x?x1xf32>
    %22 = linalg.fill ins(%cst_0 : f32) outs(%21 : tensor<?x?x1xf32>) -> tensor<?x?x1xf32>
    %23 = linalg.generic {indexing_maps = [#map2, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%20 : tensor<?x?x288xf32>) outs(%22 : tensor<?x?x1xf32>) {
    ^bb0(%in: f32, %out: f32):
      %40 = arith.addf %in, %out : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x1xf32>
    %24 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%23 : tensor<?x?x1xf32>) outs(%21 : tensor<?x?x1xf32>) {
    ^bb0(%in: f32, %out: f32):
      %40 = arith.divf %in, %cst_1 : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x1xf32>
    %25 = linalg.generic {indexing_maps = [#map4, #map5, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%24, %cst_7 : tensor<?x?x1xf32>, tensor<f32>) outs(%21 : tensor<?x?x1xf32>) {
    ^bb0(%in: f32, %in_19: f32, %out: f32):
      %40 = arith.addf %in, %in_19 : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x1xf32>
    %26 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25 : tensor<?x?x1xf32>) outs(%21 : tensor<?x?x1xf32>) {
    ^bb0(%in: f32, %out: f32):
      %40 = math.sqrt %in : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x1xf32>
    %27 = arith.cmpi eq, %5, %dim : index
    cf.assert %27, "mismatched size for broadcast"
    %28 = arith.cmpi eq, %8, %dim_18 : index
    cf.assert %28, "mismatched size for broadcast"
    %29 = linalg.generic {indexing_maps = [#map2, #map4, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%18, %26 : tensor<?x?x288xf32>, tensor<?x?x1xf32>) outs(%9 : tensor<?x?x288xf32>) {
    ^bb0(%in: f32, %in_19: f32, %out: f32):
      %40 = arith.divf %in, %in_19 : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x288xf32>
    %30 = linalg.generic {indexing_maps = [#map2, #map3, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%29, %cst_8 : tensor<?x?x288xf32>, tensor<288xf32>) outs(%9 : tensor<?x?x288xf32>) {
    ^bb0(%in: f32, %in_19: f32, %out: f32):
      %40 = arith.mulf %in, %in_19 : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x288xf32>
    %31 = linalg.generic {indexing_maps = [#map2, #map3, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%30, %cst_9 : tensor<?x?x288xf32>, tensor<288xf32>) outs(%9 : tensor<?x?x288xf32>) {
    ^bb0(%in: f32, %in_19: f32, %out: f32):
      %40 = arith.addf %in, %in_19 : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x288xf32>
    %32 = arith.index_cast %5 : index to i64
    %33 = arith.cmpi sge, %32, %c0_i64 : i64
    cf.assert %33, "negative values not allowed in new dimensions"
    %34 = tensor.empty(%5) : tensor<?x288x1152xf32>
    %35 = linalg.generic {indexing_maps = [#map6, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_10 : tensor<288x1152xf32>) outs(%34 : tensor<?x288x1152xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    } -> tensor<?x288x1152xf32>
    %36 = tensor.empty(%5, %8) : tensor<?x?x1152xf32>
    %37 = linalg.fill ins(%cst_0 : f32) outs(%36 : tensor<?x?x1152xf32>) -> tensor<?x?x1152xf32>
    %38 = linalg.batch_matmul ins(%31, %35 : tensor<?x?x288xf32>, tensor<?x288x1152xf32>) outs(%37 : tensor<?x?x1152xf32>) -> tensor<?x?x1152xf32>
    %39 = linalg.generic {indexing_maps = [#map3, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_11, %38 : tensor<1152xf32>, tensor<?x?x1152xf32>) outs(%36 : tensor<?x?x1152xf32>) {
    ^bb0(%in: f32, %in_19: f32, %out: f32):
      %40 = arith.addf %in, %in_19 : f32
      linalg.yield %40 : f32
    } -> tensor<?x?x1152xf32>
    return %39 : tensor<?x?x1152xf32>
  }
}

{-#
  dialect_resources: {
    builtin: {
      __onnx_constant_not_found_possibly_due_to_being_elided__: "0x00000000",
      __onnx_constant_not_found_possibly_due_to_being_elided___1: "0x00000000",
      __onnx_constant_not_found_possibly_due_to_being_elided___2: "0x00000000",
      __onnx_constant_not_found_possibly_due_to_being_elided___3: "0x00000000",
      __onnx_constant_not_found_possibly_due_to_being_elided___4: "0x00000000",
      __onnx_constant_not_found_possibly_due_to_being_elided___5: "0x00000000"
    }
  }
#-}

the iree-compile memory usage goes beyond 50 GB and compilation still does not complete. This behavior started with

IREE (https://iree.dev):
  IREE compiler version 20241015.1047 @ 7622770c3647cd902546fb23a831e967ee1ccf3e
  LLVM version 20.0.0git
  Optimized build

while with

IREE (https://iree.dev):
  IREE compiler version 20241014.1046 @ d7378bb7299bc558a09216559dc4b4fbf5cb59ef
  LLVM version 20.0.0git
  Optimized build

it compiles in under 1 second using less than 300 MB.

Steps to reproduce your issue

command:

iree-compile model.modified.mlir --iree-hal-target-backends=llvm-cpu -o compiled_model.vmfb
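When reproducing a memory blow-up like this, it can help to bound the compile process so a runaway run fails fast instead of consuming the whole machine. A minimal Linux-only sketch (the wrapper name and the specific limits are illustrative, not part of IREE):

```python
import resource
import subprocess


def run_capped(cmd, mem_bytes, timeout_s):
    """Run `cmd` with a capped address space and wall-clock timeout.

    Linux-only sketch: RLIMIT_AS bounds the child's virtual address
    space, so a compiler memory explosion fails fast with a nonzero
    return code instead of swapping the machine. A hang is cut off by
    the subprocess timeout (raising subprocess.TimeoutExpired).
    """
    def limit():
        # Applied in the child just before exec.
        resource.setrlimit(resource.RLIMIT_AS, (mem_bytes, mem_bytes))

    return subprocess.run(cmd, preexec_fn=limit, timeout=timeout_s)


# Hypothetical usage for this repro (adjust limits to taste):
# run_capped(
#     ["iree-compile", "model.modified.mlir",
#      "--iree-hal-target-backends=llvm-cpu", "-o", "compiled_model.vmfb"],
#     mem_bytes=8 * 1024**3,  # 8 GB cap
#     timeout_s=600,
# )
```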

What component(s) does this issue relate to?

Compiler

Version information

No response

Additional context

No response

@pdhirajkumarprasad pdhirajkumarprasad added the bug 🐞 Something isn't working label Oct 22, 2024
@ScottTodd
Member

@ScottTodd ScottTodd added codegen Shared code generation infrastructure and dialects regression Marks regression of feature, compatibility or performance labels Oct 22, 2024
@saienduri
Collaborator

My guess would be that this PR from those commits is the culprit: #18730. @pdhirajkumarprasad can you try reverting that PR locally?

@pashu123
Contributor

My guess would be that this PR from those commits is the culprit: #18730. @pdhirajkumarprasad can you try reverting that PR locally?

Yes, this causes the timeout issue.

Smaller repro

  func.func @time_out(%arg0: tensor<1x1x288x8x4xf32>, %arg1: tensor<1152xf32>) -> tensor<1x1x1152xf32>  {
    %c1337472 = arith.constant 1337472 : index
    %c1331712 = arith.constant 1331712 : index
    %c0 = arith.constant 0 : index
    %5 = tensor.empty() : tensor<1x1x1152xf32>
    %unpack = tensor.unpack %arg0 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 4] into %5  : tensor<1x1x288x8x4xf32> -> tensor<1x1x1152xf32>
    %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg1, %unpack : tensor<1152xf32>, tensor<1x1x1152xf32>) outs(%5 : tensor<1x1x1152xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %7 = arith.addf %in, %in_0 : f32
      linalg.yield %7 : f32
    } -> tensor<1x1x1152xf32>
    return %6: tensor<1x1x1152xf32>
  }

Looking into this.

@pashu123 pashu123 self-assigned this Oct 22, 2024
@ScottTodd
Member

Some thoughts on how to catch this sort of regression sooner:

Increasing test coverage can give earlier signal for failures.

The tests that failed here (https://github.com/nod-ai/SHARK-TestSuite/actions/workflows/test_e2eshark.yml?query=branch%3Aalt-merge-reports I think?) run nightly and do not raise prominent alerts on failure. We have other test suites that run on every commit (like https://github.com/iree-org/iree-test-suites/tree/main/onnx_models as part of https://github.com/iree-org/iree/blob/main/.github/workflows/pkgci_test_onnx.yml).

I'd prefer that all tests run on presubmit (especially if blocking) in iree-org pull only from the iree-org repositories and other public sources. Contributors should only need access to projects in this organization to be able to make changes with confidence. They shouldn't need access to another repository, any private files, private logs, special hardware, etc.

Presubmit tests should run in less than 30 minutes (ideally 15 minutes or faster). If test suites grow too large, they can be split into shards (given enough runner capacity) or we can apply some selection criteria to choose which run on every commit and which run less frequently.

Guarding against system health regressions in the compiler

We have #13207 tracking general approaches to getting the compiler to fail in useful ways when conditions like those here are observed.

I wonder if the latest idea posted there would have caught this issue:

I had one idea for how to spot when we fall off a cliff, at least: we could add a PassInstrumentation that counts the number of ops in the module and asserts if some threshold is passed. When running with --mlir-print-ir-after-all, that would let us stop right when the threshold is passed, rather than require some extra backtracking.
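The real hook would be a C++ `mlir::PassInstrumentation`; the mechanism can be modeled schematically in Python, assuming a toy module representation (dicts with nested "regions") and names of my own choosing:

```python
class OpCountThresholdError(RuntimeError):
    """Raised when a pass pushes the module past the op-count threshold."""


def count_ops(op):
    # Toy representation: an op is a dict whose "regions" key holds a
    # list of nested child ops.
    return 1 + sum(count_ops(child) for child in op.get("regions", []))


class OpCountInstrumentation:
    """Model of a PassInstrumentation runAfterPass hook: count ops in
    the module after every pass and fail fast past a threshold."""

    def __init__(self, threshold):
        self.threshold = threshold

    def run_after_pass(self, pass_name, module):
        n = count_ops(module)
        if n > self.threshold:
            raise OpCountThresholdError(
                f"op count {n} exceeded threshold {self.threshold} "
                f"after pass {pass_name!r}")


def run_pipeline(module, passes, instrumentation):
    # Each "pass" is a function from module to module; the check fires
    # right after the offending pass, so --mlir-print-ir-after-all
    # output would end at the point where the growth exploded.
    for p in passes:
        module = p(module)
        instrumentation.run_after_pass(p.__name__, module)
    return module
```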

Guarding against system health regressions in test suites

A timeout on these tests may have caught this sooner (but without a helpful error message). We have such timeouts in iree-org/iree and iree-org/iree-test-suites tests (typically 60 seconds for unit tests or 600 seconds for really large tests).

We could explore other watchdog processes like one on memory usage or disk space. These metrics are often correlated.
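A memory watchdog could be as simple as polling the compiler process's resident set size. A Linux-only sketch (reads `/proc`; function names are mine):

```python
import os
import signal
import time


def rss_kb(status_text):
    """Extract VmRSS (in kB) from the text of /proc/<pid>/status."""
    for line in status_text.splitlines():
        if line.startswith("VmRSS:"):
            return int(line.split()[1])
    return 0


def watch(pid, limit_kb, poll_s=1.0):
    """Poll a process's resident set size and kill it past the limit.

    Returns True if the process was killed for exceeding the limit,
    False if it exited on its own. (Sketch only: ignores pid reuse.)
    """
    status_path = f"/proc/{pid}/status"
    while os.path.exists(status_path):
        with open(status_path) as f:
            if rss_kb(f.read()) > limit_kb:
                os.kill(pid, signal.SIGKILL)
                return True
        time.sleep(poll_s)
    return False
```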

Viewing and tracking regular system health metrics

This failure looks like an infinite loop or extremely poor performance ("falling off a performance cliff"). In more usual cases, where a metric grows within one order of magnitude, a test could avoid a timeout while still regressing significantly. In those cases we should lean on automated benchmarks and statistics tracking rather than test suite controls or compiler heuristics.

We have a flag to dump compilation statistics that we could have model test suites include and then dump the results to the logs. We have tracked these metrics in benchmark dashboards before, allowing us to spot historical trends or guard against regressions above a certain threshold (e.g. dispatch count increasing from 500 to 600, or executable binary size increasing from 500KB to 10MB).
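A guard against such threshold regressions can be a small comparison of dumped statistics against a stored baseline. A sketch (the metric names and the 20% ratio are illustrative; real numbers would come from the compiler's statistics dump):

```python
def check_regressions(baseline, current, max_ratio=1.2):
    """Report metrics in `current` that grew past `max_ratio` times
    their `baseline` value.

    Both arguments are dicts of metric name -> numeric value; missing
    metrics are skipped rather than flagged.
    """
    failures = []
    for name, base in baseline.items():
        cur = current.get(name)
        if cur is not None and base > 0 and cur / base > max_ratio:
            failures.append(
                f"{name}: {base} -> {cur} (grew more than {max_ratio:.2f}x)")
    return failures
```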

Note that if the compiler doesn't actually finish running (such as here, with the multi-hour CI runs), any summarization and statistics dumping that we want to run won't help since the code won't reach that point before the run is cancelled.

@pashu123
Contributor

I've created a new tracker for this here: #18875
