From 9d5fa0662420a7378891526c9397738283ed5406 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Thu, 27 Jun 2024 15:47:23 +0200 Subject: [PATCH 1/3] Eliminate temporary buffer in binary op conversion Improves MLIR pipeline to avoid allocation of temporary buffer and a copy for simple linalg binary named operations. Changes: - use tensor.empty in outs - add bufferization pre- and post-processing - marks outputs as 'restrict' --- .../src/transformations/mlir/convert.cpp | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/common/transformations/src/transformations/mlir/convert.cpp b/src/common/transformations/src/transformations/mlir/convert.cpp index 220a6dfdc37b33..123f37a1d32c1a 100644 --- a/src/common/transformations/src/transformations/mlir/convert.cpp +++ b/src/common/transformations/src/transformations/mlir/convert.cpp @@ -70,7 +70,15 @@ static void prepareMLIRKernelWithoutWrapper(mlir::OwningOpRef& m #else // Simplified default lowering to LLVM from LLVM tests + // Remove empty tensors to avoid converting them into temporary buffers. + pm.addPass(bufferization::createEmptyTensorEliminationPass()); + pm.addPass(bufferization::createOneShotBufferizePass()); + // TODO: Add deallocation pass/pipeline to avoid memory leaks. + + // Cleanup after bufferization - possibly remove redundant copies. + pm.addNestedPass(createCanonicalizerPass()); + pm.addNestedPass(createCSEPass()); // Blanket-convert any remaining high-level vector ops to loops if any remain. pm.addNestedPass(createConvertVectorToSCFPass()); @@ -553,11 +561,15 @@ template struct ConvertBinary { void operator()(ConversionContext& context, std::shared_ptr node) { auto loc = createLocation(context.context, node); + auto& builder = context.builder(); // TODO: Support broadcasts const auto inputs = context.getInputs(node); - auto op = context.builder().create(loc, - mlir::ValueRange{inputs[0], inputs[1]}, - /* FIXME: Use linalg.fill or tensor.empty */ mlir::ValueRange{inputs[0]}); + auto outType = cast(inputs[0].getType()); + // Named binary ops directly overwrite data in `outs` buffer so, there is no need to provide non-empty + // destination at the tensor-level. + // Use `tensor.empty` to avoid temporary buffer allocation and memcpy after bufferization. + auto empty = builder.create(loc, outType.getShape(), outType.getElementType()); + auto op = builder.create(loc, mlir::ValueRange{inputs[0], inputs[1]}, mlir::ValueRange{empty}); context.addOutputs(node, op); } }; @@ -606,8 +618,13 @@ mlir::OwningOpRef ngraph_to_mlir(MLIRContext* context, auto tensor = conversion_context.nodeOutputMap.at(outputs[i]); auto memref = func.getArgument(i + inputs.size()); auto loc = createLocation(context, outputs[i].get_node_shared_ptr()); - auto materialize = block_builder.create(loc, tensor, memref); - materialize.setWritable(true); // TODO: Can I set it as ctor argument above? + // TODO: Is restrict always guaranteed? + block_builder.create(loc, + TypeRange{}, + tensor, + memref, + /*restrict=*/true, + /*writable=*/true); } const auto retLoc = createLayerLocation(context, "output", "Output"); From 5bd668bd0be5cdb6373456f6c3fd360f578a7223 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Fri, 28 Jun 2024 13:31:46 +0200 Subject: [PATCH 2/3] Address comments --- .../src/transformations/mlir/convert.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/mlir/convert.cpp b/src/common/transformations/src/transformations/mlir/convert.cpp index 123f37a1d32c1a..4d78de97eee4c6 100644 --- a/src/common/transformations/src/transformations/mlir/convert.cpp +++ b/src/common/transformations/src/transformations/mlir/convert.cpp @@ -568,7 +568,14 @@ struct ConvertBinary { // Named binary ops directly overwrite data in `outs` buffer so, there is no need to provide non-empty // destination at the tensor-level. // Use `tensor.empty` to avoid temporary buffer allocation and memcpy after bufferization. - auto empty = builder.create(loc, outType.getShape(), outType.getElementType()); + llvm::SmallVector dynamicSizes; + for (auto [idx, dim] : llvm::enumerate(outType.getShape())) { + if (!mlir::ShapedType::isDynamic(dim)) + continue; + auto dimSize = builder.create(loc, inputs[0], idx); + dynamicSizes.push_back(dimSize); + } + auto empty = builder.create(loc, outType, dynamicSizes); auto op = builder.create(loc, mlir::ValueRange{inputs[0], inputs[1]}, mlir::ValueRange{empty}); context.addOutputs(node, op); } @@ -618,7 +625,9 @@ mlir::OwningOpRef ngraph_to_mlir(MLIRContext* context, auto tensor = conversion_context.nodeOutputMap.at(outputs[i]); auto memref = func.getArgument(i + inputs.size()); auto loc = createLocation(context, outputs[i].get_node_shared_ptr()); - // TODO: Is restrict always guaranteed? + // Ensure that the result in stored in the provided function argument. + // Mark as restrict to avoid temporary buffer and copy. + // Mark as writable to ensure the output can be written to the buffer. block_builder.create(loc, TypeRange{}, tensor, From f46e0fc3373495500017a64b0dbd8aa84eb4a180 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Fri, 28 Jun 2024 15:16:52 +0200 Subject: [PATCH 3/3] Fix comment --- src/common/transformations/src/transformations/mlir/convert.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/transformations/src/transformations/mlir/convert.cpp b/src/common/transformations/src/transformations/mlir/convert.cpp index 4d78de97eee4c6..70fe5574f4b90b 100644 --- a/src/common/transformations/src/transformations/mlir/convert.cpp +++ b/src/common/transformations/src/transformations/mlir/convert.cpp @@ -625,7 +625,7 @@ mlir::OwningOpRef ngraph_to_mlir(MLIRContext* context, auto tensor = conversion_context.nodeOutputMap.at(outputs[i]); auto memref = func.getArgument(i + inputs.size()); auto loc = createLocation(context, outputs[i].get_node_shared_ptr()); - // Ensure that the result in stored in the provided function argument. + // Ensure the result is stored in the provided function argument. // Mark as restrict to avoid temporary buffer and copy. // Mark as writable to ensure the output can be written to the buffer. block_builder.create(loc,