From 9d5fa0662420a7378891526c9397738283ed5406 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Thu, 27 Jun 2024 15:47:23 +0200
Subject: [PATCH 1/3] Eliminate temporary buffer in binary op conversion

Improves MLIR pipeline to avoid allocation of temporary buffer
and a copy for simple linalg binary named operations.

Changes:
- use tensor.empty in outs
- add bufferization pre- and post-processing
- marks outputs as 'restrict'
---
 .../src/transformations/mlir/convert.cpp      | 27 +++++++++++++++----
 1 file changed, 22 insertions(+), 5 deletions(-)
diff --git a/src/common/transformations/src/transformations/mlir/convert.cpp b/src/common/transformations/src/transformations/mlir/convert.cpp
index 220a6dfdc37b33..123f37a1d32c1a 100644
--- a/src/common/transformations/src/transformations/mlir/convert.cpp
+++ b/src/common/transformations/src/transformations/mlir/convert.cpp
@@ -70,7 +70,15 @@ static void prepareMLIRKernelWithoutWrapper(mlir::OwningOpRef<mlir::ModuleOp>& m
 
 #else  // Simplified default lowering to LLVM from LLVM tests
 
+    // Remove empty tensors to avoid converting them into temporary buffers.
+    pm.addPass(bufferization::createEmptyTensorEliminationPass());
+
     pm.addPass(bufferization::createOneShotBufferizePass());
+    // TODO: Add deallocation pass/pipeline to avoid memory leaks.
+
+    // Cleanup after bufferization - possibly remove redundant copies.
+    pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+    pm.addNestedPass<func::FuncOp>(createCSEPass());
 
     // Blanket-convert any remaining high-level vector ops to loops if any remain.
     pm.addNestedPass<func::FuncOp>(createConvertVectorToSCFPass());
@@ -553,11 +561,15 @@ template <typename TargetOp>
 struct ConvertBinary {
     void operator()(ConversionContext& context, std::shared_ptr<ov::Node> node) {
         auto loc = createLocation(context.context, node);
+        auto& builder = context.builder();
         // TODO: Support broadcasts
         const auto inputs = context.getInputs(node);
-        auto op = context.builder().create<TargetOp>(loc,
-                                                     mlir::ValueRange{inputs[0], inputs[1]},
-                                                     /* FIXME: Use linalg.fill or tensor.empty */ mlir::ValueRange{inputs[0]});
+        auto outType = cast<mlir::ShapedType>(inputs[0].getType());
+        // Named binary ops directly overwrite data in `outs` buffer so, there is no need to provide non-empty
+        // destination at the tensor-level.
+        // Use `tensor.empty` to avoid temporary buffer allocation and memcpy after bufferization.
+        auto empty = builder.create<mlir::tensor::EmptyOp>(loc, outType.getShape(), outType.getElementType());
+        auto op = builder.create<TargetOp>(loc, mlir::ValueRange{inputs[0], inputs[1]}, mlir::ValueRange{empty});
         context.addOutputs(node, op);
     }
 };
@@ -606,8 +618,13 @@ mlir::OwningOpRef<mlir::ModuleOp> ngraph_to_mlir(MLIRContext* context,
         auto tensor = conversion_context.nodeOutputMap.at(outputs[i]);
         auto memref = func.getArgument(i + inputs.size());
         auto loc = createLocation(context, outputs[i].get_node_shared_ptr());
-        auto materialize = block_builder.create<bufferization::MaterializeInDestinationOp>(loc, tensor, memref);
-        materialize.setWritable(true);  // TODO: Can I set it as ctor argument above?
+        // TODO: Is restrict always guaranteed?
+        block_builder.create<bufferization::MaterializeInDestinationOp>(loc,
+                                                                        TypeRange{},
+                                                                        tensor,
+                                                                        memref,
+                                                                        /*restrict=*/true,
+                                                                        /*writable=*/true);
     }
 
     const auto retLoc = createLayerLocation(context, "output", "Output");

From 5bd668bd0be5cdb6373456f6c3fd360f578a7223 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Fri, 28 Jun 2024 13:31:46 +0200
Subject: [PATCH 2/3] Address comments

---
 .../src/transformations/mlir/convert.cpp            | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/common/transformations/src/transformations/mlir/convert.cpp b/src/common/transformations/src/transformations/mlir/convert.cpp
index 123f37a1d32c1a..4d78de97eee4c6 100644
--- a/src/common/transformations/src/transformations/mlir/convert.cpp
+++ b/src/common/transformations/src/transformations/mlir/convert.cpp
@@ -568,7 +568,14 @@ struct ConvertBinary {
         // Named binary ops directly overwrite data in `outs` buffer so, there is no need to provide non-empty
         // destination at the tensor-level.
         // Use `tensor.empty` to avoid temporary buffer allocation and memcpy after bufferization.
-        auto empty = builder.create<mlir::tensor::EmptyOp>(loc, outType.getShape(), outType.getElementType());
+        llvm::SmallVector<Value> dynamicSizes;
+        for (auto [idx, dim] : llvm::enumerate(outType.getShape())) {
+            if (!mlir::ShapedType::isDynamic(dim))
+                continue;
+            auto dimSize = builder.create<tensor::DimOp>(loc, inputs[0], idx);
+            dynamicSizes.push_back(dimSize);
+        }
+        auto empty = builder.create<tensor::EmptyOp>(loc, outType, dynamicSizes);
         auto op = builder.create<TargetOp>(loc, mlir::ValueRange{inputs[0], inputs[1]}, mlir::ValueRange{empty});
         context.addOutputs(node, op);
     }
@@ -618,7 +625,9 @@ mlir::OwningOpRef<mlir::ModuleOp> ngraph_to_mlir(MLIRContext* context,
         auto tensor = conversion_context.nodeOutputMap.at(outputs[i]);
         auto memref = func.getArgument(i + inputs.size());
         auto loc = createLocation(context, outputs[i].get_node_shared_ptr());
-        // TODO: Is restrict always guaranteed?
+        // Ensure that the result in stored in the provided function argument.
+        // Mark as restrict to avoid temporary buffer and copy.
+        // Mark as writable to ensure the output can be written to the buffer.
         block_builder.create<bufferization::MaterializeInDestinationOp>(loc,
                                                                         TypeRange{},
                                                                         tensor,

From f46e0fc3373495500017a64b0dbd8aa84eb4a180 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Fri, 28 Jun 2024 15:16:52 +0200
Subject: [PATCH 3/3] Fix comment

---
 src/common/transformations/src/transformations/mlir/convert.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common/transformations/src/transformations/mlir/convert.cpp b/src/common/transformations/src/transformations/mlir/convert.cpp
index 4d78de97eee4c6..70fe5574f4b90b 100644
--- a/src/common/transformations/src/transformations/mlir/convert.cpp
+++ b/src/common/transformations/src/transformations/mlir/convert.cpp
@@ -625,7 +625,7 @@ mlir::OwningOpRef<mlir::ModuleOp> ngraph_to_mlir(MLIRContext* context,
         auto tensor = conversion_context.nodeOutputMap.at(outputs[i]);
         auto memref = func.getArgument(i + inputs.size());
         auto loc = createLocation(context, outputs[i].get_node_shared_ptr());
-        // Ensure that the result in stored in the provided function argument.
+        // Ensure the result is stored in the provided function argument.
         // Mark as restrict to avoid temporary buffer and copy.
         // Mark as writable to ensure the output can be written to the buffer.
         block_builder.create<bufferization::MaterializeInDestinationOp>(loc,