From 1022157e21feffd2e3335ec8b4217e92aa9fb9cf Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk
Date: Thu, 18 Jul 2024 16:57:43 +0200
Subject: [PATCH 1/2] Match and lower ov::Relu

Adds a ReLU op matcher and lowering to MLIR named Linalg ops.
Also adds buffer deallocation passes to prevent memory leaks when
temporary buffers are created in larger graphs.

---
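For reference, on a statically shaped f32 input the ConvertRelu lowering
below is expected to produce named Linalg ops roughly like this -- a sketch
of the intent, not verbatim compiler output (%input stands for the value
matched as the ReLU input):

    %empty = tensor.empty() : tensor<2x2xf32>
    %zero  = arith.constant 0.000000e+00 : f32
    %fill  = linalg.fill ins(%zero : f32) outs(%empty : tensor<2x2xf32>) -> tensor<2x2xf32>
    %relu  = linalg.max ins(%input, %fill : tensor<2x2xf32>, tensor<2x2xf32>)
                        outs(%empty : tensor<2x2xf32>) -> tensor<2x2xf32>

ReLU(x) = max(x, 0), so the zero-filled tensor supplies the second operand of
linalg.max, while tensor.empty serves as the destination that the named op
overwrites in full.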
 .../src/transformations/mlir/convert.cpp      | 75 +++++++++++++++++--
 .../src/transformations/mlir/mlir_op.cpp      | 14 +++-
 2 files changed, 83 insertions(+), 6 deletions(-)

diff --git a/src/common/transformations/src/transformations/mlir/convert.cpp b/src/common/transformations/src/transformations/mlir/convert.cpp
index b978ee314b3667..d8dad59164a8e7 100644
--- a/src/common/transformations/src/transformations/mlir/convert.cpp
+++ b/src/common/transformations/src/transformations/mlir/convert.cpp
@@ -7,10 +7,10 @@
 #include 
 #include 
 #include 
-#include 
-#include 
 #include 
-
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -269,6 +269,70 @@ class Partitioner : public ov::pass::ModelPass {
     }
 };
 
+struct ConvertRelu {
+    void operator()(ConversionContext& context, NodePtr node) {
+        auto loc = createLocation(context.context, node);
+        auto& builder = context.builder();
+        // TODO: Support broadcasts
+        const auto input = context.getInputs(node)[0];
+        const auto ov_output_element_type = node->get_output_element_type(0);
+        const auto ov_output_shape = node->get_output_partial_shape(0);
+        auto outType = importTensor(context.context, ov_output_shape, ov_output_element_type);
+        // Named unary ops directly overwrite data in the `outs` buffer, so there is no need to provide a
+        // non-empty destination at the tensor level.
+        // Use `tensor.empty` to avoid temporary buffer allocation and a memcpy after bufferization.
+        llvm::SmallVector<mlir::Value> dynamicSizes;
+        for (auto [idx, dim] : llvm::enumerate(outType.getShape())) {
+            if (!mlir::ShapedType::isDynamic(dim))
+                continue;
+            auto dimSize = builder.create<tensor::DimOp>(loc, input, idx);
+            dynamicSizes.push_back(dimSize);
+        }
+        auto empty = builder.create<tensor::EmptyOp>(loc, outType, dynamicSizes);
+        auto zero = getConstant(builder, ov_output_element_type, 0);
+        auto fill = builder.create<linalg::FillOp>(loc, mlir::ValueRange{zero}, mlir::ValueRange{empty});
+        auto relu =
+            builder.create<linalg::MaxOp>(loc, mlir::ValueRange{input, fill.getResult(0)}, mlir::ValueRange{empty});
+        context.addOutputs(node, relu);
+    }
+};
+
+bool elementwise_f32_unary_no_broadcast_predicate(const ov::Output<ov::Node>& output) {
+    if (output.get_element_type() != ov::element::f32) {
+        return false;
+    }
+    // Check if implicit broadcast is possible, reject in this case
+    // Relies on symbolic information -- register SymbolicPropagation before applying this pattern
+    auto input_shape = output.get_node_shared_ptr()->get_input_partial_shape(0);
+    auto output_shape = output.get_partial_shape();
+    if (output_shape.rank().is_dynamic() || input_shape.rank().is_dynamic()) {
+        return false;
+    }
+    if (output_shape.rank().get_length() != input_shape.rank().get_length()) {
+        return false;
+    }
+
+    for (size_t i = 0; i < output_shape.size(); ++i) {
+        if (output_shape[i] != input_shape[i]) {
+            return false;
+        }
+        // Continue if all shapes are static.
+        if (output_shape[i].is_static() && input_shape[i].is_static()) {
+            continue;
+        }
+        if (!ov::symbol::are_equal(output_shape[i].get_symbol(), input_shape[i].get_symbol())) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+template <typename Op>
+NodePtr elementwise_f32_unary_no_broadcast() {
+    using namespace ov::pass::pattern;
+    return wrap_type<Op>({any_input()}, elementwise_f32_unary_no_broadcast_predicate);
+}
 
 bool elementwise_f32_binary_no_broadcast_predicate(const ov::Output<ov::Node>& output) {
     if(output.get_element_type() != ov::element::f32) {
@@ -291,9 +355,9 @@ bool elementwise_f32_binary_no_broadcast_predicate(const ov::Output<ov::Node>& o
             return false;
         }
         // Continue if all shapes are static.
-        if (output_shape[i].is_static() && input_shape_a[i].is_static() &&
-            input_shape_b[i].is_static())
+        if (output_shape[i].is_static() && input_shape_a[i].is_static() && input_shape_b[i].is_static()) {
             continue;
+        }
         if(!ov::symbol::are_equal(output_shape[i].get_symbol(), input_shape_a[i].get_symbol()) || !ov::symbol::are_equal(output_shape[i].get_symbol(), input_shape_b[i].get_symbol())) {
             return false;
         }
@@ -319,6 +383,7 @@ void injectMLIR(std::shared_ptr<ov::Model> model, MLIRContext* context) {
     manager.register_pass<MarkPattern>(elementwise_f32_binary_no_broadcast<v1::Subtract>(), ConvertBinary<linalg::SubOp>());
     manager.register_pass<MarkPattern>(elementwise_f32_binary_no_broadcast<v1::Multiply>(), ConvertBinary<linalg::MulOp>());
     manager.register_pass<MarkPattern>(elementwise_f32_binary_no_broadcast<v1::Divide>(), ConvertBinary<linalg::DivOp>());
+    manager.register_pass<MarkPattern>(elementwise_f32_unary_no_broadcast<v0::Relu>(), ConvertRelu());
     manager.register_pass<MatMulPattern>();
     manager.register_pass<Partitioner>(context);
     manager.run_passes(model);
diff --git a/src/common/transformations/src/transformations/mlir/mlir_op.cpp b/src/common/transformations/src/transformations/mlir/mlir_op.cpp
index 8e7c4e1ce7d9db..51a7a8598f7c88 100644
--- a/src/common/transformations/src/transformations/mlir/mlir_op.cpp
+++ b/src/common/transformations/src/transformations/mlir/mlir_op.cpp
@@ -80,12 +80,24 @@ void prepareMLIRKernelWithoutWrapper(mlir::OwningOpRef<mlir::ModuleOp>& module)
     pm.addPass(bufferization::createEmptyTensorEliminationPass());
 
     pm.addPass(bufferization::createOneShotBufferizePass());
-    // TODO: Add deallocation pass/pipeline to avoid memory leaks.
+    pm.addNestedPass<func::FuncOp>(bufferization::createFinalizingBufferizePass());
 
     // Cleanup after bufferization - possibly remove redundant copies.
     pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
     pm.addNestedPass<func::FuncOp>(createCSEPass());
 
+    // Deallocation pipeline to avoid memory leaks from created temporary buffers.
+    pm.addPass(memref::createExpandReallocPass(/*emitDeallocs=*/false));
+    pm.addPass(createCanonicalizerPass());
+    bufferization::DeallocationOptions deallocOpts;
+    deallocOpts.privateFuncDynamicOwnership = false;
+    pm.addPass(bufferization::createOwnershipBasedBufferDeallocationPass(deallocOpts));
+    pm.addPass(createCanonicalizerPass());
+    pm.addPass(bufferization::createBufferDeallocationSimplificationPass());
+    pm.addPass(bufferization::createLowerDeallocationsPass());
+    pm.addPass(createCSEPass());
+    pm.addPass(createCanonicalizerPass());
+
     // Blanket-convert any remaining high-level vector ops to loops if any remain.
     pm.addNestedPass<func::FuncOp>(createConvertVectorToSCFPass());
     // pm.addNestedPass<func::FuncOp>(createLinalgGeneralizeNamedOpsPass());
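Note on the deallocation passes above: the sequence (expand-realloc,
ownership-based buffer deallocation, deallocation simplification,
lower-deallocations, interleaved with canonicalize/CSE cleanups) mirrors
MLIR's upstream buffer deallocation pipeline. Assuming the pipeline header
"mlir/Dialect/Bufferization/Pipelines/Passes.h" is available in the MLIR
version in use, an equivalent formulation would be:

    bufferization::BufferDeallocationPipelineOptions deallocPipelineOpts;
    bufferization::buildBufferDeallocationPipeline(pm, deallocPipelineOpts);

Spelling the passes out, as done here, keeps the ordering explicit and easy
to tweak per pass.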
From 37ca57cda8b9ed4a254dd474895e7e9b272db050 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk
Date: Thu, 18 Jul 2024 21:39:35 +0200
Subject: [PATCH 2/2] Move to op/relu + predicate refactor

---
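With this refactor the element type is no longer baked into per-arity
predicate functions: elementwise_no_broadcast_predicate_impl walks all inputs
of the matched node, so the same templated predicate serves unary and binary
ops alike. A follow-up op then only needs its own pattern class; for
illustration (ExpPattern/ConvertExp are hypothetical, not part of this
patch), an exp lowering could be wired up the same way:

    ExpPattern::ExpPattern()
        : MarkPattern(wrap_type<v0::Exp>({any_input()},
                                         elementwise_no_broadcast_predicate<ov::element::Type_t::f32>),
                      ConvertExp()) {}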
 .../src/transformations/mlir/convert.cpp      | 121 ++----------------
 .../transformations/mlir/convert_common.cpp   |  37 ++++++
 .../transformations/mlir/convert_common.hpp   |   7 +
 .../src/transformations/mlir/op/relu.cpp      |  59 +++++++++
 .../src/transformations/mlir/op/relu.hpp      |  23 ++++
 5 files changed, 136 insertions(+), 111 deletions(-)
 create mode 100644 src/common/transformations/src/transformations/mlir/op/relu.cpp
 create mode 100644 src/common/transformations/src/transformations/mlir/op/relu.hpp

diff --git a/src/common/transformations/src/transformations/mlir/convert.cpp b/src/common/transformations/src/transformations/mlir/convert.cpp
index d8dad59164a8e7..a755b033cb6264 100644
--- a/src/common/transformations/src/transformations/mlir/convert.cpp
+++ b/src/common/transformations/src/transformations/mlir/convert.cpp
@@ -17,6 +17,8 @@
 #include 
 
 // TODO: Prune unused headers -- it's hard to understand needed ones
+#include "conversion_context.hpp"
+#include "convert_common.hpp"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/InitLLVM.h"
@@ -55,20 +57,16 @@
 #include "mlir/Target/LLVMIR/Dialect/All.h"
 #include "mlir/Target/LLVMIR/Export.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
+#include "mlir_op.hpp"
+#include "op/matmul.hpp"
+#include "op/relu.hpp"
 #include "openvino/core/dimension.hpp"
 #include "openvino/core/rt_info.hpp"
-#include "openvino/pass/pattern/op/wrap_type.hpp"
-#include "transformations_visibility.hpp"
 #include "openvino/core/symbol.hpp"
-
-#include "transformations/symbolic_transformations/symbolic_optimizations.hpp"
-
-#include "mlir_op.hpp"
-#include "convert_common.hpp"
+#include "openvino/pass/pattern/op/wrap_type.hpp"
 #include "subgraph_tracker.hpp"
-#include "conversion_context.hpp"
-#include "op/matmul.hpp"
-
+#include "transformations/symbolic_transformations/symbolic_optimizations.hpp"
+#include "transformations_visibility.hpp"
 
 namespace {
@@ -269,111 +267,12 @@ class Partitioner : public ov::pass::ModelPass {
     }
 };
 
-struct ConvertRelu {
-    void operator()(ConversionContext& context, NodePtr node) {
-        auto loc = createLocation(context.context, node);
-        auto& builder = context.builder();
-        // TODO: Support broadcasts
-        const auto input = context.getInputs(node)[0];
-        const auto ov_output_element_type = node->get_output_element_type(0);
-        const auto ov_output_shape = node->get_output_partial_shape(0);
-        auto outType = importTensor(context.context, ov_output_shape, ov_output_element_type);
-        // Named unary ops directly overwrite data in the `outs` buffer, so there is no need to provide a
-        // non-empty destination at the tensor level.
-        // Use `tensor.empty` to avoid temporary buffer allocation and a memcpy after bufferization.
-        llvm::SmallVector<mlir::Value> dynamicSizes;
-        for (auto [idx, dim] : llvm::enumerate(outType.getShape())) {
-            if (!mlir::ShapedType::isDynamic(dim))
-                continue;
-            auto dimSize = builder.create<tensor::DimOp>(loc, input, idx);
-            dynamicSizes.push_back(dimSize);
-        }
-        auto empty = builder.create<tensor::EmptyOp>(loc, outType, dynamicSizes);
-        auto zero = getConstant(builder, ov_output_element_type, 0);
-        auto fill = builder.create<linalg::FillOp>(loc, mlir::ValueRange{zero}, mlir::ValueRange{empty});
-        auto relu =
-            builder.create<linalg::MaxOp>(loc, mlir::ValueRange{input, fill.getResult(0)}, mlir::ValueRange{empty});
-        context.addOutputs(node, relu);
-    }
-};
-
-bool elementwise_f32_unary_no_broadcast_predicate(const ov::Output<ov::Node>& output) {
-    if (output.get_element_type() != ov::element::f32) {
-        return false;
-    }
-    // Check if implicit broadcast is possible, reject in this case
-    // Relies on symbolic information -- register SymbolicPropagation before applying this pattern
-    auto input_shape = output.get_node_shared_ptr()->get_input_partial_shape(0);
-    auto output_shape = output.get_partial_shape();
-    if (output_shape.rank().is_dynamic() || input_shape.rank().is_dynamic()) {
-        return false;
-    }
-    if (output_shape.rank().get_length() != input_shape.rank().get_length()) {
-        return false;
-    }
-
-    for (size_t i = 0; i < output_shape.size(); ++i) {
-        if (output_shape[i] != input_shape[i]) {
-            return false;
-        }
-        // Continue if all shapes are static.
-        if (output_shape[i].is_static() && input_shape[i].is_static()) {
-            continue;
-        }
-        if (!ov::symbol::are_equal(output_shape[i].get_symbol(), input_shape[i].get_symbol())) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-template <typename Op>
-NodePtr elementwise_f32_unary_no_broadcast() {
-    using namespace ov::pass::pattern;
-    return wrap_type<Op>({any_input()}, elementwise_f32_unary_no_broadcast_predicate);
-}
-
-bool elementwise_f32_binary_no_broadcast_predicate(const ov::Output<ov::Node>& output) {
-    if(output.get_element_type() != ov::element::f32) {
-        return false;
-    }
-    // Check if implicit broadcast is possible, reject in this case
-    // Relies on symbolic information -- register SymbolicPropagation before applying this pattern
-    auto input_shape_a = output.get_node_shared_ptr()->get_input_partial_shape(0);
-    auto input_shape_b = output.get_node_shared_ptr()->get_input_partial_shape(1);
-    auto output_shape = output.get_partial_shape();
-    if(output_shape.rank().is_dynamic() || input_shape_a.rank().is_dynamic() || input_shape_b.rank().is_dynamic()) {
-        return false;
-    }
-    if(output_shape.rank().get_length() != input_shape_a.rank().get_length() || output_shape.rank().get_length() != input_shape_b.rank().get_length()) {
-        return false;
-    }
-
-    for(size_t i = 0; i < output_shape.size(); ++i) {
-        if(output_shape[i] != input_shape_a[i] || output_shape[i] != input_shape_b[i]) {
-            return false;
-        }
-        // Continue if all shapes are static.
-        if (output_shape[i].is_static() && input_shape_a[i].is_static() && input_shape_b[i].is_static()) {
-            continue;
-        }
-        if(!ov::symbol::are_equal(output_shape[i].get_symbol(), input_shape_a[i].get_symbol()) || !ov::symbol::are_equal(output_shape[i].get_symbol(), input_shape_b[i].get_symbol())) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-
 template <typename Op>
 NodePtr elementwise_f32_binary_no_broadcast() {
     using namespace ov::pass::pattern;
-    return wrap_type<Op>({any_input(), any_input()}, elementwise_f32_binary_no_broadcast_predicate);
+    return wrap_type<Op>({any_input(), any_input()}, elementwise_no_broadcast_predicate<ov::element::Type_t::f32>);
 }
 
-
 void injectMLIR(std::shared_ptr<ov::Model> model, MLIRContext* context) {
     ov::pass::Manager manager;
     using namespace ov::op;
@@ -383,7 +282,7 @@ void injectMLIR(std::shared_ptr<ov::Model> model, MLIRContext* context) {
     manager.register_pass<MarkPattern>(elementwise_f32_binary_no_broadcast<v1::Subtract>(), ConvertBinary<linalg::SubOp>());
     manager.register_pass<MarkPattern>(elementwise_f32_binary_no_broadcast<v1::Multiply>(), ConvertBinary<linalg::MulOp>());
     manager.register_pass<MarkPattern>(elementwise_f32_binary_no_broadcast<v1::Divide>(), ConvertBinary<linalg::DivOp>());
-    manager.register_pass<MarkPattern>(elementwise_f32_unary_no_broadcast<v0::Relu>(), ConvertRelu());
+    manager.register_pass<ReluPattern>();
     manager.register_pass<MatMulPattern>();
     manager.register_pass<Partitioner>(context);
     manager.run_passes(model);
diff --git a/src/common/transformations/src/transformations/mlir/convert_common.cpp b/src/common/transformations/src/transformations/mlir/convert_common.cpp
index f7c917af1a23cb..6bca04c759a356 100644
--- a/src/common/transformations/src/transformations/mlir/convert_common.cpp
+++ b/src/common/transformations/src/transformations/mlir/convert_common.cpp
@@ -128,5 +128,42 @@ Location createLocation(MLIRContext* ctx, NodePtr node) {
     return createLayerLocation(ctx, node->get_friendly_name(), node->get_type_name());
 }
 
+bool elementwise_no_broadcast_predicate_impl(const ov::Output<ov::Node>& output, ov::element::Type type) {
+    if (output.get_element_type() != type) {
+        return false;
+    }
+    // Check if implicit broadcast is possible, reject in this case
+    // Relies on symbolic information -- register SymbolicPropagation before applying this pattern
+    auto inputs = output.get_node_shared_ptr()->inputs();
+    auto output_shape = output.get_partial_shape();
+    if (output_shape.rank().is_dynamic()) {
+        return false;
+    }
+    if (std::any_of(inputs.begin(), inputs.end(), [&](const ov::Input<ov::Node>& input) {
+            auto input_shape = input.get_partial_shape();
+            return input_shape.rank().is_dynamic() ||
+                   output_shape.rank().get_length() != input_shape.rank().get_length();
+        })) {
+        return false;
+    }
+
+    if (std::any_of(inputs.begin(), inputs.end(), [&](const ov::Input<ov::Node>& input) {
+            for (size_t i = 0; i < output_shape.size(); ++i) {
+                auto input_shape = input.get_partial_shape();
+                if (output_shape[i] != input_shape[i])
+                    return true;
+                if (output_shape[i].is_static() && input_shape[i].is_static())
+                    continue;
+                if (!ov::symbol::are_equal(output_shape[i].get_symbol(), input_shape[i].get_symbol()))
+                    return true;
+            }
+            return false;
+        })) {
+        return false;
+    }
+
+    return true;
+}
+
 } // namespace mlir
 } // namespace ov
\ No newline at end of file
diff --git a/src/common/transformations/src/transformations/mlir/convert_common.hpp b/src/common/transformations/src/transformations/mlir/convert_common.hpp
index 7fd19fe875eb80..a33c99e6bedc57 100644
--- a/src/common/transformations/src/transformations/mlir/convert_common.hpp
+++ b/src/common/transformations/src/transformations/mlir/convert_common.hpp
@@ -30,6 +30,13 @@ RankedTensorType importTensor(MLIRContext* ctx,
 
 Location createLocation(MLIRContext* ctx, NodePtr node);
 
+bool elementwise_no_broadcast_predicate_impl(const ov::Output<ov::Node>& output, ov::element::Type type);
+
+template <ov::element::Type_t type>
+bool elementwise_no_broadcast_predicate(const ov::Output<ov::Node>& output) {
+    return elementwise_no_broadcast_predicate_impl(output, type);
+}
+
 // Borrowed it from TPP-MLIR. FIXME: Do we have a better upstreamed alternative?
 template <typename T>
 mlir::arith::ConstantOp getConstant(OpBuilder &builder, const ov::element::Type& precision, T value) {
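Illustrative expectations for the predicate above (not tests from this
patch): a match requires every input dimension to line up with the output
dimension, either statically or through equal symbols.

    // [2, 3]  vs [2, 3]  -> match  (static and equal)
    // [2, 3]  vs [1, 3]  -> reject (would require implicit broadcast)
    // [?a, 3] vs [?a, 3] -> match  (same symbol on the dynamic dim)
    // [?a, 3] vs [?b, 3] -> reject (symbols may differ at runtime)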
diff --git a/src/common/transformations/src/transformations/mlir/op/relu.cpp b/src/common/transformations/src/transformations/mlir/op/relu.cpp
new file mode 100644
index 00000000000000..a25f571f61cddf
--- /dev/null
+++ b/src/common/transformations/src/transformations/mlir/op/relu.cpp
@@ -0,0 +1,59 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+
+#include <openvino/op/relu.hpp>
+#include "openvino/pass/pattern/op/wrap_type.hpp"
+
+#include "relu.hpp"
+#include "../convert_common.hpp"
+
+namespace {
+
+using namespace ov::mlir;
+
+struct ConvertRelu {
+    void operator()(ConversionContext& context, NodePtr node) {
+        auto loc = createLocation(context.context, node);
+        auto& builder = context.builder();
+        // TODO: Support broadcasts
+        const auto input = context.getInputs(node)[0];
+        const auto ov_output_element_type = node->get_output_element_type(0);
+        const auto ov_output_shape = node->get_output_partial_shape(0);
+        auto outType = importTensor(context.context, ov_output_shape, ov_output_element_type);
+        // Named unary ops directly overwrite data in the `outs` buffer, so there is no need to provide a
+        // non-empty destination at the tensor level.
+        // Use `tensor.empty` to avoid temporary buffer allocation and a memcpy after bufferization.
+        llvm::SmallVector<mlir::Value> dynamicSizes;
+        for (auto [idx, dim] : llvm::enumerate(outType.getShape())) {
+            if (!mlir::ShapedType::isDynamic(dim))
+                continue;
+            auto dimSize = builder.create<tensor::DimOp>(loc, input, idx);
+            dynamicSizes.push_back(dimSize);
+        }
+        auto empty = builder.create<tensor::EmptyOp>(loc, outType, dynamicSizes);
+        auto zero = getConstant(builder, ov_output_element_type, 0);
+        auto fill = builder.create<linalg::FillOp>(loc, mlir::ValueRange{zero}, mlir::ValueRange{empty});
+        auto relu =
+            builder.create<linalg::MaxOp>(loc, mlir::ValueRange{input, fill.getResult(0)}, mlir::ValueRange{empty});
+        context.addOutputs(node, relu);
+    }
+};
+
+} // namespace
+
+namespace ov {
+namespace mlir {
+
+using namespace ov::pass::pattern;
+using namespace ov::op;
+
+ReluPattern::ReluPattern()
+    : MarkPattern(wrap_type<v0::Relu>({any_input()}, elementwise_no_broadcast_predicate<ov::element::Type_t::f32>),
+                  ConvertRelu()) {}
+
+} // namespace mlir
+} // namespace ov
diff --git a/src/common/transformations/src/transformations/mlir/op/relu.hpp b/src/common/transformations/src/transformations/mlir/op/relu.hpp
new file mode 100644
index 00000000000000..a51c7366d834fb
--- /dev/null
+++ b/src/common/transformations/src/transformations/mlir/op/relu.hpp
@@ -0,0 +1,23 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Value.h"
+
+#include "../conversion_context.hpp"
+
+namespace ov {
+namespace mlir {
+
+class ReluPattern : public MarkPattern {
+public:
+    OPENVINO_RTTI("ReluPattern", "0");
+    ReluPattern();
+};
+
+} // namespace mlir
+} // namespace ov