diff --git a/src/common/memory_tracking.hpp b/src/common/memory_tracking.hpp
index 9f6b41a001d..64bfceca244 100644
--- a/src/common/memory_tracking.hpp
+++ b/src/common/memory_tracking.hpp
@@ -227,6 +227,7 @@ enum {
     key_gemm_blocked_a,
     key_gemm_blocked_b,
     key_gemm_accumulator,
+    key_generic_acc,
     key_gnorm_cvt,
     key_gnorm_reduction,
     key_gnorm_tmp_mean,
diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
index 088284f7e29..09f76e7e19a 100644
--- a/src/cpu/aarch64/acl_convolution_utils.hpp
+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -41,7 +41,7 @@ struct acl_conv_conf_t {
     bool fast_math;
     // If this is true, the result of the convolution goes into a temporarily
     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-    bool use_dst_acc;
+    bool use_dst_acc_for_sum;
     // Tells that the selected algorithm is Winograd. This is needed because the
     // algorithm can be set to algorithm::convolution_auto and later on we need to
     // skip fixed-format protocol as ACL Winograd does not support it.
@@ -87,7 +87,7 @@ template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
 status_t execute_forward_conv_acl(
         const exec_ctx_t &ctx, conv_obj_t &acl_conv_obj, const conv_pd_t *pd) {
     bool with_bias = pd->acp_.with_bias;
-    bool use_dst_acc = pd->acp_.use_dst_acc;
+    bool use_dst_acc_for_sum = pd->acp_.use_dst_acc_for_sum;
 
     auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
     auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
@@ -99,14 +99,14 @@ status_t execute_forward_conv_acl(
     acl_conv_obj.wei_tensor.allocator()->import_memory(
             const_cast<wei_data_t *>(wei_base));
 
-    if (use_dst_acc) {
-        // Put the result in a new tensor, it will be accumalated to the dst
-        // during the post ops
-        acl_conv_obj.dst_tensor.allocator()->allocate();
-    } else {
-        auto dst_base = CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
-        acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);
-    }
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
+    acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);
 
     if (with_bias) {
         auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
diff --git a/src/cpu/aarch64/acl_deconvolution.cpp b/src/cpu/aarch64/acl_deconvolution.cpp
index 7ac5507422a..cdeca9cb8bb 100644
--- a/src/cpu/aarch64/acl_deconvolution.cpp
+++ b/src/cpu/aarch64/acl_deconvolution.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022,2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -27,10 +27,18 @@ status_t acl_deconvolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     // concurrent multithreaded access.
     std::lock_guard<std::mutex> _lock {this->mtx};
 
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
     auto src_base = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
     auto wei_base = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS);
     auto bia_base = CTX_IN_MEM(const void *, DNNL_ARG_BIAS);
-    auto dst_base = CTX_OUT_MEM(void *, DNNL_ARG_DST);
+
+    bool use_dst_acc_for_sum = pd()->acl_pd_conf.use_dst_acc_for_sum;
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(void *, DNNL_ARG_DST);
 
     // Retrieve primitive resource and configured Compute Library objects
     auto *acl_resource
@@ -41,14 +49,7 @@ status_t acl_deconvolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     acl_obj.wei_tensor.allocator()->import_memory(const_cast<void *>(wei_base));
     acl_obj.bia_tensor.allocator()->import_memory(const_cast<void *>(bia_base));
 
-    bool use_dst_acc = pd()->acl_pd_conf.use_dst_acc;
-    if (use_dst_acc) {
-        // Put the result in a new tensor, it will be accumalated to the dst
-        // during the post ops
-        acl_obj.dst_tensor.allocator()->allocate();
-    } else {
-        acl_obj.dst_tensor.allocator()->import_memory(dst_base);
-    }
+    acl_obj.dst_tensor.allocator()->import_memory(dst_base);
 
     acl_obj.deconv.run();
 
diff --git a/src/cpu/aarch64/acl_deconvolution.hpp b/src/cpu/aarch64/acl_deconvolution.hpp
index 4b646148b1d..274db7d370a 100644
--- a/src/cpu/aarch64/acl_deconvolution.hpp
+++ b/src/cpu/aarch64/acl_deconvolution.hpp
@@ -37,7 +37,7 @@ struct acl_deconv_conf_t {
     bool with_bias;
     // If this is true, the result of the convolution goes into a temporarily
     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-    bool use_dst_acc;
+    bool use_dst_acc_for_sum;
    bool fast_math;
     arm_compute::TensorInfo src_info;
     arm_compute::TensorInfo wei_info;
@@ -88,7 +88,8 @@ struct acl_deconvolution_fwd_t : public primitive_t {
             , acl_pd_conf()
             , post_ops() {}
 
-        DECLARE_COMMON_PD_T("acl", acl_deconvolution_fwd_t);
+        DECLARE_COMMON_PD_T(
+                "acl", acl_deconvolution_fwd_t, USE_GLOBAL_SCRATCHPAD);
 
         status_t init(engine_t *engine) {
             using namespace data_type;
@@ -291,7 +292,13 @@ struct acl_deconvolution_fwd_t : public primitive_t {
             }
 
             CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_));
-            acl_pd_conf.use_dst_acc = post_ops.has_sum();
+            acl_pd_conf.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acl_pd_conf.use_dst_acc_for_sum) {
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
diff --git a/src/cpu/aarch64/acl_depthwise_convolution.hpp b/src/cpu/aarch64/acl_depthwise_convolution.hpp
index e0567daafde..4df41a4ef1a 100644
--- a/src/cpu/aarch64/acl_depthwise_convolution.hpp
+++ b/src/cpu/aarch64/acl_depthwise_convolution.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Arm Ltd. and affiliates
+* Copyright 2023-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -94,7 +94,14 @@ struct acl_depthwise_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
diff --git a/src/cpu/aarch64/acl_gemm_convolution.hpp b/src/cpu/aarch64/acl_gemm_convolution.hpp
index 4b867b76057..efae57f0b22 100644
--- a/src/cpu/aarch64/acl_gemm_convolution.hpp
+++ b/src/cpu/aarch64/acl_gemm_convolution.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -89,7 +89,14 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
diff --git a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
index 762878ad7d1..c91b965398e 100644
--- a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
+++ b/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
@@ -98,7 +98,14 @@ struct acl_indirect_gemm_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
diff --git a/src/cpu/aarch64/acl_inner_product.cpp b/src/cpu/aarch64/acl_inner_product.cpp
index 7c9671c8357..34de43ae638 100644
--- a/src/cpu/aarch64/acl_inner_product.cpp
+++ b/src/cpu/aarch64/acl_inner_product.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2022 Arm Ltd. and affiliates
+* Copyright 2021-2022,2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     std::lock_guard<std::mutex> _lock {this->mtx};
 
     bool with_bias = pd()->aip.with_bias;
-    bool use_dst_acc = pd()->aip.use_dst_acc;
+    bool use_dst_acc_for_sum = pd()->aip.use_dst_acc_for_sum;
 
     // Retrieve primitive resource and configured Compute Library objects
     acl_ip_obj_t &acl_obj = ctx.get_resource_mapper()
@@ -41,14 +41,14 @@ status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     auto wei_base = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS);
     acl_obj.wei_tensor.allocator()->import_memory(const_cast<void *>(wei_base));
 
-    if (use_dst_acc) {
-        // Put the result in a new tensor, it will be accumalated to the dst
-        // during the post ops
-        acl_obj.dst_tensor.allocator()->allocate();
-    } else {
-        auto dst_base = CTX_OUT_MEM(void *, DNNL_ARG_DST);
-        acl_obj.dst_tensor.allocator()->import_memory(dst_base);
-    }
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(void *, DNNL_ARG_DST);
+    acl_obj.dst_tensor.allocator()->import_memory(dst_base);
 
     if (with_bias) {
         auto bia_base = CTX_IN_MEM(const void *, DNNL_ARG_BIAS);
diff --git a/src/cpu/aarch64/acl_inner_product.hpp b/src/cpu/aarch64/acl_inner_product.hpp
index 0d7c9bed405..2c86e7eb2cf 100644
--- a/src/cpu/aarch64/acl_inner_product.hpp
+++ b/src/cpu/aarch64/acl_inner_product.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2023 Arm Ltd. and affiliates
+* Copyright 2021-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -39,7 +39,7 @@ struct acl_ip_conf_t {
     bool with_bias;
     // If this is true, the result of the inner product goes into a temporarily
     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-    bool use_dst_acc;
+    bool use_dst_acc_for_sum;
     arm_compute::TensorInfo src_tensor_info;
     arm_compute::TensorInfo wei_tensor_info;
     arm_compute::TensorInfo bia_tensor_info;
@@ -118,6 +118,13 @@ struct acl_inner_product_fwd_t : public primitive_t {
 
             CHECK(init_conf_ip(engine, weights_format_kind_received));
 
+            if (aip.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
+
             return status::success;
         }
@@ -185,7 +192,7 @@ struct acl_inner_product_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_,
                     aip.fc_info.activation_info));
-            aip.use_dst_acc = post_ops.has_sum();
+            aip.use_dst_acc_for_sum = post_ops.has_sum();
 
             // WeightFormat::ANY tells ACL we can handle any format
             aip.weights_info = arm_compute::WeightsInfo(false, 1, 1, ic_total,
diff --git a/src/cpu/aarch64/acl_winograd_convolution.hpp b/src/cpu/aarch64/acl_winograd_convolution.hpp
index 9c105775806..580bcb31c92 100644
--- a/src/cpu/aarch64/acl_winograd_convolution.hpp
+++ b/src/cpu/aarch64/acl_winograd_convolution.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -101,7 +101,14 @@ struct acl_wino_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
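The pattern is the same in every primitive touched here: at pd init time the sum post-op triggers booking a dst-sized buffer under `key_generic_acc`, and at execute time the scratchpad grantor hands back that buffer so ACL writes its result there instead of into dst; `acl_post_ops.execute` then accumulates it into dst. Below is a minimal standalone sketch of that book-then-get flow; `registry_t`, `grantor_t`, `scratch_key_t` and the hard-coded sizes are simplified stand-ins for illustration, not the real `memory_tracking` classes.

```cpp
// Sketch of the book-then-get scratchpad pattern, with simplified types.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

enum scratch_key_t { key_generic_acc };

// Booking phase (pd::init time): record how many bytes each key needs.
struct registry_t {
    std::map<scratch_key_t, size_t> sizes;
    void book(scratch_key_t key, size_t nelems, size_t elem_size) {
        sizes[key] = nelems * elem_size;
    }
    size_t total() const {
        size_t n = 0;
        for (const auto &kv : sizes) n += kv.second;
        return n;
    }
};

// Granting phase (execute time): hand out pointers into one backing buffer.
struct grantor_t {
    const registry_t &reg;
    std::vector<uint8_t> buffer;
    explicit grantor_t(const registry_t &r) : reg(r), buffer(r.total()) {}
    void *get(scratch_key_t key) {
        size_t offset = 0;
        for (const auto &kv : reg.sizes) {
            if (kv.first == key) return buffer.data() + offset;
            offset += kv.second;
        }
        return nullptr; // key was never booked
    }
};

int main() {
    // pd::init: a sum post op is present, so book a dst-sized accumulator.
    const size_t nelems = 8;
    registry_t registry;
    registry.book(key_generic_acc, nelems, sizeof(float));

    // execute: the "convolution" result lands in the scratchpad, not in dst...
    grantor_t scratchpad(registry);
    float *acc = static_cast<float *>(scratchpad.get(key_generic_acc));
    float dst[nelems] = {1, 1, 1, 1, 1, 1, 1, 1};
    for (size_t i = 0; i < nelems; ++i) acc[i] = 2.0f * i;

    // ...and the sum post op accumulates it into dst afterwards, which is
    // the role acl_post_ops.execute plays in the real code.
    for (size_t i = 0; i < nelems; ++i) dst[i] += acc[i];
    for (size_t i = 0; i < nelems; ++i) std::cout << dst[i] << " ";
    std::cout << "\n";
    return 0;
}
```

A side benefit of routing the accumulator through the registry (together with `USE_GLOBAL_SCRATCHPAD` in `DECLARE_COMMON_PD_T`) is that the buffer participates in oneDNN's normal scratchpad accounting, so in user-managed scratchpad mode it no longer forces a hidden allocation inside execute the way the old per-run `allocate()` on an ACL tensor did.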