cpu: aarch64: Move allocation of accumulator from ACL to oneDNN scratchpad #1936

Merged: 2 commits, Jun 3, 2024

1 change: 1 addition & 0 deletions src/common/memory_tracking.hpp
@@ -227,6 +227,7 @@ enum {
     key_gemm_blocked_a,
     key_gemm_blocked_b,
     key_gemm_accumulator,
+    key_generic_acc,
     key_gnorm_cvt,
     key_gnorm_reduction,
     key_gnorm_tmp_mean,
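The two halves of the change meet at this new key: each affected primitive descriptor books dst-sized accumulator space under key_generic_acc, and the execute path fetches it from the scratchpad grantor instead of letting ACL allocate a temporary tensor. A condensed sketch of that pattern, distilled from the hunks below rather than copied from any single file (acc_base is a placeholder name):

    // In pd_t::init(), once an unfused sum post-op is detected:
    if (use_dst_acc_for_sum) {
        const memory_desc_wrapper dst_d(&dst_md_);
        auto scratchpad = scratchpad_registry().registrar();
        scratchpad.book(memory_tracking::names::key_generic_acc,
                dst_d.nelems(), dst_d.data_type_size());
    }

    // In execute(), the booked buffer replaces the ACL-allocated accumulator:
    const auto scratchpad = ctx.get_scratchpad_grantor();
    void *acc_base = scratchpad.get<void>(memory_tracking::names::key_generic_acc);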
22 changes: 11 additions & 11 deletions src/cpu/aarch64/acl_convolution_utils.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -41,7 +41,7 @@ struct acl_conv_conf_t {
     bool fast_math;
     // If this is true, the result of the convolution goes into a temporarily
     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-    bool use_dst_acc;
+    bool use_dst_acc_for_sum;
     // Tells that the selected algorithm is Winograd. This is needed because the
     // algorithm can be set to algorithm::convolution_auto and later on we need to
     // skip fixed-format protocol as ACL Winograd does not support it.
@@ -87,7 +87,7 @@ template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
 status_t execute_forward_conv_acl(
         const exec_ctx_t &ctx, conv_obj_t &acl_conv_obj, const conv_pd_t *pd) {
     bool with_bias = pd->acp_.with_bias;
-    bool use_dst_acc = pd->acp_.use_dst_acc;
+    bool use_dst_acc_for_sum = pd->acp_.use_dst_acc_for_sum;
 
     auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
     auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
@@ -99,14 +99,14 @@ status_t execute_forward_conv_acl(
     acl_conv_obj.wei_tensor.allocator()->import_memory(
             const_cast<wei_data_t *>(wei_base));
 
-    if (use_dst_acc) {
-        // Put the result in a new tensor, it will be accumalated to the dst
-        // during the post ops
-        acl_conv_obj.dst_tensor.allocator()->allocate();
-    } else {
-        auto dst_base = CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
-        acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);
-    }
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
+    acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);
 
     if (with_bias) {
         auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
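When dst_base points at the scratchpad accumulator, ACL writes the raw convolution result there and the dst memory keeps its original contents for the sum post-op to read. Purely as an illustration of the sum semantics applied later by the ACL post-ops (this loop is not the actual kernel; nelems, sum_scale, dst and acc are placeholders):

    // Schematic of an unfused sum post-op: combine the original dst contents
    // with the accumulator the convolution just produced.
    for (size_t i = 0; i < nelems; ++i)
        dst[i] = sum_scale * dst[i] + acc[i];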
21 changes: 11 additions & 10 deletions src/cpu/aarch64/acl_deconvolution.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022,2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -27,10 +27,18 @@ status_t acl_deconvolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     // concurrent multithreaded access.
     std::lock_guard<std::mutex> _lock {this->mtx};
 
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
     auto src_base = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
     auto wei_base = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS);
     auto bia_base = CTX_IN_MEM(const void *, DNNL_ARG_BIAS);
-    auto dst_base = CTX_OUT_MEM(void *, DNNL_ARG_DST);
+
+    bool use_dst_acc_for_sum = pd()->acl_pd_conf.use_dst_acc_for_sum;
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(void *, DNNL_ARG_DST);
 
     // Retrieve primitive resource and configured Compute Library objects
     auto *acl_resource
@@ -41,14 +49,7 @@ status_t acl_deconvolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     acl_obj.wei_tensor.allocator()->import_memory(const_cast<void *>(wei_base));
     acl_obj.bia_tensor.allocator()->import_memory(const_cast<void *>(bia_base));
 
-    bool use_dst_acc = pd()->acl_pd_conf.use_dst_acc;
-    if (use_dst_acc) {
-        // Put the result in a new tensor, it will be accumalated to the dst
-        // during the post ops
-        acl_obj.dst_tensor.allocator()->allocate();
-    } else {
-        acl_obj.dst_tensor.allocator()->import_memory(dst_base);
-    }
+    acl_obj.dst_tensor.allocator()->import_memory(dst_base);
 
     acl_obj.deconv.run();
 
13 changes: 10 additions & 3 deletions src/cpu/aarch64/acl_deconvolution.hpp
@@ -37,7 +37,7 @@ struct acl_deconv_conf_t {
     bool with_bias;
     // If this is true, the result of the convolution goes into a temporarily
     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-    bool use_dst_acc;
+    bool use_dst_acc_for_sum;
     bool fast_math;
     arm_compute::TensorInfo src_info;
     arm_compute::TensorInfo wei_info;
@@ -88,7 +88,8 @@ struct acl_deconvolution_fwd_t : public primitive_t {
             , acl_pd_conf()
             , post_ops() {}
 
-        DECLARE_COMMON_PD_T("acl", acl_deconvolution_fwd_t);
+        DECLARE_COMMON_PD_T(
+                "acl", acl_deconvolution_fwd_t, USE_GLOBAL_SCRATCHPAD);
 
         status_t init(engine_t *engine) {
             using namespace data_type;
@@ -291,7 +292,13 @@ struct acl_deconvolution_fwd_t : public primitive_t {
             }
 
             CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_));
-            acl_pd_conf.use_dst_acc = post_ops.has_sum();
+            acl_pd_conf.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acl_pd_conf.use_dst_acc_for_sum) {
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
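Because the accumulator is now booked in the primitive scratchpad (hence the USE_GLOBAL_SCRATCHPAD flag above), its size is reflected in the primitive's scratchpad query, and users running with scratchpad_mode::user supply it like any other scratchpad buffer. A minimal sketch with the public oneDNN C++ API, assuming eng, strm and the src/wei/dst memory descriptors and memory objects already exist and that the shapes dispatch to this ACL implementation:

    #include "dnnl.hpp"
    using namespace dnnl;

    post_ops po;
    po.append_sum(1.f); // sum post-op -> this backend books the accumulator

    primitive_attr attr;
    attr.set_post_ops(po);
    attr.set_scratchpad_mode(scratchpad_mode::user);

    auto pd = deconvolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::deconvolution_direct,
            src_md, wei_md, dst_md, {1, 1}, {0, 0}, {0, 0}, attr);

    // The size booked under key_generic_acc is folded into this query.
    memory scratchpad_mem(pd.scratchpad_desc(), eng);

    deconvolution_forward(pd).execute(strm,
            {{DNNL_ARG_SRC, src_mem}, {DNNL_ARG_WEIGHTS, wei_mem},
                    {DNNL_ARG_DST, dst_mem},
                    {DNNL_ARG_SCRATCHPAD, scratchpad_mem}});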
11 changes: 9 additions & 2 deletions src/cpu/aarch64/acl_depthwise_convolution.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Arm Ltd. and affiliates
+* Copyright 2023-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -94,7 +94,14 @@ struct acl_depthwise_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
11 changes: 9 additions & 2 deletions src/cpu/aarch64/acl_gemm_convolution.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -89,7 +89,14 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
9 changes: 8 additions & 1 deletion src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
@@ -98,7 +98,14 @@ struct acl_indirect_gemm_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
20 changes: 10 additions & 10 deletions src/cpu/aarch64/acl_inner_product.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2022 Arm Ltd. and affiliates
+* Copyright 2021-2022,2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     std::lock_guard<std::mutex> _lock {this->mtx};
 
     bool with_bias = pd()->aip.with_bias;
-    bool use_dst_acc = pd()->aip.use_dst_acc;
+    bool use_dst_acc_for_sum = pd()->aip.use_dst_acc_for_sum;
 
     // Retrieve primitive resource and configured Compute Library objects
     acl_ip_obj_t &acl_obj = ctx.get_resource_mapper()
@@ -41,14 +41,14 @@ status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     auto wei_base = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS);
     acl_obj.wei_tensor.allocator()->import_memory(const_cast<void *>(wei_base));
 
-    if (use_dst_acc) {
-        // Put the result in a new tensor, it will be accumalated to the dst
-        // during the post ops
-        acl_obj.dst_tensor.allocator()->allocate();
-    } else {
-        auto dst_base = CTX_OUT_MEM(void *, DNNL_ARG_DST);
-        acl_obj.dst_tensor.allocator()->import_memory(dst_base);
-    }
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(void *, DNNL_ARG_DST);
+    acl_obj.dst_tensor.allocator()->import_memory(dst_base);
 
     if (with_bias) {
         auto bia_base = CTX_IN_MEM(const void *, DNNL_ARG_BIAS);
13 changes: 10 additions & 3 deletions src/cpu/aarch64/acl_inner_product.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2023 Arm Ltd. and affiliates
+* Copyright 2021-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -39,7 +39,7 @@ struct acl_ip_conf_t {
     bool with_bias;
     // If this is true, the result of the inner product goes into a temporarily
     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-    bool use_dst_acc;
+    bool use_dst_acc_for_sum;
     arm_compute::TensorInfo src_tensor_info;
     arm_compute::TensorInfo wei_tensor_info;
     arm_compute::TensorInfo bia_tensor_info;
@@ -118,6 +118,13 @@ struct acl_inner_product_fwd_t : public primitive_t {
 
             CHECK(init_conf_ip(engine, weights_format_kind_received));
 
+            if (aip.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
+
             return status::success;
         }
 
@@ -185,7 +192,7 @@ struct acl_inner_product_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_,
                     aip.fc_info.activation_info));
-            aip.use_dst_acc = post_ops.has_sum();
+            aip.use_dst_acc_for_sum = post_ops.has_sum();
 
             // WeightFormat::ANY tells ACL we can handle any format
             aip.weights_info = arm_compute::WeightsInfo(false, 1, 1, ic_total,
11 changes: 9 additions & 2 deletions src/cpu/aarch64/acl_winograd_convolution.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -101,7 +101,14 @@ struct acl_wino_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }