cpu: aarch64: Move allocation of accumulator from ACL to oneDNN scratchpad #1936

Merged: 2 commits, Jun 3, 2024

1 change: 1 addition & 0 deletions src/common/memory_tracking.hpp
@@ -227,6 +227,7 @@ enum {
     key_gemm_blocked_a,
     key_gemm_blocked_b,
     key_gemm_accumulator,
+    key_generic_acc,
     key_gnorm_cvt,
     key_gnorm_reduction,
     key_gnorm_tmp_mean,
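The two halves of the change meet at this new key: each affected primitive descriptor books dst-sized accumulator space under key_generic_acc, and the execute path fetches it from the scratchpad grantor instead of letting ACL allocate a temporary tensor. A condensed sketch of that pattern, distilled from the hunks below rather than copied from any single file (acc_base is a placeholder name):

    // In pd_t::init(), once an unfused sum post-op is detected:
    if (use_dst_acc_for_sum) {
        const memory_desc_wrapper dst_d(&dst_md_);
        auto scratchpad = scratchpad_registry().registrar();
        scratchpad.book(memory_tracking::names::key_generic_acc,
                dst_d.nelems(), dst_d.data_type_size());
    }

    // In execute(), the booked buffer replaces the ACL-allocated accumulator:
    const auto scratchpad = ctx.get_scratchpad_grantor();
    void *acc_base = scratchpad.get<void>(memory_tracking::names::key_generic_acc);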
22 changes: 11 additions & 11 deletions src/cpu/aarch64/acl_convolution_utils.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -41,7 +41,7 @@ struct acl_conv_conf_t {
     bool fast_math;
     // If this is true, the result of the convolution goes into a temporarily
     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-    bool use_dst_acc;
+    bool use_dst_acc_for_sum;
     // Tells that the selected algorithm is Winograd. This is needed because the
     // algorithm can be set to algorithm::convolution_auto and later on we need to
     // skip fixed-format protocol as ACL Winograd does not support it.
@@ -87,7 +87,7 @@ template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
 status_t execute_forward_conv_acl(
         const exec_ctx_t &ctx, conv_obj_t &acl_conv_obj, const conv_pd_t *pd) {
     bool with_bias = pd->acp_.with_bias;
-    bool use_dst_acc = pd->acp_.use_dst_acc;
+    bool use_dst_acc_for_sum = pd->acp_.use_dst_acc_for_sum;
 
     auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
     auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
@@ -99,14 +99,14 @@ status_t execute_forward_conv_acl(
     acl_conv_obj.wei_tensor.allocator()->import_memory(
             const_cast<wei_data_t *>(wei_base));
 
-    if (use_dst_acc) {
-        // Put the result in a new tensor, it will be accumalated to the dst
-        // during the post ops
-        acl_conv_obj.dst_tensor.allocator()->allocate();
-    } else {
-        auto dst_base = CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
-        acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);
-    }
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
+    acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);
 
     if (with_bias) {
         auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
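When dst_base points at the scratchpad accumulator, ACL writes the raw convolution result there and the dst memory keeps its original contents for the sum post-op to read. Purely as an illustration of the sum semantics applied later by the ACL post-ops (this loop is not the actual kernel; nelems, sum_scale, dst and acc are placeholders):

    // Schematic of an unfused sum post-op: combine the original dst contents
    // with the accumulator the convolution just produced.
    for (size_t i = 0; i < nelems; ++i)
        dst[i] = sum_scale * dst[i] + acc[i];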
21 changes: 11 additions & 10 deletions src/cpu/aarch64/acl_deconvolution.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022 Arm Ltd. and affiliates
+* Copyright 2022,2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -27,10 +27,18 @@ status_t acl_deconvolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     // concurrent multithreaded access.
     std::lock_guard<std::mutex> _lock {this->mtx};
 
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
     auto src_base = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
     auto wei_base = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS);
     auto bia_base = CTX_IN_MEM(const void *, DNNL_ARG_BIAS);
-    auto dst_base = CTX_OUT_MEM(void *, DNNL_ARG_DST);
+
+    bool use_dst_acc_for_sum = pd()->acl_pd_conf.use_dst_acc_for_sum;
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(void *, DNNL_ARG_DST);
 
     // Retrieve primitive resource and configured Compute Library objects
     auto *acl_resource
@@ -41,14 +49,7 @@ status_t acl_deconvolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     acl_obj.wei_tensor.allocator()->import_memory(const_cast<void *>(wei_base));
     acl_obj.bia_tensor.allocator()->import_memory(const_cast<void *>(bia_base));
 
-    bool use_dst_acc = pd()->acl_pd_conf.use_dst_acc;
-    if (use_dst_acc) {
-        // Put the result in a new tensor, it will be accumalated to the dst
-        // during the post ops
-        acl_obj.dst_tensor.allocator()->allocate();
-    } else {
-        acl_obj.dst_tensor.allocator()->import_memory(dst_base);
-    }
+    acl_obj.dst_tensor.allocator()->import_memory(dst_base);
 
     acl_obj.deconv.run();
 
13 changes: 10 additions & 3 deletions src/cpu/aarch64/acl_deconvolution.hpp
@@ -37,7 +37,7 @@ struct acl_deconv_conf_t {
     bool with_bias;
     // If this is true, the result of the convolution goes into a temporarily
     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-    bool use_dst_acc;
+    bool use_dst_acc_for_sum;
     bool fast_math;
     arm_compute::TensorInfo src_info;
     arm_compute::TensorInfo wei_info;
@@ -88,7 +88,8 @@ struct acl_deconvolution_fwd_t : public primitive_t {
             , acl_pd_conf()
             , post_ops() {}
 
-        DECLARE_COMMON_PD_T("acl", acl_deconvolution_fwd_t);
+        DECLARE_COMMON_PD_T(
+                "acl", acl_deconvolution_fwd_t, USE_GLOBAL_SCRATCHPAD);
 
         status_t init(engine_t *engine) {
             using namespace data_type;
@@ -291,7 +292,13 @@ struct acl_deconvolution_fwd_t : public primitive_t {
             }
 
             CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_));
-            acl_pd_conf.use_dst_acc = post_ops.has_sum();
+            acl_pd_conf.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acl_pd_conf.use_dst_acc_for_sum) {
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
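Because the accumulator is now booked in the primitive scratchpad (hence the USE_GLOBAL_SCRATCHPAD flag above), its size is reflected in the primitive's scratchpad query, and users running with scratchpad_mode::user supply it like any other scratchpad buffer. A minimal sketch with the public oneDNN C++ API, assuming eng, strm and the src/wei/dst memory descriptors and memory objects already exist and that the shapes dispatch to this ACL implementation:

    #include "dnnl.hpp"
    using namespace dnnl;

    post_ops po;
    po.append_sum(1.f); // sum post-op -> this backend books the accumulator

    primitive_attr attr;
    attr.set_post_ops(po);
    attr.set_scratchpad_mode(scratchpad_mode::user);

    auto pd = deconvolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::deconvolution_direct,
            src_md, wei_md, dst_md, {1, 1}, {0, 0}, {0, 0}, attr);

    // The size booked under key_generic_acc is folded into this query.
    memory scratchpad_mem(pd.scratchpad_desc(), eng);

    deconvolution_forward(pd).execute(strm,
            {{DNNL_ARG_SRC, src_mem}, {DNNL_ARG_WEIGHTS, wei_mem},
                    {DNNL_ARG_DST, dst_mem},
                    {DNNL_ARG_SCRATCHPAD, scratchpad_mem}});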
11 changes: 9 additions & 2 deletions src/cpu/aarch64/acl_depthwise_convolution.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Arm Ltd. and affiliates
+* Copyright 2023-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -94,7 +94,14 @@ struct acl_depthwise_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
11 changes: 9 additions & 2 deletions src/cpu/aarch64/acl_gemm_convolution.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -89,7 +89,14 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
9 changes: 8 additions & 1 deletion src/cpu/aarch64/acl_indirect_gemm_convolution.hpp
@@ -98,7 +98,14 @@ struct acl_indirect_gemm_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }
20 changes: 10 additions & 10 deletions src/cpu/aarch64/acl_inner_product.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2022 Arm Ltd. and affiliates
+* Copyright 2021-2022,2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     std::lock_guard<std::mutex> _lock {this->mtx};
 
     bool with_bias = pd()->aip.with_bias;
-    bool use_dst_acc = pd()->aip.use_dst_acc;
+    bool use_dst_acc_for_sum = pd()->aip.use_dst_acc_for_sum;
 
     // Retrieve primitive resource and configured Compute Library objects
     acl_ip_obj_t &acl_obj = ctx.get_resource_mapper()
@@ -41,14 +41,14 @@ status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     auto wei_base = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS);
     acl_obj.wei_tensor.allocator()->import_memory(const_cast<void *>(wei_base));
 
-    if (use_dst_acc) {
-        // Put the result in a new tensor, it will be accumalated to the dst
-        // during the post ops
-        acl_obj.dst_tensor.allocator()->allocate();
-    } else {
-        auto dst_base = CTX_OUT_MEM(void *, DNNL_ARG_DST);
-        acl_obj.dst_tensor.allocator()->import_memory(dst_base);
-    }
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(void *, DNNL_ARG_DST);
+    acl_obj.dst_tensor.allocator()->import_memory(dst_base);
 
     if (with_bias) {
         auto bia_base = CTX_IN_MEM(const void *, DNNL_ARG_BIAS);
13 changes: 10 additions & 3 deletions src/cpu/aarch64/acl_inner_product.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2023 Arm Ltd. and affiliates
+* Copyright 2021-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -39,7 +39,7 @@ struct acl_ip_conf_t {
     bool with_bias;
     // If this is true, the result of the inner product goes into a temporarily
     // allocated ACL tensor to be accumulated into the oneDNN dst during postops
-    bool use_dst_acc;
+    bool use_dst_acc_for_sum;
     arm_compute::TensorInfo src_tensor_info;
     arm_compute::TensorInfo wei_tensor_info;
     arm_compute::TensorInfo bia_tensor_info;
@@ -118,6 +118,13 @@ struct acl_inner_product_fwd_t : public primitive_t {
 
             CHECK(init_conf_ip(engine, weights_format_kind_received));
 
+            if (aip.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
+
             return status::success;
         }
 
@@ -185,7 +192,7 @@ struct acl_inner_product_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_,
                     aip.fc_info.activation_info));
-            aip.use_dst_acc = post_ops.has_sum();
+            aip.use_dst_acc_for_sum = post_ops.has_sum();
 
             // WeightFormat::ANY tells ACL we can handle any format
             aip.weights_info = arm_compute::WeightsInfo(false, 1, 1, ic_total,
11 changes: 9 additions & 2 deletions src/cpu/aarch64/acl_winograd_convolution.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -101,7 +101,14 @@ struct acl_wino_convolution_fwd_t : public primitive_t {
 
             CHECK(post_ops.init(
                     engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc = post_ops.has_sum();
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
 
             return status::success;
         }