Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ETI macros to simplify instantiations for element types and devices for communication helpers #995

Merged
merged 1 commit into from
Oct 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions include/dlaf/common/eti.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//

/// @file

#pragma once

#include <complex>

#include <dlaf/types.h>

/// Helpers to stamp out explicit template instantiation (ETI) lines.
///
/// Each DLAF_EXPAND_ETI_SDCZ_DEVICE* macro invokes @p ETI_MACRO once per element type
/// (float, double, std::complex<float>, std::complex<double>) for Device::CPU and, only
/// when DLAF_WITH_GPU is defined, once more per element type for Device::GPU.
///
/// @param ETI_MACRO function-like macro, invoked as ETI_MACRO(KWORD, Type, Device[, extra...]),
/// @param KWORD     token(s) forwarded unchanged as first argument of every invocation; it may
///                  be left empty (the callers in this change pass either `extern` or nothing),
/// @param ...       (VA_ARGS variant only) extra arguments appended to every invocation.
///
/// Every generated invocation, including the last one, is terminated by a `;`.

// Implementation detail: element-type expansion for a single device. Keeping the list of types
// in one place means the CPU and GPU expansions cannot drift apart.
#define DLAF_EXPAND_ETI_SDCZ_ON_DEVICE_(ETI_MACRO, KWORD, DEVICE) \
  ETI_MACRO(KWORD, float, DEVICE); \
  ETI_MACRO(KWORD, double, DEVICE); \
  ETI_MACRO(KWORD, std::complex<float>, DEVICE); \
  ETI_MACRO(KWORD, std::complex<double>, DEVICE);

// Implementation detail: same as above, forwarding the extra arguments to every invocation.
#define DLAF_EXPAND_ETI_SDCZ_ON_DEVICE_VA_ARGS_(ETI_MACRO, KWORD, DEVICE, ...) \
  ETI_MACRO(KWORD, float, DEVICE, __VA_ARGS__); \
  ETI_MACRO(KWORD, double, DEVICE, __VA_ARGS__); \
  ETI_MACRO(KWORD, std::complex<float>, DEVICE, __VA_ARGS__); \
  ETI_MACRO(KWORD, std::complex<double>, DEVICE, __VA_ARGS__);

#ifdef DLAF_WITH_GPU
#define DLAF_EXPAND_ETI_SDCZ_DEVICE(ETI_MACRO, KWORD) \
  DLAF_EXPAND_ETI_SDCZ_ON_DEVICE_(ETI_MACRO, KWORD, Device::CPU) \
  DLAF_EXPAND_ETI_SDCZ_ON_DEVICE_(ETI_MACRO, KWORD, Device::GPU)
#define DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(ETI_MACRO, KWORD, ...) \
  DLAF_EXPAND_ETI_SDCZ_ON_DEVICE_VA_ARGS_(ETI_MACRO, KWORD, Device::CPU, __VA_ARGS__) \
  DLAF_EXPAND_ETI_SDCZ_ON_DEVICE_VA_ARGS_(ETI_MACRO, KWORD, Device::GPU, __VA_ARGS__)
#else
#define DLAF_EXPAND_ETI_SDCZ_DEVICE(ETI_MACRO, KWORD) \
  DLAF_EXPAND_ETI_SDCZ_ON_DEVICE_(ETI_MACRO, KWORD, Device::CPU)
#define DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(ETI_MACRO, KWORD, ...) \
  DLAF_EXPAND_ETI_SDCZ_ON_DEVICE_VA_ARGS_(ETI_MACRO, KWORD, Device::CPU, __VA_ARGS__)
#endif
14 changes: 3 additions & 11 deletions include/dlaf/communication/kernels/all_reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <pika/execution.hpp>

#include <dlaf/common/eti.h>
#include <dlaf/common/pipeline.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/matrix/matrix.h>
Expand All @@ -43,6 +44,7 @@ template <class T, Device D>
MPI_Op reduce_op, dlaf::matrix::ReadOnlyTileSender<Type, Device> tile_in, \
dlaf::matrix::ReadWriteTileSender<Type, Device> tile_out)

DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_ALL_REDUCE_ETI, extern);
DLAF_SCHEDULE_ALL_REDUCE_ETI(extern, int, Device::CPU);

/// Schedule an in-place all reduce.
Expand All @@ -60,16 +62,6 @@ template <class T, Device D>
pika::execution::experimental::unique_any_sender<common::Pipeline<Communicator>::Wrapper> pcomm, \
MPI_Op reduce_op, dlaf::matrix::ReadWriteTileSender<Type, Device> tile)

DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI, extern);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, int, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, float, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, double, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, std::complex<float>, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, std::complex<double>, Device::CPU);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, float, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, double, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, std::complex<float>, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, std::complex<double>, Device::GPU);
#endif
}
35 changes: 9 additions & 26 deletions include/dlaf/communication/kernels/broadcast.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <dlaf/common/assert.h>
#include <dlaf/common/callable_object.h>
#include <dlaf/common/data.h>
#include <dlaf/common/eti.h>
#include <dlaf/common/pipeline.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/communication/message.h>
Expand Down Expand Up @@ -71,23 +72,14 @@ template <class T, Device D, class Comm>
pika::execution::experimental::unique_any_sender<Comm> pcomm, \
dlaf::matrix::ReadOnlyTileSender<Type, Device> tile)

DLAF_SCHEDULE_SEND_BCAST_ETI(extern, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_SEND_BCAST_ETI, extern, common::Pipeline<Communicator>::Wrapper);

DLAF_SCHEDULE_SEND_BCAST_ETI(extern, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, SizeType, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format on

/// Schedule a broadcast receive.
///
Expand All @@ -104,21 +96,12 @@ template <class T, Device D, class Comm>
pika::execution::experimental::unique_any_sender<Comm> pcomm, comm::IndexT_MPI root_rank, \
dlaf::matrix::ReadWriteTileSender<Type, Device> tile)

DLAF_SCHEDULE_RECV_BCAST_ETI(extern, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_RECV_BCAST_ETI, extern, common::Pipeline<Communicator>::Wrapper);

DLAF_SCHEDULE_RECV_BCAST_ETI(extern, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, SizeType, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format on
}
59 changes: 9 additions & 50 deletions include/dlaf/communication/kernels/p2p.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <pika/execution.hpp>

#include <dlaf/common/eti.h>
#include <dlaf/common/pipeline.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/matrix/tile.h>
Expand All @@ -33,31 +34,10 @@ template <class T, Device D, class Comm>
pika::execution::experimental::unique_any_sender<Comm> pcomm, IndexT_MPI dest, IndexT_MPI tag, \
dlaf::matrix::ReadOnlyTileSender<Type, Device> tile)

DLAF_SCHEDULE_SEND_ETI(extern, float, Device::CPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, double, Device::CPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<float>, Device::CPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<double>, Device::CPU, Communicator);

DLAF_SCHEDULE_SEND_ETI(extern, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_SEND_ETI(extern, float, Device::GPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, double, Device::GPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<float>, Device::GPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<double>, Device::GPU, Communicator);

DLAF_SCHEDULE_SEND_ETI(extern, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_SEND_ETI, extern, Communicator);
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_SEND_ETI, extern, common::Pipeline<Communicator>::Wrapper);
// clang-format on

template <class T, Device D, class Comm>
[[nodiscard]] dlaf::matrix::ReadWriteTileSender<T, D> scheduleRecv(
Expand All @@ -69,29 +49,8 @@ template <class T, Device D, class Comm>
pika::execution::experimental::unique_any_sender<Comm> pcomm, IndexT_MPI source, IndexT_MPI tag, \
dlaf::matrix::ReadWriteTileSender<Type, Device> tile)

DLAF_SCHEDULE_RECV_ETI(extern, float, Device::CPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, double, Device::CPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<float>, Device::CPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<double>, Device::CPU, Communicator);

DLAF_SCHEDULE_RECV_ETI(extern, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_RECV_ETI(extern, float, Device::GPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, double, Device::GPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<float>, Device::GPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<double>, Device::GPU, Communicator);

DLAF_SCHEDULE_RECV_ETI(extern, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_RECV_ETI, extern, Communicator);
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_RECV_ETI, extern, common::Pipeline<Communicator>::Wrapper);
// clang-format on
}
25 changes: 3 additions & 22 deletions include/dlaf/communication/kernels/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <pika/execution.hpp>

#include <dlaf/common/data.h>
#include <dlaf/common/eti.h>
#include <dlaf/common/pipeline.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/communication/message.h>
Expand All @@ -40,18 +41,8 @@ template <class T, Device D>
pika::execution::experimental::unique_any_sender<common::Pipeline<Communicator>::Wrapper> pcomm, \
MPI_Op reduce_op, dlaf::matrix::ReadWriteTileSender<Type, Device> tile)

DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI, extern);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, int, Device::CPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, float, Device::CPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, double, Device::CPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, std::complex<float>, Device::CPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, std::complex<double>, Device::CPU);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, float, Device::GPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, double, Device::GPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, std::complex<float>, Device::GPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, std::complex<double>, Device::GPU);
#endif

/// Schedule a reduction send.
///
Expand All @@ -69,16 +60,6 @@ template <class T, Device D>
comm::IndexT_MPI rank_root, MPI_Op reduce_op, \
dlaf::matrix::ReadOnlyTileSender<Type, Device> tile)

DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_REDUCE_SEND_ETI, extern);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, int, Device::CPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, float, Device::CPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, double, Device::CPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, std::complex<float>, Device::CPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, std::complex<double>, Device::CPU);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, float, Device::GPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, double, Device::GPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, std::complex<float>, Device::GPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, std::complex<double>, Device::GPU);
#endif
}
16 changes: 3 additions & 13 deletions src/communication/kernels/all_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <pika/execution.hpp>

#include <dlaf/common/callable_object.h>
#include <dlaf/common/eti.h>
#include <dlaf/common/pipeline.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/communication/kernels/all_reduce.h>
Expand Down Expand Up @@ -115,6 +116,7 @@ template <class T, Device D>
std::move(tile_out));
}

DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_ALL_REDUCE_ETI, );
DLAF_SCHEDULE_ALL_REDUCE_ETI(, int, Device::CPU);

template <class T, Device D>
Expand Down Expand Up @@ -150,18 +152,6 @@ template <class T, Device D>
RequireContiguous::Yes>(std::move(tile), std::move(all_reduce_in_place));
}

// TODO: This is only for a test (test_collective_async)
DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI, );
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, int, Device::CPU);

DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, float, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, double, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, std::complex<float>, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, std::complex<double>, Device::CPU);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, float, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, double, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, std::complex<float>, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, std::complex<double>, Device::GPU);
#endif
}
35 changes: 9 additions & 26 deletions src/communication/kernels/broadcast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <dlaf/common/assert.h>
#include <dlaf/common/callable_object.h>
#include <dlaf/common/data.h>
#include <dlaf/common/eti.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/communication/kernels/broadcast.h>
#include <dlaf/communication/message.h>
Expand Down Expand Up @@ -60,23 +61,14 @@ template <class T, Device D, class Comm>
return internal::scheduleSendBcast(std::move(pcomm), std::move(tile));
}

DLAF_SCHEDULE_SEND_BCAST_ETI(, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_SEND_BCAST_ETI, , common::Pipeline<Communicator>::Wrapper);

DLAF_SCHEDULE_SEND_BCAST_ETI(, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_SEND_BCAST_ETI(, SizeType, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format on

template <class T, Device D, class Comm>
[[nodiscard]] dlaf::matrix::ReadWriteTileSender<T, D> scheduleRecvBcast(
Expand Down Expand Up @@ -112,21 +104,12 @@ template <class T, Device D, class Comm>
RequireContiguous::No>(std::move(tile), std::move(recv));
}

DLAF_SCHEDULE_RECV_BCAST_ETI(, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_RECV_BCAST_ETI, , common::Pipeline<Communicator>::Wrapper);

DLAF_SCHEDULE_RECV_BCAST_ETI(, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_RECV_BCAST_ETI(, SizeType, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format on
}
Loading
Loading