Skip to content

Commit

Permalink
Add ETI macros to simplify instantiations for element types and devices in communication (#995)
Browse files Browse the repository at this point in the history
  • Loading branch information
msimberg authored Oct 6, 2023
1 parent 3bfd3ee commit 87ec68a
Show file tree
Hide file tree
Showing 9 changed files with 93 additions and 212 deletions.
49 changes: 49 additions & 0 deletions include/dlaf/common/eti.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
//
// Distributed Linear Algebra with Future (DLAF)
//
// Copyright (c) 2018-2023, ETH Zurich
// All rights reserved.
//
// Please, refer to the LICENSE file in the root directory.
// SPDX-License-Identifier: BSD-3-Clause
//

/// @file

#pragma once

#include <complex>

#include <dlaf/types.h>

/// Expands to one invocation of @p ETI_MACRO per element type, in the order float, double,
/// std::complex<float>, std::complex<double>, all for the single device @p DEVICE.
///
/// Each invocation has the form ETI_MACRO(KWORD, <element type>, DEVICE). The invocations are
/// separated by semicolons, but the expansion deliberately does NOT end with one: the user of the
/// macro writes the terminating semicolon, so no stray empty declaration is generated.
///
/// This is a building block for DLAF_EXPAND_ETI_SDCZ_DEVICE; it keeps the element type list in a
/// single place so that the CPU and GPU expansions cannot drift apart.
#define DLAF_EXPAND_ETI_SDCZ_FOR_DEVICE(ETI_MACRO, KWORD, DEVICE) \
  ETI_MACRO(KWORD, float, DEVICE);                                \
  ETI_MACRO(KWORD, double, DEVICE);                               \
  ETI_MACRO(KWORD, std::complex<float>, DEVICE);                  \
  ETI_MACRO(KWORD, std::complex<double>, DEVICE)

/// Variant of DLAF_EXPAND_ETI_SDCZ_FOR_DEVICE that forwards the trailing arguments unchanged, i.e.
/// each invocation has the form ETI_MACRO(KWORD, <element type>, DEVICE, __VA_ARGS__).
///
/// At least one trailing argument has to be given. Like the non-variadic variant, the expansion
/// does not end with a semicolon.
#define DLAF_EXPAND_ETI_SDCZ_FOR_DEVICE_VA_ARGS(ETI_MACRO, KWORD, DEVICE, ...) \
  ETI_MACRO(KWORD, float, DEVICE, __VA_ARGS__);                                \
  ETI_MACRO(KWORD, double, DEVICE, __VA_ARGS__);                               \
  ETI_MACRO(KWORD, std::complex<float>, DEVICE, __VA_ARGS__);                  \
  ETI_MACRO(KWORD, std::complex<double>, DEVICE, __VA_ARGS__)

/// @def DLAF_EXPAND_ETI_SDCZ_DEVICE(ETI_MACRO, KWORD)
/// Invokes ETI_MACRO(KWORD, Type, Device) for every element type (float, double,
/// std::complex<float>, std::complex<double>) on Device::CPU and, only when DLAF_WITH_GPU is
/// defined, also on Device::GPU (all CPU invocations come first).
///
/// @param ETI_MACRO name of a function-like macro taking (KWORD, Type, Device),
/// @param KWORD token(s) passed through verbatim as first argument of ETI_MACRO; it may be empty
///        (e.g. `extern` for a declaration, nothing for the matching definition).
///
/// The expansion does not end with a semicolon, therefore it has to be used as
/// `DLAF_EXPAND_ETI_SDCZ_DEVICE(MY_ETI, extern);`.

/// @def DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(ETI_MACRO, KWORD, ...)
/// Same as DLAF_EXPAND_ETI_SDCZ_DEVICE, but for macros that take additional arguments after the
/// device: it invokes ETI_MACRO(KWORD, Type, Device, __VA_ARGS__). At least one additional argument
/// has to be given.
#ifdef DLAF_WITH_GPU
#define DLAF_EXPAND_ETI_SDCZ_DEVICE(ETI_MACRO, KWORD)            \
  DLAF_EXPAND_ETI_SDCZ_FOR_DEVICE(ETI_MACRO, KWORD, Device::CPU); \
  DLAF_EXPAND_ETI_SDCZ_FOR_DEVICE(ETI_MACRO, KWORD, Device::GPU)
#define DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(ETI_MACRO, KWORD, ...)                    \
  DLAF_EXPAND_ETI_SDCZ_FOR_DEVICE_VA_ARGS(ETI_MACRO, KWORD, Device::CPU, __VA_ARGS__); \
  DLAF_EXPAND_ETI_SDCZ_FOR_DEVICE_VA_ARGS(ETI_MACRO, KWORD, Device::GPU, __VA_ARGS__)
#else
#define DLAF_EXPAND_ETI_SDCZ_DEVICE(ETI_MACRO, KWORD) \
  DLAF_EXPAND_ETI_SDCZ_FOR_DEVICE(ETI_MACRO, KWORD, Device::CPU)
#define DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(ETI_MACRO, KWORD, ...) \
  DLAF_EXPAND_ETI_SDCZ_FOR_DEVICE_VA_ARGS(ETI_MACRO, KWORD, Device::CPU, __VA_ARGS__)
#endif
14 changes: 3 additions & 11 deletions include/dlaf/communication/kernels/all_reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <pika/execution.hpp>

#include <dlaf/common/eti.h>
#include <dlaf/common/pipeline.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/matrix/matrix.h>
Expand All @@ -43,6 +44,7 @@ template <class T, Device D>
MPI_Op reduce_op, dlaf::matrix::ReadOnlyTileSender<Type, Device> tile_in, \
dlaf::matrix::ReadWriteTileSender<Type, Device> tile_out)

DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_ALL_REDUCE_ETI, extern);
DLAF_SCHEDULE_ALL_REDUCE_ETI(extern, int, Device::CPU);

/// Schedule an in-place all reduce.
Expand All @@ -60,16 +62,6 @@ template <class T, Device D>
pika::execution::experimental::unique_any_sender<common::Pipeline<Communicator>::Wrapper> pcomm, \
MPI_Op reduce_op, dlaf::matrix::ReadWriteTileSender<Type, Device> tile)

DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI, extern);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, int, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, float, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, double, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, std::complex<float>, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, std::complex<double>, Device::CPU);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, float, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, double, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, std::complex<float>, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(extern, std::complex<double>, Device::GPU);
#endif
}
35 changes: 9 additions & 26 deletions include/dlaf/communication/kernels/broadcast.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <dlaf/common/assert.h>
#include <dlaf/common/callable_object.h>
#include <dlaf/common/data.h>
#include <dlaf/common/eti.h>
#include <dlaf/common/pipeline.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/communication/message.h>
Expand Down Expand Up @@ -71,23 +72,14 @@ template <class T, Device D, class Comm>
pika::execution::experimental::unique_any_sender<Comm> pcomm, \
dlaf::matrix::ReadOnlyTileSender<Type, Device> tile)

DLAF_SCHEDULE_SEND_BCAST_ETI(extern, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_SEND_BCAST_ETI, extern, common::Pipeline<Communicator>::Wrapper);

DLAF_SCHEDULE_SEND_BCAST_ETI(extern, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, SizeType, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(extern, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format on

/// Schedule a broadcast receive.
///
Expand All @@ -104,21 +96,12 @@ template <class T, Device D, class Comm>
pika::execution::experimental::unique_any_sender<Comm> pcomm, comm::IndexT_MPI root_rank, \
dlaf::matrix::ReadWriteTileSender<Type, Device> tile)

DLAF_SCHEDULE_RECV_BCAST_ETI(extern, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_RECV_BCAST_ETI, extern, common::Pipeline<Communicator>::Wrapper);

DLAF_SCHEDULE_RECV_BCAST_ETI(extern, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, SizeType, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(extern, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format on
}
59 changes: 9 additions & 50 deletions include/dlaf/communication/kernels/p2p.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <pika/execution.hpp>

#include <dlaf/common/eti.h>
#include <dlaf/common/pipeline.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/matrix/tile.h>
Expand All @@ -33,31 +34,10 @@ template <class T, Device D, class Comm>
pika::execution::experimental::unique_any_sender<Comm> pcomm, IndexT_MPI dest, IndexT_MPI tag, \
dlaf::matrix::ReadOnlyTileSender<Type, Device> tile)

DLAF_SCHEDULE_SEND_ETI(extern, float, Device::CPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, double, Device::CPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<float>, Device::CPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<double>, Device::CPU, Communicator);

DLAF_SCHEDULE_SEND_ETI(extern, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_SEND_ETI(extern, float, Device::GPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, double, Device::GPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<float>, Device::GPU, Communicator);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<double>, Device::GPU, Communicator);

DLAF_SCHEDULE_SEND_ETI(extern, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_ETI(extern, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_SEND_ETI, extern, Communicator);
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_SEND_ETI, extern, common::Pipeline<Communicator>::Wrapper);
// clang-format on

template <class T, Device D, class Comm>
[[nodiscard]] dlaf::matrix::ReadWriteTileSender<T, D> scheduleRecv(
Expand All @@ -69,29 +49,8 @@ template <class T, Device D, class Comm>
pika::execution::experimental::unique_any_sender<Comm> pcomm, IndexT_MPI source, IndexT_MPI tag, \
dlaf::matrix::ReadWriteTileSender<Type, Device> tile)

DLAF_SCHEDULE_RECV_ETI(extern, float, Device::CPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, double, Device::CPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<float>, Device::CPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<double>, Device::CPU, Communicator);

DLAF_SCHEDULE_RECV_ETI(extern, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_RECV_ETI(extern, float, Device::GPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, double, Device::GPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<float>, Device::GPU, Communicator);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<double>, Device::GPU, Communicator);

DLAF_SCHEDULE_RECV_ETI(extern, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_ETI(extern, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_RECV_ETI, extern, Communicator);
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_RECV_ETI, extern, common::Pipeline<Communicator>::Wrapper);
// clang-format on
}
25 changes: 3 additions & 22 deletions include/dlaf/communication/kernels/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <pika/execution.hpp>

#include <dlaf/common/data.h>
#include <dlaf/common/eti.h>
#include <dlaf/common/pipeline.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/communication/message.h>
Expand All @@ -40,18 +41,8 @@ template <class T, Device D>
pika::execution::experimental::unique_any_sender<common::Pipeline<Communicator>::Wrapper> pcomm, \
MPI_Op reduce_op, dlaf::matrix::ReadWriteTileSender<Type, Device> tile)

DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI, extern);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, int, Device::CPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, float, Device::CPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, double, Device::CPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, std::complex<float>, Device::CPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, std::complex<double>, Device::CPU);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, float, Device::GPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, double, Device::GPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, std::complex<float>, Device::GPU);
DLAF_SCHEDULE_REDUCE_RECV_IN_PLACE_ETI(extern, std::complex<double>, Device::GPU);
#endif

/// Schedule a reduction send.
///
Expand All @@ -69,16 +60,6 @@ template <class T, Device D>
comm::IndexT_MPI rank_root, MPI_Op reduce_op, \
dlaf::matrix::ReadOnlyTileSender<Type, Device> tile)

DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_REDUCE_SEND_ETI, extern);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, int, Device::CPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, float, Device::CPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, double, Device::CPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, std::complex<float>, Device::CPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, std::complex<double>, Device::CPU);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, float, Device::GPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, double, Device::GPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, std::complex<float>, Device::GPU);
DLAF_SCHEDULE_REDUCE_SEND_ETI(extern, std::complex<double>, Device::GPU);
#endif
}
16 changes: 3 additions & 13 deletions src/communication/kernels/all_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <pika/execution.hpp>

#include <dlaf/common/callable_object.h>
#include <dlaf/common/eti.h>
#include <dlaf/common/pipeline.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/communication/kernels/all_reduce.h>
Expand Down Expand Up @@ -115,6 +116,7 @@ template <class T, Device D>
std::move(tile_out));
}

DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_ALL_REDUCE_ETI, );
DLAF_SCHEDULE_ALL_REDUCE_ETI(, int, Device::CPU);

template <class T, Device D>
Expand Down Expand Up @@ -150,18 +152,6 @@ template <class T, Device D>
RequireContiguous::Yes>(std::move(tile), std::move(all_reduce_in_place));
}

// TODO: This is only for a test (test_collective_async)
DLAF_EXPAND_ETI_SDCZ_DEVICE(DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI, );
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, int, Device::CPU);

DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, float, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, double, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, std::complex<float>, Device::CPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, std::complex<double>, Device::CPU);

#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, float, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, double, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, std::complex<float>, Device::GPU);
DLAF_SCHEDULE_ALL_REDUCE_IN_PLACE_ETI(, std::complex<double>, Device::GPU);
#endif
}
35 changes: 9 additions & 26 deletions src/communication/kernels/broadcast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <dlaf/common/assert.h>
#include <dlaf/common/callable_object.h>
#include <dlaf/common/data.h>
#include <dlaf/common/eti.h>
#include <dlaf/communication/communicator.h>
#include <dlaf/communication/kernels/broadcast.h>
#include <dlaf/communication/message.h>
Expand Down Expand Up @@ -60,23 +61,14 @@ template <class T, Device D, class Comm>
return internal::scheduleSendBcast(std::move(pcomm), std::move(tile));
}

DLAF_SCHEDULE_SEND_BCAST_ETI(, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_SEND_BCAST_ETI, , common::Pipeline<Communicator>::Wrapper);

DLAF_SCHEDULE_SEND_BCAST_ETI(, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_SEND_BCAST_ETI(, SizeType, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_SEND_BCAST_ETI(, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format on

template <class T, Device D, class Comm>
[[nodiscard]] dlaf::matrix::ReadWriteTileSender<T, D> scheduleRecvBcast(
Expand Down Expand Up @@ -112,21 +104,12 @@ template <class T, Device D, class Comm>
RequireContiguous::No>(std::move(tile), std::move(recv));
}

DLAF_SCHEDULE_RECV_BCAST_ETI(, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, float, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, double, Device::CPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, std::complex<float>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, std::complex<double>, Device::CPU,
common::Pipeline<Communicator>::Wrapper);
// clang-format off
DLAF_EXPAND_ETI_SDCZ_DEVICE_VA_ARGS(DLAF_SCHEDULE_RECV_BCAST_ETI, , common::Pipeline<Communicator>::Wrapper);

DLAF_SCHEDULE_RECV_BCAST_ETI(, SizeType, Device::CPU, common::Pipeline<Communicator>::Wrapper);
#ifdef DLAF_WITH_GPU
DLAF_SCHEDULE_RECV_BCAST_ETI(, SizeType, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, float, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, double, Device::GPU, common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, std::complex<float>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
DLAF_SCHEDULE_RECV_BCAST_ETI(, std::complex<double>, Device::GPU,
common::Pipeline<Communicator>::Wrapper);
#endif
// clang-format on
}
Loading

0 comments on commit 87ec68a

Please sign in to comment.