From 6bfd1c958e27529d79ac2c49d76a273edb618d67 Mon Sep 17 00:00:00 2001 From: Doc CI Action Date: Mon, 11 Sep 2023 09:07:24 +0000 Subject: [PATCH] Doc Doc fixes for TridiagSolver (local) and removal of unused GPU kernels (#971) --- ...r_2tridiag__solver_2kernels_8h_source.html | 137 +- master/index__manipulation_8h_source.html | 262 +- master/merge_8h_source.html | 2502 +++++++++-------- 3 files changed, 1395 insertions(+), 1506 deletions(-) diff --git a/master/eigensolver_2tridiag__solver_2kernels_8h_source.html b/master/eigensolver_2tridiag__solver_2kernels_8h_source.html index 57e3c0778d..e11f3e225d 100644 --- a/master/eigensolver_2tridiag__solver_2kernels_8h_source.html +++ b/master/eigensolver_2tridiag__solver_2kernels_8h_source.html @@ -257,131 +257,20 @@
186  std::move(sender));
187 }
188 
-
189 template <class T>
-
190 T maxElementInColumnTile(const matrix::Tile<const T, Device::CPU>& tile);
-
191 
-
192 #define DLAF_CPU_MAX_ELEMENT_IN_COLUMN_TILE_ETI(kword, Type) \
-
193  kword template Type maxElementInColumnTile(const matrix::Tile<const Type, Device::CPU>& tile)
-
194 
-
195 DLAF_CPU_MAX_ELEMENT_IN_COLUMN_TILE_ETI(extern, float);
-
196 DLAF_CPU_MAX_ELEMENT_IN_COLUMN_TILE_ETI(extern, double);
+
189 #ifdef DLAF_WITH_GPU
+
190 
+
191 template <class T>
+
192 void givensRotationOnDevice(SizeType len, T* x, T* y, T c, T s, whip::stream_t stream);
+
193 
+
194 #define DLAF_GIVENS_ROT_ETI(kword, Type) \
+
195  kword template void givensRotationOnDevice(SizeType len, Type* x, Type* y, Type c, Type s, \
+
196  whip::stream_t stream)
197 
-
198 #ifdef DLAF_WITH_GPU
-
199 
-
200 template <class T>
-
201 void maxElementInColumnTile(const matrix::Tile<const T, Device::GPU>& tile, T* host_max_el_ptr,
-
202  T* device_max_el_ptr, whip::stream_t stream);
-
203 
-
204 #define DLAF_GPU_MAX_ELEMENT_IN_COLUMN_TILE_ETI(kword, Type) \
-
205  kword template void maxElementInColumnTile(const matrix::Tile<const Type, Device::GPU>& tile, \
-
206  Type* host_max_el_ptr, Type* device_max_el_ptr, \
-
207  whip::stream_t stream)
-
208 
-
209 DLAF_GPU_MAX_ELEMENT_IN_COLUMN_TILE_ETI(extern, float);
-
210 DLAF_GPU_MAX_ELEMENT_IN_COLUMN_TILE_ETI(extern, double);
-
211 
-
212 #endif
-
213 
-
214 DLAF_MAKE_CALLABLE_OBJECT(maxElementInColumnTile);
-
215 
-
216 template <class T, Device D, class TileSender>
-
217 auto maxElementInColumnTileAsync(TileSender&& tile) {
-
218  namespace di = dlaf::internal;
-
219  namespace ex = pika::execution::experimental;
-
220 
-
221  constexpr auto backend = dlaf::DefaultBackend_v<D>;
-
222 
-
223  if constexpr (D == Device::CPU) {
-
224  return std::forward<TileSender>(tile) |
-
225  di::transform(di::Policy<backend>(), maxElementInColumnTile_o);
-
226  }
-
227  else {
-
228 #ifdef DLAF_WITH_GPU
-
229  using ElementType = dlaf::internal::SenderElementType<TileSender>;
-
230  return ex::when_all(std::forward<TileSender>(tile),
-
231  ex::just(memory::MemoryChunk<ElementType, Device::CPU>{1},
-
232  memory::MemoryChunk<ElementType, Device::GPU>{1})) |
-
233  ex::let_value([](auto& tile, auto& host_max_el, auto& device_max_el) {
-
234  return ex::just(tile, host_max_el(), device_max_el()) |
-
235  di::transform(di::Policy<backend>(), maxElementInColumnTile_o) |
-
236  ex::then([&host_max_el]() { return *host_max_el(); });
-
237  });
-
238 #endif
-
239  }
-
240 }
-
241 
-
242 void setColTypeTile(const ColType& ct, const matrix::Tile<ColType, Device::CPU>& tile);
-
243 
-
244 #ifdef DLAF_WITH_GPU
-
245 void setColTypeTile(const ColType& ct, const matrix::Tile<ColType, Device::GPU>& tile,
-
246  whip::stream_t stream);
-
247 #endif
-
248 
-
249 DLAF_MAKE_CALLABLE_OBJECT(setColTypeTile);
-
250 
-
251 template <Device D, class TileSender>
-
252 void setColTypeTileAsync(ColType val, TileSender&& tile) {
-
253  namespace di = dlaf::internal;
-
254 
-
255  auto sender = di::whenAllLift(val, std::forward<TileSender>(tile));
-
256  di::transformDetach(di::Policy<DefaultBackend_v<D>>(), setColTypeTile_o, std::move(sender));
-
257 }
-
258 
-
259 void initIndexTile(SizeType offset, const matrix::Tile<SizeType, Device::CPU>& tile);
-
260 
-
261 #ifdef DLAF_WITH_GPU
-
262 void initIndexTile(SizeType offset, const matrix::Tile<SizeType, Device::GPU>& tile,
-
263  whip::stream_t stream);
-
264 #endif
-
265 
-
266 DLAF_MAKE_CALLABLE_OBJECT(initIndexTile);
-
267 
-
268 template <Device D, class TileSender>
-
269 void initIndexTileAsync(SizeType tile_row, TileSender&& tile) {
-
270  namespace di = dlaf::internal;
-
271 
-
272  auto sender = di::whenAllLift(tile_row, std::forward<TileSender>(tile));
-
273  di::transformDetach(di::Policy<DefaultBackend_v<D>>(), initIndexTile_o, std::move(sender));
-
274 }
-
275 
-
276 #ifdef DLAF_WITH_GPU
-
277 
-
278 template <class T>
-
279 void mergeIndicesOnDevice(const SizeType* begin_ptr, const SizeType* split_ptr, const SizeType* end_ptr,
-
280  SizeType* out_ptr, const T* v_ptr, whip::stream_t stream);
-
281 
-
282 #define DLAF_CUDA_MERGE_INDICES_ETI(kword, Type) \
-
283  kword template void mergeIndicesOnDevice(const SizeType* begin_ptr, const SizeType* split_ptr, \
-
284  const SizeType* end_ptr, SizeType* out_ptr, \
-
285  const Type* v_ptr, whip::stream_t stream)
-
286 
-
287 DLAF_CUDA_MERGE_INDICES_ETI(extern, float);
-
288 DLAF_CUDA_MERGE_INDICES_ETI(extern, double);
-
289 
-
290 template <class T>
-
291 void applyIndexOnDevice(SizeType len, const SizeType* index, const T* in, T* out, whip::stream_t stream);
-
292 
-
293 #define DLAF_CUDA_APPLY_INDEX_ETI(kword, Type) \
-
294  kword template void applyIndexOnDevice(SizeType len, const SizeType* index, const Type* in, \
-
295  Type* out, whip::stream_t stream)
-
296 
-
297 DLAF_CUDA_APPLY_INDEX_ETI(extern, float);
-
298 DLAF_CUDA_APPLY_INDEX_ETI(extern, double);
-
299 
-
300 void invertIndexOnDevice(SizeType len, const SizeType* in, SizeType* out, whip::stream_t stream);
-
301 
-
302 template <class T>
-
303 void givensRotationOnDevice(SizeType len, T* x, T* y, T c, T s, whip::stream_t stream);
-
304 
-
305 #define DLAF_GIVENS_ROT_ETI(kword, Type) \
-
306  kword template void givensRotationOnDevice(SizeType len, Type* x, Type* y, Type c, Type s, \
-
307  whip::stream_t stream)
-
308 
-
309 DLAF_GIVENS_ROT_ETI(extern, float);
-
310 DLAF_GIVENS_ROT_ETI(extern, double);
-
311 
-
312 #endif
-
313 }
+
198 DLAF_GIVENS_ROT_ETI(extern, float);
+
199 DLAF_GIVENS_ROT_ETI(extern, double);
+
200 
+
201 #endif
+
202 }
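The DLAF_GIVENS_ROT_ETI macro above follows the usual explicit-template-instantiation (ETI) pattern: the header declares `extern template` specializations so that including translation units do not re-instantiate the kernel, and a single source file performs the actual instantiation. A minimal standalone sketch of the same pattern, with hypothetical names (`axpyOnDevice` and `DLAF_AXPY_ETI` are illustrative, not part of DLA-Future):

    // kernels.h: declaration plus extern instantiation declarations
    template <class T>
    void axpyOnDevice(SizeType len, T alpha, const T* x, T* y, whip::stream_t stream);

    #define DLAF_AXPY_ETI(kword, Type) \
      kword template void axpyOnDevice(SizeType len, Type alpha, const Type* x, Type* y, \
                                       whip::stream_t stream)

    DLAF_AXPY_ETI(extern, float);   // "extern template": suppress instantiation here
    DLAF_AXPY_ETI(extern, double);

    // kernels.cu: the one place where the templates are actually instantiated,
    // by invoking the same macro with an empty first argument:
    //   DLAF_AXPY_ETI(, float);
    //   DLAF_AXPY_ETI(, double);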
diff --git a/master/index__manipulation_8h_source.html b/master/index__manipulation_8h_source.html index a0e279a170..746a0266c2 100644 --- a/master/index__manipulation_8h_source.html +++ b/master/index__manipulation_8h_source.html @@ -106,147 +106,133 @@
35 // The index starts at `0` for tiles in the range [i_begin, i_end)
36 template <Device D>
37 void initIndex(const SizeType i_begin, const SizeType i_end, Matrix<SizeType, D>& index) {
-
38  const SizeType nb = index.distribution().blockSize().rows();
-
39  for (SizeType i = i_begin; i < i_end; ++i) {
-
40  const GlobalTileIndex tile_idx(i, 0);
-
41  const SizeType tile_row = (i - i_begin) * nb;
-
42  initIndexTileAsync<D>(tile_row, index.readwrite(tile_idx));
-
43  }
-
44 }
-
45 
-
46 // Add val to the indices of `index`.
-
47 //
-
48 inline void addIndex(SizeType i_begin, SizeType i_end, SizeType val,
-
49  Matrix<SizeType, Device::CPU>& index) {
-
50  namespace ex = pika::execution::experimental;
-
51  namespace di = dlaf::internal;
-
52 
-
53  SizeType n = problemSize(i_begin, i_end, index.distribution());
-
54  auto add_fn = [val, n](const auto& index) {
-
55  TileElementIndex zero_idx(0, 0);
-
56  SizeType* index_ptr = index[0].ptr(zero_idx);
-
57 
-
58  auto begin_it = index_ptr;
-
59  auto end_it = index_ptr + n;
-
60  std::for_each(begin_it, end_it, [val](SizeType& i) { i += val; });
-
61  };
+
38  namespace di = dlaf::internal;
+
39 
+
40  const SizeType nb = index.distribution().blockSize().rows();
+
41  for (SizeType i = i_begin; i < i_end; ++i) {
+
42  const GlobalTileIndex tile_idx(i, 0);
+
43  const SizeType tile_row = (i - i_begin) * nb;
+
44  auto sender = di::whenAllLift(tile_row, index.readwrite(tile_idx));
+
45  di::transformDetach(
+
46  di::Policy<Backend::MC>(),
+
47  [](SizeType offset, const matrix::Tile<SizeType, Device::CPU>& tile) {
+
48  for (SizeType i = 0; i < tile.size().rows(); ++i) {
+
49  tile(TileElementIndex(i, 0)) = offset + i;
+
50  }
+
51  },
+
52  std::move(sender));
+
53  }
+
54 }
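For illustration, the loop above fills the index column tile by tile with consecutive values starting at 0 for tile i_begin. A standalone sketch of the same fill on a plain contiguous vector (assuming, hypothetically, tile size nb = 4 and two tiles):

    #include <cstdint>
    #include <vector>

    // Sketch of the per-tile body of initIndex: each tile of the index
    // column gets consecutive values starting at its row offset.
    int main() {
      const std::int64_t nb = 4;                 // block (tile) size
      const std::int64_t i_begin = 2, i_end = 4; // tile range [i_begin, i_end)
      std::vector<std::int64_t> index((i_end - i_begin) * nb);
      for (std::int64_t i = i_begin; i < i_end; ++i) {
        const std::int64_t tile_row = (i - i_begin) * nb;  // offset of this tile
        for (std::int64_t r = 0; r < nb; ++r)
          index[tile_row + r] = tile_row + r;  // index starts at 0 for the range
      }
      // index == {0, 1, 2, 3, 4, 5, 6, 7}
    }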
+
55 
+
56 // Add val to the indices of `index`.
+
57 //
+
58 inline void addIndex(SizeType i_begin, SizeType i_end, SizeType val,
+
59  Matrix<SizeType, Device::CPU>& index) {
+
60  namespace ex = pika::execution::experimental;
+
61  namespace di = dlaf::internal;
62 
-
63  TileCollector tc{i_begin, i_end};
-
64 
-
65  auto sender = ex::when_all_vector(tc.readwrite<SizeType, Device::CPU>(index));
-
66 
-
67  ex::start_detached(di::transform(di::Policy<DefaultBackend_v<Device::CPU>>(), std::move(add_fn),
-
68  std::move(sender)));
-
69 }
-
70 
-
71 // Sorts the index `in_index` into `out_index` in ascending order of the values in `vec`,
-
72 // where `vec` is composed of two pre-sorted ascending ranges that are merged:
-
73 // the first is [0, k) and the second is [k, n).
-
74 //
-
75 template <class T, Device D, class KSender>
-
76 void sortIndex(const SizeType i_begin, const SizeType i_end, KSender&& k, Matrix<const T, D>& vec,
-
77  Matrix<const SizeType, D>& in_index, Matrix<SizeType, D>& out_index) {
-
78  namespace ex = pika::execution::experimental;
-
79  namespace di = dlaf::internal;
+
63  SizeType n = problemSize(i_begin, i_end, index.distribution());
+
64  auto add_fn = [val, n](const auto& index) {
+
65  TileElementIndex zero_idx(0, 0);
+
66  SizeType* index_ptr = index[0].ptr(zero_idx);
+
67 
+
68  auto begin_it = index_ptr;
+
69  auto end_it = index_ptr + n;
+
70  std::for_each(begin_it, end_it, [val](SizeType& i) { i += val; });
+
71  };
+
72 
+
73  TileCollector tc{i_begin, i_end};
+
74 
+
75  auto sender = ex::when_all_vector(tc.readwrite<SizeType, Device::CPU>(index));
+
76 
+
77  ex::start_detached(di::transform(di::Policy<DefaultBackend_v<Device::CPU>>(), std::move(add_fn),
+
78  std::move(sender)));
+
79 }
80 
-
81  const SizeType n = problemSize(i_begin, i_end, vec.distribution());
-
82  auto sort_fn = [n](const auto& k, const auto& vec_futs, const auto& in_index_futs,
-
83  const auto& out_index, [[maybe_unused]] auto&&... ts) {
-
84  DLAF_ASSERT(k <= n, k, n);
-
85 
-
86  const TileElementIndex zero_idx(0, 0);
-
87  const T* v_ptr = vec_futs[0].get().ptr(zero_idx);
-
88  const SizeType* in_index_ptr = in_index_futs[0].get().ptr(zero_idx);
-
89  SizeType* out_index_ptr = out_index[0].ptr(zero_idx);
-
90 
-
91  auto begin_it = in_index_ptr;
-
92  auto split_it = in_index_ptr + k;
-
93  auto end_it = in_index_ptr + n;
-
94  if constexpr (D == Device::CPU) {
-
95  auto cmp = [v_ptr](const SizeType i1, const SizeType i2) { return v_ptr[i1] < v_ptr[i2]; };
-
96  pika::merge(pika::execution::par, begin_it, split_it, split_it, end_it, out_index_ptr,
-
97  std::move(cmp));
-
98  }
-
99  else {
-
100 #ifdef DLAF_WITH_GPU
-
101  mergeIndicesOnDevice(begin_it, split_it, end_it, out_index_ptr, v_ptr, ts...);
-
102 #endif
-
103  }
-
104  };
-
105 
-
106  TileCollector tc{i_begin, i_end};
-
107 
-
108  auto sender = ex::when_all(std::forward<KSender>(k), ex::when_all_vector(tc.read<T, D>(vec)),
-
109  ex::when_all_vector(tc.read<SizeType, D>(in_index)),
-
110  ex::when_all_vector(tc.readwrite<SizeType, D>(out_index)));
+
81 // Sorts the index `in_index` into `out_index` in ascending order of the values in `vec`,
+
82 // where `vec` is composed of two pre-sorted ascending ranges that are merged:
+
83 // the first is [0, k) and the second is [k, n).
+
84 //
+
85 template <class T, class KSender>
+
86 void sortIndex(const SizeType i_begin, const SizeType i_end, KSender&& k,
+
87  Matrix<const T, Device::CPU>& vec, Matrix<const SizeType, Device::CPU>& in_index,
+
88  Matrix<SizeType, Device::CPU>& out_index) {
+
89  namespace ex = pika::execution::experimental;
+
90  namespace di = dlaf::internal;
+
91 
+
92  const SizeType n = problemSize(i_begin, i_end, vec.distribution());
+
93  auto sort_fn = [n](const auto& k, const auto& vec_futs, const auto& in_index_futs,
+
94  const auto& out_index, [[maybe_unused]] auto&&... ts) {
+
95  DLAF_ASSERT(k <= n, k, n);
+
96 
+
97  const TileElementIndex zero_idx(0, 0);
+
98  const T* v_ptr = vec_futs[0].get().ptr(zero_idx);
+
99  const SizeType* in_index_ptr = in_index_futs[0].get().ptr(zero_idx);
+
100  SizeType* out_index_ptr = out_index[0].ptr(zero_idx);
+
101 
+
102  auto begin_it = in_index_ptr;
+
103  auto split_it = in_index_ptr + k;
+
104  auto end_it = in_index_ptr + n;
+
105  auto cmp = [v_ptr](const SizeType i1, const SizeType i2) { return v_ptr[i1] < v_ptr[i2]; };
+
106  pika::merge(pika::execution::par, begin_it, split_it, split_it, end_it, out_index_ptr,
+
107  std::move(cmp));
+
108  };
+
109 
+
110  TileCollector tc{i_begin, i_end};
111 
-
112  ex::start_detached(di::transform(di::Policy<DefaultBackend_v<D>>(), std::move(sort_fn),
-
113  std::move(sender)));
-
114 }
-
115 
-
116 // Applies `index` to `in` to get `out`
-
117 template <class T, Device D>
-
118 void applyIndex(const SizeType i_begin, const SizeType i_end, Matrix<const SizeType, D>& index,
-
119  Matrix<const T, D>& in, Matrix<T, D>& out) {
-
120  namespace ex = pika::execution::experimental;
-
121  namespace di = dlaf::internal;
-
122 
-
123  const SizeType n = problemSize(i_begin, i_end, index.distribution());
-
124  auto applyIndex_fn = [n](const auto& index_futs, const auto& in_futs, const auto& out,
-
125  [[maybe_unused]] auto&&... ts) {
-
126  const TileElementIndex zero_idx(0, 0);
-
127  const SizeType* i_ptr = index_futs[0].get().ptr(zero_idx);
-
128  const T* in_ptr = in_futs[0].get().ptr(zero_idx);
-
129  T* out_ptr = out[0].ptr(zero_idx);
-
130 
-
131  if constexpr (D == Device::CPU) {
-
132  for (SizeType i = 0; i < n; ++i) {
-
133  out_ptr[i] = in_ptr[i_ptr[i]];
-
134  }
-
135  }
-
136  else {
-
137 #ifdef DLAF_WITH_GPU
-
138  applyIndexOnDevice(n, i_ptr, in_ptr, out_ptr, ts...);
-
139 #endif
-
140  }
-
141  };
-
142 
-
143  TileCollector tc{i_begin, i_end};
-
144 
-
145  auto sender = ex::when_all(ex::when_all_vector(tc.read(index)), ex::when_all_vector(tc.read(in)),
-
146  ex::when_all_vector(tc.readwrite(out)));
-
147  ex::start_detached(di::transform(di::Policy<DefaultBackend_v<D>>(), std::move(applyIndex_fn),
-
148  std::move(sender)));
-
149 }
-
150 
-
151 template <Device D>
-
152 void invertIndex(const SizeType i_begin, const SizeType i_end, Matrix<const SizeType, D>& in,
-
153  Matrix<SizeType, D>& out) {
-
154  namespace ex = pika::execution::experimental;
-
155  namespace di = dlaf::internal;
-
156 
-
157  const SizeType n = problemSize(i_begin, i_end, in.distribution());
-
158  auto inv_fn = [n](const auto& in_tiles_futs, const auto& out_tiles, [[maybe_unused]] auto&&... ts) {
-
159  const TileElementIndex zero(0, 0);
-
160  const SizeType* in_ptr = in_tiles_futs[0].get().ptr(zero);
-
161  SizeType* out_ptr = out_tiles[0].ptr(zero);
-
162 
-
163  if constexpr (D == Device::CPU) {
-
164  for (SizeType i = 0; i < n; ++i) {
-
165  out_ptr[in_ptr[i]] = i;
-
166  }
-
167  }
-
168  else {
-
169  invertIndexOnDevice(n, in_ptr, out_ptr, ts...);
-
170  }
-
171  };
-
172 
-
173  TileCollector tc{i_begin, i_end};
-
174  auto sender = ex::when_all(ex::when_all_vector(tc.read(in)), ex::when_all_vector(tc.readwrite(out)));
-
175  ex::start_detached(di::transform(di::Policy<DefaultBackend_v<D>>(), std::move(inv_fn),
-
176  std::move(sender)));
-
177 }
-
178 }
+
112  ex::start_detached(ex::when_all(std::forward<KSender>(k), ex::when_all_vector(tc.read(vec)),
+
113  ex::when_all_vector(tc.read(in_index)),
+
114  ex::when_all_vector(tc.readwrite(out_index))) |
+
115  di::transform(di::Policy<Backend::MC>(), std::move(sort_fn)));
+
116 }
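The kernel above is a single merge of the two pre-sorted index ranges [0, k) and [k, n), comparing the values the indices point to. A minimal standalone sketch with std::merge (pika::merge adds parallel execution on top of the same semantics):

    #include <algorithm>
    #include <vector>

    // Merge two pre-sorted index ranges [0, k) and [k, n) into one sorted
    // index, ordering indices by the values they refer to in v.
    int main() {
      std::vector<double> v = {1.0, 4.0, 9.0, 2.0, 3.0};  // [0,3) and [3,5) sorted
      std::vector<long> in = {0, 1, 2, 3, 4};             // pre-sorted per range
      std::vector<long> out(v.size());
      const std::size_t k = 3;
      auto cmp = [&v](long i1, long i2) { return v[i1] < v[i2]; };
      std::merge(in.begin(), in.begin() + k, in.begin() + k, in.end(), out.begin(), cmp);
      // out == {0, 3, 4, 1, 2} -> referenced values 1, 2, 3, 4, 9
    }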
+
117 
+
118 // Applies `index` to `in` to get `out`
+
119 template <class T>
+
120 void applyIndex(const SizeType i_begin, const SizeType i_end, Matrix<const SizeType, Device::CPU>& index,
+
121  Matrix<const T, Device::CPU>& in, Matrix<T, Device::CPU>& out) {
+
122  namespace ex = pika::execution::experimental;
+
123  namespace di = dlaf::internal;
+
124 
+
125  const SizeType n = problemSize(i_begin, i_end, index.distribution());
+
126  auto applyIndex_fn = [n](const auto& index_futs, const auto& in_futs, const auto& out) {
+
127  const TileElementIndex zero_idx(0, 0);
+
128  const SizeType* i_ptr = index_futs[0].get().ptr(zero_idx);
+
129  const T* in_ptr = in_futs[0].get().ptr(zero_idx);
+
130  T* out_ptr = out[0].ptr(zero_idx);
+
131 
+
132  for (SizeType i = 0; i < n; ++i)
+
133  out_ptr[i] = in_ptr[i_ptr[i]];
+
134  };
+
135 
+
136  TileCollector tc{i_begin, i_end};
+
137 
+
138  auto sender = ex::when_all(ex::when_all_vector(tc.read(index)), ex::when_all_vector(tc.read(in)),
+
139  ex::when_all_vector(tc.readwrite(out)));
+
140  ex::start_detached(di::transform(di::Policy<Backend::MC>(), std::move(applyIndex_fn),
+
141  std::move(sender)));
+
142 }
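The loop inside applyIndex_fn is a plain gather through the permutation. A minimal standalone sketch of the same operation:

    #include <vector>

    // Sketch of applyIndex: out is the gather of in through index,
    // i.e. out[i] = in[index[i]].
    int main() {
      std::vector<long> index = {2, 0, 1};
      std::vector<double> in = {10.0, 20.0, 30.0};
      std::vector<double> out(in.size());
      for (std::size_t i = 0; i < in.size(); ++i)
        out[i] = in[static_cast<std::size_t>(index[i])];
      // out == {30.0, 10.0, 20.0}
    }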
+
143 
+
144 inline void invertIndex(const SizeType i_begin, const SizeType i_end,
+
145  Matrix<const SizeType, Device::CPU>& in, Matrix<SizeType, Device::CPU>& out) {
+
146  namespace ex = pika::execution::experimental;
+
147  namespace di = dlaf::internal;
+
148 
+
149  const SizeType n = problemSize(i_begin, i_end, in.distribution());
+
150  auto inv_fn = [n](const auto& in_tiles_futs, const auto& out_tiles, [[maybe_unused]] auto&&... ts) {
+
151  const TileElementIndex zero(0, 0);
+
152  const SizeType* in_ptr = in_tiles_futs[0].get().ptr(zero);
+
153  SizeType* out_ptr = out_tiles[0].ptr(zero);
+
154 
+
155  for (SizeType i = 0; i < n; ++i) {
+
156  out_ptr[in_ptr[i]] = i;
+
157  }
+
158  };
+
159 
+
160  TileCollector tc{i_begin, i_end};
+
161  auto sender = ex::when_all(ex::when_all_vector(tc.read(in)), ex::when_all_vector(tc.readwrite(out)));
+
162  ex::start_detached(di::transform(di::Policy<Backend::MC>(), std::move(inv_fn), std::move(sender)));
+
163 }
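The scatter `out_ptr[in_ptr[i]] = i` above computes the inverse of a permutation. A minimal standalone sketch:

    #include <vector>

    // Sketch of invertIndex: for a permutation `in` mapping i -> in[i],
    // the inverse satisfies out[in[i]] = i.
    int main() {
      std::vector<long> in = {2, 0, 1};
      std::vector<long> out(in.size());
      for (long i = 0; i < static_cast<long>(in.size()); ++i)
        out[static_cast<std::size_t>(in[i])] = i;
      // out == {1, 2, 0}: applying `in` and then `out` gives the identity
    }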
+
164 }
diff --git a/master/merge_8h_source.html b/master/merge_8h_source.html index f5a59e699e..ee56c6628c 100644 --- a/master/merge_8h_source.html +++ b/master/merge_8h_source.html @@ -274,1302 +274,1316 @@
203 
204 // Returns the maximum element of a portion of a column vector from tile indices `i_begin` to `i_end`
205 //
-
206 template <class T, Device D>
-
207 auto maxVectorElement(const SizeType i_begin, const SizeType i_end, Matrix<const T, D>& vec) {
+
206 template <class T>
+
207 auto maxVectorElement(const SizeType i_begin, const SizeType i_end, Matrix<const T, Device::CPU>& vec) {
208  namespace ex = pika::execution::experimental;
209  namespace di = dlaf::internal;
210 
211  std::vector<ex::unique_any_sender<T>> tiles_max;
212  tiles_max.reserve(to_sizet(i_end - i_begin));
213  for (SizeType i = i_begin; i < i_end; ++i) {
-
214  tiles_max.push_back(maxElementInColumnTileAsync<T, D>(vec.read(LocalTileIndex(i, 0))));
-
215  }
-
216 
-
217  auto tol_calc_fn = [](const std::vector<T>& maxvals) {
-
218  return *std::max_element(maxvals.begin(), maxvals.end());
-
219  };
-
220 
-
221  return ex::when_all_vector(std::move(tiles_max)) |
-
222  di::transform(di::Policy<Backend::MC>(), std::move(tol_calc_fn));
-
223 }
-
224 
-
225 // The tolerance calculation is the same as the one used in LAPACK's stedc implementation [1].
-
226 //
-
227 // [1] LAPACK 3.10.0, file dlaed2.f, line 315, variable TOL
-
228 template <class T, Device D>
-
229 auto calcTolerance(const SizeType i_begin, const SizeType i_end, Matrix<const T, D>& d,
-
230  Matrix<const T, D>& z) {
-
231  namespace ex = pika::execution::experimental;
-
232  namespace di = dlaf::internal;
-
233 
-
234  auto dmax = maxVectorElement(i_begin, i_end, d);
-
235  auto zmax = maxVectorElement(i_begin, i_end, z);
-
236 
-
237  auto tol_fn = [](T dmax, T zmax) {
-
238  return 8 * std::numeric_limits<T>::epsilon() * std::max(dmax, zmax);
-
239  };
-
240 
-
241  return ex::when_all(std::move(dmax), std::move(zmax)) |
-
242  di::transform(di::Policy<Backend::MC>(), std::move(tol_fn)) |
-
243  // TODO: This releases the tiles that are kept in the operation state.
-
244  // This is a temporary fix and needs to be replaced by a different
-
245  // adaptor or different lifetime guarantees. This is tracked in
-
246  // https://github.com/pika-org/pika/issues/479.
-
247  ex::ensure_started();
-
248 }
-
249 
-
250 // This function returns the number of non-deflated eigenvectors, together with a permutation @p out_ptr
-
251 // that represents the mapping (sorted non-deflated | sorted deflated) -> initial.
-
252 //
-
253 // The permutation allows keeping the mapping between sorted eigenvalues and unsorted eigenvectors,
-
254 // which is useful since eigenvectors are more expensive to permute, so we can keep them in their initial order.
-
255 //
-
256 // @param n number of eigenvalues
-
257 // @param c_ptr array[n] containing the column type of each eigenvector after deflation (initial order)
-
258 // @param evals_ptr array[n] of eigenvalues sorted as in_ptr
-
259 // @param in_ptr array[n] representing permutation current -> initial (i.e. evals[i] -> c_ptr[in_ptr[i]])
-
260 // @param out_ptr array[n] permutation (sorted non-deflated | sorted deflated) -> initial
-
261 //
-
262 // @return k number of non-deflated eigenvectors
-
263 template <class T>
-
264 SizeType stablePartitionIndexForDeflationArrays(const SizeType n, const ColType* c_ptr,
-
265  const T* evals_ptr, const SizeType* in_ptr,
-
266  SizeType* out_ptr) {
-
267  // Get the number of non-deflated entries
-
268  SizeType k = 0;
-
269  for (SizeType i = 0; i < n; ++i) {
-
270  if (c_ptr[i] != ColType::Deflated)
-
271  ++k;
-
272  }
-
273 
-
274  // Create the permutation (sorted non-deflated | sorted deflated) -> initial
-
275  // Note:
-
276  // Since during deflation the eigenvalues related to deflated eigenvectors might not be sorted anymore,
-
277  // this step also takes care of sorting the eigenvalues (actually just their related indices) by ascending value.
-
278  SizeType i1 = 0; // index of non-deflated values in out
-
279  SizeType i2 = k; // index of deflated values
-
280  for (SizeType i = 0; i < n; ++i) {
-
281  const SizeType ii = in_ptr[i];
-
282 
-
283  // non-deflated are untouched, just squeeze them at the beginning as they appear
-
284  if (c_ptr[ii] != ColType::Deflated) {
-
285  out_ptr[i1] = ii;
-
286  ++i1;
-
287  }
-
288  // deflated are the ones that can have been moved "out-of-order" by deflation...
-
289  // ... so each time insert it in the right place based on eigenvalue value
-
290  else {
-
291  const T a = evals_ptr[ii];
-
292 
-
293  SizeType j = i2;
-
294  // shift to right all greater values (shift just indices)
-
295  for (; j > k; --j) {
-
296  const T b = evals_ptr[out_ptr[j - 1]];
-
297  if (a > b) {
-
298  break;
-
299  }
-
300  out_ptr[j] = out_ptr[j - 1];
-
301  }
-
302  // and insert the current index in the empty place, such that eigenvalues are sorted.
-
303  out_ptr[j] = ii;
-
304  ++i2;
-
305  }
-
306  }
-
307  return k;
-
308 }
-
309 
-
310 // This function returns the number of non-deflated eigenvectors, together with two permutations
-
311 // - @p index_sorted (sorted non-deflated | sorted deflated) -> initial.
-
312 // - @p index_sorted_coltype (sort(upper)|sort(dense)|sort(lower)|sort(deflated)) -> initial
-
313 //
-
314 // The permutations allow keeping the mapping between sorted eigenvalues and unsorted eigenvectors,
-
315 // which is useful since eigenvectors are more expensive to permute, so we can keep them in their
-
316 // initial order.
-
317 //
-
318 // @param n number of eigenvalues
-
319 // @param types array[n] column type of each eigenvector after deflation (initial order)
-
320 // @param evals array[n] of eigenvalues sorted as perm_sorted
-
321 // @param perm_sorted array[n] current -> initial (i.e. evals[i] -> types[perm_sorted[i]])
-
322 // @param index_sorted array[n] (sorted non-deflated | sorted deflated) -> initial
-
323 // @param index_sorted_coltype array[n] (sort(upper)|sort(dense)|sort(lower)|sort(deflated)) -> initial
-
324 //
-
325 // @return k number of non-deflated eigenvectors
-
326 template <class T>
-
327 SizeType stablePartitionIndexForDeflationArrays(const SizeType n, const ColType* types, const T* evals,
-
328  const SizeType* perm_sorted, SizeType* index_sorted,
-
329  SizeType* index_sorted_coltype) {
-
330  // Note:
-
331  // (in) types
-
332  // column type of the initial indexing
-
333  // (in) perm_sorted
-
334  // initial <-- sorted by ascending eigenvalue
-
335  // (out) index_sorted
-
336  // initial <-- sorted by ascending eigenvalue in two groups (non-deflated | deflated)
-
337  // (out) index_sorted_coltype
-
338  // initial <-- sorted by ascending eigenvalue in four groups (upper | dense | lower | deflated)
-
339 
-
340  // Note:
-
341  // This is the order in which we want the eigenvectors to be sorted, since it leads to a nicer matrix
-
342  // shape that allows reducing the number of subsequent operations (i.e. gemm)
-
343  auto coltype_index = [](const ColType coltype) -> std::size_t {
-
344  switch (coltype) {
-
345  case ColType::UpperHalf:
-
346  return 0;
-
347  case ColType::Dense:
-
348  return 1;
-
349  case ColType::LowerHalf:
-
350  return 2;
-
351  case ColType::Deflated:
-
352  return 3;
-
353  }
-
354  return DLAF_UNREACHABLE(std::size_t);
-
355  };
-
356 
-
357  std::array<std::size_t, 4> offsets{0, 0, 0, 0};
-
358  std::for_each(types, types + n, [&offsets, &coltype_index](const auto& coltype) {
-
359  if (coltype != ColType::Deflated)
-
360  offsets[1 + coltype_index(coltype)]++;
-
361  });
-
362  std::partial_sum(offsets.cbegin(), offsets.cend(), offsets.begin());
-
363 
-
364  const SizeType k = to_SizeType(offsets[coltype_index(ColType::Deflated)]);
-
365 
-
366  // Create the permutation (sorted non-deflated | sorted deflated) -> initial
-
367  // Note:
-
368  // Since during deflation the eigenvalues related to deflated eigenvectors might not be sorted anymore,
-
369  // this step also takes care of sorting the eigenvalues (actually just their related indices) by ascending value.
-
370  SizeType i1 = 0; // index of non-deflated values in out
-
371  SizeType i2 = k; // index of deflated values
-
372  for (SizeType i = 0; i < n; ++i) {
-
373  const SizeType ii = perm_sorted[i];
-
374 
-
375  // non-deflated are untouched, just squeeze them at the beginning as they appear
-
376  if (types[ii] != ColType::Deflated) {
-
377  index_sorted[i1] = ii;
-
378  ++i1;
-
379  }
-
380  // deflated are the ones that can have been moved "out-of-order" by deflation...
-
381  // ... so each time insert it in the right place based on eigenvalue value
-
382  else {
-
383  const T a = evals[ii];
-
384 
-
385  SizeType j = i2;
-
386  // shift to right all greater values (shift just indices)
-
387  for (; j > k; --j) {
-
388  const T b = evals[index_sorted[j - 1]];
-
389  if (a > b) {
-
390  break;
-
391  }
-
392  index_sorted[j] = index_sorted[j - 1];
-
393  }
-
394  // and insert the current index in the empty place, such that eigenvalues are sorted.
-
395  index_sorted[j] = ii;
-
396  ++i2;
-
397  }
-
398  }
-
399 
-
400  // Create the permutation (sort(upper)|sort(dense)|sort(lower)|sort(deflated)) -> initial
-
401  for (SizeType j = 0; j < n; ++j) {
-
402  const ColType& coltype = types[to_sizet(j)];
-
403  if (coltype != ColType::Deflated) {
-
404  auto& index_for_coltype = offsets[coltype_index(coltype)];
-
405  index_sorted_coltype[index_for_coltype] = j;
-
406  ++index_for_coltype;
-
407  }
-
408  }
-
409  std::copy(index_sorted + k, index_sorted + n, index_sorted_coltype + k);
-
410 
-
411  return k;
-
412 }
-
413 
-
414 template <class T>
-
415 auto stablePartitionIndexForDeflation(const SizeType i_begin, const SizeType i_end,
-
416  Matrix<const ColType, Device::CPU>& c,
-
417  Matrix<const T, Device::CPU>& evals,
-
418  Matrix<const SizeType, Device::CPU>& in,
-
419  Matrix<SizeType, Device::CPU>& out) {
-
420  namespace ex = pika::execution::experimental;
-
421  namespace di = dlaf::internal;
-
422 
-
423  const SizeType n = problemSize(i_begin, i_end, in.distribution());
-
424  auto part_fn = [n](const auto& c_tiles_futs, const auto& evals_tiles_fut, const auto& in_tiles_futs,
-
425  const auto& out_tiles) {
-
426  const TileElementIndex zero_idx(0, 0);
-
427  const ColType* c_ptr = c_tiles_futs[0].get().ptr(zero_idx);
-
428  const T* evals_ptr = evals_tiles_fut[0].get().ptr(zero_idx);
-
429  const SizeType* in_ptr = in_tiles_futs[0].get().ptr(zero_idx);
-
430  SizeType* out_ptr = out_tiles[0].ptr(zero_idx);
-
431 
-
432  return stablePartitionIndexForDeflationArrays(n, c_ptr, evals_ptr, in_ptr, out_ptr);
-
433  };
-
434 
-
435  TileCollector tc{i_begin, i_end};
-
436  return ex::when_all(ex::when_all_vector(tc.read(c)), ex::when_all_vector(tc.read(evals)),
-
437  ex::when_all_vector(tc.read(in)), ex::when_all_vector(tc.readwrite(out))) |
-
438  di::transform(di::Policy<Backend::MC>(), std::move(part_fn));
-
439 }
+
214  tiles_max.push_back(di::whenAllLift(lapack::Norm::Max, vec.read(LocalTileIndex(i, 0))) |
+
215  di::transform(di::Policy<Backend::MC>(), tile::internal::lange_o));
+
216  }
+
217 
+
218  auto tol_calc_fn = [](const std::vector<T>& maxvals) {
+
219  return *std::max_element(maxvals.begin(), maxvals.end());
+
220  };
+
221 
+
222  return ex::when_all_vector(std::move(tiles_max)) |
+
223  di::transform(di::Policy<Backend::MC>(), std::move(tol_calc_fn));
+
224 }
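The per-tile sender above now computes lange(Norm::Max), i.e. the largest absolute element of the tile, and tol_calc_fn reduces over the per-tile results. A minimal standalone sketch of the same two-level reduction in plain C++:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Per-tile max-abs (what lange(Max) returns for a column tile), then a
    // reduction over tiles (what tol_calc_fn does with the vector of maxima).
    int main() {
      std::vector<std::vector<double>> tiles = {{1.0, -5.0}, {3.0, 2.0}};
      double global_max = 0.0;
      for (const auto& tile : tiles) {
        double tile_max = 0.0;
        for (double x : tile)
          tile_max = std::max(tile_max, std::abs(x));  // per-tile max norm
        global_max = std::max(global_max, tile_max);   // reduction over tiles
      }
      // global_max == 5.0
    }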
+
225 
+
226 // The tolerance calculation is the same as the one used in LAPACK's stedc implementation [1].
+
227 //
+
228 // [1] LAPACK 3.10.0, file dlaed2.f, line 315, variable TOL
+
229 template <class T>
+
230 auto calcTolerance(const SizeType i_begin, const SizeType i_end, Matrix<const T, Device::CPU>& d,
+
231  Matrix<const T, Device::CPU>& z) {
+
232  namespace ex = pika::execution::experimental;
+
233  namespace di = dlaf::internal;
+
234 
+
235  auto dmax = maxVectorElement(i_begin, i_end, d);
+
236  auto zmax = maxVectorElement(i_begin, i_end, z);
+
237 
+
238  auto tol_fn = [](T dmax, T zmax) {
+
239  return 8 * std::numeric_limits<T>::epsilon() * std::max(dmax, zmax);
+
240  };
+
241 
+
242  return ex::when_all(std::move(dmax), std::move(zmax)) |
+
243  di::transform(di::Policy<Backend::MC>(), std::move(tol_fn)) |
+
244  // TODO: This releases the tiles that are kept in the operation state.
+
245  // This is a temporary fix and needs to be replaced by a different
+
246  // adaptor or different lifetime guarantees. This is tracked in
+
247  // https://github.com/pika-org/pika/issues/479.
+
248  ex::ensure_started();
+
249 }
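A short worked example of the tolerance formula above, under assumed values dmax = 2.0 and zmax = 0.5 in double precision:

    #include <algorithm>
    #include <limits>

    // tol = 8 * eps * max(dmax, zmax), as in LAPACK's dlaed2 TOL.
    int main() {
      const double dmax = 2.0, zmax = 0.5;
      const double tol = 8 * std::numeric_limits<double>::epsilon() * std::max(dmax, zmax);
      (void)tol;  // ~= 8 * 2.22e-16 * 2.0 = 3.55e-15
    }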
+
250 
+
251 // This function returns the number of non-deflated eigenvectors, together with a permutation @p out_ptr
+
252 // that represents the mapping (sorted non-deflated | sorted deflated) -> initial.
+
253 //
+
254 // The permutation allows keeping the mapping between sorted eigenvalues and unsorted eigenvectors,
+
255 // which is useful since eigenvectors are more expensive to permute, so we can keep them in their initial order.
+
256 //
+
257 // @param n number of eigenvalues
+
258 // @param c_ptr array[n] containing the column type of each eigenvector after deflation (initial order)
+
259 // @param evals_ptr array[n] of eigenvalues sorted as in_ptr
+
260 // @param in_ptr array[n] representing permutation current -> initial (i.e. evals[i] -> c_ptr[in_ptr[i]])
+
261 // @param out_ptr array[n] permutation (sorted non-deflated | sorted deflated) -> initial
+
262 //
+
263 // @return k number of non-deflated eigenvectors
+
264 template <class T>
+
265 SizeType stablePartitionIndexForDeflationArrays(const SizeType n, const ColType* c_ptr,
+
266  const T* evals_ptr, const SizeType* in_ptr,
+
267  SizeType* out_ptr) {
+
268  // Get the number of non-deflated entries
+
269  SizeType k = 0;
+
270  for (SizeType i = 0; i < n; ++i) {
+
271  if (c_ptr[i] != ColType::Deflated)
+
272  ++k;
+
273  }
+
274 
+
275  // Create the permutation (sorted non-deflated | sorted deflated) -> initial
+
276  // Note:
+
277  // Since during deflation the eigenvalues related to deflated eigenvectors might not be sorted anymore,
+
278  // this step also takes care of sorting the eigenvalues (actually just their related indices) by ascending value.
+
279  SizeType i1 = 0; // index of non-deflated values in out
+
280  SizeType i2 = k; // index of deflated values
+
281  for (SizeType i = 0; i < n; ++i) {
+
282  const SizeType ii = in_ptr[i];
+
283 
+
284  // non-deflated are untouched, just squeeze them at the beginning as they appear
+
285  if (c_ptr[ii] != ColType::Deflated) {
+
286  out_ptr[i1] = ii;
+
287  ++i1;
+
288  }
+
289  // deflated are the ones that can have been moved "out-of-order" by deflation...
+
290  // ... so each time insert it in the right place based on eigenvalue value
+
291  else {
+
292  const T a = evals_ptr[ii];
+
293 
+
294  SizeType j = i2;
+
295  // shift to right all greater values (shift just indices)
+
296  for (; j > k; --j) {
+
297  const T b = evals_ptr[out_ptr[j - 1]];
+
298  if (a > b) {
+
299  break;
+
300  }
+
301  out_ptr[j] = out_ptr[j - 1];
+
302  }
+
303  // and insert the current index in the empty place, such that eigenvalues are sorted.
+
304  out_ptr[j] = ii;
+
305  ++i2;
+
306  }
+
307  }
+
308  return k;
+
309 }
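A standalone re-implementation of this single-permutation variant, for illustration only (simplified types, assumed toy input): non-deflated indices are kept in the order the sorted scan visits them, while deflated ones are insertion-sorted into the tail by eigenvalue.

    #include <cstddef>
    #include <vector>

    enum class Col { NonDeflated, Deflated };

    int main() {
      std::vector<double> evals = {1.0, 3.0, 2.0, 4.0};
      std::vector<Col> c = {Col::NonDeflated, Col::Deflated,
                            Col::NonDeflated, Col::Deflated};
      std::vector<std::size_t> in = {0, 2, 1, 3};  // ascending: 1.0, 2.0, 3.0, 4.0
      std::vector<std::size_t> out(4);

      std::size_t k = 0;  // number of non-deflated entries
      for (Col ci : c)
        if (ci != Col::Deflated)
          ++k;

      std::size_t i1 = 0, i2 = k;
      for (std::size_t ii : in) {
        if (c[ii] != Col::Deflated) {
          out[i1++] = ii;  // squeeze non-deflated to the front
        }
        else {
          std::size_t j = i2++;  // insertion sort into the deflated tail
          for (; j > k && evals[out[j - 1]] >= evals[ii]; --j)
            out[j] = out[j - 1];
          out[j] = ii;
        }
      }
      // k == 2, out == {0, 2, 1, 3}
    }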
+
310 
+
311 // This function returns the number of non-deflated eigenvectors, together with two permutations
+
312 // - @p index_sorted (sort(non-deflated)|sort(deflated)) -> initial.
+
313 // - @p index_sorted_coltype (upper|dense|lower|sort(deflated)) -> initial
+
314 //
+
315 // The permutations allow keeping the mapping between sorted eigenvalues and unsorted eigenvectors,
+
316 // which is useful since eigenvectors are more expensive to permute, so we can keep them in their
+
317 // initial order.
+
318 //
+
319 // @param n number of eigenvalues
+
320 // @param types array[n] column type of each eigenvector after deflation (initial order)
+
321 // @param evals array[n] of eigenvalues sorted as perm_sorted
+
322 // @param perm_sorted array[n] current -> initial (i.e. evals[i] -> types[perm_sorted[i]])
+
323 // @param index_sorted array[n] (sort(non-deflated)|sort(deflated)) -> initial
+
324 // @param index_sorted_coltype array[n] (upper|dense|lower|sort(deflated)) -> initial
+
325 //
+
326 // @return k number of non-deflated eigenvectors
+
327 template <class T>
+
328 SizeType stablePartitionIndexForDeflationArrays(const SizeType n, const ColType* types, const T* evals,
+
329  const SizeType* perm_sorted, SizeType* index_sorted,
+
330  SizeType* index_sorted_coltype) {
+
331  // Note:
+
332  // (in) types
+
333  // column type of the initial indexing
+
334  // (in) perm_sorted
+
335  // initial <-- sorted by ascending eigenvalue
+
336  // (out) index_sorted
+
337  // initial <-- (sort(non-deflated) | sort(deflated))
+
338  // (out) index_sorted_coltype
+
339  // initial <-- (upper | dense | lower | sort(deflated))
+
340 
+
341  // Note:
+
342  // This is the order in which we want the eigenvectors to be sorted, since it leads to a nicer matrix
+
343  // shape that allows reducing the number of subsequent operations (i.e. gemm)
+
344  auto coltype_index = [](const ColType coltype) -> std::size_t {
+
345  switch (coltype) {
+
346  case ColType::UpperHalf:
+
347  return 0;
+
348  case ColType::Dense:
+
349  return 1;
+
350  case ColType::LowerHalf:
+
351  return 2;
+
352  case ColType::Deflated:
+
353  return 3;
+
354  }
+
355  return DLAF_UNREACHABLE(std::size_t);
+
356  };
+
357 
+
358  std::array<std::size_t, 4> offsets{0, 0, 0, 0};
+
359  std::for_each(types, types + n, [&offsets, &coltype_index](const auto& coltype) {
+
360  if (coltype != ColType::Deflated)
+
361  offsets[1 + coltype_index(coltype)]++;
+
362  });
+
363  std::partial_sum(offsets.cbegin(), offsets.cend(), offsets.begin());
+
364 
+
365  const SizeType k = to_SizeType(offsets[coltype_index(ColType::Deflated)]);
+
366 
+
367  // Create the permutation (sorted non-deflated | sorted deflated) -> initial
+
368  // Note:
+
369  // Since during deflation the eigenvalues related to deflated eigenvectors might not be sorted anymore,
+
370  // this step also takes care of sorting the eigenvalues (actually just their related indices) by ascending value.
+
371  SizeType i1 = 0; // index of non-deflated values in out
+
372  SizeType i2 = k; // index of deflated values
+
373  for (SizeType i = 0; i < n; ++i) {
+
374  const SizeType ii = perm_sorted[i];
+
375 
+
376  // non-deflated are untouched, just squeeze them at the beginning as they appear
+
377  if (types[ii] != ColType::Deflated) {
+
378  index_sorted[i1] = ii;
+
379  ++i1;
+
380  }
+
381  // deflated are the ones that can have been moved "out-of-order" by deflation...
+
382  // ... so each time insert it in the right place based on eigenvalue value
+
383  else {
+
384  const T a = evals[ii];
+
385 
+
386  SizeType j = i2;
+
387  // shift to right all greater values (shift just indices)
+
388  for (; j > k; --j) {
+
389  const T b = evals[index_sorted[j - 1]];
+
390  if (a > b) {
+
391  break;
+
392  }
+
393  index_sorted[j] = index_sorted[j - 1];
+
394  }
+
395  // and insert the current index in the empty place, such that eigenvalues are sorted.
+
396  index_sorted[j] = ii;
+
397  ++i2;
+
398  }
+
399  }
+
400 
+
401  // Create the permutation (upper|dense|lower|sort(deflated)) -> initial
+
402  // Note:
+
403  // the non-deflated part is built following the initial order, because we are not interested
+
404  // in having it sorted.
+
405  // On the other hand, the deflated part has to be sorted, so we copy it from index_sorted,
+
406  // where it has already been sorted (post-deflation).
+
407  for (SizeType j = 0; j < n; ++j) {
+
408  const ColType& coltype = types[to_sizet(j)];
+
409  if (coltype != ColType::Deflated) {
+
410  auto& index_for_coltype = offsets[coltype_index(coltype)];
+
411  index_sorted_coltype[index_for_coltype] = j;
+
412  ++index_for_coltype;
+
413  }
+
414  }
+
415  std::copy(index_sorted + k, index_sorted + n, index_sorted_coltype + k);
+
416 
+
417  return k;
+
418 }
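The non-deflated part of index_sorted_coltype is built with a counting sort over the column types: count entries per type, turn the counts into starting offsets with partial_sum, then scatter stably. A minimal standalone sketch with assumed toy data:

    #include <array>
    #include <cstddef>
    #include <numeric>
    #include <vector>

    int main() {
      const std::size_t kUpper = 0, kDense = 1, kLower = 2, kDeflated = 3;
      std::vector<std::size_t> types = {kLower, kUpper, kDeflated, kDense, kUpper};

      std::array<std::size_t, 4> offsets{0, 0, 0, 0};
      for (std::size_t t : types)
        if (t != kDeflated)
          ++offsets[1 + t];  // offsets[0] stays 0; counts land one slot right
      std::partial_sum(offsets.cbegin(), offsets.cend(), offsets.begin());
      // offsets == {0, 2, 3, 4}: upper starts at 0, dense at 2, lower at 3;
      // offsets[kDeflated] == 4 is also k, the number of non-deflated entries.

      std::vector<std::size_t> perm(types.size());
      for (std::size_t j = 0; j < types.size(); ++j)
        if (types[j] != kDeflated)
          perm[offsets[types[j]]++] = j;  // stable scatter into its group
      // perm == {1, 4, 3, 0, _}; the deflated tail is copied in separately
    }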
+
419 
+
420 template <class T>
+
421 auto stablePartitionIndexForDeflation(const SizeType i_begin, const SizeType i_end,
+
422  Matrix<const ColType, Device::CPU>& c,
+
423  Matrix<const T, Device::CPU>& evals,
+
424  Matrix<const SizeType, Device::CPU>& in,
+
425  Matrix<SizeType, Device::CPU>& out) {
+
426  namespace ex = pika::execution::experimental;
+
427  namespace di = dlaf::internal;
+
428 
+
429  const SizeType n = problemSize(i_begin, i_end, in.distribution());
+
430  auto part_fn = [n](const auto& c_tiles_futs, const auto& evals_tiles_fut, const auto& in_tiles_futs,
+
431  const auto& out_tiles) {
+
432  const TileElementIndex zero_idx(0, 0);
+
433  const ColType* c_ptr = c_tiles_futs[0].get().ptr(zero_idx);
+
434  const T* evals_ptr = evals_tiles_fut[0].get().ptr(zero_idx);
+
435  const SizeType* in_ptr = in_tiles_futs[0].get().ptr(zero_idx);
+
436  SizeType* out_ptr = out_tiles[0].ptr(zero_idx);
+
437 
+
438  return stablePartitionIndexForDeflationArrays(n, c_ptr, evals_ptr, in_ptr, out_ptr);
+
439  };
440 
-
441 template <class T>
-
442 auto stablePartitionIndexForDeflation(
-
443  const SizeType i_begin, const SizeType i_end, Matrix<const ColType, Device::CPU>& c,
-
444  Matrix<const T, Device::CPU>& evals, Matrix<const SizeType, Device::CPU>& in,
-
445  Matrix<SizeType, Device::CPU>& out, Matrix<SizeType, Device::CPU>& out_by_coltype) {
-
446  namespace ex = pika::execution::experimental;
-
447  namespace di = dlaf::internal;
-
448 
-
449  const SizeType n = problemSize(i_begin, i_end, in.distribution());
-
450  auto part_fn = [n](const auto& c_tiles_futs, const auto& evals_tiles_futs, const auto& in_tiles_futs,
-
451  const auto& out_tiles, const auto& out_coltype_tiles) {
-
452  const TileElementIndex zero_idx(0, 0);
-
453  const ColType* c_ptr = c_tiles_futs[0].get().ptr(zero_idx);
-
454  const T* evals_ptr = evals_tiles_futs[0].get().ptr(zero_idx);
-
455  const SizeType* in_ptr = in_tiles_futs[0].get().ptr(zero_idx);
-
456  SizeType* out_ptr = out_tiles[0].ptr(zero_idx);
-
457  SizeType* out_coltype_ptr = out_coltype_tiles[0].ptr(zero_idx);
-
458 
-
459  return stablePartitionIndexForDeflationArrays(n, c_ptr, evals_ptr, in_ptr, out_ptr, out_coltype_ptr);
-
460  };
-
461 
-
462  TileCollector tc{i_begin, i_end};
-
463  return ex::when_all(ex::when_all_vector(tc.read(c)), ex::when_all_vector(tc.read(evals)),
-
464  ex::when_all_vector(tc.read(in)), ex::when_all_vector(tc.readwrite(out)),
-
465  ex::when_all_vector(tc.readwrite(out_by_coltype))) |
-
466  di::transform(di::Policy<Backend::MC>(), std::move(part_fn));
-
467 }
-
468 
-
469 template <Device D>
-
470 void initColTypes(const SizeType i_begin, const SizeType i_split, const SizeType i_end,
-
471  Matrix<ColType, D>& coltypes) {
-
472  for (SizeType i = i_begin; i < i_end; ++i) {
-
473  ColType val = (i < i_split) ? ColType::UpperHalf : ColType::LowerHalf;
-
474  setColTypeTileAsync<D>(val, coltypes.readwrite(LocalTileIndex(i, 0)));
-
475  }
-
476 }
-
477 
-
478 // Assumption 1: The algorithm assumes that the arrays `d_ptr`, `z_ptr` and `c_ptr` are of equal length
-
479 // `len` and are sorted in ascending order of the `d_ptr` elements via the index `i_ptr`.
-
480 //
-
481 // Note: parallelizing this algorithm is non-trivial because the deflation regions due to Givens
-
482 // rotations can cross over tiles and are of unknown length. However, such an algorithm is unlikely to
-
483 // benefit much from parallelization anyway, as it is quite light on flops and appears memory bound.
-
484 //
-
485 // Returns an array of Givens rotations used to update the columns of the eigenvector matrix Q
-
486 template <class T>
-
487 std::vector<GivensRotation<T>> applyDeflationToArrays(T rho, T tol, const SizeType len,
-
488  const SizeType* i_ptr, T* d_ptr, T* z_ptr,
-
489  ColType* c_ptr) {
-
490  std::vector<GivensRotation<T>> rots;
-
491  rots.reserve(to_sizet(len));
-
492 
-
493  SizeType i1 = 0; // index of 1st element in the Givens rotation
-
494  // Iterate over the indices of the sorted elements in pairs (i1, i2) where i1 < i2 for every iteration
-
495  for (SizeType i2 = 1; i2 < len; ++i2) {
-
496  const SizeType i1s = i_ptr[i1];
-
497  const SizeType i2s = i_ptr[i2];
-
498  T& d1 = d_ptr[i1s];
-
499  T& d2 = d_ptr[i2s];
-
500  T& z1 = z_ptr[i1s];
-
501  T& z2 = z_ptr[i2s];
-
502  ColType& c1 = c_ptr[i1s];
-
503  ColType& c2 = c_ptr[i2s];
-
504 
-
505  // if z1 is nearly zero, deflate the element and move i1 forward to i2
-
506  if (std::abs(rho * z1) <= tol) {
-
507  c1 = ColType::Deflated;
-
508  i1 = i2;
-
509  continue;
-
510  }
-
511 
-
512  // Deflate the second element if z2 is nearly zero
-
513  if (std::abs(rho * z2) <= tol) {
-
514  c2 = ColType::Deflated;
-
515  continue;
-
516  }
-
517 
-
518  // The Givens deflation condition is the same as the one used in LAPACK's stedc implementation [1].
-
519  // However, here the second entry is deflated instead of the first (z2/d2 instead of z1/d1), thus
-
520  // `s` is not negated.
-
521  //
-
522  // [1] LAPACK 3.10.0, file dlaed2.f, line 393
-
523  T r = std::hypot(z1, z2);
-
524  T c = z1 / r;
-
525  T s = z2 / r;
-
526 
-
527  // If d1 is not nearly equal to d2, move i1 forward to i2
-
528  if (std::abs(c * s * (d2 - d1)) > tol) {
-
529  i1 = i2;
-
530  continue;
-
531  }
-
532 
-
533  // When d1 is nearly equal to d2 apply Givens rotation
-
534  z1 = r;
-
535  z2 = 0;
-
536  T tmp = d1 * s * s + d2 * c * c;
-
537  d1 = d1 * c * c + d2 * s * s;
-
538  d2 = tmp;
-
539 
-
540  rots.push_back(GivensRotation<T>{i1s, i2s, c, s});
-
541  // Set the `i1` column as "Dense" if the `i2` column has opposite non-zero structure (i.e. if
-
542  // one comes from Q1 and the other from Q2 or vice-versa)
-
543  if ((c1 == ColType::UpperHalf && c2 == ColType::LowerHalf) ||
-
544  (c1 == ColType::LowerHalf && c2 == ColType::UpperHalf)) {
-
545  c1 = ColType::Dense;
-
546  }
-
547  c2 = ColType::Deflated;
-
548  }
-
549 
-
550  return rots;
-
551 }
-
552 
-
553 template <class T, class RhoSender, class TolSender>
-
554 auto applyDeflation(const SizeType i_begin, const SizeType i_end, RhoSender&& rho, TolSender&& tol,
-
555  Matrix<const SizeType, Device::CPU>& index, Matrix<T, Device::CPU>& d,
-
556  Matrix<T, Device::CPU>& z, Matrix<ColType, Device::CPU>& c) {
-
557  namespace ex = pika::execution::experimental;
-
558  namespace di = dlaf::internal;
-
559 
-
560  const SizeType n = problemSize(i_begin, i_end, index.distribution());
-
561 
-
562  auto deflate_fn = [n](auto rho, auto tol, auto index_tiles_futs, auto d_tiles, auto z_tiles,
-
563  auto c_tiles) {
-
564  const TileElementIndex zero_idx(0, 0);
-
565  const SizeType* i_ptr = index_tiles_futs[0].get().ptr(zero_idx);
-
566  T* d_ptr = d_tiles[0].ptr(zero_idx);
-
567  T* z_ptr = z_tiles[0].ptr(zero_idx);
-
568  ColType* c_ptr = c_tiles[0].ptr(zero_idx);
-
569  return applyDeflationToArrays(rho, tol, n, i_ptr, d_ptr, z_ptr, c_ptr);
-
570  };
-
571 
-
572  TileCollector tc{i_begin, i_end};
+
441  TileCollector tc{i_begin, i_end};
+
442  return ex::when_all(ex::when_all_vector(tc.read(c)), ex::when_all_vector(tc.read(evals)),
+
443  ex::when_all_vector(tc.read(in)), ex::when_all_vector(tc.readwrite(out))) |
+
444  di::transform(di::Policy<Backend::MC>(), std::move(part_fn));
+
445 }
+
446 
+
447 template <class T>
+
448 auto stablePartitionIndexForDeflation(
+
449  const SizeType i_begin, const SizeType i_end, Matrix<const ColType, Device::CPU>& c,
+
450  Matrix<const T, Device::CPU>& evals, Matrix<const SizeType, Device::CPU>& in,
+
451  Matrix<SizeType, Device::CPU>& out, Matrix<SizeType, Device::CPU>& out_by_coltype) {
+
452  namespace ex = pika::execution::experimental;
+
453  namespace di = dlaf::internal;
+
454 
+
455  const SizeType n = problemSize(i_begin, i_end, in.distribution());
+
456  auto part_fn = [n](const auto& c_tiles_futs, const auto& evals_tiles_futs, const auto& in_tiles_futs,
+
457  const auto& out_tiles, const auto& out_coltype_tiles) {
+
458  const TileElementIndex zero_idx(0, 0);
+
459  const ColType* c_ptr = c_tiles_futs[0].get().ptr(zero_idx);
+
460  const T* evals_ptr = evals_tiles_futs[0].get().ptr(zero_idx);
+
461  const SizeType* in_ptr = in_tiles_futs[0].get().ptr(zero_idx);
+
462  SizeType* out_ptr = out_tiles[0].ptr(zero_idx);
+
463  SizeType* out_coltype_ptr = out_coltype_tiles[0].ptr(zero_idx);
+
464 
+
465  return stablePartitionIndexForDeflationArrays(n, c_ptr, evals_ptr, in_ptr, out_ptr, out_coltype_ptr);
+
466  };
+
467 
+
468  TileCollector tc{i_begin, i_end};
+
469  return ex::when_all(ex::when_all_vector(tc.read(c)), ex::when_all_vector(tc.read(evals)),
+
470  ex::when_all_vector(tc.read(in)), ex::when_all_vector(tc.readwrite(out)),
+
471  ex::when_all_vector(tc.readwrite(out_by_coltype))) |
+
472  di::transform(di::Policy<Backend::MC>(), std::move(part_fn));
+
473 }
+
474 
+
475 inline void initColTypes(const SizeType i_begin, const SizeType i_split, const SizeType i_end,
+
476  Matrix<ColType, Device::CPU>& coltypes) {
+
477  namespace di = dlaf::internal;
+
478 
+
479  for (SizeType i = i_begin; i < i_end; ++i) {
+
480  const ColType val = (i < i_split) ? ColType::UpperHalf : ColType::LowerHalf;
+
481  di::transformDetach(
+
482  di::Policy<Backend::MC>(),
+
483  [](const ColType& ct, const matrix::Tile<ColType, Device::CPU>& tile) {
+
484  for (SizeType i = 0; i < tile.size().rows(); ++i) {
+
485  tile(TileElementIndex(i, 0)) = ct;
+
486  }
+
487  },
+
488  di::whenAllLift(val, coltypes.readwrite(LocalTileIndex(i, 0))));
+
489  }
+
490 }
+
491 
+
492 // Assumption 1: The algorithm assumes that the arrays `d_ptr`, `z_ptr` and `c_ptr` are of equal length
+
493 // `len` and are sorted in ascending order of the `d_ptr` elements via the index `i_ptr`.
+
494 //
+
495 // Note: parallelizing this algorithm is non-trivial because the deflation regions due to Givens
+
496 // rotations can cross over tiles and are of unknown length. However, such an algorithm is unlikely to
+
497 // benefit much from parallelization anyway, as it is quite light on flops and appears memory bound.
+
498 //
+
499 // Returns an array of Givens rotations used to update the columns of the eigenvector matrix Q
+
500 template <class T>
+
501 std::vector<GivensRotation<T>> applyDeflationToArrays(T rho, T tol, const SizeType len,
+
502  const SizeType* i_ptr, T* d_ptr, T* z_ptr,
+
503  ColType* c_ptr) {
+
504  std::vector<GivensRotation<T>> rots;
+
505  rots.reserve(to_sizet(len));
+
506 
+
507  SizeType i1 = 0; // index of 1st element in the Givens rotation
+
508  // Iterate over the indices of the sorted elements in pairs (i1, i2) where i1 < i2 for every iteration
+
509  for (SizeType i2 = 1; i2 < len; ++i2) {
+
510  const SizeType i1s = i_ptr[i1];
+
511  const SizeType i2s = i_ptr[i2];
+
512  T& d1 = d_ptr[i1s];
+
513  T& d2 = d_ptr[i2s];
+
514  T& z1 = z_ptr[i1s];
+
515  T& z2 = z_ptr[i2s];
+
516  ColType& c1 = c_ptr[i1s];
+
517  ColType& c2 = c_ptr[i2s];
+
518 
+
519  // if z1 is nearly zero, deflate the element and move i1 forward to i2
+
520  if (std::abs(rho * z1) <= tol) {
+
521  c1 = ColType::Deflated;
+
522  i1 = i2;
+
523  continue;
+
524  }
+
525 
+
526  // Deflate the second element if z2 is nearly zero
+
527  if (std::abs(rho * z2) <= tol) {
+
528  c2 = ColType::Deflated;
+
529  continue;
+
530  }
+
531 
+
532  // The Givens deflation condition is the same as the one used in LAPACK's stedc implementation [1].
+
533  // However, here the second entry is deflated instead of the first (z2/d2 instead of z1/d1), thus
+
534  // `s` is not negated.
+
535  //
+
536  // [1] LAPACK 3.10.0, file dlaed2.f, line 393
+
537  T r = std::hypot(z1, z2);
+
538  T c = z1 / r;
+
539  T s = z2 / r;
+
540 
+
541  // If d1 is not nearly equal to d2, move i1 forward to i2
+
542  if (std::abs(c * s * (d2 - d1)) > tol) {
+
543  i1 = i2;
+
544  continue;
+
545  }
+
546 
+
547  // When d1 is nearly equal to d2 apply Givens rotation
+
548  z1 = r;
+
549  z2 = 0;
+
550  T tmp = d1 * s * s + d2 * c * c;
+
551  d1 = d1 * c * c + d2 * s * s;
+
552  d2 = tmp;
+
553 
+
554  rots.push_back(GivensRotation<T>{i1s, i2s, c, s});
+
555  // Set the `i1` column as "Dense" if the `i2` column has opposite non-zero structure (i.e. if
+
556  // one comes from Q1 and the other from Q2 or vice-versa)
+
557  if ((c1 == ColType::UpperHalf && c2 == ColType::LowerHalf) ||
+
558  (c1 == ColType::LowerHalf && c2 == ColType::UpperHalf)) {
+
559  c1 = ColType::Dense;
+
560  }
+
561  c2 = ColType::Deflated;
+
562  }
+
563 
+
564  return rots;
+
565 }
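A short numerical check of the rotation applied above, with assumed values z1 = 3, z2 = 4: r = hypot(z1, z2), c = z1/r and s = z2/r rotate (z1, z2) onto (r, 0), and the two nearly-equal eigenvalues are mixed as d1' = d1*c^2 + d2*s^2 and d2' = d1*s^2 + d2*c^2, which preserves their sum (the trace).

    #include <cassert>
    #include <cmath>

    int main() {
      double z1 = 3.0, z2 = 4.0;
      double d1 = 1.0, d2 = 1.0;
      const double r = std::hypot(z1, z2);   // 5.0
      const double c = z1 / r, s = z2 / r;   // 0.6, 0.8
      const double tmp = d1 * s * s + d2 * c * c;
      d1 = d1 * c * c + d2 * s * s;
      d2 = tmp;
      z1 = r;   // all the weight of z moves into the first entry
      z2 = 0;   // the second column can then be deflated
      assert(std::abs((d1 + d2) - 2.0) < 1e-15);  // trace preserved
    }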
+
566 
+
567 template <class T, class RhoSender, class TolSender>
+
568 auto applyDeflation(const SizeType i_begin, const SizeType i_end, RhoSender&& rho, TolSender&& tol,
+
569  Matrix<const SizeType, Device::CPU>& index, Matrix<T, Device::CPU>& d,
+
570  Matrix<T, Device::CPU>& z, Matrix<ColType, Device::CPU>& c) {
+
571  namespace ex = pika::execution::experimental;
+
572  namespace di = dlaf::internal;
573 
-
574  auto sender = ex::when_all(std::forward<RhoSender>(rho), std::forward<TolSender>(tol),
-
575  ex::when_all_vector(tc.read(index)), ex::when_all_vector(tc.readwrite(d)),
-
576  ex::when_all_vector(tc.readwrite(z)), ex::when_all_vector(tc.readwrite(c)));
-
577 
-
578  return di::transform(di::Policy<Backend::MC>(), std::move(deflate_fn), std::move(sender)) |
-
579  // TODO: This releases the tiles that are kept in the operation state.
-
580  // This is a temporary fix and needs to be replaced by a different
-
581  // adaptor or different lifetime guarantees. This is tracked in
-
582  // https://github.com/pika-org/pika/issues/479.
-
583  ex::ensure_started();
-
584 }
+
574  const SizeType n = problemSize(i_begin, i_end, index.distribution());
+
575 
+
576  auto deflate_fn = [n](auto rho, auto tol, auto index_tiles_futs, auto d_tiles, auto z_tiles,
+
577  auto c_tiles) {
+
578  const TileElementIndex zero_idx(0, 0);
+
579  const SizeType* i_ptr = index_tiles_futs[0].get().ptr(zero_idx);
+
580  T* d_ptr = d_tiles[0].ptr(zero_idx);
+
581  T* z_ptr = z_tiles[0].ptr(zero_idx);
+
582  ColType* c_ptr = c_tiles[0].ptr(zero_idx);
+
583  return applyDeflationToArrays(rho, tol, n, i_ptr, d_ptr, z_ptr, c_ptr);
+
584  };
585 
-
586 // z is an input whose values are destroyed by this call (input + workspace)
-
587 template <class T, class KSender, class RhoSender>
-
588 void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k, RhoSender&& rho,
-
589  Matrix<const T, Device::CPU>& d, Matrix<T, Device::CPU>& z,
-
590  Matrix<T, Device::CPU>& evals, Matrix<const SizeType, Device::CPU>& i2,
-
591  Matrix<T, Device::CPU>& evecs) {
-
592  namespace ex = pika::execution::experimental;
-
593  namespace di = dlaf::internal;
-
594 
-
595  const SizeType n = problemSize(i_begin, i_end, evals.distribution());
-
596  const SizeType nb = evals.distribution().blockSize().rows();
-
597 
-
598  TileCollector tc{i_begin, i_end};
+
586  TileCollector tc{i_begin, i_end};
+
587 
+
588  auto sender = ex::when_all(std::forward<RhoSender>(rho), std::forward<TolSender>(tol),
+
589  ex::when_all_vector(tc.read(index)), ex::when_all_vector(tc.readwrite(d)),
+
590  ex::when_all_vector(tc.readwrite(z)), ex::when_all_vector(tc.readwrite(c)));
+
591 
+
592  return di::transform(di::Policy<Backend::MC>(), std::move(deflate_fn), std::move(sender)) |
+
593  // TODO: This releases the tiles that are kept in the operation state.
+
594  // This is a temporary fix and needs to be replaced by a different
+
595  // adaptor or different lifetime guarantees. This is tracked in
+
596  // https://github.com/pika-org/pika/issues/479.
+
597  ex::ensure_started();
+
598 }
599 
-
600  // Note: at least two columns of tiles per worker, in the range [1, getTridiagRank1NWorkers()]
-
601  const std::size_t nthreads = [nrtiles = (i_end - i_begin)]() {
-
602  const std::size_t min_workers = 1;
-
603  const std::size_t available_workers = getTridiagRank1NWorkers();
-
604  const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(2));
-
605  return std::clamp(ideal_workers, min_workers, available_workers);
-
606  }();
-
607 
-
608  ex::start_detached(
-
609  ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads)), std::forward<KSender>(k),
-
610  std::forward<RhoSender>(rho), ex::when_all_vector(tc.read(d)),
-
611  ex::when_all_vector(tc.readwrite(z)), ex::when_all_vector(tc.readwrite(evals)),
-
612  ex::when_all_vector(tc.read(i2)), ex::when_all_vector(tc.readwrite(evecs)),
-
613  ex::just(std::vector<memory::MemoryView<T, Device::CPU>>())) |
-
614  ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
-
615  ex::bulk(nthreads, [nthreads, n, nb](std::size_t thread_idx, auto& barrier_ptr, auto& k, auto& rho,
-
616  auto& d_tiles_futs, auto& z_tiles, auto& eval_tiles,
-
617  const auto& i2_tile_arr, auto& evec_tiles, auto& ws_vecs) {
-
618  const matrix::Distribution distr(LocalElementSize(n, n), TileElementSize(nb, nb));
-
619 
-
620  const SizeType* i2_perm = i2_tile_arr[0].get().ptr();
+
600 // z is an input whose values are destroyed by this call (input + workspace)
+
601 template <class T, class KSender, class RhoSender>
+
602 void solveRank1Problem(const SizeType i_begin, const SizeType i_end, KSender&& k, RhoSender&& rho,
+
603  Matrix<const T, Device::CPU>& d, Matrix<T, Device::CPU>& z,
+
604  Matrix<T, Device::CPU>& evals, Matrix<const SizeType, Device::CPU>& i2,
+
605  Matrix<T, Device::CPU>& evecs) {
+
606  namespace ex = pika::execution::experimental;
+
607  namespace di = dlaf::internal;
+
608 
+
609  const SizeType n = problemSize(i_begin, i_end, evals.distribution());
+
610  const SizeType nb = evals.distribution().blockSize().rows();
+
611 
+
612  TileCollector tc{i_begin, i_end};
+
613 
+
614  // Note: at least two columns of tiles per worker, in the range [1, getTridiagRank1NWorkers()]
+
615  const std::size_t nthreads = [nrtiles = (i_end - i_begin)]() {
+
616  const std::size_t min_workers = 1;
+
617  const std::size_t available_workers = getTridiagRank1NWorkers();
+
618  const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(2));
+
619  return std::clamp(ideal_workers, min_workers, available_workers);
+
620  }();
621 
-
622  const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
-
623  const std::size_t batch_size = util::ceilDiv(to_sizet(k), nthreads);
-
624  const std::size_t begin = thread_idx * batch_size;
-
625  const std::size_t end = std::min(thread_idx * batch_size + batch_size, to_sizet(k));
-
626 
-
627  // STEP 0a: Fill ones for deflated eigenvectors. (single-thread)
-
628  // Note: this step is completely independent from the rest, but it is small and it is going
-
629  // to be dropped soon.
-
630  // Note: use last thread that in principle should have less work to do
-
631  if (thread_idx == nthreads - 1) {
-
632  for (SizeType j = k; j < n; ++j) {
-
633  const GlobalElementIndex jj(j, j);
-
634  const auto linear_jj = distr.globalTileLinearIndex(jj);
-
635  const auto jj_el = distr.tileElementIndex(jj);
-
636 
-
637  evec_tiles[to_sizet(linear_jj)](jj_el) = 1;
-
638  }
-
639  }
+
622  ex::start_detached(
+
623  ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads)), std::forward<KSender>(k),
+
624  std::forward<RhoSender>(rho), ex::when_all_vector(tc.read(d)),
+
625  ex::when_all_vector(tc.readwrite(z)), ex::when_all_vector(tc.readwrite(evals)),
+
626  ex::when_all_vector(tc.read(i2)), ex::when_all_vector(tc.readwrite(evecs)),
+
627  ex::just(std::vector<memory::MemoryView<T, Device::CPU>>())) |
+
628  ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
+
629  ex::bulk(nthreads, [nthreads, n, nb](std::size_t thread_idx, auto& barrier_ptr, auto& k, auto& rho,
+
630  auto& d_tiles_futs, auto& z_tiles, auto& eval_tiles,
+
631  const auto& i2_tile_arr, auto& evec_tiles, auto& ws_vecs) {
+
632  const matrix::Distribution distr(LocalElementSize(n, n), TileElementSize(nb, nb));
+
633 
+
634  const SizeType* i2_perm = i2_tile_arr[0].get().ptr();
+
635 
+
636  const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
+
637  const std::size_t batch_size = util::ceilDiv(to_sizet(k), nthreads);
+
638  const std::size_t begin = thread_idx * batch_size;
+
639  const std::size_t end = std::min(thread_idx * batch_size + batch_size, to_sizet(k));
640 
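Each bulk worker then processes one contiguous batch of at most ceilDiv(k, nthreads) columns, with the last batch possibly shorter. A standalone sketch, assuming hypothetical values k == 10 and nthreads == 3:

#include <algorithm>
#include <cstddef>
#include <utility>

// Sketch only: the per-thread [begin, end) partition used above.
std::pair<std::size_t, std::size_t> batch_bounds(std::size_t thread_idx, std::size_t k,
                                                 std::size_t nthreads) {
  const std::size_t batch_size = (k + nthreads - 1) / nthreads;  // ceilDiv(k, nthreads)
  const std::size_t begin = thread_idx * batch_size;
  const std::size_t end = std::min(begin + batch_size, k);
  return {begin, end};  // k == 10, nthreads == 3: [0, 4), [4, 8), [8, 10)
}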
-
641  // STEP 0b: Initialize workspaces (single-thread)
-
642  if (thread_idx == 0) {
-
643  ws_vecs.reserve(nthreads);
-
644  for (std::size_t i = 0; i < nthreads; ++i)
-
645  ws_vecs.emplace_back(to_sizet(k));
-
646  }
-
647 
-
648  barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
649 
-
650  // STEP 1: LAED4 (multi-thread)
-
651  const T* d_ptr = d_tiles_futs[0].get().ptr();
-
652  const T* z_ptr = z_tiles[0].ptr();
-
653 
-
654  {
-
655  common::internal::SingleThreadedBlasScope single;
-
656 
-
657  T* eval_ptr = eval_tiles[0].ptr();
-
658 
-
659  for (std::size_t i = begin; i < end; ++i) {
-
660  T& eigenval = eval_ptr[i];
+
641  // STEP 0a: Fill ones for deflated eigenvectors (single-thread)
+
642  // Note: this step is completely independent of the rest, but it is small and it is going
+
643  // to be dropped soon.
+
644  // Note: use the last thread, which in principle should have less work to do
+
645  if (thread_idx == nthreads - 1) {
+
646  for (SizeType j = k; j < n; ++j) {
+
647  const GlobalElementIndex jj(j, j);
+
648  const auto linear_jj = distr.globalTileLinearIndex(jj);
+
649  const auto jj_el = distr.tileElementIndex(jj);
+
650 
+
651  evec_tiles[to_sizet(linear_jj)](jj_el) = 1;
+
652  }
+
653  }
+
654 
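Setting a 1 on the diagonal of every deflated column turns those columns of the small eigenvector matrix U into unit vectors, so the later product Q * U passes the already-final deflated eigenvectors of Q through unchanged; schematically:

U e_j = e_j \quad (j \ge k) \qquad\Longrightarrow\qquad (Q\,U)\, e_j = Q\, e_j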
+
655  // STEP 0b: Initialize workspaces (single-thread)
+
656  if (thread_idx == 0) {
+
657  ws_vecs.reserve(nthreads);
+
658  for (std::size_t i = 0; i < nthreads; ++i)
+
659  ws_vecs.emplace_back(to_sizet(k));
+
660  }
661 
-
662  const SizeType i_tile = distr.globalTileLinearIndex(GlobalElementIndex(0, to_SizeType(i)));
-
663  const SizeType i_col = distr.tileElementFromGlobalElement<Coord::Col>(to_SizeType(i));
-
664  T* delta = evec_tiles[to_sizet(i_tile)].ptr(TileElementIndex(0, i_col));
-
665 
-
666  lapack::laed4(to_int(k), to_int(i), d_ptr, z_ptr, delta, rho, &eigenval);
-
667  }
-
668 
-
669  // Note: laed4 handles k <= 2 cases differently
-
670  if (k <= 2) {
-
671  // Note: The rows should be permuted for the k=2 case as well.
-
672  if (k == 2) {
-
673  T* ws = ws_vecs[thread_idx]();
-
674  for (SizeType j = to_SizeType(begin); j < to_SizeType(end); ++j) {
-
675  const SizeType j_tile = distr.globalTileLinearIndex(GlobalElementIndex(0, j));
-
676  const SizeType j_col = distr.tileElementFromGlobalElement<Coord::Col>(j);
-
677  T* evec = evec_tiles[to_sizet(j_tile)].ptr(TileElementIndex(0, j_col));
-
678 
-
679  std::copy(evec, evec + k, ws);
-
680  std::fill_n(evec, k, 0); // by default "deflated"
-
681  for (SizeType i = 0; i < n; ++i) {
-
682  const SizeType ii = i2_perm[i];
-
683  if (ii < k)
-
684  evec[i] = ws[ii];
-
685  }
-
686  }
-
687  }
-
688  return;
-
689  }
-
690  }
-
691 
-
692  // Note: This barrier ensures that LAED4 finished, so from now on values are available
-
693  barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
694 
-
695  // STEP 2a: Compute weights (multi-thread)
-
696  auto& q = evec_tiles;
-
697  T* w = ws_vecs[thread_idx]();
-
698 
-
699  // - copy diagonal from q -> w (or just initialize with 1)
-
700  if (thread_idx == 0) {
-
701  for (SizeType i = 0; i < k; ++i) {
-
702  const GlobalElementIndex kk(i, i);
-
703  const auto diag_tile = distr.globalTileLinearIndex(kk);
-
704  const auto diag_element = distr.tileElementIndex(kk);
+
662  barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
663 
+
664  // STEP 1: LAED4 (multi-thread)
+
665  const T* d_ptr = d_tiles_futs[0].get().ptr();
+
666  const T* z_ptr = z_tiles[0].ptr();
+
667 
+
668  {
+
669  common::internal::SingleThreadedBlasScope single;
+
670 
+
671  T* eval_ptr = eval_tiles[0].ptr();
+
672 
+
673  for (std::size_t i = begin; i < end; ++i) {
+
674  T& eigenval = eval_ptr[i];
+
675 
+
676  const SizeType i_tile = distr.globalTileLinearIndex(GlobalElementIndex(0, to_SizeType(i)));
+
677  const SizeType i_col = distr.tileElementFromGlobalElement<Coord::Col>(to_SizeType(i));
+
678  T* delta = evec_tiles[to_sizet(i_tile)].ptr(TileElementIndex(0, i_col));
+
679 
+
680  lapack::laed4(to_int(k), to_int(i), d_ptr, z_ptr, delta, rho, &eigenval);
+
681  }
+
682 
+
683  // Note: laed4 handles k <= 2 cases differently
+
684  if (k <= 2) {
+
685  // Note: The rows should be permuted for the k=2 case as well.
+
686  if (k == 2) {
+
687  T* ws = ws_vecs[thread_idx]();
+
688  for (SizeType j = to_SizeType(begin); j < to_SizeType(end); ++j) {
+
689  const SizeType j_tile = distr.globalTileLinearIndex(GlobalElementIndex(0, j));
+
690  const SizeType j_col = distr.tileElementFromGlobalElement<Coord::Col>(j);
+
691  T* evec = evec_tiles[to_sizet(j_tile)].ptr(TileElementIndex(0, j_col));
+
692 
+
693  std::copy(evec, evec + k, ws);
+
694  std::fill_n(evec, k, 0); // by default "deflated"
+
695  for (SizeType i = 0; i < n; ++i) {
+
696  const SizeType ii = i2_perm[i];
+
697  if (ii < k)
+
698  evec[i] = ws[ii];
+
699  }
+
700  }
+
701  }
+
702  return;
+
703  }
+
704  }
705 
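For reference, each laed4 call above computes one root lambda_i of the secular equation of the rank-one modified eigenproblem, and returns the differences d_j - lambda_i in delta (standard LAPACK xLAED4 behaviour):

f(\lambda) \;=\; 1 + \rho \sum_{j=1}^{k} \frac{z_j^2}{d_j - \lambda} \;=\; 0,
\qquad \delta_i(j) \;=\; d_j - \lambda_i .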
-
706  w[i] = q[to_sizet(diag_tile)](diag_element);
-
707  }
-
708  }
-
709  else {
-
710  std::fill_n(w, k, T(1));
-
711  }
+
706  // Note: This barrier ensures that LAED4 finished, so from now on values are available
+
707  barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
708 
+
709  // STEP 2a: Compute weights (multi-thread)
+
710  auto& q = evec_tiles;
+
711  T* w = ws_vecs[thread_idx]();
712 
-
713  // - compute the product over the columns
-
714  auto compute_w = [&](const GlobalElementIndex ij) {
-
715  const auto q_tile = distr.globalTileLinearIndex(ij);
-
716  const auto q_ij = distr.tileElementIndex(ij);
-
717 
-
718  const SizeType i = ij.row();
-
719  const SizeType j = ij.col();
-
720 
-
721  w[i] *= q[to_sizet(q_tile)](q_ij) / (d_ptr[to_sizet(i)] - d_ptr[to_sizet(j)]);
-
722  };
-
723 
-
724  for (SizeType j = to_SizeType(begin); j < to_SizeType(end); ++j) {
-
725  for (SizeType i = 0; i < j; ++i)
-
726  compute_w({i, j});
-
727 
-
728  for (SizeType i = j + 1; i < k; ++i)
-
729  compute_w({i, j});
-
730  }
+
713  // - copy diagonal from q -> w (or just initialize with 1)
+
714  if (thread_idx == 0) {
+
715  for (SizeType i = 0; i < k; ++i) {
+
716  const GlobalElementIndex kk(i, i);
+
717  const auto diag_tile = distr.globalTileLinearIndex(kk);
+
718  const auto diag_element = distr.tileElementIndex(kk);
+
719 
+
720  w[i] = q[to_sizet(diag_tile)](diag_element);
+
721  }
+
722  }
+
723  else {
+
724  std::fill_n(w, k, T(1));
+
725  }
+
726 
+
727  // - compute the product over the columns
+
728  auto compute_w = [&](const GlobalElementIndex ij) {
+
729  const auto q_tile = distr.globalTileLinearIndex(ij);
+
730  const auto q_ij = distr.tileElementIndex(ij);
731 
-
732  barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
733 
-
734  // STEP 2b: reduce, then finalize computation with sign and square root (single-thread)
-
735  if (thread_idx == 0) {
-
736  for (SizeType i = 0; i < k; ++i) {
-
737  for (std::size_t tidx = 1; tidx < nthreads; ++tidx) {
-
738  const T* w_partial = ws_vecs[tidx]();
-
739  w[i] *= w_partial[i];
-
740  }
-
741  z_tiles[0].ptr()[i] = std::copysign(std::sqrt(-w[i]), z_ptr[to_sizet(i)]);
-
742  }
-
743  }
-
744 
-
745  barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
746 
-
747  // STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread)
-
748  {
-
749  common::internal::SingleThreadedBlasScope single;
-
750 
-
751  const T* w = z_ptr;
-
752  T* s = ws_vecs[thread_idx]();
-
753 
-
754  for (SizeType j = to_SizeType(begin); j < to_SizeType(end); ++j) {
-
755  for (SizeType i = 0; i < k; ++i) {
-
756  const auto q_tile = distr.globalTileLinearIndex({i, j});
-
757  const auto q_ij = distr.tileElementIndex({i, j});
+
732  const SizeType i = ij.row();
+
733  const SizeType j = ij.col();
+
734 
+
735  w[i] *= q[to_sizet(q_tile)](q_ij) / (d_ptr[to_sizet(i)] - d_ptr[to_sizet(j)]);
+
736  };
+
737 
+
738  for (SizeType j = to_SizeType(begin); j < to_SizeType(end); ++j) {
+
739  for (SizeType i = 0; i < j; ++i)
+
740  compute_w({i, j});
+
741 
+
742  for (SizeType i = j + 1; i < k; ++i)
+
743  compute_w({i, j});
+
744  }
+
745 
+
746  barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
747 
+
748  // STEP 2b: reduce, then finalize computation with sign and square root (single-thread)
+
749  if (thread_idx == 0) {
+
750  for (SizeType i = 0; i < k; ++i) {
+
751  for (std::size_t tidx = 1; tidx < nthreads; ++tidx) {
+
752  const T* w_partial = ws_vecs[tidx]();
+
753  w[i] *= w_partial[i];
+
754  }
+
755  z_tiles[0].ptr()[i] = std::copysign(std::sqrt(-w[i]), z_ptr[to_sizet(i)]);
+
756  }
+
757  }
758 
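STEPs 2a/2b above are the stable recomputation of the rank-one update vector in the style of Gu and Eisenstat: every worker accumulates partial per-row products, then thread 0 multiplies them together and applies sign and square root. In formula:

\hat z_i \;=\; \operatorname{sign}(z_i)\,
\sqrt{-\,\prod_{j=1}^{k} (d_i - \lambda_j) \Big/ \prod_{j \neq i} (d_i - d_j)}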
-
759  s[i] = w[i] / q[to_sizet(q_tile)](q_ij);
-
760  }
-
761 
-
762  const T vec_norm = blas::nrm2(k, s, 1);
-
763 
-
764  for (SizeType i = 0; i < k; ++i) {
-
765  const SizeType ii = i2_perm[i];
-
766  const auto q_tile = distr.globalTileLinearIndex({i, j});
-
767  const auto q_ij = distr.tileElementIndex({i, j});
-
768 
-
769  q[to_sizet(q_tile)](q_ij) = s[ii] / vec_norm;
-
770  }
-
771  }
-
772  }
-
773  }));
-
774 }
+
759  barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
760 
+
761  // STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread)
+
762  {
+
763  common::internal::SingleThreadedBlasScope single;
+
764 
+
765  const T* w = z_ptr;
+
766  T* s = ws_vecs[thread_idx]();
+
767 
+
768  for (SizeType j = to_SizeType(begin); j < to_SizeType(end); ++j) {
+
769  for (SizeType i = 0; i < k; ++i) {
+
770  const auto q_tile = distr.globalTileLinearIndex({i, j});
+
771  const auto q_ij = distr.tileElementIndex({i, j});
+
772 
+
773  s[i] = w[i] / q[to_sizet(q_tile)](q_ij);
+
774  }
775 
-
776 template <Backend B, Device D, class T, class RhoSender>
-
777 void mergeSubproblems(const SizeType i_begin, const SizeType i_split, const SizeType i_end,
-
778  RhoSender&& rho, WorkSpace<T, D>& ws, WorkSpaceHost<T>& ws_h,
-
779  WorkSpaceHostMirror<T, D>& ws_hm) {
-
780  namespace ex = pika::execution::experimental;
-
781  namespace di = dlaf::internal;
+
776  const T vec_norm = blas::nrm2(k, s, 1);
+
777 
+
778  for (SizeType i = 0; i < k; ++i) {
+
779  const SizeType ii = i2_perm[i];
+
780  const auto q_tile = distr.globalTileLinearIndex({i, j});
+
781  const auto q_ij = distr.tileElementIndex({i, j});
782 
-
783  const GlobalTileIndex idx_gl_begin(i_begin, i_begin);
-
784  const LocalTileIndex idx_loc_begin(i_begin, i_begin);
-
785  const SizeType nrtiles = i_end - i_begin;
-
786  const LocalTileSize sz_loc_tiles(nrtiles, nrtiles);
-
787 
-
788  const LocalTileIndex idx_begin_tiles_vec(i_begin, 0);
-
789  const LocalTileSize sz_tiles_vec(nrtiles, 1);
-
790 
-
791  // Calculate the size of the upper subproblem
-
792  const SizeType n1 = problemSize(i_begin, i_split, ws.e0.distribution());
-
793 
-
794  // Assemble the rank-1 update vector `z` from the last row of Q1 and the first row of Q2
-
795  assembleZVec(i_begin, i_split, i_end, rho, ws.e0, ws.z0);
-
796  copy(idx_begin_tiles_vec, sz_tiles_vec, ws.z0, ws_hm.z0);
-
797 
-
798  // Double `rho` to account for the normalization of `z` and make sure `rho > 0` for the root solver laed4
-
799  auto scaled_rho = scaleRho(std::move(rho)) | ex::split();
-
800 
-
801  // Calculate the tolerance used for deflation
-
802  auto tol = calcTolerance(i_begin, i_end, ws_h.d0, ws_hm.z0);
-
803 
-
804  // Initialize the column types vector `c`
-
805  initColTypes(i_begin, i_split, i_end, ws_h.c);
-
806 
-
807  // Initialize `i1` as identity just for single tile sub-problems
-
808  if (i_split == i_begin + 1) {
-
809  initIndex(i_begin, i_split, ws_h.i1);
-
810  }
-
811  if (i_split + 1 == i_end) {
-
812  initIndex(i_split, i_end, ws_h.i1);
-
813  }
+
783  q[to_sizet(q_tile)](q_ij) = s[ii] / vec_norm;
+
784  }
+
785  }
+
786  }
+
787  }));
+
788 }
+
789 
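In STEP 3 above, each eigenvector of the rank-one problem is formed from the recomputed z-hat and the laed4 differences, then normalized; component-wise (following the standard secular-equation solvers):

u_j(i) \;=\; \frac{\hat z_i / (d_i - \lambda_j)}
{\sqrt{\sum_{m=1}^{k} \hat z_m^2 / (d_m - \lambda_j)^2}}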
+
790 template <Backend B, Device D, class T, class RhoSender>
+
791 void mergeSubproblems(const SizeType i_begin, const SizeType i_split, const SizeType i_end,
+
792  RhoSender&& rho, WorkSpace<T, D>& ws, WorkSpaceHost<T>& ws_h,
+
793  WorkSpaceHostMirror<T, D>& ws_hm) {
+
794  namespace ex = pika::execution::experimental;
+
795  namespace di = dlaf::internal;
+
796 
+
797  const GlobalTileIndex idx_gl_begin(i_begin, i_begin);
+
798  const LocalTileIndex idx_loc_begin(i_begin, i_begin);
+
799  const SizeType nrtiles = i_end - i_begin;
+
800  const LocalTileSize sz_loc_tiles(nrtiles, nrtiles);
+
801 
+
802  const LocalTileIndex idx_begin_tiles_vec(i_begin, 0);
+
803  const LocalTileSize sz_tiles_vec(nrtiles, 1);
+
804 
+
805  // Calculate the size of the upper subproblem
+
806  const SizeType n1 = problemSize(i_begin, i_split, ws.e0.distribution());
+
807 
+
808  // Assemble the rank-1 update vector `z` from the last row of Q1 and the first row of Q2
+
809  assembleZVec(i_begin, i_split, i_end, rho, ws.e0, ws.z0);
+
810  copy(idx_begin_tiles_vec, sz_tiles_vec, ws.z0, ws_hm.z0);
+
811 
+
812  // Double `rho` to account for the normalization of `z` and make sure `rho > 0` for the root solver laed4
+
813  auto scaled_rho = scaleRho(std::move(rho)) | ex::split();
814 
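The doubling of rho compensates for the normalization of z: the assembled vector has norm sqrt(2) (see the note before assembleDistZVec further down), so the rank-one update keeps its value only if the coefficient is doubled:

\rho\, z z^{\top} \;=\; (2\rho)\, \hat z \hat z^{\top},
\qquad \hat z = z/\sqrt{2}, \quad \lVert z \rVert = \sqrt{2}.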
-
815  // Update indices of second sub-problem
-
816  addIndex(i_split, i_end, n1, ws_h.i1);
+
815  // Calculate the tolerance used for deflation
+
816  auto tol = calcTolerance(i_begin, i_end, ws_h.d0, ws_hm.z0);
817 
-
818  // Step #1
-
819  //
-
820  // i1 (in) : initial <--- pre_sorted per sub-problem
-
821  // i2 (out) : initial <--- pre_sorted
-
822  //
-
823  // - deflate `d`, `z` and `c`
-
824  // - apply Givens rotations to `Q` - `evecs`
-
825  //
-
826  sortIndex(i_begin, i_end, ex::just(n1), ws_h.d0, ws_h.i1, ws_hm.i2);
-
827 
-
828  auto rots =
-
829  applyDeflation(i_begin, i_end, scaled_rho, std::move(tol), ws_hm.i2, ws_h.d0, ws_hm.z0, ws_h.c);
-
830 
-
831  applyGivensRotationsToMatrixColumns(i_begin, i_end, std::move(rots), ws.e0);
-
832 
-
833  // Step #2
-
834  //
-
835  // i2 (in) : initial <--- pre_sorted
-
836  // i5 (out) : initial <--- sorted by coltype
-
837  // i3 (out) : initial <--- deflated
-
838  // i4 (out) : deflated <--- sorted by coltype
+
818  // Initialize the column types vector `c`
+
819  initColTypes(i_begin, i_split, i_end, ws_h.c);
+
820 
+
821  // Initialize `i1` as identity just for single tile sub-problems
+
822  if (i_split == i_begin + 1) {
+
823  initIndex(i_begin, i_split, ws_h.i1);
+
824  }
+
825  if (i_split + 1 == i_end) {
+
826  initIndex(i_split, i_end, ws_h.i1);
+
827  }
+
828 
+
829  // Update indices of second sub-problem
+
830  addIndex(i_split, i_end, n1, ws_h.i1);
+
831 
+
832  // Step #1
+
833  //
+
834  // i1 (in) : initial <--- pre_sorted per sub-problem
+
835  // i2 (out) : initial <--- pre_sorted
+
836  //
+
837  // - deflate `d`, `z` and `c`
+
838  // - apply Givens rotations to `Q` - `evecs`
839  //
-
840  // Note: `i3[k:] == i5[k:]` (i.e. the deflated parts are sorted in the same way)
-
841  //
-
842  // - permute eigenvectors in `e0` using `i5` so that they are sorted by column type in `e1`
-
843  // - reorder `d0 -> d1`, `z0 -> z1`, using `i3` such that deflated entries are at the bottom.
-
844  // - compute permutation `i4`: sorted by col type ---> deflated
-
845  // - solve rank-1 problem and save eigenvalues in `d0` and `d1` (copy) and eigenvectors in `e2` (sorted
-
846  // by coltype)
-
847  // - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented)
+
840  sortIndex(i_begin, i_end, ex::just(n1), ws_h.d0, ws_h.i1, ws_hm.i2);
+
841 
+
842  auto rots =
+
843  applyDeflation(i_begin, i_end, scaled_rho, std::move(tol), ws_hm.i2, ws_h.d0, ws_hm.z0, ws_h.c);
+
844 
+
845  applyGivensRotationsToMatrixColumns(i_begin, i_end, std::move(rots), ws.e0);
+
846 
+
847  // Step #2
848  //
-
849  // | U | U | D | D | | | DF | DF | U: UpperHalf
-
850  // | U | U | D | D | | | DF | DF | D: Dense
-
851  // | | | D | D | L | L | DF | DF | L: LowerHalf
-
852  // | | | D | D | L | L | DF | DF | DF: Deflated
-
853  // | | | D | D | L | L | DF | DF |
-
854  //
-
855  auto k =
-
856  stablePartitionIndexForDeflation(i_begin, i_end, ws_h.c, ws_h.d0, ws_hm.i2, ws_h.i3, ws_hm.i5) |
-
857  ex::split();
-
858 
-
859  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i5, ws.i5);
-
860  dlaf::permutations::permute<B, D, T, Coord::Col>(i_begin, i_end, ws.i5, ws.e0, ws.e1);
-
861 
-
862  applyIndex(i_begin, i_end, ws_h.i3, ws_h.d0, ws_hm.d1);
-
863  applyIndex(i_begin, i_end, ws_h.i3, ws_hm.z0, ws_hm.z1);
-
864  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.d1, ws_h.d0);
-
865 
-
866  //
-
867  // i3 (in) : initial <--- deflated
-
868  // i2 (out) : deflated <--- initial
-
869  //
-
870  invertIndex(i_begin, i_end, ws_h.i3, ws_hm.i2);
-
871 
-
872  //
-
873  // i5 (in) : initial <--- sort by coltype
-
874  // i2 (in) : deflated <--- initial
-
875  // i4 (out) : deflated <--- sort by col type
-
876  //
-
877  // This allows the rank-1 solver to work with columns sorted by type, so that they are well-shaped
-
878  // for an optimized GEMM, while still keeping track of each column's actual position in eigenvalue order.
-
879  applyIndex(i_begin, i_end, ws_hm.i5, ws_hm.i2, ws_h.i4);
-
880 
-
881  // Note:
-
882  // This is needed to set to zero elements of e2 outside of the k by k top-left part.
-
883  // The input is not required to be zero for solveRank1Problem.
-
884  matrix::util::set0<Backend::MC>(pika::execution::thread_priority::normal, idx_loc_begin, sz_loc_tiles,
-
885  ws_hm.e2);
-
886  solveRank1Problem(i_begin, i_end, k, scaled_rho, ws_hm.d1, ws_hm.z1, ws_h.d0, ws_h.i4, ws_hm.e2);
-
887  copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e2);
-
888 
-
889  // Step #3: Eigenvectors of the tridiagonal system: Q * U
+
849  // i2 (in) : initial <--- pre_sorted
+
850  // i5 (out) : initial <--- sorted by coltype
+
851  // i3 (out) : initial <--- deflated
+
852  // i4 (out) : deflated <--- sorted by coltype
+
853  //
+
854  // Note: `i3[k:] == i5[k:]` (i.e. the deflated parts are sorted in the same way)
+
855  //
+
856  // - permute eigenvectors in `e0` using `i5` so that they are sorted by column type in `e1`
+
857  // - reorder `d0 -> d1`, `z0 -> z1`, using `i3` such that deflated entries are at the bottom.
+
858  // - compute permutation `i4`: sorted by col type ---> deflated
+
859  // - solve rank-1 problem and save eigenvalues in `d0` and `d1` (copy) and eigenvectors in `e2` (sorted
+
860  // by coltype)
+
861  // - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented)
+
862  //
+
863  // | U | U | D | D | | | DF | DF | U: UpperHalf
+
864  // | U | U | D | D | | | DF | DF | D: Dense
+
865  // | | | D | D | L | L | DF | DF | L: LowerHalf
+
866  // | | | D | D | L | L | DF | DF | DF: Deflated
+
867  // | | | D | D | L | L | DF | DF |
+
868  //
+
869  auto k =
+
870  stablePartitionIndexForDeflation(i_begin, i_end, ws_h.c, ws_h.d0, ws_hm.i2, ws_h.i3, ws_hm.i5) |
+
871  ex::split();
+
872 
+
873  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i5, ws.i5);
+
874  dlaf::permutations::permute<B, D, T, Coord::Col>(i_begin, i_end, ws.i5, ws.e0, ws.e1);
+
875 
+
876  applyIndex(i_begin, i_end, ws_h.i3, ws_h.d0, ws_hm.d1);
+
877  applyIndex(i_begin, i_end, ws_h.i3, ws_hm.z0, ws_hm.z1);
+
878  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.d1, ws_h.d0);
+
879 
+
880  //
+
881  // i3 (in) : initial <--- deflated
+
882  // i2 (out) : deflated <--- initial
+
883  //
+
884  invertIndex(i_begin, i_end, ws_h.i3, ws_hm.i2);
+
885 
+
886  //
+
887  // i5 (in) : initial <--- sort by coltype
+
888  // i2 (in) : deflated <--- initial
+
889  // i4 (out) : deflated <--- sort by col type
890  //
-
891  // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
-
892  // prepared for the deflated system.
-
893  dlaf::multiplication::internal::generalSubMatrix<B, D, T>(i_begin, i_end, blas::Op::NoTrans,
-
894  blas::Op::NoTrans, T(1), ws.e1, ws.e2, T(0),
-
895  ws.e0);
-
896 
-
897  // Step #4: Final permutation to sort eigenvalues and eigenvectors
-
898  //
-
899  // i1 (in) : deflated <--- deflated (identity map)
-
900  // i2 (out) : deflated <--- post_sorted
-
901  //
-
902  initIndex(i_begin, i_end, ws_h.i1);
-
903  sortIndex(i_begin, i_end, std::move(k), ws_h.d0, ws_h.i1, ws_hm.i2);
-
904  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i2, ws_h.i1);
-
905 }
-
906 
-
907  // Assembles `z` from the bottom row of Q1 and the top row of Q2. The bottom row of Q1 is negated if `rho < 0`.
-
908 //
-
909  // Note that the norm of `z` is sqrt(2) because it is a concatenation of two normalized vectors. Hence
-
910 // to normalize `z` we have to divide by sqrt(2).
-
911 template <class T, Device D, class RhoSender>
-
912 void assembleDistZVec(comm::CommunicatorGrid grid, common::Pipeline<comm::Communicator>& full_task_chain,
-
913  const SizeType i_begin, const SizeType i_split, const SizeType i_end,
-
914  RhoSender&& rho, Matrix<const T, D>& evecs, Matrix<T, D>& z) {
-
915  namespace ex = pika::execution::experimental;
-
916 
-
917  const matrix::Distribution& dist = evecs.distribution();
-
918  comm::Index2D this_rank = dist.rankIndex();
-
919 
-
920  // Iterate over tiles of Q1 and Q2 around the split row `i_split`.
-
921  for (SizeType i = i_begin; i < i_end; ++i) {
-
922  // True if tile is in Q1
-
923  bool top_tile = i < i_split;
-
924  // Move to the row below `i_split` for `Q2`
-
925  const SizeType evecs_row = i_split - ((top_tile) ? 1 : 0);
-
926  const GlobalTileIndex idx_evecs(evecs_row, i);
-
927  const GlobalTileIndex z_idx(i, 0);
-
928 
-
929  // Copy the last row of a `Q1` tile or the first row of a `Q2` tile into a column vector `z` tile
-
930  comm::Index2D evecs_tile_rank = dist.rankGlobalTile(idx_evecs);
-
931  if (evecs_tile_rank == this_rank) {
-
932  // Copy the row into the column vector `z`
-
933  assembleRank1UpdateVectorTileAsync<T, D>(top_tile, rho, evecs.read(idx_evecs), z.readwrite(z_idx));
-
934  ex::start_detached(comm::scheduleSendBcast(full_task_chain(), z.read(z_idx)));
-
935  }
-
936  else {
-
937  const comm::IndexT_MPI root_rank = grid.rankFullCommunicator(evecs_tile_rank);
-
938  ex::start_detached(comm::scheduleRecvBcast(full_task_chain(), root_rank, z.readwrite(z_idx)));
-
939  }
-
940  }
-
941 }
+
891  // This allows the rank-1 solver to work with columns sorted by type, so that they are well-shaped
+
892  // for an optimized GEMM, while still keeping track of each column's actual position in eigenvalue order.
+
893  applyIndex(i_begin, i_end, ws_hm.i5, ws_hm.i2, ws_h.i4);
+
894 
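Reading each annotation `a <--- b` as "indexed in b-order, values in a-order", the applyIndex call composes the two maps as i4[j] = i2[i5[j]]. A tiny standalone sketch with hypothetical 4-entry permutations (the orientation convention is inferred from the comments above):

#include <array>
#include <cstdio>

int main() {
  const std::array<int, 4> i5 = {2, 0, 3, 1};  // initial  <--- sorted by coltype
  const std::array<int, 4> i2 = {3, 1, 0, 2};  // deflated <--- initial
  std::array<int, 4> i4{};                     // deflated <--- sorted by coltype
  for (int j = 0; j < 4; ++j)
    i4[j] = i2[i5[j]];                         // out[j] = in[index[j]]
  for (int v : i4)
    std::printf("%d ", v);                     // prints: 0 3 2 1
}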
+
895  // Note:
+
896  // This is needed to set to zero elements of e2 outside of the k by k top-left part.
+
897  // The input is not required to be zero for solveRank1Problem.
+
898  matrix::util::set0<Backend::MC>(pika::execution::thread_priority::normal, idx_loc_begin, sz_loc_tiles,
+
899  ws_hm.e2);
+
900  solveRank1Problem(i_begin, i_end, k, scaled_rho, ws_hm.d1, ws_hm.z1, ws_h.d0, ws_h.i4, ws_hm.e2);
+
901  copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e2);
+
902 
+
903  // Step #3: Eigenvectors of the tridiagonal system: Q * U
+
904  //
+
905  // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
+
906  // prepared for the deflated system.
+
907  dlaf::multiplication::internal::generalSubMatrix<B, D, T>(i_begin, i_end, blas::Op::NoTrans,
+
908  blas::Op::NoTrans, T(1), ws.e1, ws.e2, T(0),
+
909  ws.e0);
+
910 
+
911  // Step #4: Final permutation to sort eigenvalues and eigenvectors
+
912  //
+
913  // i1 (in) : deflated <--- deflated (identity map)
+
914  // i2 (out) : deflated <--- post_sorted
+
915  //
+
916  initIndex(i_begin, i_end, ws_h.i1);
+
917  sortIndex(i_begin, i_end, std::move(k), ws_h.d0, ws_h.i1, ws_hm.i2);
+
918  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i2, ws_h.i1);
+
919 }
+
920 
+
921  // Assembles `z` from the bottom row of Q1 and the top row of Q2. The bottom row of Q1 is negated if `rho < 0`.
+
922 //
+
923  // Note that the norm of `z` is sqrt(2) because it is a concatenation of two normalized vectors. Hence
+
924 // to normalize `z` we have to divide by sqrt(2).
+
925 template <class T, Device D, class RhoSender>
+
926 void assembleDistZVec(comm::CommunicatorGrid grid, common::Pipeline<comm::Communicator>& full_task_chain,
+
927  const SizeType i_begin, const SizeType i_split, const SizeType i_end,
+
928  RhoSender&& rho, Matrix<const T, D>& evecs, Matrix<T, D>& z) {
+
929  namespace ex = pika::execution::experimental;
+
930 
+
931  const matrix::Distribution& dist = evecs.distribution();
+
932  comm::Index2D this_rank = dist.rankIndex();
+
933 
+
934  // Iterate over tiles of Q1 and Q2 around the split row `i_split`.
+
935  for (SizeType i = i_begin; i < i_end; ++i) {
+
936  // True if tile is in Q1
+
937  bool top_tile = i < i_split;
+
938  // Move to the row below `i_split` for `Q2`
+
939  const SizeType evecs_row = i_split - ((top_tile) ? 1 : 0);
+
940  const GlobalTileIndex idx_evecs(evecs_row, i);
+
941  const GlobalTileIndex z_idx(i, 0);
942 
-
943 template <class T, class CommSender, class KSender, class RhoSender>
-
944 void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const SizeType i_begin,
-
945  const SizeType i_end, const LocalTileIndex ij_begin_lc,
-
946  const LocalTileSize sz_loc_tiles, KSender&& k, RhoSender&& rho,
-
947  Matrix<const T, Device::CPU>& d, Matrix<T, Device::CPU>& z,
-
948  Matrix<T, Device::CPU>& evals, Matrix<const SizeType, Device::CPU>& i2,
-
949  Matrix<T, Device::CPU>& evecs) {
-
950  namespace ex = pika::execution::experimental;
-
951  namespace di = dlaf::internal;
-
952  namespace tt = pika::this_thread::experimental;
-
953 
-
954  const matrix::Distribution& dist = evecs.distribution();
-
955 
-
956  TileCollector tc{i_begin, i_end};
-
957 
-
958  const SizeType n = problemSize(i_begin, i_end, dist);
-
959 
-
960  const SizeType m_subm_el_lc = [=]() {
-
961  const auto i_loc_begin = ij_begin_lc.row();
-
962  const auto i_loc_end = ij_begin_lc.row() + sz_loc_tiles.rows();
-
963  return dist.localElementDistanceFromLocalTile<Coord::Row>(i_loc_begin, i_loc_end);
-
964  }();
-
965 
-
966  const SizeType n_subm_el_lc = [=]() {
-
967  const auto i_loc_begin = ij_begin_lc.col();
-
968  const auto i_loc_end = ij_begin_lc.col() + sz_loc_tiles.cols();
-
969  return dist.localElementDistanceFromLocalTile<Coord::Col>(i_loc_begin, i_loc_end);
-
970  }();
+
943  // Copy the last row of a `Q1` tile or the first row of a `Q2` tile into a column vector `z` tile
+
944  comm::Index2D evecs_tile_rank = dist.rankGlobalTile(idx_evecs);
+
945  if (evecs_tile_rank == this_rank) {
+
946  // Copy the row into the column vector `z`
+
947  assembleRank1UpdateVectorTileAsync<T, D>(top_tile, rho, evecs.read(idx_evecs), z.readwrite(z_idx));
+
948  ex::start_detached(comm::scheduleSendBcast(full_task_chain(), z.read(z_idx)));
+
949  }
+
950  else {
+
951  const comm::IndexT_MPI root_rank = grid.rankFullCommunicator(evecs_tile_rank);
+
952  ex::start_detached(comm::scheduleRecvBcast(full_task_chain(), root_rank, z.readwrite(z_idx)));
+
953  }
+
954  }
+
955 }
+
956 
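The loop above is an owner-broadcasts pattern: the rank holding the evecs tile fills the corresponding z tile and broadcasts it on the full communicator, while every other rank posts the matching receive. A plain-MPI sketch of the same pattern (hypothetical buffer and datatype; DLA-Future schedules this through senders instead):

#include <mpi.h>

// Sketch only: owner fills `tile`, all ranks of full_comm receive it.
void bcast_z_tile(double* tile, int count, int owner_rank, MPI_Comm full_comm) {
  MPI_Bcast(tile, count, MPI_DOUBLE, owner_rank, full_comm);
}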
+
957 template <class T, class CommSender, class KSender, class RhoSender>
+
958 void solveRank1ProblemDist(CommSender&& row_comm, CommSender&& col_comm, const SizeType i_begin,
+
959  const SizeType i_end, const LocalTileIndex ij_begin_lc,
+
960  const LocalTileSize sz_loc_tiles, KSender&& k, RhoSender&& rho,
+
961  Matrix<const T, Device::CPU>& d, Matrix<T, Device::CPU>& z,
+
962  Matrix<T, Device::CPU>& evals, Matrix<const SizeType, Device::CPU>& i2,
+
963  Matrix<T, Device::CPU>& evecs) {
+
964  namespace ex = pika::execution::experimental;
+
965  namespace di = dlaf::internal;
+
966  namespace tt = pika::this_thread::experimental;
+
967 
+
968  const matrix::Distribution& dist = evecs.distribution();
+
969 
+
970  TileCollector tc{i_begin, i_end};
971 
-
972  auto bcast_evals = [i_begin, i_end,
-
973  dist](common::Pipeline<comm::Communicator>& row_comm_chain,
-
974  const std::vector<matrix::Tile<T, Device::CPU>>& eval_tiles) {
-
975  using dlaf::comm::internal::sendBcast_o;
-
976  using dlaf::comm::internal::recvBcast_o;
-
977 
-
978  const comm::Index2D this_rank = dist.rankIndex();
+
972  const SizeType n = problemSize(i_begin, i_end, dist);
+
973 
+
974  const SizeType m_subm_el_lc = [=]() {
+
975  const auto i_loc_begin = ij_begin_lc.row();
+
976  const auto i_loc_end = ij_begin_lc.row() + sz_loc_tiles.rows();
+
977  return dist.localElementDistanceFromLocalTile<Coord::Row>(i_loc_begin, i_loc_end);
+
978  }();
979 
-
980  std::vector<ex::unique_any_sender<>> comms;
-
981  comms.reserve(to_sizet(i_end - i_begin));
-
982 
-
983  for (SizeType i = i_begin; i < i_end; ++i) {
-
984  const comm::IndexT_MPI evecs_tile_rank = dist.rankGlobalTile<Coord::Col>(i);
-
985  auto& tile = eval_tiles[to_sizet(i - i_begin)];
-
986 
-
987  if (evecs_tile_rank == this_rank.col())
-
988  comms.emplace_back(ex::when_all(row_comm_chain(), ex::just(std::cref(tile))) |
-
989  transformMPI(sendBcast_o));
-
990  else
-
991  comms.emplace_back(ex::when_all(row_comm_chain(), ex::just(evecs_tile_rank, std::cref(tile))) |
-
992  transformMPI(recvBcast_o));
-
993  }
-
994 
-
995  return ex::ensure_started(ex::when_all_vector(std::move(comms)));
-
996  };
-
997 
-
998  auto all_reduce_in_place = [](const dlaf::comm::Communicator& comm, MPI_Op reduce_op, const auto& data,
-
999  MPI_Request* req) {
-
1000  auto msg = comm::make_message(data);
-
1001  DLAF_MPI_CHECK_ERROR(MPI_Iallreduce(MPI_IN_PLACE, msg.data(), msg.count(), msg.mpi_type(), reduce_op,
-
1002  comm, req));
-
1003  };
-
1004 
-
1005  // Note: at least two columns of tiles per worker, in the range [1, getTridiagRank1NWorkers()]
-
1006  const std::size_t nthreads = [nrtiles = sz_loc_tiles.cols()]() {
-
1007  const std::size_t min_workers = 1;
-
1008  const std::size_t available_workers = getTridiagRank1NWorkers();
-
1009  const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(2));
-
1010  return std::clamp(ideal_workers, min_workers, available_workers);
-
1011  }();
-
1012 
-
1013  ex::start_detached(
-
1014  ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads)),
-
1015  std::forward<CommSender>(row_comm), std::forward<CommSender>(col_comm),
-
1016  std::forward<KSender>(k), std::forward<RhoSender>(rho),
-
1017  ex::when_all_vector(tc.read(d)), ex::when_all_vector(tc.readwrite(z)),
-
1018  ex::when_all_vector(tc.readwrite(evals)), ex::when_all_vector(tc.read(i2)),
-
1019  ex::when_all_vector(tc.readwrite(evecs)),
-
1020  // additional workspaces
-
1021  ex::just(std::vector<memory::MemoryView<T, Device::CPU>>()),
-
1022  ex::just(memory::MemoryView<T, Device::CPU>())) |
-
1023  ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
-
1024  ex::bulk(nthreads, [nthreads, n, n_subm_el_lc, m_subm_el_lc, i_begin, ij_begin_lc, sz_loc_tiles,
-
1025  dist, bcast_evals, all_reduce_in_place](
-
1026  const std::size_t thread_idx, auto& barrier_ptr, auto& row_comm_wrapper,
-
1027  auto& col_comm_wrapper, const auto& k, const auto& rho,
-
1028  const auto& d_tiles_futs, auto& z_tiles, const auto& eval_tiles,
-
1029  const auto& i2_tile_arr, const auto& evec_tiles, auto& ws_cols,
-
1030  auto& ws_row) {
-
1031  using dlaf::comm::internal::transformMPI;
-
1032 
-
1033  common::Pipeline<comm::Communicator> row_comm_chain(row_comm_wrapper.get());
-
1034  const dlaf::comm::Communicator& col_comm = col_comm_wrapper.get();
-
1035 
-
1036  const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
-
1037  const std::size_t batch_size =
-
1038  std::max<std::size_t>(2, util::ceilDiv(to_sizet(sz_loc_tiles.cols()), nthreads));
-
1039  const SizeType begin = to_SizeType(thread_idx * batch_size);
-
1040  const SizeType end = std::min(to_SizeType((thread_idx + 1) * batch_size), sz_loc_tiles.cols());
-
1041 
-
1042  // STEP 0a: Fill ones for deflated eigenvectors (single-thread)
-
1043  // Note: this step is completely independent of the rest, but it is small and it is going
-
1044  // to be dropped soon.
-
1045  // Note: use the last thread, which in principle should have less work to do
-
1046  if (thread_idx == nthreads - 1) {
-
1047  // only if there are deflated eigenvectors
-
1048  if (k < n) {
-
1049  const GlobalElementSize origin_el(i_begin * dist.blockSize().rows(),
-
1050  i_begin * dist.blockSize().cols());
-
1051  const SizeType* i2_perm = i2_tile_arr[0].get().ptr();
-
1052 
-
1053  for (SizeType i_subm_el = 0; i_subm_el < n; ++i_subm_el) {
-
1054  const SizeType j_subm_el = i2_perm[i_subm_el];
+
980  const SizeType n_subm_el_lc = [=]() {
+
981  const auto i_loc_begin = ij_begin_lc.col();
+
982  const auto i_loc_end = ij_begin_lc.col() + sz_loc_tiles.cols();
+
983  return dist.localElementDistanceFromLocalTile<Coord::Col>(i_loc_begin, i_loc_end);
+
984  }();
+
985 
+
986  auto bcast_evals = [i_begin, i_end,
+
987  dist](common::Pipeline<comm::Communicator>& row_comm_chain,
+
988  const std::vector<matrix::Tile<T, Device::CPU>>& eval_tiles) {
+
989  using dlaf::comm::internal::sendBcast_o;
+
990  using dlaf::comm::internal::recvBcast_o;
+
991 
+
992  const comm::Index2D this_rank = dist.rankIndex();
+
993 
+
994  std::vector<ex::unique_any_sender<>> comms;
+
995  comms.reserve(to_sizet(i_end - i_begin));
+
996 
+
997  for (SizeType i = i_begin; i < i_end; ++i) {
+
998  const comm::IndexT_MPI evecs_tile_rank = dist.rankGlobalTile<Coord::Col>(i);
+
999  auto& tile = eval_tiles[to_sizet(i - i_begin)];
+
1000 
+
1001  if (evecs_tile_rank == this_rank.col())
+
1002  comms.emplace_back(ex::when_all(row_comm_chain(), ex::just(std::cref(tile))) |
+
1003  transformMPI(sendBcast_o));
+
1004  else
+
1005  comms.emplace_back(ex::when_all(row_comm_chain(), ex::just(evecs_tile_rank, std::cref(tile))) |
+
1006  transformMPI(recvBcast_o));
+
1007  }
+
1008 
+
1009  return ex::ensure_started(ex::when_all_vector(std::move(comms)));
+
1010  };
+
1011 
+
1012  auto all_reduce_in_place = [](const dlaf::comm::Communicator& comm, MPI_Op reduce_op, const auto& data,
+
1013  MPI_Request* req) {
+
1014  auto msg = comm::make_message(data);
+
1015  DLAF_MPI_CHECK_ERROR(MPI_Iallreduce(MPI_IN_PLACE, msg.data(), msg.count(), msg.mpi_type(), reduce_op,
+
1016  comm, req));
+
1017  };
+
1018 
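all_reduce_in_place wraps a nonblocking in-place reduction: each rank contributes its local weights and receives the element-wise result in the same buffer. A minimal standalone analogue (hypothetical buffer and datatype; error handling omitted):

#include <mpi.h>

void allreduce_prod_inplace(double* w, int count, MPI_Comm comm, MPI_Request* req) {
  // With MPI_IN_PLACE the receive buffer doubles as the send buffer.
  MPI_Iallreduce(MPI_IN_PLACE, w, count, MPI_DOUBLE, MPI_PROD, comm, req);
}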
+
1019  // Note: at least two columns of tiles per worker, in the range [1, getTridiagRank1NWorkers()]
+
1020  const std::size_t nthreads = [nrtiles = sz_loc_tiles.cols()]() {
+
1021  const std::size_t min_workers = 1;
+
1022  const std::size_t available_workers = getTridiagRank1NWorkers();
+
1023  const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(2));
+
1024  return std::clamp(ideal_workers, min_workers, available_workers);
+
1025  }();
+
1026 
+
1027  ex::start_detached(
+
1028  ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads)),
+
1029  std::forward<CommSender>(row_comm), std::forward<CommSender>(col_comm),
+
1030  std::forward<KSender>(k), std::forward<RhoSender>(rho),
+
1031  ex::when_all_vector(tc.read(d)), ex::when_all_vector(tc.readwrite(z)),
+
1032  ex::when_all_vector(tc.readwrite(evals)), ex::when_all_vector(tc.read(i2)),
+
1033  ex::when_all_vector(tc.readwrite(evecs)),
+
1034  // additional workspaces
+
1035  ex::just(std::vector<memory::MemoryView<T, Device::CPU>>()),
+
1036  ex::just(memory::MemoryView<T, Device::CPU>())) |
+
1037  ex::transfer(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
+
1038  ex::bulk(nthreads, [nthreads, n, n_subm_el_lc, m_subm_el_lc, i_begin, ij_begin_lc, sz_loc_tiles,
+
1039  dist, bcast_evals, all_reduce_in_place](
+
1040  const std::size_t thread_idx, auto& barrier_ptr, auto& row_comm_wrapper,
+
1041  auto& col_comm_wrapper, const auto& k, const auto& rho,
+
1042  const auto& d_tiles_futs, auto& z_tiles, const auto& eval_tiles,
+
1043  const auto& i2_tile_arr, const auto& evec_tiles, auto& ws_cols,
+
1044  auto& ws_row) {
+
1045  using dlaf::comm::internal::transformMPI;
+
1046 
+
1047  common::Pipeline<comm::Communicator> row_comm_chain(row_comm_wrapper.get());
+
1048  const dlaf::comm::Communicator& col_comm = col_comm_wrapper.get();
+
1049 
+
1050  const auto barrier_busy_wait = getTridiagRank1BarrierBusyWait();
+
1051  const std::size_t batch_size =
+
1052  std::max<std::size_t>(2, util::ceilDiv(to_sizet(sz_loc_tiles.cols()), nthreads));
+
1053  const SizeType begin = to_SizeType(thread_idx * batch_size);
+
1054  const SizeType end = std::min(to_SizeType((thread_idx + 1) * batch_size), sz_loc_tiles.cols());
1055 
-
1056  // if it is a deflated vector
-
1057  if (j_subm_el >= k) {
-
1058  const GlobalElementIndex ij_el(origin_el.rows() + i_subm_el,
-
1059  origin_el.cols() + j_subm_el);
-
1060  const GlobalTileIndex ij = dist.globalTileIndex(ij_el);
-
1061 
-
1062  if (dist.rankIndex() == dist.rankGlobalTile(ij)) {
-
1063  const LocalTileIndex ij_lc = dist.localTileIndex(ij);
-
1064  const SizeType linear_subm_lc =
-
1065  (ij_lc.row() - ij_begin_lc.row()) +
-
1066  (ij_lc.col() - ij_begin_lc.col()) * sz_loc_tiles.rows();
-
1067  const TileElementIndex ij_el_tl = dist.tileElementIndex(ij_el);
-
1068  evec_tiles[to_sizet(linear_subm_lc)](ij_el_tl) = T{1};
-
1069  }
-
1070  }
-
1071  }
-
1072  }
-
1073  }
-
1074 
-
1075  // STEP 0b: Initialize workspaces (single-thread)
-
1076  if (thread_idx == 0) {
-
1077  // Note:
-
1078  // - the first nthreads entries are used for both LAED4 and the weight calculation (one per worker)
-
1079  // - the last entry is used for reducing the weights from all workers
-
1080  ws_cols.reserve(nthreads + 1);
-
1081 
-
1082  // Note:
-
1083  // Considering that
-
1084  // - LAED4 requires working on k elements
-
1085  // - Weight computation requires working on m_subm_el_lc
-
1086  //
-
1087  // and that they are needed at two steps that cannot happen in parallel, we opted to allocate
-
1088  // each workspace with the larger of the two memory requirements and reuse it for both steps.
-
1089  const SizeType max_size = std::max(k, m_subm_el_lc);
-
1090  for (std::size_t i = 0; i < nthreads; ++i)
-
1091  ws_cols.emplace_back(max_size);
-
1092  ws_cols.emplace_back(m_subm_el_lc);
-
1093 
-
1094  ws_row = memory::MemoryView<T, Device::CPU>(n_subm_el_lc);
-
1095  std::fill_n(ws_row(), n_subm_el_lc, 0);
-
1096  }
-
1097 
-
1098  // Note: we have to wait until the LAED4 workspaces are ready to be used
-
1099  barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
1100 
-
1101  const T* d_ptr = d_tiles_futs[0].get().ptr();
-
1102  const T* z_ptr = z_tiles[0].ptr();
-
1103 
-
1104  // STEP 1: LAED4 (multi-thread)
-
1105  {
-
1106  common::internal::SingleThreadedBlasScope single;
+
1056  // STEP 0a: Fill ones for deflated eigenvectors (single-thread)
+
1057  // Note: this step is completely independent of the rest, but it is small and it is going
+
1058  // to be dropped soon.
+
1059  // Note: use the last thread, which in principle should have less work to do
+
1060  if (thread_idx == nthreads - 1) {
+
1061  // only if there are deflated eigenvectors
+
1062  if (k < n) {
+
1063  const GlobalElementSize origin_el(i_begin * dist.blockSize().rows(),
+
1064  i_begin * dist.blockSize().cols());
+
1065  const SizeType* i2_perm = i2_tile_arr[0].get().ptr();
+
1066 
+
1067  for (SizeType i_subm_el = 0; i_subm_el < n; ++i_subm_el) {
+
1068  const SizeType j_subm_el = i2_perm[i_subm_el];
+
1069 
+
1070  // if it is a deflated vector
+
1071  if (j_subm_el >= k) {
+
1072  const GlobalElementIndex ij_el(origin_el.rows() + i_subm_el,
+
1073  origin_el.cols() + j_subm_el);
+
1074  const GlobalTileIndex ij = dist.globalTileIndex(ij_el);
+
1075 
+
1076  if (dist.rankIndex() == dist.rankGlobalTile(ij)) {
+
1077  const LocalTileIndex ij_lc = dist.localTileIndex(ij);
+
1078  const SizeType linear_subm_lc =
+
1079  (ij_lc.row() - ij_begin_lc.row()) +
+
1080  (ij_lc.col() - ij_begin_lc.col()) * sz_loc_tiles.rows();
+
1081  const TileElementIndex ij_el_tl = dist.tileElementIndex(ij_el);
+
1082  evec_tiles[to_sizet(linear_subm_lc)](ij_el_tl) = T{1};
+
1083  }
+
1084  }
+
1085  }
+
1086  }
+
1087  }
+
1088 
+
1089  // STEP 0b: Initialize workspaces (single-thread)
+
1090  if (thread_idx == 0) {
+
1091  // Note:
+
1092  // - the first nthreads entries are used for both LAED4 and the weight calculation (one per worker)
+
1093  // - the last entry is used for reducing the weights from all workers
+
1094  ws_cols.reserve(nthreads + 1);
+
1095 
+
1096  // Note:
+
1097  // Considering that
+
1098  // - LAED4 requires working on k elements
+
1099  // - Weight computation requires working on m_subm_el_lc
+
1100  //
+
1101  // and that they are needed at two steps that cannot happen in parallel, we opted to allocate
+
1102  // each workspace with the larger of the two memory requirements and reuse it for both steps.
+
1103  const SizeType max_size = std::max(k, m_subm_el_lc);
+
1104  for (std::size_t i = 0; i < nthreads; ++i)
+
1105  ws_cols.emplace_back(max_size);
+
1106  ws_cols.emplace_back(m_subm_el_lc);
1107 
-
1108  T* eval_ptr = eval_tiles[0].ptr();
-
1109  T* delta_ptr = ws_cols[thread_idx]();
-
1110 
-
1111  for (SizeType j_subm_lc = begin; j_subm_lc < end; ++j_subm_lc) {
-
1112  const SizeType j_lc = ij_begin_lc.col() + to_SizeType(j_subm_lc);
-
1113  const SizeType j = dist.globalTileFromLocalTile<Coord::Col>(j_lc);
-
1114  const SizeType n_subm_el = dist.globalTileElementDistance<Coord::Col>(i_begin, j);
-
1115 
-
1116  // Skip columns that are in the deflation zone
-
1117  if (n_subm_el >= k)
-
1118  break;
-
1119 
-
1120  const SizeType n_el_tl = std::min(dist.tileSize<Coord::Col>(j), k - n_subm_el);
-
1121  for (SizeType j_el_tl = 0; j_el_tl < n_el_tl; ++j_el_tl) {
-
1122  const SizeType j_el = n_subm_el + j_el_tl;
-
1123 
-
1124  // Solve the deflated rank-1 problem
-
1125  T& eigenval = eval_ptr[to_sizet(j_el)];
-
1126  lapack::laed4(to_int(k), to_int(j_el), d_ptr, z_ptr, delta_ptr, rho, &eigenval);
-
1127 
-
1128  // copy the parts from delta stored on this rank
-
1129  for (SizeType i_subm_lc = 0; i_subm_lc < sz_loc_tiles.rows(); ++i_subm_lc) {
-
1130  const SizeType linear_subm_lc = i_subm_lc + to_SizeType(j_subm_lc) * sz_loc_tiles.rows();
-
1131  auto& evec_tile = evec_tiles[to_sizet(linear_subm_lc)];
-
1132 
-
1133  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
-
1134  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
-
1135  const SizeType m_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i);
-
1136 
-
1137  const SizeType i_subm = i - i_begin;
-
1138  const auto& i2_perm = i2_tile_arr[to_sizet(i_subm)].get();
-
1139 
-
1140  const SizeType m_el_tl = std::min(dist.tileSize<Coord::Row>(i), n - m_subm_el);
-
1141  for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
-
1142  const SizeType jj_subm_el = i2_perm({i_el_tl, 0});
-
1143  if (jj_subm_el < k)
-
1144  evec_tile({i_el_tl, j_el_tl}) = delta_ptr[jj_subm_el];
-
1145  }
-
1146  }
-
1147  }
-
1148  }
-
1149  }
+
1108  ws_row = memory::MemoryView<T, Device::CPU>(n_subm_el_lc);
+
1109  std::fill_n(ws_row(), n_subm_el_lc, 0);
+
1110  }
+
1111 
+
1112  // Note: we have to wait until the LAED4 workspaces are ready to be used
+
1113  barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
1114 
+
1115  const T* d_ptr = d_tiles_futs[0].get().ptr();
+
1116  const T* z_ptr = z_tiles[0].ptr();
+
1117 
+
1118  // STEP 1: LAED4 (multi-thread)
+
1119  {
+
1120  common::internal::SingleThreadedBlasScope single;
+
1121 
+
1122  T* eval_ptr = eval_tiles[0].ptr();
+
1123  T* delta_ptr = ws_cols[thread_idx]();
+
1124 
+
1125  for (SizeType j_subm_lc = begin; j_subm_lc < end; ++j_subm_lc) {
+
1126  const SizeType j_lc = ij_begin_lc.col() + to_SizeType(j_subm_lc);
+
1127  const SizeType j = dist.globalTileFromLocalTile<Coord::Col>(j_lc);
+
1128  const SizeType n_subm_el = dist.globalTileElementDistance<Coord::Col>(i_begin, j);
+
1129 
+
1130  // Skip columns that are in the deflation zone
+
1131  if (n_subm_el >= k)
+
1132  break;
+
1133 
+
1134  const SizeType n_el_tl = std::min(dist.tileSize<Coord::Col>(j), k - n_subm_el);
+
1135  for (SizeType j_el_tl = 0; j_el_tl < n_el_tl; ++j_el_tl) {
+
1136  const SizeType j_el = n_subm_el + j_el_tl;
+
1137 
+
1138  // Solve the deflated rank-1 problem
+
1139  T& eigenval = eval_ptr[to_sizet(j_el)];
+
1140  lapack::laed4(to_int(k), to_int(j_el), d_ptr, z_ptr, delta_ptr, rho, &eigenval);
+
1141 
+
1142  // copy the parts from delta stored on this rank
+
1143  for (SizeType i_subm_lc = 0; i_subm_lc < sz_loc_tiles.rows(); ++i_subm_lc) {
+
1144  const SizeType linear_subm_lc = i_subm_lc + to_SizeType(j_subm_lc) * sz_loc_tiles.rows();
+
1145  auto& evec_tile = evec_tiles[to_sizet(linear_subm_lc)];
+
1146 
+
1147  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
+
1148  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
+
1149  const SizeType m_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i);
1150 
-
1151  // Note: This barrier ensures that LAED4 finished, so from now on values are available
-
1152  barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
1151  const SizeType i_subm = i - i_begin;
+
1152  const auto& i2_perm = i2_tile_arr[to_sizet(i_subm)].get();
1153 
-
1154  // STEP 2: Broadcast evals
-
1155 
-
1156  // Note: this ensures that the evals broadcast finishes before the bulk scope releases its resources
-
1157  struct sync_wait_on_exit_t {
-
1158  ex::unique_any_sender<> sender_;
-
1159 
-
1160  ~sync_wait_on_exit_t() {
-
1161  if (sender_)
-
1162  tt::sync_wait(std::move(sender_));
-
1163  }
-
1164  } bcast_barrier;
-
1165 
-
1166  if (thread_idx == 0)
-
1167  bcast_barrier.sender_ = bcast_evals(row_comm_chain, eval_tiles);
-
1168 
-
1169  // Note: laed4 handles k <= 2 cases differently
-
1170  if (k <= 2)
-
1171  return;
-
1172 
-
1173  // STEP 2: Compute weights (multi-thread)
-
1174  auto& q = evec_tiles;
-
1175  T* w = ws_cols[thread_idx]();
-
1176 
-
1177  // STEP 2a: copy diagonal from q -> w (or just initialize with 1)
-
1178  if (thread_idx == 0) {
-
1179  for (SizeType i_subm_lc = 0; i_subm_lc < sz_loc_tiles.rows(); ++i_subm_lc) {
-
1180  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
-
1181  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
-
1182  const SizeType i_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i);
-
1183  const SizeType m_subm_el_lc =
-
1184  dist.localElementDistanceFromLocalTile<Coord::Row>(ij_begin_lc.row(), i_lc);
-
1185  const auto& i2 = i2_tile_arr[to_sizet(i - i_begin)].get();
+
1154  const SizeType m_el_tl = std::min(dist.tileSize<Coord::Row>(i), n - m_subm_el);
+
1155  for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
+
1156  const SizeType jj_subm_el = i2_perm({i_el_tl, 0});
+
1157  if (jj_subm_el < k)
+
1158  evec_tile({i_el_tl, j_el_tl}) = delta_ptr[jj_subm_el];
+
1159  }
+
1160  }
+
1161  }
+
1162  }
+
1163  }
+
1164 
+
1165  // Note: This barrier ensures that LAED4 finished, so from now on values are available
+
1166  barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
1167 
+
1168  // STEP 2: Broadcast evals
+
1169 
+
1170  // Note: this ensures that the evals broadcast finishes before the bulk scope releases its resources
+
1171  struct sync_wait_on_exit_t {
+
1172  ex::unique_any_sender<> sender_;
+
1173 
+
1174  ~sync_wait_on_exit_t() {
+
1175  if (sender_)
+
1176  tt::sync_wait(std::move(sender_));
+
1177  }
+
1178  } bcast_barrier;
+
1179 
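sync_wait_on_exit_t is an RAII guard: whichever path leaves the bulk callable (including the early k <= 2 return below), its destructor drains the pending broadcast, so the bulk scope cannot release its resources while the communication is still in flight. The same idiom with standard futures, as an analogy rather than the library's API:

#include <future>

struct wait_on_exit {
  std::future<void> pending;
  ~wait_on_exit() {
    if (pending.valid())
      pending.wait();  // block scope exit until the async work completes
  }
};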
+
1180  if (thread_idx == 0)
+
1181  bcast_barrier.sender_ = bcast_evals(row_comm_chain, eval_tiles);
+
1182 
+
1183  // Note: laed4 handles k <= 2 cases differently
+
1184  if (k <= 2)
+
1185  return;
1186 
-
1187  const SizeType m_el_tl = std::min(dist.tileSize<Coord::Row>(i), n - i_subm_el);
-
1188  for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
-
1189  const SizeType i_subm_el_lc = m_subm_el_lc + i_el_tl;
+
1187  // STEP 2: Compute weights (multi-thread)
+
1188  auto& q = evec_tiles;
+
1189  T* w = ws_cols[thread_idx]();
1190 
-
1191  const SizeType jj_subm_el = i2({i_el_tl, 0});
-
1192  const SizeType n_el = dist.globalTileElementDistance<Coord::Col>(0, i_begin);
-
1193  const SizeType jj_el = n_el + jj_subm_el;
-
1194  const SizeType jj = dist.globalTileFromGlobalElement<Coord::Col>(jj_el);
-
1195 
-
1196  if (dist.rankGlobalTile<Coord::Col>(jj) == dist.rankIndex().col()) {
-
1197  const SizeType jj_lc = dist.localTileFromGlobalTile<Coord::Col>(jj);
-
1198  const SizeType jj_subm_lc = jj_lc - ij_begin_lc.col();
-
1199  const SizeType jj_el_tl = dist.tileElementFromGlobalElement<Coord::Col>(jj_el);
+
1191  // STEP 2a: copy diagonal from q -> w (or just initialize with 1)
+
1192  if (thread_idx == 0) {
+
1193  for (SizeType i_subm_lc = 0; i_subm_lc < sz_loc_tiles.rows(); ++i_subm_lc) {
+
1194  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
+
1195  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
+
1196  const SizeType i_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i);
+
1197  const SizeType m_subm_el_lc =
+
1198  dist.localElementDistanceFromLocalTile<Coord::Row>(ij_begin_lc.row(), i_lc);
+
1199  const auto& i2 = i2_tile_arr[to_sizet(i - i_begin)].get();
1200 
-
1201  const SizeType linear_subm_lc = i_subm_lc + sz_loc_tiles.rows() * jj_subm_lc;
-
1202 
-
1203  w[i_subm_el_lc] = q[to_sizet(linear_subm_lc)]({i_el_tl, jj_el_tl});
-
1204  }
-
1205  else {
-
1206  w[i_subm_el_lc] = T(1);
-
1207  }
-
1208  }
-
1209  }
-
1210  }
-
1211  else { // other workers
-
1212  std::fill_n(w, m_subm_el_lc, T(1));
-
1213  }
+
1201  const SizeType m_el_tl = std::min(dist.tileSize<Coord::Row>(i), n - i_subm_el);
+
1202  for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
+
1203  const SizeType i_subm_el_lc = m_subm_el_lc + i_el_tl;
+
1204 
+
1205  const SizeType jj_subm_el = i2({i_el_tl, 0});
+
1206  const SizeType n_el = dist.globalTileElementDistance<Coord::Col>(0, i_begin);
+
1207  const SizeType jj_el = n_el + jj_subm_el;
+
1208  const SizeType jj = dist.globalTileFromGlobalElement<Coord::Col>(jj_el);
+
1209 
+
1210  if (dist.rankGlobalTile<Coord::Col>(jj) == dist.rankIndex().col()) {
+
1211  const SizeType jj_lc = dist.localTileFromGlobalTile<Coord::Col>(jj);
+
1212  const SizeType jj_subm_lc = jj_lc - ij_begin_lc.col();
+
1213  const SizeType jj_el_tl = dist.tileElementFromGlobalElement<Coord::Col>(jj_el);
1214 
-
1215  barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
1215  const SizeType linear_subm_lc = i_subm_lc + sz_loc_tiles.rows() * jj_subm_lc;
1216 
-
1217  // STEP 2b: compute weights
-
1218  for (SizeType j_subm_lc = begin; j_subm_lc < end; ++j_subm_lc) {
-
1219  const SizeType j_lc = ij_begin_lc.col() + to_SizeType(j_subm_lc);
-
1220  const SizeType j = dist.globalTileFromLocalTile<Coord::Col>(j_lc);
-
1221  const SizeType n_subm_el = dist.globalTileElementDistance<Coord::Col>(i_begin, j);
-
1222 
-
1223  // Skip columns that are in the deflation zone
-
1224  if (n_subm_el >= k)
-
1225  break;
-
1226 
-
1227  const SizeType n_el_tl = std::min(dist.tileSize<Coord::Col>(j), k - n_subm_el);
-
1228  for (SizeType j_el_tl = 0; j_el_tl < n_el_tl; ++j_el_tl) {
-
1229  const SizeType j_subm_el = n_subm_el + j_el_tl;
-
1230  for (SizeType i_subm_lc = 0; i_subm_lc < sz_loc_tiles.rows(); ++i_subm_lc) {
-
1231  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
-
1232  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
-
1233  const SizeType m_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i);
-
1234 
-
1235  auto& i2_perm = i2_tile_arr[to_sizet(i - i_begin)].get();
+
1217  w[i_subm_el_lc] = q[to_sizet(linear_subm_lc)]({i_el_tl, jj_el_tl});
+
1218  }
+
1219  else {
+
1220  w[i_subm_el_lc] = T(1);
+
1221  }
+
1222  }
+
1223  }
+
1224  }
+
1225  else { // other workers
+
1226  std::fill_n(w, m_subm_el_lc, T(1));
+
1227  }
+
1228 
+
1229  barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
1230 
+
1231  // STEP 2b: compute weights
+
1232  for (SizeType j_subm_lc = begin; j_subm_lc < end; ++j_subm_lc) {
+
1233  const SizeType j_lc = ij_begin_lc.col() + to_SizeType(j_subm_lc);
+
1234  const SizeType j = dist.globalTileFromLocalTile<Coord::Col>(j_lc);
+
1235  const SizeType n_subm_el = dist.globalTileElementDistance<Coord::Col>(i_begin, j);
1236 
-
1237  const SizeType m_el_tl = std::min(dist.tileSize<Coord::Row>(i), n - m_subm_el);
-
1238  for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
-
1239  const SizeType ii_subm_el = i2_perm({i_el_tl, 0});
+
1237  // Skip columns that are in the deflation zone
+
1238  if (n_subm_el >= k)
+
1239  break;
1240 
-
1241  // deflated zone
-
1242  if (ii_subm_el >= k)
-
1243  continue;
-
1244 
-
1245  // diagonal
-
1246  if (ii_subm_el == j_subm_el)
-
1247  continue;
+
1241  const SizeType n_el_tl = std::min(dist.tileSize<Coord::Col>(j), k - n_subm_el);
+
1242  for (SizeType j_el_tl = 0; j_el_tl < n_el_tl; ++j_el_tl) {
+
1243  const SizeType j_subm_el = n_subm_el + j_el_tl;
+
1244  for (SizeType i_subm_lc = 0; i_subm_lc < sz_loc_tiles.rows(); ++i_subm_lc) {
+
1245  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
+
1246  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
+
1247  const SizeType m_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i);
1248 
-
1249  const SizeType linear_subm_lc = i_subm_lc + sz_loc_tiles.rows() * j_subm_lc;
-
1250  const SizeType i_subm_el_lc = i_subm_lc * dist.blockSize().rows() + i_el_tl;
-
1251 
-
1252  w[i_subm_el_lc] *= q[to_sizet(linear_subm_lc)]({i_el_tl, j_el_tl}) /
-
1253  (d_ptr[to_sizet(ii_subm_el)] - d_ptr[to_sizet(j_subm_el)]);
-
1254  }
-
1255  }
-
1256  }
-
1257  }
+
1249  auto& i2_perm = i2_tile_arr[to_sizet(i - i_begin)].get();
+
1250 
+
1251  const SizeType m_el_tl = std::min(dist.tileSize<Coord::Row>(i), n - m_subm_el);
+
1252  for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
+
1253  const SizeType ii_subm_el = i2_perm({i_el_tl, 0});
+
1254 
+
1255  // deflated zone
+
1256  if (ii_subm_el >= k)
+
1257  continue;
1258 
-
1259  barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
1260 
-
1261  // STEP 2c: reduce, then finalize computation with sign and square root (single-thread)
-
1262  if (thread_idx == 0) {
-
1263  // local reduction from all bulk workers
-
1264  for (SizeType i = 0; i < m_subm_el_lc; ++i) {
-
1265  for (std::size_t tidx = 1; tidx < nthreads; ++tidx) {
-
1266  const T* w_partial = ws_cols[tidx]();
-
1267  w[i] *= w_partial[i];
-
1268  }
-
1269  }
-
1270 
-
1271  tt::sync_wait(ex::when_all(row_comm_chain(),
-
1272  ex::just(MPI_PROD, common::make_data(w, m_subm_el_lc))) |
-
1273  transformMPI(all_reduce_in_place));
+
1259  // diagonal
+
1260  if (ii_subm_el == j_subm_el)
+
1261  continue;
+
1262 
+
1263  const SizeType linear_subm_lc = i_subm_lc + sz_loc_tiles.rows() * j_subm_lc;
+
1264  const SizeType i_subm_el_lc = i_subm_lc * dist.blockSize().rows() + i_el_tl;
+
1265 
+
1266  w[i_subm_el_lc] *= q[to_sizet(linear_subm_lc)]({i_el_tl, j_el_tl}) /
+
1267  (d_ptr[to_sizet(ii_subm_el)] - d_ptr[to_sizet(j_subm_el)]);
+
1268  }
+
1269  }
+
1270  }
+
1271  }
+
1272 
+
1273  barrier_ptr->arrive_and_wait(barrier_busy_wait);
1274 
-1275  T* weights = ws_cols[nthreads]();
-1276  for (SizeType i_subm_el_lc = 0; i_subm_el_lc < m_subm_el_lc; ++i_subm_el_lc) {
-1277  const SizeType i_subm_lc = i_subm_el_lc / dist.blockSize().rows();
-1278  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
-1279  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
-1280  const SizeType i_subm = i - i_begin;
-1281  const SizeType i_subm_el =
-1282  i_subm * dist.blockSize().rows() + i_subm_el_lc % dist.blockSize().rows();
-1283 
-1284  const auto* i2_perm = i2_tile_arr[0].get().ptr();
-1285  const SizeType ii_subm_el = i2_perm[i_subm_el];
-1286  weights[to_sizet(i_subm_el_lc)] =
-1287  std::copysign(std::sqrt(-w[i_subm_el_lc]), z_ptr[to_sizet(ii_subm_el)]);
-1288  }
-1289  }
-1290 
-1291  barrier_ptr->arrive_and_wait(barrier_busy_wait);
-1292 
-1293  // STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread)
-1294 
-1295  // STEP 3a: Form evecs using weights vector and compute (local) sum of squares
-1296  {
-1297  common::internal::SingleThreadedBlasScope single;
-1298 
-1299  const T* w = ws_cols[nthreads]();
-1300  T* sum_squares = ws_row();
-1301 
-1302  for (SizeType j_subm_lc = begin; j_subm_lc < end; ++j_subm_lc) {
-1303  const SizeType j_lc = ij_begin_lc.col() + to_SizeType(j_subm_lc);
-1304  const SizeType j = dist.globalTileFromLocalTile<Coord::Col>(j_lc);
-1305  const SizeType n_subm_el = dist.globalTileElementDistance<Coord::Col>(i_begin, j);
+1275  // STEP 2c: reduce, then finalize computation with sign and square root (single-thread)
+1276  if (thread_idx == 0) {
+1277  // local reduction from all bulk workers
+1278  for (SizeType i = 0; i < m_subm_el_lc; ++i) {
+1279  for (std::size_t tidx = 1; tidx < nthreads; ++tidx) {
+1280  const T* w_partial = ws_cols[tidx]();
+1281  w[i] *= w_partial[i];
+1282  }
+1283  }
+1284 
+1285  tt::sync_wait(ex::when_all(row_comm_chain(),
+1286  ex::just(MPI_PROD, common::make_data(w, m_subm_el_lc))) |
+1287  transformMPI(all_reduce_in_place));
+1288 
+1289  T* weights = ws_cols[nthreads]();
+1290  for (SizeType i_subm_el_lc = 0; i_subm_el_lc < m_subm_el_lc; ++i_subm_el_lc) {
+1291  const SizeType i_subm_lc = i_subm_el_lc / dist.blockSize().rows();
+1292  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
+1293  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
+1294  const SizeType i_subm = i - i_begin;
+1295  const SizeType i_subm_el =
+1296  i_subm * dist.blockSize().rows() + i_subm_el_lc % dist.blockSize().rows();
+1297 
+1298  const auto* i2_perm = i2_tile_arr[0].get().ptr();
+1299  const SizeType ii_subm_el = i2_perm[i_subm_el];
+1300  weights[to_sizet(i_subm_el_lc)] =
+1301  std::copysign(std::sqrt(-w[i_subm_el_lc]), z_ptr[to_sizet(ii_subm_el)]);
+1302  }
+1303  }
+1304 
+1305  barrier_ptr->arrive_and_wait(barrier_busy_wait);
1306 
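The MPI_PROD reduction in STEP 2c first combines the per-thread partial products locally and then multiplies the contributions of all ranks in the row communicator. Assuming the `transformMPI(all_reduce_in_place)` adaptor boils down to an in-place all-reduce, the equivalent plain-MPI call is the following sketch (the same pattern reappears with MPI_SUM in STEP 3b below):

  #include <mpi.h>
  #include <vector>

  // In-place element-wise product across all ranks of a communicator: after
  // the call every rank's w[i] holds the product of w[i] over all ranks.
  void allReduceProdInPlace(std::vector<double>& w, MPI_Comm row_comm) {
    MPI_Allreduce(MPI_IN_PLACE, w.data(), static_cast<int>(w.size()),
                  MPI_DOUBLE, MPI_PROD, row_comm);
  }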
-1307  // Skip columns that are in the deflation zone
-1308  if (n_subm_el >= k)
-1309  break;
-1310 
-1311  const SizeType n_el_tl = std::min(dist.tileSize<Coord::Col>(j), k - n_subm_el);
-1312  for (SizeType j_el_tl = 0; j_el_tl < n_el_tl; ++j_el_tl) {
-1313  const SizeType j_subm_el_lc = j_subm_lc * dist.blockSize().cols() + j_el_tl;
-1314  for (SizeType i_subm_lc = 0; i_subm_lc < sz_loc_tiles.rows(); ++i_subm_lc) {
-1315  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
-1316  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
-1317  const SizeType m_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i);
-1318 
-1319  const SizeType i_subm = i - i_begin;
-1320  const auto& i2_perm = i2_tile_arr[to_sizet(i_subm)].get();
-1321 
-1322  const SizeType linear_subm_lc = i_subm_lc + sz_loc_tiles.rows() * j_subm_lc;
-1323  const auto& q_tile = q[to_sizet(linear_subm_lc)];
+1307  // STEP 3: Compute eigenvectors of the modified rank-1 modification (normalize) (multi-thread)
+1308 
+1309  // STEP 3a: Form evecs using weights vector and compute (local) sum of squares
+1310  {
+1311  common::internal::SingleThreadedBlasScope single;
+1312 
+1313  const T* w = ws_cols[nthreads]();
+1314  T* sum_squares = ws_row();
+1315 
+1316  for (SizeType j_subm_lc = begin; j_subm_lc < end; ++j_subm_lc) {
+1317  const SizeType j_lc = ij_begin_lc.col() + to_SizeType(j_subm_lc);
+1318  const SizeType j = dist.globalTileFromLocalTile<Coord::Col>(j_lc);
+1319  const SizeType n_subm_el = dist.globalTileElementDistance<Coord::Col>(i_begin, j);
+1320 
+1321  // Skip columns that are in the deflation zone
+1322  if (n_subm_el >= k)
+1323  break;
1324 
-1325  const SizeType m_el_tl = std::min(dist.tileSize<Coord::Row>(i), n - m_subm_el);
-1326  for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
-1327  const SizeType ii_subm_el = i2_perm({i_el_tl, 0});
-1328 
-1329  const SizeType i_subm_el_lc = i_subm_lc * dist.blockSize().rows() + i_el_tl;
-1330  if (ii_subm_el >= k)
-1331  q_tile({i_el_tl, j_el_tl}) = 0;
-1332  else
-1333  q_tile({i_el_tl, j_el_tl}) = w[i_subm_el_lc] / q_tile({i_el_tl, j_el_tl});
-1334  }
+1325  const SizeType n_el_tl = std::min(dist.tileSize<Coord::Col>(j), k - n_subm_el);
+1326  for (SizeType j_el_tl = 0; j_el_tl < n_el_tl; ++j_el_tl) {
+1327  const SizeType j_subm_el_lc = j_subm_lc * dist.blockSize().cols() + j_el_tl;
+1328  for (SizeType i_subm_lc = 0; i_subm_lc < sz_loc_tiles.rows(); ++i_subm_lc) {
+1329  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
+1330  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
+1331  const SizeType m_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i);
+1332 
+1333  const SizeType i_subm = i - i_begin;
+1334  const auto& i2_perm = i2_tile_arr[to_sizet(i_subm)].get();
1335 
-1336  sum_squares[j_subm_el_lc] +=
-1337  blas::dot(m_el_tl, q_tile.ptr({0, j_el_tl}), 1, q_tile.ptr({0, j_el_tl}), 1);
-1338  }
-1339  }
-1340  }
-1341  }
+1336  const SizeType linear_subm_lc = i_subm_lc + sz_loc_tiles.rows() * j_subm_lc;
+1337  const auto& q_tile = q[to_sizet(linear_subm_lc)];
+1338 
+1339  const SizeType m_el_tl = std::min(dist.tileSize<Coord::Row>(i), n - m_subm_el);
+1340  for (SizeType i_el_tl = 0; i_el_tl < m_el_tl; ++i_el_tl) {
+1341  const SizeType ii_subm_el = i2_perm({i_el_tl, 0});
1342 
-1343  barrier_ptr->arrive_and_wait(barrier_busy_wait);
-1344 
-1345  // STEP 3b: Reduce to get the sum of all squares on all ranks
-1346  if (thread_idx == 0)
-1347  tt::sync_wait(ex::just(std::cref(col_comm), MPI_SUM,
-1348  common::make_data(ws_row(), n_subm_el_lc)) |
-1349  transformMPI(all_reduce_in_place));
-1350 
-1351  barrier_ptr->arrive_and_wait(barrier_busy_wait);
-1352 
-1353  // STEP 3c: Normalize (compute norm of each column and scale column vector)
-1354  {
-1355  common::internal::SingleThreadedBlasScope single;
+1343  const SizeType i_subm_el_lc = i_subm_lc * dist.blockSize().rows() + i_el_tl;
+1344  if (ii_subm_el >= k)
+1345  q_tile({i_el_tl, j_el_tl}) = 0;
+1346  else
+1347  q_tile({i_el_tl, j_el_tl}) = w[i_subm_el_lc] / q_tile({i_el_tl, j_el_tl});
+1348  }
+1349 
+1350  sum_squares[j_subm_el_lc] +=
+1351  blas::dot(m_el_tl, q_tile.ptr({0, j_el_tl}), 1, q_tile.ptr({0, j_el_tl}), 1);
+1352  }
+1353  }
+1354  }
+1355  }
1356 
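STEP 3a overwrites each stored gap with the corresponding eigenvector entry, u(i, j) = z~_i / (d_i - lambda_j), zeroes deflated rows, and accumulates per-column sums of squares for the normalization that follows. A serial sketch of the non-deflated part (same illustrative `delta` layout as in the earlier sketch; the real loop above additionally zeroes rows whose permuted index is >= k):

  #include <cstddef>
  #include <vector>

  // Overwrite delta with the unnormalized eigenvector matrix U and return the
  // per-column sums of squares: U(i, j) = zt[i] / (d[i] - lambda[j]).
  template <class T>
  std::vector<T> formEvecs(std::size_t k, std::vector<T>& delta,
                           const std::vector<T>& zt) {
    std::vector<T> sum_squares(k, T(0));
    for (std::size_t j = 0; j < k; ++j) {
      for (std::size_t i = 0; i < k; ++i) {
        T& u = delta[i + k * j];
        u = zt[i] / u;            // in place, mirroring the reuse of the q tiles
        sum_squares[j] += u * u;  // local part of ||U(:, j)||^2
      }
    }
    return sum_squares;
  }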
-1357  const T* sum_squares = ws_row();
+1357  barrier_ptr->arrive_and_wait(barrier_busy_wait);
1358 
-1359  for (SizeType j_subm_lc = begin; j_subm_lc < end; ++j_subm_lc) {
-1360  const SizeType j_lc = ij_begin_lc.col() + to_SizeType(j_subm_lc);
-1361  const SizeType j = dist.globalTileFromLocalTile<Coord::Col>(j_lc);
-1362  const SizeType n_subm_el = dist.globalTileElementDistance<Coord::Col>(i_begin, j);
-1363 
-1364  // Skip columns that are in the deflation zone
-1365  if (n_subm_el >= k)
-1366  break;
-1367 
-1368  const SizeType n_el_tl = std::min(dist.tileSize<Coord::Col>(j), k - n_subm_el);
-1369  for (SizeType j_el_tl = 0; j_el_tl < n_el_tl; ++j_el_tl) {
-1370  const SizeType j_subm_el_lc = j_subm_lc * dist.blockSize().cols() + j_el_tl;
-1371  const T vec_norm = std::sqrt(sum_squares[j_subm_el_lc]);
+1359  // STEP 3b: Reduce to get the sum of all squares on all ranks
+1360  if (thread_idx == 0)
+1361  tt::sync_wait(ex::just(std::cref(col_comm), MPI_SUM,
+1362  common::make_data(ws_row(), n_subm_el_lc)) |
+1363  transformMPI(all_reduce_in_place));
+1364 
+1365  barrier_ptr->arrive_and_wait(barrier_busy_wait);
+1366 
+1367  // STEP 3c: Normalize (compute norm of each column and scale column vector)
+1368  {
+1369  common::internal::SingleThreadedBlasScope single;
+1370 
+1371  const T* sum_squares = ws_row();
1372 
-1373  for (SizeType i_subm_lc = 0; i_subm_lc < sz_loc_tiles.rows(); ++i_subm_lc) {
-1374  const SizeType linear_subm_lc = i_subm_lc + sz_loc_tiles.rows() * j_subm_lc;
-1375  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
-1376  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
-1377  const SizeType m_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i);
-1378 
-1379  const SizeType m_el_tl = std::min(dist.tileSize<Coord::Row>(i), n - m_subm_el);
-1380  blas::scal(m_el_tl, 1 / vec_norm, q[to_sizet(linear_subm_lc)].ptr({0, j_el_tl}), 1);
-1381  }
-1382  }
-1383  }
-1384  }
-1385  }));
-1386 }
-1387 
-1388 // Distributed version of the tridiagonal solver on CPUs
-1389 template <Backend B, class T, Device D, class RhoSender>
-1390 void mergeDistSubproblems(comm::CommunicatorGrid grid,
-1391  common::Pipeline<comm::Communicator>& full_task_chain,
-1392  common::Pipeline<comm::Communicator>& row_task_chain,
-1393  common::Pipeline<comm::Communicator>& col_task_chain, const SizeType i_begin,
-1394  const SizeType i_split, const SizeType i_end, RhoSender&& rho,
-1395  WorkSpace<T, D>& ws, WorkSpaceHost<T>& ws_h,
-1396  DistWorkSpaceHostMirror<T, D>& ws_hm) {
-1397  namespace ex = pika::execution::experimental;
-1398 
-1399  const matrix::Distribution& dist_evecs = ws.e0.distribution();
-1400 
-1401  // Calculate the size of the upper subproblem
-1402  const SizeType n1 = dist_evecs.globalTileElementDistance<Coord::Row>(i_begin, i_split);
-1403 
-1404  // The local size of the subproblem
-1405  const GlobalTileIndex idx_gl_begin(i_begin, i_begin);
-1406  const LocalTileIndex idx_loc_begin{dist_evecs.nextLocalTileFromGlobalTile<Coord::Row>(i_begin),
-1407  dist_evecs.nextLocalTileFromGlobalTile<Coord::Col>(i_begin)};
-1408  const LocalTileIndex idx_loc_end{dist_evecs.nextLocalTileFromGlobalTile<Coord::Row>(i_end),
-1409  dist_evecs.nextLocalTileFromGlobalTile<Coord::Col>(i_end)};
-1410  const LocalTileSize sz_loc_tiles = idx_loc_end - idx_loc_begin;
-1411  const LocalTileIndex idx_begin_tiles_vec(i_begin, 0);
-1412  const LocalTileSize sz_tiles_vec(i_end - i_begin, 1);
-1413 
-1414  // Assemble the rank-1 update vector `z` from the last row of Q1 and the first row of Q2
-1415  assembleDistZVec(grid, full_task_chain, i_begin, i_split, i_end, rho, ws.e0, ws.z0);
-1416  copy(idx_begin_tiles_vec, sz_tiles_vec, ws.z0, ws_hm.z0);
+1373  for (SizeType j_subm_lc = begin; j_subm_lc < end; ++j_subm_lc) {
+1374  const SizeType j_lc = ij_begin_lc.col() + to_SizeType(j_subm_lc);
+1375  const SizeType j = dist.globalTileFromLocalTile<Coord::Col>(j_lc);
+1376  const SizeType n_subm_el = dist.globalTileElementDistance<Coord::Col>(i_begin, j);
+1377 
+1378  // Skip columns that are in the deflation zone
+1379  if (n_subm_el >= k)
+1380  break;
+1381 
+1382  const SizeType n_el_tl = std::min(dist.tileSize<Coord::Col>(j), k - n_subm_el);
+1383  for (SizeType j_el_tl = 0; j_el_tl < n_el_tl; ++j_el_tl) {
+1384  const SizeType j_subm_el_lc = j_subm_lc * dist.blockSize().cols() + j_el_tl;
+1385  const T vec_norm = std::sqrt(sum_squares[j_subm_el_lc]);
+1386 
+1387  for (SizeType i_subm_lc = 0; i_subm_lc < sz_loc_tiles.rows(); ++i_subm_lc) {
+1388  const SizeType linear_subm_lc = i_subm_lc + sz_loc_tiles.rows() * j_subm_lc;
+1389  const SizeType i_lc = ij_begin_lc.row() + i_subm_lc;
+1390  const SizeType i = dist.globalTileFromLocalTile<Coord::Row>(i_lc);
+1391  const SizeType m_subm_el = dist.globalTileElementDistance<Coord::Row>(i_begin, i);
+1392 
+1393  const SizeType m_el_tl = std::min(dist.tileSize<Coord::Row>(i), n - m_subm_el);
+1394  blas::scal(m_el_tl, 1 / vec_norm, q[to_sizet(linear_subm_lc)].ptr({0, j_el_tl}), 1);
+1395  }
+1396  }
+1397  }
+1398  }
+1399  }));
+1400 }
+1401 
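After the MPI_SUM reduction of STEP 3b every rank knows the global sum of squares of each column, so STEP 3c can scale its local column segments by the inverse norm. A sketch of the scaling applied to one local segment (a plain pointer-and-length view, standing in for the `blas::scal` call on a tile column):

  #include <cmath>
  #include <cstddef>

  // Scale a local column segment of length m by 1/sqrt(norm_sq), where
  // norm_sq is the globally reduced sum of squares of the full column.
  template <class T>
  void normalizeColumn(T* col, std::size_t m, T norm_sq) {
    const T inv_norm = T(1) / std::sqrt(norm_sq);
    for (std::size_t i = 0; i < m; ++i)
      col[i] *= inv_norm;  // equivalent to blas::scal(m, inv_norm, col, 1)
  }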
+1402 // Distributed version of the tridiagonal solver on CPUs
+1403 template <Backend B, class T, Device D, class RhoSender>
+1404 void mergeDistSubproblems(comm::CommunicatorGrid grid,
+1405  common::Pipeline<comm::Communicator>& full_task_chain,
+1406  common::Pipeline<comm::Communicator>& row_task_chain,
+1407  common::Pipeline<comm::Communicator>& col_task_chain, const SizeType i_begin,
+1408  const SizeType i_split, const SizeType i_end, RhoSender&& rho,
+1409  WorkSpace<T, D>& ws, WorkSpaceHost<T>& ws_h,
+1410  DistWorkSpaceHostMirror<T, D>& ws_hm) {
+1411  namespace ex = pika::execution::experimental;
+1412 
+1413  const matrix::Distribution& dist_evecs = ws.e0.distribution();
+1414 
+1415  // Calculate the size of the upper subproblem
+1416  const SizeType n1 = dist_evecs.globalTileElementDistance<Coord::Row>(i_begin, i_split);
1417 
-1418  // Double `rho` to account for the normalization of `z` and make sure `rho > 0` for the root solver laed4
-1419  auto scaled_rho = scaleRho(std::move(rho)) | ex::split();
-1420 
-1421  // Calculate the tolerance used for deflation
-1422  auto tol = calcTolerance(i_begin, i_end, ws_h.d0, ws_hm.z0);
-1423 
-1424  // Initialize the column types vector `c`
-1425  initColTypes(i_begin, i_split, i_end, ws_h.c);
-1426 
-1427  // Step #1
-1428  //
-1429  // i1 (out) : initial <--- initial (identity map)
-1430  // i2 (out) : initial <--- pre_sorted
-1431  //
-1432  // - deflate `d`, `z` and `c`
-1433  // - apply Givens rotations to `Q` - `evecs`
-1434  //
-1435  if (i_split == i_begin + 1) {
-1436  initIndex(i_begin, i_split, ws_h.i1);
-1437  }
-1438  if (i_split + 1 == i_end) {
-1439  initIndex(i_split, i_end, ws_h.i1);
-1440  }
-1441  addIndex(i_split, i_end, n1, ws_h.i1);
-1442  sortIndex(i_begin, i_end, ex::just(n1), ws_h.d0, ws_h.i1, ws_hm.i2);
-1443 
-1444  auto rots =
-1445  applyDeflation(i_begin, i_end, scaled_rho, std::move(tol), ws_hm.i2, ws_h.d0, ws_hm.z0, ws_h.c);
-1446 
-1447  // Make sure Isend/Irecv messages don't match between calls by providing a unique `tag`
+1418  // The local size of the subproblem
+1419  const GlobalTileIndex idx_gl_begin(i_begin, i_begin);
+1420  const LocalTileIndex idx_loc_begin{dist_evecs.nextLocalTileFromGlobalTile<Coord::Row>(i_begin),
+1421  dist_evecs.nextLocalTileFromGlobalTile<Coord::Col>(i_begin)};
+1422  const LocalTileIndex idx_loc_end{dist_evecs.nextLocalTileFromGlobalTile<Coord::Row>(i_end),
+1423  dist_evecs.nextLocalTileFromGlobalTile<Coord::Col>(i_end)};
+1424  const LocalTileSize sz_loc_tiles = idx_loc_end - idx_loc_begin;
+1425  const LocalTileIndex idx_begin_tiles_vec(i_begin, 0);
+1426  const LocalTileSize sz_tiles_vec(i_end - i_begin, 1);
+1427 
+1428  // Assemble the rank-1 update vector `z` from the last row of Q1 and the first row of Q2
+1429  assembleDistZVec(grid, full_task_chain, i_begin, i_split, i_end, rho, ws.e0, ws.z0);
+1430  copy(idx_begin_tiles_vec, sz_tiles_vec, ws.z0, ws_hm.z0);
+1431 
+1432  // Double `rho` to account for the normalization of `z` and make sure `rho > 0` for the root solver laed4
+1433  auto scaled_rho = scaleRho(std::move(rho)) | ex::split();
+1434 
+1435  // Calculate the tolerance used for deflation
+1436  auto tol = calcTolerance(i_begin, i_end, ws_h.d0, ws_hm.z0);
+1437 
+1438  // Initialize the column types vector `c`
+1439  initColTypes(i_begin, i_split, i_end, ws_h.c);
+1440 
+1441  // Step #1
+1442  //
+1443  // i1 (out) : initial <--- initial (identity map)
+1444  // i2 (out) : initial <--- pre_sorted
+1445  //
+1446  // - deflate `d`, `z` and `c`
+1447  // - apply Givens rotations to `Q` - `evecs`
1448  //
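A short justification of the `scaleRho` comment above, as a worked equation (standard divide-and-conquer algebra, stated for context rather than quoted from the patch). With q_1 the last row of Q_1 and q_2 the first row of Q_2, both unit vectors:

  T = \begin{pmatrix} T_1 & 0 \\ 0 & T_2 \end{pmatrix} + \rho\, v v^{\top},
  \qquad v = \begin{pmatrix} q_1 \\ q_2 \end{pmatrix},
  \qquad \|v\|^2 = \|q_1\|^2 + \|q_2\|^2 = 2,

  z = v / \|v\| \;\Longrightarrow\; \rho\, v v^{\top} = \rho \|v\|^2\, z z^{\top} = (2\rho)\, z z^{\top}.

If rho < 0, the sign can be absorbed by negating part of z (as LAPACK does in xLAED2), so the root solver always receives 2|rho| > 0.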
-1449  // Note: i_split is unique
-1450  const comm::IndexT_MPI tag = to_int(i_split);
-1451  applyGivensRotationsToMatrixColumns(grid.rowCommunicator(), tag, i_begin, i_end, std::move(rots),
-1452  ws.e0);
-1453  // Placeholder for rearranging the eigenvectors: (local permutation)
-1454  copy(idx_loc_begin, sz_loc_tiles, ws.e0, ws.e1);
-1455 
-1456  // Step #2
-1457  //
-1458  // i2 (in) : initial <--- pre_sorted
-1459  // i3 (out) : initial <--- deflated
-1460  //
-1461  // - reorder `d0 -> d1`, `z0 -> z1`, using `i3` such that deflated entries are at the bottom.
-1462  // - solve the rank-1 problem and save eigenvalues in `d0` and `d1` (copy) and eigenvectors in `e2`.
-1463  // - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented)
-1464  //
-1465  auto k =
-1466  stablePartitionIndexForDeflation(i_begin, i_end, ws_h.c, ws_h.d0, ws_hm.i2, ws_h.i3) | ex::split();
-1467  applyIndex(i_begin, i_end, ws_h.i3, ws_h.d0, ws_hm.d1);
-1468  applyIndex(i_begin, i_end, ws_h.i3, ws_hm.z0, ws_hm.z1);
-1469  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.d1, ws_h.d0);
-1470 
+1449  if (i_split == i_begin + 1) {
+1450  initIndex(i_begin, i_split, ws_h.i1);
+1451  }
+1452  if (i_split + 1 == i_end) {
+1453  initIndex(i_split, i_end, ws_h.i1);
+1454  }
+1455  addIndex(i_split, i_end, n1, ws_h.i1);
+1456  sortIndex(i_begin, i_end, ex::just(n1), ws_h.d0, ws_h.i1, ws_hm.i2);
+1457 
+1458  auto rots =
+1459  applyDeflation(i_begin, i_end, scaled_rho, std::move(tol), ws_hm.i2, ws_h.d0, ws_hm.z0, ws_h.c);
+1460 
+1461  // Make sure Isend/Irecv messages don't match between calls by providing a unique `tag`
+1462  //
+1463  // Note: i_split is unique
+1464  const comm::IndexT_MPI tag = to_int(i_split);
+1465  applyGivensRotationsToMatrixColumns(grid.rowCommunicator(), tag, i_begin, i_end, std::move(rots),
+1466  ws.e0);
+1467  // Placeholder for rearranging the eigenvectors: (local permutation)
+1468  copy(idx_loc_begin, sz_loc_tiles, ws.e0, ws.e1);
+1469 
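Deflation removes rows whose z-component is negligible and collapses pairs of nearly-equal diagonal entries; in the second case a Givens rotation moves the whole weight of the pair into one z-component so the other row can be deflated, and the rotations collected in `rots` are then replayed on the eigenvector columns by `applyGivensRotationsToMatrixColumns`. A sketch of the textbook rotation (illustrative only; assumes (zi, zj) != (0, 0)):

  #include <cmath>

  // Build a Givens rotation (c, s) mapping (zi, zj) to (r, 0); afterwards
  // entry j can be deflated. The same (c, s) must be applied to the
  // corresponding pair of eigenvector columns to keep Q consistent.
  template <class T>
  void deflationRotation(T& zi, T& zj, T& c, T& s) {
    const T r = std::hypot(zi, zj);
    c = zi / r;
    s = zj / r;
    zi = r;     // the combined weight survives in entry i
    zj = T(0);  // entry j carries no weight any more
  }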
+1470  // Step #2
1471  //
-1472  // i3 (in) : initial <--- deflated
-1473  // i2 (out) : initial ---> deflated
+1472  // i2 (in) : initial <--- pre_sorted
+1473  // i3 (out) : initial <--- deflated
1474  //
-1475  invertIndex(i_begin, i_end, ws_h.i3, ws_hm.i2);
-1476 
-1477  // Note: here ws_hm.z0 is used as a contiguous buffer for the laed4 call
-1478  matrix::util::set0<Backend::MC>(pika::execution::thread_priority::normal, idx_loc_begin, sz_loc_tiles,
-1479  ws_hm.e2);
-1480  solveRank1ProblemDist(row_task_chain(), col_task_chain(), i_begin, i_end, idx_loc_begin, sz_loc_tiles,
-1481  k, std::move(scaled_rho), ws_hm.d1, ws_hm.z1, ws_h.d0, ws_hm.i2, ws_hm.e2);
-1482 
-1483  // Step #3: Eigenvectors of the tridiagonal system: Q * U
-1484  //
-1485  // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
-1486  // prepared for the deflated system.
-1487  copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e2);
-1488  dlaf::multiplication::internal::generalSubMatrix<B, D, T>(grid, row_task_chain, col_task_chain,
-1489  i_begin, i_end, T(1), ws.e1, ws.e2, T(0),
-1490  ws.e0);
-1491 
-1492  // Step #4: Final permutation to sort eigenvalues and eigenvectors
-1493  //
-1494  // i1 (in) : deflated <--- deflated (identity map)
-1495  // i2 (out) : deflated <--- post_sorted
-1496  //
-1497  initIndex(i_begin, i_end, ws_h.i1);
-1498  sortIndex(i_begin, i_end, std::move(k), ws_h.d0, ws_h.i1, ws_hm.i2);
-1499  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i2, ws_h.i1);
-1500 }
-1501 }
+1475  // - reorder `d0 -> d1`, `z0 -> z1`, using `i3` such that deflated entries are at the bottom.
+1476  // - solve the rank-1 problem and save eigenvalues in `d0` and `d1` (copy) and eigenvectors in `e2`.
+1477  // - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented)
+1478  //
+1479  auto k =
+1480  stablePartitionIndexForDeflation(i_begin, i_end, ws_h.c, ws_h.d0, ws_hm.i2, ws_h.i3) | ex::split();
+1481  applyIndex(i_begin, i_end, ws_h.i3, ws_h.d0, ws_hm.d1);
+1482  applyIndex(i_begin, i_end, ws_h.i3, ws_hm.z0, ws_hm.z1);
+1483  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.d1, ws_h.d0);
+1484 
+1485  //
+1486  // i3 (in) : initial <--- deflated
+1487  // i2 (out) : initial ---> deflated
+1488  //
+1489  invertIndex(i_begin, i_end, ws_h.i3, ws_hm.i2);
+1490 
+1491  // Note: here ws_hm.z0 is used as a contiguous buffer for the laed4 call
+1492  matrix::util::set0<Backend::MC>(pika::execution::thread_priority::normal, idx_loc_begin, sz_loc_tiles,
+1493  ws_hm.e2);
+1494  solveRank1ProblemDist(row_task_chain(), col_task_chain(), i_begin, i_end, idx_loc_begin, sz_loc_tiles,
+1495  k, std::move(scaled_rho), ws_hm.d1, ws_hm.z1, ws_h.d0, ws_hm.i2, ws_hm.e2);
+1496 
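For context, the rank-1 problem handed to `solveRank1ProblemDist` is the usual secular equation (the kernel LAPACK calls xLAED4; stated here, not quoted from the patch): given the deflated diagonal d_1 < ... < d_k and the update vector z, every eigenvalue lambda_j of diag(d) + rho z z^T is a root of

  f(\lambda) = 1 + \rho \sum_{i=1}^{k} \frac{z_i^2}{d_i - \lambda} = 0,

with the interlacing property d_j < \lambda_j < d_{j+1} for j < k and \lambda_k > d_k (for \rho > 0). The gaps d_i - \lambda_j are exactly the values reused by STEPs 2 and 3 above.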
+1497  // Step #3: Eigenvectors of the tridiagonal system: Q * U
+1498  //
+1499  // The eigenvectors resulting from the multiplication are already in the order of the eigenvalues as
+1500  // prepared for the deflated system.
+1501  copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e2);
+1502  dlaf::multiplication::internal::generalSubMatrix<B, D, T>(grid, row_task_chain, col_task_chain,
+1503  i_begin, i_end, T(1), ws.e1, ws.e2, T(0),
+1504  ws.e0);
+1505 
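Step #3 is a distributed GEMM restricted to the [i_begin, i_end) tile range: the eigenvectors of the merged problem are E0 = E1 * E2, i.e. Q times the eigenvector matrix U of the deflated system. A naive serial sketch of this back-transformation for an n-by-n column-major layout (purely illustrative; the real call is tiled and communicates over the grid):

  #include <cstddef>
  #include <vector>

  // e0 = q * u, the serial analogue of the generalSubMatrix call above
  // (alpha = 1, beta = 0).
  template <class T>
  void backTransform(std::size_t n, const std::vector<T>& q,
                     const std::vector<T>& u, std::vector<T>& e0) {
    for (std::size_t j = 0; j < n; ++j)
      for (std::size_t i = 0; i < n; ++i) {
        T acc{};
        for (std::size_t l = 0; l < n; ++l)
          acc += q[i + n * l] * u[l + n * j];
        e0[i + n * j] = acc;
      }
  }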
+1506  // Step #4: Final permutation to sort eigenvalues and eigenvectors
+1507  //
+1508  // i1 (in) : deflated <--- deflated (identity map)
+1509  // i2 (out) : deflated <--- post_sorted
+1510  //
+1511  initIndex(i_begin, i_end, ws_h.i1);
+1512  sortIndex(i_begin, i_end, std::move(k), ws_h.d0, ws_h.i1, ws_hm.i2);
+1513  copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i2, ws_h.i1);
+1514 }
+1515 }