Implement seed finding Thunderdome in CUDA #410
base: main
@@ -0,0 +1,34 @@
/** TRACCC library, part of the ACTS project (R&D line)
 *
 * (c) 2021-2023 CERN for the benefit of the ACTS project
 *
 * Mozilla Public License Version 2.0
 */

#pragma once

// Project include(s).
#include "traccc/cuda/utils/stream.hpp"
#include "traccc/edm/seed.hpp"
#include "traccc/edm/nseed.hpp"
#include "traccc/utils/algorithm.hpp"
#include "traccc/utils/memory_resource.hpp"
#include <vecmem/utils/copy.hpp>
#include <vecmem/memory/unique_ptr.hpp>

namespace traccc::cuda {

class seed_merging : public algorithm<std::pair<vecmem::unique_alloc_ptr<nseed<20>[]>, uint32_t>(
                         const seed_collection_types::buffer&)> {
Reviewer: Why nseed<20>? At the same time, the return type should really instead be …

Author: Ah okay, sorry if I did not explain properly. The five-iteration loop you commented on below means that seeds can become at most five spacepoints longer than they are now. For example, seed ABC can be merged with BCD to form ABCD in step 1, then that can merge with BCDE in step 2, and so on. So in principle the seed capacity needs to be 3 + n, where n is the number of merging iterations.
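As a side note on that capacity question, a minimal sketch of how the 20 could be tied to the iteration count instead of being hard-coded (the constant and alias names below are hypothetical, not part of the PR):

// Hypothetical sketch: derive the n-seed capacity from the number of merging
// passes instead of hard-coding 20. A freshly found seed holds three
// spacepoints, and each merging pass can extend a seed by at most one.
constexpr std::size_t max_merge_iterations = 5;
constexpr std::size_t max_seed_length = 3 + max_merge_iterations;  // = 8

// Used in place of the literal nseed<20> in the algorithm's interface.
using merged_seed = nseed<max_seed_length>;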
public:
    seed_merging(const traccc::memory_resource& mr, stream& str);

    output_type operator()(
        const seed_collection_types::buffer&) const override;
Reviewer: The input parameter needs to be a view.

Author: 👍
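For completeness, a hedged sketch of what a view-based call operator could look like, together with how the seed count could then be queried; the exact view type and the m_copy member are my assumptions, not something spelled out in the thread:

    // Hypothetical sketch: take a non-owning view of the input seeds instead
    // of a buffer, so the algorithm does not assume ownership of the memory.
    output_type operator()(
        const seed_collection_types::const_view& seeds_view) const override;

    // Inside the implementation, the element count would then come from a
    // vecmem::copy object handed to the algorithm at construction:
    //   const unsigned int nseeds = m_copy.get_size(seeds_view);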
private:
    traccc::memory_resource m_mr;
    stream& m_stream;
};

} // namespace traccc::cuda
@@ -0,0 +1,157 @@
#include "traccc/cuda/seed_merging/seed_merging.hpp" | ||
#include "traccc/edm/seed.hpp" | ||
#include "traccc/edm/nseed.hpp" | ||
#include "traccc/cuda/utils/definitions.hpp" | ||
|
||
#include <iostream> | ||
|
||
namespace traccc::cuda { | ||
namespace kernels { | ||
template<std::size_t N> | ||
__global__ void convert_to_nseeds(seed_collection_types::view vf, nseed<N> * out, unsigned long long * out_n) { | ||
seed_collection_types::device vfd(vf); | ||
|
||
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < vf.size(); i += gridDim.x * blockDim.x) { | ||
out[i] = vfd[i]; | ||
atomicAdd(out_n, 1ULL); | ||
} | ||
} | ||
Comment on lines +11 to +18

Reviewer: I don't understand this kernel... 😕 What's up with the atomicAdd? At the same time, you'll need to put some amount of explanation here about that for-loop. I assume you're going for optimal cache line usage with it, though on first look I don't understand why this access pattern would yield better caching than just processing a single seed in every thread. 😕

Author: You're absolutely correct that this doesn't make sense right now. What I want to do in the future is turn this kernel into a seed binning kernel that stores seeds into φ-bins to reduce the amount of work done. When I get that sorted I will need to do atomic space reservations. For now, with only one bin, this is indeed not really necessary.
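To illustrate what such an atomic space reservation might look like once seeds are sorted into φ-bins, here is a rough sketch; the kernel name, the precomputed bin_of_seed array and the fixed per-bin capacity are all hypothetical and not part of this PR:

// Hypothetical sketch of a binned fill kernel: every thread reserves one slot
// in its seed's phi-bin with an atomicAdd on that bin's counter, so the
// returned index is unique within the bin and threads never race for a slot.
template <std::size_t N>
__global__ void bin_nseeds(seed_collection_types::view seeds_view,
                           const unsigned int* bin_of_seed,
                           nseed<N>* out, unsigned int* bin_counters,
                           unsigned int bin_capacity) {
    seed_collection_types::device seeds(seeds_view);

    for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < seeds.size(); i += gridDim.x * blockDim.x) {
        // Reserve a unique slot in this seed's bin.
        const unsigned int slot = atomicAdd(&bin_counters[bin_of_seed[i]], 1u);
        if (slot < bin_capacity) {
            out[bin_of_seed[i] * bin_capacity + slot] = seeds[i];
        }
    }
}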
template<std::size_t N, std::size_t M>
__global__ void merge_nseeds(const nseed<N>* in, const unsigned long long* in_c, nseed<M>* out, unsigned long long* out_c) {
    __shared__ nseed<M> out_seeds[32];
    __shared__ uint32_t num_seeds;
    __shared__ uint32_t num_consm;
    __shared__ uint32_t out_index;

    if (threadIdx.x == 0) {
        num_seeds = 0;
        num_consm = 0;
    }

    __syncthreads();

    for (int i = threadIdx.x; i < *in_c; i += blockDim.x) {
        if (i == blockIdx.x) {
            continue;
        }

        bool compat, consumed;

        if (in[blockIdx.x].size() == in[i].size()) {
            compat = true;
            consumed = true;

            for (int j = 0; j < in[i].size() - 1; ++j) {
                if (in[blockIdx.x]._sps[j+1] != in[i]._sps[j]) {
                    compat = false;
                }

                if (in[i]._sps[j+1] != in[blockIdx.x]._sps[j]) {
                    consumed = false;
                }
            }
        } else {
            if (in[i].size() > in[blockIdx.x].size()) {
                consumed = false;

                for (int j = 0; j < in[i].size(); ++j) {
                    for (int k = 0; k < in[blockIdx.x].size(); ++k) {
                        if (in[i]._sps[j] == in[blockIdx.x]._sps[k]) {
                            consumed = true;
                        }
                    }
                }
            } else {
                consumed = false;
            }

            compat = false;
        }

        if (compat) {
            nseed<M> new_seed;

            new_seed._size = in[blockIdx.x]._size + 1;

            int j = 0;

            for (; j < in[blockIdx.x].size(); ++j) {
                new_seed._sps[j] = in[blockIdx.x]._sps[j];
            }

            new_seed._sps[j] = in[i]._sps[in[i].size() - 1];

            uint32_t idx = atomicAdd(&num_seeds, 1);

            if (idx < 32) {
                out_seeds[idx] = new_seed;
            }
        }

        if (consumed) {
            atomicAdd(&num_consm, 1);
        }
    }

    __syncthreads();

    if (threadIdx.x == 0) {
        if (num_seeds == 0 && num_consm == 0) {
            out_index = atomicAdd(out_c, 1U);
            out[out_index] = in[blockIdx.x];
        } else {
            out_index = atomicAdd(out_c, num_seeds);
        }
    }

    __syncthreads();

    for (int i = threadIdx.x; i < num_seeds; i += blockDim.x) {
        out[out_index + i] = out_seeds[i];
    }
}
}
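The "compat" and "consumed" tests in the kernel above are easiest to see on a concrete example. The following small host-side sketch, written for this review rather than taken from the PR, reproduces the equal-length rule with plain std::vectors: ABC can absorb BCD into ABCD because the tail of ABC matches the head of BCD, while BCD itself counts as consumed because the block handling ABC already produces a seed containing it.

// Host-side illustration (not part of the PR) of the merging rule.
#include <cassert>
#include <vector>

// True when mine[1..] == other[..n-2], i.e. the two equal-length seeds overlap
// by all but one spacepoint and can be merged into mine + other.back().
bool compatible(const std::vector<int>& mine, const std::vector<int>& other) {
    if (mine.size() != other.size()) return false;
    for (std::size_t j = 0; j + 1 < other.size(); ++j) {
        if (mine[j + 1] != other[j]) return false;
    }
    return true;
}

std::vector<int> merge(const std::vector<int>& mine, const std::vector<int>& other) {
    std::vector<int> merged(mine);
    merged.push_back(other.back());
    return merged;
}

int main() {
    const std::vector<int> abc{1, 2, 3}, bcd{2, 3, 4};
    assert(compatible(abc, bcd));                          // ABC + BCD -> ABCD
    assert(merge(abc, bcd) == std::vector<int>({1, 2, 3, 4}));
    assert(!compatible(bcd, abc));                         // BCD cannot absorb ABC...
    // ...but BCD is "consumed": the block handling ABC produces ABCD, which
    // already contains BCD, so BCD itself is not copied to the output.
    return 0;
}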
seed_merging::seed_merging(const traccc::memory_resource& mr, stream& str) : m_mr(mr), m_stream(str) {
}

seed_merging::output_type seed_merging::operator()(const seed_collection_types::buffer& i) const {
    vecmem::unique_alloc_ptr<nseed<20>[]>
        arr1 = vecmem::make_unique_alloc<nseed<20>[]>(m_mr.main, 1000000),
        arr2 = vecmem::make_unique_alloc<nseed<20>[]>(m_mr.main, 1000000);
Comment on lines +120 to +122

Reviewer: What's with the 1M fixed numbers? 😕 Just ask the input view how many elements it has, i.e.

    const unsigned int nseeds = m_copy.get_size(input_view);

assuming that you switch to using a view, and that you ask the user to provide a copy object.

Author: So the thing with this is that it is possible for the number of seeds to increase, at least in the early steps of the algorithm. So I need a little bit of buffer space for that. But you're right, I could use two times the number of initial seeds, for example.
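A hedged sketch of that suggestion, assuming the algorithm is switched to a view input and holds a vecmem::copy member called m_copy (both of which are assumptions, not code in this PR):

    // Hypothetical sketch: size the scratch arrays from the input instead of
    // a fixed 1M. Reserve twice the input count, since early merging steps
    // can temporarily increase the number of seeds.
    const unsigned int nseeds = m_copy.get_size(seeds_view);
    const std::size_t capacity = 2 * static_cast<std::size_t>(nseeds);

    vecmem::unique_alloc_ptr<nseed<20>[]>
        arr1 = vecmem::make_unique_alloc<nseed<20>[]>(m_mr.main, capacity),
        arr2 = vecmem::make_unique_alloc<nseed<20>[]>(m_mr.main, capacity);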
    vecmem::unique_alloc_ptr<unsigned long long>
        siz1 = vecmem::make_unique_alloc<unsigned long long>(m_mr.main),
        siz2 = vecmem::make_unique_alloc<unsigned long long>(m_mr.main);

    kernels::convert_to_nseeds<20><<<2048, 256>>>(i, arr1.get(), siz1.get());
Reviewer: We should try to write new code with asynchronicity in mind. But this is just to note it; I'm fine with this PR not caring about asynchronicity yet.

Author: I'll make sure to add the appropriate synchronization points later. This algorithm is also a prime candidate for Dynamic Parallelism 2, which might reduce the number of synchronization points. 🙂
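For reference, a minimal sketch of the asynchronous pattern being asked for, written against the plain CUDA runtime API rather than the project's stream wrapper (in the real code the stream would come from m_stream instead of being created locally):

    // Sketch (not part of the PR): run the counter reset, the conversion
    // kernel and the size read-back on one explicit stream, and only block
    // right before the host actually needs the value.
    cudaStream_t str;
    CUDA_ERROR_CHECK(cudaStreamCreate(&str));

    unsigned long long rc = 0;
    CUDA_ERROR_CHECK(cudaMemsetAsync(siz1.get(), 0, sizeof(unsigned long long), str));
    kernels::convert_to_nseeds<20><<<2048, 256, 0, str>>>(i, arr1.get(), siz1.get());
    CUDA_ERROR_CHECK(cudaGetLastError());
    CUDA_ERROR_CHECK(cudaMemcpyAsync(&rc, siz1.get(), sizeof(unsigned long long),
                                     cudaMemcpyDeviceToHost, str));

    // The host blocks only here, once rc is needed.
    CUDA_ERROR_CHECK(cudaStreamSynchronize(str));
    CUDA_ERROR_CHECK(cudaStreamDestroy(str));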
    CUDA_ERROR_CHECK(cudaGetLastError());
    CUDA_ERROR_CHECK(cudaDeviceSynchronize());

    unsigned long long rc;

    CUDA_ERROR_CHECK(cudaMemcpy(&rc, siz1.get(), sizeof(unsigned long long), cudaMemcpyDeviceToHost));

    std::cout << "Step 0 has " << rc << " seeds." << std::endl;

    for (std::size_t i = 0; i < 5; ++i) {
Reviewer: I imagine 5 is a number that you came up with after some testing. That is fine, but it has to be made a configurable property of the algorithm.

Author: Absolutely.
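One possible shape for that, sketched here with hypothetical names that are not part of the PR:

    // Hypothetical sketch: a small config struct passed to the algorithm at
    // construction, holding the iteration count instead of the literal 5.
    struct seed_merging_config {
        // Maximum number of merging passes; also bounds how long a seed can grow.
        std::size_t max_iterations = 5;
    };

    // The constructor would then take the config alongside the memory resource:
    //   seed_merging(const seed_merging_config& cfg,
    //                const traccc::memory_resource& mr, stream& str);
    // and the loop becomes:
    //   for (std::size_t i = 0; i < m_cfg.max_iterations; ++i) { ... }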
        CUDA_ERROR_CHECK(cudaMemset(siz2.get(), 0, sizeof(unsigned long long)));
        kernels::merge_nseeds<20, 20><<<rc, 256>>>(arr1.get(), siz1.get(), arr2.get(), siz2.get());
Reviewer: Curious. I thought the compiler would be able to deduce those template parameters automatically... 🤔

Author: It probably can, actually! I added them explicitly for no real good reason, honestly.
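Indeed, since N and M appear in the pointer parameter types, the compiler can deduce them from the arguments, so the explicit template arguments could simply be dropped:

    // arr1/arr2 point to nseed<20>, so N = M = 20 are deduced from the arguments.
    kernels::merge_nseeds<<<rc, 256>>>(arr1.get(), siz1.get(), arr2.get(), siz2.get());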
        CUDA_ERROR_CHECK(cudaGetLastError());
        CUDA_ERROR_CHECK(cudaDeviceSynchronize());

        std::swap(arr1, arr2);
        std::swap(siz1, siz2);
Comment on lines +146 to +147

Reviewer: I like this. 😄 You should've mentioned that you're taking cues from the discussion we had with Beomki about AdePT's memory handling. 😛

Author: Not to toot my own horn, but I've been on this train for a while. 👼
        CUDA_ERROR_CHECK(cudaMemcpy(&rc, siz1.get(), sizeof(unsigned long long), cudaMemcpyDeviceToHost));

        std::cout << "Step " << (i + 1) << " has " << rc << " seeds." << std::endl;
    }

    return {std::move(arr1), rc};
}
}
Reviewer: If client code is meant to access these variables now, they should really not be prefixed with _.

Author: Absolutely correct. I plan on renaming these and making some more changes to the nseed class during the development of this PR. Well spotted. 👍