From 0f1c01898b8c30477fe37bd7d56d4e7e3be43e01 Mon Sep 17 00:00:00 2001 From: waredjeb <39335169+waredjeb@users.noreply.github.com> Date: Tue, 29 Oct 2019 07:09:04 +0100 Subject: [PATCH] Replace CUDA API wrapper memory operations with native CUDA calls (#395) --- .../PixelVertexFinding/test/VertexFinder_t.h | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 0df7af362ac0d..14263ed7b3d18 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -6,6 +6,7 @@ #include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" #ifdef USE_DBSCAN @@ -126,10 +127,10 @@ int main() { std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cuda::memory::copy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); - cuda::memory::copy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); - cuda::memory::copy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); - cuda::memory::copy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); + cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); #else ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); @@ -162,7 +163,7 @@ int main() { cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); cudaCheck(cudaGetLastError()); - cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t)); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else print(onGPU_d.get(), ws_d.get()); @@ -207,8 +208,8 @@ int main() { #endif #ifdef __CUDACC__ - cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t)); - cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif @@ -223,9 +224,9 @@ int main() { #ifdef __CUDACC__ cudautils::launch(fitVertices, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t)); - cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t)); - cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 50.f); nv = onGPU_d->nvFinal; @@ -243,7 +244,7 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! cudautils::launch(splitVertices, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); - cuda::memory::copy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t)); + cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else gridDim.x = 1024; // nv ???? assert(blockIdx.x == 0); @@ -260,7 +261,7 @@ int main() { cudautils::launch(sortByPt2, {1, 256}, onGPU_d.get(), ws_d.get()); cudaCheck(cudaGetLastError()); - cuda::memory::copy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t)); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); sortByPt2(onGPU_d.get(), ws_d.get()); @@ -274,12 +275,12 @@ int main() { } #ifdef __CUDACC__ - cuda::memory::copy(zv, LOC_ONGPU(zv), nv * sizeof(float)); - cuda::memory::copy(wv, LOC_ONGPU(wv), nv * sizeof(float)); - cuda::memory::copy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); - cuda::memory::copy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float)); - cuda::memory::copy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t)); - cuda::memory::copy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t)); + cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif for (auto j = 0U; j < nv; ++j) if (nn[j] > 0)