From aacf68e45153be4a6fc4537cf157ba3a25986714 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 9 Jun 2023 10:14:02 -0400 Subject: [PATCH 01/25] Adding exception for arrayOfStructure option for bGrid. --- libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h | 3 +++ libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp | 4 ++-- .../tests/domain-neighbour-globalIdx/src/runHelper.h | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index 3c57d7d9..3d8fbfb7 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -28,6 +28,9 @@ bField(grid.getBackend()); mData->grid = std::make_shared(grid); + if(memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs){ + NEON_THROW_UNSUPPORTED_OPERATION("bField does not support MemoryLayout::arrayOfStructs"); + } // the allocation size is the number of blocks x block size x cardinality mData->memoryField = mData->grid->helpGetBlockViewGrid().template newField( "BitMask", diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp index bdf77a74..feba5a9b 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp @@ -22,9 +22,9 @@ TEST(domain_unit_test_globalIdx, eGrid) 1); } -TEST(domain_unit_test_globalIdx, bGridSingleGPU) +TEST(domain_unit_test_globalIdx, bGrid) { - int nGpus = 1; + int nGpus = 5; using Type = int64_t; runAllTestConfiguration(std::function(globalIdx::run), nGpus, diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h index e064a49a..0014594c 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h @@ -82,6 +82,9 @@ void runAllTestConfiguration( if (dim.z < 8 * ngpu * 3) { dim.z = ngpu * 3 * 8; } + if(memoryLayout == Neon::MemoryLayout::arrayOfStructs){ + continue ; + } } assert(card == 1); From 18f2d7223fc2de5a0e41c7bd1c906035ea2e79a4 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 15 Jun 2023 15:37:00 -0400 Subject: [PATCH 02/25] Some documentation to bGrid. --- .../Neon/domain/details/bGrid/bField_imp.h | 4 +- .../include/Neon/domain/details/bGrid/bGrid.h | 81 ++++++++++++++++--- .../Neon/domain/details/bGrid/bGrid_imp.h | 4 +- 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index 3d8fbfb7..687b7a0d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -32,7 +32,7 @@ bFieldmemoryField = mData->grid->helpGetBlockViewGrid().template newField( + mData->memoryField = mData->grid->getBlockViewGrid().template newField( "BitMask", [&] { int elPerBlock = dataBlockSize3D.rMul(); @@ -53,7 +53,7 @@ bFieldmemoryField.getPartition(execution, setIdx, Neon::DataView::STANDARD); auto& blockConnectivity = mData->grid->helpGetBlockConnectivity().getPartition(execution, setIdx, Neon::DataView::STANDARD); - auto& bitmask = mData->grid->helpGetActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& bitmask = mData->grid->getActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD); auto& dataBlockOrigins = mData->grid->helpGetDataBlockOriginField().getPartition(execution, setIdx, Neon::DataView::STANDARD); partition = bPartition(setIdx, diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index e19ef98a..c31831ff 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -66,25 +66,46 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate - bGrid(const Neon::Backend& backend, - const Neon::int32_3d& domainSize, - const ActiveCellLambda activeCellLambda, - const Neon::domain::Stencil& stencil, - const int voxelSpacing, - const double_3d& spacingData = double_3d(1, 1, 1), - const double_3d& origin = double_3d(0, 0, 0)); - + bGrid(const Neon::Backend& backend /**< Neon backend for the computation */, + const Neon::int32_3d& domainSize /**< Size of the bounded Cartesian */, + const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, + const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, + const int voxelSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. + * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/ + , + const double_3d& spacingData = double_3d(1, 1, 1) /** Physical spacing between two consecutive data points in the Cartesian domain */, + const double_3d& origin = double_3d(0, 0, 0) /** Physical location in space of the origin of the Cartesian discretization */); + /** + * Returns some properties for a given cartesian in the Cartesian domain. + * The provide index my be inside or outside the user defined bounded Cartesian domain + */ auto getProperties(const Neon::index_3d& idx) const -> typename GridBaseTemplate::CellProperties final; + /** + * Returns true if the query 3D point is inside the user domain + * @param idx + * @return + */ auto isInsideDomain(const Neon::index_3d& idx) const -> bool final; + /** + * Retrieves the device index that contains the query point + * @param idx + * @return + */ auto getSetIdx(const Neon::index_3d& idx) const -> int32_t final; + /** + * Allocates a new field on the grid + */ template auto newField(const std::string name, int cardinality, @@ -93,6 +114,9 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate Field; + /** + * Allocates a new field on the block view grid + */ template auto newBlockViewField(const std::string name, int cardinality, @@ -101,6 +125,9 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate BlockViewGrid::Field; + /* + * Allocates a new container to execute some computation in the grid + */ template auto newContainer(const std::string& name, @@ -108,26 +135,58 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate Neon::set::Container; + /* + * Allocates a new container to execute some computation in the grid + */ template auto newContainer(const std::string& name, LoadingLambda lambda) const -> Neon::set::Container; - + /** + * Defines a new set of parameter to launch a Container + */ auto getLaunchParameters(Neon::DataView dataView, const Neon::index_3d& blockSize, const size_t& sharedMem) const -> Neon::set::LaunchParameters; + /** + * Retrieve the span associated to the grid w.r.t. some user defined parameters. + */ auto getSpan(Neon::Execution execution, SetIdx setIdx, Neon::DataView dataView) -> const Span&; - auto helpGetBlockViewGrid() const -> BlockViewGrid&; - auto helpGetActiveBitMask() const -> BlockViewGrid::Field&; + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getBlockViewGrid() const -> BlockViewGrid&; + + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getActiveBitMask() const -> BlockViewGrid::Field&; + + /** + * Help function to retrieve the block connectivity as a BlockViewGrid field + */ auto helpGetBlockConnectivity() const -> BlockViewGrid::Field&; + + /** + * Help function to retrieve the block origin as a BlockViewGrid field + */ auto helpGetDataBlockOriginField() const -> Neon::aGrid::Field&; + + /* + * Help function to retrieve the map that converts a stencil point id to 3d offset + */ auto helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet&; + /* + * Help function retriev the device and the block index associated to a point in the BlockViewGrid grid + */ auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple; struct Data diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index fcd0f803..03c1bd59 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -313,7 +313,7 @@ auto bGrid auto bGrid:: - helpGetBlockViewGrid() + getBlockViewGrid() const -> BlockViewGrid& { return mData->blockViewGrid; @@ -321,7 +321,7 @@ auto bGrid auto bGrid:: - helpGetActiveBitMask() + getActiveBitMask() const -> BlockViewGrid::Field& { return mData->activeBitMask; From b81c423586558e2d20b4e9c3684d077999ea0fbf Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 15 Jun 2023 17:20:19 -0400 Subject: [PATCH 03/25] bGrid: API documentation and refactoring of the template API. --- libNeonDomain/include/Neon/domain/bGrid.h | 2 +- .../Neon/domain/details/bGrid/StaticBlock.h | 46 +++++ .../Neon/domain/details/bGrid/bField.h | 22 +-- .../Neon/domain/details/bGrid/bField_imp.h | 99 +++++----- .../include/Neon/domain/details/bGrid/bGrid.h | 29 +-- .../Neon/domain/details/bGrid/bGrid_imp.h | 174 +++++++++--------- .../Neon/domain/details/bGrid/bIndex.h | 38 ++-- .../Neon/domain/details/bGrid/bIndex_imp.h | 80 +++----- .../Neon/domain/details/bGrid/bPartition.h | 8 +- .../domain/details/bGrid/bPartition_imp.h | 70 +++---- .../include/Neon/domain/details/bGrid/bSpan.h | 6 +- .../Neon/domain/details/bGrid/bSpan_imp.h | 32 ++-- .../src/domain/details/bGrid/bGrid.cpp | 2 +- .../tests/domain-bGrid-tray/src/gtests.cpp | 42 ++--- 14 files changed, 322 insertions(+), 328 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h diff --git a/libNeonDomain/include/Neon/domain/bGrid.h b/libNeonDomain/include/Neon/domain/bGrid.h index 13c01cc3..39a4f366 100644 --- a/libNeonDomain/include/Neon/domain/bGrid.h +++ b/libNeonDomain/include/Neon/domain/bGrid.h @@ -2,5 +2,5 @@ #include "Neon/domain/details/bGrid/bGrid.h" namespace Neon { -using bGrid = Neon::domain::details::bGrid::bGrid<8,8,8>; +using bGrid = Neon::domain::details::bGrid::bGrid>; } \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h new file mode 100644 index 00000000..612c6b9a --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h @@ -0,0 +1,46 @@ +#include "Neon/domain/details/bGrid/bSpan.h" + +namespace Neon::domain::details::bGrid { + +template +struct StaticBlock +{ + public: + constexpr static uint32_t memBlockSizeX = memBlockSizeX_; + constexpr static uint32_t memBlockSizeY = memBlockSizeY_; + constexpr static uint32_t memBlockSizeZ = memBlockSizeZ_; + constexpr static Neon::uint32_3d memBlockSize3D = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); + + constexpr static uint32_t userBlockSizeX = userBlockSizeX_; + constexpr static uint32_t userBlockSizeY = userBlockSizeY_; + constexpr static uint32_t userBlockSizeZ = userBlockSizeZ_; + constexpr static Neon::uint32_3d userBlockSize3D = Neon::uint32_3d(userBlockSizeX, userBlockSizeY, userBlockSizeZ); + + constexpr static uint32_t blockRatioX = memBlockSizeX / userBlockSizeX; + constexpr static uint32_t blockRatioY = memBlockSizeY / userBlockSizeY; + constexpr static uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ; + + constexpr static uint32_t memBlockPitchX = 1; + constexpr static uint32_t memBlockPitchY = memBlockSizeX; + constexpr static uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY; + + constexpr static bool isMultiResMode = isMultiResMode_; + + constexpr static uint32_t memBlockCountElements = memBlockSizeX * memBlockSizeY * memBlockSizeZ; + + static_assert(memBlockSizeX >= userBlockSizeX); + static_assert(memBlockSizeY >= userBlockSizeY); + static_assert(memBlockSizeZ >= userBlockSizeZ); + + static_assert(memBlockSizeX % userBlockSizeX == 0); + static_assert(memBlockSizeY % userBlockSizeY == 0); + static_assert(memBlockSizeZ % userBlockSizeZ == 0); +}; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h index f232d96b..95c1d6d5 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h @@ -18,31 +18,25 @@ namespace Neon::domain::details::bGrid { -template +template class bField : public Neon::domain::interface::FieldBaseTemplate, - bPartition, + bGrid, + bPartition, int> { - friend bGrid; + friend bGrid; public: using Type = T; - using Grid = bGrid; - using Field = bField; - using Partition = bPartition; - using Idx = bIndex; + using Grid = bGrid; + using Field = bField; + using Partition = bPartition; + using Idx = bIndex; using NghIdx = typename Partition::NghIdx; using NghData = typename Partition::NghData; - static constexpr Neon::index_3d dataBlockSize3D = Neon::index_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); - - static constexpr Neon::int8_3d DataBlockSize = Neon::int8_3d(memBlockSizeX, - memBlockSizeY, - memBlockSizeZ); - bField(const std::string& fieldUserName, Neon::DataUse dataUse, diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index 687b7a0d..a9c249ca 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -4,19 +4,19 @@ namespace Neon::domain::details::bGrid { -template -bField::bField() +template +bField::bField() { mData = std::make_shared(); } -template -bField::bField(const std::string& fieldUserName, - Neon::DataUse dataUse, - const Neon::MemoryOptions& memoryOptions, - const Grid& grid, - int cardinality, - T inactiveValue) +template +bField::bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + const Neon::MemoryOptions& memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue) : Neon::domain::interface::FieldBaseTemplate(&grid, fieldUserName, "bField", @@ -28,20 +28,19 @@ bField(grid.getBackend()); mData->grid = std::make_shared(grid); - if(memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs){ + if (memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs) { NEON_THROW_UNSUPPORTED_OPERATION("bField does not support MemoryLayout::arrayOfStructs"); } // the allocation size is the number of blocks x block size x cardinality mData->memoryField = mData->grid->getBlockViewGrid().template newField( "BitMask", [&] { - int elPerBlock = dataBlockSize3D.rMul(); - elPerBlock = elPerBlock * cardinality; + int elPerBlock = SBlock::memBlockCountElements * cardinality; return elPerBlock; }(), 0, dataUse, - mData->grid->getBackend().getMemoryOptions(bSpan::activeMaskMemoryLayout)); + mData->grid->getBackend().getMemoryOptions(bSpan::activeMaskMemoryLayout)); { // Setting up partitionTable @@ -56,28 +55,28 @@ bFieldgrid->getActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD); auto& dataBlockOrigins = mData->grid->helpGetDataBlockOriginField().getPartition(execution, setIdx, Neon::DataView::STANDARD); - partition = bPartition(setIdx, - cardinality, - memoryFieldPartition.mem(), - blockConnectivity.mem(), - bitmask.mem(), - dataBlockOrigins.mem(), - mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx)); + partition = bPartition(setIdx, + cardinality, + memoryFieldPartition.mem(), + blockConnectivity.mem(), + bitmask.mem(), + dataBlockOrigins.mem(), + mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx)); }); } initHaloUpdateTable(); } -template -auto bField::isInsideDomain(const Neon::index_3d& idx) const -> bool +template +auto bField::isInsideDomain(const Neon::index_3d& idx) const -> bool { return mData->grid->isInsideDomain(idx); } -template -auto bField::getReference(const Neon::index_3d& cartesianIdx, - const int& cardinality) -> T& +template +auto bField::getReference(const Neon::index_3d& cartesianIdx, + const int& cardinality) -> T& { auto& grid = this->getGrid(); auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); @@ -86,9 +85,9 @@ auto bField -auto bField::operator()(const Neon::index_3d& cartesianIdx, - const int& cardinality) const -> T +template +auto bField::operator()(const Neon::index_3d& cartesianIdx, + const int& cardinality) const -> T { auto& grid = this->getGrid(); auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); @@ -100,22 +99,22 @@ auto bField -auto bField::updateHostData(int streamId) -> void +template +auto bField::updateHostData(int streamId) -> void { mData->memoryField.updateHostData(streamId); } -template -auto bField::updateDeviceData(int streamId) -> void +template +auto bField::updateDeviceData(int streamId) -> void { mData->memoryField.updateDeviceData(streamId); } -template -auto bField::getPartition(Neon::Execution execution, - Neon::SetIdx setIdx, - const Neon::DataView& dataView) const -> const Partition& +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) const -> const Partition& { const Neon::DataUse dataUse = this->getDataUse(); bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); @@ -128,10 +127,10 @@ auto bField -auto bField::getPartition(Neon::Execution execution, - Neon::SetIdx setIdx, - const Neon::DataView& dataView) -> Partition& +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) -> Partition& { const Neon::DataUse dataUse = this->getDataUse(); bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); @@ -144,10 +143,10 @@ auto bField -auto bField::newHaloUpdate(Neon::set::StencilSemantic stencilSemantic, - Neon::set::TransferMode transferMode, - Neon::Execution execution) const -> Neon::set::Container +template +auto bField::newHaloUpdate(Neon::set::StencilSemantic stencilSemantic, + Neon::set::TransferMode transferMode, + Neon::Execution execution) const -> Neon::set::Container { @@ -220,8 +219,8 @@ auto bField -auto bField::initHaloUpdateTable() -> void +template +auto bField::initHaloUpdateTable() -> void { // NEON_THROW_UNSUPPORTED_OPERATION(""); auto& grid = this->getGrid(); @@ -269,10 +268,10 @@ auto bFieldgetCountAllocated()) * dataBlockSize3D.rMul()); + size_t(blockViewPartitions[endPoint]->getCountAllocated()) * SBlock::memBlockCountElements); } if (ByDirection::up == byDirection && bk.isLastDevice(setIdxSrc)) { @@ -299,10 +298,8 @@ auto bField + +template class bField; -template -class bGrid : public Neon::domain::interface::GridBaseTemplate, - bIndex > +template +class bGrid : public Neon::domain::interface::GridBaseTemplate, + bIndex > { public: - using Grid = bGrid; + using Grid = bGrid; - template - using Partition = bPartition; + template + using Partition = bPartition; - template - using Field = Neon::domain::details::bGrid::bField; + template + using Field = Neon::domain::details::bGrid::bField; - using Span = bSpan; + using Span = bSpan; using NghIdx = typename Partition::NghIdx; - using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate >; + using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate >; - using Idx = bIndex; + using Idx = bIndex; static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d1b3; using ExecutionThreadSpanIndexType = uint32_t; - static constexpr Neon::index_3d dataBlockSize3D = Neon::index_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); using BlockIdx = uint32_t; bGrid() = default; @@ -227,7 +228,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate mData; }; -extern template class bGrid<8, 8, 8>; +extern template class bGrid>; } // namespace Neon::domain::details::bGrid #include "bField_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index 03c1bd59..1b40a8b7 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -2,42 +2,38 @@ namespace Neon::domain::details::bGrid { -template +template template -bGrid::bGrid(const Neon::Backend& backend, - const Neon::int32_3d& domainSize, - const ActiveCellLambda activeCellLambda, - const Neon::domain::Stencil& stencil, - const double_3d& spacingData, - const double_3d& origin) +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData, + const double_3d& origin) : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin) { } -template +template template -bGrid::bGrid(const Neon::Backend& backend, - const Neon::int32_3d& domainSize, - const ActiveCellLambda activeCellLambda, - const Neon::domain::Stencil& stencil, - const int voxelSpacing, - const double_3d& spacingData, - const double_3d& origin) +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const int voxelSpacing, + const double_3d& spacingData, + const double_3d& origin) { - static_assert(memBlockSizeX >= userBlockSizeX); - static_assert(memBlockSizeY >= userBlockSizeY); - static_assert(memBlockSizeZ >= userBlockSizeZ); - static_assert(memBlockSizeX % userBlockSizeX == 0); - static_assert(memBlockSizeY % userBlockSizeY == 0); - static_assert(memBlockSizeZ % userBlockSizeZ == 0); mData = std::make_shared(); mData->init(backend); mData->voxelSpacing = voxelSpacing; mData->stencil = stencil; - const index_3d defaultKernelBlockSize(memBlockSizeX, memBlockSizeY, memBlockSizeZ); + const index_3d defaultKernelBlockSize(SBlock::memBlockSizeX, + SBlock::memBlockSizeY, + SBlock::memBlockSizeZ); { auto nElementsPerPartition = backend.devSet().template newDataSet(0); @@ -59,7 +55,7 @@ bGrid(), domainSize, Neon::domain::Stencil::s27_t(false), 1); @@ -76,7 +72,7 @@ bGridpartitioner1D.getBlockSpan(), mData->partitioner1D, Neon::domain::Stencil::s27_t(false), - spacingData * dataBlockSize3D, + spacingData * SBlock::memBlockSize3D, origin); mData->blockViewGrid = BlockViewGrid(egrid); @@ -106,9 +102,9 @@ bGrid().z; k++) { + for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { + for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { Neon::int32_3d localPosition(i, j, k); typename Span::BitMaskWordType mask; @@ -166,7 +162,7 @@ bGrid(nghIdx.helpGet()); + blockNghIdx = static_cast(nghIdx.helpGet()); } blockConnectivity(idx, targetDirection) = blockNghIdx; } @@ -220,7 +216,7 @@ bGrid(); + Neon::int8_3d pShort = pLong.newType(); mData->stencilIdTo3dOffset.eRef(devIdx, i) = pShort; } } @@ -232,7 +228,7 @@ bGridmNumActiveVoxel, - dataBlockSize3D, + SBlock::memBlockSize3D.template newType(), spacingData, origin); { // setting launchParameters @@ -244,47 +240,47 @@ bGrid( eDomainGridSize.x); + int nBlocks = static_cast(eDomainGridSize.x); bLaunchParameters.get(setIdx).set(Neon::sys::GpuLaunchInfo::mode_e::cudaGridMode, - nBlocks, dataBlockSize3D, 0); + nBlocks, SBlock::memBlockSize3D.template newType(), 0); }); }); } } -template +template template -auto bGrid::newField(const std::string name, - int cardinality, - T inactiveValue, - Neon::DataUse dataUse, - Neon::MemoryOptions memoryOptions) const -> Field +auto bGrid::newField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> Field { memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); Field field(name, dataUse, memoryOptions, *this, cardinality, inactiveValue); return field; } -template +template template -auto bGrid::newBlockViewField(const std::string name, - int cardinality, - T inactiveValue, - Neon::DataUse dataUse, - Neon::MemoryOptions memoryOptions) const -> BlockViewGrid::Field +auto bGrid::newBlockViewField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> BlockViewGrid::Field { memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); BlockViewGrid::Field blockViewField = mData->blockViewGrid.template newField(name, cardinality, inactiveValue, dataUse, memoryOptions); return blockViewField; } -template +template template -auto bGrid::newContainer(const std::string& name, - index_3d blockSize, - size_t sharedMem, - LoadingLambda lambda) const -> Neon::set::Container +auto bGrid::newContainer(const std::string& name, + index_3d blockSize, + size_t sharedMem, + LoadingLambda lambda) const -> Neon::set::Container { Neon::set::Container kContainer = Neon::set::Container::factory(name, Neon::set::internal::ContainerAPI::DataViewSupport::on, @@ -295,11 +291,11 @@ auto bGrid +template template -auto bGrid::newContainer(const std::string& name, - LoadingLambda lambda) const -> Neon::set::Container +auto bGrid::newContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container { const Neon::index_3d& defaultBlockSize = this->getDefaultBlock(); Neon::set::Container kContainer = Neon::set::Container::factory(name, @@ -311,50 +307,50 @@ auto bGrid -auto bGrid:: +template +auto bGrid:: getBlockViewGrid() const -> BlockViewGrid& { return mData->blockViewGrid; } -template -auto bGrid:: +template +auto bGrid:: getActiveBitMask() const -> BlockViewGrid::Field& { return mData->activeBitMask; } -template -auto bGrid:: +template +auto bGrid:: helpGetBlockConnectivity() const -> BlockViewGrid::Field& { return mData->blockConnectivity; } -template -auto bGrid:: +template +auto bGrid:: helpGetDataBlockOriginField() const -> Neon::aGrid::Field& { return mData->mDataBlockOriginField; } -template -auto bGrid::getSpan(Neon::Execution execution, - SetIdx setIdx, - Neon::DataView dataView) -> const bGrid::Span& +template +auto bGrid::getSpan(Neon::Execution execution, + SetIdx setIdx, + Neon::DataView dataView) -> const bGrid::Span& { return mData->spanTable.getSpan(execution, setIdx, dataView); } -template -bGrid::~bGrid() +template +bGrid::~bGrid() { } -template -auto bGrid::getSetIdx(const index_3d& idx) const -> int32_t +template +auto bGrid::getSetIdx(const index_3d& idx) const -> int32_t { typename GridBaseTemplate::CellProperties cellProperties; @@ -365,10 +361,10 @@ auto bGrid -auto bGrid::getLaunchParameters(Neon::DataView dataView, - const index_3d&, - const size_t& sharedMem) const -> Neon::set::LaunchParameters +template +auto bGrid::getLaunchParameters(Neon::DataView dataView, + const index_3d&, + const size_t& sharedMem) const -> Neon::set::LaunchParameters { auto res = mData->launchParametersTable.get(dataView); res.forEachSeq([&](SetIdx const& /*setIdx*/, @@ -378,19 +374,19 @@ auto bGrid -auto bGrid:: +template +auto bGrid:: helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet& { return mData->stencilIdTo3dOffset; } -template -auto bGrid::isInsideDomain(const index_3d& idx) const -> bool +template +auto bGrid::isInsideDomain(const index_3d& idx) const -> bool { // 1. check if the block is active - const index_3d blockIdx3d = idx / dataBlockSize3D; + const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); auto blockProperties = mData->blockViewGrid.getProperties(blockIdx3d); if (!blockProperties.isInside()) { @@ -399,17 +395,17 @@ auto bGridactiveBitMask.getReference(blockIdx3d, int(wordCardinality)); return (activeBits & mask) != 0; } -template -auto bGrid::getProperties(const index_3d& idx) +template +auto bGrid::getProperties(const index_3d& idx) const -> typename GridBaseTemplate::CellProperties { typename GridBaseTemplate::CellProperties cellProperties; @@ -422,7 +418,7 @@ auto bGridgetDevSet().setCardinality() == 1) { cellProperties.init(0, DataView::INTERNAL); } else { - const index_3d blockIdx3d = idx / dataBlockSize3D; + const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); auto blockViewProperty = mData->blockViewGrid.getProperties(blockIdx3d); cellProperties.init(blockViewProperty.getSetIdx(), @@ -431,17 +427,17 @@ auto bGrid -auto bGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) +template +auto bGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple { - const index_3d blockIdx3d = idx / dataBlockSize3D; + const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); auto [setIdx, bvGridIdx] = mData->blockViewGrid.helpGetSetIdxAndGridIdx(blockIdx3d); Idx bIdx; bIdx.mDataBlockIdx = bvGridIdx.helpGet(); - bIdx.mInDataBlockIdx.x = static_cast(idx.x % dataBlockSize3D.x); - bIdx.mInDataBlockIdx.y = static_cast(idx.y % dataBlockSize3D.y); - bIdx.mInDataBlockIdx.z = static_cast(idx.z % dataBlockSize3D.z); + bIdx.mInDataBlockIdx.x = static_cast(idx.x % SBlock::memBlockSize3D.x); + bIdx.mInDataBlockIdx.y = static_cast(idx.y % SBlock::memBlockSize3D.y); + bIdx.mInDataBlockIdx.z = static_cast(idx.z % SBlock::memBlockSize3D.z); return {setIdx, bIdx}; } diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h index 7b8d7bcf..bbf103d1 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h @@ -6,11 +6,11 @@ namespace Neon::domain::details::bGrid { // Common forward declarations -template +template class bGrid; -template +template class bSpan; -template +template class bPartition; class MicroIndex @@ -59,26 +59,24 @@ class MicroIndex TrayIdx mTrayBlockIdx{}; }; -template +template class bIndex { public: - template + template friend class bSpan; - using OuterIdx = bIndex; - - static constexpr Neon::uint32_3d memBlock3DSize = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); + using OuterIdx = bIndex; using NghIdx = int8_3d; - template + template friend class bPartition; - template + template friend class bField; - template + template friend class bSpan; - template + template friend class bGrid; @@ -109,25 +107,25 @@ class bIndex DataBlockIdx mDataBlockIdx{}; }; -template -NEON_CUDA_HOST_DEVICE auto bIndex::setDataBlockIdx(const bIndex::DataBlockIdx& dataBlockIdx) -> void +template +NEON_CUDA_HOST_DEVICE auto bIndex::setDataBlockIdx(const bIndex::DataBlockIdx& dataBlockIdx) -> void { mDataBlockIdx = dataBlockIdx; } -template -NEON_CUDA_HOST_DEVICE auto bIndex::setInDataBlockIdx(const bIndex::InDataBlockIdx& inDataBlockIdx) -> void +template +NEON_CUDA_HOST_DEVICE auto bIndex::setInDataBlockIdx(const bIndex::InDataBlockIdx& inDataBlockIdx) -> void { mInDataBlockIdx = inDataBlockIdx; } -template -NEON_CUDA_HOST_DEVICE auto bIndex::getDataBlockIdx() const -> const bIndex::DataBlockIdx& +template +NEON_CUDA_HOST_DEVICE auto bIndex::getDataBlockIdx() const -> const bIndex::DataBlockIdx& { return mDataBlockIdx; } -template -NEON_CUDA_HOST_DEVICE auto bIndex::getInDataBlockIdx() const -> const bIndex::InDataBlockIdx& +template +NEON_CUDA_HOST_DEVICE auto bIndex::getInDataBlockIdx() const -> const bIndex::InDataBlockIdx& { return mInDataBlockIdx; } diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h index a55fddbb..be45749d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h @@ -3,8 +3,8 @@ namespace Neon::domain::details::bGrid { -template -NEON_CUDA_HOST_DEVICE inline bIndex:: +template +NEON_CUDA_HOST_DEVICE inline bIndex:: bIndex(const DataBlockIdx& blockIdx, const InDataBlockIdx::Integer& x, const InDataBlockIdx::Integer& y, @@ -16,86 +16,52 @@ NEON_CUDA_HOST_DEVICE inline bIndex -// NEON_CUDA_HOST_DEVICE inline auto bIndex::getTrayIdx() -> TrayIdx -//{ -// -// TrayIdx const exBlockOffset = mDataBlockIdx * (userBlockSizeX * userBlockSizeY * userBlockSizeZ); -// TrayIdx const exTrayOffset = [&]() { -// int const trayBlockIdxX = mInDataBlockIdx.x / userBlockSizeX; -// int const trayBlockIdxY = mInDataBlockIdx.y / userBlockSizeY; -// int const trayBlockIdxZ = mInDataBlockIdx.z / userBlockSizeZ; -// -// constexpr int countMicroBlocksInTrayX = (memBlockSizeX / userBlockSizeX); -// constexpr int countMicroBlocksInTrayY = (memBlockSizeY / userBlockSizeY); -// -// int const res = trayBlockIdxX + trayBlockIdxY * countMicroBlocksInTrayX + -// trayBlockIdxZ * (countMicroBlocksInTrayX * countMicroBlocksInTrayY); -// return res; -// }; -// return exBlockOffset + exTrayOffset; -//} -// -// -// template -// NEON_CUDA_HOST_DEVICE inline auto bIndex::getInTrayIdx() -> InTrayIdx -//{ -// InTrayIdx inTrayIdx; -// inTrayIdx.x = mInDataBlockIdx.x % userBlockSizeX; -// inTrayIdx.y = mInDataBlockIdx.y % userBlockSizeY; -// inTrayIdx.z = mInDataBlockIdx.z % userBlockSizeZ; -// -// return inTrayIdx; -//} -template -NEON_CUDA_HOST_DEVICE inline auto bIndex::getMicroIndex() -> MicroIndex +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::getMicroIndex() -> MicroIndex { - constexpr uint32_t blockRatioX = memBlockSizeX / userBlockSizeX; - constexpr uint32_t blockRatioY = memBlockSizeY / userBlockSizeY; - constexpr uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ; - TrayIdx const exBlockOffset = mDataBlockIdx * (blockRatioX * blockRatioY * blockRatioZ); + + TrayIdx const exBlockOffset = mDataBlockIdx * (SBlock::blockRatioX * SBlock::blockRatioY * SBlock::blockRatioZ); TrayIdx const exTrayOffset = [&] { - TrayIdx const trayBlockIdxX = mInDataBlockIdx.x / userBlockSizeX; - TrayIdx const trayBlockIdxY = mInDataBlockIdx.y / userBlockSizeY; - TrayIdx const trayBlockIdxZ = mInDataBlockIdx.z / userBlockSizeZ; + TrayIdx const trayBlockIdxX = mInDataBlockIdx.x / SBlock::userBlockSizeX; + TrayIdx const trayBlockIdxY = mInDataBlockIdx.y / SBlock::userBlockSizeY; + TrayIdx const trayBlockIdxZ = mInDataBlockIdx.z / SBlock::userBlockSizeZ; - TrayIdx const res = trayBlockIdxX + trayBlockIdxY * blockRatioX + - trayBlockIdxZ * (blockRatioX * blockRatioY); + TrayIdx const res = trayBlockIdxX + trayBlockIdxY * SBlock::blockRatioX + + trayBlockIdxZ * (SBlock::blockRatioX * SBlock::blockRatioY); return res; }(); MicroIndex res; res.setTrayBlockIdx(exBlockOffset + exTrayOffset); - res.setInTrayBlockIdx({static_cast(mInDataBlockIdx.x % userBlockSizeX), - static_cast(mInDataBlockIdx.y % userBlockSizeY), - static_cast(mInDataBlockIdx.z % userBlockSizeZ)}); + res.setInTrayBlockIdx({static_cast(mInDataBlockIdx.x % SBlock::userBlockSizeX), + static_cast(mInDataBlockIdx.y % SBlock::userBlockSizeY), + static_cast(mInDataBlockIdx.z % SBlock::userBlockSizeZ)}); return res; } -template -NEON_CUDA_HOST_DEVICE inline auto bIndex::init(MicroIndex const& microIndex) -> void +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::init(MicroIndex const& microIndex) -> void { - constexpr uint32_t memBlockSize = memBlockSizeX * memBlockSizeY * memBlockSizeZ; - constexpr uint32_t userBlockSize = userBlockSizeX * userBlockSizeY * userBlockSizeZ; + constexpr uint32_t memBlockSize = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + constexpr uint32_t userBlockSize = SBlock::userBlockSizeX * SBlock::userBlockSizeY * SBlock::userBlockSizeZ; constexpr uint32_t blockRatioSize = memBlockSize / userBlockSize; - constexpr uint32_t blockRatioX = memBlockSizeX / userBlockSizeX; - constexpr uint32_t blockRatioY = memBlockSizeY / userBlockSizeY; + constexpr uint32_t blockRatioX = SBlock::memBlockSizeX / SBlock::userBlockSizeX; + constexpr uint32_t blockRatioY = SBlock::memBlockSizeY / SBlock::userBlockSizeY; mDataBlockIdx = microIndex.getTrayBlockIdx() / (blockRatioSize); uint32_t reminder = microIndex.getTrayBlockIdx() % (blockRatioSize); const uint32_t reminderInZ = reminder / (blockRatioX * blockRatioY); - mInDataBlockIdx.z = static_cast < InDataBlockIdx::Integer>( microIndex.getInTrayBlockIdx().z + reminderInZ * userBlockSizeZ); + mInDataBlockIdx.z = static_cast(microIndex.getInTrayBlockIdx().z + reminderInZ * SBlock::userBlockSizeZ); reminder = reminder % (blockRatioX * blockRatioY); const uint32_t reminderInY = reminder / (blockRatioX); - mInDataBlockIdx.y = static_cast(microIndex.getInTrayBlockIdx().y + reminderInY * userBlockSizeY); + mInDataBlockIdx.y = static_cast(microIndex.getInTrayBlockIdx().y + reminderInY * SBlock::userBlockSizeY); const uint32_t reminderInX = reminder % blockRatioX; - mInDataBlockIdx.x = static_cast(microIndex.getInTrayBlockIdx().x + reminderInX * userBlockSizeX); + mInDataBlockIdx.x = static_cast(microIndex.getInTrayBlockIdx().x + reminderInX * SBlock::userBlockSizeX); } } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 09db40e4..f20a513d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -9,15 +9,15 @@ namespace Neon::domain::details::bGrid { -template +template class bSpan; -template +template class bPartition { public: - using Span = bSpan; - using Idx = bIndex; + using Span = bSpan; + using Idx = bIndex; using NghIdx = typename Idx::NghIdx; using Type = T; using NghData = Neon::domain::NghData; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index db057f47..8506476b 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -5,8 +5,8 @@ namespace Neon::domain::details::bGrid { -template -bPartition::bPartition() +template +bPartition::bPartition() : mCardinality(0), mMem(nullptr), mStencilNghIndex(), @@ -17,8 +17,8 @@ bPartition -bPartition:: +template +bPartition:: bPartition(int setIdx, int cardinality, T* mem, @@ -36,8 +36,8 @@ bPartition -NEON_CUDA_HOST_DEVICE inline auto bPartition:: +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: getGlobalIndex(const Idx& gidx) const -> Neon::index_3d { @@ -48,8 +48,8 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition -NEON_CUDA_HOST_DEVICE inline auto bPartition:: +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: getBlockViewGridIdx(const Idx& gidx) const -> BlockViewGridIdx { @@ -58,32 +58,32 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: cardinality() const -> int { return mCardinality; } -template -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: operator()(const Idx& cell, int card) -> T& { return mMem[helpGetPitch(cell, card)]; } -template -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: operator()(const Idx& cell, int card) const -> const T& { return mMem[helpGetPitch(cell, card)]; } -template -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: helpGetPitch(const Idx& idx, int card) const -> uint32_t { @@ -92,22 +92,22 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: helpGetValidIdxPitchExplicit(const Idx& idx, int card) const -> uint32_t { - uint32_t const blockPitchByCard = memBlockSizeX * memBlockSizeY * memBlockSizeZ; + uint32_t const blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x + - memBlockSizeX * idx.mInDataBlockIdx.y + - (memBlockSizeX * memBlockSizeY) * idx.mInDataBlockIdx.z; + SBlock::memBlockSizeX * idx.mInDataBlockIdx.y + + (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z; uint32_t const blockAdnCardPitch = (idx.mDataBlockIdx * mCardinality + card) * blockPitchByCard; uint32_t const pitch = blockAdnCardPitch + inBlockInCardPitch; return pitch; } -template -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: helpNghPitch(const Idx& nghIdx, int card) const -> std::tuple { @@ -126,8 +126,8 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition -NEON_CUDA_HOST_DEVICE inline auto bPartition:: +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: helpGetNghIdx(const Idx& idx, const NghIdx& offset) const -> Idx @@ -142,9 +142,9 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition= memBlockSizeX ? +1 : 0); - const int yFlag = ngh.y < 0 ? -1 : (ngh.y >= memBlockSizeX ? +1 : 0); - const int zFlag = ngh.z < 0 ? -1 : (ngh.z >= memBlockSizeX ? +1 : 0); + const int xFlag = ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + const int yFlag = ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + const int zFlag = ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); const bool isLocal = (xFlag | yFlag | zFlag) == 0; if (!(isLocal)) { @@ -177,9 +177,9 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition -NEON_CUDA_HOST_DEVICE inline auto bPartition:: +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: getNghData(const Idx& eId, uint8_t nghID, int card) @@ -207,8 +207,8 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition -NEON_CUDA_HOST_DEVICE inline auto bPartition:: +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: getNghData(const Idx& idx, const NghIdx& offset, const int card) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h index bf91dc16..80fb12ab 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h @@ -4,7 +4,7 @@ namespace Neon::domain::details::bGrid { -template +template class bSpan { public: @@ -15,8 +15,8 @@ class bSpan static constexpr Neon::MemoryLayout activeMaskMemoryLayout = Neon::MemoryLayout::arrayOfStructs; static constexpr uint32_t log2OfbitMaskWordSize = 6; - using Idx = bIndex; - friend class bGrid; + using Idx = bIndex; + friend class bGrid; static constexpr int SpaceDim = 3; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h index 50f441a0..57d7aeca 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h @@ -2,9 +2,9 @@ namespace Neon::domain::details::bGrid { -template +template NEON_CUDA_HOST_DEVICE inline auto -bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool +bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool { #ifdef NEON_PLACE_CUDA_DEVICE bidx.mDataBlockIdx = blockIdx.x + mFirstDataBlockOffset; @@ -22,9 +22,9 @@ bSpan +template NEON_CUDA_HOST_DEVICE inline auto -bSpan::setAndValidateCPUDevice(Idx& bidx, +bSpan::setAndValidateCPUDevice(Idx& bidx, uint32_t const& dataBlockIdx, uint32_t const& x, uint32_t const& y, @@ -41,8 +41,8 @@ bSpan -bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, +template +bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, BitMaskWordType* activeMask, Neon::DataView dataView) : mFirstDataBlockOffset(firstDataBlockOffset), @@ -51,16 +51,16 @@ bSpan -NEON_CUDA_HOST_DEVICE inline auto bSpan::getRequiredWordsForBlockBitMask() -> uint32_t +template +NEON_CUDA_HOST_DEVICE inline auto bSpan::getRequiredWordsForBlockBitMask() -> uint32_t { - uint32_t requiredBits = memBlockSizeX * memBlockSizeY * memBlockSizeZ; - uint32_t requiredWords = ((requiredBits - 1) >> bSpan::log2OfbitMaskWordSize) + 1; + uint32_t requiredBits = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + uint32_t requiredWords = ((requiredBits - 1) >> bSpan::log2OfbitMaskWordSize) + 1; return requiredWords; } -template -inline auto bSpan::getMaskAndWordIdforBlockBitMask(int threadX, +template +inline auto bSpan::getMaskAndWordIdforBlockBitMask(int threadX, int threadY, int threadZ, NEON_OUT BitMaskWordType& mask, @@ -68,7 +68,7 @@ inline auto bSpan> log2OfbitMaskWordSize // the same as: threadPitch / 2^{log2OfbitMaskWordSize} wordIdx = threadPitch >> log2OfbitMaskWordSize; @@ -82,8 +82,8 @@ inline auto bSpan -NEON_CUDA_HOST_DEVICE inline auto bSpan::getActiveStatus( +template +NEON_CUDA_HOST_DEVICE inline auto bSpan::getActiveStatus( const typename Idx::DataBlockIdx& dataBlockIdx, int threadX, int threadY, @@ -92,7 +92,7 @@ NEON_CUDA_HOST_DEVICE inline auto bSpan> log2OfbitMaskWordSize // the same as: threadPitch / 2^{log2OfbitMaskWordSize} const uint32_t wordIdx = threadPitch >> log2OfbitMaskWordSize; diff --git a/libNeonDomain/src/domain/details/bGrid/bGrid.cpp b/libNeonDomain/src/domain/details/bGrid/bGrid.cpp index 0cc0dfef..78dad9bf 100644 --- a/libNeonDomain/src/domain/details/bGrid/bGrid.cpp +++ b/libNeonDomain/src/domain/details/bGrid/bGrid.cpp @@ -3,6 +3,6 @@ namespace Neon::domain::details::bGrid { -template class bGrid<8,8,8>; +template class bGrid>; } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp b/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp index 9e0cd408..794dfde0 100644 --- a/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp +++ b/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp @@ -4,38 +4,34 @@ #include "gtest/gtest.h" - -template +template void test_backToBackConversion() { - using bGrid = Neon::domain::details::bGrid::bGrid; + using bGrid = Neon::domain::details::bGrid::bGrid; using MicroIndex = Neon::domain::details::bGrid::MicroIndex; typename bGrid::Idx bIdx; MicroIndex microIdx; - uint32_t ratioOnX = (memBlockSizeX) / (userBlockSizeX); - uint32_t ratioOnY = (memBlockSizeY) / (userBlockSizeY); - uint32_t ratioOnZ = (memBlockSizeZ) / (userBlockSizeZ); for (uint32_t memBlockIdx = 0; memBlockIdx < 10; memBlockIdx++) { - const uint32_t memBlockJump = (ratioOnX*ratioOnY*ratioOnZ)*memBlockIdx; - for (uint32_t rZ = 0; rZ < ratioOnZ; rZ++) { - for (uint32_t rY = 0; rY < ratioOnY; rY++) { - for (uint32_t rX = 0; rX < ratioOnX; rX++) { - for (int8_t k = 0; k < int8_t(userBlockSizeX); k++) { - for (int8_t j = 0; j < int8_t(userBlockSizeY); j++) { - for (int8_t i = 0; i < int8_t(userBlockSizeZ); i++) { // Set the micro idx to the first voxel + const uint32_t memBlockJump = (SBlock::blockRatioX * SBlock::blockRatioY * SBlock::blockRatioZ) * memBlockIdx; + for (uint32_t rZ = 0; rZ < SBlock::blockRatioZ; rZ++) { + for (uint32_t rY = 0; rY < SBlock::blockRatioY; rY++) { + for (uint32_t rX = 0; rX < SBlock::blockRatioX; rX++) { + for (int8_t k = 0; k < int8_t(SBlock::userBlockSizeX); k++) { + for (int8_t j = 0; j < int8_t(SBlock::userBlockSizeY); j++) { + for (int8_t i = 0; i < int8_t(SBlock::userBlockSizeZ); i++) { // Set the micro idx to the first voxel // Check that bIdx point to the first voxels too - microIdx.setTrayBlockIdx(memBlockJump + rX + rY * ratioOnX + rZ * ratioOnY * ratioOnX); + microIdx.setTrayBlockIdx(memBlockJump + rX + rY * SBlock::blockRatioX + rZ * SBlock::blockRatioY * SBlock::blockRatioX); microIdx.setInTrayBlockIdx({i, j, k}); bIdx.init(microIdx); auto res = bIdx.getMicroIndex(); ASSERT_EQ(bIdx.getDataBlockIdx(), memBlockIdx); - ASSERT_EQ(bIdx.getInDataBlockIdx(), Neon::int8_3d(static_cast(i + rX * userBlockSizeX), - static_cast(j + rY * userBlockSizeY), - static_cast( k + rZ * userBlockSizeZ))) - << bIdx.getInDataBlockIdx() << " instead of " << Neon::int8_3d(static_cast(i + rX * userBlockSizeX), static_cast(j + rY * userBlockSizeY),static_cast( k + rZ * userBlockSizeZ)) << " with rX,Ry,rZ " << rX << "," << rY << "," << rZ << " and i,j,k = " << i << "," << j << "," << k; + ASSERT_EQ(bIdx.getInDataBlockIdx(), Neon::int8_3d(static_cast(i + rX * SBlock::userBlockSizeX), + static_cast(j + rY * SBlock::userBlockSizeY), + static_cast(k + rZ * SBlock::userBlockSizeZ))) + << bIdx.getInDataBlockIdx() << " instead of " << Neon::int8_3d(static_cast(i + rX * SBlock::userBlockSizeX), static_cast(j + rY * SBlock::userBlockSizeY), static_cast(k + rZ * SBlock::userBlockSizeZ)) << " with rX,Ry,rZ " << rX << "," << rY << "," << rZ << " and i,j,k = " << i << "," << j << "," << k; ASSERT_EQ(res.getTrayBlockIdx(), microIdx.getTrayBlockIdx()); @@ -51,27 +47,27 @@ void test_backToBackConversion() TEST(bGrid_tray, init_4_4_4_2_2_2) { - test_backToBackConversion<4, 4, 4, 2, 2, 2>(); + test_backToBackConversion>(); } TEST(bGrid_tray, init_8_8_8_2_2_2) { - test_backToBackConversion<8, 8, 8, 2, 2, 2>(); + test_backToBackConversion>(); } TEST(bGrid_tray, init_8_8_8_1_1_1) { - test_backToBackConversion<8, 8, 8, 1, 1, 1>(); + test_backToBackConversion>(); } TEST(bGrid_tray, init_8_8_8_4_4_4) { - test_backToBackConversion<8, 8, 8, 4, 4, 4>(); + test_backToBackConversion>(); } TEST(bGrid_tray, init_4_4_4_2_1_2) { - test_backToBackConversion<4,4,4, 2, 1, 2>(); + test_backToBackConversion>(); } int main(int argc, char** argv) From d82e985c2c92863daf39ae3d595d2dafc9ea443f Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 15 Jun 2023 17:54:05 -0400 Subject: [PATCH 04/25] Cleaning up naming for the BlockViewGrid --- .../bGrid/BlockViewGrid/BlockViewGrid.h | 2 +- .../Neon/domain/details/bGrid/StaticBlock.h | 13 +++++++++++ .../Neon/domain/details/bGrid/bField.h | 6 +++-- .../include/Neon/domain/details/bGrid/bGrid.h | 23 +++++++++++-------- .../Neon/domain/details/bGrid/bPartition.h | 12 ++++++---- 5 files changed, 38 insertions(+), 18 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h index 3f2f3544..cc714802 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h @@ -90,8 +90,8 @@ struct GridTransformation }); } }; +using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; } // namespace details -using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; } // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h index 612c6b9a..14872577 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h @@ -41,6 +41,19 @@ struct StaticBlock static_assert(memBlockSizeX % userBlockSizeX == 0); static_assert(memBlockSizeY % userBlockSizeY == 0); static_assert(memBlockSizeZ % userBlockSizeZ == 0); + + struct BitMask + { + auto reset() + { + for (uint32_t i = 0; i < nWords; ++i) { + bits[i] = 0; + } + } + + constexpr static uint32_t nWords = (memBlockCountElements + 31) / 32; + uint32_t bits[nWords]; + }; }; } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h index 95c1d6d5..8f1ac485 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h @@ -33,11 +33,13 @@ class bField : public Neon::domain::interface::FieldBaseTemplate; using Partition = bPartition; using Idx = bIndex; + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + template + using BlockViewField = BlockViewGrid::template Field; using NghIdx = typename Partition::NghIdx; using NghData = typename Partition::NghData; - bField(const std::string& fieldUserName, Neon::DataUse dataUse, const Neon::MemoryOptions& memoryOptions, @@ -109,7 +111,7 @@ class bField : public Neon::domain::interface::FieldBaseTemplate grid; - BlockViewGrid::Field memoryField; + BlockViewField memoryField; int mCardinality; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index 9d91df5d..a40935bb 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -5,6 +5,7 @@ #include "BlockViewGrid/BlockViewGrid.h" #include "Neon/domain/aGrid.h" +#include "Neon/domain/details/bGrid/StaticBlock.h" #include "Neon/domain/details/bGrid/bField.h" #include "Neon/domain/details/bGrid/bIndex.h" #include "Neon/domain/details/bGrid/bPartition.h" @@ -16,8 +17,6 @@ #include "Neon/domain/tools/SpanTable.h" #include "Neon/set/Containter.h" #include "Neon/set/LaunchParametersTable.h" -#include "Neon/domain/details/bGrid/StaticBlock.h" - #include "bField.h" #include "bPartition.h" @@ -31,7 +30,7 @@ class bField; template class bGrid : public Neon::domain::interface::GridBaseTemplate, - bIndex > + bIndex> { public: using Grid = bGrid; @@ -42,9 +41,13 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, template using Field = Neon::domain::details::bGrid::bField; + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + template + using BlockViewField = BlockViewGrid::template Field; + using Span = bSpan; using NghIdx = typename Partition::NghIdx; - using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate >; + using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate>; using Idx = bIndex; static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d1b3; @@ -124,9 +127,9 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, T inactiveValue, Neon::DataUse dataUse = Neon::DataUse::HOST_DEVICE, Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const - -> BlockViewGrid::Field; + -> BlockViewField; - /* + /** * Allocates a new container to execute some computation in the grid */ template , size_t sharedMem, LoadingLambda lambda) const -> Neon::set::Container; - /* + /** * Allocates a new container to execute some computation in the grid */ template , * Retrieve the block vew grid internally used. * This grid can be leverage to allocate data at the block level. */ - auto getActiveBitMask() const -> BlockViewGrid::Field&; + auto getActiveBitMask() const -> BlockViewField&; /** * Help function to retrieve the block connectivity as a BlockViewGrid field */ - auto helpGetBlockConnectivity() const -> BlockViewGrid::Field&; + auto helpGetBlockConnectivity() const -> BlockViewField&; /** * Help function to retrieve the block origin as a BlockViewGrid field @@ -228,7 +231,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, }; std::shared_ptr mData; }; -extern template class bGrid>; +extern template class bGrid>; } // namespace Neon::domain::details::bGrid #include "bField_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index f20a513d..48312b22 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -21,6 +21,8 @@ class bPartition using NghIdx = typename Idx::NghIdx; using Type = T; using NghData = Neon::domain::NghData; + + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; using BlockViewGridIdx = BlockViewGrid::Idx; public: @@ -90,13 +92,13 @@ class bPartition const -> Idx; - int mCardinality; - T* mMem; - NghIdx* mStencilNghIndex; + int mCardinality; + T* mMem; + NghIdx* mStencilNghIndex; typename Idx::DataBlockIdx* mBlockConnectivity; typename Span::BitMaskWordType* mMask; - Neon::int32_3d* mOrigin; - int mSetIdx; + Neon::int32_3d* mOrigin; + int mSetIdx; }; } // namespace Neon::domain::details::bGrid From 9e29f8e78a673dda0f587184ee981f0793b75fa0 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 15 Jun 2023 19:35:02 -0400 Subject: [PATCH 05/25] bGrid - introducing the concept of BlockView and refactoring the bitmask field. --- .../Neon/domain/details/bGrid/BlockView.h | 29 +++++ .../BlockViewGrid.h | 0 .../BlockViewPartition.h | 0 .../BlockViewPartition_imp.h | 0 .../Neon/domain/details/bGrid/StaticBlock.h | 53 +++++++++- .../include/Neon/domain/details/bGrid/bGrid.h | 32 +++--- .../Neon/domain/details/bGrid/bGrid_imp.h | 100 ++++++++---------- .../Neon/domain/details/bGrid/bPartition.h | 28 ++--- .../domain/details/bGrid/bPartition_imp.h | 19 ++-- .../include/Neon/domain/details/bGrid/bSpan.h | 40 +++---- .../Neon/domain/details/bGrid/bSpan_imp.h | 89 ++-------------- .../Neon/domain/details/eGrid/eField_imp.h | 2 +- .../Neon/domain/interface/FieldBase_imp.h | 2 +- 13 files changed, 185 insertions(+), 209 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h rename libNeonDomain/include/Neon/domain/details/bGrid/{BlockViewGrid => BlockView}/BlockViewGrid.h (100%) rename libNeonDomain/include/Neon/domain/details/bGrid/{BlockViewGrid => BlockView}/BlockViewPartition.h (100%) rename libNeonDomain/include/Neon/domain/details/bGrid/{BlockViewGrid => BlockView}/BlockViewPartition_imp.h (100%) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h new file mode 100644 index 00000000..42093147 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h @@ -0,0 +1,29 @@ +#include "Neon/domain/details/bGrid/BlockView/BlockViewGrid.h" +#include "Neon/domain/tools/GridTransformer.h" + +namespace Neon::domain::details::bGrid { + +struct BlockView +{ + public: + using Grid = Neon::domain::tool::GridTransformer::Grid; + template + using Field = Grid::template Field; + using index_3d = Neon::index_3d; + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * card]; + } + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * C]; + } + + static constexpr Neon::MemoryLayout layout = Neon::MemoryLayout::arrayOfStructs; +}; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewGrid.h similarity index 100% rename from libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h rename to libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewGrid.h diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition.h similarity index 100% rename from libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition.h rename to libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition.h diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition_imp.h similarity index 100% rename from libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition_imp.h rename to libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition_imp.h diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h index 14872577..951f9fd3 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h @@ -44,15 +44,60 @@ struct StaticBlock struct BitMask { - auto reset() + using BitMaskWordType = uint32_t; + auto reset() -> void { - for (uint32_t i = 0; i < nWords; ++i) { + for (BitMaskWordType i = 0; i < nWords; ++i) { bits[i] = 0; } } - constexpr static uint32_t nWords = (memBlockCountElements + 31) / 32; - uint32_t bits[nWords]; + auto setActive(int threadX, + int threadY, + int threadZ) -> void + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + word = word | mask; + } + + inline auto NEON_CUDA_HOST_DEVICE isActive(int threadX, + int threadY, + int threadZ) const -> bool + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + return (word & mask) != 0; + } + + static inline auto NEON_CUDA_HOST_DEVICE getMaskAndWordI(int threadX, + int threadY, + int threadZ, + NEON_OUT BitMaskWordType& mask, + NEON_OUT uint32_t& wordIdx) -> void + { + const uint32_t threadPitch = threadX * memBlockPitchX + + threadY * memBlockPitchY + + threadZ * memBlockPitchZ; + + // threadPitch >> log2_of_bitPerWord + // the same as: threadPitch / 2^{log2_of_bitPerWord} + wordIdx = threadPitch >> log2_of_bitPerWord; + // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1); + // same as threadPitch % 2^{log2OfbitMaskWordSize} + const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitPerWord)) - 1); + mask = BitMaskWordType(1) << offsetInWord; + } + + constexpr static BitMaskWordType nWords = (memBlockCountElements + 31) / 32; + static constexpr uint32_t log2_of_bitPerWord = 5; + static constexpr uint32_t bitPerWord = 32; + + BitMaskWordType bits[nWords]; }; }; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index a40935bb..8ed458c8 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -1,10 +1,8 @@ #pragma once #include "Neon/core/core.h" -#include "Neon/set/memory/memSet.h" - -#include "BlockViewGrid/BlockViewGrid.h" #include "Neon/domain/aGrid.h" +#include "Neon/domain/details/bGrid/BlockView.h" #include "Neon/domain/details/bGrid/StaticBlock.h" #include "Neon/domain/details/bGrid/bField.h" #include "Neon/domain/details/bGrid/bIndex.h" @@ -17,6 +15,7 @@ #include "Neon/domain/tools/SpanTable.h" #include "Neon/set/Containter.h" #include "Neon/set/LaunchParametersTable.h" +#include "Neon/set/memory/memSet.h" #include "bField.h" #include "bPartition.h" @@ -34,17 +33,11 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, { public: using Grid = bGrid; - template using Partition = bPartition; - template using Field = Neon::domain::details::bGrid::bField; - using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; - template - using BlockViewField = BlockViewGrid::template Field; - using Span = bSpan; using NghIdx = typename Partition::NghIdx; using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate>; @@ -127,7 +120,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, T inactiveValue, Neon::DataUse dataUse = Neon::DataUse::HOST_DEVICE, Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const - -> BlockViewField; + -> BlockView::Field; /** * Allocates a new container to execute some computation in the grid @@ -165,30 +158,30 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, * Retrieve the block vew grid internally used. * This grid can be leverage to allocate data at the block level. */ - auto getBlockViewGrid() const -> BlockViewGrid&; + auto getBlockViewGrid() const -> BlockView::Grid&; /** * Retrieve the block vew grid internally used. * This grid can be leverage to allocate data at the block level. */ - auto getActiveBitMask() const -> BlockViewField&; + auto getActiveBitMask() const -> BlockView::Field&; /** * Help function to retrieve the block connectivity as a BlockViewGrid field */ - auto helpGetBlockConnectivity() const -> BlockViewField&; + auto helpGetBlockConnectivity() const -> BlockView::Field&; /** * Help function to retrieve the block origin as a BlockViewGrid field */ auto helpGetDataBlockOriginField() const -> Neon::aGrid::Field&; - /* + /** * Help function to retrieve the map that converts a stencil point id to 3d offset */ auto helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet&; - /* + /** * Help function retriev the device and the block index associated to a point in the BlockViewGrid grid */ auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple; @@ -212,11 +205,10 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, Neon::aGrid::Field mDataBlockOriginField; Neon::set::MemSet mStencil3dTo1dOffset; - BlockViewGrid blockViewGrid; - BlockViewGrid::Field activeBitMask; - BlockViewGrid::Field blockConnectivity; - - Neon::set::MemSet stencilIdTo3dOffset; + BlockView::Grid blockViewGrid; + BlockView::Field activeBitField; + BlockView::Field blockConnectivity; + Neon::set::MemSet stencilIdTo3dOffset; tool::Partitioner1D::DenseMeta denseMeta; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index 1b40a8b7..7505a06b 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -75,66 +75,60 @@ bGrid::bGrid(const Neon::Backend& backend, spacingData * SBlock::memBlockSize3D, origin); - mData->blockViewGrid = BlockViewGrid(egrid); + mData->blockViewGrid = BlockView::Grid(egrid); } { // Active bitmask - int requiredWords = Span::getRequiredWordsForBlockBitMask(); - mData->activeBitMask = mData->blockViewGrid.template newField("BitMask", - requiredWords, - 0, - Neon::DataUse::HOST_DEVICE, backend.getMemoryOptions(Span::activeMaskMemoryLayout)); + mData->activeBitField = mData->blockViewGrid.template newField( + "BlockViewBitMask", + 1, + [] { + typename SBlock::BitMask outsideBitMask; + outsideBitMask.reset(); + return outsideBitMask; + }(), + Neon::DataUse::HOST_DEVICE, backend.getMemoryOptions(BlockView::layout)); mData->mNumActiveVoxel = backend.devSet().template newDataSet(); - mData->activeBitMask + mData->activeBitField .getGrid() .template newContainer( "activeBitMaskInit", [&](Neon::set::Loader& loader) { - auto bitMask = loader.load(mData->activeBitMask); - return [&, bitMask](const auto& bitMaskIdx) mutable { - auto prtIdx = bitMask.prtID(); - int coutActive = 0; - auto const blockOrigin = bitMask.getGlobalIndex(bitMaskIdx); - - for (int c = 0; c < bitMask.cardinality(); c++) { - bitMask(bitMaskIdx, c) = 0; - } + auto bitMaskPartition = loader.load(mData->activeBitField); + return [&, bitMaskPartition](const auto& bitMaskIdx) mutable { + auto prtIdx = bitMaskPartition.prtID(); + int countActive = 0; + auto const blockOrigin = bitMaskPartition.getGlobalIndex(bitMaskIdx); + typename SBlock::BitMask& bitMask = bitMaskPartition(bitMaskIdx, 0); + bitMask.reset(); for (int k = 0; k < SBlock::memBlockSize3D.template newType().z; k++) { for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { - - Neon::int32_3d localPosition(i, j, k); - typename Span::BitMaskWordType mask; - uint32_t wordIdx; - - Span::getMaskAndWordIdforBlockBitMask(i, j, k, NEON_OUT mask, NEON_OUT wordIdx); - auto globalPosition = localPosition + blockOrigin; - bool isInDomain = globalPosition < domainSize; - bool isActive = activeCellLambda(globalPosition); + auto globalPosition = blockOrigin + Neon::int32_3d(i, j, k); + bool const isInDomain = globalPosition < domainSize; + bool const isActive = activeCellLambda(globalPosition); if (isActive && isInDomain) { - coutActive++; - auto value = bitMask(bitMaskIdx, wordIdx); - value = value | mask; - bitMask(bitMaskIdx, wordIdx) = value; + countActive++; + bitMask.setActive(i, j, k); } } } } #pragma omp critical { - mData->mNumActiveVoxel[prtIdx] += coutActive; + mData->mNumActiveVoxel[prtIdx] += countActive; } }; }) .run(Neon::Backend::mainStreamIdx); - mData->activeBitMask.updateDeviceData(Neon::Backend::mainStreamIdx); - mData->activeBitMask.newHaloUpdate(Neon::set::StencilSemantic::standard, - Neon::set::TransferMode::put, - Neon::Execution::device) + mData->activeBitField.updateDeviceData(Neon::Backend::mainStreamIdx); + mData->activeBitField.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::put, + Neon::Execution::device) .run(Neon::Backend::mainStreamIdx); } @@ -184,20 +178,20 @@ bGrid::bGrid(const Neon::Backend& backend, case Neon::DataView::STANDARD: { span.mFirstDataBlockOffset = 0; span.mDataView = dw; - span.mActiveMask = mData->activeBitMask.getPartition(execution, setIdx, dw).mem(); + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); break; } case Neon::DataView::BOUNDARY: { span.mFirstDataBlockOffset = mData->partitioner1D.getSpanClassifier().countInternal(setIdx); span.mDataView = dw; - span.mActiveMask = mData->activeBitMask.getPartition(execution, setIdx, dw).mem(); + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); break; } case Neon::DataView::INTERNAL: { span.mFirstDataBlockOffset = 0; span.mDataView = dw; - span.mActiveMask = mData->activeBitMask.getPartition(execution, setIdx, dw).mem(); + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); break; } default: { @@ -267,10 +261,10 @@ auto bGrid::newBlockViewField(const std::string name, int cardinality, T inactiveValue, Neon::DataUse dataUse, - Neon::MemoryOptions memoryOptions) const -> BlockViewGrid::Field + Neon::MemoryOptions memoryOptions) const -> BlockView::Field { memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); - BlockViewGrid::Field blockViewField = mData->blockViewGrid.template newField(name, cardinality, inactiveValue, dataUse, memoryOptions); + BlockView::Field blockViewField = mData->blockViewGrid.template newField(name, cardinality, inactiveValue, dataUse, memoryOptions); return blockViewField; } @@ -310,7 +304,7 @@ auto bGrid::newContainer(const std::string& name, template auto bGrid:: getBlockViewGrid() - const -> BlockViewGrid& + const -> BlockView::Grid& { return mData->blockViewGrid; } @@ -318,15 +312,15 @@ auto bGrid:: template auto bGrid:: getActiveBitMask() - const -> BlockViewGrid::Field& + const -> BlockView::Field& { - return mData->activeBitMask; + return mData->activeBitField; } template auto bGrid:: helpGetBlockConnectivity() - const -> BlockViewGrid::Field& + const -> BlockView::Field& { return mData->blockConnectivity; } @@ -386,22 +380,18 @@ template auto bGrid::isInsideDomain(const index_3d& idx) const -> bool { // 1. check if the block is active - const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); - auto blockProperties = mData->blockViewGrid.getProperties(blockIdx3d); + const BlockView::index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); + auto blockProperties = mData->blockViewGrid.getProperties(blockIdx3d); if (!blockProperties.isInside()) { return false; } - // 2. The block is active, check the element on the block - uint32_t wordCardinality; - typename Span::BitMaskWordType mask; - Span::getMaskAndWordIdforBlockBitMask(idx.x % SBlock::memBlockSize3D.x, - idx.y % SBlock::memBlockSize3D.y, - idx.z % SBlock::memBlockSize3D.z, - NEON_OUT mask, - NEON_OUT wordCardinality); - auto activeBits = mData->activeBitMask.getReference(blockIdx3d, int(wordCardinality)); - return (activeBits & mask) != 0; + // 2. The block is active, check the element in the block + typename SBlock::BitMask const& bitMask = mData->activeBitField.getReference(blockIdx3d, 0); + bool isActive = bitMask.isActive(idx.x % SBlock::memBlockSize3D.x, + idx.y % SBlock::memBlockSize3D.y, + idx.z % SBlock::memBlockSize3D.z); + return isActive; } template diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 48312b22..7f537ad5 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -30,13 +30,13 @@ class bPartition ~bPartition() = default; - explicit bPartition(int setIdx, - int mCardinality, - T* mMem, - typename Idx::DataBlockIdx* mBlockConnectivity, - typename Span::BitMaskWordType* mMask, - Neon::int32_3d* mOrigin, - NghIdx* mStencilNghIndex); + explicit bPartition(int setIdx, + int mCardinality, + T* mMem, + typename Idx::DataBlockIdx* mBlockConnectivity, + typename SBlock::BitMask const* NEON_RESTRICT mMask, + Neon::int32_3d* mOrigin, + NghIdx* mStencilNghIndex); inline NEON_CUDA_HOST_DEVICE auto cardinality() @@ -92,13 +92,13 @@ class bPartition const -> Idx; - int mCardinality; - T* mMem; - NghIdx* mStencilNghIndex; - typename Idx::DataBlockIdx* mBlockConnectivity; - typename Span::BitMaskWordType* mMask; - Neon::int32_3d* mOrigin; - int mSetIdx; + int mCardinality; + T* mMem; + NghIdx const* NEON_RESTRICT mStencilNghIndex; + typename Idx::DataBlockIdx const* NEON_RESTRICT mBlockConnectivity; + typename SBlock::BitMask const* NEON_RESTRICT mMask; + Neon::int32_3d const* NEON_RESTRICT mOrigin; + int mSetIdx; }; } // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index 8506476b..6e3b728f 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -19,13 +19,13 @@ bPartition::bPartition() template bPartition:: - bPartition(int setIdx, - int cardinality, - T* mem, - typename Idx::DataBlockIdx* blockConnectivity, - typename Span::BitMaskWordType* mask, - Neon::int32_3d* origin, - NghIdx* stencilNghIndex) + bPartition(int setIdx, + int cardinality, + T* mem, + typename Idx::DataBlockIdx* blockConnectivity, + typename SBlock::BitMask const* NEON_RESTRICT mask, + Neon::int32_3d* origin, + NghIdx* stencilNghIndex) : mCardinality(cardinality), mMem(mem), mStencilNghIndex(stencilNghIndex), @@ -115,10 +115,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition:: return {false, 0}; } - bool isActive = Span::getActiveStatus(nghIdx.mDataBlockIdx, - nghIdx.mInDataBlockIdx.x, nghIdx.mInDataBlockIdx.y, nghIdx.mInDataBlockIdx.z, - mMask); - + const bool isActive = mMask[nghIdx.mDataBlockIdx].isActive(nghIdx.mInDataBlockIdx.x, nghIdx.mInDataBlockIdx.y, nghIdx.mInDataBlockIdx.z); if (!isActive) { return {false, 0}; } diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h index 80fb12ab..9c6ed821 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h @@ -23,42 +23,32 @@ class bSpan bSpan() = default; virtual ~bSpan() = default; - NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId() -> typename Idx::DataBlockIdx + NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId() + -> typename Idx::DataBlockIdx { return std::numeric_limits::max(); } - inline bSpan(typename Idx::DataBlockCount mFirstDataBlockOffset, - bSpan::BitMaskWordType* mActiveMask, - Neon::DataView mDataView); + inline bSpan( + typename Idx::DataBlockCount mFirstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask, + Neon::DataView mDataView); - NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice(Idx& bidx, - uint32_t const& threadIdx, - uint32_t const& x, - uint32_t const& y, - uint32_t const& z) const -> bool; + NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice( + Idx& bidx, + uint32_t const& threadIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool; NEON_CUDA_HOST_DEVICE inline auto setAndValidateGPUDevice( Idx& bidx) const -> bool; - static NEON_CUDA_HOST_DEVICE inline auto getRequiredWordsForBlockBitMask() -> uint32_t; - static NEON_CUDA_HOST_DEVICE inline auto getActiveStatus( - const typename Idx::DataBlockIdx& dataBlockIdx, - int threadX, - int threadY, - int threadZ, - bSpan::BitMaskWordType* mActiveMask) -> bool; - - static inline auto getMaskAndWordIdforBlockBitMask(int threadX, - int threadY, - int threadZ, - BitMaskWordType& mask, - uint32_t& wordIdx) -> void; // We don't need to have a count on active blocks - typename Idx::DataBlockCount mFirstDataBlockOffset; - bSpan::BitMaskWordType* mActiveMask; - Neon::DataView mDataView; + typename Idx::DataBlockCount mFirstDataBlockOffset; + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask; + Neon::DataView mDataView; }; } // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h index 57d7aeca..8a208110 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h @@ -12,10 +12,8 @@ bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool bidx.mInDataBlockIdx.y = threadIdx.y; bidx.mInDataBlockIdx.z = threadIdx.z; - bool const isActive = getActiveStatus(bidx.mDataBlockIdx, - bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z, - mActiveMask); - // printf("%d %d %d is active %d\n",bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z, (isActive?1:-1)); + const bool isActive = mActiveMask[bidx.mDataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + return isActive; #else NEON_THROW_UNSUPPORTED_OPERATION("Operation supported only on GPU"); @@ -25,94 +23,29 @@ bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool template NEON_CUDA_HOST_DEVICE inline auto bSpan::setAndValidateCPUDevice(Idx& bidx, - uint32_t const& dataBlockIdx, - uint32_t const& x, - uint32_t const& y, - uint32_t const& z) const -> bool + uint32_t const& dataBlockIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool { bidx.mDataBlockIdx = dataBlockIdx; - bidx.mInDataBlockIdx.x = static_cast < typename Idx::InDataBlockIdx::Integer>(x); + bidx.mInDataBlockIdx.x = static_cast(x); bidx.mInDataBlockIdx.y = static_cast(y); bidx.mInDataBlockIdx.z = static_cast(z); - bool const isActive = getActiveStatus(bidx.mDataBlockIdx, - bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z, - mActiveMask); + const bool isActive = mActiveMask[dataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); return isActive; } template -bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, - BitMaskWordType* activeMask, - Neon::DataView dataView) +bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT activeMask, + Neon::DataView dataView) : mFirstDataBlockOffset(firstDataBlockOffset), mActiveMask(activeMask), mDataView(dataView) { } -template -NEON_CUDA_HOST_DEVICE inline auto bSpan::getRequiredWordsForBlockBitMask() -> uint32_t -{ - uint32_t requiredBits = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; - uint32_t requiredWords = ((requiredBits - 1) >> bSpan::log2OfbitMaskWordSize) + 1; - return requiredWords; -} - -template -inline auto bSpan::getMaskAndWordIdforBlockBitMask(int threadX, - int threadY, - int threadZ, - NEON_OUT BitMaskWordType& mask, - NEON_OUT uint32_t& wordIdx) -> void -{ - if constexpr (activeMaskMemoryLayout == Neon::MemoryLayout::arrayOfStructs) { - // 6 = log_2 64 - const uint32_t threadPitch = threadX + threadY * SBlock::memBlockSizeX + threadZ * SBlock::memBlockSizeX * SBlock::memBlockSizeY; - // threadPitch >> log2OfbitMaskWordSize - // the same as: threadPitch / 2^{log2OfbitMaskWordSize} - wordIdx = threadPitch >> log2OfbitMaskWordSize; - // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1); - // same as threadPitch % 2^{log2OfbitMaskWordSize} - const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitMaskStorageBitWidth)) - 1); - mask = BitMaskWordType(1) << offsetInWord; - } else { - assert(false); - } -} - - -template -NEON_CUDA_HOST_DEVICE inline auto bSpan::getActiveStatus( - const typename Idx::DataBlockIdx& dataBlockIdx, - int threadX, - int threadY, - int threadZ, - BitMaskWordType* mActiveMask) -> bool -{ - if constexpr (activeMaskMemoryLayout == Neon::MemoryLayout::arrayOfStructs) { - // 6 = log_2 64 - const uint32_t threadPitch = threadX + threadY * SBlock::memBlockSizeX + threadZ * SBlock::memBlockSizeX * SBlock::memBlockSizeY; - // threadPitch >> log2OfbitMaskWordSize - // the same as: threadPitch / 2^{log2OfbitMaskWordSize} - const uint32_t wordIdx = threadPitch >> log2OfbitMaskWordSize; - // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1); - // same as threadPitch % 2^{log2OfbitMaskWordSize} - const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitMaskStorageBitWidth)) - 1); - BitMaskWordType mask = BitMaskWordType(1) << offsetInWord; - - uint32_t const cardinality = getRequiredWordsForBlockBitMask(); - uint32_t const pitch = (cardinality * dataBlockIdx) + wordIdx; - BitMaskWordType targetWord = mActiveMask[pitch]; - auto masked = targetWord & mask; - if (masked != 0) { - return true; - } - return false; - } else { - assert(false); - } - // -} } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h index 5cd93860..1843c4df 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h @@ -35,7 +35,7 @@ eField::eField(const std::string& fieldUserName, mData->memoryField = mData->grid->getMemoryGrid().template newField(fieldUserName + "-storage", cardinality, - T(0), + inactiveValue, dataUse); diff --git a/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h b/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h index ea10edf6..97d10dc1 100644 --- a/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h +++ b/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h @@ -359,7 +359,7 @@ template FieldBase::Storage::Storage() : dimension(0), cardinality(0), - outsideVal(static_cast(0.0)), + outsideVal(T()), dataUse(), memoryOptions(), haloStatus(), From cdcdc0df3dcf91aa7927f9085e30041d057a7336 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 15 Jun 2023 20:09:20 -0400 Subject: [PATCH 06/25] bGrid - fixing multi-GPU --- .../include/Neon/domain/details/bGrid/bField.h | 18 +++++++++--------- .../Neon/domain/details/bGrid/bField_imp.h | 15 ++++++++------- .../Neon/domain/details/bGrid/bGrid_imp.h | 2 ++ .../tests/domain-stencil/src/gtests.cpp | 4 ++-- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h index 8f1ac485..d0dd45c5 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h @@ -34,18 +34,18 @@ class bField : public Neon::domain::interface::FieldBaseTemplate; using Idx = bIndex; using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; - template + template using BlockViewField = BlockViewGrid::template Field; using NghIdx = typename Partition::NghIdx; using NghData = typename Partition::NghData; - bField(const std::string& fieldUserName, - Neon::DataUse dataUse, - const Neon::MemoryOptions& memoryOptions, - const Grid& grid, - int cardinality, - T inactiveValue); + bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue); bField(); @@ -110,8 +110,8 @@ class bField : public Neon::domain::interface::FieldBaseTemplate grid; - BlockViewField memoryField; + std::shared_ptr grid; + BlockViewField memoryField; int mCardinality; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index a9c249ca..a6127c43 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -11,12 +11,12 @@ bField::bField() } template -bField::bField(const std::string& fieldUserName, - Neon::DataUse dataUse, - const Neon::MemoryOptions& memoryOptions, - const Grid& grid, - int cardinality, - T inactiveValue) +bField::bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue) : Neon::domain::interface::FieldBaseTemplate(&grid, fieldUserName, "bField", @@ -29,7 +29,8 @@ bField::bField(const std::string& fieldUserName, mData->grid = std::make_shared(grid); if (memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs) { - NEON_THROW_UNSUPPORTED_OPERATION("bField does not support MemoryLayout::arrayOfStructs"); + NEON_WARNING("bField does not support MemoryLayout::arrayOfStructs, enforcing MemoryLayout::structOfArrays"); + memoryOptions.setOrder(Neon::MemoryLayout::structOfArrays); } // the allocation size is the number of blocks x block size x cardinality mData->memoryField = mData->grid->getBlockViewGrid().template newField( diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index 7505a06b..b921a3e1 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -125,7 +125,9 @@ bGrid::bGrid(const Neon::Backend& backend, }) .run(Neon::Backend::mainStreamIdx); + mData->activeBitField.updateDeviceData(Neon::Backend::mainStreamIdx); + this->getBackend().sync(Neon::Backend::mainStreamIdx); mData->activeBitField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device) diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp index 70d9d650..ec6f892a 100644 --- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp +++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp @@ -22,9 +22,9 @@ TEST(domain_stencil, eGrid) 1); } -TEST(domain_stencil, bGridSingleGPU) +TEST(domain_stencil, bGri ) { - int nGpus = 1; + int nGpus = 5; using Type = int64_t; runAllTestConfiguration(std::function(map::run), nGpus, From ea82dfc1c0553a64f5c0bd0d8b23b4be04436195 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 16 Jun 2023 10:14:27 -0400 Subject: [PATCH 07/25] Adding scripts --- .../lbm-lid-driven-cavity-flow.py | 48 +++++++++++++++++++ .../lbm-lid-driven-cavity-flow.sh | 30 ++++++------ 2 files changed, 64 insertions(+), 14 deletions(-) create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py new file mode 100644 index 00000000..f4b48dd3 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -0,0 +1,48 @@ +import subprocess + +DOMAIN_SIZE_LIST = "64 128 192 256 320 384 448 512".split() +DEVICE_ID_LIST = "0 1 2 3 4 5 6 7".split() +DEVICE_TYPE_LIST = 'cpu gpu'.split() +GRID_LIST = "dGrid bGrid eGrid".split() +STORAGE_FP_LIST = "double float".split() +COMPUTE_FP_LIST = "double float".split() +OCC_LIST = "nOCC".split() +WARM_UP_ITER = 10 +MAX_ITER = 100 +REPETITIONS = 5 + +for DEVICE_TYPE in DEVICE_TYPE_LIST: + + DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] + if DEVICE_TYPE == 'gpu': + for DEVICE in DEVICE_ID_LIST[1:]: + DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + + command = 'lbm-lid-driven-cavity-flow' + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + DEVICE_TYPE) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append( + '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + + DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' + + STORAGE_FP + '_' + COMPUTE_FP + '_' + + DEVICE_SET.replace(' ', '_') + '_' + OCC) + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--benchmark') + parameters.append('--' + OCC) + + subprocess.run(['echo' , ' '.join(parameters)]) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh index ba5fe106..7cc5108c 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh @@ -1,7 +1,7 @@ set -x -DOMAIN_SIZE_LIST="128 192 256 320 384 448 512" -GRID="dGrid" +DOMAIN_SIZE_LIST="64 128 192 256 320 384 448 512" +GRID_LIST="dGrid bGrid eGrid" STORAGE_FP_LIST="double float" COMPUTE_FP_LIST="double float" OCC="nOCC" @@ -9,20 +9,22 @@ OCC="nOCC" for DOMAIN_SIZE in ${DOMAIN_SIZE_LIST}; do for STORAGE_FP in ${STORAGE_FP_LIST}; do for COMPUTE_FP in ${COMPUTE_FP_LIST}; do + for GRID in ${GRID_LIST}; do - if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then - continue - fi + if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then + continue + fi - echo ./lbm-lid-driven-cavity-flow \ - --deviceType gpu --deviceIds 0 \ - --grid "${GRID}" \ - --domain-size "${DOMAIN_SIZE}" \ - --warmup-iter 10 --max-iter 100 --repetitions 5 \ - --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \ - --computeFP "${COMPUTE_FP}" \ - --storageFP "${STORAGE_FP}" \ - --${OCC} --benchmark + echo ./lbm-lid-driven-cavity-flow \ + --deviceType gpu --deviceIds 0 \ + --grid "${GRID}" \ + --domain-size "${DOMAIN_SIZE}" \ + --warmup-iter 10 --max-iter 100 --repetitions 5 \ + --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \ + --computeFP "${COMPUTE_FP}" \ + --storageFP "${STORAGE_FP}" \ + --${OCC} --benchmark + done done done done From 55af7081427f22a352ecfe69cb03c2ad722c16df Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 19 Jun 2023 09:54:52 -0400 Subject: [PATCH 08/25] Benchmarks and scripts Adding scripts Adding scripts Adding scripts Adding scripts Adding back eGrid and bGrid to the LBM benchmark. Fixing warning issue Fixing script. --- .../lbm-lid-driven-cavity-flow/CMakeLists.txt | 9 +- .../lbm-lid-driven-cavity-flow.py | 123 ++++++++++++----- .../src/RunCavityTwoPop.cu | 8 +- .../Neon/domain/details/bGrid/bField.h | 2 +- .../Neon/domain/details/bGrid/bField_imp.h | 2 +- .../Neon/domain/details/bGrid/bPartition.h | 48 ++++++- .../domain/details/bGrid/bPartition_imp.h | 130 +++++++++++++++++- .../Neon/domain/details/eGrid/ePartition.h | 39 ++++-- .../domain/details/eGrid/ePartition_imp.h | 37 +++++ libNeonSet/include/Neon/set/Containter_imp.h | 1 + 10 files changed, 336 insertions(+), 63 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt b/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt index dfb18a8c..ed03a750 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt +++ b/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt @@ -23,4 +23,11 @@ add_custom_command( TARGET ${APP} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.sh - ${CMAKE_BINARY_DIR}/bin/${APP}.sh) \ No newline at end of file + ${CMAKE_BINARY_DIR}/bin/${APP}.sh) + +add_custom_command( + TARGET ${APP} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.py + ${CMAKE_BINARY_DIR}/bin/${APP}.py +) \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index f4b48dd3..5aebe104 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -1,5 +1,3 @@ -import subprocess - DOMAIN_SIZE_LIST = "64 128 192 256 320 384 448 512".split() DEVICE_ID_LIST = "0 1 2 3 4 5 6 7".split() DEVICE_TYPE_LIST = 'cpu gpu'.split() @@ -11,38 +9,89 @@ MAX_ITER = 100 REPETITIONS = 5 -for DEVICE_TYPE in DEVICE_TYPE_LIST: - - DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] - if DEVICE_TYPE == 'gpu': - for DEVICE in DEVICE_ID_LIST[1:]: - DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) - for OCC in OCC_LIST: - for DOMAIN_SIZE in DOMAIN_SIZE_LIST: - for STORAGE_FP in STORAGE_FP_LIST: - for COMPUTE_FP in COMPUTE_FP_LIST: - for DEVICE_SET in DEVICE_SET_LIST: - - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue - - command = 'lbm-lid-driven-cavity-flow' - parameters = [] - parameters.append('--deviceType ' + DEVICE_TYPE) - parameters.append('--deviceIds ' + DEVICE_SET) - parameters.append('--grid ' + DEVICE_TYPE) - parameters.append('--domain-size ' + DOMAIN_SIZE) - parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) - parameters.append('--repetitions ' + str(REPETITIONS)) - parameters.append('--max-iter ' + str(MAX_ITER)) - parameters.append( - '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + - DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' + - STORAGE_FP + '_' + COMPUTE_FP + '_' + - DEVICE_SET.replace(' ', '_') + '_' + OCC) - parameters.append('--computeFP ' + COMPUTE_FP) - parameters.append('--storageFP ' + STORAGE_FP) - parameters.append('--benchmark') - parameters.append('--' + OCC) - - subprocess.run(['echo' , ' '.join(parameters)]) +import subprocess +import sys + + +def printProgressBar(value, label): + n_bar = 40 # size of progress bar + max = 100 + j = value / max + sys.stdout.write('\r') + bar = 'â–ˆ' * int(n_bar * j) + bar = bar + '-' * int(n_bar * (1 - j)) + + sys.stdout.write(f"{label.ljust(10)} | [{bar:{n_bar}s}] {int(100 * j)}% ") + sys.stdout.flush() + + +def countAll(): + counter = 0 + for DEVICE_TYPE in DEVICE_TYPE_LIST: + DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] + if DEVICE_TYPE == 'gpu': + for DEVICE in DEVICE_ID_LIST[1:]: + DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + for GRID in GRID_LIST: + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + + counter += 1 + return counter + + +SAMPLES = countAll() +counter = 0 +command = './lbm-lid-driven-cavity-flow' +with open(command + '.log', 'w') as fp: + for DEVICE_TYPE in DEVICE_TYPE_LIST: + DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] + if DEVICE_TYPE == 'gpu': + for DEVICE in DEVICE_ID_LIST[1:]: + DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + for GRID in GRID_LIST: + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append( + '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + + DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' + + STORAGE_FP + '_' + COMPUTE_FP + '_' + + DEVICE_SET.replace(' ', '_') + '_' + OCC) + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--benchmark') + parameters.append('--' + OCC) + + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) + + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + subprocess.run(commandList, text=True, stdout=fp) + + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index 2ca5e128..c603415c 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -128,7 +128,7 @@ auto run(Config& config, }, Neon::computeMode_t::seq); - //sort the position so the linear interpolation works + // sort the position so the linear interpolation works std::sort(xPosVal.begin(), xPosVal.end(), [=](std::pair& a, std::pair& b) { return a.first < b.first; }); @@ -308,12 +308,10 @@ auto run(Config& config, return details::runFilterStoreType(config, report); } if (config.gridType == "eGrid") { - NEON_DEV_UNDER_CONSTRUCTION(""); - // return details::runFilterStoreType(config, report); + return details::runFilterStoreType(config, report); } if (config.gridType == "bGrid") { - NEON_DEV_UNDER_CONSTRUCTION(""); - // return details::runFilterStoreType(config, report); + return details::runFilterStoreType(config, report); } } } // namespace CavityTwoPop diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h index d0dd45c5..d4d663fd 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h @@ -111,7 +111,7 @@ class bField : public Neon::domain::interface::FieldBaseTemplate grid; - BlockViewField memoryField; + BlockViewField memoryField; int mCardinality; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index a6127c43..29a71248 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -250,7 +250,7 @@ auto bField::initHaloUpdateTable() -> void setIdxVec[Data::EndPoints::src] = setIdxSrc; std::array partitions; - std::array*, Data::EndPointsUtils::nConfigs> blockViewPartitions; + std::array*, Data::EndPointsUtils::nConfigs> blockViewPartitions; std::array, Data::EndPointsUtils::nConfigs> ghostZBeginIdx; std::array, Data::EndPointsUtils::nConfigs> boundaryZBeginIdx; std::array memPhyDim; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 7f537ad5..35abdc50 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -38,42 +38,80 @@ class bPartition Neon::int32_3d* mOrigin, NghIdx* mStencilNghIndex); + /** + * Retrieve the cardinality of the field. + */ inline NEON_CUDA_HOST_DEVICE auto cardinality() const -> int; + /** + * Gets the field metadata at a cartesian point. + */ inline NEON_CUDA_HOST_DEVICE auto operator()(const Idx& cell, int card) -> T&; + /** + * Gets the field metadata at a cartesian point. + */ inline NEON_CUDA_HOST_DEVICE auto operator()(const Idx& cell, int card) const -> const T&; + /** + * Gets the field metadata at a neighbour cartesian point. + */ NEON_CUDA_HOST_DEVICE inline auto getNghData(const Idx& cell, const NghIdx& offset, const int card) const -> NghData; + /** + * Gets the field metadata at a neighbour cartesian point. + */ NEON_CUDA_HOST_DEVICE inline auto getNghData(const Idx& eId, uint8_t nghID, int card) const -> NghData; + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + int card, + T defaultValue) + const -> NghData; + + /** + * Gets the global coordinates of the cartesian point. + */ NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& cell) const -> Neon::index_3d; - + /** + * Gets the Idx for in the block view space. + */ NEON_CUDA_HOST_DEVICE inline auto - getBlockViewGridIdx(const Idx& cell) + getBlockViewIdx(const Idx& cell) const -> BlockViewGridIdx; - protected: NEON_CUDA_HOST_DEVICE inline auto helpGetPitch(const Idx& cell, int card) @@ -91,6 +129,10 @@ class bPartition helpGetNghIdx(const Idx& idx, const NghIdx& offset) const -> Idx; + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx) + const -> Idx; int mCardinality; T* mMem; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index 6e3b728f..d8bbef08 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -50,7 +50,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: template NEON_CUDA_HOST_DEVICE inline auto bPartition:: - getBlockViewGridIdx(const Idx& gidx) + getBlockViewIdx(const Idx& gidx) const -> BlockViewGridIdx { BlockViewGridIdx res; @@ -193,6 +193,96 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: } } +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx) + const -> Idx +{ + + typename Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + xOff, + idx.mInDataBlockIdx.y + yOff, + idx.mInDataBlockIdx.z + zOff); + + /** + * 0 if no offset on the direction + * 1 positive offset + * -1 negative offset + */ + const int xFlag = [&] { + if constexpr (xOff == 0) { + return 0; + } else { + return ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + + const int yFlag = [&] { + if constexpr (yOff == 0) { + return 0; + } else { + return ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + const int zFlag = [&] { + if constexpr (zOff == 0) { + return 0; + } else { + return ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + const bool isLocal = (xFlag | yFlag | zFlag) == 0; + if (!(isLocal)) { + typename Idx::InDataBlockIdx remoteInBlockOffset; + /** + * Example + * - 8 block (1D case) + * Case 1: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 starting point + * + * - idx.inBlock = 2 + * - offset = -1 + * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3 + * Case 2: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * starting point +3 from 3 + * + * - idx.inBlock = 3 + * - offset = (+3,0) + * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2 + * + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 from 0 +3 from 3 + * + * NOTE: if in one direction the neighbour offet is zero, xFalg is 0; + * */ + + Idx remoteNghIdx; + remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX; + + int connectivityJump = idx.mDataBlockIdx * 27 + + (xFlag + 1) + + (yFlag + 1) * 3 + + (zFlag + 1) * 9; + remoteNghIdx.mDataBlockIdx = mBlockConnectivity[connectivityJump]; + + return remoteNghIdx; + } else { + Idx localNghIdx; + localNghIdx.mDataBlockIdx = idx.mDataBlockIdx; + localNghIdx.mInDataBlockIdx = ngh; + return localNghIdx; + } +} + template NEON_CUDA_HOST_DEVICE inline auto bPartition:: getNghData(const Idx& eId, @@ -223,4 +313,42 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: return result; } +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.invalidate(); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card, + T defaultValue) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.set(defaultValue, false); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h index cacac275..012a3588 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h @@ -74,8 +74,8 @@ class ePartition public: //-- [PUBLIC TYPES] ---------------------------------------------------------------------------- - using Self = ePartition; //<- this type - using Idx = eIndex; //<- index type + using Self = ePartition; //<- this type + using Idx = eIndex; //<- index type using OuterIdx = typename Idx::OuterIdx; //<- index type for the subGrid static constexpr int Cardinality = C; @@ -147,15 +147,15 @@ class ePartition operator()(Idx eId, int cardinalityIdx) -> T&; -// template -// NEON_CUDA_HOST_DEVICE inline auto -// castRead(Idx eId, int cardinalityIdx) const -// -> ComputeType; -// -// template -// NEON_CUDA_HOST_DEVICE inline auto -// castWrite(Idx eId, int cardinalityIdx, const ComputeType& value) -// -> void; + // template + // NEON_CUDA_HOST_DEVICE inline auto + // castRead(Idx eId, int cardinalityIdx) const + // -> ComputeType; + // + // template + // NEON_CUDA_HOST_DEVICE inline auto + // castWrite(Idx eId, int cardinalityIdx, const ComputeType& value) + // -> void; /** * Retrieve value of a neighbour for a field with multiple cardinalities * @tparam dataView_ta @@ -165,9 +165,9 @@ class ePartition * @return */ NEON_CUDA_HOST_DEVICE inline auto - getNghData(Idx eId, - NghIdx nghIdx, - int card) + getNghData(Idx eId, + NghIdx nghIdx, + int card) const -> NghData; NEON_CUDA_HOST_DEVICE inline auto @@ -176,7 +176,18 @@ class ePartition int card) const -> NghData; + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(Idx eId, + int card) + const -> NghData; + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(Idx eId, + int card, + T defaultValue) + const -> NghData; /** * Check is the * @tparam dataView_ta diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h index c2ff1ae0..0063ee9e 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h @@ -87,6 +87,43 @@ ePartition::getNghData(eIndex eId, return res; } +template +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(eIndex eId, + int card) + const -> NghData +{ + int tablePithc = (xOff + mStencilRadius) + + (yOff + mStencilRadius) * mStencilTableYPitch + + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(eId, nghIdx, card); + + return res; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(eIndex eId, + int card, + T defaultVal) + const -> NghData +{ + int tablePithc = (xOff + mStencilRadius) + + (yOff + mStencilRadius) * mStencilTableYPitch + + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(eId, nghIdx, card); + if (!res.isValid()) { + res.set(defaultVal, false); + } + return res; +} + template NEON_CUDA_HOST_DEVICE inline auto diff --git a/libNeonSet/include/Neon/set/Containter_imp.h b/libNeonSet/include/Neon/set/Containter_imp.h index 534d92ff..f7f421e6 100644 --- a/libNeonSet/include/Neon/set/Containter_imp.h +++ b/libNeonSet/include/Neon/set/Containter_imp.h @@ -48,6 +48,7 @@ auto Container::factory(const std::string& name, std::shared_ptr tmp(k); return {tmp}; } + NEON_THROW_UNSUPPORTED_OPERATION("Execution type not supported"); } template From 90a4ba9f628b93e86edf440b3df883cf83ee7ab2 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 19 Jun 2023 10:47:29 -0400 Subject: [PATCH 09/25] Code documentation --- .../include/Neon/domain/details/bGrid/bGrid.h | 7 +- .../Neon/domain/details/bGrid/bGrid_imp.h | 6 +- .../include/Neon/domain/tools/Partitioner1D.h | 74 +++++++++++++------ 3 files changed, 58 insertions(+), 29 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index 8ed458c8..62ae8ad6 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -71,9 +71,8 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, const Neon::int32_3d& domainSize /**< Size of the bounded Cartesian */, const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, - const int voxelSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. - * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/ - , + const int multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. + * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/, const double_3d& spacingData = double_3d(1, 1, 1) /** Physical spacing between two consecutive data points in the Cartesian domain */, const double_3d& origin = double_3d(0, 0, 0) /** Physical location in space of the origin of the Cartesian discretization */); @@ -212,7 +211,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, tool::Partitioner1D::DenseMeta denseMeta; - int voxelSpacing; + int mMultiResDiscreteIdxSpacing; // number of active voxels in each block Neon::set::DataSet mNumActiveVoxel; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index b921a3e1..85da8a62 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -20,7 +20,7 @@ bGrid::bGrid(const Neon::Backend& backend, const Neon::int32_3d& domainSize, const ActiveCellLambda activeCellLambda, const Neon::domain::Stencil& stencil, - const int voxelSpacing, + const int multiResDiscreteIdxSpacing, const double_3d& spacingData, const double_3d& origin) { @@ -29,7 +29,7 @@ bGrid::bGrid(const Neon::Backend& backend, mData = std::make_shared(); mData->init(backend); - mData->voxelSpacing = voxelSpacing; + mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing; mData->stencil = stencil; const index_3d defaultKernelBlockSize(SBlock::memBlockSizeX, SBlock::memBlockSizeY, @@ -45,7 +45,7 @@ bGrid::bGrid(const Neon::Backend& backend, stencil, nElementsPerPartition, defaultKernelBlockSize, - voxelSpacing, + multiResDiscreteIdxSpacing, origin); } diff --git a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h index e162512c..0204098c 100644 --- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h +++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h @@ -8,6 +8,34 @@ namespace Neon::domain::tool { +/** + * Abstraction for a partitioner on a 1D domain. + * + * Partitioning is executed over the cartesian index space of the domain. + * The Partitioner works at the block granularity. The block size is defined by the user. + * + * The partitioning is done in thee steps: + * a. [DOMAIN DECOMPOSITION] - Projecting of the blocks into the Z-axis and then applying a uniform partitioning schema. + * Definition of the span of each partition is the final result of this step. + * + * b. [CLASSIFIER] - For each partition, the indexes in a partition span are classified twice: + * - First, the indexes are classified according to the data view configuration. + * - INTERNAL: The span is fully contained in the partition. + * - BOUNDARY: The span is partially contained in the partition. + * - GHOST: The span is not contained in the partition. + * - Second, the indexes are classified according to the boundary conditions. This is a user driven classification + * + * c. [LAYOUT] - The final step is to layout the indexes in memory, i.e. decide for each index its position in a 1D array. + * + * The final layout of each partitioning will look like the following: + * + * -------------------------------------------------------------------- + * | Internal | Boundary | Ghost | + * | | UP | DW | UP | Dw | + * | Bulk | Bc | Bulk | Bc | Bulk | Bc | Bulk | Bc | Bulk | Bc | + * -------------------------------------------------------------------- + * + */ class Partitioner1D { public: @@ -75,59 +103,59 @@ class Partitioner1D Meta invalidMeta; }; - template Partitioner1D(const Neon::Backend& backend, - const ActiveCellLambda& activeCellLambda, + const ActiveIndexLambda& activeIndexLambda, const BcLambda& bcLambda, const Neon::index_3d& dataBlockSize, const Neon::int32_3d& domainSize, const Neon::domain::Stencil stencil, - const int& discreteVoxelSpacing = 1) + const int& multiResDiscreteIdxSpacing = 1) { mData = std::make_shared(); mData->mDataBlockSize = dataBlockSize; - mData->mDiscreteVoxelSpacing = discreteVoxelSpacing; + mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing; mData->mStencil = stencil; mData->mDomainSize = domainSize; - Neon::int32_3d block3DSpan(NEON_DIVIDE_UP(domainSize.x, dataBlockSize.x), - NEON_DIVIDE_UP(domainSize.y, dataBlockSize.y), - NEON_DIVIDE_UP(domainSize.z, dataBlockSize.z)); + // Block space interval (i.e. indexing space at the block granularity) - mData->block3DSpan = block3DSpan; + mData->block3DSpan = Neon::int32_3d(NEON_DIVIDE_UP(domainSize.x, dataBlockSize.x), + NEON_DIVIDE_UP(domainSize.y, dataBlockSize.y), + NEON_DIVIDE_UP(domainSize.z, dataBlockSize.z)); std::vector nBlockProjectedToZ(block3DSpan.z); auto block3dIdxToBlockOrigin = [&](Neon::int32_3d const& block3dIdx) { - Neon::int32_3d blockOrigin(block3dIdx.x * dataBlockSize.x * discreteVoxelSpacing, - block3dIdx.y * dataBlockSize.y * discreteVoxelSpacing, - block3dIdx.z * dataBlockSize.z * discreteVoxelSpacing); + Neon::int32_3d blockOrigin(block3dIdx.x * dataBlockSize.x * multiResDiscreteIdxSpacing, + block3dIdx.y * dataBlockSize.y * multiResDiscreteIdxSpacing, + block3dIdx.z * dataBlockSize.z * multiResDiscreteIdxSpacing); return blockOrigin; }; auto getVoxelAbsolute3DIdx = [&](Neon::int32_3d const& blockOrigin, Neon::int32_3d const& voxelRelative3DIdx) { - const Neon::int32_3d id(blockOrigin.x + voxelRelative3DIdx.x * discreteVoxelSpacing, - blockOrigin.y + voxelRelative3DIdx.y * discreteVoxelSpacing, - blockOrigin.z + voxelRelative3DIdx.z * discreteVoxelSpacing); + const Neon::int32_3d id(blockOrigin.x + voxelRelative3DIdx.x * multiResDiscreteIdxSpacing, + blockOrigin.y + voxelRelative3DIdx.y * multiResDiscreteIdxSpacing, + blockOrigin.z + voxelRelative3DIdx.z * multiResDiscreteIdxSpacing); return id; }; mData->spanDecomposition = std::make_shared( backend, - activeCellLambda, + activeIndexLambda, block3dIdxToBlockOrigin, getVoxelAbsolute3DIdx, block3DSpan, dataBlockSize, domainSize, - discreteVoxelSpacing); + multiResDiscreteIdxSpacing); mData->mSpanClassifier = std::make_shared( backend, - activeCellLambda, + activeIndexLambda, bcLambda, block3dIdxToBlockOrigin, getVoxelAbsolute3DIdx, @@ -135,7 +163,7 @@ class Partitioner1D dataBlockSize, domainSize, stencil, - discreteVoxelSpacing, + multiResDiscreteIdxSpacing, mData->spanDecomposition); mData->mSpanLayout = std::make_shared( @@ -147,10 +175,12 @@ class Partitioner1D mData->mSpanLayout->getStandardAndGhostCount().typedClone(), {251, 1, 1}); } - auto getBlockSpan() -> Neon::int32_3d + auto getBlockSpan() const + -> Neon::int32_3d { return mData->block3DSpan; } + auto getMemoryGrid() -> Neon::aGrid& { return mData->mTopologyWithGhost; @@ -207,7 +237,7 @@ class Partitioner1D aGrid::Cell idx(count); Neon::int32_3d point3d = mapperVec[j]; - point3d = point3d * mData->mDiscreteVoxelSpacing * mData->mDataBlockSize; + point3d = point3d * mData->mMultiResDiscreteIdxSpacing * mData->mDataBlockSize; partition(idx, 0) = point3d; count++; } @@ -349,7 +379,7 @@ class Partitioner1D if (findings.first) { targetNgh = findings.second; } - aGrid::Cell aIdx(static_cast (start + blockIdx)); + aGrid::Cell aIdx(static_cast(start + blockIdx)); partition(aIdx, s) = targetNgh; } } @@ -402,7 +432,7 @@ class Partitioner1D { public: Neon::index_3d mDataBlockSize = 0; - int mDiscreteVoxelSpacing = 0; + int mMultiResDiscreteIdxSpacing = 0; Neon::domain::Stencil mStencil; Neon::index_3d mDomainSize; Neon::int32_3d block3DSpan; From 019db4d6a03e771b8309d3d2291ccb151e071e98 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 19 Jun 2023 17:38:57 -0400 Subject: [PATCH 10/25] Fixing grid spacing in bGrid. --- .../Neon/domain/details/bGrid/bField.h | 10 +-------- .../Neon/domain/details/bGrid/bField_imp.h | 21 ++++++++++++++----- .../include/Neon/domain/details/bGrid/bGrid.h | 15 ++++++++++--- .../Neon/domain/details/bGrid/bGrid_imp.h | 19 +++++++++++++---- .../Neon/domain/details/bGrid/bPartition.h | 1 + .../domain/details/bGrid/bPartition_imp.h | 5 ++++- .../include/Neon/domain/tools/Partitioner1D.h | 6 +++--- 7 files changed, 52 insertions(+), 25 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h index d4d663fd..565ae518 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h @@ -84,13 +84,6 @@ class bField : public Neon::domain::interface::FieldBaseTemplate void; - // - // enum PartitionBackend - // { - // cpu = 0, - // gpu = 1, - // }; - struct Data { Data() = default; @@ -112,8 +105,7 @@ class bField : public Neon::domain::interface::FieldBaseTemplate grid; BlockViewField memoryField; - - int mCardinality; + int cardinality; // Neon::domain::tool::HaloTable1DPartitioning latticeHaloUpdateTable; Neon::domain::tool::HaloTable1DPartitioning soaHaloUpdateTable; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index 29a71248..52802f1c 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -79,11 +79,22 @@ template auto bField::getReference(const Neon::index_3d& cartesianIdx, const int& cardinality) -> T& { - auto& grid = this->getGrid(); - auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); - auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); - auto& result = partition(bIdx, cardinality); - return result; + if constexpr (SBlock::isMultiResMode) { + auto& grid = this->getGrid(); + auto uniformCartesianIdx = cartesianIdx / grid.helpGetMultiResFactor(); + auto uniformCartesianIdxTruncation = cartesianIdx % grid.helpGetMultiResFactor(); + static_assert(uniformCartesianIdxTruncation == 0); + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(uniformCartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } else { + auto& grid = this->getGrid(); + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } } template diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index 62ae8ad6..d94d1aa1 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -72,9 +72,10 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, const int multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. - * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/, - const double_3d& spacingData = double_3d(1, 1, 1) /** Physical spacing between two consecutive data points in the Cartesian domain */, - const double_3d& origin = double_3d(0, 0, 0) /** Physical location in space of the origin of the Cartesian discretization */); + * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */ + , + const double_3d& spacingData /** Physical spacing between two consecutive data points in the Cartesian domain */, + const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */); /** * Returns some properties for a given cartesian in the Cartesian domain. @@ -159,12 +160,20 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, */ auto getBlockViewGrid() const -> BlockView::Grid&; + /** * Retrieve the block vew grid internally used. * This grid can be leverage to allocate data at the block level. */ auto getActiveBitMask() const -> BlockView::Field&; + /** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ + template + auto helGetMultiResDiscreteIdxSpacing() const -> std::enable_if_t; + + /** * Help function to retrieve the block connectivity as a BlockViewGrid field */ diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index 85da8a62..bde200e3 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -58,7 +58,7 @@ bGrid::bGrid(const Neon::Backend& backend, SBlock::memBlockSize3D.template newType(), domainSize, Neon::domain::Stencil::s27_t(false), - 1); + multiResDiscreteIdxSpacing); mData->mDataBlockOriginField = mData->partitioner1D.getGlobalMapping(); mData->mStencil3dTo1dOffset = mData->partitioner1D.getStencil3dTo1dOffset(); @@ -95,7 +95,7 @@ bGrid::bGrid(const Neon::Backend& backend, .getGrid() .template newContainer( "activeBitMaskInit", - [&](Neon::set::Loader& loader) { + [&, this](Neon::set::Loader& loader) { auto bitMaskPartition = loader.load(mData->activeBitField); return [&, bitMaskPartition](const auto& bitMaskIdx) mutable { auto prtIdx = bitMaskPartition.prtID(); @@ -107,9 +107,9 @@ bGrid::bGrid(const Neon::Backend& backend, for (int k = 0; k < SBlock::memBlockSize3D.template newType().z; k++) { for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { - auto globalPosition = blockOrigin + Neon::int32_3d(i, j, k); + auto globalPosition = blockOrigin + Neon::int32_3d(i, j, k); bool const isInDomain = globalPosition < domainSize; - bool const isActive = activeCellLambda(globalPosition); + bool const isActive = activeCellLambda(globalPosition * mData->mMultiResDiscreteIdxSpacing); if (isActive && isInDomain) { countActive++; bitMask.setActive(i, j, k); @@ -319,6 +319,17 @@ auto bGrid:: return mData->activeBitField; } +/** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ +template +template +auto bGrid::helGetMultiResDiscreteIdxSpacing() const + -> std::enable_if_t +{ + return mData->mMultiResDiscreteIdxSpacing; +} + template auto bGrid:: helpGetBlockConnectivity() diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 35abdc50..73ccb914 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -141,6 +141,7 @@ class bPartition typename SBlock::BitMask const* NEON_RESTRICT mMask; Neon::int32_3d const* NEON_RESTRICT mOrigin; int mSetIdx; + int mMultiResDiscreteIdxSpacing = 1; }; } // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index d8bbef08..dc4c5880 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -45,7 +45,10 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: location.x += gidx.mInDataBlockIdx.x; location.y += gidx.mInDataBlockIdx.y; location.z += gidx.mInDataBlockIdx.z; - return location; + if constexpr (SBlock::isMultiResMode){ + return location * mMultiResDiscreteIdxSpacing; + } + return location ; } template diff --git a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h index 0204098c..ac49dc6f 100644 --- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h +++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h @@ -126,7 +126,7 @@ class Partitioner1D NEON_DIVIDE_UP(domainSize.y, dataBlockSize.y), NEON_DIVIDE_UP(domainSize.z, dataBlockSize.z)); - std::vector nBlockProjectedToZ(block3DSpan.z); + std::vector nBlockProjectedToZ(mData->block3DSpan.z); auto block3dIdxToBlockOrigin = [&](Neon::int32_3d const& block3dIdx) { Neon::int32_3d blockOrigin(block3dIdx.x * dataBlockSize.x * multiResDiscreteIdxSpacing, @@ -148,7 +148,7 @@ class Partitioner1D activeIndexLambda, block3dIdxToBlockOrigin, getVoxelAbsolute3DIdx, - block3DSpan, + mData->block3DSpan, dataBlockSize, domainSize, multiResDiscreteIdxSpacing); @@ -159,7 +159,7 @@ class Partitioner1D bcLambda, block3dIdxToBlockOrigin, getVoxelAbsolute3DIdx, - block3DSpan, + mData->block3DSpan, dataBlockSize, domainSize, stencil, From 588b74601d393955714aead173d82dc161327182 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 22 Jun 2023 18:51:30 -0400 Subject: [PATCH 11/25] WIP --- .../src/RunCavityTwoPop.cu | 4 + .../Neon/core/types/vec/vec4d_integer.tdecl.h | 1 + .../Neon/domain/details/dGrid/dPartition.h | 40 +- .../Neon/domain/details/dGridSoA/dGridSoA.h | 97 +++++ .../Neon/domain/details/dGridSoA/dIndexSoA.h | 53 +++ .../domain/details/dGridSoA/dIndexSoA_imp.h | 50 +++ .../domain/details/dGridSoA/dPartitionSoA.h | 342 ++++++++++++++++++ .../Neon/domain/details/dGridSoA/dSpanSoA.h | 52 +++ .../domain/details/dGridSoA/dSpanSoA_imp.h | 71 ++++ 9 files changed, 696 insertions(+), 14 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index c603415c..d28688d1 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -3,6 +3,7 @@ #include "Neon/domain/bGrid.h" #include "Neon/domain/dGrid.h" #include "Neon/domain/eGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "CellType.h" #include "LbmIteration.h" @@ -313,5 +314,8 @@ auto run(Config& config, if (config.gridType == "bGrid") { return details::runFilterStoreType(config, report); } + if (config.gridType == "dGridSoA") { + return details::runFilterStoreType(config, report); + } } } // namespace CavityTwoPop diff --git a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h index 788291a6..940c6d2c 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h +++ b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h @@ -58,6 +58,7 @@ template class Vec_4d { public: + using Integer = IntegerType_ta; using element_t = IntegerType_ta; using self_t = Vec_4d; diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 196f6b70..31e480aa 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -150,13 +150,13 @@ class dPartition return NghData(val, isValidNeighbour); } - template + template NEON_CUDA_HOST_DEVICE inline auto getNghData(const Idx& eId, int card, LambdaVALID funIfValid, LambdaNOTValid funIfNOTValid = nullptr) - const -> std::enable_if_t , void> + const -> std::enable_if_t, void> { Idx cellNgh; const bool isValidNeighbour = nghIdx(eId, cellNgh); @@ -419,19 +419,31 @@ class dPartition return; } + auto getDataView() + -> Neon::DataView + { + return m_dataView; + } + + auto helpGetGlobalToLocalOffets() const + -> NghIdx const* + { + return mStencil; + } + private: - Neon::DataView m_dataView; - T* m_mem; - Neon::index_3d m_dim; - int m_zHaloRadius; - int m_zBoundaryRadius; - Pitch m_pitch; - int m_prtID; - Neon::index_3d m_origin; - int m_cardinality; - Neon::index_3d m_fullGridSize; - bool mPeriodicZ; - NghIdx* mStencil; + Neon::DataView m_dataView; + T* NEON_RESTRICT m_mem; + Neon::index_3d m_dim; + int m_zHaloRadius; + int m_zBoundaryRadius; + Pitch m_pitch; + int m_prtID; + Neon::index_3d m_origin; + int m_cardinality; + Neon::index_3d m_fullGridSize; + bool mPeriodicZ; + NghIdx* NEON_RESTRICT mStencil; }; diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h new file mode 100644 index 00000000..61b182b2 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h @@ -0,0 +1,97 @@ +#pragma once +#include + +#include "Neon/core/core.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/BlockConfig.h" +#include "Neon/set/Containter.h" +#include "Neon/set/DevSet.h" +#include "Neon/set/MemoryOptions.h" + +#include "Neon/sys/memory/MemDevice.h" + +#include "Neon/domain/aGrid.h" + +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/interface/GridConcept.h" +#include "Neon/domain/interface/KernelConfig.h" +#include "Neon/domain/interface/LaunchConfig.h" +#include "Neon/domain/interface/Stencil.h" +#include "Neon/domain/interface/common.h" + +#include "Neon/domain/tools/GridTransformer.h" +#include "Neon/domain/tools/SpanTable.h" + +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/patterns/PatternScalar.h" + +#include "dPartitionSoA.h" + +namespace Neon::domain::details::dGridSoA { + +namespace details { +struct dGridSoATransformation +{ + template + using Partition = dPartitionSoA; + using Span = Neon::domain::details::eGrid::eSpan; + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + + using FoundationGrid = Neon::domain::details::eGrid::eGrid; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + using ExecutionThreadSpanIndexType = int32_t; + using Idx = FoundationGrid::Idx; + + static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& + { + return foundationGrid.getDefaultBlock(); + } + + static auto initSpan(FoundationGrid& foundationGrid, Neon::domain::tool::SpanTable& spanTable) -> void + { + spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span = foundationGrid.getSpan(execution, setIdx, dw); + }); + } + + static auto initLaunchParameters(FoundationGrid& foundationGrid, + Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& shareMem) -> Neon::set::LaunchParameters + { + return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem); + } + + static auto helpGetGridIdx(FoundationGrid&, + Neon::SetIdx const&, + FoundationGrid::Idx const& fgIdx) + -> GridTransformation::Idx + { + GridTransformation::Idx tgIdx = fgIdx; + return tgIdx; + } + + template + static auto initFieldPartition(FoundationGrid::Field& foundationField, + Neon::domain::tool::PartitionTable>& partitionTable) -> void + { + partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Partition& partition) { + auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw); + partition = Partition(foundationPartition); + }); + } +}; + +} // namespace details +using dGridSoA = Neon::domain::tool::GridTransformer::Grid; + +} // namespace Neon::domain::details::dGridSoA diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h new file mode 100644 index 00000000..2ed82d86 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h @@ -0,0 +1,53 @@ +#pragma once + +#include "Neon/core/core.h" +#include "Neon/domain/details/dGridSoA/dIndexSoA.h" + +namespace Neon::domain::details::dGridSoA { + +// Common forward declarations +class dSpanSoA; +template +class dPartitionSoA; + +struct dIndexSoA +{ + using OuterIdx = dIndexSoA; + + template + friend class dPartition; + friend dSpanSoA; + + template + friend class dField; + + // dGrid specific types + using Offset = int32_t; + using Location = index_3d; + using Count = int32_t; + + dIndexSoA() = default; + Location mLocation = 0; + Offset mOffset = 0; + + NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location const& location, + Offset const& offset); + + NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location::Integer const& x, + Location::Integer const& y, + Location::Integer const& z, + Offset const& offset); + + NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&; + + NEON_CUDA_HOST_DEVICE inline auto setOffset() -> Offset&; + + NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&; + + NEON_CUDA_HOST_DEVICE inline auto getOffset() const -> const Offset&; +}; + +} // namespace Neon::domain::details::dGridSoA + +#include "dIndexSoA_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h new file mode 100644 index 00000000..790608c7 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h @@ -0,0 +1,50 @@ +#pragma once +#include "Neon/core/core.h" + +namespace Neon::domain::details::dGridSoA { + +NEON_CUDA_HOST_DEVICE inline dIndexSoA:: + dIndexSoA(const Location& location, + Offset const& offset) +{ + mLocation = location; + mOffset = offset; +} + +NEON_CUDA_HOST_DEVICE inline dIndexSoA:: + dIndexSoA(const Location::Integer& x, + const Location::Integer& y, + const Location::Integer& z, + Offset const& offset) +{ + mLocation.x = x; + mLocation.y = y; + mLocation.z = z; + mOffset = offset; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + setLocation() -> Location& +{ + return mLocation; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + setOffset() -> Offset& +{ + return mOffset; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + getLocation() const -> const Location& +{ + return mLocation; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + getOffset() + const -> const Offset& +{ + return mOffset; +} +} // namespace Neon::domain::details::dGridSoA \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h new file mode 100644 index 00000000..fc4c3642 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -0,0 +1,342 @@ +#pragma once +#include +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" +#include "Neon/domain/details/dGrid/dGrid.h" +#include "Neon/domain/interface/NghData.h" +#include "Neon/set/DevSet.h" +#include "Neon/sys/memory/CudaIntrinsics.h" +#include "cuda_fp16.h" +#include "dIndexSoA.h" + +namespace Neon::domain::details::dGridSoA { + +template +class dPartitionSoA +{ + public: + using Idx = dIndexSoA; + using NghData = Neon::domain::NghData; + using Pitch = uint32_4d; + + dPartitionSoA() + { + } + + dPartitionSoA(Neon::domain::details::dGrid::dPartition const& dPartitionOriginal) + { + mDataView = dPartitionOriginal.getDataView(); + mMem = dPartitionOriginal.mem(); + mDim = dPartitionOriginal.dim(); + mZHaloRadius = dPartitionOriginal.halo().z; + mPitch = dPartitionOriginal.getPitchData().template newType(); + mPrtID = dPartitionOriginal.prtID(); + mOrigin = dPartitionOriginal.origin(); + mCardinality = dPartitionOriginal.cardinality(); + mFullGridSize = dPartitionOriginal.fullGridSize(); + NghIdx* mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets(); + } + + inline NEON_CUDA_HOST_DEVICE auto + prtID() + const -> int + { + return mPrtID(); + } + + inline NEON_CUDA_HOST_DEVICE auto + cardinality() + const -> int + { + return mCardinality(); + } + + inline NEON_CUDA_HOST_DEVICE auto + getPitchData() + const -> const Pitch& + { + return mPitch; + } + + inline NEON_CUDA_HOST_DEVICE auto + getPitch(const Idx& idx, + int cardinality) + -> Idx::Offset + { + return idx.getLocationOffset() + cardinality * mPitch.w; + } + + inline NEON_CUDA_HOST_DEVICE auto + dim() + const -> const Neon::index_3d + { + return mDim(); + } + + inline NEON_CUDA_HOST_DEVICE auto + halo() + const -> const Neon::index_3d + { + return mDPartition.halo(); + } + + inline NEON_CUDA_HOST_DEVICE auto + origin() + const -> const Neon::index_3d + { + return m_ormDPartition.origin(); + } + + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + NghIdx nghOffset, + int card, + const T& alternativeVal) + const -> NghData + { + Idx gidxNgh; + const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh); + T val = alternativeVal; + if (isValidNeighbour) { + val = operator()(gidxNgh, card); + } + return NghData(val, isValidNeighbour); + } + + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + NghIdx nghOffset, + int card) + const -> NghData + { + Idx gidxNgh; + const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh); + T val; + if (isValidNeighbour) { + val = operator()(gidxNgh, card); + } + return NghData(val, isValidNeighbour); + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t, void> + { + Idx gidxNgh; + const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = this->operator()(gidxNgh, card); + funIfValid(val); + } + if constexpr (!std::is_same_v) { + if (!isValidNeighbour) { + funIfNOTValid(); + } + } + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card) + const -> NghData + { + NghData res; + Idx gidxNgh; + const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = operator()(gidxNgh, card); + res.set(val, true); + } else { + res.invalidate(); + } + return res; + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + T const& defaultValue) + const -> NghData + { + NghData res(defaultValue, false); + Idx gidxNgh; + const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = operator()(gidxNgh, card); + res.set(val, true); + } + return res; + } + + NEON_CUDA_HOST_DEVICE inline auto + nghVal(const Idx& gidx, + uint8_t nghID, + int card, + const T& alternativeVal) + const -> NghData + { + NghIdx nghOffset = mStencil[nghID]; + return getNghData(gidx, nghOffset, card, alternativeVal); + } + + /** + * Get the index of the neighbor given the offset + * @tparam dataView_ta + * @param[in] gidx Index of the current element + * @param[in] nghOffset Offset of the neighbor of interest from the current element + * @param[in,out] neighbourIdx Index of the neighbor + * @return Whether the neighbour is valid + */ + NEON_CUDA_HOST_DEVICE inline auto + nghIdx(const Idx& gidx, + const NghIdx& nghOffset, + Idx& neighbourIdx) + const -> bool + { + Neon::index_3d cartesian(gidx.get().x + nghOffset.x, + gidx.get().y + nghOffset.y, + gidx.get().z + nghOffset.z); + + neighbourIdx = Idx(cartesian, + gidx.getOffset() + nghOffset.x * getPitchData().x + + nghOffset.y * getPitchData().y + + nghOffset.z * getPitchData().z); + + Idx::Location nghCartesianGlobal = getGlobalIndex(gidxNgh); + + bool isValidNeighbour = true; + + isValidNeighbour = (gidxNghGlobal.x >= 0) && + (gidxNghGlobal.y >= 0) && + (gidxNghGlobal.z >= 0); + + isValidNeighbour = (gidxNghGlobal.x < m_fullGridSize.x) && + (gidxNghGlobal.y < m_fullGridSize.y) && + (gidxNghGlobal.z < m_fullGridSize.z) && + isValidNeighbour; + + return isValidNeighbour; + } + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& gidx, + Idx& gidxNgh) + const -> bool + { + Neon::index_3d cartesian(gidx.get().x + xOff, + gidx.get().y + yOff, + gidx.get().z + zOff); + gidxNgh = Idx(cartesian, + gidx.getOffset() + xOff * getPitchData().x + + yOff * getPitchData().y + + zOff * getPitchData().z); + + Idx::Location nghCartesianGlobal(getGlobalIndex(gidxNgh)); + + bool isValidNeighbour = true; + if constexpr (xOff > 0) { + isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour; + isValidNeighbour = nghCartesianGlobal.x <= mDPartition.m_fullGridSize.x && isValidNeighbour; + } + if constexpr (xOff < 0) { + isValidNeighbour = nghCartesianGlobal.x >= 0 && isValidNeighbour; + } + if constexpr (yOff > 0) { + isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour; + isValidNeighbour = nghCartesianGlobal.y <= mDPartition.m_fullGridSize.y && isValidNeighbour; + } + if constexpr (yOff < 0) { + isValidNeighbour = nghCartesianGlobal.y >= 0 && isValidNeighbour; + } + if constexpr (zOff > 0) { + isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour; + isValidNeighbour = nghCartesianGlobal.z <= mDPartition.m_fullGridSize.z && isValidNeighbour; + } + if constexpr (zOff < 0) { + isValidNeighbour = nghCartesianGlobal.z >= mDPartition.m_zHaloRadius && isValidNeighbour; + } + return isValidNeighbour; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem() + -> T* + { + return mDPartition.m_mem; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem() const + -> const T* + { + return mDPartition.m_mem; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem(const Idx& cell, + int cardinalityIdx) + -> T* + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mDPartition.m_mem[p]; + } + + NEON_CUDA_HOST_DEVICE inline auto + operator()(const Idx& cell, + int cardinalityIdx) + -> T& + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mDPartition.m_mem[p]; + } + + NEON_CUDA_HOST_DEVICE inline auto + operator()(const Idx& cell, + int cardinalityIdx) + const -> const T& + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mDPartition.m_mem[p]; + } + + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local) + const -> Neon::index_3d + { + Neon::index_3d result = local.mLocation + m_origin; + result.z -= mDPartition.m_zHaloRadius; + return result; + } + + NEON_CUDA_HOST_DEVICE inline auto getDomainSize() + const -> Neon::index_3d + { + return mDPartition.m_fullGridSize; + } + + Neon::DataView mDataView; + T* NEON_RESTRICT mMem; + Neon::index_3d mDim; + int mZHaloRadius; + int mZBoundaryRadius; + Pitch mPitch; + int mPrtID; + Neon::index_3d mOrigin; + int mCardinality; + Neon::index_3d mFullGridSize; + bool mPeriodicZ; + NghIdx* NEON_RESTRICT mStencil; +}; + +} // namespace Neon::domain::details::dGridSoA diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h new file mode 100644 index 00000000..83d5a2dc --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h @@ -0,0 +1,52 @@ +#pragma once +#include "Neon/set/DevSet.h" +#include "dIndexSoA.h" +namespace Neon::domain::details::dGridSoA { + +/** + * Abstraction that represents the Cell space of a partition + * This abstraction is used by the neon lambda executor to + * run a containers on aGrid + */ +class dSpanSoA +{ + public: + using Idx = dIndexSoA; + + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d3; + using ExecutionThreadSpanIndexType = int32_t; + + + NEON_CUDA_HOST_DEVICE inline auto + setAndValidate(Idx& idx, + const uint32_t& x, + const uint32_t& y, + const uint32_t& z) const + -> bool; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetDataView() + const -> Neon::DataView const&; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetZHaloRadius() + const -> int const&; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetZBoundaryRadius() + const -> int const&; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetDim() + const -> Neon::index_3d const&; + + private: + Neon::DataView mDataView; + int mZHaloRadius; + int mZBoundaryRadius; + Neon::index_3d mDim /** Dimension of the span, its values depends on the mDataView*/; +}; + +} // namespace Neon::domain::details::dGrid + +#include "dSpanSoA_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h new file mode 100644 index 00000000..a3dff4cf --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h @@ -0,0 +1,71 @@ +#pragma once + +namespace Neon::domain::details::dGridSoA { + +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::setAndValidate(Idx& idx, + const uint32_t& x, + const uint32_t& y, + const uint32_t& z) + const -> bool +{ + bool res = false; + idx.setLocation().x = int(x); + idx.setLocation().y = int(y); + idx.setLocation().z = int(z); + + if (idx.get() < mDim) { + res = true; + } + + switch (mDataView) { + case Neon::DataView::STANDARD: { + idx.setLocation().z += mZHaloRadius; + idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + return res; + } + case Neon::DataView::INTERNAL: { + idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; + idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + return res; + } + case Neon::DataView::BOUNDARY: { + + idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius + ? 0 + : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); + idx.setLocation().z += mZHaloRadius; + idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + return res; + } + default: { + } + } + return false; +} + +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDataView() + const -> Neon::DataView const& +{ + return mDataView; +} + +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZHaloRadius() + const -> int const& +{ + return mZHaloRadius; +} + +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZBoundaryRadius() + const -> int const& +{ + return mZBoundaryRadius; +} + +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDim() + const -> Neon::index_3d const& +{ + return mDim; +} + +} // namespace Neon::domain::details::dGrid \ No newline at end of file From 9a87088f549eee95154e7ab5e11a555a2db203b7 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 22 Jun 2023 09:34:01 -0400 Subject: [PATCH 12/25] Fixing report filename for benchmarks scripts --- .../lbm-lid-driven-cavity-flow.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index 5aebe104..90a55ad2 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -4,7 +4,7 @@ GRID_LIST = "dGrid bGrid eGrid".split() STORAGE_FP_LIST = "double float".split() COMPUTE_FP_LIST = "double float".split() -OCC_LIST = "nOCC".split() +OCC_LIST = "nOCC sOCC".split() WARM_UP_ITER = 10 MAX_ITER = 100 REPETITIONS = 5 @@ -48,17 +48,18 @@ def countAll(): SAMPLES = countAll() counter = 0 command = './lbm-lid-driven-cavity-flow' +# command = 'echo' with open(command + '.log', 'w') as fp: for DEVICE_TYPE in DEVICE_TYPE_LIST: DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] if DEVICE_TYPE == 'gpu': for DEVICE in DEVICE_ID_LIST[1:]: DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) - for OCC in OCC_LIST: - for DOMAIN_SIZE in DOMAIN_SIZE_LIST: - for STORAGE_FP in STORAGE_FP_LIST: - for COMPUTE_FP in COMPUTE_FP_LIST: - for DEVICE_SET in DEVICE_SET_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: for GRID in GRID_LIST: if STORAGE_FP == 'double' and COMPUTE_FP == 'float': continue @@ -73,9 +74,12 @@ def countAll(): parameters.append('--max-iter ' + str(MAX_ITER)) parameters.append( '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + - DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' + - STORAGE_FP + '_' + COMPUTE_FP + '_' + - DEVICE_SET.replace(' ', '_') + '_' + OCC) + DEVICE_TYPE + '_' + + DEVICE_SET.replace(' ', '_') + '-' + + GRID + '_' + + DOMAIN_SIZE + '-' + + STORAGE_FP + '-' + COMPUTE_FP + '-' + + OCC) parameters.append('--computeFP ' + COMPUTE_FP) parameters.append('--storageFP ' + STORAGE_FP) parameters.append('--benchmark') @@ -91,6 +95,7 @@ def countAll(): fp.write(' '.join(commandList)) fp.write("\n-------------------------------------------\n") fp.flush() + print(' '.join(commandList)) subprocess.run(commandList, text=True, stdout=fp) counter += 1 From 1168cc2105986b9f07537f3dc379d5135cbefa47 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 23 Jun 2023 11:58:57 -0400 Subject: [PATCH 13/25] Adding halo option. --- .../lbm-lid-driven-cavity-flow.py | 80 ++++++++++--------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index 90a55ad2..795cb046 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -5,6 +5,7 @@ STORAGE_FP_LIST = "double float".split() COMPUTE_FP_LIST = "double float".split() OCC_LIST = "nOCC sOCC".split() +HU_LIST = "huGrid huLattice".split() WARM_UP_ITER = 10 MAX_ITER = 100 REPETITIONS = 5 @@ -38,10 +39,11 @@ def countAll(): for COMPUTE_FP in COMPUTE_FP_LIST: for DEVICE_SET in DEVICE_SET_LIST: for GRID in GRID_LIST: - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue + for HU in HU_LIST: + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue - counter += 1 + counter += 1 return counter @@ -61,42 +63,44 @@ def countAll(): for STORAGE_FP in STORAGE_FP_LIST: for COMPUTE_FP in COMPUTE_FP_LIST: for GRID in GRID_LIST: - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue + for HU in HU_LIST: - parameters = [] - parameters.append('--deviceType ' + DEVICE_TYPE) - parameters.append('--deviceIds ' + DEVICE_SET) - parameters.append('--grid ' + GRID) - parameters.append('--domain-size ' + DOMAIN_SIZE) - parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) - parameters.append('--repetitions ' + str(REPETITIONS)) - parameters.append('--max-iter ' + str(MAX_ITER)) - parameters.append( - '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + - DEVICE_TYPE + '_' + - DEVICE_SET.replace(' ', '_') + '-' + - GRID + '_' + - DOMAIN_SIZE + '-' + - STORAGE_FP + '-' + COMPUTE_FP + '-' + - OCC) - parameters.append('--computeFP ' + COMPUTE_FP) - parameters.append('--storageFP ' + STORAGE_FP) - parameters.append('--benchmark') - parameters.append('--' + OCC) + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append( + '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + + DEVICE_TYPE + '_' + + DEVICE_SET.replace(' ', '_') + '-' + + GRID + '_' + + DOMAIN_SIZE + '-' + + STORAGE_FP + '-' + COMPUTE_FP + '-' + + OCC) + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--benchmark') + parameters.append('--' + OCC) - commandList = [] - commandList.append(command) - for el in parameters: - for s in el.split(): - commandList.append(s) + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) - fp.write("\n-------------------------------------------\n") - fp.write(' '.join(commandList)) - fp.write("\n-------------------------------------------\n") - fp.flush() - print(' '.join(commandList)) - subprocess.run(commandList, text=True, stdout=fp) + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + print(' '.join(commandList)) + subprocess.run(commandList, text=True, stdout=fp) - counter += 1 - printProgressBar(counter * 100.0 / SAMPLES, 'Progress') + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') From 0bdce94ec294e0a6e142b704625882939906894e Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 23 Jun 2023 13:00:21 -0400 Subject: [PATCH 14/25] Adding halo option. --- .../lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index 795cb046..677aefba 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -67,7 +67,7 @@ def countAll(): if STORAGE_FP == 'double' and COMPUTE_FP == 'float': continue - + parameters = [] parameters.append('--deviceType ' + DEVICE_TYPE) parameters.append('--deviceIds ' + DEVICE_SET) @@ -88,6 +88,7 @@ def countAll(): parameters.append('--storageFP ' + STORAGE_FP) parameters.append('--benchmark') parameters.append('--' + OCC) + parameters.append('--' + HU) commandList = [] commandList.append(command) From 3dc808eaff2f0eb39423c76176223224087784e1 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 23 Jun 2023 18:52:44 -0400 Subject: [PATCH 15/25] WIP --- .../src/RunCavityTwoPop.cu | 2 +- .../Neon/core/types/vec/vec3d_integer.tdecl.h | 6 +- libNeonDomain/include/Neon/domain/Grids.h | 1 + libNeonDomain/include/Neon/domain/dGridSoA.h | 7 + .../Neon/domain/details/dGrid/dIndex.h | 4 +- .../Neon/domain/details/dGrid/dIndex_imp.h | 4 +- .../Neon/domain/details/dGrid/dPartition.h | 268 +++++++++--------- .../Neon/domain/details/dGrid/dSpan_imp.h | 16 +- .../Neon/domain/details/dGridSoA/dGridSoA.h | 29 +- .../domain/details/dGridSoA/dPartitionSoA.h | 140 +++++---- .../Neon/domain/details/dGridSoA/dSpanSoA.h | 5 + .../domain/details/dGridSoA/dSpanSoA_imp.h | 17 +- .../Neon/domain/details/eGrid/ePartition.h | 2 +- .../Neon/domain/tools/GridTransformer.h | 7 +- .../Neon/domain/tools/gridTransformer/tGrid.h | 11 +- .../domain/tools/gridTransformer/tGrid_ti.h | 28 ++ .../tests/domain-globalIdx/src/globalIdx.cu | 22 +- .../tests/domain-globalIdx/src/globalIdx.h | 5 +- .../tests/domain-globalIdx/src/gtests.cpp | 15 +- libNeonDomain/tests/domain-map/src/gtests.cpp | 9 + libNeonDomain/tests/domain-map/src/map.cu | 2 + libNeonDomain/tests/domain-map/src/map.h | 3 + 22 files changed, 365 insertions(+), 238 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/dGridSoA.h diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index d28688d1..29c7573d 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -315,7 +315,7 @@ auto run(Config& config, return details::runFilterStoreType(config, report); } if (config.gridType == "dGridSoA") { - return details::runFilterStoreType(config, report); + return details::runFilterStoreType(config, report); } } } // namespace CavityTwoPop diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h index acdae410..ae475c6e 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h +++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h @@ -56,6 +56,10 @@ class Vec_3d num_axis = 3 }; + static constexpr int directionX = axis_e::x_axis; + static constexpr int directionY = axis_e::y_axis; + static constexpr int directionZ = axis_e::z_axis; + union { Integer v[axis_e::num_axis]{0, 0, 0}; @@ -120,7 +124,7 @@ class Vec_3d NEON_CUDA_HOST_DEVICE inline void constexpr set(Integer p[self_t::num_axis]); - NEON_CUDA_HOST_DEVICE inline void constexpr set(const self_t& other); + NEON_CUDA_HOST_DEVICE inline void constexpr set(const self_t& other); NEON_CUDA_HOST_DEVICE inline void constexpr set(const Integer& xyz); diff --git a/libNeonDomain/include/Neon/domain/Grids.h b/libNeonDomain/include/Neon/domain/Grids.h index aad0cda5..7c899b98 100644 --- a/libNeonDomain/include/Neon/domain/Grids.h +++ b/libNeonDomain/include/Neon/domain/Grids.h @@ -3,3 +3,4 @@ #include "Neon/domain/aGrid.h" #include "Neon/domain/eGrid.h" #include "Neon/domain/bGrid.h" +#include "Neon/domain/dGridSoA.h" diff --git a/libNeonDomain/include/Neon/domain/dGridSoA.h b/libNeonDomain/include/Neon/domain/dGridSoA.h new file mode 100644 index 00000000..bdd63f25 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/dGridSoA.h @@ -0,0 +1,7 @@ +#pragma once +#include "Neon/domain/details/dGridSoA/dGridSoA.h" + + +namespace Neon { +using dGridSoA = Neon::domain::details::dGridSoA::dGridSoA; +} \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h index 3291e622..a2c57cdb 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h @@ -37,9 +37,9 @@ struct dIndex NEON_CUDA_HOST_DEVICE inline explicit dIndex(const Location& location); - NEON_CUDA_HOST_DEVICE inline auto set() -> Location&; + NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&; - NEON_CUDA_HOST_DEVICE inline auto get() const -> const Location&; + NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&; }; } // namespace Neon::domain::details::dGrid diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h index 4389fb3f..6426e43a 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h @@ -16,11 +16,11 @@ NEON_CUDA_HOST_DEVICE inline dIndex::dIndex(const Location::Integer &x, mLocation.z = z; } -NEON_CUDA_HOST_DEVICE inline auto dIndex::set() -> Location& +NEON_CUDA_HOST_DEVICE inline auto dIndex::setLocation() -> Location& { return mLocation; } -NEON_CUDA_HOST_DEVICE inline auto dIndex::get() const -> const Location& +NEON_CUDA_HOST_DEVICE inline auto dIndex::getLocation() const -> const Location& { return mLocation; } diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 31e480aa..86faf619 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -44,16 +44,16 @@ class dPartition int cardinality, Neon::index_3d fullGridSize, NghIdx* stencil = nullptr) - : m_dataView(dataView), - m_mem(mem), - m_dim(dim), - m_zHaloRadius(zHaloRadius), - m_zBoundaryRadius(zBoundaryRadius), - m_pitch(pitch), - m_prtID(prtID), - m_origin(origin), - m_cardinality(cardinality), - m_fullGridSize(fullGridSize), + : mDataView(dataView), + mMem(mem), + mDim(dim), + mZHaloRadius(zHaloRadius), + mZBoundaryRadius(zBoundaryRadius), + mPitch(pitch), + mPrtID(prtID), + mOrigin(origin), + mCardinality(cardinality), + mFullGridSize(fullGridSize), mPeriodicZ(false), mStencil(stencil) { @@ -70,21 +70,21 @@ class dPartition prtID() const -> int { - return m_prtID; + return mPrtID; } inline NEON_CUDA_HOST_DEVICE auto cardinality() const -> int { - return m_cardinality; + return mCardinality; } inline NEON_CUDA_HOST_DEVICE auto getPitchData() const -> const Pitch& { - return m_pitch; + return mPitch; } inline NEON_CUDA_HOST_DEVICE auto @@ -92,76 +92,76 @@ class dPartition int cardinalityIdx = 0) const -> int64_t { - return idx.get().x * int64_t(m_pitch.x) + - idx.get().y * int64_t(m_pitch.y) + - idx.get().z * int64_t(m_pitch.z) + - cardinalityIdx * int64_t(m_pitch.w); + return idx.getLocation().x * int64_t(mPitch.x) + + idx.getLocation().y * int64_t(mPitch.y) + + idx.getLocation().z * int64_t(mPitch.z) + + cardinalityIdx * int64_t(mPitch.w); } inline NEON_CUDA_HOST_DEVICE auto dim() const -> const Neon::index_3d { - return m_dim; + return mDim; } inline NEON_CUDA_HOST_DEVICE auto halo() const -> const Neon::index_3d { - return Neon::index_3d(0, 0, m_zHaloRadius); + return Neon::index_3d(0, 0, mZHaloRadius); } inline NEON_CUDA_HOST_DEVICE auto origin() const -> const Neon::index_3d { - return m_origin; + return mOrigin; } NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, NghIdx nghOffset, int card, const T& alternativeVal) const -> NghData { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val = alternativeVal; if (isValidNeighbour) { - val = operator()(cellNgh, card); + val = operator()(gidxNgh, card); } return NghData(val, isValidNeighbour); } NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, NghIdx nghOffset, int card) const -> NghData { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val; if (isValidNeighbour) { - val = operator()(cellNgh, card); + val = operator()(gidxNgh, card); } return NghData(val, isValidNeighbour); } template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card, LambdaVALID funIfValid, LambdaNOTValid funIfNOTValid = nullptr) const -> std::enable_if_t, void> { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { - T val = this->operator()(cellNgh, card); + T val = this->operator()(gidxNgh, card); funIfValid(val); } if constexpr (!std::is_same_v) { @@ -171,131 +171,130 @@ class dPartition } } - template + template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card) const -> NghData { - NghData res; - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); + T val; if (isValidNeighbour) { - T val = operator()(cellNgh, card); - res.set(val, true); - } else { - res.invalidate(); + val = operator()(gidxNgh, card); } - return res; + return NghData(val, isValidNeighbour); } template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card, T const& defaultValue) const -> NghData { NghData res(defaultValue, false); - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { - T val = operator()(cellNgh, card); + T val = operator()(gidxNgh, card); res.set(val, true); } return res; } NEON_CUDA_HOST_DEVICE inline auto - nghVal(const Idx& eId, + nghVal(const Idx& gidx, uint8_t nghID, int card, const T& alternativeVal) const -> NghData { NghIdx nghOffset = mStencil[nghID]; - return getNghData(eId, nghOffset, card, alternativeVal); + return getNghData(gidx, nghOffset, card, alternativeVal); } /** * Get the index of the neighbor given the offset * @tparam dataView_ta - * @param[in] eId Index of the current element + * @param[in] gidx Index of the current element * @param[in] nghOffset Offset of the neighbor of interest from the current element * @param[in,out] neighbourIdx Index of the neighbor * @return Whether the neighbour is valid */ NEON_CUDA_HOST_DEVICE inline auto - nghIdx(const Idx& eId, - const NghIdx& nghOffset, - Idx& neighbourIdx) + helpGetNghIdx(const Idx& gidx, + const NghIdx& nghOffset, + Idx& neighbourIdx) const -> bool { - Idx cellNgh(eId.get().x + nghOffset.x, - eId.get().y + nghOffset.y, - eId.get().z + nghOffset.z); + Idx gidxNgh(gidx.getLocation().x + nghOffset.x, + gidx.getLocation().y + nghOffset.y, + gidx.getLocation().z + nghOffset.z); - const auto cellNghGlobal = getGlobalIndex(cellNgh); + const auto gidxNghGlobal = getGlobalIndex(gidxNgh); bool isValidNeighbour = true; - if (mPeriodicZ) { - printf("Error, periodic not implemented yet"); - assert(false); - } - - isValidNeighbour = (cellNghGlobal.x >= 0) && - (cellNghGlobal.y >= 0) && - (cellNghGlobal.z >= 0); - - // isValidNeighbour = (cellNgh.get().x < m_dim.x) && - // (cellNgh.get().y < m_dim.y) && - // (cellNgh.get().z < m_dim.z + 2 * m_zHaloRadius) && isValidNeighbour; + isValidNeighbour = (gidxNghGlobal.x >= 0) && + (gidxNghGlobal.y >= 0) && + (gidxNghGlobal.z >= 0); - isValidNeighbour = (cellNghGlobal.x < m_fullGridSize.x) && - (cellNghGlobal.y < m_fullGridSize.y) && - (cellNghGlobal.z < m_fullGridSize.z) && + isValidNeighbour = (gidxNghGlobal.x < mFullGridSize.x) && + (gidxNghGlobal.y < mFullGridSize.y) && + (gidxNghGlobal.z < mFullGridSize.z) && isValidNeighbour; if (isValidNeighbour) { - neighbourIdx = cellNgh; + neighbourIdx = gidxNgh; } return isValidNeighbour; } template NEON_CUDA_HOST_DEVICE inline auto - nghIdx(const Idx& eId, - Idx& cellNgh) + helpGetNghIdx(const Idx& gidx, + Idx& gidxNgh) const -> bool { - cellNgh = Idx(eId.get().x + xOff, - eId.get().y + yOff, - eId.get().z + zOff); - Idx cellNgh_global(cellNgh.get() + m_origin); - // const bool isValidNeighbour = (cellNgh_global >= 0 && cellNgh < (m_dim + m_halo) && cellNgh_global < m_fullGridSize); - bool isValidNeighbour = true; - if constexpr (xOff > 0) { - isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().x <= m_fullGridSize.x && isValidNeighbour; - } - if constexpr (xOff < 0) { - isValidNeighbour = cellNgh_global.get().x >= 0 && isValidNeighbour; - } - if constexpr (yOff > 0) { - isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().y <= m_fullGridSize.y && isValidNeighbour; - } - if constexpr (yOff < 0) { - isValidNeighbour = cellNgh_global.get().y >= 0 && isValidNeighbour; - } - if constexpr (zOff > 0) { - isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().z <= m_fullGridSize.z && isValidNeighbour; - } - if constexpr (zOff < 0) { - isValidNeighbour = cellNgh_global.get().z >= m_zHaloRadius && isValidNeighbour; - } - return isValidNeighbour; + return helpGetNghIdx(gidx, NghIdx{xOff, yOff, zOff}, gidxNgh); + // gidxNgh = Idx(gidx.getLocation().x + xOff, + // gidx.getLocation().y + yOff, + // gidx.getLocation().z + zOff); + // + // bool isValidNeighbour = true; + // if constexpr (xOff > 0) { + // int constexpr direction = Neon::index_3d::directionX; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + // } + // if constexpr (xOff < 0) { + // int constexpr direction = Neon::index_3d::directionX; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + // } + // if constexpr (yOff > 0) { + // int constexpr direction = Neon::index_3d::directionY; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + // } + // if constexpr (yOff < 0) { + // int constexpr direction = Neon::index_3d::directionY; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + // } + // if constexpr (zOff > 0) { + // int constexpr direction = Neon::index_3d::directionZ; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + // } + // if constexpr (zOff < 0) { + // int constexpr direction = Neon::index_3d::directionZ; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + // } + // return isValidNeighbour; } @@ -303,7 +302,7 @@ class dPartition mem() -> T* { - return m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto @@ -311,7 +310,7 @@ class dPartition const -> const T* { - return m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto @@ -319,7 +318,7 @@ class dPartition int cardinalityIdx) -> T* { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -327,7 +326,7 @@ class dPartition int cardinalityIdx) -> T& { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -335,7 +334,7 @@ class dPartition int cardinalityIdx) const -> const T& { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } template @@ -386,22 +385,35 @@ class dPartition // local.mLocation.y < m_dim.y && // local.mLocation.z < m_dim.z + m_zHaloRadius); - Neon::index_3d result = local.mLocation + m_origin; - result.z -= m_zHaloRadius; + Neon::index_3d result = local.mLocation; + result.z = result.z + mOrigin.z - mZHaloRadius; return result; } + template + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local) + const -> int + { + if constexpr (Neon::index_3d::directionZ != direction) { + return local.mLocation.v[direction]; + } else { + return local.mLocation.v[Neon::index_3d::directionZ] + + mOrigin.v[Neon::index_3d::directionZ] - + mZHaloRadius; + } + } + NEON_CUDA_HOST_DEVICE inline auto getDomainSize() const -> Neon::index_3d { - return m_fullGridSize; + return mFullGridSize; } auto ioToVti(std::string const& fname, std::string const& fieldName) { - auto fnameCommplete = fname + "_" + std::to_string(m_prtID); - auto haloOrigin = Vec_3d(m_origin.x, m_origin.y, m_origin.z - m_zHaloRadius); - auto haloDim = m_dim + Neon::index_3d(0, 0, 2 * m_zHaloRadius) + 1; + auto fnameCommplete = fname + "_" + std::to_string(mPrtID); + auto haloOrigin = Vec_3d(mOrigin.x, mOrigin.y, mOrigin.z - mZHaloRadius); + auto haloDim = mDim + Neon::index_3d(0, 0, 2 * mZHaloRadius) + 1; IoToVTK io(fnameCommplete, haloDim, @@ -413,35 +425,35 @@ class dPartition io.addField([&](const Neon::index_3d& idx, int i) { return operator()(dIndex(idx), i); }, - m_cardinality, "Partition", ioToVTKns::VtiDataType_e::voxel); + mCardinality, "Partition", ioToVTKns::VtiDataType_e::voxel); io.flushAndClear(); return; } auto getDataView() - -> Neon::DataView + const -> Neon::DataView { - return m_dataView; + return mDataView; } - auto helpGetGlobalToLocalOffets() const - -> NghIdx const* + auto helpGetGlobalToLocalOffets() + const -> NghIdx* { return mStencil; } private: - Neon::DataView m_dataView; - T* NEON_RESTRICT m_mem; - Neon::index_3d m_dim; - int m_zHaloRadius; - int m_zBoundaryRadius; - Pitch m_pitch; - int m_prtID; - Neon::index_3d m_origin; - int m_cardinality; - Neon::index_3d m_fullGridSize; + Neon::DataView mDataView; + T* NEON_RESTRICT mMem; + Neon::index_3d mDim; + int mZHaloRadius; + int mZBoundaryRadius; + Pitch mPitch; + int mPrtID; + Neon::index_3d mOrigin; + int mCardinality; + Neon::index_3d mFullGridSize; bool mPeriodicZ; NghIdx* NEON_RESTRICT mStencil; }; diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h index 8f6f9fea..9fb56572 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h @@ -10,29 +10,29 @@ dSpan::setAndValidate(Idx& idx, const -> bool { bool res = false; - idx.set().x = int(x); - idx.set().y = int(y); - idx.set().z = int(z); + idx.setLocation().x = int(x); + idx.setLocation().y = int(y); + idx.setLocation().z = int(z); - if (idx.get() < mDim) { + if (idx.getLocation() < mDim) { res = true; } switch (mDataView) { case Neon::DataView::STANDARD: { - idx.set().z += mZHaloRadius; + idx.setLocation().z += mZHaloRadius; return res; } case Neon::DataView::INTERNAL: { - idx.set().z += mZHaloRadius + mZBoundaryRadius; + idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; return res; } case Neon::DataView::BOUNDARY: { - idx.set().z += idx.get().z < mZBoundaryRadius + idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius ? 0 : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); - idx.set().z += mZHaloRadius; + idx.setLocation().z += mZHaloRadius; return res; } diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h index 61b182b2..7ce3e582 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h @@ -28,21 +28,22 @@ #include "Neon/domain/patterns/PatternScalar.h" #include "dPartitionSoA.h" +#include "dSpanSoA.h" namespace Neon::domain::details::dGridSoA { namespace details { struct dGridSoATransformation { + using FoundationGrid = Neon::domain::details::dGrid::dGrid; + using Idx = dIndexSoA; + using Span = dSpanSoA; template using Partition = dPartitionSoA; - using Span = Neon::domain::details::eGrid::eSpan; - static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; - using FoundationGrid = Neon::domain::details::eGrid::eGrid; - static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; using ExecutionThreadSpanIndexType = int32_t; - using Idx = FoundationGrid::Idx; static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& { @@ -55,7 +56,7 @@ struct dGridSoATransformation Neon::SetIdx setIdx, Neon::DataView dw, Span& span) { - span = foundationGrid.getSpan(execution, setIdx, dw); + span.helpInit(foundationGrid.getSpan(execution, setIdx, dw)); }); } @@ -67,14 +68,14 @@ struct dGridSoATransformation return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem); } - static auto helpGetGridIdx(FoundationGrid&, - Neon::SetIdx const&, - FoundationGrid::Idx const& fgIdx) - -> GridTransformation::Idx - { - GridTransformation::Idx tgIdx = fgIdx; - return tgIdx; - } + // static auto helpGetGridIdx(FoundationGrid&, + // Neon::SetIdx const&, + // FoundationGrid::Idx const& fgIdx) + // -> dGridSoATransformation::Idx + // { + // dGridSoATransformation::Idx tgIdx = fgIdx; + // return tgIdx; + // } template static auto initFieldPartition(FoundationGrid::Field& foundationField, diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h index fc4c3642..1cdd75db 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -19,12 +19,13 @@ class dPartitionSoA using Idx = dIndexSoA; using NghData = Neon::domain::NghData; using Pitch = uint32_4d; + using NghIdx = int8_3d; dPartitionSoA() { } - dPartitionSoA(Neon::domain::details::dGrid::dPartition const& dPartitionOriginal) + dPartitionSoA(Neon::domain::details::dGrid::dPartition& dPartitionOriginal) { mDataView = dPartitionOriginal.getDataView(); mMem = dPartitionOriginal.mem(); @@ -34,22 +35,22 @@ class dPartitionSoA mPrtID = dPartitionOriginal.prtID(); mOrigin = dPartitionOriginal.origin(); mCardinality = dPartitionOriginal.cardinality(); - mFullGridSize = dPartitionOriginal.fullGridSize(); - NghIdx* mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets(); + mFullGridSize = dPartitionOriginal.getDomainSize(); + mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets(); } inline NEON_CUDA_HOST_DEVICE auto prtID() const -> int { - return mPrtID(); + return mPrtID; } inline NEON_CUDA_HOST_DEVICE auto cardinality() const -> int { - return mCardinality(); + return mCardinality; } inline NEON_CUDA_HOST_DEVICE auto @@ -62,30 +63,30 @@ class dPartitionSoA inline NEON_CUDA_HOST_DEVICE auto getPitch(const Idx& idx, int cardinality) - -> Idx::Offset + const -> Idx::Offset { - return idx.getLocationOffset() + cardinality * mPitch.w; + return idx.getOffset() + cardinality * mPitch.w; } inline NEON_CUDA_HOST_DEVICE auto dim() const -> const Neon::index_3d { - return mDim(); + return mDim; } inline NEON_CUDA_HOST_DEVICE auto halo() const -> const Neon::index_3d { - return mDPartition.halo(); + return Neon::index_3d(0, 0, mZHaloRadius); } inline NEON_CUDA_HOST_DEVICE auto origin() const -> const Neon::index_3d { - return m_ormDPartition.origin(); + return mOrigin; } NEON_CUDA_HOST_DEVICE inline auto @@ -96,7 +97,7 @@ class dPartitionSoA const -> NghData { Idx gidxNgh; - const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh); + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val = alternativeVal; if (isValidNeighbour) { val = operator()(gidxNgh, card); @@ -111,7 +112,7 @@ class dPartitionSoA const -> NghData { Idx gidxNgh; - const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh); + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val; if (isValidNeighbour) { val = operator()(gidxNgh, card); @@ -132,7 +133,7 @@ class dPartitionSoA const -> std::enable_if_t, void> { Idx gidxNgh; - const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { T val = this->operator()(gidxNgh, card); funIfValid(val); @@ -152,7 +153,7 @@ class dPartitionSoA { NghData res; Idx gidxNgh; - const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { T val = operator()(gidxNgh, card); res.set(val, true); @@ -171,7 +172,7 @@ class dPartitionSoA { NghData res(defaultValue, false); Idx gidxNgh; - const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { T val = operator()(gidxNgh, card); res.set(val, true); @@ -199,31 +200,31 @@ class dPartitionSoA * @return Whether the neighbour is valid */ NEON_CUDA_HOST_DEVICE inline auto - nghIdx(const Idx& gidx, - const NghIdx& nghOffset, - Idx& neighbourIdx) + helpGetNghIdx(const Idx& gidx, + const NghIdx& nghOffset, + Idx& neighbourIdx) const -> bool { - Neon::index_3d cartesian(gidx.get().x + nghOffset.x, - gidx.get().y + nghOffset.y, - gidx.get().z + nghOffset.z); + Neon::index_3d cartesian(gidx.getLocation().x + nghOffset.x, + gidx.getLocation().y + nghOffset.y, + gidx.getLocation().z + nghOffset.z); - neighbourIdx = Idx(cartesian, - gidx.getOffset() + nghOffset.x * getPitchData().x + - nghOffset.y * getPitchData().y + - nghOffset.z * getPitchData().z); + neighbourIdx = Idx(cartesian, gidx.getOffset() + + nghOffset.x * getPitchData().x + + nghOffset.y * getPitchData().y + + nghOffset.z * getPitchData().z); - Idx::Location nghCartesianGlobal = getGlobalIndex(gidxNgh); + Neon::index_3d const nghCartesianIdx = getGlobalIndex(neighbourIdx); bool isValidNeighbour = true; - isValidNeighbour = (gidxNghGlobal.x >= 0) && - (gidxNghGlobal.y >= 0) && - (gidxNghGlobal.z >= 0); + isValidNeighbour = (nghCartesianIdx.x >= 0) && + (nghCartesianIdx.y >= 0) && + (nghCartesianIdx.z >= 0); - isValidNeighbour = (gidxNghGlobal.x < m_fullGridSize.x) && - (gidxNghGlobal.y < m_fullGridSize.y) && - (gidxNghGlobal.z < m_fullGridSize.z) && + isValidNeighbour = (nghCartesianIdx.x < mFullGridSize.x) && + (nghCartesianIdx.y < mFullGridSize.y) && + (nghCartesianIdx.z < mFullGridSize.z) && isValidNeighbour; return isValidNeighbour; @@ -235,37 +236,46 @@ class dPartitionSoA Idx& gidxNgh) const -> bool { - Neon::index_3d cartesian(gidx.get().x + xOff, - gidx.get().y + yOff, - gidx.get().z + zOff); - gidxNgh = Idx(cartesian, - gidx.getOffset() + xOff * getPitchData().x + - yOff * getPitchData().y + - zOff * getPitchData().z); - - Idx::Location nghCartesianGlobal(getGlobalIndex(gidxNgh)); + { + Neon::index_3d cartesian(gidx.getLocation().x + xOff, + gidx.getLocation().y + yOff, + gidx.getLocation().z + zOff); + gidxNgh = Idx(cartesian, gidx.getOffset() + + xOff * getPitchData().x + + yOff * getPitchData().y + + zOff * getPitchData().z); + } bool isValidNeighbour = true; if constexpr (xOff > 0) { - isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour; - isValidNeighbour = nghCartesianGlobal.x <= mDPartition.m_fullGridSize.x && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (xOff < 0) { - isValidNeighbour = nghCartesianGlobal.x >= 0 && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } if constexpr (yOff > 0) { - isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour; - isValidNeighbour = nghCartesianGlobal.y <= mDPartition.m_fullGridSize.y && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (yOff < 0) { - isValidNeighbour = nghCartesianGlobal.y >= 0 && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } if constexpr (zOff > 0) { - isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour; - isValidNeighbour = nghCartesianGlobal.z <= mDPartition.m_fullGridSize.z && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (zOff < 0) { - isValidNeighbour = nghCartesianGlobal.z >= mDPartition.m_zHaloRadius && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } return isValidNeighbour; } @@ -274,14 +284,14 @@ class dPartitionSoA mem() -> T* { - return mDPartition.m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto mem() const -> const T* { - return mDPartition.m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto @@ -290,7 +300,7 @@ class dPartitionSoA -> T* { Idx::Offset p = getPitch(cell, cardinalityIdx); - return mDPartition.m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -299,7 +309,7 @@ class dPartitionSoA -> T& { Idx::Offset p = getPitch(cell, cardinalityIdx); - return mDPartition.m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -308,21 +318,35 @@ class dPartitionSoA const -> const T& { Idx::Offset p = getPitch(cell, cardinalityIdx); - return mDPartition.m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local) const -> Neon::index_3d { - Neon::index_3d result = local.mLocation + m_origin; - result.z -= mDPartition.m_zHaloRadius; + Neon::index_3d result = local.mLocation + mOrigin; + result.z -= mZHaloRadius; return result; } + template + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local) + const -> int + { + if constexpr (Neon::index_3d::directionZ != direction) { + return local.mLocation.v[direction] + + mOrigin.v[direction]; + } else { + return local.mLocation.v[Neon::index_3d::directionZ] + + mOrigin.v[Neon::index_3d::directionZ] - + mZHaloRadius; + } + } + NEON_CUDA_HOST_DEVICE inline auto getDomainSize() const -> Neon::index_3d { - return mDPartition.m_fullGridSize; + return mFullGridSize; } Neon::DataView mDataView; diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h index 83d5a2dc..3aee038c 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h @@ -1,6 +1,8 @@ #pragma once #include "Neon/set/DevSet.h" #include "dIndexSoA.h" +#include "Neon/domain/details/dGrid/dSpan.h" + namespace Neon::domain::details::dGridSoA { /** @@ -40,6 +42,9 @@ class dSpanSoA helpGetDim() const -> Neon::index_3d const&; + NEON_CUDA_HOST_DEVICE inline auto + helpInit(Neon::domain::details::dGrid::dSpan const&) ->void; + private: Neon::DataView mDataView; int mZHaloRadius; diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h index a3dff4cf..421a3f27 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h @@ -14,19 +14,19 @@ dSpanSoA::setAndValidate(Idx& idx, idx.setLocation().y = int(y); idx.setLocation().z = int(z); - if (idx.get() < mDim) { + if (idx.getLocation() < mDim) { res = true; } switch (mDataView) { case Neon::DataView::STANDARD: { idx.setLocation().z += mZHaloRadius; - idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; return res; } case Neon::DataView::INTERNAL: { idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; - idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; return res; } case Neon::DataView::BOUNDARY: { @@ -35,7 +35,7 @@ dSpanSoA::setAndValidate(Idx& idx, ? 0 : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); idx.setLocation().z += mZHaloRadius; - idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; return res; } default: { @@ -68,4 +68,13 @@ NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDim() return mDim; } +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) ->void +{ + mDataView = dspan.helpGetDataView(); + mZHaloRadius = dspan.helpGetZHaloRadius(); + mZBoundaryRadius = dspan.helpGetZBoundaryRadius(); + mDim = dspan.helpGetDim(); +} + + } // namespace Neon::domain::details::dGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h index 012a3588..62b75981 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h @@ -59,7 +59,7 @@ class ePartition * | * | Connectivity table has the same layout of a field with cardinality equal to * | the number of neighbours and an SoA layout. Let's call this field nghField. - * | nghField(e, nghIdx) is the eIdx_t of the neighbour element as in a STANDARD + * | nghField(e, helpGetNghIdx) is the eIdx_t of the neighbour element as in a STANDARD * | view. * |--) */ diff --git a/libNeonDomain/include/Neon/domain/tools/GridTransformer.h b/libNeonDomain/include/Neon/domain/tools/GridTransformer.h index 90556fb9..47518f7a 100644 --- a/libNeonDomain/include/Neon/domain/tools/GridTransformer.h +++ b/libNeonDomain/include/Neon/domain/tools/GridTransformer.h @@ -1,10 +1,10 @@ #pragma once +#include "Neon/domain/tools/PartitionTable.h" +#include "Neon/domain/tools/SpanTable.h" #include "Neon/domain/tools/gridTransformer/tField.h" #include "Neon/domain/tools/gridTransformer/tGrid.h" #include "Neon/domain/tools/gridTransformer/tGrid_ti.h" -#include "Neon/domain/tools/PartitionTable.h" -#include "Neon/domain/tools/SpanTable.h" namespace Neon::domain::tool { @@ -24,9 +24,10 @@ template class GridTransformer { public: + using Idx = typename GridTransformation::Idx; + using Span = typename GridTransformation::Span; template using Partition = typename GridTransformation::template Partition; - using Span = typename GridTransformation::Span; using FoundationGrid = typename GridTransformation::FoundationGrid; using Grid = details::tGrid; diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h index d6d98be1..bd28e8f5 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h @@ -54,6 +54,15 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate + tGrid(const Neon::Backend& backend /**< Target for computation */, + const Neon::int32_3d& dimension /**< Dimension of the bounding box containing the domain */, + const SparsityPattern& activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */, + const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */, + const Vec_3d& spacing = Vec_3d(1, 1, 1) /**< Spacing, i.e. size of a voxel */, + const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */); + tGrid(const tGrid& other); // copy constructor tGrid(tGrid&& other) noexcept; // move constructor tGrid& operator=(const tGrid& other); // copy assignment @@ -109,7 +118,7 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate(bk); } diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h index 4ba1403d..0a0249d7 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h @@ -30,6 +30,34 @@ tGrid::tGrid(FoundationGrid& foundationGrid) foundationGrid.getOrigin()); } +template +template +tGrid::tGrid(const Neon::Backend& bk, + const Neon::int32_3d& dimension, + const SparsityPattern& activeCellLambda, + const Neon::domain::Stencil& stencil, + const Vec_3d& spacing, + const Vec_3d& origin) +{ + mData = std::make_shared(bk); + mData->foundationGrid = FoundationGrid(bk, + dimension, + activeCellLambda, + stencil, + spacing, + origin); + GridTransformation::initSpan(mData->foundationGrid, + NEON_OUT mData->spanTable); + tGrid::GridBase::init("tGrid", + bk, + mData->foundationGrid.getDimension(), + mData->foundationGrid.getStencil(), + mData->foundationGrid.getNumActiveCellsPerPartition(), + mData->foundationGrid.getDefaultBlock(), + mData->foundationGrid.getSpacing(), + mData->foundationGrid.getOrigin()); +} + template tGrid::tGrid() { diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu index 158d3e05..1b94b566 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu +++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu @@ -1,5 +1,6 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" @@ -27,18 +28,18 @@ auto defContainer(int streamIdx, return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); Neon::index_3d globalPoint = a.getGlobalIndex(e); - a(e, 0) = globalPoint.x ; + a(e, 0) = globalPoint.x; b(e, 0) = globalPoint.y; c(e, 0) = globalPoint.z; -// if constexpr (std::is_same_v) { -// printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx, -// e.mInDataBlockIdx.x, -// e.mInDataBlockIdx.y, -// e.mInDataBlockIdx.z, -// globalPoint.x, -// globalPoint.y, -// globalPoint.z); -// } + // if constexpr (std::is_same_v) { + // printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx, + // e.mInDataBlockIdx.x, + // e.mInDataBlockIdx.y, + // e.mInDataBlockIdx.z, + // globalPoint.x, + // globalPoint.y, + // globalPoint.z); + // } }; }); } @@ -98,5 +99,6 @@ auto run(TestData& data) -> void template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace globalIdx \ No newline at end of file diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h index 0a3b87eb..c766f7ca 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h +++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h @@ -3,9 +3,9 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" - namespace globalIdx { using namespace Neon::domain::tool::testing; @@ -15,6 +15,7 @@ auto run(TestData& data) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; -} // namespace map +} // namespace globalIdx diff --git a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp index 783830ca..f0ecce78 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp @@ -4,7 +4,7 @@ #include "globalIdx.h" #include "runHelper.h" -TEST(domain_unit_test_globalIdx, dGrid) +TEST(domain_globalIdx, dGrid) { int nGpus = 3; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid) 1); } -TEST(domain_unit_test_globalIdx, eGrid) +TEST(domain_globalIdx, eGrid) { int nGpus = 3; using Type = int64_t; @@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid) 1); } -TEST(domain_unit_test_globalIdx, bGrid) +TEST(domain_globalIdx, bGrid) { int nGpus = 3; using Type = int64_t; @@ -31,6 +31,15 @@ TEST(domain_unit_test_globalIdx, bGrid) 1); } +TEST(domain_globalIdx, dGridSoA) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/gtests.cpp b/libNeonDomain/tests/domain-map/src/gtests.cpp index d0d43b60..50d6e34d 100644 --- a/libNeonDomain/tests/domain-map/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map/src/gtests.cpp @@ -31,6 +31,15 @@ TEST(domain_map, bGrid) 1); } +TEST(domain_map, dGridSoA) +{ + int nGpus = 1; + using Type = int64_t; + runAllTestConfiguration(std::function(map::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/map.cu b/libNeonDomain/tests/domain-map/src/map.cu index bd25f178..b001d832 100644 --- a/libNeonDomain/tests/domain-map/src/map.cu +++ b/libNeonDomain/tests/domain-map/src/map.cu @@ -4,6 +4,7 @@ #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" #include "gtest/gtest.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace map { @@ -75,6 +76,7 @@ auto run(TestData& data) -> void template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map/src/map.h b/libNeonDomain/tests/domain-map/src/map.h index 611f2046..16073657 100644 --- a/libNeonDomain/tests/domain-map/src/map.h +++ b/libNeonDomain/tests/domain-map/src/map.h @@ -4,6 +4,7 @@ #include "Neon/domain/Grids.h" #include "Neon/domain/tools/TestData.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace map { @@ -14,6 +15,8 @@ auto run(TestData& data) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; } // namespace map From ceab2a6f62dd72d4faedfadaea2be33b3ab4f565 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 26 Jun 2023 11:32:35 -0400 Subject: [PATCH 16/25] domain_neighbour_globalIdx for dGridSoA --- .../Neon/domain/details/dGrid/dPartition.h | 75 +++--- .../domain-neighbour-globalIdx/src/gtests.cpp | 55 ++++- .../src/runHelper.h | 1 + .../src/testsAndContainers.cu | 220 ++++++++++++++++-- .../src/testsAndContainers.h | 9 + 5 files changed, 306 insertions(+), 54 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 86faf619..2becc97d 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -258,43 +258,44 @@ class dPartition Idx& gidxNgh) const -> bool { - return helpGetNghIdx(gidx, NghIdx{xOff, yOff, zOff}, gidxNgh); - // gidxNgh = Idx(gidx.getLocation().x + xOff, - // gidx.getLocation().y + yOff, - // gidx.getLocation().z + zOff); - // - // bool isValidNeighbour = true; - // if constexpr (xOff > 0) { - // int constexpr direction = Neon::index_3d::directionX; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; - // } - // if constexpr (xOff < 0) { - // int constexpr direction = Neon::index_3d::directionX; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; - // } - // if constexpr (yOff > 0) { - // int constexpr direction = Neon::index_3d::directionY; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; - // } - // if constexpr (yOff < 0) { - // int constexpr direction = Neon::index_3d::directionY; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; - // } - // if constexpr (zOff > 0) { - // int constexpr direction = Neon::index_3d::directionZ; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; - // } - // if constexpr (zOff < 0) { - // int constexpr direction = Neon::index_3d::directionZ; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; - // } - // return isValidNeighbour; + // NghIdx offset(xOff, yOff, zOff); + // return helpGetNghIdx(gidx, offset, gidxNgh); + gidxNgh = Idx(gidx.getLocation().x + xOff, + gidx.getLocation().y + yOff, + gidx.getLocation().z + zOff); + + bool isValidNeighbour = true; + if constexpr (xOff > 0) { + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (xOff < 0) { + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + if constexpr (yOff > 0) { + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (yOff < 0) { + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + if constexpr (zOff > 0) { + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (zOff < 0) { + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + return isValidNeighbour; } diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp index feba5a9b..21bba9b5 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp @@ -1,10 +1,10 @@ +#include "./testsAndContainers.h" #include "Neon/Neon.h" #include "gtest/gtest.h" -#include "./testsAndContainers.h" #include "runHelper.h" -TEST(domain_unit_test_globalIdx, dGrid) +TEST(domain_neighbour_globalIdx, dGrid) { int nGpus = 5; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid) 1); } -TEST(domain_unit_test_globalIdx, eGrid) +TEST(domain_neighbour_globalIdx, eGrid) { int nGpus = 5; using Type = int64_t; @@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid) 1); } -TEST(domain_unit_test_globalIdx, bGrid) +TEST(domain_neighbour_globalIdx, bGrid) { int nGpus = 5; using Type = int64_t; @@ -31,6 +31,53 @@ TEST(domain_unit_test_globalIdx, bGrid) 1); } +TEST(domain_neighbour_globalIdx, dGridSoA) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::run), + nGpus, + 1); +} + +/////////////////////////////////////////// + +TEST(domain_neighbour_globalIdx, dGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, eGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, bGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, dGridSoA_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h index 0014594c..32a078d6 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h @@ -9,6 +9,7 @@ #include "Neon/domain/dGrid.h" #include "Neon/domain/eGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/Geometries.h" #include "Neon/domain/tools/TestData.h" diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu index 49dd3bd2..7b2c3fef 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu @@ -1,5 +1,6 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" @@ -61,15 +62,15 @@ auto checkNeighbourData(Field const& filedA, Field const& filedB, Field const& filedC, Neon::index_3d testDirection, - Field const& checkFlatA, - Field const& checkFlatB, - Field const& checkFlatC) + Field& checkFlatA, + Field& checkFlatB, + Field& checkFlatC) -> Neon::set::Container { const auto& grid = filedA.getGrid(); return grid.newContainer( "defContainer", - [&](Neon::set::Loader& loader) { + [&, testDirection](Neon::set::Loader& loader) { auto a = loader.load(filedA, Neon::Pattern::STENCIL); auto b = loader.load(filedB, Neon::Pattern::STENCIL); auto c = loader.load(filedC, Neon::Pattern::STENCIL); @@ -102,6 +103,58 @@ auto checkNeighbourData(Field const& filedA, }); } +template +auto checkNeighbourDataTemplate(Field const& filedA, + Field const& filedB, + Field const& filedC, + Field& checkFlatA, + Field& checkFlatB, + Field& checkFlatC) + -> Neon::set::Container +{ + const auto& grid = filedA.getGrid(); + return grid.newContainer( + "defContainer", + [&](Neon::set::Loader& loader) { + auto a = loader.load(filedA, Neon::Pattern::STENCIL); + auto b = loader.load(filedB, Neon::Pattern::STENCIL); + auto c = loader.load(filedC, Neon::Pattern::STENCIL); + + auto resA = loader.load(checkFlatA, Neon::Pattern::MAP); + auto resB = loader.load(checkFlatB, Neon::Pattern::MAP); + auto resC = loader.load(checkFlatC, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { + constexpr Neon::index_3d testDirection(xOff, yOff, zOff); + + // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); + Neon::index_3d globalPoint = a.getGlobalIndex(e); + auto ngh = globalPoint + testDirection; + + decltype(a)* nghInfo[3] = {&a, &b, &c}; + decltype(a)* results[3] = {&resA, &resB, &resC}; + + for (int i = 0; i < 3; i++) { + auto d = nghInfo[i]->template getNghData(e, 0); + // auto d = nghInfo[i]->getNghData(e, testDirection.newType(), 0); + + if (d.isValid()) { + results[i]->operator()(e, 0) = d.getData() == ngh.v[i] ? +1 : -1; + if (d.getData() != ngh.v[i]) { + printf("ERROR: %d %d %d %d %d %d\n", globalPoint.x, globalPoint.y, globalPoint.z, ngh.v[0], ngh.v[1], ngh.v[2]); + d = nghInfo[i]->getNghData(e, testDirection.newType(), 0); + } + } else { + results[i]->operator()(e, 0) = 0; + } + } + }; + }); +} + using namespace Neon::domain::tool::testing; template @@ -165,15 +218,15 @@ auto run(TestData& data) -> void X, Y, Z); }; - // constexpr std::array - // stencil{Ngh3DIdx(1, 0, 0), - // Ngh3DIdx(-1, 0, 0), - // Ngh3DIdx(0, 1, 0), - // Ngh3DIdx(0, -1, 0), - // Ngh3DIdx(0, 0, 1), - // Ngh3DIdx(0, 0, -1)}; - constexpr std::array - stencil{Ngh3DIdx(0, 0, -1)}; + constexpr std::array + stencil{Ngh3DIdx(1, 0, 0), + Ngh3DIdx(-1, 0, 0), + Ngh3DIdx(0, 1, 0), + Ngh3DIdx(0, -1, 0), + Ngh3DIdx(0, 0, 1), + Ngh3DIdx(0, 0, -1)}; + // constexpr std::array + // stencil{Ngh3DIdx(0, 0, -1)}; for (auto const& direction : stencil) { reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx); @@ -214,8 +267,149 @@ auto run(TestData& data) -> void } } +template +auto runTemplate(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + + data.resetValuesToLinear(1, 100); + + auto aField = grid.template newField("a", 1, 0); + auto bField = grid.template newField("a", 1, 0); + auto cField = grid.template newField("a", 1, 0); + + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + auto& Z = data.getField(FieldNames::Z); + + const Neon::index_3d dim = grid.getDimension(); + auto bk = grid.getBackend(); + + { // NEON + { + initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + } + using Ngh3DIdx = Neon::int32_3d; + + auto setGolden = [&](Ngh3DIdx const& direction) { // Golden data + auto& X = data.getIODomain(FieldNames::X); + auto& Y = data.getIODomain(FieldNames::Y); + auto& Z = data.getIODomain(FieldNames::Z); + + data.forEachActiveIODomain([&](const Neon::index_3d& idx, + int cardinality, + Type& a, + Type& b, + Type& c) { + a = 1; + b = 1; + c = 1; + auto ngh = direction + idx; + if (!(ngh >= 0)) { + a = 0; + b = 0; + c = 0; + } + if (!(dim > ngh)) { + a = 0; + b = 0; + c = 0; + } + }, + X, Y, Z); + }; + + constexpr std::array + stencil{Ngh3DIdx(1, 0, 0), + Ngh3DIdx(-1, 0, 0), + Ngh3DIdx(0, 1, 0), + Ngh3DIdx(0, -1, 0), + Ngh3DIdx(0, 0, 1), + Ngh3DIdx(0, 0, -1)}; + // constexpr std::array + // stencil{Ngh3DIdx(0, 0, -1)}; + + for (auto const& direction : stencil) { + reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + reset(X, Y, Z).run(Neon::Backend::mainStreamIdx); + { // Updating halo with wrong data + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + { + initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + + + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + + if (direction == Neon::index_3d(1, 0, 0)) { + checkNeighbourDataTemplate<1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(-1, 0, 0)) { + checkNeighbourDataTemplate<-1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 1, 0)) { + checkNeighbourDataTemplate<0, 1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, -1, 0)) { + checkNeighbourDataTemplate<0, -1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 0, 1)) { + checkNeighbourDataTemplate<0, 0, 1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 0, -1)) { + checkNeighbourDataTemplate<0, 0, -1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else { + std::cout << "Direction not implemented " << direction << std::endl; + exit(99); + } + setGolden(direction); + + bk.sync(Neon::Backend::mainStreamIdx); + bool isOk = data.compare(FieldNames::X); + isOk = isOk && data.compare(FieldNames::Y); + isOk = isOk && data.compare(FieldNames::Z); + + if (!isOk) { + std::cout << "Direction with errors " << direction << std::endl; + data.getField(FieldNames::X).ioToVtk(grid.getImplementationName() + "X", "X", true); + data.getField(FieldNames::Y).ioToVtk(grid.getImplementationName() + "Y", "Y", true); + data.getField(FieldNames::Z).ioToVtk(grid.getImplementationName() + "Z", "Z", true); + exit(77); + ASSERT_TRUE(isOk); + } + } +} + + template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; + + +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; } // namespace globalIdx \ No newline at end of file diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h index 0a3b87eb..bcf503f2 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h @@ -4,6 +4,7 @@ #include "Neon/domain/Grids.h" #include "Neon/domain/tools/TestData.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace globalIdx { @@ -12,9 +13,17 @@ using namespace Neon::domain::tool::testing; template auto run(TestData& data) -> void; +template +auto runTemplate(TestData& data) -> void; + extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; } // namespace map From 13377a4af18430dfc9bf7ec16afe2fcb2d209520 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 27 Jun 2023 10:08:19 -0400 Subject: [PATCH 17/25] Testing block sizes on bGrid --- .../lbm-lid-driven-cavity-flow/src/LbmTools.h | 8 +- .../src/LbmToolsTemplateOnly.h | 440 ++++++++++++++++++ .../src/RunCavityTwoPop.cu | 27 +- .../domain/details/dGridSoA/dSpanSoA_imp.h | 52 ++- .../tests/domain-map/src/runHelper.h | 4 +- 5 files changed, 501 insertions(+), 30 deletions(-) create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h index 5728a5d3..ab79ed2a 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h @@ -35,7 +35,6 @@ struct LbmContainers(i, GOid, 0.0).value; \ @@ -101,8 +100,6 @@ struct LbmContainers; constexpr std::array stencil{ @@ -160,7 +157,6 @@ struct LbmContainers(pop_out_06); + fOut(i, 16) = static_cast(pop_out_opp_06); COMPUTE_GO_AND_BACK(7, 17) COMPUTE_GO_AND_BACK(8, 18) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h new file mode 100644 index 00000000..fc4d7806 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h @@ -0,0 +1,440 @@ +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +#define COMPUTE_CAST(VAR) static_cast((VAR)) + +template +struct LbmContainersTemplateOnly +{ +}; + +/** + * Specialization for Lattice + * @tparam PopulationField + * @tparam LbmComputeType + */ +template +struct LbmContainersTemplateOnly, + PopulationField, + LbmComputeType> +{ + using LbmStoreType = typename PopulationField::Type; + using CellTypeField = typename PopulationField::Grid::template Field; + using Lattice = D3Q19Template; + using Idx = typename PopulationField::Idx; + using Grid = typename PopulationField::Grid; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + +#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ + popIn[GOid] = fin.template read(gidx); \ + } else { \ + popIn[GOid] = fin.template nghVal(gidx).value; \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin.template read(gidx); \ + } else { \ + popIn[BKid] = fin.template nghVal(gidx).value; \ + } \ + } \ + } + static inline NEON_CUDA_HOST_DEVICE auto + loadPopulation(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopulationField::Partition const& fin, + NEON_OUT LbmStoreType popIn[19]) + { + // #pragma omp critical + // { + + LOADPOP(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); + LOADPOP(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); + LOADPOP(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); + LOADPOP(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); + LOADPOP(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); + LOADPOP(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); + LOADPOP(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); + LOADPOP(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); + LOADPOP(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); + // } + // Treat the case of the center (c[k] = {0, 0, 0,}). + { + popIn[Lattice::centerDirection] = fin(i, Lattice::centerDirection); + } + } +#undef LOADPOP + +#define PULL_STREAM(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ + popIn[GOid] = fin(gidx, BKid) + \ + fin.template getNghData(gidx, BKid)(); \ + } else { \ + popIn[GOid] = fin.template getNghData(gidx, GOid)(); \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin(gidx, GOid) + fin.template getNghData(gidx, GOid)(); \ + } else { \ + popIn[BKid] = fin.template getNghData(gidx, BKid)(); \ + } \ + } \ + } + + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopulationField::Partition const& fin, + NEON_OUT LbmStoreType popIn[19]) + { + // #pragma omp critical + // { +#if 0 + using TopologyByDirection = std::tuple; + constexpr std::array stencil{ + std::make_tuple(Neon::int32_3d(-1, 0, 0), /* GOid */ 0, /* --- */ Neon::int32_3d(1, 0, 0), /* BKid */ 10), + std::make_tuple(Neon::int32_3d(0, -1, 0), /* GOid */ 1, /* --- */ Neon::int32_3d(0, 1, 0), /* BKid */ 11), + std::make_tuple(Neon::int32_3d(0, 0, -1), /* GOid */ 2, /* --- */ Neon::int32_3d(0, 0, 1), /* BKid */ 12), + std::make_tuple(Neon::int32_3d(-1, -1, 0), /* GOid */ 3, /* --- */ Neon::int32_3d(1, 1, 0), /* BKid */ 13), + std::make_tuple(Neon::int32_3d(-1, 1, 0), /* GOid */ 4, /* --- */ Neon::int32_3d(1, -1, 0), /* BKid */ 14), + std::make_tuple(Neon::int32_3d(-1, 0, -1), /* GOid */ 5, /* --- */ Neon::int32_3d(1, 0, 1), /* BKid */ 15), + std::make_tuple(Neon::int32_3d(-1, 0, 1), /* GOid */ 6, /* --- */ Neon::int32_3d(1, 0, -1), /* BKid */ 16), + std::make_tuple(Neon::int32_3d(0, -1, -1), /* GOid */ 7, /* --- */ Neon::int32_3d(0, 1, 1), /* BKid */ 17), + std::make_tuple(Neon::int32_3d(0, -1, 1), /* GOid */ 8, /* --- */ Neon::int32_3d(0, 1, -1), /* BKid */ 18)}; + + + auto pullStream = [&]() { + static_assert(stencilIdx < 9); + constexpr int GOid = std::get<1>(stencil[stencilIdx]); + constexpr int BKid = std::get<3>(stencil[stencilIdx]); + constexpr Neon::int32_3d GoOffset = std::get<0>(stencil[stencilIdx]); + constexpr Neon::int32_3d BkOffset = std::get<2>(stencil[stencilIdx]); + { + if (wallBitFlag & (uint32_t(1) << GOid)) { + popIn[GOid] = fin(gidx, BKid) + + fin.template getNghData(gidx, BKid)(); + } else { + popIn[GOid] = fin.template getNghData(gidx, GOid)(); + } + } + { /*BK*/ + if (wallBitFlag & (uint32_t(1) << BKid)) { + popIn[BKid] = fin(gidx, GOid) + + fin.template getNghData(gidx, GOid)(); + } else { + popIn[BKid] = fin.template getNghData(gidx, BKid)(); + } + } + }; + pullStream.template operator()<0>(); + pullStream.template operator()<1>(); + pullStream.template operator()<2>(); + pullStream.template operator()<3>(); + pullStream.template operator()<4>(); + pullStream.template operator()<5>(); + pullStream.template operator()<6>(); + pullStream.template operator()<7>(); + pullStream.template operator()<8>(); +#endif + PULL_STREAM(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); + PULL_STREAM(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); + PULL_STREAM(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); + PULL_STREAM(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); + PULL_STREAM(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); + PULL_STREAM(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); + PULL_STREAM(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); + PULL_STREAM(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); + PULL_STREAM(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); + + // } + // Treat the case of the center (c[k] = {0, 0, 0,}). + { + popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection); + } + } +#undef PULL_STREAM + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const LbmStoreType pop[Lattice::Q], + NEON_OUT LbmComputeType& rho, + NEON_OUT std::array& u) + -> void + { +#define POP(IDX) static_cast(pop[IDX]) + + const LbmComputeType X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); + const LbmComputeType X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); + const LbmComputeType X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); + + const LbmComputeType Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); + const LbmComputeType Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); + + const LbmComputeType Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); + const LbmComputeType Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); + +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! LbmComputeType iterator */, + const LbmStoreType pop[Lattice::Q], + LbmComputeType const& rho /*! Density */, + std::array const& u /*! Velocity */, + LbmComputeType const& usqr /*! Usqr */, + LbmComputeType const& omega /*! Omega */, + typename PopulationField::Partition& fOut /*! Population */) + + -> void + { + const LbmComputeType ck_u03 = u[0] + u[1]; + const LbmComputeType ck_u04 = u[0] - u[1]; + const LbmComputeType ck_u05 = u[0] + u[2]; + const LbmComputeType ck_u06 = u[0] - u[2]; + const LbmComputeType ck_u07 = u[1] + u[2]; + const LbmComputeType ck_u08 = u[1] - u[2]; + + const LbmComputeType eq_00 = rho * (1. / 18.) * (1. - 3. * u[0] + 4.5 * u[0] * u[0] - usqr); + const LbmComputeType eq_01 = rho * (1. / 18.) * (1. - 3. * u[1] + 4.5 * u[1] * u[1] - usqr); + const LbmComputeType eq_02 = rho * (1. / 18.) * (1. - 3. * u[2] + 4.5 * u[2] * u[2] - usqr); + const LbmComputeType eq_03 = rho * (1. / 36.) * (1. - 3. * ck_u03 + 4.5 * ck_u03 * ck_u03 - usqr); + const LbmComputeType eq_04 = rho * (1. / 36.) * (1. - 3. * ck_u04 + 4.5 * ck_u04 * ck_u04 - usqr); + const LbmComputeType eq_05 = rho * (1. / 36.) * (1. - 3. * ck_u05 + 4.5 * ck_u05 * ck_u05 - usqr); + const LbmComputeType eq_06 = rho * (1. / 36.) * (1. - 3. * ck_u06 + 4.5 * ck_u06 * ck_u06 - usqr); + const LbmComputeType eq_07 = rho * (1. / 36.) * (1. - 3. * ck_u07 + 4.5 * ck_u07 * ck_u07 - usqr); + const LbmComputeType eq_08 = rho * (1. / 36.) * (1. - 3. * ck_u08 + 4.5 * ck_u08 * ck_u08 - usqr); + + const LbmComputeType eqopp_00 = eq_00 + rho * (1. / 18.) * 6. * u[0]; + const LbmComputeType eqopp_01 = eq_01 + rho * (1. / 18.) * 6. * u[1]; + const LbmComputeType eqopp_02 = eq_02 + rho * (1. / 18.) * 6. * u[2]; + const LbmComputeType eqopp_03 = eq_03 + rho * (1. / 36.) * 6. * ck_u03; + const LbmComputeType eqopp_04 = eq_04 + rho * (1. / 36.) * 6. * ck_u04; + const LbmComputeType eqopp_05 = eq_05 + rho * (1. / 36.) * 6. * ck_u05; + const LbmComputeType eqopp_06 = eq_06 + rho * (1. / 36.) * 6. * ck_u06; + const LbmComputeType eqopp_07 = eq_07 + rho * (1. / 36.) * 6. * ck_u07; + const LbmComputeType eqopp_08 = eq_08 + rho * (1. / 36.) * 6. * ck_u08; + + const LbmComputeType pop_out_00 = (1. - omega) * static_cast(pop[0]) + omega * eq_00; + const LbmComputeType pop_out_01 = (1. - omega) * static_cast(pop[1]) + omega * eq_01; + const LbmComputeType pop_out_02 = (1. - omega) * static_cast(pop[2]) + omega * eq_02; + const LbmComputeType pop_out_03 = (1. - omega) * static_cast(pop[3]) + omega * eq_03; + const LbmComputeType pop_out_04 = (1. - omega) * static_cast(pop[4]) + omega * eq_04; + const LbmComputeType pop_out_05 = (1. - omega) * static_cast(pop[5]) + omega * eq_05; + const LbmComputeType pop_out_06 = (1. - omega) * static_cast(pop[6]) + omega * eq_06; + const LbmComputeType pop_out_07 = (1. - omega) * static_cast(pop[7]) + omega * eq_07; + const LbmComputeType pop_out_08 = (1. - omega) * static_cast(pop[8]) + omega * eq_08; + + const LbmComputeType pop_out_opp_00 = (1. - omega) * static_cast(pop[10]) + omega * eqopp_00; + const LbmComputeType pop_out_opp_01 = (1. - omega) * static_cast(pop[11]) + omega * eqopp_01; + const LbmComputeType pop_out_opp_02 = (1. - omega) * static_cast(pop[12]) + omega * eqopp_02; + const LbmComputeType pop_out_opp_03 = (1. - omega) * static_cast(pop[13]) + omega * eqopp_03; + const LbmComputeType pop_out_opp_04 = (1. - omega) * static_cast(pop[14]) + omega * eqopp_04; + const LbmComputeType pop_out_opp_05 = (1. - omega) * static_cast(pop[15]) + omega * eqopp_05; + const LbmComputeType pop_out_opp_06 = (1. - omega) * static_cast(pop[16]) + omega * eqopp_06; + const LbmComputeType pop_out_opp_07 = (1. - omega) * static_cast(pop[17]) + omega * eqopp_07; + const LbmComputeType pop_out_opp_08 = (1. - omega) * static_cast(pop[18]) + omega * eqopp_08; + + +#define COMPUTE_GO_AND_BACK(GOid, BKid) \ + { \ + fOut(i, GOid) = static_cast(pop_out_0##GOid); \ + fOut(i, BKid) = static_cast(pop_out_opp_0##GOid); \ + } + + COMPUTE_GO_AND_BACK(0, 10) + COMPUTE_GO_AND_BACK(1, 11) + COMPUTE_GO_AND_BACK(2, 12) + COMPUTE_GO_AND_BACK(3, 13) + COMPUTE_GO_AND_BACK(4, 14) + COMPUTE_GO_AND_BACK(5, 15) + COMPUTE_GO_AND_BACK(6, 16) + COMPUTE_GO_AND_BACK(7, 17) + COMPUTE_GO_AND_BACK(8, 18) + +#undef COMPUTE_GO_AND_BACK + + { + const LbmComputeType eq_09 = rho * (1. / 3.) * (1. - usqr); + const LbmComputeType pop_out_09 = (1. - omega) * + static_cast(pop[Lattice::centerDirection]) + + omega * eq_09; + fOut(i, Lattice::centerDirection) = static_cast(pop_out_09); + } + } + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopulationField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const LbmComputeType omega /*! LBM omega parameter */, + PopulationField& fOutField /*! output Population field */) + -> Neon::set::Container + { + + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + LbmStoreType popIn[Lattice::Q]; + pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + LbmComputeType rho; + std::array u{.0, .0, .0}; + macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + LbmComputeType usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } + +#define COMPUTE_MASK_WALL(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ + if (nghCellType.classification != CellType::bulk) { \ + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid)); \ + } \ + } \ + { /*BK*/ \ + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ + if (nghCellType.classification != CellType::bulk) { \ + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << BKid)); \ + } \ + } \ + } + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, + Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + COMPUTE_MASK_WALL(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10) + COMPUTE_MASK_WALL(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11) + COMPUTE_MASK_WALL(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12) + COMPUTE_MASK_WALL(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13) + COMPUTE_MASK_WALL(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14) + COMPUTE_MASK_WALL(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15) + COMPUTE_MASK_WALL(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16) + COMPUTE_MASK_WALL(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17) + COMPUTE_MASK_WALL(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18) + + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } +#undef COMPUTE_MASK_WALL + +#define BC_LOAD(GOID, DKID) \ + popIn[GOID] = fIn(gidx, GOID); \ + popIn[DKID] = fIn(gidx, DKID); + + static auto + computeRhoAndU([[maybe_unused]] const PopulationField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + LbmComputeType rho = 0; + std::array u{.0, .0, .0}; + LbmStoreType popIn[Lattice::Q]; + + if (cellInfo.classification == CellType::bulk) { + pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + BC_LOAD(0, 10) + BC_LOAD(1, 11) + BC_LOAD(2, 12) + BC_LOAD(3, 13) + BC_LOAD(4, 14) + BC_LOAD(5, 15) + BC_LOAD(6, 16) + BC_LOAD(7, 17) + BC_LOAD(8, 18) + popIn[9] = fIn(gidx, 9); + + rho = 1.0; + u = std::array{COMPUTE_CAST(popIn[0]) / COMPUTE_CAST(6. * 1. / 18.), + COMPUTE_CAST(popIn[1]) / COMPUTE_CAST(6. * 1. / 18.), + COMPUTE_CAST(popIn[2]) / COMPUTE_CAST(6. * 1. / 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } +}; + +#undef COMPUTE_CAST \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index 29c7573d..e91055f9 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -2,8 +2,8 @@ #include "D3Q19.h" #include "Neon/domain/bGrid.h" #include "Neon/domain/dGrid.h" -#include "Neon/domain/eGrid.h" #include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/eGrid.h" #include "CellType.h" #include "LbmIteration.h" @@ -314,6 +314,31 @@ auto run(Config& config, if (config.gridType == "bGrid") { return details::runFilterStoreType(config, report); } + if (config.gridType == "bGrid_4_4_4") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_8_4") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_8_4") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 4, 8>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_2_8") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_8_2") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } if (config.gridType == "dGridSoA") { return details::runFilterStoreType(config, report); } diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h index 421a3f27..f760adb5 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h @@ -4,71 +4,77 @@ namespace Neon::domain::details::dGridSoA { NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::setAndValidate(Idx& idx, - const uint32_t& x, - const uint32_t& y, - const uint32_t& z) + const uint32_t& x, + const uint32_t& y, + const uint32_t& z) const -> bool { - bool res = false; idx.setLocation().x = int(x); idx.setLocation().y = int(y); idx.setLocation().z = int(z); - if (idx.getLocation() < mDim) { - res = true; - } + bool isValid = idx.getLocation() < mDim; switch (mDataView) { case Neon::DataView::STANDARD: { idx.setLocation().z += mZHaloRadius; - idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; - return res; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; } case Neon::DataView::INTERNAL: { idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; - idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; - return res; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; } case Neon::DataView::BOUNDARY: { - idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius - ? 0 - : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); + ? 0 + : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); idx.setLocation().z += mZHaloRadius; - idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; - return res; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; } default: { } } - return false; + return isValid; } -NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDataView() +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetDataView() const -> Neon::DataView const& { return mDataView; } -NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZHaloRadius() +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetZHaloRadius() const -> int const& { return mZHaloRadius; } -NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZBoundaryRadius() +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetZBoundaryRadius() const -> int const& { return mZBoundaryRadius; } -NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDim() +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetDim() const -> Neon::index_3d const& { return mDim; } -NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) ->void +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) -> void { mDataView = dspan.helpGetDataView(); mZHaloRadius = dspan.helpGetZHaloRadius(); @@ -77,4 +83,4 @@ NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGr } -} // namespace Neon::domain::details::dGrid \ No newline at end of file +} // namespace Neon::domain::details::dGridSoA \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map/src/runHelper.h b/libNeonDomain/tests/domain-map/src/runHelper.h index 53ea8681..593e31c2 100644 --- a/libNeonDomain/tests/domain-map/src/runHelper.h +++ b/libNeonDomain/tests/domain-map/src/runHelper.h @@ -31,7 +31,7 @@ void runAllTestConfiguration( nGpuTest.push_back(i); } // std::vector nGpuTest{2,4,6,8}; - std::vector cardinalityTest{1}; + std::vector cardinalityTest{1,3,19}; std::vector dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}}; std::vector runtimeE{Neon::Runtime::openmp}; @@ -95,6 +95,7 @@ void runAllTestConfiguration( } } +#if 0 template void runOneTestConfiguration(const std::string& gname, @@ -144,3 +145,4 @@ void runOneTestConfiguration(const std::string& gname, } } } +#endif \ No newline at end of file From 3a36f0c81e830a170712227b463a8c4d7631cf26 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 28 Jun 2023 12:55:22 -0400 Subject: [PATCH 18/25] Adding dGridSoA to the stencil tests --- .../Neon/domain/details/dGridSoA/dPartitionSoA.h | 2 -- .../Neon/domain/tools/gridTransformer/tField.h | 1 + libNeonDomain/tests/domain-stencil/src/gtests.cpp | 11 ++++++++++- libNeonDomain/tests/domain-stencil/src/runHelper.h | 2 +- libNeonDomain/tests/domain-stencil/src/stencil.cu | 1 + libNeonDomain/tests/domain-stencil/src/stencil.h | 3 ++- 6 files changed, 15 insertions(+), 5 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h index 1cdd75db..62fdc9a4 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -353,13 +353,11 @@ class dPartitionSoA T* NEON_RESTRICT mMem; Neon::index_3d mDim; int mZHaloRadius; - int mZBoundaryRadius; Pitch mPitch; int mPrtID; Neon::index_3d mOrigin; int mCardinality; Neon::index_3d mFullGridSize; - bool mPeriodicZ; NghIdx* NEON_RESTRICT mStencil; }; diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h index c9ca59b9..a1b4c90d 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h @@ -26,6 +26,7 @@ class tField : public Neon::domain::interface::FieldBaseTemplate; using Idx = typename Partition::Idx; using NghIdx = typename Partition::NghIdx; // for compatibility with eGrid + using NghData = typename Partition::NghData; // for compatibility with eGrid private: using FoundationGrid = typename GridTransformation::FoundationGrid; diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp index ec6f892a..15816da3 100644 --- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp +++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp @@ -22,7 +22,7 @@ TEST(domain_stencil, eGrid) 1); } -TEST(domain_stencil, bGri ) +TEST(domain_stencil, bGri) { int nGpus = 5; using Type = int64_t; @@ -31,6 +31,15 @@ TEST(domain_stencil, bGri ) 1); } +TEST(domain_stencil, dGridSoA) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-stencil/src/runHelper.h b/libNeonDomain/tests/domain-stencil/src/runHelper.h index e8f286ae..16cefb0f 100644 --- a/libNeonDomain/tests/domain-stencil/src/runHelper.h +++ b/libNeonDomain/tests/domain-stencil/src/runHelper.h @@ -33,7 +33,7 @@ void runAllTestConfiguration( // std::vector nGpuTest{2,4,6,8}; std::vector cardinalityTest{1}; - std::vector dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}}; + std::vector dimTest{{10, 17, 90}, {1, 1, 100}, {17, 1, 77}}; std::vector runtimeE{Neon::Runtime::openmp}; if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) { runtimeE.push_back(Neon::Runtime::stream); diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index a86f1def..d0f19c67 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -203,6 +203,7 @@ auto run(TestData& data) -> void template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.h b/libNeonDomain/tests/domain-stencil/src/stencil.h index a35d8011..7d74196a 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.h +++ b/libNeonDomain/tests/domain-stencil/src/stencil.h @@ -15,5 +15,6 @@ auto run(TestData& data) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; - +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; } // namespace map From a49b27aeaeb83dfdd1ed47debba0fed99221a834 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 29 Jun 2023 11:27:58 -0400 Subject: [PATCH 19/25] WIP --- .../Neon/domain/details/bGrid/bPartition.h | 15 ++ .../domain/details/bGrid/bPartition_imp.h | 34 +++- .../domain/details/dGridSoA/dPartitionSoA.h | 1 + .../Neon/domain/details/eGrid/ePartition.h | 13 ++ .../domain/details/eGrid/ePartition_imp.h | 87 ++++++---- .../tests/domain-stencil/src/gtests.cpp | 52 +++++- .../tests/domain-stencil/src/stencil.cu | 158 +++++++++++++----- .../tests/domain-stencil/src/stencil.h | 20 ++- 8 files changed, 291 insertions(+), 89 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 73ccb914..a03af559 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -98,6 +98,19 @@ class bPartition T defaultValue) const -> NghData; + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t &&( std::is_invocable_v || std::is_same_v), void>; + + /** * Gets the global coordinates of the cartesian point. */ @@ -134,6 +147,8 @@ class bPartition helpGetNghIdx(const Idx& idx) const -> Idx; + + int mCardinality; T* mMem; NghIdx const* NEON_RESTRICT mStencilNghIndex; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index dc4c5880..5fa6f260 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -45,10 +45,10 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: location.x += gidx.mInDataBlockIdx.x; location.y += gidx.mInDataBlockIdx.y; location.z += gidx.mInDataBlockIdx.z; - if constexpr (SBlock::isMultiResMode){ + if constexpr (SBlock::isMultiResMode) { return location * mMultiResDiscreteIdxSpacing; } - return location ; + return location; } template @@ -354,4 +354,34 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: result.set(value, true); return result; } + +template + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + + if (isValid) { + auto const& value = mMem[pitch]; + funIfValid(value); + return; + } + + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h index 62fdc9a4..0572302b 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -20,6 +20,7 @@ class dPartitionSoA using NghData = Neon::domain::NghData; using Pitch = uint32_4d; using NghIdx = int8_3d; + using Type = T; dPartitionSoA() { diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h index 62b75981..05f3101b 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h @@ -188,6 +188,19 @@ class ePartition int card, T defaultValue) const -> NghData; + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t &&( std::is_invocable_v || std::is_same_v), void>; + /** * Check is the * @tparam dataView_ta diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h index 0063ee9e..8565cdc1 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h @@ -37,34 +37,34 @@ ePartition::cardinality() const template NEON_CUDA_HOST_DEVICE inline auto -ePartition::operator()(eIndex eId, int cardinalityIdx) const +ePartition::operator()(eIndex gidx, int cardinalityIdx) const -> T { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem[jump]; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::operator()(eIndex eId, int cardinalityIdx) -> T& +ePartition::operator()(eIndex gidx, int cardinalityIdx) -> T& { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem[jump]; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, +ePartition::getNghData(eIndex gidx, NghIdx nghIdx, int card) const -> NghData { - eIndex eIdxNgh; - const bool isValidNeighbour = isValidNgh(eId, nghIdx, eIdxNgh); + eIndex gidxxNgh; + const bool isValidNeighbour = isValidNgh(gidx, nghIdx, gidxxNgh); if (isValidNeighbour) { - T val = this->operator()(eIdxNgh, card); + T val = this->operator()(gidxxNgh, card); return NghData(val, isValidNeighbour); } return NghData(isValidNeighbour); @@ -73,7 +73,7 @@ ePartition::getNghData(eIndex eId, template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, +ePartition::getNghData(eIndex gidx, const Neon::int8_3d& ngh3dIdx, int card) const -> NghData @@ -82,7 +82,7 @@ ePartition::getNghData(eIndex eId, (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); return res; } @@ -91,15 +91,15 @@ template template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, - int card) +ePartition::getNghData(eIndex gidx, + int card) const -> NghData { int tablePithc = (xOff + mStencilRadius) + (yOff + mStencilRadius) * mStencilTableYPitch + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); return res; } @@ -108,37 +108,66 @@ template template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, - int card, - T defaultVal) +ePartition::getNghData(eIndex gidx, + int card, + T defaultVal) const -> NghData { int tablePithc = (xOff + mStencilRadius) + (yOff + mStencilRadius) * mStencilTableYPitch + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); if (!res.isValid()) { res.set(defaultVal, false); } return res; } +template +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + int tablePithc = (xOff + mStencilRadius) + + (yOff + mStencilRadius) * mStencilTableYPitch + + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(gidx, nghIdx, card); + if (res.isValid()) { + funIfValid(res.getData()); + return; + } + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} + template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghIndex(eIndex eId, +ePartition::getNghIndex(eIndex gidx, const Neon::int8_3d& ngh3dIdx, - eIndex& eIdxNgh) const -> bool + eIndex& gidxxNgh) const -> bool { int tablePithc = (ngh3dIdx.x + mStencilRadius) + (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; eIndex tmpEIdxNgh; - const bool isValidNeighbour = isValidNgh(eId, nghIdx, tmpEIdxNgh); + const bool isValidNeighbour = isValidNgh(gidx, nghIdx, tmpEIdxNgh); if (isValidNeighbour) { - eIdxNgh = tmpEIdxNgh; + gidxxNgh = tmpEIdxNgh; } return isValidNeighbour; } @@ -146,17 +175,17 @@ ePartition::getNghIndex(eIndex eId, template NEON_CUDA_HOST_DEVICE inline auto -ePartition::isValidNgh(eIndex eId, +ePartition::isValidNgh(eIndex gidx, NghIdx nghIdx, eIndex& neighbourIdx) const -> bool { - const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + eId.helpGet(); + const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + gidx.helpGet(); neighbourIdx.helpSet() = NEON_CUDA_CONST_LOAD((mConnectivity + connectivityJumo)); const bool isValidNeighbour = (neighbourIdx.mIdx > -1); - // printf("(prtId %d) getNghData id %d eIdxNgh %d connectivityJumo %d\n", + // printf("(prtId %d) getNghData id %d gidxxNgh %d connectivityJumo %d\n", // mPrtID, - // eId.mIdx, neighbourIdx.mIdx, connectivityJumo); + // gidx.mIdx, neighbourIdx.mIdx, connectivityJumo); return isValidNeighbour; } @@ -201,20 +230,20 @@ ePartition::ePartition(int prtId, template NEON_CUDA_HOST_DEVICE auto -ePartition::pointer(eIndex eId, int cardinalityIdx) const +ePartition::pointer(eIndex gidx, int cardinalityIdx) const -> const Type* { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem + jump; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getOffset(eIndex eId, int cardinalityIdx) const +ePartition::getOffset(eIndex gidx, int cardinalityIdx) const -> Offset { - return Offset(eId.helpGet() * mPitch.x + cardinalityIdx * mPitch.y); + return Offset(gidx.helpGet() * mPitch.x + cardinalityIdx * mPitch.y); } template ), + runAllTestConfiguration(std::function(map::runNoTemplate), nGpus, 1); } -TEST(domain_stencil, eGrid) +TEST(domain_stencil, eGrid_NoTemplate) { int nGpus = 3; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), nGpus, 1); } -TEST(domain_stencil, bGri) +TEST(domain_stencil, bGri_NoTemplate) { int nGpus = 5; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), nGpus, 1); } -TEST(domain_stencil, dGridSoA) +TEST(domain_stencil, dGridSoA_NoTemplate) { int nGpus = 5; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, dGrid_Template) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, eGrid_Template) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, bGri_Template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, dGridSoA_Template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), nGpus, 1); } diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index d0f19c67..926153fa 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -9,8 +9,8 @@ namespace map { template -auto stencilContainer_laplace(const Field& filedA, - Field& fieldB) +auto laplaceNoTemplate(const Field& filedA, + Field& fieldB) -> Neon::set::Container { const auto& grid = filedA.getGrid(); @@ -59,15 +59,22 @@ static constexpr std::array stencil{ Ngh3DIdx(0, 0, 1), Ngh3DIdx(0, 0, -1)}; -template -inline auto viaTemplate (const IDX& idx, int i, const Field& a, int& partial, int& count){ - a.template getNghData(idx, i, - [&](typename Field::Type const& val) { - partial += val; - count++; - }); +template +NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Partition& a, Partial& partial, int& count) +{ + Neon::index_3d direction(X, Y, Z); + auto nghData = a.getNghData(idx, direction.newType(), i); + if (nghData.isValid()) { + partial += nghData.getData(); + count++; + } + // a.template getNghData(idx, i, + // [&](typename Partition::Type const& val) { + // partial += val; + // count++; + // }); }; template @@ -88,36 +95,18 @@ auto stencilContainerLaplaceTemplate(const Field& filedA, // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); typename Field::Type partial = 0; int count = 0; + using Ngh3DIdx = Neon::int8_3d; - constexpr std::array stencil{ - Ngh3DIdx(1, 0, 0), - Ngh3DIdx(-1, 0, 0), - Ngh3DIdx(0, 1, 0), - Ngh3DIdx(0, -1, 0), - Ngh3DIdx(0, 0, 1), - Ngh3DIdx(0, 0, -1)}; -#if 0 - auto viaTemplate = [&]() { - if constexpr (std::is_same_v) { - a.template getNghData(idx, i, - [&](Field::Type const& val) { - partial += val; - count++; - }); - } - }; -#endif - viaTemplate<0>(idx, i, a, partial, count); - viaTemplate<1>(idx, i, a, partial, count); - viaTemplate<2>(idx, i, a, partial, count); - viaTemplate<3>(idx, i, a, partial, count); - viaTemplate<4>(idx, i, a, partial, count); - viaTemplate<5>(idx, i, a, partial, count); + viaTemplate<1, 0, 0>(idx, i, a, partial, count); + viaTemplate<-1, 0, 0>(idx, i, a, partial, count); + viaTemplate<0, 1, 0>(idx, i, a, partial, count); + viaTemplate<0, -1, 0>(idx, i, a, partial, count); + viaTemplate<0, 0, 1>(idx, i, a, partial, count); + viaTemplate<0, 0, -1>(idx, i, a, partial, count); - b(idx, i) = a(idx, i) - count * partial; + + b(idx, i) = a(idx, i) - count * partial ; } }; }); @@ -126,7 +115,82 @@ auto stencilContainerLaplaceTemplate(const Field& filedA, using namespace Neon::domain::tool::testing; template -auto run(TestData& data) -> void +auto runNoTemplate(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + const int maxIters = 1; + + NEON_INFO(grid.toString()); + + // data.resetValuesToLinear(1, 100); + data.resetValuesToMasked(1); + + { // NEON + const Neon::index_3d dim = grid.getDimension(); + std::vector elements; + auto bk = grid.getBackend(); + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + for (int iter = maxIters; iter > 0; iter--) { + bk.sync(Neon::Backend::mainStreamIdx); + X.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::put, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + laplaceNoTemplate(X, Y).run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + Y.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx); + } + data.getBackend().sync(0); + } + + { // Golden data + auto& X = data.getIODomain(FieldNames::X); + auto& Y = data.getIODomain(FieldNames::Y); + for (int iter = maxIters; iter > 0; iter--) { + data.laplace(X, Y); + data.laplace(Y, X); + } + } + + data.updateHostData(); + + data.getField(FieldNames::X).ioToVtk("X", "X", true); + // data.getField(FieldNames::Y).ioToVtk("Y", "Y", false); + // data.getField(FieldNames::Z).ioToVtk("Z", "Z", false); + // + data.getIODomain(FieldNames::X).ioToVti("X_", "X_"); + // data.getField(FieldNames::Y).ioVtiAllocator("Y_"); + // data.getField(FieldNames::Z).ioVtiAllocator("Z_"); + + bool isOk = data.compare(FieldNames::X); + isOk = data.compare(FieldNames::Y); + if (!isOk) { + auto flagField = data.compareAndGetField(FieldNames::X); + flagField.ioToVti("X_diffFlag", "X_diffFlag"); + flagField = data.compareAndGetField(FieldNames::Y); + flagField.ioToVti("Y_diffFlag", "Y_diffFlag"); + } + ASSERT_TRUE(isOk); + if (!isOk) { + exit(99); + } +} + +template +auto runTemplate(TestData& data) -> void { using Type = typename TestData::Type; @@ -153,7 +217,7 @@ auto run(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - stencilContainer_laplace(X, Y).run(Neon::Backend::mainStreamIdx); + stencilContainerLaplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); Y.newHaloUpdate(Neon::set::StencilSemantic::standard, @@ -162,7 +226,7 @@ auto run(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - stencilContainer_laplace(Y, X).run(Neon::Backend::mainStreamIdx); + laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx); } data.getBackend().sync(0); } @@ -200,10 +264,14 @@ auto run(TestData& data) -> void } } -template auto run(TestData&) -> void; -template auto run(TestData&) -> void; -template auto run(TestData&) -> void; -template auto run(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.h b/libNeonDomain/tests/domain-stencil/src/stencil.h index 7d74196a..456f5f01 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.h +++ b/libNeonDomain/tests/domain-stencil/src/stencil.h @@ -11,10 +11,20 @@ namespace map { using namespace Neon::domain::tool::testing; template -auto run(TestData& data) -> void; +auto runNoTemplate(TestData& data) -> void; + +template +auto runTemplate(TestData& data) -> void; + + +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; + +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; -extern template auto run(TestData&) -> void; -extern template auto run(TestData&) -> void; -extern template auto run(TestData&) -> void; -extern template auto run(TestData&) -> void; } // namespace map From fde014d67b87529c5ae18e297b307e4381b4bd65 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 29 Jun 2023 11:33:43 -0400 Subject: [PATCH 20/25] Extending unit test for stencil to dGridSoA --- .../tests/domain-stencil/src/stencil.cu | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index 926153fa..14ae82b1 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -59,22 +59,22 @@ static constexpr std::array stencil{ Ngh3DIdx(0, 0, 1), Ngh3DIdx(0, 0, -1)}; -template +template NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Partition& a, Partial& partial, int& count) { - Neon::index_3d direction(X, Y, Z); - auto nghData = a.getNghData(idx, direction.newType(), i); - if (nghData.isValid()) { - partial += nghData.getData(); - count++; - } - // a.template getNghData(idx, i, - // [&](typename Partition::Type const& val) { - // partial += val; - // count++; - // }); + // Neon::index_3d direction(X, Y, Z); + // auto nghData = a.getNghData(idx, direction.newType(), i); + // if (nghData.isValid()) { + // partial += nghData.getData(); + // count++; + // } + a.template getNghData(idx, i, + [&](typename Partition::Type const& val) { + partial += val; + count++; + }); }; template @@ -98,15 +98,15 @@ auto stencilContainerLaplaceTemplate(const Field& filedA, using Ngh3DIdx = Neon::int8_3d; - viaTemplate<1, 0, 0>(idx, i, a, partial, count); - viaTemplate<-1, 0, 0>(idx, i, a, partial, count); - viaTemplate<0, 1, 0>(idx, i, a, partial, count); - viaTemplate<0, -1, 0>(idx, i, a, partial, count); - viaTemplate<0, 0, 1>(idx, i, a, partial, count); - viaTemplate<0, 0, -1>(idx, i, a, partial, count); + viaTemplate<0>(idx, i, a, partial, count); + viaTemplate<1>(idx, i, a, partial, count); + viaTemplate<2>(idx, i, a, partial, count); + viaTemplate<3>(idx, i, a, partial, count); + viaTemplate<4>(idx, i, a, partial, count); + viaTemplate<5>(idx, i, a, partial, count); - b(idx, i) = a(idx, i) - count * partial ; + b(idx, i) = a(idx, i) - count * partial; } }; }); From b0e74e6c3dc62179c84a9d7d899efa461ecbc115 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 29 Jun 2023 17:14:38 -0400 Subject: [PATCH 21/25] WIP --- libNeonDomain/tests/domain-stencil/src/stencil.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index 14ae82b1..31e937e1 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -78,7 +78,7 @@ NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Parti }; template -auto stencilContainerLaplaceTemplate(const Field& filedA, +auto laplaceTemplate(const Field& filedA, Field& fieldB) -> Neon::set::Container { @@ -217,7 +217,7 @@ auto runTemplate(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - stencilContainerLaplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx); + laplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); Y.newHaloUpdate(Neon::set::StencilSemantic::standard, @@ -226,7 +226,7 @@ auto runTemplate(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx); + laplaceTemplate(Y, X).run(Neon::Backend::mainStreamIdx); } data.getBackend().sync(0); } From 1dd5abc612caa5b3dc6f0896fea36f02e73f42dc Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 09:07:11 -0400 Subject: [PATCH 22/25] WIP --- .../include/Neon/domain/details/bGrid/bPartition_imp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index 5fa6f260..9a0bab8e 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -100,7 +100,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition:: helpGetValidIdxPitchExplicit(const Idx& idx, int card) const -> uint32_t { - uint32_t const blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + uint32_t constexpr blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x + SBlock::memBlockSizeX * idx.mInDataBlockIdx.y + (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z; From 81b352696731adfd70786292d8f0e107a3f0958d Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 10:43:24 -0400 Subject: [PATCH 23/25] WIP --- .../tests/domain-stencil/src/stencil.cu | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index 31e937e1..f6865999 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -77,9 +77,19 @@ NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Parti }); }; + +template +constexpr void constexpr_for(F&& f) +{ + if constexpr (Start < End) { + f(std::integral_constant()); + constexpr_for(f); + } +} + template auto laplaceTemplate(const Field& filedA, - Field& fieldB) + Field& fieldB) -> Neon::set::Container { const auto& grid = filedA.getGrid(); @@ -97,13 +107,23 @@ auto laplaceTemplate(const Field& filedA, int count = 0; using Ngh3DIdx = Neon::int8_3d; - - viaTemplate<0>(idx, i, a, partial, count); - viaTemplate<1>(idx, i, a, partial, count); - viaTemplate<2>(idx, i, a, partial, count); - viaTemplate<3>(idx, i, a, partial, count); - viaTemplate<4>(idx, i, a, partial, count); - viaTemplate<5>(idx, i, a, partial, count); + constexpr_for<0, 6, 1>([&](auto sIdx) { + a.template getNghData(idx, i, + [&](auto const& val) { + partial += val; + count++; + }); + }); + + +// viaTemplate<0>(idx, i, a, partial, count); +// viaTemplate<1>(idx, i, a, partial, count); +// viaTemplate<2>(idx, i, a, partial, count); +// viaTemplate<3>(idx, i, a, partial, count); +// viaTemplate<4>(idx, i, a, partial, count); +// viaTemplate<5>(idx, i, a, partial, count); b(idx, i) = a(idx, i) - count * partial; From 2a2caf7d83bb0c401cc5d7839e2d212132a966c1 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 11:04:00 -0400 Subject: [PATCH 24/25] WIP --- .../include/Neon/core/tools/metaprogramming.h | 1 + .../core/tools/metaprogramming/ConstexprFor.h | 14 +++++++++ .../tests/domain-stencil/src/stencil.cu | 29 +++++++------------ 3 files changed, 25 insertions(+), 19 deletions(-) create mode 100644 libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming.h b/libNeonCore/include/Neon/core/tools/metaprogramming.h index 53678ed6..ea004a43 100644 --- a/libNeonCore/include/Neon/core/tools/metaprogramming.h +++ b/libNeonCore/include/Neon/core/tools/metaprogramming.h @@ -4,3 +4,4 @@ #include "Neon/core/tools/metaprogramming/debugHelp.h" #include "Neon/core/tools/metaprogramming/extractTupleVecType.h" #include "Neon/core/tools/metaprogramming/tupleVecTable.h" +#include "Neon/core/tools/metaprogramming/ConstexprFor.h" \ No newline at end of file diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h new file mode 100644 index 00000000..2e8161e6 --- /dev/null +++ b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h @@ -0,0 +1,14 @@ +#pragma once + +namespace Neon { + +template +constexpr void ConstexprFor(F&& f) +{ + if constexpr (Start < End) { + f(std::integral_constant()); + ConstexprFor(f); + } +} + +} // namespace Neon \ No newline at end of file diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index f6865999..6cd4f6ff 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -78,14 +78,14 @@ NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Parti }; -template -constexpr void constexpr_for(F&& f) -{ - if constexpr (Start < End) { - f(std::integral_constant()); - constexpr_for(f); - } -} +//template +//constexpr void constexpr_for(F&& f) +//{ +// if constexpr (Start < End) { +// f(std::integral_constant()); +// constexpr_for(f); +// } +//} template auto laplaceTemplate(const Field& filedA, @@ -107,7 +107,7 @@ auto laplaceTemplate(const Field& filedA, int count = 0; using Ngh3DIdx = Neon::int8_3d; - constexpr_for<0, 6, 1>([&](auto sIdx) { + Neon::ConstexprFor<0, 6, 1>([&](auto sIdx) { a.template getNghData(idx, i, @@ -116,16 +116,7 @@ auto laplaceTemplate(const Field& filedA, count++; }); }); - - -// viaTemplate<0>(idx, i, a, partial, count); -// viaTemplate<1>(idx, i, a, partial, count); -// viaTemplate<2>(idx, i, a, partial, count); -// viaTemplate<3>(idx, i, a, partial, count); -// viaTemplate<4>(idx, i, a, partial, count); -// viaTemplate<5>(idx, i, a, partial, count); - - + b(idx, i) = a(idx, i) - count * partial; } }; From b63b90beece180c75f39695b20437bcc3b29a1fb Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 15:58:06 -0400 Subject: [PATCH 25/25] WIP --- .../lbm-lid-driven-cavity-flow/src/LbmTools.h | 125 +++++++++--------- 1 file changed, 65 insertions(+), 60 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h index ab79ed2a..4a12ca18 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h @@ -31,22 +31,22 @@ struct LbmContainers; using U = typename Grid::template Field; -#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ - { \ - { /*GO*/ \ - if (wallBitFlag & (uint32_t(1) << GOid)) { \ - popIn[GOid] = fin(i, BKid); \ - } else { \ - popIn[GOid] = fin.template nghVal(i, GOid, 0.0).value; \ - } \ - } \ - { /*BK*/ \ - if (wallBitFlag & (uint32_t(1) << BKid)) { \ - popIn[BKid] = fin(i, GOid); \ - } else { \ - popIn[BKid] = fin.template nghVal(i, BKid, 0.0).value; \ - } \ - } \ +#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + popIn[GOid] = fin(i, BKid); \ + } else { \ + popIn[GOid] = fin.template nghVal(i, GOid, 0.0).value; \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin(i, GOid); \ + } else { \ + popIn[BKid] = fin.template nghVal(i, BKid, 0.0).value; \ + } \ + } \ } static inline NEON_CUDA_HOST_DEVICE auto loadPopulation(Idx const& i, @@ -209,45 +209,52 @@ struct LbmContainers(pop[0]) + omega * eq_00; - const LbmComputeType pop_out_01 = (1. - omega) * static_cast(pop[1]) + omega * eq_01; - const LbmComputeType pop_out_02 = (1. - omega) * static_cast(pop[2]) + omega * eq_02; - const LbmComputeType pop_out_03 = (1. - omega) * static_cast(pop[3]) + omega * eq_03; - const LbmComputeType pop_out_04 = (1. - omega) * static_cast(pop[4]) + omega * eq_04; - const LbmComputeType pop_out_05 = (1. - omega) * static_cast(pop[5]) + omega * eq_05; - const LbmComputeType pop_out_06 = (1. - omega) * static_cast(pop[6]) + omega * eq_06; - const LbmComputeType pop_out_07 = (1. - omega) * static_cast(pop[7]) + omega * eq_07; - const LbmComputeType pop_out_08 = (1. - omega) * static_cast(pop[8]) + omega * eq_08; - - const LbmComputeType pop_out_opp_00 = (1. - omega) * static_cast(pop[10]) + omega * eqopp_00; - const LbmComputeType pop_out_opp_01 = (1. - omega) * static_cast(pop[11]) + omega * eqopp_01; - const LbmComputeType pop_out_opp_02 = (1. - omega) * static_cast(pop[12]) + omega * eqopp_02; - const LbmComputeType pop_out_opp_03 = (1. - omega) * static_cast(pop[13]) + omega * eqopp_03; - const LbmComputeType pop_out_opp_04 = (1. - omega) * static_cast(pop[14]) + omega * eqopp_04; - const LbmComputeType pop_out_opp_05 = (1. - omega) * static_cast(pop[15]) + omega * eqopp_05; - const LbmComputeType pop_out_opp_06 = (1. - omega) * static_cast(pop[16]) + omega * eqopp_06; - const LbmComputeType pop_out_opp_07 = (1. - omega) * static_cast(pop[17]) + omega * eqopp_07; - const LbmComputeType pop_out_opp_08 = (1. - omega) * static_cast(pop[18]) + omega * eqopp_08; + constexpr LbmComputeType c1over18 = 1. / 18.; + constexpr LbmComputeType c1over36 = 1. / 36.; + constexpr LbmComputeType c4dot5 = 4.5; + constexpr LbmComputeType c3 = 3.; + constexpr LbmComputeType c1 = 1.; + constexpr LbmComputeType c6 = 6.; + + const LbmComputeType eq_00 = rho * c1over18 * (c1 - c6 * u[0] + c4dot5 * u[0] * u[0] - usqr); + const LbmComputeType eq_01 = rho * c1over18 * (c1 - c6 * u[1] + c4dot5 * u[1] * u[1] - usqr); + const LbmComputeType eq_02 = rho * c1over18 * (c1 - c6 * u[2] + c4dot5 * u[2] * u[2] - usqr); + const LbmComputeType eq_03 = rho * c1over36 * (c1 - c6 * ck_u03 + c4dot5 * ck_u03 * ck_u03 - usqr); + const LbmComputeType eq_04 = rho * c1over36 * (c1 - c6 * ck_u04 + c4dot5 * ck_u04 * ck_u04 - usqr); + const LbmComputeType eq_05 = rho * c1over36 * (c1 - c6 * ck_u05 + c4dot5 * ck_u05 * ck_u05 - usqr); + const LbmComputeType eq_06 = rho * c1over36 * (c1 - c6 * ck_u06 + c4dot5 * ck_u06 * ck_u06 - usqr); + const LbmComputeType eq_07 = rho * c1over36 * (c1 - c6 * ck_u07 + c4dot5 * ck_u07 * ck_u07 - usqr); + const LbmComputeType eq_08 = rho * c1over36 * (c1 - c6 * ck_u08 + c4dot5 * ck_u08 * ck_u08 - usqr); + + const LbmComputeType eqopp_00 = eq_00 + rho * c1over18 * c6 * u[0]; + const LbmComputeType eqopp_01 = eq_01 + rho * c1over18 * c6 * u[1]; + const LbmComputeType eqopp_02 = eq_02 + rho * c1over18 * c6 * u[2]; + const LbmComputeType eqopp_03 = eq_03 + rho * c1over36 * c6 * ck_u03; + const LbmComputeType eqopp_04 = eq_04 + rho * c1over36 * c6 * ck_u04; + const LbmComputeType eqopp_05 = eq_05 + rho * c1over36 * c6 * ck_u05; + const LbmComputeType eqopp_06 = eq_06 + rho * c1over36 * c6 * ck_u06; + const LbmComputeType eqopp_07 = eq_07 + rho * c1over36 * c6 * ck_u07; + const LbmComputeType eqopp_08 = eq_08 + rho * c1over36 * c6 * ck_u08; + + const LbmComputeType pop_out_00 = (c1 - omega) * static_cast(pop[0]) + omega * eq_00; + const LbmComputeType pop_out_01 = (c1 - omega) * static_cast(pop[1]) + omega * eq_01; + const LbmComputeType pop_out_02 = (c1 - omega) * static_cast(pop[2]) + omega * eq_02; + const LbmComputeType pop_out_03 = (c1 - omega) * static_cast(pop[3]) + omega * eq_03; + const LbmComputeType pop_out_04 = (c1 - omega) * static_cast(pop[4]) + omega * eq_04; + const LbmComputeType pop_out_05 = (c1 - omega) * static_cast(pop[5]) + omega * eq_05; + const LbmComputeType pop_out_06 = (c1 - omega) * static_cast(pop[6]) + omega * eq_06; + const LbmComputeType pop_out_07 = (c1 - omega) * static_cast(pop[7]) + omega * eq_07; + const LbmComputeType pop_out_08 = (c1 - omega) * static_cast(pop[8]) + omega * eq_08; + + const LbmComputeType pop_out_opp_00 = (c1 - omega) * static_cast(pop[10]) + omega * eqopp_00; + const LbmComputeType pop_out_opp_01 = (c1 - omega) * static_cast(pop[11]) + omega * eqopp_01; + const LbmComputeType pop_out_opp_02 = (c1 - omega) * static_cast(pop[12]) + omega * eqopp_02; + const LbmComputeType pop_out_opp_03 = (c1 - omega) * static_cast(pop[13]) + omega * eqopp_03; + const LbmComputeType pop_out_opp_04 = (c1 - omega) * static_cast(pop[14]) + omega * eqopp_04; + const LbmComputeType pop_out_opp_05 = (c1 - omega) * static_cast(pop[15]) + omega * eqopp_05; + const LbmComputeType pop_out_opp_06 = (c1 - omega) * static_cast(pop[16]) + omega * eqopp_06; + const LbmComputeType pop_out_opp_07 = (c1 - omega) * static_cast(pop[17]) + omega * eqopp_07; + const LbmComputeType pop_out_opp_08 = (c1 - omega) * static_cast(pop[18]) + omega * eqopp_08; #define COMPUTE_GO_AND_BACK(GOid, BKid) \ @@ -262,17 +269,15 @@ struct LbmContainers(pop_out_06); - fOut(i, 16) = static_cast(pop_out_opp_06); + COMPUTE_GO_AND_BACK(6, 16) COMPUTE_GO_AND_BACK(7, 17) COMPUTE_GO_AND_BACK(8, 18) #undef COMPUTE_GO_AND_BACK { - const LbmComputeType eq_09 = rho * (1. / 3.) * (1. - usqr); - const LbmComputeType pop_out_09 = (1. - omega) * + const LbmComputeType eq_09 = rho * (c1 / c3) * (c1 - usqr); + const LbmComputeType pop_out_09 = (c1 - omega) * static_cast(pop[Lattice::centerDirection]) + omega * eq_09; fOut(i, Lattice::centerDirection) = static_cast(pop_out_09);