From aacf68e45153be4a6fc4537cf157ba3a25986714 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 9 Jun 2023 10:14:02 -0400
Subject: [PATCH 01/25] Adding exception for arrayOfStructure option for bGrid.

---
 libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h  | 3 +++
 libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp | 4 ++--
 .../tests/domain-neighbour-globalIdx/src/runHelper.h          | 3 +++
 3 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
index 3c57d7d9..3d8fbfb7 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
@@ -28,6 +28,9 @@ bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBl
     mData = std::make_shared<Data>(grid.getBackend());
     mData->grid = std::make_shared<Grid>(grid);
 
+    if(memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs){
+        NEON_THROW_UNSUPPORTED_OPERATION("bField does not support MemoryLayout::arrayOfStructs");
+    }
     // the allocation size is the number of blocks x block size x cardinality
     mData->memoryField = mData->grid->helpGetBlockViewGrid().template newField<T, 0>(
         "BitMask",
diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp
index bdf77a74..feba5a9b 100644
--- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp
@@ -22,9 +22,9 @@ TEST(domain_unit_test_globalIdx, eGrid)
                             1);
 }
 
-TEST(domain_unit_test_globalIdx, bGridSingleGPU)
+TEST(domain_unit_test_globalIdx, bGrid)
 {
-    int nGpus = 1;
+    int nGpus = 5;
     using Type = int64_t;
     runAllTestConfiguration(std::function(globalIdx::run<Neon::bGrid, Type, 0>),
                             nGpus,
diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h
index e064a49a..0014594c 100644
--- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h
+++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h
@@ -82,6 +82,9 @@ void runAllTestConfiguration(
                                 if (dim.z < 8 * ngpu * 3) {
                                     dim.z = ngpu * 3 * 8;
                                 }
+                                if(memoryLayout == Neon::MemoryLayout::arrayOfStructs){
+                                    continue ;
+                                }
                             }
 
                             assert(card == 1);

From 18f2d7223fc2de5a0e41c7bd1c906035ea2e79a4 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 Jun 2023 15:37:00 -0400
Subject: [PATCH 02/25] Some documentation to bGrid.

---
 .../Neon/domain/details/bGrid/bField_imp.h    |  4 +-
 .../include/Neon/domain/details/bGrid/bGrid.h | 81 ++++++++++++++++---
 .../Neon/domain/details/bGrid/bGrid_imp.h     |  4 +-
 3 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
index 3d8fbfb7..687b7a0d 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
@@ -32,7 +32,7 @@ bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBl
         NEON_THROW_UNSUPPORTED_OPERATION("bField does not support MemoryLayout::arrayOfStructs");
     }
     // the allocation size is the number of blocks x block size x cardinality
-    mData->memoryField = mData->grid->helpGetBlockViewGrid().template newField<T, 0>(
+    mData->memoryField = mData->grid->getBlockViewGrid().template newField<T, 0>(
         "BitMask",
         [&] {
             int elPerBlock = dataBlockSize3D.rMul();
@@ -53,7 +53,7 @@ bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBl
                 Partition& partition) {
                 auto& memoryFieldPartition = mData->memoryField.getPartition(execution, setIdx, Neon::DataView::STANDARD);
                 auto& blockConnectivity = mData->grid->helpGetBlockConnectivity().getPartition(execution, setIdx, Neon::DataView::STANDARD);
-                auto& bitmask = mData->grid->helpGetActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD);
+                auto& bitmask = mData->grid->getActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD);
                 auto& dataBlockOrigins = mData->grid->helpGetDataBlockOriginField().getPartition(execution, setIdx, Neon::DataView::STANDARD);
 
                 partition = bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>(setIdx,
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
index e19ef98a..c31831ff 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
@@ -66,25 +66,46 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<memBlockSiz
           const double_3d&             origin = double_3d(0, 0, 0));
 
 
+    /**
+     * Constructor for bGrid. This constructor should be directly used only by mGrid
+     */
     template <typename ActiveCellLambda>
-    bGrid(const Neon::Backend&         backend,
-          const Neon::int32_3d&        domainSize,
-          const ActiveCellLambda       activeCellLambda,
-          const Neon::domain::Stencil& stencil,
-          const int                    voxelSpacing,
-          const double_3d&             spacingData = double_3d(1, 1, 1),
-          const double_3d&             origin = double_3d(0, 0, 0));
-
+    bGrid(const Neon::Backend&         backend /**< Neon backend for the computation */,
+          const Neon::int32_3d&        domainSize /**< Size of the bounded Cartesian */,
+          const ActiveCellLambda       activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization  */,
+          const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */,
+          const int                    voxelSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing  and (i+1)* voxelSpacing.
+                                                     * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/
+          ,
+          const double_3d& spacingData = double_3d(1, 1, 1) /** Physical spacing between two consecutive data points in the Cartesian domain */,
+          const double_3d& origin = double_3d(0, 0, 0) /** Physical location in space of the origin of the Cartesian discretization */);
 
+    /**
+     * Returns some properties for a given cartesian in the Cartesian domain.
+     * The provide index my be inside or outside the user defined bounded Cartesian domain
+     */
     auto getProperties(const Neon::index_3d& idx)
         const -> typename GridBaseTemplate::CellProperties final;
 
+    /**
+     * Returns true if the query 3D point is inside the user domain
+     * @param idx
+     * @return
+     */
     auto isInsideDomain(const Neon::index_3d& idx)
         const -> bool final;
 
+    /**
+     * Retrieves the device index that contains the query point
+     * @param idx
+     * @return
+     */
     auto getSetIdx(const Neon::index_3d& idx)
         const -> int32_t final;
 
+    /**
+     * Allocates a new field on the grid
+     */
     template <typename T, int C = 0>
     auto newField(const std::string   name,
                   int                 cardinality,
@@ -93,6 +114,9 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<memBlockSiz
                   Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const
         -> Field<T, C>;
 
+    /**
+     * Allocates a new field on the block view grid
+     */
     template <typename T, int C = 0>
     auto newBlockViewField(const std::string   name,
                            int                 cardinality,
@@ -101,6 +125,9 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<memBlockSiz
                            Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const
         -> BlockViewGrid::Field<T, C>;
 
+    /*
+     * Allocates a new container to execute some computation in the grid
+     */
     template <Neon::Execution execution = Neon::Execution::device,
               typename LoadingLambda = void*>
     auto newContainer(const std::string& name,
@@ -108,26 +135,58 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<memBlockSiz
                       size_t             sharedMem,
                       LoadingLambda      lambda) const -> Neon::set::Container;
 
+    /*
+     * Allocates a new container to execute some computation in the grid
+     */
     template <Neon::Execution execution = Neon::Execution::device,
               typename LoadingLambda = void*>
     auto newContainer(const std::string& name,
                       LoadingLambda      lambda) const -> Neon::set::Container;
 
-
+    /**
+     * Defines a new set of parameter to launch a Container
+     */
     auto getLaunchParameters(Neon::DataView        dataView,
                              const Neon::index_3d& blockSize,
                              const size_t&         sharedMem) const -> Neon::set::LaunchParameters;
 
+    /**
+     * Retrieve the span associated to the grid w.r.t. some user defined parameters.
+     */
     auto getSpan(Neon::Execution execution,
                  SetIdx          setIdx,
                  Neon::DataView  dataView) -> const Span&;
 
-    auto helpGetBlockViewGrid() const -> BlockViewGrid&;
-    auto helpGetActiveBitMask() const -> BlockViewGrid::Field<uint64_t, 0>&;
+    /**
+     * Retrieve the block vew grid internally used.
+     * This grid can be leverage to allocate data at the block level.
+     */
+    auto getBlockViewGrid() const -> BlockViewGrid&;
+
+    /**
+     * Retrieve the block vew grid internally used.
+     * This grid can be leverage to allocate data at the block level.
+     */
+    auto getActiveBitMask() const -> BlockViewGrid::Field<uint64_t, 0>&;
+
+    /**
+     * Help function to retrieve the block connectivity as a BlockViewGrid field
+     */
     auto helpGetBlockConnectivity() const -> BlockViewGrid::Field<BlockIdx, 27>&;
+
+    /**
+     * Help function to retrieve the block origin as a BlockViewGrid field
+     */
     auto helpGetDataBlockOriginField() const -> Neon::aGrid::Field<index_3d, 0>&;
+
+    /*
+     * Help function to retrieve the map that converts a stencil point id to 3d offset
+     */
     auto helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet<Neon::int8_3d>&;
 
+    /*
+     * Help function retriev the device and the block index associated to a point in the BlockViewGrid grid
+     */
     auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple<Neon::SetIdx, Idx>;
 
     struct Data
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
index fcd0f803..03c1bd59 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
@@ -313,7 +313,7 @@ auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBloc
 
 template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
 auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
-    helpGetBlockViewGrid()
+    getBlockViewGrid()
         const -> BlockViewGrid&
 {
     return mData->blockViewGrid;
@@ -321,7 +321,7 @@ auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBloc
 
 template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
 auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
-    helpGetActiveBitMask()
+    getActiveBitMask()
         const -> BlockViewGrid::Field<uint64_t, 0>&
 {
     return mData->activeBitMask;

From b81c423586558e2d20b4e9c3684d077999ea0fbf Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 Jun 2023 17:20:19 -0400
Subject: [PATCH 03/25] bGrid: API documentation and refactoring of the
 template API.

---
 libNeonDomain/include/Neon/domain/bGrid.h     |   2 +-
 .../Neon/domain/details/bGrid/StaticBlock.h   |  46 +++++
 .../Neon/domain/details/bGrid/bField.h        |  22 +--
 .../Neon/domain/details/bGrid/bField_imp.h    |  99 +++++-----
 .../include/Neon/domain/details/bGrid/bGrid.h |  29 +--
 .../Neon/domain/details/bGrid/bGrid_imp.h     | 174 +++++++++---------
 .../Neon/domain/details/bGrid/bIndex.h        |  38 ++--
 .../Neon/domain/details/bGrid/bIndex_imp.h    |  80 +++-----
 .../Neon/domain/details/bGrid/bPartition.h    |   8 +-
 .../domain/details/bGrid/bPartition_imp.h     |  70 +++----
 .../include/Neon/domain/details/bGrid/bSpan.h |   6 +-
 .../Neon/domain/details/bGrid/bSpan_imp.h     |  32 ++--
 .../src/domain/details/bGrid/bGrid.cpp        |   2 +-
 .../tests/domain-bGrid-tray/src/gtests.cpp    |  42 ++---
 14 files changed, 322 insertions(+), 328 deletions(-)
 create mode 100644 libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h

diff --git a/libNeonDomain/include/Neon/domain/bGrid.h b/libNeonDomain/include/Neon/domain/bGrid.h
index 13c01cc3..39a4f366 100644
--- a/libNeonDomain/include/Neon/domain/bGrid.h
+++ b/libNeonDomain/include/Neon/domain/bGrid.h
@@ -2,5 +2,5 @@
 #include "Neon/domain/details/bGrid/bGrid.h"
 
 namespace Neon {
-using bGrid = Neon::domain::details::bGrid::bGrid<8,8,8>;
+using bGrid = Neon::domain::details::bGrid::bGrid<Neon::domain::details::bGrid::StaticBlock<8,8,8>>;
 }
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h
new file mode 100644
index 00000000..612c6b9a
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h
@@ -0,0 +1,46 @@
+#include "Neon/domain/details/bGrid/bSpan.h"
+
+namespace Neon::domain::details::bGrid {
+
+template <uint32_t memBlockSizeX_,
+          uint32_t memBlockSizeY_,
+          uint32_t memBlockSizeZ_,
+          uint32_t userBlockSizeX_ = memBlockSizeX_,
+          uint32_t userBlockSizeY_ = memBlockSizeY_,
+          uint32_t userBlockSizeZ_ = memBlockSizeZ_,
+          bool     isMultiResMode_ = false>
+struct StaticBlock
+{
+   public:
+    constexpr static uint32_t        memBlockSizeX = memBlockSizeX_;
+    constexpr static uint32_t        memBlockSizeY = memBlockSizeY_;
+    constexpr static uint32_t        memBlockSizeZ = memBlockSizeZ_;
+    constexpr static Neon::uint32_3d memBlockSize3D = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ);
+
+    constexpr static uint32_t        userBlockSizeX = userBlockSizeX_;
+    constexpr static uint32_t        userBlockSizeY = userBlockSizeY_;
+    constexpr static uint32_t        userBlockSizeZ = userBlockSizeZ_;
+    constexpr static Neon::uint32_3d userBlockSize3D = Neon::uint32_3d(userBlockSizeX, userBlockSizeY, userBlockSizeZ);
+
+    constexpr static uint32_t blockRatioX = memBlockSizeX / userBlockSizeX;
+    constexpr static uint32_t blockRatioY = memBlockSizeY / userBlockSizeY;
+    constexpr static uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ;
+
+    constexpr static uint32_t memBlockPitchX = 1;
+    constexpr static uint32_t memBlockPitchY = memBlockSizeX;
+    constexpr static uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY;
+
+    constexpr static bool isMultiResMode = isMultiResMode_;
+
+    constexpr static uint32_t memBlockCountElements = memBlockSizeX * memBlockSizeY * memBlockSizeZ;
+
+    static_assert(memBlockSizeX >= userBlockSizeX);
+    static_assert(memBlockSizeY >= userBlockSizeY);
+    static_assert(memBlockSizeZ >= userBlockSizeZ);
+
+    static_assert(memBlockSizeX % userBlockSizeX == 0);
+    static_assert(memBlockSizeY % userBlockSizeY == 0);
+    static_assert(memBlockSizeZ % userBlockSizeZ == 0);
+};
+
+}  // namespace Neon::domain::details::bGrid
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
index f232d96b..95c1d6d5 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
@@ -18,31 +18,25 @@
 namespace Neon::domain::details::bGrid {
 
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename T, int C, typename SBlock>
 class bField : public Neon::domain::interface::FieldBaseTemplate<T,
                                                                  C,
-                                                                 bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>,
-                                                                 bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>,
+                                                                 bGrid<SBlock>,
+                                                                 bPartition<T, C, SBlock>,
                                                                  int>
 {
-    friend bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
+    friend bGrid<SBlock>;
 
    public:
     using Type = T;
-    using Grid = bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
-    using Field = bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
-    using Partition = bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
-    using Idx = bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
+    using Grid = bGrid<SBlock>;
+    using Field = bField<T, C, SBlock>;
+    using Partition = bPartition<T, C, SBlock>;
+    using Idx = bIndex<SBlock>;
 
     using NghIdx = typename Partition::NghIdx;
     using NghData = typename Partition::NghData;
 
-    static constexpr Neon::index_3d dataBlockSize3D = Neon::index_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ);
-
-    static constexpr Neon::int8_3d DataBlockSize = Neon::int8_3d(memBlockSizeX,
-                                                                 memBlockSizeY,
-                                                                 memBlockSizeZ);
-
 
     bField(const std::string&         fieldUserName,
            Neon::DataUse              dataUse,
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
index 687b7a0d..a9c249ca 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
@@ -4,19 +4,19 @@
 
 namespace Neon::domain::details::bGrid {
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::bField()
+template <typename T, int C, typename SBlock>
+bField<T, C, SBlock>::bField()
 {
     mData = std::make_shared<Data>();
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::bField(const std::string&         fieldUserName,
-                                                                                                                  Neon::DataUse              dataUse,
-                                                                                                                  const Neon::MemoryOptions& memoryOptions,
-                                                                                                                  const Grid&                grid,
-                                                                                                                  int                        cardinality,
-                                                                                                                  T                          inactiveValue)
+template <typename T, int C, typename SBlock>
+bField<T, C, SBlock>::bField(const std::string&         fieldUserName,
+                             Neon::DataUse              dataUse,
+                             const Neon::MemoryOptions& memoryOptions,
+                             const Grid&                grid,
+                             int                        cardinality,
+                             T                          inactiveValue)
     : Neon::domain::interface::FieldBaseTemplate<T, C, Grid, Partition, int>(&grid,
                                                                              fieldUserName,
                                                                              "bField",
@@ -28,20 +28,19 @@ bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBl
     mData = std::make_shared<Data>(grid.getBackend());
     mData->grid = std::make_shared<Grid>(grid);
 
-    if(memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs){
+    if (memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs) {
         NEON_THROW_UNSUPPORTED_OPERATION("bField does not support MemoryLayout::arrayOfStructs");
     }
     // the allocation size is the number of blocks x block size x cardinality
     mData->memoryField = mData->grid->getBlockViewGrid().template newField<T, 0>(
         "BitMask",
         [&] {
-            int elPerBlock = dataBlockSize3D.rMul();
-            elPerBlock = elPerBlock * cardinality;
+            int elPerBlock = SBlock::memBlockCountElements * cardinality;
             return elPerBlock;
         }(),
         0,
         dataUse,
-        mData->grid->getBackend().getMemoryOptions(bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::activeMaskMemoryLayout));
+        mData->grid->getBackend().getMemoryOptions(bSpan<SBlock>::activeMaskMemoryLayout));
 
 
     {  // Setting up partitionTable
@@ -56,28 +55,28 @@ bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBl
                 auto& bitmask = mData->grid->getActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD);
                 auto& dataBlockOrigins = mData->grid->helpGetDataBlockOriginField().getPartition(execution, setIdx, Neon::DataView::STANDARD);
 
-                partition = bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>(setIdx,
-                                                                                             cardinality,
-                                                                                             memoryFieldPartition.mem(),
-                                                                                             blockConnectivity.mem(),
-                                                                                             bitmask.mem(),
-                                                                                             dataBlockOrigins.mem(),
-                                                                                             mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx));
+                partition = bPartition<T, C, SBlock>(setIdx,
+                                                     cardinality,
+                                                     memoryFieldPartition.mem(),
+                                                     blockConnectivity.mem(),
+                                                     bitmask.mem(),
+                                                     dataBlockOrigins.mem(),
+                                                     mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx));
             });
     }
 
     initHaloUpdateTable();
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::isInsideDomain(const Neon::index_3d& idx) const -> bool
+template <typename T, int C, typename SBlock>
+auto bField<T, C, SBlock>::isInsideDomain(const Neon::index_3d& idx) const -> bool
 {
     return mData->grid->isInsideDomain(idx);
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getReference(const Neon::index_3d& cartesianIdx,
-                                                                                                                             const int&            cardinality) -> T&
+template <typename T, int C, typename SBlock>
+auto bField<T, C, SBlock>::getReference(const Neon::index_3d& cartesianIdx,
+                                        const int&            cardinality) -> T&
 {
     auto& grid = this->getGrid();
     auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx);
@@ -86,9 +85,9 @@ auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, u
     return result;
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::operator()(const Neon::index_3d& cartesianIdx,
-                                                                                                                           const int&            cardinality) const -> T
+template <typename T, int C, typename SBlock>
+auto bField<T, C, SBlock>::operator()(const Neon::index_3d& cartesianIdx,
+                                      const int&            cardinality) const -> T
 {
     auto& grid = this->getGrid();
     auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx);
@@ -100,22 +99,22 @@ auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, u
     return result;
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::updateHostData(int streamId) -> void
+template <typename T, int C, typename SBlock>
+auto bField<T, C, SBlock>::updateHostData(int streamId) -> void
 {
     mData->memoryField.updateHostData(streamId);
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::updateDeviceData(int streamId) -> void
+template <typename T, int C, typename SBlock>
+auto bField<T, C, SBlock>::updateDeviceData(int streamId) -> void
 {
     mData->memoryField.updateDeviceData(streamId);
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getPartition(Neon::Execution       execution,
-                                                                                                                             Neon::SetIdx          setIdx,
-                                                                                                                             const Neon::DataView& dataView) const -> const Partition&
+template <typename T, int C, typename SBlock>
+auto bField<T, C, SBlock>::getPartition(Neon::Execution       execution,
+                                        Neon::SetIdx          setIdx,
+                                        const Neon::DataView& dataView) const -> const Partition&
 {
     const Neon::DataUse dataUse = this->getDataUse();
     bool                isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution);
@@ -128,10 +127,10 @@ auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, u
     NEON_THROW_UNSUPPORTED_OPERATION(message.str());
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getPartition(Neon::Execution       execution,
-                                                                                                                             Neon::SetIdx          setIdx,
-                                                                                                                             const Neon::DataView& dataView) -> Partition&
+template <typename T, int C, typename SBlock>
+auto bField<T, C, SBlock>::getPartition(Neon::Execution       execution,
+                                        Neon::SetIdx          setIdx,
+                                        const Neon::DataView& dataView) -> Partition&
 {
     const Neon::DataUse dataUse = this->getDataUse();
     bool                isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution);
@@ -144,10 +143,10 @@ auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, u
     NEON_THROW_UNSUPPORTED_OPERATION(message.str());
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::newHaloUpdate(Neon::set::StencilSemantic stencilSemantic,
-                                                                                                                              Neon::set::TransferMode    transferMode,
-                                                                                                                              Neon::Execution            execution) const -> Neon::set::Container
+template <typename T, int C, typename SBlock>
+auto bField<T, C, SBlock>::newHaloUpdate(Neon::set::StencilSemantic stencilSemantic,
+                                         Neon::set::TransferMode    transferMode,
+                                         Neon::Execution            execution) const -> Neon::set::Container
 {
 
 
@@ -220,8 +219,8 @@ auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, u
     return output;
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::initHaloUpdateTable() -> void
+template <typename T, int C, typename SBlock>
+auto bField<T, C, SBlock>::initHaloUpdateTable() -> void
 {
     // NEON_THROW_UNSUPPORTED_OPERATION("");
     auto& grid = this->getGrid();
@@ -269,10 +268,10 @@ auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, u
                     }
 
                     memPhyDim[endPoint] = Neon::size_4d(
-                        dataBlockSize3D.rMul(),
+                        SBlock::memBlockCountElements,
                         1,
                         1,
-                        size_t(blockViewPartitions[endPoint]->getCountAllocated()) * dataBlockSize3D.rMul());
+                        size_t(blockViewPartitions[endPoint]->getCountAllocated()) * SBlock::memBlockCountElements);
                 }
 
                 if (ByDirection::up == byDirection && bk.isLastDevice(setIdxSrc)) {
@@ -299,10 +298,8 @@ auto bField<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, u
 
                 Neon::set::MemoryTransfer transfer({setIdxDst, dstMem + (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum(), dstGhostBuff},
                                                    {setIdxSrc, srcMem + (srcBoundaryBuff * memPhyDim[Data::EndPoints::src]).rSum(), srcBoundaryBuff},
-                                                   sizeof(T) * dataBlockSize3D.rMul() * transferDataBlockCount);
+                                                   sizeof(T) * SBlock::memBlockCountElements * transferDataBlockCount);
 
-
-                //                    std::cout << transfer.toString() << std::endl;
                 transfersVec.push_back(transfer);
             }
         });
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
index c31831ff..9d91df5d 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
@@ -16,6 +16,7 @@
 #include "Neon/domain/tools/SpanTable.h"
 #include "Neon/set/Containter.h"
 #include "Neon/set/LaunchParametersTable.h"
+#include "Neon/domain/details/bGrid/StaticBlock.h"
 
 
 #include "bField.h"
@@ -24,31 +25,31 @@
 
 namespace Neon::domain::details::bGrid {
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+
+template <typename T, int C, typename SBlock>
 class bField;
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX = memBlockSizeX, uint32_t userBlockSizeY = memBlockSizeY, uint32_t userBlockSizeZ = memBlockSizeZ>
-class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>,
-                                                               bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ> >
+template <typename SBlock>
+class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
+                                                               bIndex<SBlock> >
 {
    public:
-    using Grid = bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
+    using Grid = bGrid<SBlock>;
 
-    template <typename T, int C = 0, int8_t dmemBlockSizeX = memBlockSizeX, int8_t dmemBlockSizeY = memBlockSizeY, int8_t dmemBlockSizeZ = memBlockSizeZ, int8_t uuserBlockSizeX = userBlockSizeX, int8_t uuserBlockSizeY = userBlockSizeY, int8_t uuserBlockSizeZ = userBlockSizeZ>
-    using Partition = bPartition<T, C, dmemBlockSizeX, dmemBlockSizeY, dmemBlockSizeZ, uuserBlockSizeX, uuserBlockSizeY, uuserBlockSizeZ>;
+    template <typename T, int C = 0>
+    using Partition = bPartition<T, C, SBlock>;
 
-    template <typename T, int C = 0, int8_t dmemBlockSizeX = memBlockSizeX, int8_t dmemBlockSizeY = memBlockSizeY, int8_t dmemBlockSizeZ = memBlockSizeZ, int8_t uuserBlockSizeX = userBlockSizeX, int8_t uuserBlockSizeY = userBlockSizeY, int8_t uuserBlockSizeZ = userBlockSizeZ>
-    using Field = Neon::domain::details::bGrid::bField<T, C, dmemBlockSizeX, dmemBlockSizeY, dmemBlockSizeZ, uuserBlockSizeX, uuserBlockSizeY, uuserBlockSizeZ>;
+    template <typename T, int C = 0>
+    using Field = Neon::domain::details::bGrid::bField<T, C, SBlock>;
 
-    using Span = bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
+    using Span = bSpan<SBlock>;
     using NghIdx = typename Partition<int>::NghIdx;
-    using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate<Grid, bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ> >;
+    using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate<Grid, bIndex<SBlock> >;
 
-    using Idx = bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
+    using Idx = bIndex<SBlock>;
     static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d1b3;
     using ExecutionThreadSpanIndexType = uint32_t;
 
-    static constexpr Neon::index_3d dataBlockSize3D = Neon::index_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ);
     using BlockIdx = uint32_t;
 
     bGrid() = default;
@@ -227,7 +228,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<memBlockSiz
     };
     std::shared_ptr<Data> mData;
 };
-extern template class bGrid<8, 8, 8>;
+extern template class bGrid<StaticBlock<8,8,8>>;
 }  // namespace Neon::domain::details::bGrid
 
 #include "bField_imp.h"
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
index 03c1bd59..1b40a8b7 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
@@ -2,42 +2,38 @@
 
 namespace Neon::domain::details::bGrid {
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 template <typename ActiveCellLambda>
-bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::bGrid(const Neon::Backend&         backend,
-                                                                                                          const Neon::int32_3d&        domainSize,
-                                                                                                          const ActiveCellLambda       activeCellLambda,
-                                                                                                          const Neon::domain::Stencil& stencil,
-                                                                                                          const double_3d&             spacingData,
-                                                                                                          const double_3d&             origin)
+bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
+                     const Neon::int32_3d&        domainSize,
+                     const ActiveCellLambda       activeCellLambda,
+                     const Neon::domain::Stencil& stencil,
+                     const double_3d&             spacingData,
+                     const double_3d&             origin)
     : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin)
 {
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 template <typename ActiveCellLambda>
-bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::bGrid(const Neon::Backend&         backend,
-                                                                                                          const Neon::int32_3d&        domainSize,
-                                                                                                          const ActiveCellLambda       activeCellLambda,
-                                                                                                          const Neon::domain::Stencil& stencil,
-                                                                                                          const int                    voxelSpacing,
-                                                                                                          const double_3d&             spacingData,
-                                                                                                          const double_3d&             origin)
+bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
+                     const Neon::int32_3d&        domainSize,
+                     const ActiveCellLambda       activeCellLambda,
+                     const Neon::domain::Stencil& stencil,
+                     const int                    voxelSpacing,
+                     const double_3d&             spacingData,
+                     const double_3d&             origin)
 {
-    static_assert(memBlockSizeX >= userBlockSizeX);
-    static_assert(memBlockSizeY >= userBlockSizeY);
-    static_assert(memBlockSizeZ >= userBlockSizeZ);
 
-    static_assert(memBlockSizeX % userBlockSizeX == 0);
-    static_assert(memBlockSizeY % userBlockSizeY == 0);
-    static_assert(memBlockSizeZ % userBlockSizeZ == 0);
 
     mData = std::make_shared<Data>();
     mData->init(backend);
 
     mData->voxelSpacing = voxelSpacing;
     mData->stencil = stencil;
-    const index_3d defaultKernelBlockSize(memBlockSizeX, memBlockSizeY, memBlockSizeZ);
+    const index_3d defaultKernelBlockSize(SBlock::memBlockSizeX,
+                                          SBlock::memBlockSizeY,
+                                          SBlock::memBlockSizeZ);
 
     {
         auto nElementsPerPartition = backend.devSet().template newDataSet<size_t>(0);
@@ -59,7 +55,7 @@ bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSize
             backend,
             activeCellLambda,
             [](Neon::index_3d /*idx*/) { return false; },
-            dataBlockSize3D,
+            SBlock::memBlockSize3D.template newType<int32_t>(),
             domainSize,
             Neon::domain::Stencil::s27_t(false),
             1);
@@ -76,7 +72,7 @@ bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSize
             mData->partitioner1D.getBlockSpan(),
             mData->partitioner1D,
             Neon::domain::Stencil::s27_t(false),
-            spacingData * dataBlockSize3D,
+            spacingData * SBlock::memBlockSize3D,
             origin);
 
         mData->blockViewGrid = BlockViewGrid(egrid);
@@ -106,9 +102,9 @@ bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSize
                             bitMask(bitMaskIdx, c) = 0;
                         }
 
-                        for (int k = 0; k < dataBlockSize3D.z; k++) {
-                            for (int j = 0; j < dataBlockSize3D.y; j++) {
-                                for (int i = 0; i < dataBlockSize3D.x; i++) {
+                        for (int k = 0; k < SBlock::memBlockSize3D.template newType<int32_t>().z; k++) {
+                            for (int j = 0; j < SBlock::memBlockSize3D.template newType<int32_t>().y; j++) {
+                                for (int i = 0; i < SBlock::memBlockSize3D.template newType<int32_t>().x; i++) {
 
                                     Neon::int32_3d                 localPosition(i, j, k);
                                     typename Span::BitMaskWordType mask;
@@ -166,7 +162,7 @@ bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSize
                                                                                                                          k - int8_t(1));
                                                                   bool                                      isValid = blockConnectivity.getNghIndex(idx, stencilPoint, nghIdx);
                                                                   if (isValid) {
-                                                                      blockNghIdx = static_cast < BlockIdx>(nghIdx.helpGet());
+                                                                      blockNghIdx = static_cast<BlockIdx>(nghIdx.helpGet());
                                                                   }
                                                                   blockConnectivity(idx, targetDirection) = blockNghIdx;
                                                               }
@@ -220,7 +216,7 @@ bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSize
         for (int i = 0; i < stencil.nNeighbours(); ++i) {
             for (int devIdx = 0; devIdx < backend.devSet().setCardinality(); devIdx++) {
                 index_3d      pLong = stencil.neighbours()[i];
-                Neon::int8_3d pShort=pLong.newType<int8_t>();
+                Neon::int8_3d pShort = pLong.newType<int8_t>();
                 mData->stencilIdTo3dOffset.eRef(devIdx, i) = pShort;
             }
         }
@@ -232,7 +228,7 @@ bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSize
                           domainSize,
                           Neon::domain::Stencil(),
                           mData->mNumActiveVoxel,
-                          dataBlockSize3D,
+                          SBlock::memBlockSize3D.template newType<int32_t>(),
                           spacingData,
                           origin);
     {  // setting launchParameters
@@ -244,47 +240,47 @@ bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSize
                 auto eDomainGridSize = launchSingleDev.domainGrid();
                 assert(eDomainGridSize.y == 1);
                 assert(eDomainGridSize.z == 1);
-                int nBlocks = static_cast<int>( eDomainGridSize.x);
+                int nBlocks = static_cast<int>(eDomainGridSize.x);
                 bLaunchParameters.get(setIdx).set(Neon::sys::GpuLaunchInfo::mode_e::cudaGridMode,
-                                                  nBlocks, dataBlockSize3D, 0);
+                                                  nBlocks, SBlock::memBlockSize3D.template newType<int32_t>(), 0);
             });
         });
     }
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 template <typename T, int C>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::newField(const std::string   name,
-                                                                                                                  int                 cardinality,
-                                                                                                                  T                   inactiveValue,
-                                                                                                                  Neon::DataUse       dataUse,
-                                                                                                                  Neon::MemoryOptions memoryOptions) const -> Field<T, C>
+auto bGrid<SBlock>::newField(const std::string   name,
+                             int                 cardinality,
+                             T                   inactiveValue,
+                             Neon::DataUse       dataUse,
+                             Neon::MemoryOptions memoryOptions) const -> Field<T, C>
 {
     memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions);
     Field<T, C> field(name, dataUse, memoryOptions, *this, cardinality, inactiveValue);
     return field;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 template <typename T, int C>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::newBlockViewField(const std::string   name,
-                                                                                                                           int                 cardinality,
-                                                                                                                           T                   inactiveValue,
-                                                                                                                           Neon::DataUse       dataUse,
-                                                                                                                           Neon::MemoryOptions memoryOptions) const -> BlockViewGrid::Field<T, C>
+auto bGrid<SBlock>::newBlockViewField(const std::string   name,
+                                      int                 cardinality,
+                                      T                   inactiveValue,
+                                      Neon::DataUse       dataUse,
+                                      Neon::MemoryOptions memoryOptions) const -> BlockViewGrid::Field<T, C>
 {
     memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions);
     BlockViewGrid::Field<T, C> blockViewField = mData->blockViewGrid.template newField<T, C>(name, cardinality, inactiveValue, dataUse, memoryOptions);
     return blockViewField;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 template <Neon::Execution execution,
           typename LoadingLambda>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::newContainer(const std::string& name,
-                                                                                                                      index_3d           blockSize,
-                                                                                                                      size_t             sharedMem,
-                                                                                                                      LoadingLambda      lambda) const -> Neon::set::Container
+auto bGrid<SBlock>::newContainer(const std::string& name,
+                                 index_3d           blockSize,
+                                 size_t             sharedMem,
+                                 LoadingLambda      lambda) const -> Neon::set::Container
 {
     Neon::set::Container kContainer = Neon::set::Container::factory<execution>(name,
                                                                                Neon::set::internal::ContainerAPI::DataViewSupport::on,
@@ -295,11 +291,11 @@ auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBloc
     return kContainer;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 template <Neon::Execution execution,
           typename LoadingLambda>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::newContainer(const std::string& name,
-                                                                                                                      LoadingLambda      lambda) const -> Neon::set::Container
+auto bGrid<SBlock>::newContainer(const std::string& name,
+                                 LoadingLambda      lambda) const -> Neon::set::Container
 {
     const Neon::index_3d& defaultBlockSize = this->getDefaultBlock();
     Neon::set::Container  kContainer = Neon::set::Container::factory<execution>(name,
@@ -311,50 +307,50 @@ auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBloc
     return kContainer;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename SBlock>
+auto bGrid<SBlock>::
     getBlockViewGrid()
         const -> BlockViewGrid&
 {
     return mData->blockViewGrid;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename SBlock>
+auto bGrid<SBlock>::
     getActiveBitMask()
         const -> BlockViewGrid::Field<uint64_t, 0>&
 {
     return mData->activeBitMask;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename SBlock>
+auto bGrid<SBlock>::
     helpGetBlockConnectivity()
         const -> BlockViewGrid::Field<BlockIdx, 27>&
 {
     return mData->blockConnectivity;
 }
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename SBlock>
+auto bGrid<SBlock>::
     helpGetDataBlockOriginField()
         const -> Neon::aGrid::Field<index_3d, 0>&
 {
     return mData->mDataBlockOriginField;
 }
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getSpan(Neon::Execution execution,
-                                                                                                                 SetIdx          setIdx,
-                                                                                                                 Neon::DataView  dataView) -> const bGrid::Span&
+template <typename SBlock>
+auto bGrid<SBlock>::getSpan(Neon::Execution execution,
+                            SetIdx          setIdx,
+                            Neon::DataView  dataView) -> const bGrid::Span&
 {
     return mData->spanTable.getSpan(execution, setIdx, dataView);
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::~bGrid()
+template <typename SBlock>
+bGrid<SBlock>::~bGrid()
 {
 }
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getSetIdx(const index_3d& idx) const -> int32_t
+template <typename SBlock>
+auto bGrid<SBlock>::getSetIdx(const index_3d& idx) const -> int32_t
 {
     typename GridBaseTemplate::CellProperties cellProperties;
 
@@ -365,10 +361,10 @@ auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBloc
     Neon::SetIdx setIdx = cellProperties.getSetIdx();
     return setIdx;
 }
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getLaunchParameters(Neon::DataView dataView,
-                                                                                                                             const index_3d&,
-                                                                                                                             const size_t& sharedMem) const -> Neon::set::LaunchParameters
+template <typename SBlock>
+auto bGrid<SBlock>::getLaunchParameters(Neon::DataView dataView,
+                                        const index_3d&,
+                                        const size_t& sharedMem) const -> Neon::set::LaunchParameters
 {
     auto res = mData->launchParametersTable.get(dataView);
     res.forEachSeq([&](SetIdx const& /*setIdx*/,
@@ -378,19 +374,19 @@ auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBloc
     return res;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename SBlock>
+auto bGrid<SBlock>::
     helpGetStencilIdTo3dOffset()
         const -> Neon::set::MemSet<Neon::int8_3d>&
 {
     return mData->stencilIdTo3dOffset;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::isInsideDomain(const index_3d& idx) const -> bool
+template <typename SBlock>
+auto bGrid<SBlock>::isInsideDomain(const index_3d& idx) const -> bool
 {
     // 1. check if the block is active
-    const index_3d blockIdx3d = idx / dataBlockSize3D;
+    const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType<int32_t>();
     auto           blockProperties = mData->blockViewGrid.getProperties(blockIdx3d);
 
     if (!blockProperties.isInside()) {
@@ -399,17 +395,17 @@ auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBloc
     // 2. The block is active, check the element on the block
     uint32_t                       wordCardinality;
     typename Span::BitMaskWordType mask;
-    Span::getMaskAndWordIdforBlockBitMask(idx.x % dataBlockSize3D.x,
-                                          idx.y % dataBlockSize3D.y,
-                                          idx.z % dataBlockSize3D.z,
+    Span::getMaskAndWordIdforBlockBitMask(idx.x % SBlock::memBlockSize3D.x,
+                                          idx.y % SBlock::memBlockSize3D.y,
+                                          idx.z % SBlock::memBlockSize3D.z,
                                           NEON_OUT mask,
                                           NEON_OUT wordCardinality);
     auto activeBits = mData->activeBitMask.getReference(blockIdx3d, int(wordCardinality));
     return (activeBits & mask) != 0;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getProperties(const index_3d& idx)
+template <typename SBlock>
+auto bGrid<SBlock>::getProperties(const index_3d& idx)
     const -> typename GridBaseTemplate::CellProperties
 {
     typename GridBaseTemplate::CellProperties cellProperties;
@@ -422,7 +418,7 @@ auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBloc
     if (this->getDevSet().setCardinality() == 1) {
         cellProperties.init(0, DataView::INTERNAL);
     } else {
-        const index_3d blockIdx3d = idx / dataBlockSize3D;
+        const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType<int32_t>();
         auto           blockViewProperty = mData->blockViewGrid.getProperties(blockIdx3d);
 
         cellProperties.init(blockViewProperty.getSetIdx(),
@@ -431,17 +427,17 @@ auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBloc
     return cellProperties;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-auto bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::helpGetSetIdxAndGridIdx(Neon::index_3d idx)
+template <typename SBlock>
+auto bGrid<SBlock>::helpGetSetIdxAndGridIdx(Neon::index_3d idx)
     const -> std::tuple<Neon::SetIdx, Idx>
 {
-    const index_3d blockIdx3d = idx / dataBlockSize3D;
+    const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType<int32_t>();
     auto [setIdx, bvGridIdx] = mData->blockViewGrid.helpGetSetIdxAndGridIdx(blockIdx3d);
     Idx bIdx;
     bIdx.mDataBlockIdx = bvGridIdx.helpGet();
-    bIdx.mInDataBlockIdx.x = static_cast<typename Idx::InDataBlockIdx::Integer>(idx.x % dataBlockSize3D.x);
-    bIdx.mInDataBlockIdx.y = static_cast<typename Idx::InDataBlockIdx::Integer>(idx.y % dataBlockSize3D.y);
-    bIdx.mInDataBlockIdx.z = static_cast<typename Idx::InDataBlockIdx::Integer>(idx.z % dataBlockSize3D.z);
+    bIdx.mInDataBlockIdx.x = static_cast<typename Idx::InDataBlockIdx::Integer>(idx.x % SBlock::memBlockSize3D.x);
+    bIdx.mInDataBlockIdx.y = static_cast<typename Idx::InDataBlockIdx::Integer>(idx.y % SBlock::memBlockSize3D.y);
+    bIdx.mInDataBlockIdx.z = static_cast<typename Idx::InDataBlockIdx::Integer>(idx.z % SBlock::memBlockSize3D.z);
 
     return {setIdx, bIdx};
 }
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h
index 7b8d7bcf..bbf103d1 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h
@@ -6,11 +6,11 @@
 namespace Neon::domain::details::bGrid {
 
 // Common forward declarations
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 class bGrid;
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 class bSpan;
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename T, int C, typename SBlock>
 class bPartition;
 
 class MicroIndex
@@ -59,26 +59,24 @@ class MicroIndex
     TrayIdx   mTrayBlockIdx{};
 };
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 class bIndex
 {
    public:
-    template <uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>
+    template <typename SBlock_>
     friend class bSpan;
-    using OuterIdx = bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
-
-    static constexpr Neon::uint32_3d memBlock3DSize = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ);
+    using OuterIdx = bIndex<SBlock>;
 
     using NghIdx = int8_3d;
-    template <typename T, int C, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>
+    template <typename T, int C, typename SBlock_>
     friend class bPartition;
 
-    template <typename T, int C, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>
+    template <typename T, int C, typename SBlock_>
     friend class bField;
 
-    template <uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>
+    template <typename SBlock_>
     friend class bSpan;
-    template <uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>
+    template <typename SBlock_>
     friend class bGrid;
 
 
@@ -109,25 +107,25 @@ class bIndex
     DataBlockIdx   mDataBlockIdx{};
 };
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE auto bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::setDataBlockIdx(const bIndex::DataBlockIdx& dataBlockIdx) -> void
+template <typename SBlock>
+NEON_CUDA_HOST_DEVICE auto bIndex<SBlock>::setDataBlockIdx(const bIndex::DataBlockIdx& dataBlockIdx) -> void
 {
     mDataBlockIdx = dataBlockIdx;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE auto bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::setInDataBlockIdx(const bIndex::InDataBlockIdx& inDataBlockIdx) -> void
+template <typename SBlock>
+NEON_CUDA_HOST_DEVICE auto bIndex<SBlock>::setInDataBlockIdx(const bIndex::InDataBlockIdx& inDataBlockIdx) -> void
 {
     mInDataBlockIdx = inDataBlockIdx;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE auto bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getDataBlockIdx() const -> const bIndex::DataBlockIdx&
+template <typename SBlock>
+NEON_CUDA_HOST_DEVICE auto bIndex<SBlock>::getDataBlockIdx() const -> const bIndex::DataBlockIdx&
 {
     return mDataBlockIdx;
 }
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE auto bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getInDataBlockIdx() const -> const bIndex::InDataBlockIdx&
+template <typename SBlock>
+NEON_CUDA_HOST_DEVICE auto bIndex<SBlock>::getInDataBlockIdx() const -> const bIndex::InDataBlockIdx&
 {
     return mInDataBlockIdx;
 }
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h
index a55fddbb..be45749d 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h
@@ -3,8 +3,8 @@
 
 namespace Neon::domain::details::bGrid {
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE inline bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename SBlock>
+NEON_CUDA_HOST_DEVICE inline bIndex<SBlock>::
     bIndex(const DataBlockIdx&            blockIdx,
            const InDataBlockIdx::Integer& x,
            const InDataBlockIdx::Integer& y,
@@ -16,86 +16,52 @@ NEON_CUDA_HOST_DEVICE inline bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ,
     mInDataBlockIdx.z = z;
 }
 
-//
-// template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-// NEON_CUDA_HOST_DEVICE inline auto bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getTrayIdx() -> TrayIdx
-//{
-//
-//    TrayIdx const exBlockOffset = mDataBlockIdx * (userBlockSizeX * userBlockSizeY * userBlockSizeZ);
-//    TrayIdx const exTrayOffset = [&]() {
-//        int const trayBlockIdxX = mInDataBlockIdx.x / userBlockSizeX;
-//        int const trayBlockIdxY = mInDataBlockIdx.y / userBlockSizeY;
-//        int const trayBlockIdxZ = mInDataBlockIdx.z / userBlockSizeZ;
-//
-//        constexpr int countMicroBlocksInTrayX = (memBlockSizeX / userBlockSizeX);
-//        constexpr int countMicroBlocksInTrayY = (memBlockSizeY / userBlockSizeY);
-//
-//        int const res = trayBlockIdxX + trayBlockIdxY * countMicroBlocksInTrayX +
-//                        trayBlockIdxZ * (countMicroBlocksInTrayX * countMicroBlocksInTrayY);
-//        return res;
-//    };
-//    return exBlockOffset + exTrayOffset;
-//}
-//
-//
-// template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-// NEON_CUDA_HOST_DEVICE inline auto bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getInTrayIdx() -> InTrayIdx
-//{
-//    InTrayIdx inTrayIdx;
-//    inTrayIdx.x = mInDataBlockIdx.x % userBlockSizeX;
-//    inTrayIdx.y = mInDataBlockIdx.y % userBlockSizeY;
-//    inTrayIdx.z = mInDataBlockIdx.z % userBlockSizeZ;
-//
-//    return inTrayIdx;
-//}
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE inline auto bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getMicroIndex() -> MicroIndex
+template <typename SBlock>
+NEON_CUDA_HOST_DEVICE inline auto bIndex<SBlock>::getMicroIndex() -> MicroIndex
 {
-    constexpr uint32_t blockRatioX = memBlockSizeX / userBlockSizeX;
-    constexpr uint32_t blockRatioY = memBlockSizeY / userBlockSizeY;
-    constexpr uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ;
 
-    TrayIdx const exBlockOffset = mDataBlockIdx * (blockRatioX * blockRatioY * blockRatioZ);
+
+    TrayIdx const exBlockOffset = mDataBlockIdx * (SBlock::blockRatioX * SBlock::blockRatioY * SBlock::blockRatioZ);
     TrayIdx const exTrayOffset = [&] {
-        TrayIdx const trayBlockIdxX = mInDataBlockIdx.x / userBlockSizeX;
-        TrayIdx const trayBlockIdxY = mInDataBlockIdx.y / userBlockSizeY;
-        TrayIdx const trayBlockIdxZ = mInDataBlockIdx.z / userBlockSizeZ;
+        TrayIdx const trayBlockIdxX = mInDataBlockIdx.x / SBlock::userBlockSizeX;
+        TrayIdx const trayBlockIdxY = mInDataBlockIdx.y / SBlock::userBlockSizeY;
+        TrayIdx const trayBlockIdxZ = mInDataBlockIdx.z / SBlock::userBlockSizeZ;
 
-        TrayIdx const res = trayBlockIdxX + trayBlockIdxY * blockRatioX +
-                            trayBlockIdxZ * (blockRatioX * blockRatioY);
+        TrayIdx const res = trayBlockIdxX + trayBlockIdxY * SBlock::blockRatioX +
+                            trayBlockIdxZ * (SBlock::blockRatioX * SBlock::blockRatioY);
         return res;
     }();
     MicroIndex res;
     res.setTrayBlockIdx(exBlockOffset + exTrayOffset);
-    res.setInTrayBlockIdx({static_cast<InTrayIdx::Integer>(mInDataBlockIdx.x % userBlockSizeX),
-                           static_cast<InTrayIdx::Integer>(mInDataBlockIdx.y % userBlockSizeY),
-                           static_cast<InTrayIdx::Integer>(mInDataBlockIdx.z % userBlockSizeZ)});
+    res.setInTrayBlockIdx({static_cast<InTrayIdx::Integer>(mInDataBlockIdx.x % SBlock::userBlockSizeX),
+                           static_cast<InTrayIdx::Integer>(mInDataBlockIdx.y % SBlock::userBlockSizeY),
+                           static_cast<InTrayIdx::Integer>(mInDataBlockIdx.z % SBlock::userBlockSizeZ)});
     return res;
 }
 
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE inline auto bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::init(MicroIndex const& microIndex) -> void
+template <typename SBlock>
+NEON_CUDA_HOST_DEVICE inline auto bIndex<SBlock>::init(MicroIndex const& microIndex) -> void
 {
-    constexpr uint32_t memBlockSize = memBlockSizeX * memBlockSizeY * memBlockSizeZ;
-    constexpr uint32_t userBlockSize = userBlockSizeX * userBlockSizeY * userBlockSizeZ;
+    constexpr uint32_t memBlockSize = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ;
+    constexpr uint32_t userBlockSize = SBlock::userBlockSizeX * SBlock::userBlockSizeY * SBlock::userBlockSizeZ;
     constexpr uint32_t blockRatioSize = memBlockSize / userBlockSize;
 
-    constexpr uint32_t blockRatioX = memBlockSizeX / userBlockSizeX;
-    constexpr uint32_t blockRatioY = memBlockSizeY / userBlockSizeY;
+    constexpr uint32_t blockRatioX = SBlock::memBlockSizeX / SBlock::userBlockSizeX;
+    constexpr uint32_t blockRatioY = SBlock::memBlockSizeY / SBlock::userBlockSizeY;
 
     mDataBlockIdx = microIndex.getTrayBlockIdx() / (blockRatioSize);
 
     uint32_t reminder = microIndex.getTrayBlockIdx() % (blockRatioSize);
 
     const uint32_t reminderInZ = reminder / (blockRatioX * blockRatioY);
-    mInDataBlockIdx.z = static_cast < InDataBlockIdx::Integer>( microIndex.getInTrayBlockIdx().z + reminderInZ * userBlockSizeZ);
+    mInDataBlockIdx.z = static_cast<InDataBlockIdx::Integer>(microIndex.getInTrayBlockIdx().z + reminderInZ * SBlock::userBlockSizeZ);
     reminder = reminder % (blockRatioX * blockRatioY);
     const uint32_t reminderInY = reminder / (blockRatioX);
-    mInDataBlockIdx.y = static_cast<InDataBlockIdx::Integer>(microIndex.getInTrayBlockIdx().y + reminderInY * userBlockSizeY);
+    mInDataBlockIdx.y = static_cast<InDataBlockIdx::Integer>(microIndex.getInTrayBlockIdx().y + reminderInY * SBlock::userBlockSizeY);
     const uint32_t reminderInX = reminder % blockRatioX;
-    mInDataBlockIdx.x = static_cast<InDataBlockIdx::Integer>(microIndex.getInTrayBlockIdx().x + reminderInX * userBlockSizeX);
+    mInDataBlockIdx.x = static_cast<InDataBlockIdx::Integer>(microIndex.getInTrayBlockIdx().x + reminderInX * SBlock::userBlockSizeX);
 }
 
 }  // namespace Neon::domain::details::bGrid
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
index 09db40e4..f20a513d 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
@@ -9,15 +9,15 @@
 
 namespace Neon::domain::details::bGrid {
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 class bSpan;
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename T, int C, typename SBlock>
 class bPartition
 {
    public:
-    using Span = bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
-    using Idx = bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
+    using Span = bSpan<SBlock>;
+    using Idx = bIndex<SBlock>;
     using NghIdx = typename Idx::NghIdx;
     using Type = T;
     using NghData = Neon::domain::NghData<T>;
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
index db057f47..8506476b 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
@@ -5,8 +5,8 @@
 
 namespace Neon::domain::details::bGrid {
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::bPartition()
+template <typename T, int C, typename SBlock>
+bPartition<T, C, SBlock>::bPartition()
     : mCardinality(0),
       mMem(nullptr),
       mStencilNghIndex(),
@@ -17,8 +17,8 @@ bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, us
 {
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+bPartition<T, C, SBlock>::
     bPartition(int                             setIdx,
                int                             cardinality,
                T*                              mem,
@@ -36,8 +36,8 @@ bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, us
 {
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     getGlobalIndex(const Idx& gidx)
         const -> Neon::index_3d
 {
@@ -48,8 +48,8 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY,
     return location;
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     getBlockViewGridIdx(const Idx& gidx)
         const -> BlockViewGridIdx
 {
@@ -58,32 +58,32 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY,
     return res;
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
     cardinality()
         const -> int
 {
     return mCardinality;
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
 operator()(const Idx& cell,
            int        card) -> T&
 {
     return mMem[helpGetPitch(cell, card)];
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
 operator()(const Idx& cell,
            int        card) const -> const T&
 {
     return mMem[helpGetPitch(cell, card)];
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
     helpGetPitch(const Idx& idx, int card)
         const -> uint32_t
 {
@@ -92,22 +92,22 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, memBlockSizeX, memBlockSizeY,
 }
 
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
     helpGetValidIdxPitchExplicit(const Idx& idx, int card)
         const -> uint32_t
 {
-    uint32_t const blockPitchByCard = memBlockSizeX * memBlockSizeY * memBlockSizeZ;
+    uint32_t const blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ;
     uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x +
-                                        memBlockSizeX * idx.mInDataBlockIdx.y +
-                                        (memBlockSizeX * memBlockSizeY) * idx.mInDataBlockIdx.z;
+                                        SBlock::memBlockSizeX * idx.mInDataBlockIdx.y +
+                                        (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z;
     uint32_t const blockAdnCardPitch = (idx.mDataBlockIdx * mCardinality + card) * blockPitchByCard;
     uint32_t const pitch = blockAdnCardPitch + inBlockInCardPitch;
     return pitch;
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
     helpNghPitch(const Idx& nghIdx, int card)
         const -> std::tuple<bool, uint32_t>
 {
@@ -126,8 +126,8 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, memBlockSizeX, memBlockSizeY,
     return {true, offset};
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     helpGetNghIdx(const Idx&    idx,
                   const NghIdx& offset)
         const -> Idx
@@ -142,9 +142,9 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY,
      * 1 positive offset
      * -1 negative offset
      */
-    const int xFlag = ngh.x < 0 ? -1 : (ngh.x >= memBlockSizeX ? +1 : 0);
-    const int yFlag = ngh.y < 0 ? -1 : (ngh.y >= memBlockSizeX ? +1 : 0);
-    const int zFlag = ngh.z < 0 ? -1 : (ngh.z >= memBlockSizeX ? +1 : 0);
+    const int xFlag = ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0);
+    const int yFlag = ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0);
+    const int zFlag = ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0);
 
     const bool isLocal = (xFlag | yFlag | zFlag) == 0;
     if (!(isLocal)) {
@@ -177,9 +177,9 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY,
          * */
 
         Idx remoteNghIdx;
-        remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * memBlockSizeX;
-        remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * memBlockSizeX;
-        remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * memBlockSizeX;
+        remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX;
+        remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX;
+        remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX;
 
         int connectivityJump = idx.mDataBlockIdx * 27 +
                                (xFlag + 1) +
@@ -196,8 +196,8 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY,
     }
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     getNghData(const Idx& eId,
                uint8_t    nghID,
                int        card)
@@ -207,8 +207,8 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY,
     return getNghData(eId, nghOffset, card);
 }
 
-template <typename T, int C, uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::
+template <typename T, int C, typename SBlock>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     getNghData(const Idx&    idx,
                const NghIdx& offset,
                const int     card)
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h
index bf91dc16..80fb12ab 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h
@@ -4,7 +4,7 @@
 
 namespace Neon::domain::details::bGrid {
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 class bSpan
 {
    public:
@@ -15,8 +15,8 @@ class bSpan
     static constexpr Neon::MemoryLayout activeMaskMemoryLayout = Neon::MemoryLayout::arrayOfStructs;
     static constexpr uint32_t           log2OfbitMaskWordSize = 6;
 
-    using Idx = bIndex<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
-    friend class bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
+    using Idx = bIndex<SBlock>;
+    friend class bGrid<SBlock>;
 
     static constexpr int SpaceDim = 3;
 
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h
index 50f441a0..57d7aeca 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h
@@ -2,9 +2,9 @@
 
 namespace Neon::domain::details::bGrid {
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 NEON_CUDA_HOST_DEVICE inline auto
-bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool
+bSpan<SBlock>::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool
 {
 #ifdef NEON_PLACE_CUDA_DEVICE
     bidx.mDataBlockIdx = blockIdx.x + mFirstDataBlockOffset;
@@ -22,9 +22,9 @@ bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSize
 #endif
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 NEON_CUDA_HOST_DEVICE inline auto
-bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::setAndValidateCPUDevice(Idx&            bidx,
+bSpan<SBlock>::setAndValidateCPUDevice(Idx&            bidx,
                                                                                                                             uint32_t const& dataBlockIdx,
                                                                                                                             uint32_t const& x,
                                                                                                                             uint32_t const& y,
@@ -41,8 +41,8 @@ bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSize
     return isActive;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::bSpan(typename Idx::DataBlockCount firstDataBlockOffset,
+template <typename SBlock>
+bSpan<SBlock>::bSpan(typename Idx::DataBlockCount firstDataBlockOffset,
                                                                                                           BitMaskWordType*             activeMask,
                                                                                                           Neon::DataView               dataView)
     : mFirstDataBlockOffset(firstDataBlockOffset),
@@ -51,16 +51,16 @@ bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSize
 {
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE inline auto bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getRequiredWordsForBlockBitMask() -> uint32_t
+template <typename SBlock>
+NEON_CUDA_HOST_DEVICE inline auto bSpan<SBlock>::getRequiredWordsForBlockBitMask() -> uint32_t
 {
-    uint32_t requiredBits = memBlockSizeX * memBlockSizeY * memBlockSizeZ;
-    uint32_t requiredWords = ((requiredBits - 1) >> bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::log2OfbitMaskWordSize) + 1;
+    uint32_t requiredBits = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ;
+    uint32_t requiredWords = ((requiredBits - 1) >> bSpan<SBlock>::log2OfbitMaskWordSize) + 1;
     return requiredWords;
 }
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-inline auto bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getMaskAndWordIdforBlockBitMask(int                       threadX,
+template <typename SBlock>
+inline auto bSpan<SBlock>::getMaskAndWordIdforBlockBitMask(int                       threadX,
                                                                                                                                                 int                       threadY,
                                                                                                                                                 int                       threadZ,
                                                                                                                                                 NEON_OUT BitMaskWordType& mask,
@@ -68,7 +68,7 @@ inline auto bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, u
 {
     if constexpr (activeMaskMemoryLayout == Neon::MemoryLayout::arrayOfStructs) {
         // 6 = log_2 64
-        const uint32_t threadPitch = threadX + threadY * memBlockSizeX + threadZ * memBlockSizeX * memBlockSizeY;
+        const uint32_t threadPitch = threadX + threadY * SBlock::memBlockSizeX + threadZ * SBlock::memBlockSizeX * SBlock::memBlockSizeY;
         // threadPitch >> log2OfbitMaskWordSize
         // the same as: threadPitch / 2^{log2OfbitMaskWordSize}
         wordIdx = threadPitch >> log2OfbitMaskWordSize;
@@ -82,8 +82,8 @@ inline auto bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, u
 }
 
 
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
-NEON_CUDA_HOST_DEVICE inline auto bSpan<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>::getActiveStatus(
+template <typename SBlock>
+NEON_CUDA_HOST_DEVICE inline auto bSpan<SBlock>::getActiveStatus(
     const typename Idx::DataBlockIdx& dataBlockIdx,
     int                               threadX,
     int                               threadY,
@@ -92,7 +92,7 @@ NEON_CUDA_HOST_DEVICE inline auto bSpan<memBlockSizeX, memBlockSizeY, memBlockSi
 {
     if constexpr (activeMaskMemoryLayout == Neon::MemoryLayout::arrayOfStructs) {
         // 6 = log_2 64
-        const uint32_t threadPitch = threadX + threadY * memBlockSizeX + threadZ * memBlockSizeX * memBlockSizeY;
+        const uint32_t threadPitch = threadX + threadY * SBlock::memBlockSizeX + threadZ * SBlock::memBlockSizeX * SBlock::memBlockSizeY;
         // threadPitch >> log2OfbitMaskWordSize
         // the same as: threadPitch / 2^{log2OfbitMaskWordSize}
         const uint32_t wordIdx = threadPitch >> log2OfbitMaskWordSize;
diff --git a/libNeonDomain/src/domain/details/bGrid/bGrid.cpp b/libNeonDomain/src/domain/details/bGrid/bGrid.cpp
index 0cc0dfef..78dad9bf 100644
--- a/libNeonDomain/src/domain/details/bGrid/bGrid.cpp
+++ b/libNeonDomain/src/domain/details/bGrid/bGrid.cpp
@@ -3,6 +3,6 @@
 namespace Neon::domain::details::bGrid {
 
 
-template class bGrid<8,8,8>;
+template class bGrid<Neon::domain::details::bGrid::StaticBlock<8,8,8>>;
 
 }  // namespace Neon::domain::details::bGrid
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp b/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp
index 9e0cd408..794dfde0 100644
--- a/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp
@@ -4,38 +4,34 @@
 #include "gtest/gtest.h"
 
 
-
-template <uint32_t memBlockSizeX, uint32_t memBlockSizeY, uint32_t memBlockSizeZ, uint32_t userBlockSizeX, uint32_t userBlockSizeY, uint32_t userBlockSizeZ>
+template <typename SBlock>
 void test_backToBackConversion()
 {
-    using bGrid = Neon::domain::details::bGrid::bGrid<memBlockSizeX, memBlockSizeY, memBlockSizeZ, userBlockSizeX, userBlockSizeY, userBlockSizeZ>;
+    using bGrid = Neon::domain::details::bGrid::bGrid<SBlock>;
     using MicroIndex = Neon::domain::details::bGrid::MicroIndex;
     typename bGrid::Idx bIdx;
     MicroIndex          microIdx;
-    uint32_t            ratioOnX = (memBlockSizeX) / (userBlockSizeX);
-    uint32_t            ratioOnY = (memBlockSizeY) / (userBlockSizeY);
-    uint32_t            ratioOnZ = (memBlockSizeZ) / (userBlockSizeZ);
 
     for (uint32_t memBlockIdx = 0; memBlockIdx < 10; memBlockIdx++) {
-      const uint32_t  memBlockJump = (ratioOnX*ratioOnY*ratioOnZ)*memBlockIdx;
-        for (uint32_t rZ = 0; rZ < ratioOnZ; rZ++) {
-            for (uint32_t rY = 0; rY < ratioOnY; rY++) {
-                for (uint32_t rX = 0; rX < ratioOnX; rX++) {
-                    for (int8_t k = 0; k < int8_t(userBlockSizeX); k++) {
-                        for (int8_t j = 0; j < int8_t(userBlockSizeY); j++) {
-                            for (int8_t i = 0; i < int8_t(userBlockSizeZ); i++) {  // Set the micro idx to the first voxel
+        const uint32_t memBlockJump = (SBlock::blockRatioX * SBlock::blockRatioY * SBlock::blockRatioZ) * memBlockIdx;
+        for (uint32_t rZ = 0; rZ < SBlock::blockRatioZ; rZ++) {
+            for (uint32_t rY = 0; rY < SBlock::blockRatioY; rY++) {
+                for (uint32_t rX = 0; rX < SBlock::blockRatioX; rX++) {
+                    for (int8_t k = 0; k < int8_t(SBlock::userBlockSizeX); k++) {
+                        for (int8_t j = 0; j < int8_t(SBlock::userBlockSizeY); j++) {
+                            for (int8_t i = 0; i < int8_t(SBlock::userBlockSizeZ); i++) {  // Set the micro idx to the first voxel
                                 // Check that bIdx point to the first voxels too
-                                microIdx.setTrayBlockIdx(memBlockJump + rX + rY * ratioOnX + rZ * ratioOnY * ratioOnX);
+                                microIdx.setTrayBlockIdx(memBlockJump + rX + rY * SBlock::blockRatioX + rZ * SBlock::blockRatioY * SBlock::blockRatioX);
                                 microIdx.setInTrayBlockIdx({i, j, k});
                                 bIdx.init(microIdx);
 
                                 auto res = bIdx.getMicroIndex();
 
                                 ASSERT_EQ(bIdx.getDataBlockIdx(), memBlockIdx);
-                                ASSERT_EQ(bIdx.getInDataBlockIdx(), Neon::int8_3d(static_cast<int8_t>(i + rX * userBlockSizeX),
-                                                                                   static_cast<int8_t>(j + rY * userBlockSizeY),
-                                                                                  static_cast<int8_t>( k + rZ * userBlockSizeZ)))
-                                    << bIdx.getInDataBlockIdx() << " instead of " << Neon::int8_3d(static_cast<int8_t>(i + rX * userBlockSizeX), static_cast<int8_t>(j + rY * userBlockSizeY),static_cast<int8_t>( k + rZ * userBlockSizeZ)) << " with rX,Ry,rZ " << rX << "," << rY << "," << rZ << " and i,j,k = " << i << "," << j << "," << k;
+                                ASSERT_EQ(bIdx.getInDataBlockIdx(), Neon::int8_3d(static_cast<int8_t>(i + rX * SBlock::userBlockSizeX),
+                                                                                  static_cast<int8_t>(j + rY * SBlock::userBlockSizeY),
+                                                                                  static_cast<int8_t>(k + rZ * SBlock::userBlockSizeZ)))
+                                    << bIdx.getInDataBlockIdx() << " instead of " << Neon::int8_3d(static_cast<int8_t>(i + rX * SBlock::userBlockSizeX), static_cast<int8_t>(j + rY * SBlock::userBlockSizeY), static_cast<int8_t>(k + rZ * SBlock::userBlockSizeZ)) << " with rX,Ry,rZ " << rX << "," << rY << "," << rZ << " and i,j,k = " << i << "," << j << "," << k;
 
 
                                 ASSERT_EQ(res.getTrayBlockIdx(), microIdx.getTrayBlockIdx());
@@ -51,27 +47,27 @@ void test_backToBackConversion()
 
 TEST(bGrid_tray, init_4_4_4_2_2_2)
 {
-    test_backToBackConversion<4, 4, 4, 2, 2, 2>();
+    test_backToBackConversion<Neon::domain::details::bGrid::StaticBlock<4, 4, 4, 2, 2, 2>>();
 }
 
 TEST(bGrid_tray, init_8_8_8_2_2_2)
 {
-    test_backToBackConversion<8, 8, 8, 2, 2, 2>();
+    test_backToBackConversion<Neon::domain::details::bGrid::StaticBlock<8, 8, 8, 2, 2, 2>>();
 }
 
 TEST(bGrid_tray, init_8_8_8_1_1_1)
 {
-    test_backToBackConversion<8, 8, 8, 1, 1, 1>();
+    test_backToBackConversion<Neon::domain::details::bGrid::StaticBlock<8, 8, 8, 1, 1, 1>>();
 }
 
 TEST(bGrid_tray, init_8_8_8_4_4_4)
 {
-    test_backToBackConversion<8, 8, 8, 4, 4, 4>();
+    test_backToBackConversion<Neon::domain::details::bGrid::StaticBlock<8, 8, 8, 4, 4, 4>>();
 }
 
 TEST(bGrid_tray, init_4_4_4_2_1_2)
 {
-    test_backToBackConversion<4,4,4, 2, 1, 2>();
+    test_backToBackConversion<Neon::domain::details::bGrid::StaticBlock<4, 4, 4, 2, 1, 2>>();
 }
 
 int main(int argc, char** argv)

From d82e985c2c92863daf39ae3d595d2dafc9ea443f Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 Jun 2023 17:54:05 -0400
Subject: [PATCH 04/25] Cleaning up naming for the BlockViewGrid

---
 .../bGrid/BlockViewGrid/BlockViewGrid.h       |  2 +-
 .../Neon/domain/details/bGrid/StaticBlock.h   | 13 +++++++++++
 .../Neon/domain/details/bGrid/bField.h        |  6 +++--
 .../include/Neon/domain/details/bGrid/bGrid.h | 23 +++++++++++--------
 .../Neon/domain/details/bGrid/bPartition.h    | 12 ++++++----
 5 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h
index 3f2f3544..cc714802 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h
@@ -90,8 +90,8 @@ struct GridTransformation
             });
     }
 };
+using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;
 
 }  // namespace details
-using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;
 
 }  // namespace Neon::domain::details::bGrid
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h
index 612c6b9a..14872577 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h
@@ -41,6 +41,19 @@ struct StaticBlock
     static_assert(memBlockSizeX % userBlockSizeX == 0);
     static_assert(memBlockSizeY % userBlockSizeY == 0);
     static_assert(memBlockSizeZ % userBlockSizeZ == 0);
+
+    struct BitMask
+    {
+        auto reset()
+        {
+            for (uint32_t i = 0; i < nWords; ++i) {
+                bits[i] = 0;
+            }
+        }
+
+        constexpr static uint32_t nWords = (memBlockCountElements + 31) / 32;
+        uint32_t                  bits[nWords];
+    };
 };
 
 }  // namespace Neon::domain::details::bGrid
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
index 95c1d6d5..8f1ac485 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
@@ -33,11 +33,13 @@ class bField : public Neon::domain::interface::FieldBaseTemplate<T,
     using Field = bField<T, C, SBlock>;
     using Partition = bPartition<T, C, SBlock>;
     using Idx = bIndex<SBlock>;
+    using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;
+    template<typename TT, int CC = 0>
+    using BlockViewField = BlockViewGrid::template Field<TT, CC>;
 
     using NghIdx = typename Partition::NghIdx;
     using NghData = typename Partition::NghData;
 
-
     bField(const std::string&         fieldUserName,
            Neon::DataUse              dataUse,
            const Neon::MemoryOptions& memoryOptions,
@@ -109,7 +111,7 @@ class bField : public Neon::domain::interface::FieldBaseTemplate<T,
         };
 
         std::shared_ptr<Grid>      grid;
-        BlockViewGrid::Field<T, C> memoryField;
+        BlockViewField<T, C> memoryField;
 
         int mCardinality;
 
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
index 9d91df5d..a40935bb 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
@@ -5,6 +5,7 @@
 
 #include "BlockViewGrid/BlockViewGrid.h"
 #include "Neon/domain/aGrid.h"
+#include "Neon/domain/details/bGrid/StaticBlock.h"
 #include "Neon/domain/details/bGrid/bField.h"
 #include "Neon/domain/details/bGrid/bIndex.h"
 #include "Neon/domain/details/bGrid/bPartition.h"
@@ -16,8 +17,6 @@
 #include "Neon/domain/tools/SpanTable.h"
 #include "Neon/set/Containter.h"
 #include "Neon/set/LaunchParametersTable.h"
-#include "Neon/domain/details/bGrid/StaticBlock.h"
-
 
 #include "bField.h"
 #include "bPartition.h"
@@ -31,7 +30,7 @@ class bField;
 
 template <typename SBlock>
 class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
-                                                               bIndex<SBlock> >
+                                                               bIndex<SBlock>>
 {
    public:
     using Grid = bGrid<SBlock>;
@@ -42,9 +41,13 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
     template <typename T, int C = 0>
     using Field = Neon::domain::details::bGrid::bField<T, C, SBlock>;
 
+    using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;
+    template <typename T, int C = 0>
+    using BlockViewField = BlockViewGrid::template Field<T, C>;
+
     using Span = bSpan<SBlock>;
     using NghIdx = typename Partition<int>::NghIdx;
-    using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate<Grid, bIndex<SBlock> >;
+    using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate<Grid, bIndex<SBlock>>;
 
     using Idx = bIndex<SBlock>;
     static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d1b3;
@@ -124,9 +127,9 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
                            T                   inactiveValue,
                            Neon::DataUse       dataUse = Neon::DataUse::HOST_DEVICE,
                            Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const
-        -> BlockViewGrid::Field<T, C>;
+        -> BlockViewField<T, C>;
 
-    /*
+    /**
      * Allocates a new container to execute some computation in the grid
      */
     template <Neon::Execution execution = Neon::Execution::device,
@@ -136,7 +139,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
                       size_t             sharedMem,
                       LoadingLambda      lambda) const -> Neon::set::Container;
 
-    /*
+    /**
      * Allocates a new container to execute some computation in the grid
      */
     template <Neon::Execution execution = Neon::Execution::device,
@@ -168,12 +171,12 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
      * Retrieve the block vew grid internally used.
      * This grid can be leverage to allocate data at the block level.
      */
-    auto getActiveBitMask() const -> BlockViewGrid::Field<uint64_t, 0>&;
+    auto getActiveBitMask() const -> BlockViewField<uint64_t, 0>&;
 
     /**
      * Help function to retrieve the block connectivity as a BlockViewGrid field
      */
-    auto helpGetBlockConnectivity() const -> BlockViewGrid::Field<BlockIdx, 27>&;
+    auto helpGetBlockConnectivity() const -> BlockViewField<BlockIdx, 27>&;
 
     /**
      * Help function to retrieve the block origin as a BlockViewGrid field
@@ -228,7 +231,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
     };
     std::shared_ptr<Data> mData;
 };
-extern template class bGrid<StaticBlock<8,8,8>>;
+extern template class bGrid<StaticBlock<8, 8, 8>>;
 }  // namespace Neon::domain::details::bGrid
 
 #include "bField_imp.h"
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
index f20a513d..48312b22 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
@@ -21,6 +21,8 @@ class bPartition
     using NghIdx = typename Idx::NghIdx;
     using Type = T;
     using NghData = Neon::domain::NghData<T>;
+
+    using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;
     using BlockViewGridIdx = BlockViewGrid::Idx;
 
    public:
@@ -90,13 +92,13 @@ class bPartition
         const -> Idx;
 
 
-    int                    mCardinality;
-    T*                     mMem;
-    NghIdx*                mStencilNghIndex;
+    int                             mCardinality;
+    T*                              mMem;
+    NghIdx*                         mStencilNghIndex;
     typename Idx::DataBlockIdx*     mBlockConnectivity;
     typename Span::BitMaskWordType* mMask;
-    Neon::int32_3d*        mOrigin;
-    int                    mSetIdx;
+    Neon::int32_3d*                 mOrigin;
+    int                             mSetIdx;
 };
 
 }  // namespace Neon::domain::details::bGrid

From 9e29f8e78a673dda0f587184ee981f0793b75fa0 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 Jun 2023 19:35:02 -0400
Subject: [PATCH 05/25] bGrid - introducing the concept of BlockView and
 refactoring the bitmask field.

---
 .../Neon/domain/details/bGrid/BlockView.h     |  29 +++++
 .../BlockViewGrid.h                           |   0
 .../BlockViewPartition.h                      |   0
 .../BlockViewPartition_imp.h                  |   0
 .../Neon/domain/details/bGrid/StaticBlock.h   |  53 +++++++++-
 .../include/Neon/domain/details/bGrid/bGrid.h |  32 +++---
 .../Neon/domain/details/bGrid/bGrid_imp.h     | 100 ++++++++----------
 .../Neon/domain/details/bGrid/bPartition.h    |  28 ++---
 .../domain/details/bGrid/bPartition_imp.h     |  19 ++--
 .../include/Neon/domain/details/bGrid/bSpan.h |  40 +++----
 .../Neon/domain/details/bGrid/bSpan_imp.h     |  89 ++--------------
 .../Neon/domain/details/eGrid/eField_imp.h    |   2 +-
 .../Neon/domain/interface/FieldBase_imp.h     |   2 +-
 13 files changed, 185 insertions(+), 209 deletions(-)
 create mode 100644 libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h
 rename libNeonDomain/include/Neon/domain/details/bGrid/{BlockViewGrid => BlockView}/BlockViewGrid.h (100%)
 rename libNeonDomain/include/Neon/domain/details/bGrid/{BlockViewGrid => BlockView}/BlockViewPartition.h (100%)
 rename libNeonDomain/include/Neon/domain/details/bGrid/{BlockViewGrid => BlockView}/BlockViewPartition_imp.h (100%)

diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h
new file mode 100644
index 00000000..42093147
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h
@@ -0,0 +1,29 @@
+#include "Neon/domain/details/bGrid/BlockView/BlockViewGrid.h"
+#include "Neon/domain/tools/GridTransformer.h"
+
+namespace Neon::domain::details::bGrid {
+
+struct BlockView
+{
+   public:
+    using Grid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;
+    template <typename T, int C = 0>
+    using Field = Grid::template Field<T, C>;
+    using index_3d = Neon::index_3d;
+
+    template <typename T, int C = 0>
+    static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t<C == 0, T&>
+    {
+        return mem[idx * card];
+    }
+
+    template <typename T, int C = 0>
+    static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t<C != 0, T&>
+    {
+        return mem[idx * C];
+    }
+
+    static constexpr Neon::MemoryLayout layout = Neon::MemoryLayout::arrayOfStructs;
+};
+
+}  // namespace Neon::domain::details::bGrid
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewGrid.h
similarity index 100%
rename from libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h
rename to libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewGrid.h
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition.h
similarity index 100%
rename from libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition.h
rename to libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition.h
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition_imp.h
similarity index 100%
rename from libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition_imp.h
rename to libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition_imp.h
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h
index 14872577..951f9fd3 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h
@@ -44,15 +44,60 @@ struct StaticBlock
 
     struct BitMask
     {
-        auto reset()
+        using BitMaskWordType = uint32_t;
+        auto reset() -> void
         {
-            for (uint32_t i = 0; i < nWords; ++i) {
+            for (BitMaskWordType i = 0; i < nWords; ++i) {
                 bits[i] = 0;
             }
         }
 
-        constexpr static uint32_t nWords = (memBlockCountElements + 31) / 32;
-        uint32_t                  bits[nWords];
+        auto setActive(int threadX,
+                       int threadY,
+                       int threadZ) -> void
+        {
+            BitMaskWordType mask;
+            uint32_t        wordIdx;
+            getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx);
+            auto& word = bits[wordIdx];
+            word = word | mask;
+        }
+
+        inline auto NEON_CUDA_HOST_DEVICE isActive(int threadX,
+                                                   int threadY,
+                                                   int threadZ) const -> bool
+        {
+            BitMaskWordType mask;
+            uint32_t        wordIdx;
+            getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx);
+            auto& word = bits[wordIdx];
+            return (word & mask) != 0;
+        }
+
+        static inline auto NEON_CUDA_HOST_DEVICE getMaskAndWordI(int                       threadX,
+                                                                 int                       threadY,
+                                                                 int                       threadZ,
+                                                                 NEON_OUT BitMaskWordType& mask,
+                                                                 NEON_OUT uint32_t&        wordIdx) -> void
+        {
+            const uint32_t threadPitch = threadX * memBlockPitchX +
+                                         threadY * memBlockPitchY +
+                                         threadZ * memBlockPitchZ;
+
+            // threadPitch >> log2_of_bitPerWord
+            // the same as: threadPitch / 2^{log2_of_bitPerWord}
+            wordIdx = threadPitch >> log2_of_bitPerWord;
+            // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1);
+            // same as threadPitch % 2^{log2OfbitMaskWordSize}
+            const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitPerWord)) - 1);
+            mask = BitMaskWordType(1) << offsetInWord;
+        }
+
+        constexpr static BitMaskWordType nWords = (memBlockCountElements + 31) / 32;
+        static constexpr uint32_t        log2_of_bitPerWord = 5;
+        static constexpr uint32_t        bitPerWord = 32;
+
+        BitMaskWordType bits[nWords];
     };
 };
 
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
index a40935bb..8ed458c8 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
@@ -1,10 +1,8 @@
 #pragma once
 #include "Neon/core/core.h"
 
-#include "Neon/set/memory/memSet.h"
-
-#include "BlockViewGrid/BlockViewGrid.h"
 #include "Neon/domain/aGrid.h"
+#include "Neon/domain/details/bGrid/BlockView.h"
 #include "Neon/domain/details/bGrid/StaticBlock.h"
 #include "Neon/domain/details/bGrid/bField.h"
 #include "Neon/domain/details/bGrid/bIndex.h"
@@ -17,6 +15,7 @@
 #include "Neon/domain/tools/SpanTable.h"
 #include "Neon/set/Containter.h"
 #include "Neon/set/LaunchParametersTable.h"
+#include "Neon/set/memory/memSet.h"
 
 #include "bField.h"
 #include "bPartition.h"
@@ -34,17 +33,11 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
 {
    public:
     using Grid = bGrid<SBlock>;
-
     template <typename T, int C = 0>
     using Partition = bPartition<T, C, SBlock>;
-
     template <typename T, int C = 0>
     using Field = Neon::domain::details::bGrid::bField<T, C, SBlock>;
 
-    using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;
-    template <typename T, int C = 0>
-    using BlockViewField = BlockViewGrid::template Field<T, C>;
-
     using Span = bSpan<SBlock>;
     using NghIdx = typename Partition<int>::NghIdx;
     using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate<Grid, bIndex<SBlock>>;
@@ -127,7 +120,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
                            T                   inactiveValue,
                            Neon::DataUse       dataUse = Neon::DataUse::HOST_DEVICE,
                            Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const
-        -> BlockViewField<T, C>;
+        -> BlockView::Field<T, C>;
 
     /**
      * Allocates a new container to execute some computation in the grid
@@ -165,30 +158,30 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
      * Retrieve the block vew grid internally used.
      * This grid can be leverage to allocate data at the block level.
      */
-    auto getBlockViewGrid() const -> BlockViewGrid&;
+    auto getBlockViewGrid() const -> BlockView::Grid&;
 
     /**
      * Retrieve the block vew grid internally used.
      * This grid can be leverage to allocate data at the block level.
      */
-    auto getActiveBitMask() const -> BlockViewField<uint64_t, 0>&;
+    auto getActiveBitMask() const -> BlockView::Field<typename SBlock::BitMask, 1>&;
 
     /**
      * Help function to retrieve the block connectivity as a BlockViewGrid field
      */
-    auto helpGetBlockConnectivity() const -> BlockViewField<BlockIdx, 27>&;
+    auto helpGetBlockConnectivity() const -> BlockView::Field<BlockIdx, 27>&;
 
     /**
      * Help function to retrieve the block origin as a BlockViewGrid field
      */
     auto helpGetDataBlockOriginField() const -> Neon::aGrid::Field<index_3d, 0>&;
 
-    /*
+    /**
      * Help function to retrieve the map that converts a stencil point id to 3d offset
      */
     auto helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet<Neon::int8_3d>&;
 
-    /*
+    /**
      * Help function retriev the device and the block index associated to a point in the BlockViewGrid grid
      */
     auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple<Neon::SetIdx, Idx>;
@@ -212,11 +205,10 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
         Neon::aGrid::Field<index_3d, 0> mDataBlockOriginField;
         Neon::set::MemSet<int8_t>       mStencil3dTo1dOffset;
 
-        BlockViewGrid                      blockViewGrid;
-        BlockViewGrid::Field<uint64_t, 0>  activeBitMask;
-        BlockViewGrid::Field<BlockIdx, 27> blockConnectivity;
-
-        Neon::set::MemSet<Neon::int8_3d> stencilIdTo3dOffset;
+        BlockView::Grid                               blockViewGrid;
+        BlockView::Field<typename SBlock::BitMask, 1> activeBitField;
+        BlockView::Field<BlockIdx, 27>                blockConnectivity;
+        Neon::set::MemSet<Neon::int8_3d>              stencilIdTo3dOffset;
 
         tool::Partitioner1D::DenseMeta denseMeta;
 
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
index 1b40a8b7..7505a06b 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
@@ -75,66 +75,60 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
             spacingData * SBlock::memBlockSize3D,
             origin);
 
-        mData->blockViewGrid = BlockViewGrid(egrid);
+        mData->blockViewGrid = BlockView::Grid(egrid);
     }
 
     {  // Active bitmask
-        int requiredWords = Span::getRequiredWordsForBlockBitMask();
-        mData->activeBitMask = mData->blockViewGrid.template newField<typename Span::BitMaskWordType>("BitMask",
-                                                                                                      requiredWords,
-                                                                                                      0,
-                                                                                                      Neon::DataUse::HOST_DEVICE, backend.getMemoryOptions(Span::activeMaskMemoryLayout));
+        mData->activeBitField = mData->blockViewGrid.template newField<typename SBlock::BitMask, 1>(
+            "BlockViewBitMask",
+            1,
+            [] {
+                typename SBlock::BitMask outsideBitMask;
+                outsideBitMask.reset();
+                return outsideBitMask;
+            }(),
+            Neon::DataUse::HOST_DEVICE, backend.getMemoryOptions(BlockView::layout));
 
         mData->mNumActiveVoxel = backend.devSet().template newDataSet<uint64_t>();
 
-        mData->activeBitMask
+        mData->activeBitField
             .getGrid()
             .template newContainer<Neon::Execution::host>(
                 "activeBitMaskInit",
                 [&](Neon::set::Loader& loader) {
-                    auto bitMask = loader.load(mData->activeBitMask);
-                    return [&, bitMask](const auto& bitMaskIdx) mutable {
-                        auto       prtIdx = bitMask.prtID();
-                        int        coutActive = 0;
-                        auto const blockOrigin = bitMask.getGlobalIndex(bitMaskIdx);
-
-                        for (int c = 0; c < bitMask.cardinality(); c++) {
-                            bitMask(bitMaskIdx, c) = 0;
-                        }
+                    auto bitMaskPartition = loader.load(mData->activeBitField);
+                    return [&, bitMaskPartition](const auto& bitMaskIdx) mutable {
+                        auto                      prtIdx = bitMaskPartition.prtID();
+                        int                       countActive = 0;
+                        auto const                blockOrigin = bitMaskPartition.getGlobalIndex(bitMaskIdx);
+                        typename SBlock::BitMask& bitMask = bitMaskPartition(bitMaskIdx, 0);
+                        bitMask.reset();
 
                         for (int k = 0; k < SBlock::memBlockSize3D.template newType<int32_t>().z; k++) {
                             for (int j = 0; j < SBlock::memBlockSize3D.template newType<int32_t>().y; j++) {
                                 for (int i = 0; i < SBlock::memBlockSize3D.template newType<int32_t>().x; i++) {
-
-                                    Neon::int32_3d                 localPosition(i, j, k);
-                                    typename Span::BitMaskWordType mask;
-                                    uint32_t                       wordIdx;
-
-                                    Span::getMaskAndWordIdforBlockBitMask(i, j, k, NEON_OUT mask, NEON_OUT wordIdx);
-                                    auto globalPosition = localPosition + blockOrigin;
-                                    bool isInDomain = globalPosition < domainSize;
-                                    bool isActive = activeCellLambda(globalPosition);
+                                    auto globalPosition = blockOrigin + Neon::int32_3d(i, j, k);
+                                    bool const isInDomain = globalPosition < domainSize;
+                                    bool const isActive = activeCellLambda(globalPosition);
                                     if (isActive && isInDomain) {
-                                        coutActive++;
-                                        auto value = bitMask(bitMaskIdx, wordIdx);
-                                        value = value | mask;
-                                        bitMask(bitMaskIdx, wordIdx) = value;
+                                        countActive++;
+                                        bitMask.setActive(i, j, k);
                                     }
                                 }
                             }
                         }
 #pragma omp critical
                         {
-                            mData->mNumActiveVoxel[prtIdx] += coutActive;
+                            mData->mNumActiveVoxel[prtIdx] += countActive;
                         }
                     };
                 })
             .run(Neon::Backend::mainStreamIdx);
 
-        mData->activeBitMask.updateDeviceData(Neon::Backend::mainStreamIdx);
-        mData->activeBitMask.newHaloUpdate(Neon::set::StencilSemantic::standard,
-                                           Neon::set::TransferMode::put,
-                                           Neon::Execution::device)
+        mData->activeBitField.updateDeviceData(Neon::Backend::mainStreamIdx);
+        mData->activeBitField.newHaloUpdate(Neon::set::StencilSemantic::standard,
+                                            Neon::set::TransferMode::put,
+                                            Neon::Execution::device)
             .run(Neon::Backend::mainStreamIdx);
     }
 
@@ -184,20 +178,20 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
             case Neon::DataView::STANDARD: {
                 span.mFirstDataBlockOffset = 0;
                 span.mDataView = dw;
-                span.mActiveMask = mData->activeBitMask.getPartition(execution, setIdx, dw).mem();
+                span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem();
                 break;
             }
             case Neon::DataView::BOUNDARY: {
                 span.mFirstDataBlockOffset = mData->partitioner1D.getSpanClassifier().countInternal(setIdx);
                 span.mDataView = dw;
-                span.mActiveMask = mData->activeBitMask.getPartition(execution, setIdx, dw).mem();
+                span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem();
 
                 break;
             }
             case Neon::DataView::INTERNAL: {
                 span.mFirstDataBlockOffset = 0;
                 span.mDataView = dw;
-                span.mActiveMask = mData->activeBitMask.getPartition(execution, setIdx, dw).mem();
+                span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem();
                 break;
             }
             default: {
@@ -267,10 +261,10 @@ auto bGrid<SBlock>::newBlockViewField(const std::string   name,
                                       int                 cardinality,
                                       T                   inactiveValue,
                                       Neon::DataUse       dataUse,
-                                      Neon::MemoryOptions memoryOptions) const -> BlockViewGrid::Field<T, C>
+                                      Neon::MemoryOptions memoryOptions) const -> BlockView::Field<T, C>
 {
     memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions);
-    BlockViewGrid::Field<T, C> blockViewField = mData->blockViewGrid.template newField<T, C>(name, cardinality, inactiveValue, dataUse, memoryOptions);
+    BlockView::Field<T, C> blockViewField = mData->blockViewGrid.template newField<T, C>(name, cardinality, inactiveValue, dataUse, memoryOptions);
     return blockViewField;
 }
 
@@ -310,7 +304,7 @@ auto bGrid<SBlock>::newContainer(const std::string& name,
 template <typename SBlock>
 auto bGrid<SBlock>::
     getBlockViewGrid()
-        const -> BlockViewGrid&
+        const -> BlockView::Grid&
 {
     return mData->blockViewGrid;
 }
@@ -318,15 +312,15 @@ auto bGrid<SBlock>::
 template <typename SBlock>
 auto bGrid<SBlock>::
     getActiveBitMask()
-        const -> BlockViewGrid::Field<uint64_t, 0>&
+        const -> BlockView::Field<typename SBlock::BitMask, 1>&
 {
-    return mData->activeBitMask;
+    return mData->activeBitField;
 }
 
 template <typename SBlock>
 auto bGrid<SBlock>::
     helpGetBlockConnectivity()
-        const -> BlockViewGrid::Field<BlockIdx, 27>&
+        const -> BlockView::Field<BlockIdx, 27>&
 {
     return mData->blockConnectivity;
 }
@@ -386,22 +380,18 @@ template <typename SBlock>
 auto bGrid<SBlock>::isInsideDomain(const index_3d& idx) const -> bool
 {
     // 1. check if the block is active
-    const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType<int32_t>();
-    auto           blockProperties = mData->blockViewGrid.getProperties(blockIdx3d);
+    const BlockView::index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType<int32_t>();
+    auto                      blockProperties = mData->blockViewGrid.getProperties(blockIdx3d);
 
     if (!blockProperties.isInside()) {
         return false;
     }
-    // 2. The block is active, check the element on the block
-    uint32_t                       wordCardinality;
-    typename Span::BitMaskWordType mask;
-    Span::getMaskAndWordIdforBlockBitMask(idx.x % SBlock::memBlockSize3D.x,
-                                          idx.y % SBlock::memBlockSize3D.y,
-                                          idx.z % SBlock::memBlockSize3D.z,
-                                          NEON_OUT mask,
-                                          NEON_OUT wordCardinality);
-    auto activeBits = mData->activeBitMask.getReference(blockIdx3d, int(wordCardinality));
-    return (activeBits & mask) != 0;
+    // 2. The block is active, check the element in the block
+    typename SBlock::BitMask const& bitMask = mData->activeBitField.getReference(blockIdx3d, 0);
+    bool                            isActive = bitMask.isActive(idx.x % SBlock::memBlockSize3D.x,
+                                                                idx.y % SBlock::memBlockSize3D.y,
+                                                                idx.z % SBlock::memBlockSize3D.z);
+    return isActive;
 }
 
 template <typename SBlock>
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
index 48312b22..7f537ad5 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
@@ -30,13 +30,13 @@ class bPartition
 
     ~bPartition() = default;
 
-    explicit bPartition(int                             setIdx,
-                        int                             mCardinality,
-                        T*                              mMem,
-                        typename Idx::DataBlockIdx*     mBlockConnectivity,
-                        typename Span::BitMaskWordType* mMask,
-                        Neon::int32_3d*                 mOrigin,
-                        NghIdx*                         mStencilNghIndex);
+    explicit bPartition(int                                           setIdx,
+                        int                                           mCardinality,
+                        T*                                            mMem,
+                        typename Idx::DataBlockIdx*                   mBlockConnectivity,
+                        typename SBlock::BitMask const* NEON_RESTRICT mMask,
+                        Neon::int32_3d*                               mOrigin,
+                        NghIdx*                                       mStencilNghIndex);
 
     inline NEON_CUDA_HOST_DEVICE auto
     cardinality()
@@ -92,13 +92,13 @@ class bPartition
         const -> Idx;
 
 
-    int                             mCardinality;
-    T*                              mMem;
-    NghIdx*                         mStencilNghIndex;
-    typename Idx::DataBlockIdx*     mBlockConnectivity;
-    typename Span::BitMaskWordType* mMask;
-    Neon::int32_3d*                 mOrigin;
-    int                             mSetIdx;
+    int                                             mCardinality;
+    T*                                              mMem;
+    NghIdx const* NEON_RESTRICT                     mStencilNghIndex;
+    typename Idx::DataBlockIdx const* NEON_RESTRICT mBlockConnectivity;
+    typename SBlock::BitMask const* NEON_RESTRICT   mMask;
+    Neon::int32_3d const* NEON_RESTRICT             mOrigin;
+    int                                             mSetIdx;
 };
 
 }  // namespace Neon::domain::details::bGrid
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
index 8506476b..6e3b728f 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
@@ -19,13 +19,13 @@ bPartition<T, C, SBlock>::bPartition()
 
 template <typename T, int C, typename SBlock>
 bPartition<T, C, SBlock>::
-    bPartition(int                             setIdx,
-               int                             cardinality,
-               T*                              mem,
-               typename Idx::DataBlockIdx*     blockConnectivity,
-               typename Span::BitMaskWordType* mask,
-               Neon::int32_3d*                 origin,
-               NghIdx*                         stencilNghIndex)
+    bPartition(int                                           setIdx,
+               int                                           cardinality,
+               T*                                            mem,
+               typename Idx::DataBlockIdx*                   blockConnectivity,
+               typename SBlock::BitMask const* NEON_RESTRICT mask,
+               Neon::int32_3d*                               origin,
+               NghIdx*                                       stencilNghIndex)
     : mCardinality(cardinality),
       mMem(mem),
       mStencilNghIndex(stencilNghIndex),
@@ -115,10 +115,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
         return {false, 0};
     }
 
-    bool isActive = Span::getActiveStatus(nghIdx.mDataBlockIdx,
-                                          nghIdx.mInDataBlockIdx.x, nghIdx.mInDataBlockIdx.y, nghIdx.mInDataBlockIdx.z,
-                                          mMask);
-
+    const bool isActive = mMask[nghIdx.mDataBlockIdx].isActive(nghIdx.mInDataBlockIdx.x, nghIdx.mInDataBlockIdx.y, nghIdx.mInDataBlockIdx.z);
     if (!isActive) {
         return {false, 0};
     }
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h
index 80fb12ab..9c6ed821 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h
@@ -23,42 +23,32 @@ class bSpan
     bSpan() = default;
     virtual ~bSpan() = default;
 
-    NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId() -> typename Idx::DataBlockIdx
+    NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId()
+        -> typename Idx::DataBlockIdx
     {
         return std::numeric_limits<uint32_t>::max();
     }
 
-    inline bSpan(typename Idx::DataBlockCount     mFirstDataBlockOffset,
-                 bSpan::BitMaskWordType* mActiveMask,
-                 Neon::DataView          mDataView);
+    inline bSpan(
+        typename Idx::DataBlockCount                  mFirstDataBlockOffset,
+        typename SBlock::BitMask const* NEON_RESTRICT mActiveMask,
+        Neon::DataView                                mDataView);
 
-    NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice(Idx&                   bidx,
-                                                              uint32_t const&        threadIdx,
-                                                              uint32_t const&        x,
-                                                              uint32_t const&        y,
-                                                              uint32_t const&        z) const -> bool;
+    NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice(
+        Idx&            bidx,
+        uint32_t const& threadIdx,
+        uint32_t const& x,
+        uint32_t const& y,
+        uint32_t const& z) const -> bool;
 
     NEON_CUDA_HOST_DEVICE inline auto setAndValidateGPUDevice(
         Idx& bidx) const -> bool;
 
-    static NEON_CUDA_HOST_DEVICE inline auto getRequiredWordsForBlockBitMask() -> uint32_t;
 
-    static NEON_CUDA_HOST_DEVICE inline auto getActiveStatus(
-        const typename Idx::DataBlockIdx& dataBlockIdx,
-        int                      threadX,
-        int                      threadY,
-        int                      threadZ,
-        bSpan::BitMaskWordType*  mActiveMask) -> bool;
-
-    static inline auto getMaskAndWordIdforBlockBitMask(int              threadX,
-                                                       int              threadY,
-                                                       int              threadZ,
-                                                       BitMaskWordType& mask,
-                                                       uint32_t&        wordIdx) -> void;
     // We don't need to have a count on active blocks
-    typename Idx::DataBlockCount     mFirstDataBlockOffset;
-    bSpan::BitMaskWordType* mActiveMask;
-    Neon::DataView          mDataView;
+    typename Idx::DataBlockCount                  mFirstDataBlockOffset;
+    typename SBlock::BitMask const* NEON_RESTRICT mActiveMask;
+    Neon::DataView                                mDataView;
 };
 }  // namespace Neon::domain::details::bGrid
 
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h
index 57d7aeca..8a208110 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h
@@ -12,10 +12,8 @@ bSpan<SBlock>::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool
     bidx.mInDataBlockIdx.y = threadIdx.y;
     bidx.mInDataBlockIdx.z = threadIdx.z;
 
-    bool const isActive = getActiveStatus(bidx.mDataBlockIdx,
-                                          bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z,
-                                          mActiveMask);
-    //  printf("%d %d %d is active %d\n",bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z, (isActive?1:-1));
+    const bool isActive = mActiveMask[bidx.mDataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z);
+
     return isActive;
 #else
     NEON_THROW_UNSUPPORTED_OPERATION("Operation supported only on GPU");
@@ -25,94 +23,29 @@ bSpan<SBlock>::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool
 template <typename SBlock>
 NEON_CUDA_HOST_DEVICE inline auto
 bSpan<SBlock>::setAndValidateCPUDevice(Idx&            bidx,
-                                                                                                                            uint32_t const& dataBlockIdx,
-                                                                                                                            uint32_t const& x,
-                                                                                                                            uint32_t const& y,
-                                                                                                                            uint32_t const& z) const -> bool
+                                       uint32_t const& dataBlockIdx,
+                                       uint32_t const& x,
+                                       uint32_t const& y,
+                                       uint32_t const& z) const -> bool
 {
 
     bidx.mDataBlockIdx = dataBlockIdx;
-    bidx.mInDataBlockIdx.x = static_cast < typename Idx::InDataBlockIdx::Integer>(x);
+    bidx.mInDataBlockIdx.x = static_cast<typename Idx::InDataBlockIdx::Integer>(x);
     bidx.mInDataBlockIdx.y = static_cast<typename Idx::InDataBlockIdx::Integer>(y);
     bidx.mInDataBlockIdx.z = static_cast<typename Idx::InDataBlockIdx::Integer>(z);
-    bool const isActive = getActiveStatus(bidx.mDataBlockIdx,
-                                          bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z,
-                                          mActiveMask);
+    const bool isActive = mActiveMask[dataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z);
     return isActive;
 }
 
 template <typename SBlock>
-bSpan<SBlock>::bSpan(typename Idx::DataBlockCount firstDataBlockOffset,
-                                                                                                          BitMaskWordType*             activeMask,
-                                                                                                          Neon::DataView               dataView)
+bSpan<SBlock>::bSpan(typename Idx::DataBlockCount                  firstDataBlockOffset,
+                     typename SBlock::BitMask const* NEON_RESTRICT activeMask,
+                     Neon::DataView                                dataView)
     : mFirstDataBlockOffset(firstDataBlockOffset),
       mActiveMask(activeMask),
       mDataView(dataView)
 {
 }
 
-template <typename SBlock>
-NEON_CUDA_HOST_DEVICE inline auto bSpan<SBlock>::getRequiredWordsForBlockBitMask() -> uint32_t
-{
-    uint32_t requiredBits = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ;
-    uint32_t requiredWords = ((requiredBits - 1) >> bSpan<SBlock>::log2OfbitMaskWordSize) + 1;
-    return requiredWords;
-}
-
-template <typename SBlock>
-inline auto bSpan<SBlock>::getMaskAndWordIdforBlockBitMask(int                       threadX,
-                                                                                                                                                int                       threadY,
-                                                                                                                                                int                       threadZ,
-                                                                                                                                                NEON_OUT BitMaskWordType& mask,
-                                                                                                                                                NEON_OUT uint32_t&        wordIdx) -> void
-{
-    if constexpr (activeMaskMemoryLayout == Neon::MemoryLayout::arrayOfStructs) {
-        // 6 = log_2 64
-        const uint32_t threadPitch = threadX + threadY * SBlock::memBlockSizeX + threadZ * SBlock::memBlockSizeX * SBlock::memBlockSizeY;
-        // threadPitch >> log2OfbitMaskWordSize
-        // the same as: threadPitch / 2^{log2OfbitMaskWordSize}
-        wordIdx = threadPitch >> log2OfbitMaskWordSize;
-        // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1);
-        // same as threadPitch % 2^{log2OfbitMaskWordSize}
-        const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitMaskStorageBitWidth)) - 1);
-        mask = BitMaskWordType(1) << offsetInWord;
-    } else {
-        assert(false);
-    }
-}
-
-
-template <typename SBlock>
-NEON_CUDA_HOST_DEVICE inline auto bSpan<SBlock>::getActiveStatus(
-    const typename Idx::DataBlockIdx& dataBlockIdx,
-    int                               threadX,
-    int                               threadY,
-    int                               threadZ,
-    BitMaskWordType*                  mActiveMask) -> bool
-{
-    if constexpr (activeMaskMemoryLayout == Neon::MemoryLayout::arrayOfStructs) {
-        // 6 = log_2 64
-        const uint32_t threadPitch = threadX + threadY * SBlock::memBlockSizeX + threadZ * SBlock::memBlockSizeX * SBlock::memBlockSizeY;
-        // threadPitch >> log2OfbitMaskWordSize
-        // the same as: threadPitch / 2^{log2OfbitMaskWordSize}
-        const uint32_t wordIdx = threadPitch >> log2OfbitMaskWordSize;
-        // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1);
-        // same as threadPitch % 2^{log2OfbitMaskWordSize}
-        const uint32_t  offsetInWord = threadPitch & ((BitMaskWordType(bitMaskStorageBitWidth)) - 1);
-        BitMaskWordType mask = BitMaskWordType(1) << offsetInWord;
-
-        uint32_t const  cardinality = getRequiredWordsForBlockBitMask();
-        uint32_t const  pitch = (cardinality * dataBlockIdx) + wordIdx;
-        BitMaskWordType targetWord = mActiveMask[pitch];
-        auto            masked = targetWord & mask;
-        if (masked != 0) {
-            return true;
-        }
-        return false;
-    } else {
-        assert(false);
-    }
-    //
-}
 
 }  // namespace Neon::domain::details::bGrid
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h
index 5cd93860..1843c4df 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h
@@ -35,7 +35,7 @@ eField<T, C>::eField(const std::string&         fieldUserName,
 
     mData->memoryField = mData->grid->getMemoryGrid().template newField<T, C>(fieldUserName + "-storage",
                                                                               cardinality,
-                                                                              T(0),
+                                                                              inactiveValue,
                                                                               dataUse);
 
 
diff --git a/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h b/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h
index ea10edf6..97d10dc1 100644
--- a/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h
+++ b/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h
@@ -359,7 +359,7 @@ template <typename T, int C>
 FieldBase<T, C>::Storage::Storage()
     : dimension(0),
       cardinality(0),
-      outsideVal(static_cast<T>(0.0)),
+      outsideVal(T()),
       dataUse(),
       memoryOptions(),
       haloStatus(),

From cdcdc0df3dcf91aa7927f9085e30041d057a7336 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 Jun 2023 20:09:20 -0400
Subject: [PATCH 06/25] bGrid - fixing multi-GPU

---
 .../include/Neon/domain/details/bGrid/bField.h | 18 +++++++++---------
 .../Neon/domain/details/bGrid/bField_imp.h     | 15 ++++++++-------
 .../Neon/domain/details/bGrid/bGrid_imp.h      |  2 ++
 .../tests/domain-stencil/src/gtests.cpp        |  4 ++--
 4 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
index 8f1ac485..d0dd45c5 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
@@ -34,18 +34,18 @@ class bField : public Neon::domain::interface::FieldBaseTemplate<T,
     using Partition = bPartition<T, C, SBlock>;
     using Idx = bIndex<SBlock>;
     using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;
-    template<typename TT, int CC = 0>
+    template <typename TT, int CC = 0>
     using BlockViewField = BlockViewGrid::template Field<TT, CC>;
 
     using NghIdx = typename Partition::NghIdx;
     using NghData = typename Partition::NghData;
 
-    bField(const std::string&         fieldUserName,
-           Neon::DataUse              dataUse,
-           const Neon::MemoryOptions& memoryOptions,
-           const Grid&                grid,
-           int                        cardinality,
-           T                          inactiveValue);
+    bField(const std::string&  fieldUserName,
+           Neon::DataUse       dataUse,
+           Neon::MemoryOptions memoryOptions,
+           const Grid&         grid,
+           int                 cardinality,
+           T                   inactiveValue);
 
     bField();
 
@@ -110,8 +110,8 @@ class bField : public Neon::domain::interface::FieldBaseTemplate<T,
             static constexpr int nConfigs = 2;
         };
 
-        std::shared_ptr<Grid>      grid;
-        BlockViewField<T, C> memoryField;
+        std::shared_ptr<Grid> grid;
+        BlockViewField<T, C>  memoryField;
 
         int mCardinality;
 
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
index a9c249ca..a6127c43 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
@@ -11,12 +11,12 @@ bField<T, C, SBlock>::bField()
 }
 
 template <typename T, int C, typename SBlock>
-bField<T, C, SBlock>::bField(const std::string&         fieldUserName,
-                             Neon::DataUse              dataUse,
-                             const Neon::MemoryOptions& memoryOptions,
-                             const Grid&                grid,
-                             int                        cardinality,
-                             T                          inactiveValue)
+bField<T, C, SBlock>::bField(const std::string&  fieldUserName,
+                             Neon::DataUse       dataUse,
+                             Neon::MemoryOptions memoryOptions,
+                             const Grid&         grid,
+                             int                 cardinality,
+                             T                   inactiveValue)
     : Neon::domain::interface::FieldBaseTemplate<T, C, Grid, Partition, int>(&grid,
                                                                              fieldUserName,
                                                                              "bField",
@@ -29,7 +29,8 @@ bField<T, C, SBlock>::bField(const std::string&         fieldUserName,
     mData->grid = std::make_shared<Grid>(grid);
 
     if (memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs) {
-        NEON_THROW_UNSUPPORTED_OPERATION("bField does not support MemoryLayout::arrayOfStructs");
+        NEON_WARNING("bField does not support MemoryLayout::arrayOfStructs, enforcing MemoryLayout::structOfArrays");
+        memoryOptions.setOrder(Neon::MemoryLayout::structOfArrays);
     }
     // the allocation size is the number of blocks x block size x cardinality
     mData->memoryField = mData->grid->getBlockViewGrid().template newField<T, 0>(
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
index 7505a06b..b921a3e1 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
@@ -125,7 +125,9 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
                 })
             .run(Neon::Backend::mainStreamIdx);
 
+
         mData->activeBitField.updateDeviceData(Neon::Backend::mainStreamIdx);
+        this->getBackend().sync(Neon::Backend::mainStreamIdx);
         mData->activeBitField.newHaloUpdate(Neon::set::StencilSemantic::standard,
                                             Neon::set::TransferMode::put,
                                             Neon::Execution::device)
diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp
index 70d9d650..ec6f892a 100644
--- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp
@@ -22,9 +22,9 @@ TEST(domain_stencil, eGrid)
                             1);
 }
 
-TEST(domain_stencil, bGridSingleGPU)
+TEST(domain_stencil, bGri )
 {
-    int nGpus = 1;
+    int nGpus = 5;
     using Type = int64_t;
     runAllTestConfiguration(std::function(map::run<Neon::bGrid, Type, 0>),
                             nGpus,

From ea82dfc1c0553a64f5c0bd0d8b23b4be04436195 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 16 Jun 2023 10:14:27 -0400
Subject: [PATCH 07/25] Adding scripts

---
 .../lbm-lid-driven-cavity-flow.py             | 48 +++++++++++++++++++
 .../lbm-lid-driven-cavity-flow.sh             | 30 ++++++------
 2 files changed, 64 insertions(+), 14 deletions(-)
 create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py

diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
new file mode 100644
index 00000000..f4b48dd3
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
@@ -0,0 +1,48 @@
+import subprocess
+
+DOMAIN_SIZE_LIST = "64 128 192 256 320 384 448 512".split()
+DEVICE_ID_LIST = "0 1 2 3 4 5 6 7".split()
+DEVICE_TYPE_LIST = 'cpu gpu'.split()
+GRID_LIST = "dGrid bGrid eGrid".split()
+STORAGE_FP_LIST = "double float".split()
+COMPUTE_FP_LIST = "double float".split()
+OCC_LIST = "nOCC".split()
+WARM_UP_ITER = 10
+MAX_ITER = 100
+REPETITIONS = 5
+
+for DEVICE_TYPE in DEVICE_TYPE_LIST:
+
+    DEVICE_SET_LIST = [DEVICE_ID_LIST[0]]
+    if DEVICE_TYPE == 'gpu':
+        for DEVICE in DEVICE_ID_LIST[1:]:
+            DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE)
+    for OCC in OCC_LIST:
+        for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
+            for STORAGE_FP in STORAGE_FP_LIST:
+                for COMPUTE_FP in COMPUTE_FP_LIST:
+                    for DEVICE_SET in DEVICE_SET_LIST:
+
+                        if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
+                            continue
+
+                        command = 'lbm-lid-driven-cavity-flow'
+                        parameters = []
+                        parameters.append('--deviceType ' + DEVICE_TYPE)
+                        parameters.append('--deviceIds ' + DEVICE_SET)
+                        parameters.append('--grid ' + DEVICE_TYPE)
+                        parameters.append('--domain-size ' + DOMAIN_SIZE)
+                        parameters.append('--warmup-iter ' + str(WARM_UP_ITER))
+                        parameters.append('--repetitions ' + str(REPETITIONS))
+                        parameters.append('--max-iter ' + str(MAX_ITER))
+                        parameters.append(
+                            '--report-filename ' + 'lbm-lid-driven-cavity-flow___' +
+                            DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' +
+                            STORAGE_FP + '_' + COMPUTE_FP + '_' +
+                            DEVICE_SET.replace(' ', '_') + '_' + OCC)
+                        parameters.append('--computeFP ' + COMPUTE_FP)
+                        parameters.append('--storageFP ' + STORAGE_FP)
+                        parameters.append('--benchmark')
+                        parameters.append('--' + OCC)
+
+                        subprocess.run(['echo' , ' '.join(parameters)])
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh
index ba5fe106..7cc5108c 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh
+++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh
@@ -1,7 +1,7 @@
 set -x
 
-DOMAIN_SIZE_LIST="128 192 256 320 384 448 512"
-GRID="dGrid"
+DOMAIN_SIZE_LIST="64 128 192 256 320 384 448 512"
+GRID_LIST="dGrid bGrid eGrid"
 STORAGE_FP_LIST="double float"
 COMPUTE_FP_LIST="double float"
 OCC="nOCC"
@@ -9,20 +9,22 @@ OCC="nOCC"
 for DOMAIN_SIZE in ${DOMAIN_SIZE_LIST}; do
   for STORAGE_FP in ${STORAGE_FP_LIST}; do
     for COMPUTE_FP in ${COMPUTE_FP_LIST}; do
+      for GRID in ${GRID_LIST}; do
 
-      if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then
-        continue
-      fi
+        if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then
+          continue
+        fi
 
-      echo ./lbm-lid-driven-cavity-flow \
-        --deviceType gpu --deviceIds 0 \
-        --grid "${GRID}" \
-        --domain-size "${DOMAIN_SIZE}" \
-        --warmup-iter 10 --max-iter 100 --repetitions 5 \
-        --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \
-        --computeFP "${COMPUTE_FP}" \
-        --storageFP "${STORAGE_FP}" \
-        --${OCC} --benchmark
+        echo ./lbm-lid-driven-cavity-flow \
+          --deviceType gpu --deviceIds 0 \
+          --grid "${GRID}" \
+          --domain-size "${DOMAIN_SIZE}" \
+          --warmup-iter 10 --max-iter 100 --repetitions 5 \
+          --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \
+          --computeFP "${COMPUTE_FP}" \
+          --storageFP "${STORAGE_FP}" \
+          --${OCC} --benchmark
+      done
     done
   done
 done

From 55af7081427f22a352ecfe69cb03c2ad722c16df Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 19 Jun 2023 09:54:52 -0400
Subject: [PATCH 08/25] Benchmarks and scripts

Adding scripts


Adding scripts


Adding scripts


Adding scripts


Adding back eGrid and bGrid to the LBM benchmark.


Fixing warning issue


Fixing script.
---
 .../lbm-lid-driven-cavity-flow/CMakeLists.txt |   9 +-
 .../lbm-lid-driven-cavity-flow.py             | 123 ++++++++++++-----
 .../src/RunCavityTwoPop.cu                    |   8 +-
 .../Neon/domain/details/bGrid/bField.h        |   2 +-
 .../Neon/domain/details/bGrid/bField_imp.h    |   2 +-
 .../Neon/domain/details/bGrid/bPartition.h    |  48 ++++++-
 .../domain/details/bGrid/bPartition_imp.h     | 130 +++++++++++++++++-
 .../Neon/domain/details/eGrid/ePartition.h    |  39 ++++--
 .../domain/details/eGrid/ePartition_imp.h     |  37 +++++
 libNeonSet/include/Neon/set/Containter_imp.h  |   1 +
 10 files changed, 336 insertions(+), 63 deletions(-)

diff --git a/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt b/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt
index dfb18a8c..ed03a750 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt
+++ b/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt
@@ -23,4 +23,11 @@ add_custom_command(
 		TARGET ${APP}  POST_BUILD
 		COMMAND ${CMAKE_COMMAND} -E copy
 		${CMAKE_CURRENT_SOURCE_DIR}/${APP}.sh
-		${CMAKE_BINARY_DIR}/bin/${APP}.sh)
\ No newline at end of file
+		${CMAKE_BINARY_DIR}/bin/${APP}.sh)
+
+add_custom_command(
+		TARGET ${APP}  POST_BUILD
+		COMMAND ${CMAKE_COMMAND} -E copy
+		${CMAKE_CURRENT_SOURCE_DIR}/${APP}.py
+		${CMAKE_BINARY_DIR}/bin/${APP}.py
+)
\ No newline at end of file
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
index f4b48dd3..5aebe104 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
+++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
@@ -1,5 +1,3 @@
-import subprocess
-
 DOMAIN_SIZE_LIST = "64 128 192 256 320 384 448 512".split()
 DEVICE_ID_LIST = "0 1 2 3 4 5 6 7".split()
 DEVICE_TYPE_LIST = 'cpu gpu'.split()
@@ -11,38 +9,89 @@
 MAX_ITER = 100
 REPETITIONS = 5
 
-for DEVICE_TYPE in DEVICE_TYPE_LIST:
-
-    DEVICE_SET_LIST = [DEVICE_ID_LIST[0]]
-    if DEVICE_TYPE == 'gpu':
-        for DEVICE in DEVICE_ID_LIST[1:]:
-            DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE)
-    for OCC in OCC_LIST:
-        for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
-            for STORAGE_FP in STORAGE_FP_LIST:
-                for COMPUTE_FP in COMPUTE_FP_LIST:
-                    for DEVICE_SET in DEVICE_SET_LIST:
-
-                        if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
-                            continue
-
-                        command = 'lbm-lid-driven-cavity-flow'
-                        parameters = []
-                        parameters.append('--deviceType ' + DEVICE_TYPE)
-                        parameters.append('--deviceIds ' + DEVICE_SET)
-                        parameters.append('--grid ' + DEVICE_TYPE)
-                        parameters.append('--domain-size ' + DOMAIN_SIZE)
-                        parameters.append('--warmup-iter ' + str(WARM_UP_ITER))
-                        parameters.append('--repetitions ' + str(REPETITIONS))
-                        parameters.append('--max-iter ' + str(MAX_ITER))
-                        parameters.append(
-                            '--report-filename ' + 'lbm-lid-driven-cavity-flow___' +
-                            DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' +
-                            STORAGE_FP + '_' + COMPUTE_FP + '_' +
-                            DEVICE_SET.replace(' ', '_') + '_' + OCC)
-                        parameters.append('--computeFP ' + COMPUTE_FP)
-                        parameters.append('--storageFP ' + STORAGE_FP)
-                        parameters.append('--benchmark')
-                        parameters.append('--' + OCC)
-
-                        subprocess.run(['echo' , ' '.join(parameters)])
+import subprocess
+import sys
+
+
+def printProgressBar(value, label):
+    n_bar = 40  # size of progress bar
+    max = 100
+    j = value / max
+    sys.stdout.write('\r')
+    bar = '█' * int(n_bar * j)
+    bar = bar + '-' * int(n_bar * (1 - j))
+
+    sys.stdout.write(f"{label.ljust(10)} | [{bar:{n_bar}s}] {int(100 * j)}% ")
+    sys.stdout.flush()
+
+
+def countAll():
+    counter = 0
+    for DEVICE_TYPE in DEVICE_TYPE_LIST:
+        DEVICE_SET_LIST = [DEVICE_ID_LIST[0]]
+        if DEVICE_TYPE == 'gpu':
+            for DEVICE in DEVICE_ID_LIST[1:]:
+                DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE)
+        for OCC in OCC_LIST:
+            for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
+                for STORAGE_FP in STORAGE_FP_LIST:
+                    for COMPUTE_FP in COMPUTE_FP_LIST:
+                        for DEVICE_SET in DEVICE_SET_LIST:
+                            for GRID in GRID_LIST:
+                                if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
+                                    continue
+
+                                counter += 1
+    return counter
+
+
+SAMPLES = countAll()
+counter = 0
+command = './lbm-lid-driven-cavity-flow'
+with open(command + '.log', 'w') as fp:
+    for DEVICE_TYPE in DEVICE_TYPE_LIST:
+        DEVICE_SET_LIST = [DEVICE_ID_LIST[0]]
+        if DEVICE_TYPE == 'gpu':
+            for DEVICE in DEVICE_ID_LIST[1:]:
+                DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE)
+        for OCC in OCC_LIST:
+            for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
+                for STORAGE_FP in STORAGE_FP_LIST:
+                    for COMPUTE_FP in COMPUTE_FP_LIST:
+                        for DEVICE_SET in DEVICE_SET_LIST:
+                            for GRID in GRID_LIST:
+                                if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
+                                    continue
+
+                                parameters = []
+                                parameters.append('--deviceType ' + DEVICE_TYPE)
+                                parameters.append('--deviceIds ' + DEVICE_SET)
+                                parameters.append('--grid ' + GRID)
+                                parameters.append('--domain-size ' + DOMAIN_SIZE)
+                                parameters.append('--warmup-iter ' + str(WARM_UP_ITER))
+                                parameters.append('--repetitions ' + str(REPETITIONS))
+                                parameters.append('--max-iter ' + str(MAX_ITER))
+                                parameters.append(
+                                    '--report-filename ' + 'lbm-lid-driven-cavity-flow___' +
+                                    DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' +
+                                    STORAGE_FP + '_' + COMPUTE_FP + '_' +
+                                    DEVICE_SET.replace(' ', '_') + '_' + OCC)
+                                parameters.append('--computeFP ' + COMPUTE_FP)
+                                parameters.append('--storageFP ' + STORAGE_FP)
+                                parameters.append('--benchmark')
+                                parameters.append('--' + OCC)
+
+                                commandList = []
+                                commandList.append(command)
+                                for el in parameters:
+                                    for s in el.split():
+                                        commandList.append(s)
+
+                                fp.write("\n-------------------------------------------\n")
+                                fp.write(' '.join(commandList))
+                                fp.write("\n-------------------------------------------\n")
+                                fp.flush()
+                                subprocess.run(commandList, text=True, stdout=fp)
+
+                                counter += 1
+                                printProgressBar(counter * 100.0 / SAMPLES, 'Progress')
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
index 2ca5e128..c603415c 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
@@ -128,7 +128,7 @@ auto run(Config& config,
             },
                                 Neon::computeMode_t::seq);
 
-            //sort the position so the linear interpolation works
+            // sort the position so the linear interpolation works
             std::sort(xPosVal.begin(), xPosVal.end(), [=](std::pair<double, double>& a, std::pair<double, double>& b) {
                 return a.first < b.first;
             });
@@ -308,12 +308,10 @@ auto run(Config& config,
         return details::runFilterStoreType<Neon::dGrid>(config, report);
     }
     if (config.gridType == "eGrid") {
-        NEON_DEV_UNDER_CONSTRUCTION("");
-        // return details::runFilterStoreType<Neon::eGrid>(config, report);
+        return details::runFilterStoreType<Neon::eGrid>(config, report);
     }
     if (config.gridType == "bGrid") {
-        NEON_DEV_UNDER_CONSTRUCTION("");
-        //        return details::runFilterStoreType<Neon::bGrid>(config, report);
+        return details::runFilterStoreType<Neon::bGrid>(config, report);
     }
 }
 }  // namespace CavityTwoPop
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
index d0dd45c5..d4d663fd 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
@@ -111,7 +111,7 @@ class bField : public Neon::domain::interface::FieldBaseTemplate<T,
         };
 
         std::shared_ptr<Grid> grid;
-        BlockViewField<T, C>  memoryField;
+        BlockViewField<T, 0>  memoryField;
 
         int mCardinality;
 
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
index a6127c43..29a71248 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
@@ -250,7 +250,7 @@ auto bField<T, C, SBlock>::initHaloUpdateTable() -> void
                 setIdxVec[Data::EndPoints::src] = setIdxSrc;
 
                 std::array<Partition*, Data::EndPointsUtils::nConfigs>                                  partitions;
-                std::array<BlockViewPartition<T, C>*, Data::EndPointsUtils::nConfigs>                   blockViewPartitions;
+                std::array<BlockViewPartition<T, 0>*, Data::EndPointsUtils::nConfigs>                   blockViewPartitions;
                 std::array<std::array<int, ByDirectionUtils::nConfigs>, Data::EndPointsUtils::nConfigs> ghostZBeginIdx;
                 std::array<std::array<int, ByDirectionUtils::nConfigs>, Data::EndPointsUtils::nConfigs> boundaryZBeginIdx;
                 std::array<Neon::size_4d, Data::EndPointsUtils::nConfigs>                               memPhyDim;
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
index 7f537ad5..35abdc50 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
@@ -38,42 +38,80 @@ class bPartition
                         Neon::int32_3d*                               mOrigin,
                         NghIdx*                                       mStencilNghIndex);
 
+    /**
+     * Retrieve the cardinality of the field.
+     */
     inline NEON_CUDA_HOST_DEVICE auto
     cardinality()
         const -> int;
 
+    /**
+     * Gets the field metadata at a cartesian point.
+     */
     inline NEON_CUDA_HOST_DEVICE auto
     operator()(const Idx& cell,
                int        card)
         -> T&;
 
+    /**
+     * Gets the field metadata at a cartesian point.
+     */
     inline NEON_CUDA_HOST_DEVICE auto
     operator()(const Idx& cell,
                int        card)
         const -> const T&;
 
+    /**
+     * Gets the field metadata at a neighbour cartesian point.
+     */
     NEON_CUDA_HOST_DEVICE inline auto
     getNghData(const Idx&    cell,
                const NghIdx& offset,
                const int     card)
         const -> NghData;
 
+    /**
+     * Gets the field metadata at a neighbour cartesian point.
+     */
     NEON_CUDA_HOST_DEVICE inline auto
     getNghData(const Idx& eId,
                uint8_t    nghID,
                int        card)
         const -> NghData;
 
+    /**
+     * Gets the field metadata at a neighbour cartesian point.
+     */
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx& eId,
+               int        card)
+        const -> NghData;
+
+    /**
+     * Gets the field metadata at a neighbour cartesian point.
+     */
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx& eId,
+               int        card,
+               T          defaultValue)
+        const -> NghData;
+
+    /**
+     * Gets the global coordinates of the cartesian point.
+     */
     NEON_CUDA_HOST_DEVICE inline auto
     getGlobalIndex(const Idx& cell)
         const -> Neon::index_3d;
 
-
+    /**
+     * Gets the Idx for in the block view space.
+     */
     NEON_CUDA_HOST_DEVICE inline auto
-    getBlockViewGridIdx(const Idx& cell)
+    getBlockViewIdx(const Idx& cell)
         const -> BlockViewGridIdx;
 
-
    protected:
     NEON_CUDA_HOST_DEVICE inline auto
     helpGetPitch(const Idx& cell, int card)
@@ -91,6 +129,10 @@ class bPartition
     helpGetNghIdx(const Idx& idx, const NghIdx& offset)
         const -> Idx;
 
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetNghIdx(const Idx& idx)
+        const -> Idx;
 
     int                                             mCardinality;
     T*                                              mMem;
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
index 6e3b728f..d8bbef08 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
@@ -50,7 +50,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
 
 template <typename T, int C, typename SBlock>
 NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
-    getBlockViewGridIdx(const Idx& gidx)
+    getBlockViewIdx(const Idx& gidx)
         const -> BlockViewGridIdx
 {
     BlockViewGridIdx res;
@@ -193,6 +193,96 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     }
 }
 
+template <typename T, int C, typename SBlock>
+template <int xOff, int yOff, int zOff>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
+    helpGetNghIdx(const Idx& idx)
+        const -> Idx
+{
+
+    typename Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + xOff,
+                                     idx.mInDataBlockIdx.y + yOff,
+                                     idx.mInDataBlockIdx.z + zOff);
+
+    /**
+     * 0 if no offset on the direction
+     * 1 positive offset
+     * -1 negative offset
+     */
+    const int xFlag = [&] {
+        if constexpr (xOff == 0) {
+            return 0;
+        } else {
+            return ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0);
+        }
+    }();
+
+
+    const int yFlag = [&] {
+        if constexpr (yOff == 0) {
+            return 0;
+        } else {
+            return ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0);
+        }
+    }();
+    const int zFlag = [&] {
+        if constexpr (zOff == 0) {
+            return 0;
+        } else {
+            return ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0);
+        }
+    }();
+
+    const bool isLocal = (xFlag | yFlag | zFlag) == 0;
+    if (!(isLocal)) {
+        typename Idx::InDataBlockIdx remoteInBlockOffset;
+        /**
+         * Example
+         * - 8 block (1D case)
+         * Case 1:
+         * |0,1,2,3|0,1,2,3|0,1,2,3|
+         *        ^     ^
+         *       -3     starting point
+         *
+         * - idx.inBlock = 2
+         * - offset = -1
+         * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3
+         * Case 2:
+         * |0,1,2,3|0,1,2,3|0,1,2,3|
+         *                ^     ^
+         *  starting point      +3 from 3
+         *
+         * - idx.inBlock = 3
+         * - offset = (+3,0)
+         * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2
+         *
+         * |0,1,2,3|0,1,2,3|0,1,2,3|
+         *  ^                   ^
+         *  -3 from 0          +3 from 3
+         *
+         * NOTE: if in one direction the neighbour offet is zero, xFalg is 0;
+         * */
+
+        Idx remoteNghIdx;
+        remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX;
+        remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX;
+        remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX;
+
+        int connectivityJump = idx.mDataBlockIdx * 27 +
+                               (xFlag + 1) +
+                               (yFlag + 1) * 3 +
+                               (zFlag + 1) * 9;
+        remoteNghIdx.mDataBlockIdx = mBlockConnectivity[connectivityJump];
+
+        return remoteNghIdx;
+    } else {
+        Idx localNghIdx;
+        localNghIdx.mDataBlockIdx = idx.mDataBlockIdx;
+        localNghIdx.mInDataBlockIdx = ngh;
+        return localNghIdx;
+    }
+}
+
 template <typename T, int C, typename SBlock>
 NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     getNghData(const Idx& eId,
@@ -223,4 +313,42 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     return result;
 }
 
+template <typename T, int C, typename SBlock>
+template <int xOff, int yOff, int zOff>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
+    getNghData(const Idx& idx,
+               int        card)
+        const -> NghData
+{
+    NghData result;
+    bIndex  nghIdx = helpGetNghIdx<xOff, yOff, zOff>(idx);
+    auto [isValid, pitch] = helpNghPitch(nghIdx, card);
+    if (!isValid) {
+        result.invalidate();
+        return result;
+    }
+    auto const value = mMem[pitch];
+    result.set(value, true);
+    return result;
+}
+
+template <typename T, int C, typename SBlock>
+template <int xOff, int yOff, int zOff>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
+    getNghData(const Idx& idx,
+               int        card,
+               T          defaultValue)
+        const -> NghData
+{
+    NghData result;
+    bIndex  nghIdx = helpGetNghIdx<xOff, yOff, zOff>(idx);
+    auto [isValid, pitch] = helpNghPitch(nghIdx, card);
+    if (!isValid) {
+        result.set(defaultValue, false);
+        return result;
+    }
+    auto const value = mMem[pitch];
+    result.set(value, true);
+    return result;
+}
 }  // namespace Neon::domain::details::bGrid
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
index cacac275..012a3588 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
@@ -74,8 +74,8 @@ class ePartition
 
    public:
     //-- [PUBLIC TYPES] ----------------------------------------------------------------------------
-    using Self = ePartition<T, C>;              //<- this type
-    using Idx = eIndex;                         //<- index type
+    using Self = ePartition<T, C>;            //<- this type
+    using Idx = eIndex;                       //<- index type
     using OuterIdx = typename Idx::OuterIdx;  //<- index type for the subGrid
 
     static constexpr int Cardinality = C;
@@ -147,15 +147,15 @@ class ePartition
     operator()(Idx eId, int cardinalityIdx)
         -> T&;
 
-//    template <typename ComputeType>
-//    NEON_CUDA_HOST_DEVICE inline auto
-//    castRead(Idx eId, int cardinalityIdx) const
-//        -> ComputeType;
-//
-//    template <typename ComputeType>
-//    NEON_CUDA_HOST_DEVICE inline auto
-//    castWrite(Idx eId, int cardinalityIdx, const ComputeType& value)
-//        -> void;
+    //    template <typename ComputeType>
+    //    NEON_CUDA_HOST_DEVICE inline auto
+    //    castRead(Idx eId, int cardinalityIdx) const
+    //        -> ComputeType;
+    //
+    //    template <typename ComputeType>
+    //    NEON_CUDA_HOST_DEVICE inline auto
+    //    castWrite(Idx eId, int cardinalityIdx, const ComputeType& value)
+    //        -> void;
     /**
      * Retrieve value of a neighbour for a field with multiple cardinalities
      * @tparam dataView_ta
@@ -165,9 +165,9 @@ class ePartition
      * @return
      */
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(Idx         eId,
-               NghIdx      nghIdx,
-               int         card)
+    getNghData(Idx    eId,
+               NghIdx nghIdx,
+               int    card)
         const -> NghData;
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -176,7 +176,18 @@ class ePartition
                int                  card)
         const -> NghData;
 
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(Idx eId,
+               int card)
+        const -> NghData;
 
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(Idx eId,
+               int card,
+               T defaultValue)
+        const -> NghData;
     /**
      * Check is the
      * @tparam dataView_ta
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h
index c2ff1ae0..0063ee9e 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h
@@ -87,6 +87,43 @@ ePartition<T, C>::getNghData(eIndex               eId,
     return res;
 }
 
+template <typename T,
+          int C>
+template <int xOff, int yOff, int zOff>
+NEON_CUDA_HOST_DEVICE inline auto
+ePartition<T, C>::getNghData(eIndex               eId,
+                             int                  card)
+    const -> NghData
+{
+    int tablePithc = (xOff + mStencilRadius) +
+                     (yOff + mStencilRadius) * mStencilTableYPitch +
+                     (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
+    NghIdx  nghIdx = mStencil3dTo1dOffset[tablePithc];
+    NghData res = getNghData(eId, nghIdx, card);
+
+    return res;
+}
+
+template <typename T,
+          int C>
+template <int xOff, int yOff, int zOff>
+NEON_CUDA_HOST_DEVICE inline auto
+ePartition<T, C>::getNghData(eIndex               eId,
+                             int                  card,
+                             T defaultVal)
+    const -> NghData
+{
+    int tablePithc = (xOff + mStencilRadius) +
+                     (yOff + mStencilRadius) * mStencilTableYPitch +
+                     (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
+    NghIdx  nghIdx = mStencil3dTo1dOffset[tablePithc];
+    NghData res = getNghData(eId, nghIdx, card);
+    if (!res.isValid()) {
+        res.set(defaultVal, false);
+    }
+    return res;
+}
+
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
diff --git a/libNeonSet/include/Neon/set/Containter_imp.h b/libNeonSet/include/Neon/set/Containter_imp.h
index 534d92ff..f7f421e6 100644
--- a/libNeonSet/include/Neon/set/Containter_imp.h
+++ b/libNeonSet/include/Neon/set/Containter_imp.h
@@ -48,6 +48,7 @@ auto Container::factory(const std::string&                                 name,
         std::shared_ptr<Neon::set::internal::ContainerAPI> tmp(k);
         return {tmp};
     }
+    NEON_THROW_UNSUPPORTED_OPERATION("Execution type not supported");
 }
 
 template <typename DataContainerT, typename UserLoadingLambdaT>

From 90a4ba9f628b93e86edf440b3df883cf83ee7ab2 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 19 Jun 2023 10:47:29 -0400
Subject: [PATCH 09/25] Code documentation

---
 .../include/Neon/domain/details/bGrid/bGrid.h |  7 +-
 .../Neon/domain/details/bGrid/bGrid_imp.h     |  6 +-
 .../include/Neon/domain/tools/Partitioner1D.h | 74 +++++++++++++------
 3 files changed, 58 insertions(+), 29 deletions(-)

diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
index 8ed458c8..62ae8ad6 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
@@ -71,9 +71,8 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
           const Neon::int32_3d&        domainSize /**< Size of the bounded Cartesian */,
           const ActiveCellLambda       activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization  */,
           const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */,
-          const int                    voxelSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing  and (i+1)* voxelSpacing.
-                                                     * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/
-          ,
+          const int                    multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing  and (i+1)* voxelSpacing.
+                                                     * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/,
           const double_3d& spacingData = double_3d(1, 1, 1) /** Physical spacing between two consecutive data points in the Cartesian domain */,
           const double_3d& origin = double_3d(0, 0, 0) /** Physical location in space of the origin of the Cartesian discretization */);
 
@@ -212,7 +211,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
 
         tool::Partitioner1D::DenseMeta denseMeta;
 
-        int voxelSpacing;
+        int mMultiResDiscreteIdxSpacing;
 
         // number of active voxels in each block
         Neon::set::DataSet<uint64_t> mNumActiveVoxel;
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
index b921a3e1..85da8a62 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
@@ -20,7 +20,7 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
                      const Neon::int32_3d&        domainSize,
                      const ActiveCellLambda       activeCellLambda,
                      const Neon::domain::Stencil& stencil,
-                     const int                    voxelSpacing,
+                     const int                    multiResDiscreteIdxSpacing,
                      const double_3d&             spacingData,
                      const double_3d&             origin)
 {
@@ -29,7 +29,7 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
     mData = std::make_shared<Data>();
     mData->init(backend);
 
-    mData->voxelSpacing = voxelSpacing;
+    mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing;
     mData->stencil = stencil;
     const index_3d defaultKernelBlockSize(SBlock::memBlockSizeX,
                                           SBlock::memBlockSizeY,
@@ -45,7 +45,7 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
                               stencil,
                               nElementsPerPartition,
                               defaultKernelBlockSize,
-                              voxelSpacing,
+                              multiResDiscreteIdxSpacing,
                               origin);
     }
 
diff --git a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h
index e162512c..0204098c 100644
--- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h
+++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h
@@ -8,6 +8,34 @@
 
 namespace Neon::domain::tool {
 
+/**
+ * Abstraction for a partitioner on a 1D domain.
+ *
+ * Partitioning is executed over the cartesian index space of the domain.
+ * The Partitioner works at the block granularity. The block size is defined by the user.
+ *
+ * The partitioning is done in thee steps:
+ * a. [DOMAIN DECOMPOSITION] - Projecting of the blocks into the Z-axis and then applying a uniform partitioning schema.
+ *    Definition of the span of each partition is the final result of this step.
+ *
+ * b. [CLASSIFIER] - For each partition, the indexes in a partition span are classified twice:
+ *    - First, the indexes are classified according to the data view configuration.
+ *       - INTERNAL: The span is fully contained in the partition.
+ *       - BOUNDARY: The span is partially contained in the partition.
+ *       - GHOST: The span is not contained in the partition.
+ *    - Second, the indexes are classified according to the boundary conditions. This is a user driven classification
+ *
+ * c. [LAYOUT] - The final step is to layout the indexes in memory, i.e. decide for each index its position in a 1D array.
+ *
+ * The final layout of each partitioning will look like the following:
+ *
+ * --------------------------------------------------------------------
+ * | Internal  |       Boundary         |            Ghost            |
+ * |           |    UP     |     DW     |      UP      |      Dw      |
+ * | Bulk | Bc | Bulk | Bc | Bulk | Bc  | Bulk |  Bc   | Bulk |   Bc  |
+ * --------------------------------------------------------------------
+ *
+ */
 class Partitioner1D
 {
    public:
@@ -75,59 +103,59 @@ class Partitioner1D
         Meta                 invalidMeta;
     };
 
-    template <typename ActiveCellLambda,
+    template <typename ActiveIndexLambda,
               typename BcLambda>
     Partitioner1D(const Neon::Backend&        backend,
-                  const ActiveCellLambda&     activeCellLambda,
+                  const ActiveIndexLambda&    activeIndexLambda,
                   const BcLambda&             bcLambda,
                   const Neon::index_3d&       dataBlockSize,
                   const Neon::int32_3d&       domainSize,
                   const Neon::domain::Stencil stencil,
-                  const int&                  discreteVoxelSpacing = 1)
+                  const int&                  multiResDiscreteIdxSpacing = 1)
     {
         mData = std::make_shared<Data>();
 
         mData->mDataBlockSize = dataBlockSize;
-        mData->mDiscreteVoxelSpacing = discreteVoxelSpacing;
+        mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing;
         mData->mStencil = stencil;
         mData->mDomainSize = domainSize;
 
-        Neon::int32_3d block3DSpan(NEON_DIVIDE_UP(domainSize.x, dataBlockSize.x),
-                                   NEON_DIVIDE_UP(domainSize.y, dataBlockSize.y),
-                                   NEON_DIVIDE_UP(domainSize.z, dataBlockSize.z));
+        // Block space interval (i.e. indexing space at the block granularity)
 
-        mData->block3DSpan = block3DSpan;
+        mData->block3DSpan = Neon::int32_3d(NEON_DIVIDE_UP(domainSize.x, dataBlockSize.x),
+                                            NEON_DIVIDE_UP(domainSize.y, dataBlockSize.y),
+                                            NEON_DIVIDE_UP(domainSize.z, dataBlockSize.z));
 
         std::vector<int> nBlockProjectedToZ(block3DSpan.z);
 
         auto block3dIdxToBlockOrigin = [&](Neon::int32_3d const& block3dIdx) {
-            Neon::int32_3d blockOrigin(block3dIdx.x * dataBlockSize.x * discreteVoxelSpacing,
-                                       block3dIdx.y * dataBlockSize.y * discreteVoxelSpacing,
-                                       block3dIdx.z * dataBlockSize.z * discreteVoxelSpacing);
+            Neon::int32_3d blockOrigin(block3dIdx.x * dataBlockSize.x * multiResDiscreteIdxSpacing,
+                                       block3dIdx.y * dataBlockSize.y * multiResDiscreteIdxSpacing,
+                                       block3dIdx.z * dataBlockSize.z * multiResDiscreteIdxSpacing);
             return blockOrigin;
         };
 
         auto getVoxelAbsolute3DIdx = [&](Neon::int32_3d const& blockOrigin,
                                          Neon::int32_3d const& voxelRelative3DIdx) {
-            const Neon::int32_3d id(blockOrigin.x + voxelRelative3DIdx.x * discreteVoxelSpacing,
-                                    blockOrigin.y + voxelRelative3DIdx.y * discreteVoxelSpacing,
-                                    blockOrigin.z + voxelRelative3DIdx.z * discreteVoxelSpacing);
+            const Neon::int32_3d id(blockOrigin.x + voxelRelative3DIdx.x * multiResDiscreteIdxSpacing,
+                                    blockOrigin.y + voxelRelative3DIdx.y * multiResDiscreteIdxSpacing,
+                                    blockOrigin.z + voxelRelative3DIdx.z * multiResDiscreteIdxSpacing);
             return id;
         };
 
         mData->spanDecomposition = std::make_shared<partitioning::SpanDecomposition>(
             backend,
-            activeCellLambda,
+            activeIndexLambda,
             block3dIdxToBlockOrigin,
             getVoxelAbsolute3DIdx,
             block3DSpan,
             dataBlockSize,
             domainSize,
-            discreteVoxelSpacing);
+            multiResDiscreteIdxSpacing);
 
         mData->mSpanClassifier = std::make_shared<partitioning::SpanClassifier>(
             backend,
-            activeCellLambda,
+            activeIndexLambda,
             bcLambda,
             block3dIdxToBlockOrigin,
             getVoxelAbsolute3DIdx,
@@ -135,7 +163,7 @@ class Partitioner1D
             dataBlockSize,
             domainSize,
             stencil,
-            discreteVoxelSpacing,
+            multiResDiscreteIdxSpacing,
             mData->spanDecomposition);
 
         mData->mSpanLayout = std::make_shared<partitioning::SpanLayout>(
@@ -147,10 +175,12 @@ class Partitioner1D
                                           mData->mSpanLayout->getStandardAndGhostCount().typedClone<size_t>(), {251, 1, 1});
     }
 
-    auto getBlockSpan() -> Neon::int32_3d
+    auto getBlockSpan() const
+        -> Neon::int32_3d
     {
         return mData->block3DSpan;
     }
+    
     auto getMemoryGrid() -> Neon::aGrid&
     {
         return mData->mTopologyWithGhost;
@@ -207,7 +237,7 @@ class Partitioner1D
 
                                 aGrid::Cell    idx(count);
                                 Neon::int32_3d point3d = mapperVec[j];
-                                point3d = point3d * mData->mDiscreteVoxelSpacing * mData->mDataBlockSize;
+                                point3d = point3d * mData->mMultiResDiscreteIdxSpacing * mData->mDataBlockSize;
                                 partition(idx, 0) = point3d;
                                 count++;
                             }
@@ -349,7 +379,7 @@ class Partitioner1D
                                     if (findings.first) {
                                         targetNgh = findings.second;
                                     }
-                                    aGrid::Cell aIdx(static_cast <aGrid::Cell::Location>(start + blockIdx));
+                                    aGrid::Cell aIdx(static_cast<aGrid::Cell::Location>(start + blockIdx));
                                     partition(aIdx, s) = targetNgh;
                                 }
                             }
@@ -402,7 +432,7 @@ class Partitioner1D
     {
        public:
         Neon::index_3d                        mDataBlockSize = 0;
-        int                                   mDiscreteVoxelSpacing = 0;
+        int                                   mMultiResDiscreteIdxSpacing = 0;
         Neon::domain::Stencil                 mStencil;
         Neon::index_3d                        mDomainSize;
         Neon::int32_3d                        block3DSpan;

From 019db4d6a03e771b8309d3d2291ccb151e071e98 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 19 Jun 2023 17:38:57 -0400
Subject: [PATCH 10/25] Fixing grid spacing in bGrid.

---
 .../Neon/domain/details/bGrid/bField.h        | 10 +--------
 .../Neon/domain/details/bGrid/bField_imp.h    | 21 ++++++++++++++-----
 .../include/Neon/domain/details/bGrid/bGrid.h | 15 ++++++++++---
 .../Neon/domain/details/bGrid/bGrid_imp.h     | 19 +++++++++++++----
 .../Neon/domain/details/bGrid/bPartition.h    |  1 +
 .../domain/details/bGrid/bPartition_imp.h     |  5 ++++-
 .../include/Neon/domain/tools/Partitioner1D.h |  6 +++---
 7 files changed, 52 insertions(+), 25 deletions(-)

diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
index d4d663fd..565ae518 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h
@@ -84,13 +84,6 @@ class bField : public Neon::domain::interface::FieldBaseTemplate<T,
     auto initHaloUpdateTable() -> void;
 
 
-    //
-    //    enum PartitionBackend
-    //    {
-    //        cpu = 0,
-    //        gpu = 1,
-    //    };
-
     struct Data
     {
         Data() = default;
@@ -112,8 +105,7 @@ class bField : public Neon::domain::interface::FieldBaseTemplate<T,
 
         std::shared_ptr<Grid> grid;
         BlockViewField<T, 0>  memoryField;
-
-        int mCardinality;
+        int                   cardinality;
 
         //        Neon::domain::tool::HaloTable1DPartitioning   latticeHaloUpdateTable;
         Neon::domain::tool::HaloTable1DPartitioning soaHaloUpdateTable;
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
index 29a71248..52802f1c 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
@@ -79,11 +79,22 @@ template <typename T, int C, typename SBlock>
 auto bField<T, C, SBlock>::getReference(const Neon::index_3d& cartesianIdx,
                                         const int&            cardinality) -> T&
 {
-    auto& grid = this->getGrid();
-    auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx);
-    auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD);
-    auto& result = partition(bIdx, cardinality);
-    return result;
+    if constexpr (SBlock::isMultiResMode) {
+        auto& grid = this->getGrid();
+        auto  uniformCartesianIdx = cartesianIdx / grid.helpGetMultiResFactor();
+        auto  uniformCartesianIdxTruncation = cartesianIdx % grid.helpGetMultiResFactor();
+        static_assert(uniformCartesianIdxTruncation == 0);
+        auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(uniformCartesianIdx);
+        auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD);
+        auto& result = partition(bIdx, cardinality);
+        return result;
+    } else {
+        auto& grid = this->getGrid();
+        auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx);
+        auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD);
+        auto& result = partition(bIdx, cardinality);
+        return result;
+    }
 }
 
 template <typename T, int C, typename SBlock>
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
index 62ae8ad6..d94d1aa1 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
@@ -72,9 +72,10 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
           const ActiveCellLambda       activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization  */,
           const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */,
           const int                    multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing  and (i+1)* voxelSpacing.
-                                                     * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/,
-          const double_3d& spacingData = double_3d(1, 1, 1) /** Physical spacing between two consecutive data points in the Cartesian domain */,
-          const double_3d& origin = double_3d(0, 0, 0) /** Physical location in space of the origin of the Cartesian discretization */);
+                                                                   * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */
+          ,
+          const double_3d& spacingData /** Physical spacing between two consecutive data points in the Cartesian domain */,
+          const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */);
 
     /**
      * Returns some properties for a given cartesian in the Cartesian domain.
@@ -159,12 +160,20 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
      */
     auto getBlockViewGrid() const -> BlockView::Grid&;
 
+
     /**
      * Retrieve the block vew grid internally used.
      * This grid can be leverage to allocate data at the block level.
      */
     auto getActiveBitMask() const -> BlockView::Field<typename SBlock::BitMask, 1>&;
 
+    /**
+     * Helper function to retrieve the discrete index spacing used for the multi-resolution
+     */
+    template <int dummy = SBlock::isMultiResMode>
+    auto helGetMultiResDiscreteIdxSpacing() const -> std::enable_if_t<dummy == 1, int>;
+
+
     /**
      * Help function to retrieve the block connectivity as a BlockViewGrid field
      */
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
index 85da8a62..bde200e3 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
@@ -58,7 +58,7 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
             SBlock::memBlockSize3D.template newType<int32_t>(),
             domainSize,
             Neon::domain::Stencil::s27_t(false),
-            1);
+            multiResDiscreteIdxSpacing);
 
         mData->mDataBlockOriginField = mData->partitioner1D.getGlobalMapping();
         mData->mStencil3dTo1dOffset = mData->partitioner1D.getStencil3dTo1dOffset();
@@ -95,7 +95,7 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
             .getGrid()
             .template newContainer<Neon::Execution::host>(
                 "activeBitMaskInit",
-                [&](Neon::set::Loader& loader) {
+                [&, this](Neon::set::Loader& loader) {
                     auto bitMaskPartition = loader.load(mData->activeBitField);
                     return [&, bitMaskPartition](const auto& bitMaskIdx) mutable {
                         auto                      prtIdx = bitMaskPartition.prtID();
@@ -107,9 +107,9 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
                         for (int k = 0; k < SBlock::memBlockSize3D.template newType<int32_t>().z; k++) {
                             for (int j = 0; j < SBlock::memBlockSize3D.template newType<int32_t>().y; j++) {
                                 for (int i = 0; i < SBlock::memBlockSize3D.template newType<int32_t>().x; i++) {
-                                    auto globalPosition = blockOrigin + Neon::int32_3d(i, j, k);
+                                    auto       globalPosition = blockOrigin + Neon::int32_3d(i, j, k);
                                     bool const isInDomain = globalPosition < domainSize;
-                                    bool const isActive = activeCellLambda(globalPosition);
+                                    bool const isActive = activeCellLambda(globalPosition * mData->mMultiResDiscreteIdxSpacing);
                                     if (isActive && isInDomain) {
                                         countActive++;
                                         bitMask.setActive(i, j, k);
@@ -319,6 +319,17 @@ auto bGrid<SBlock>::
     return mData->activeBitField;
 }
 
+/**
+ * Helper function to retrieve the discrete index spacing used for the multi-resolution
+ */
+template <typename SBlock>
+template <int dummy>
+auto bGrid<SBlock>::helGetMultiResDiscreteIdxSpacing() const
+    -> std::enable_if_t<dummy == 1, int>
+{
+    return mData->mMultiResDiscreteIdxSpacing;
+}
+
 template <typename SBlock>
 auto bGrid<SBlock>::
     helpGetBlockConnectivity()
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
index 35abdc50..73ccb914 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
@@ -141,6 +141,7 @@ class bPartition
     typename SBlock::BitMask const* NEON_RESTRICT   mMask;
     Neon::int32_3d const* NEON_RESTRICT             mOrigin;
     int                                             mSetIdx;
+    int                                             mMultiResDiscreteIdxSpacing = 1;
 };
 
 }  // namespace Neon::domain::details::bGrid
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
index d8bbef08..dc4c5880 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
@@ -45,7 +45,10 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     location.x += gidx.mInDataBlockIdx.x;
     location.y += gidx.mInDataBlockIdx.y;
     location.z += gidx.mInDataBlockIdx.z;
-    return location;
+    if constexpr (SBlock::isMultiResMode){
+        return location * mMultiResDiscreteIdxSpacing;
+    }
+    return location ;
 }
 
 template <typename T, int C, typename SBlock>
diff --git a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h
index 0204098c..ac49dc6f 100644
--- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h
+++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h
@@ -126,7 +126,7 @@ class Partitioner1D
                                             NEON_DIVIDE_UP(domainSize.y, dataBlockSize.y),
                                             NEON_DIVIDE_UP(domainSize.z, dataBlockSize.z));
 
-        std::vector<int> nBlockProjectedToZ(block3DSpan.z);
+        std::vector<int> nBlockProjectedToZ(mData->block3DSpan.z);
 
         auto block3dIdxToBlockOrigin = [&](Neon::int32_3d const& block3dIdx) {
             Neon::int32_3d blockOrigin(block3dIdx.x * dataBlockSize.x * multiResDiscreteIdxSpacing,
@@ -148,7 +148,7 @@ class Partitioner1D
             activeIndexLambda,
             block3dIdxToBlockOrigin,
             getVoxelAbsolute3DIdx,
-            block3DSpan,
+            mData->block3DSpan,
             dataBlockSize,
             domainSize,
             multiResDiscreteIdxSpacing);
@@ -159,7 +159,7 @@ class Partitioner1D
             bcLambda,
             block3dIdxToBlockOrigin,
             getVoxelAbsolute3DIdx,
-            block3DSpan,
+            mData->block3DSpan,
             dataBlockSize,
             domainSize,
             stencil,

From 588b74601d393955714aead173d82dc161327182 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 22 Jun 2023 18:51:30 -0400
Subject: [PATCH 11/25] WIP

---
 .../src/RunCavityTwoPop.cu                    |   4 +
 .../Neon/core/types/vec/vec4d_integer.tdecl.h |   1 +
 .../Neon/domain/details/dGrid/dPartition.h    |  40 +-
 .../Neon/domain/details/dGridSoA/dGridSoA.h   |  97 +++++
 .../Neon/domain/details/dGridSoA/dIndexSoA.h  |  53 +++
 .../domain/details/dGridSoA/dIndexSoA_imp.h   |  50 +++
 .../domain/details/dGridSoA/dPartitionSoA.h   | 342 ++++++++++++++++++
 .../Neon/domain/details/dGridSoA/dSpanSoA.h   |  52 +++
 .../domain/details/dGridSoA/dSpanSoA_imp.h    |  71 ++++
 9 files changed, 696 insertions(+), 14 deletions(-)
 create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h
 create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h
 create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h
 create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
 create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h
 create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h

diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
index c603415c..d28688d1 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
@@ -3,6 +3,7 @@
 #include "Neon/domain/bGrid.h"
 #include "Neon/domain/dGrid.h"
 #include "Neon/domain/eGrid.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 
 #include "CellType.h"
 #include "LbmIteration.h"
@@ -313,5 +314,8 @@ auto run(Config& config,
     if (config.gridType == "bGrid") {
         return details::runFilterStoreType<Neon::bGrid>(config, report);
     }
+    if (config.gridType == "dGridSoA") {
+        return details::runFilterStoreType<Neon::domain::details::dGridSoA::dSpanSoA>(config, report);
+    }
 }
 }  // namespace CavityTwoPop
diff --git a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h
index 788291a6..940c6d2c 100644
--- a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h
+++ b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h
@@ -58,6 +58,7 @@ template <typename IntegerType_ta>
 class Vec_4d<IntegerType_ta, true, false>
 {
    public:
+    using Integer = IntegerType_ta;
     using element_t = IntegerType_ta;
     using self_t = Vec_4d<element_t, true, false>;
 
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
index 196f6b70..31e480aa 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
@@ -150,13 +150,13 @@ class dPartition
         return NghData(val, isValidNeighbour);
     }
 
-    template <int xOff, int yOff, int zOff, typename LambdaVALID, typename LambdaNOTValid = void* >
+    template <int xOff, int yOff, int zOff, typename LambdaVALID, typename LambdaNOTValid = void*>
     NEON_CUDA_HOST_DEVICE inline auto
     getNghData(const Idx&     eId,
                int            card,
                LambdaVALID    funIfValid,
                LambdaNOTValid funIfNOTValid = nullptr)
-        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T> , void>
+        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T>, void>
     {
         Idx        cellNgh;
         const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(eId, cellNgh);
@@ -419,19 +419,31 @@ class dPartition
         return;
     }
 
+    auto getDataView()
+        -> Neon::DataView
+    {
+        return m_dataView;
+    }
+
+    auto helpGetGlobalToLocalOffets() const
+        -> NghIdx const*
+    {
+        return mStencil;
+    }
+
    private:
-    Neon::DataView m_dataView;
-    T*             m_mem;
-    Neon::index_3d m_dim;
-    int            m_zHaloRadius;
-    int            m_zBoundaryRadius;
-    Pitch          m_pitch;
-    int            m_prtID;
-    Neon::index_3d m_origin;
-    int            m_cardinality;
-    Neon::index_3d m_fullGridSize;
-    bool           mPeriodicZ;
-    NghIdx*        mStencil;
+    Neon::DataView        m_dataView;
+    T* NEON_RESTRICT      m_mem;
+    Neon::index_3d        m_dim;
+    int                   m_zHaloRadius;
+    int                   m_zBoundaryRadius;
+    Pitch                 m_pitch;
+    int                   m_prtID;
+    Neon::index_3d        m_origin;
+    int                   m_cardinality;
+    Neon::index_3d        m_fullGridSize;
+    bool                  mPeriodicZ;
+    NghIdx* NEON_RESTRICT mStencil;
 };
 
 
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h
new file mode 100644
index 00000000..61b182b2
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h
@@ -0,0 +1,97 @@
+#pragma once
+#include <assert.h>
+
+#include "Neon/core/core.h"
+#include "Neon/core/types/DataUse.h"
+#include "Neon/core/types/Macros.h"
+
+#include "Neon/set/BlockConfig.h"
+#include "Neon/set/Containter.h"
+#include "Neon/set/DevSet.h"
+#include "Neon/set/MemoryOptions.h"
+
+#include "Neon/sys/memory/MemDevice.h"
+
+#include "Neon/domain/aGrid.h"
+
+#include "Neon/domain/interface/GridBaseTemplate.h"
+#include "Neon/domain/interface/GridConcept.h"
+#include "Neon/domain/interface/KernelConfig.h"
+#include "Neon/domain/interface/LaunchConfig.h"
+#include "Neon/domain/interface/Stencil.h"
+#include "Neon/domain/interface/common.h"
+
+#include "Neon/domain/tools/GridTransformer.h"
+#include "Neon/domain/tools/SpanTable.h"
+
+#include "Neon/domain/details/eGrid/eGrid.h"
+#include "Neon/domain/patterns/PatternScalar.h"
+
+#include "dPartitionSoA.h"
+
+namespace Neon::domain::details::dGridSoA {
+
+namespace details {
+struct dGridSoATransformation
+{
+    template <typename T, int C>
+    using Partition = dPartitionSoA<T, C>;
+    using Span = Neon::domain::details::eGrid::eSpan;
+    static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on;
+
+    using FoundationGrid = Neon::domain::details::eGrid::eGrid;
+    static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan;
+    using ExecutionThreadSpanIndexType = int32_t;
+    using Idx = FoundationGrid::Idx;
+
+    static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const&
+    {
+        return foundationGrid.getDefaultBlock();
+    }
+
+    static auto initSpan(FoundationGrid& foundationGrid, Neon::domain::tool::SpanTable<Span>& spanTable) -> void
+    {
+        spanTable.forEachConfiguration([&](Neon::Execution execution,
+                                           Neon::SetIdx    setIdx,
+                                           Neon::DataView  dw,
+                                           Span&           span) {
+            span = foundationGrid.getSpan(execution, setIdx, dw);
+        });
+    }
+
+    static auto initLaunchParameters(FoundationGrid&       foundationGrid,
+                                     Neon::DataView        dataView,
+                                     const Neon::index_3d& blockSize,
+                                     const size_t&         shareMem) -> Neon::set::LaunchParameters
+    {
+        return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem);
+    }
+
+    static auto helpGetGridIdx(FoundationGrid&,
+                               Neon::SetIdx const&,
+                               FoundationGrid::Idx const& fgIdx)
+        -> GridTransformation::Idx
+    {
+        GridTransformation::Idx tgIdx = fgIdx;
+        return tgIdx;
+    }
+
+    template <typename T, int C>
+    static auto initFieldPartition(FoundationGrid::Field<T, C>&                         foundationField,
+                                   Neon::domain::tool::PartitionTable<Partition<T, C>>& partitionTable) -> void
+    {
+        partitionTable.forEachConfiguration(
+            [&](Neon::Execution  execution,
+                Neon::SetIdx     setIdx,
+                Neon::DataView   dw,
+                Partition<T, C>& partition) {
+                auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw);
+                partition = Partition<T, C>(foundationPartition);
+            });
+    }
+};
+
+}  // namespace details
+using dGridSoA = Neon::domain::tool::GridTransformer<details::dGridSoATransformation>::Grid;
+
+}  // namespace Neon::domain::details::dGridSoA
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h
new file mode 100644
index 00000000..2ed82d86
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "Neon/core/core.h"
+#include "Neon/domain/details/dGridSoA/dIndexSoA.h"
+
+namespace Neon::domain::details::dGridSoA {
+
+// Common forward declarations
+class dSpanSoA;
+template <typename T, int C>
+class dPartitionSoA;
+
+struct dIndexSoA
+{
+    using OuterIdx = dIndexSoA;
+
+    template <typename T, int C>
+    friend class dPartition;
+    friend dSpanSoA;
+
+    template <typename T,
+              int Cardinality>
+    friend class dField;
+
+    // dGrid specific types
+    using Offset = int32_t;
+    using Location = index_3d;
+    using Count = int32_t;
+
+    dIndexSoA() = default;
+    Location mLocation = 0;
+    Offset   mOffset = 0;
+
+    NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location const& location,
+                                                    Offset const&   offset);
+
+    NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location::Integer const& x,
+                                                 Location::Integer const& y,
+                                                 Location::Integer const& z,
+                                                 Offset const&            offset);
+
+    NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&;
+
+    NEON_CUDA_HOST_DEVICE inline auto setOffset() -> Offset&;
+
+    NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&;
+
+    NEON_CUDA_HOST_DEVICE inline auto getOffset() const -> const Offset&;
+};
+
+}  // namespace Neon::domain::details::dGridSoA
+
+#include "dIndexSoA_imp.h"
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h
new file mode 100644
index 00000000..790608c7
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h
@@ -0,0 +1,50 @@
+#pragma once
+#include "Neon/core/core.h"
+
+namespace Neon::domain::details::dGridSoA {
+
+NEON_CUDA_HOST_DEVICE inline dIndexSoA::
+    dIndexSoA(const Location& location,
+              Offset const&   offset)
+{
+    mLocation = location;
+    mOffset = offset;
+}
+
+NEON_CUDA_HOST_DEVICE inline dIndexSoA::
+    dIndexSoA(const Location::Integer& x,
+              const Location::Integer& y,
+              const Location::Integer& z,
+              Offset const&            offset)
+{
+    mLocation.x = x;
+    mLocation.y = y;
+    mLocation.z = z;
+    mOffset = offset;
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dIndexSoA::
+    setLocation() -> Location&
+{
+    return mLocation;
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dIndexSoA::
+    setOffset() -> Offset&
+{
+    return mOffset;
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dIndexSoA::
+    getLocation() const -> const Location&
+{
+    return mLocation;
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dIndexSoA::
+    getOffset()
+        const -> const Offset&
+{
+    return mOffset;
+}
+}  // namespace Neon::domain::details::dGridSoA
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
new file mode 100644
index 00000000..fc4c3642
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
@@ -0,0 +1,342 @@
+#pragma once
+#include <assert.h>
+#include "Neon/core/core.h"
+#include "Neon/core/types/Macros.h"
+#include "Neon/domain/details/dGrid/dGrid.h"
+#include "Neon/domain/interface/NghData.h"
+#include "Neon/set/DevSet.h"
+#include "Neon/sys/memory/CudaIntrinsics.h"
+#include "cuda_fp16.h"
+#include "dIndexSoA.h"
+
+namespace Neon::domain::details::dGridSoA {
+
+template <typename T,
+          int C = 1>
+class dPartitionSoA
+{
+   public:
+    using Idx = dIndexSoA;
+    using NghData = Neon::domain::NghData<T>;
+    using Pitch = uint32_4d;
+
+    dPartitionSoA()
+    {
+    }
+
+    dPartitionSoA(Neon::domain::details::dGrid::dPartition<T, C> const& dPartitionOriginal)
+    {
+        mDataView = dPartitionOriginal.getDataView();
+        mMem = dPartitionOriginal.mem();
+        mDim = dPartitionOriginal.dim();
+        mZHaloRadius = dPartitionOriginal.halo().z;
+        mPitch = dPartitionOriginal.getPitchData().template newType<Pitch::Integer>();
+        mPrtID = dPartitionOriginal.prtID();
+        mOrigin = dPartitionOriginal.origin();
+        mCardinality = dPartitionOriginal.cardinality();
+        mFullGridSize = dPartitionOriginal.fullGridSize();
+        NghIdx* mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets();
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    prtID()
+        const -> int
+    {
+        return mPrtID();
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    cardinality()
+        const -> int
+    {
+        return mCardinality();
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    getPitchData()
+        const -> const Pitch&
+    {
+        return mPitch;
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    getPitch(const Idx& idx,
+             int        cardinality)
+        -> Idx::Offset
+    {
+        return idx.getLocationOffset() + cardinality * mPitch.w;
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    dim()
+        const -> const Neon::index_3d
+    {
+        return mDim();
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    halo()
+        const -> const Neon::index_3d
+    {
+        return mDPartition.halo();
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    origin()
+        const -> const Neon::index_3d
+    {
+        return m_ormDPartition.origin();
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx& gidx,
+               NghIdx     nghOffset,
+               int        card,
+               const T&   alternativeVal)
+        const -> NghData
+    {
+        Idx        gidxNgh;
+        const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh);
+        T          val = alternativeVal;
+        if (isValidNeighbour) {
+            val = operator()(gidxNgh, card);
+        }
+        return NghData(val, isValidNeighbour);
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx& gidx,
+               NghIdx     nghOffset,
+               int        card)
+        const -> NghData
+    {
+        Idx        gidxNgh;
+        const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh);
+        T          val;
+        if (isValidNeighbour) {
+            val = operator()(gidxNgh, card);
+        }
+        return NghData(val, isValidNeighbour);
+    }
+
+    template <int xOff,
+              int yOff,
+              int zOff,
+              typename LambdaVALID,
+              typename LambdaNOTValid = void*>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx&     gidx,
+               int            card,
+               LambdaVALID    funIfValid,
+               LambdaNOTValid funIfNOTValid = nullptr)
+        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T>, void>
+    {
+        Idx        gidxNgh;
+        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        if (isValidNeighbour) {
+            T val = this->operator()(gidxNgh, card);
+            funIfValid(val);
+        }
+        if constexpr (!std::is_same_v<LambdaNOTValid, void*>) {
+            if (!isValidNeighbour) {
+                funIfNOTValid();
+            }
+        }
+    }
+
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx& gidx,
+               int        card)
+        const -> NghData
+    {
+        NghData    res;
+        Idx        gidxNgh;
+        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        if (isValidNeighbour) {
+            T val = operator()(gidxNgh, card);
+            res.set(val, true);
+        } else {
+            res.invalidate();
+        }
+        return res;
+    }
+
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx& gidx,
+               int        card,
+               T const&   defaultValue)
+        const -> NghData
+    {
+        NghData    res(defaultValue, false);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        if (isValidNeighbour) {
+            T val = operator()(gidxNgh, card);
+            res.set(val, true);
+        }
+        return res;
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    nghVal(const Idx& gidx,
+           uint8_t    nghID,
+           int        card,
+           const T&   alternativeVal)
+        const -> NghData
+    {
+        NghIdx nghOffset = mStencil[nghID];
+        return getNghData(gidx, nghOffset, card, alternativeVal);
+    }
+
+    /**
+     * Get the index of the neighbor given the offset
+     * @tparam dataView_ta
+     * @param[in] gidx Index of the current element
+     * @param[in] nghOffset Offset of the neighbor of interest from the current element
+     * @param[in,out] neighbourIdx Index of the neighbor
+     * @return Whether the neighbour is valid
+     */
+    NEON_CUDA_HOST_DEVICE inline auto
+    nghIdx(const Idx&    gidx,
+           const NghIdx& nghOffset,
+           Idx&          neighbourIdx)
+        const -> bool
+    {
+        Neon::index_3d cartesian(gidx.get().x + nghOffset.x,
+                                 gidx.get().y + nghOffset.y,
+                                 gidx.get().z + nghOffset.z);
+
+        neighbourIdx = Idx(cartesian,
+                           gidx.getOffset() + nghOffset.x * getPitchData().x +
+                               nghOffset.y * getPitchData().y +
+                               nghOffset.z * getPitchData().z);
+
+        Idx::Location nghCartesianGlobal = getGlobalIndex(gidxNgh);
+
+        bool isValidNeighbour = true;
+
+        isValidNeighbour = (gidxNghGlobal.x >= 0) &&
+                           (gidxNghGlobal.y >= 0) &&
+                           (gidxNghGlobal.z >= 0);
+
+        isValidNeighbour = (gidxNghGlobal.x < m_fullGridSize.x) &&
+                           (gidxNghGlobal.y < m_fullGridSize.y) &&
+                           (gidxNghGlobal.z < m_fullGridSize.z) &&
+                           isValidNeighbour;
+
+        return isValidNeighbour;
+    }
+
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetNghIdx(const Idx& gidx,
+                  Idx&       gidxNgh)
+        const -> bool
+    {
+        Neon::index_3d cartesian(gidx.get().x + xOff,
+                                 gidx.get().y + yOff,
+                                 gidx.get().z + zOff);
+        gidxNgh = Idx(cartesian,
+                      gidx.getOffset() + xOff * getPitchData().x +
+                          yOff * getPitchData().y +
+                          zOff * getPitchData().z);
+
+        Idx::Location nghCartesianGlobal(getGlobalIndex(gidxNgh));
+
+        bool isValidNeighbour = true;
+        if constexpr (xOff > 0) {
+            isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour;
+            isValidNeighbour = nghCartesianGlobal.x <= mDPartition.m_fullGridSize.x && isValidNeighbour;
+        }
+        if constexpr (xOff < 0) {
+            isValidNeighbour = nghCartesianGlobal.x >= 0 && isValidNeighbour;
+        }
+        if constexpr (yOff > 0) {
+            isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour;
+            isValidNeighbour = nghCartesianGlobal.y <= mDPartition.m_fullGridSize.y && isValidNeighbour;
+        }
+        if constexpr (yOff < 0) {
+            isValidNeighbour = nghCartesianGlobal.y >= 0 && isValidNeighbour;
+        }
+        if constexpr (zOff > 0) {
+            isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour;
+            isValidNeighbour = nghCartesianGlobal.z <= mDPartition.m_fullGridSize.z && isValidNeighbour;
+        }
+        if constexpr (zOff < 0) {
+            isValidNeighbour = nghCartesianGlobal.z >= mDPartition.m_zHaloRadius && isValidNeighbour;
+        }
+        return isValidNeighbour;
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    mem()
+        -> T*
+    {
+        return mDPartition.m_mem;
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    mem() const
+        -> const T*
+    {
+        return mDPartition.m_mem;
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    mem(const Idx& cell,
+        int        cardinalityIdx)
+        -> T*
+    {
+        Idx::Offset p = getPitch(cell, cardinalityIdx);
+        return mDPartition.m_mem[p];
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    operator()(const Idx& cell,
+               int        cardinalityIdx)
+        -> T&
+    {
+        Idx::Offset p = getPitch(cell, cardinalityIdx);
+        return mDPartition.m_mem[p];
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    operator()(const Idx& cell,
+               int        cardinalityIdx)
+        const -> const T&
+    {
+        Idx::Offset p = getPitch(cell, cardinalityIdx);
+        return mDPartition.m_mem[p];
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local)
+        const -> Neon::index_3d
+    {
+        Neon::index_3d result = local.mLocation + m_origin;
+        result.z -= mDPartition.m_zHaloRadius;
+        return result;
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto getDomainSize()
+        const -> Neon::index_3d
+    {
+        return mDPartition.m_fullGridSize;
+    }
+
+    Neon::DataView        mDataView;
+    T* NEON_RESTRICT      mMem;
+    Neon::index_3d        mDim;
+    int                   mZHaloRadius;
+    int                   mZBoundaryRadius;
+    Pitch                 mPitch;
+    int                   mPrtID;
+    Neon::index_3d        mOrigin;
+    int                   mCardinality;
+    Neon::index_3d        mFullGridSize;
+    bool                  mPeriodicZ;
+    NghIdx* NEON_RESTRICT mStencil;
+};
+
+}  // namespace Neon::domain::details::dGridSoA
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h
new file mode 100644
index 00000000..83d5a2dc
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h
@@ -0,0 +1,52 @@
+#pragma once
+#include "Neon/set/DevSet.h"
+#include "dIndexSoA.h"
+namespace Neon::domain::details::dGridSoA {
+
+/**
+ * Abstraction that represents the Cell space of a partition
+ * This abstraction is used by the neon lambda executor to
+ * run a containers on aGrid
+ */
+class dSpanSoA
+{
+   public:
+    using Idx = dIndexSoA;
+
+    static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d3;
+    using ExecutionThreadSpanIndexType = int32_t;
+
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    setAndValidate(Idx&            idx,
+                   const uint32_t& x,
+                   const uint32_t& y,
+                   const uint32_t& z) const
+        -> bool;
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetDataView()
+        const -> Neon::DataView const&;
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetZHaloRadius()
+        const -> int const&;
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetZBoundaryRadius()
+        const -> int const&;
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetDim()
+        const -> Neon::index_3d const&;
+
+   private:
+    Neon::DataView mDataView;
+    int            mZHaloRadius;
+    int            mZBoundaryRadius;
+    Neon::index_3d mDim /** Dimension of the span, its values depends on the mDataView*/;
+};
+
+}  // namespace Neon::domain::details::dGrid
+
+#include "dSpanSoA_imp.h"
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h
new file mode 100644
index 00000000..a3dff4cf
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h
@@ -0,0 +1,71 @@
+#pragma once
+
+namespace Neon::domain::details::dGridSoA {
+
+NEON_CUDA_HOST_DEVICE inline auto
+dSpanSoA::setAndValidate(Idx&            idx,
+                      const uint32_t& x,
+                      const uint32_t& y,
+                      const uint32_t& z)
+    const -> bool
+{
+    bool res = false;
+    idx.setLocation().x = int(x);
+    idx.setLocation().y = int(y);
+    idx.setLocation().z = int(z);
+
+    if (idx.get() < mDim) {
+        res = true;
+    }
+
+    switch (mDataView) {
+        case Neon::DataView::STANDARD: {
+            idx.setLocation().z += mZHaloRadius;
+            idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
+            return res;
+        }
+        case Neon::DataView::INTERNAL: {
+            idx.setLocation().z += mZHaloRadius + mZBoundaryRadius;
+            idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
+            return res;
+        }
+        case Neon::DataView::BOUNDARY: {
+
+            idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius
+                               ? 0
+                               : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */);
+            idx.setLocation().z += mZHaloRadius;
+            idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
+            return res;
+        }
+        default: {
+        }
+    }
+    return false;
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDataView()
+    const -> Neon::DataView const&
+{
+    return mDataView;
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZHaloRadius()
+    const -> int const&
+{
+    return mZHaloRadius;
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZBoundaryRadius()
+    const -> int const&
+{
+    return mZBoundaryRadius;
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDim()
+    const -> Neon::index_3d const&
+{
+    return mDim;
+}
+
+}  // namespace Neon::domain::details::dGrid
\ No newline at end of file

From 9a87088f549eee95154e7ab5e11a555a2db203b7 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 22 Jun 2023 09:34:01 -0400
Subject: [PATCH 12/25] Fixing report filename for benchmarks scripts

---
 .../lbm-lid-driven-cavity-flow.py             | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
index 5aebe104..90a55ad2 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
+++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
@@ -4,7 +4,7 @@
 GRID_LIST = "dGrid bGrid eGrid".split()
 STORAGE_FP_LIST = "double float".split()
 COMPUTE_FP_LIST = "double float".split()
-OCC_LIST = "nOCC".split()
+OCC_LIST = "nOCC sOCC".split()
 WARM_UP_ITER = 10
 MAX_ITER = 100
 REPETITIONS = 5
@@ -48,17 +48,18 @@ def countAll():
 SAMPLES = countAll()
 counter = 0
 command = './lbm-lid-driven-cavity-flow'
+# command = 'echo'
 with open(command + '.log', 'w') as fp:
     for DEVICE_TYPE in DEVICE_TYPE_LIST:
         DEVICE_SET_LIST = [DEVICE_ID_LIST[0]]
         if DEVICE_TYPE == 'gpu':
             for DEVICE in DEVICE_ID_LIST[1:]:
                 DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE)
-        for OCC in OCC_LIST:
-            for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
-                for STORAGE_FP in STORAGE_FP_LIST:
-                    for COMPUTE_FP in COMPUTE_FP_LIST:
-                        for DEVICE_SET in DEVICE_SET_LIST:
+        for DEVICE_SET in DEVICE_SET_LIST:
+            for OCC in OCC_LIST:
+                for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
+                    for STORAGE_FP in STORAGE_FP_LIST:
+                        for COMPUTE_FP in COMPUTE_FP_LIST:
                             for GRID in GRID_LIST:
                                 if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
                                     continue
@@ -73,9 +74,12 @@ def countAll():
                                 parameters.append('--max-iter ' + str(MAX_ITER))
                                 parameters.append(
                                     '--report-filename ' + 'lbm-lid-driven-cavity-flow___' +
-                                    DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' +
-                                    STORAGE_FP + '_' + COMPUTE_FP + '_' +
-                                    DEVICE_SET.replace(' ', '_') + '_' + OCC)
+                                    DEVICE_TYPE + '_' +
+                                    DEVICE_SET.replace(' ', '_') + '-' +
+                                    GRID + '_' +
+                                    DOMAIN_SIZE + '-' +
+                                    STORAGE_FP + '-' + COMPUTE_FP + '-' +
+                                    OCC)
                                 parameters.append('--computeFP ' + COMPUTE_FP)
                                 parameters.append('--storageFP ' + STORAGE_FP)
                                 parameters.append('--benchmark')
@@ -91,6 +95,7 @@ def countAll():
                                 fp.write(' '.join(commandList))
                                 fp.write("\n-------------------------------------------\n")
                                 fp.flush()
+                                print(' '.join(commandList))
                                 subprocess.run(commandList, text=True, stdout=fp)
 
                                 counter += 1

From 1168cc2105986b9f07537f3dc379d5135cbefa47 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 23 Jun 2023 11:58:57 -0400
Subject: [PATCH 13/25] Adding halo option.

---
 .../lbm-lid-driven-cavity-flow.py             | 80 ++++++++++---------
 1 file changed, 42 insertions(+), 38 deletions(-)

diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
index 90a55ad2..795cb046 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
+++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
@@ -5,6 +5,7 @@
 STORAGE_FP_LIST = "double float".split()
 COMPUTE_FP_LIST = "double float".split()
 OCC_LIST = "nOCC sOCC".split()
+HU_LIST = "huGrid huLattice".split()
 WARM_UP_ITER = 10
 MAX_ITER = 100
 REPETITIONS = 5
@@ -38,10 +39,11 @@ def countAll():
                     for COMPUTE_FP in COMPUTE_FP_LIST:
                         for DEVICE_SET in DEVICE_SET_LIST:
                             for GRID in GRID_LIST:
-                                if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
-                                    continue
+                                for HU in HU_LIST:
+                                    if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
+                                        continue
 
-                                counter += 1
+                                    counter += 1
     return counter
 
 
@@ -61,42 +63,44 @@ def countAll():
                     for STORAGE_FP in STORAGE_FP_LIST:
                         for COMPUTE_FP in COMPUTE_FP_LIST:
                             for GRID in GRID_LIST:
-                                if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
-                                    continue
+                                for HU in HU_LIST:
 
-                                parameters = []
-                                parameters.append('--deviceType ' + DEVICE_TYPE)
-                                parameters.append('--deviceIds ' + DEVICE_SET)
-                                parameters.append('--grid ' + GRID)
-                                parameters.append('--domain-size ' + DOMAIN_SIZE)
-                                parameters.append('--warmup-iter ' + str(WARM_UP_ITER))
-                                parameters.append('--repetitions ' + str(REPETITIONS))
-                                parameters.append('--max-iter ' + str(MAX_ITER))
-                                parameters.append(
-                                    '--report-filename ' + 'lbm-lid-driven-cavity-flow___' +
-                                    DEVICE_TYPE + '_' +
-                                    DEVICE_SET.replace(' ', '_') + '-' +
-                                    GRID + '_' +
-                                    DOMAIN_SIZE + '-' +
-                                    STORAGE_FP + '-' + COMPUTE_FP + '-' +
-                                    OCC)
-                                parameters.append('--computeFP ' + COMPUTE_FP)
-                                parameters.append('--storageFP ' + STORAGE_FP)
-                                parameters.append('--benchmark')
-                                parameters.append('--' + OCC)
+                                    if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
+                                        continue
+    
+                                    parameters = []
+                                    parameters.append('--deviceType ' + DEVICE_TYPE)
+                                    parameters.append('--deviceIds ' + DEVICE_SET)
+                                    parameters.append('--grid ' + GRID)
+                                    parameters.append('--domain-size ' + DOMAIN_SIZE)
+                                    parameters.append('--warmup-iter ' + str(WARM_UP_ITER))
+                                    parameters.append('--repetitions ' + str(REPETITIONS))
+                                    parameters.append('--max-iter ' + str(MAX_ITER))
+                                    parameters.append(
+                                        '--report-filename ' + 'lbm-lid-driven-cavity-flow___' +
+                                        DEVICE_TYPE + '_' +
+                                        DEVICE_SET.replace(' ', '_') + '-' +
+                                        GRID + '_' +
+                                        DOMAIN_SIZE + '-' +
+                                        STORAGE_FP + '-' + COMPUTE_FP + '-' +
+                                        OCC)
+                                    parameters.append('--computeFP ' + COMPUTE_FP)
+                                    parameters.append('--storageFP ' + STORAGE_FP)
+                                    parameters.append('--benchmark')
+                                    parameters.append('--' + OCC)
 
-                                commandList = []
-                                commandList.append(command)
-                                for el in parameters:
-                                    for s in el.split():
-                                        commandList.append(s)
+                                    commandList = []
+                                    commandList.append(command)
+                                    for el in parameters:
+                                        for s in el.split():
+                                            commandList.append(s)
 
-                                fp.write("\n-------------------------------------------\n")
-                                fp.write(' '.join(commandList))
-                                fp.write("\n-------------------------------------------\n")
-                                fp.flush()
-                                print(' '.join(commandList))
-                                subprocess.run(commandList, text=True, stdout=fp)
+                                    fp.write("\n-------------------------------------------\n")
+                                    fp.write(' '.join(commandList))
+                                    fp.write("\n-------------------------------------------\n")
+                                    fp.flush()
+                                    print(' '.join(commandList))
+                                    subprocess.run(commandList, text=True, stdout=fp)
 
-                                counter += 1
-                                printProgressBar(counter * 100.0 / SAMPLES, 'Progress')
+                                    counter += 1
+                                    printProgressBar(counter * 100.0 / SAMPLES, 'Progress')

From 0bdce94ec294e0a6e142b704625882939906894e Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 23 Jun 2023 13:00:21 -0400
Subject: [PATCH 14/25] Adding halo option.

---
 .../lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
index 795cb046..677aefba 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
+++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
@@ -67,7 +67,7 @@ def countAll():
 
                                     if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
                                         continue
-    
+
                                     parameters = []
                                     parameters.append('--deviceType ' + DEVICE_TYPE)
                                     parameters.append('--deviceIds ' + DEVICE_SET)
@@ -88,6 +88,7 @@ def countAll():
                                     parameters.append('--storageFP ' + STORAGE_FP)
                                     parameters.append('--benchmark')
                                     parameters.append('--' + OCC)
+                                    parameters.append('--' + HU)
 
                                     commandList = []
                                     commandList.append(command)

From 3dc808eaff2f0eb39423c76176223224087784e1 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 23 Jun 2023 18:52:44 -0400
Subject: [PATCH 15/25] WIP

---
 .../src/RunCavityTwoPop.cu                    |   2 +-
 .../Neon/core/types/vec/vec3d_integer.tdecl.h |   6 +-
 libNeonDomain/include/Neon/domain/Grids.h     |   1 +
 libNeonDomain/include/Neon/domain/dGridSoA.h  |   7 +
 .../Neon/domain/details/dGrid/dIndex.h        |   4 +-
 .../Neon/domain/details/dGrid/dIndex_imp.h    |   4 +-
 .../Neon/domain/details/dGrid/dPartition.h    | 268 +++++++++---------
 .../Neon/domain/details/dGrid/dSpan_imp.h     |  16 +-
 .../Neon/domain/details/dGridSoA/dGridSoA.h   |  29 +-
 .../domain/details/dGridSoA/dPartitionSoA.h   | 140 +++++----
 .../Neon/domain/details/dGridSoA/dSpanSoA.h   |   5 +
 .../domain/details/dGridSoA/dSpanSoA_imp.h    |  17 +-
 .../Neon/domain/details/eGrid/ePartition.h    |   2 +-
 .../Neon/domain/tools/GridTransformer.h       |   7 +-
 .../Neon/domain/tools/gridTransformer/tGrid.h |  11 +-
 .../domain/tools/gridTransformer/tGrid_ti.h   |  28 ++
 .../tests/domain-globalIdx/src/globalIdx.cu   |  22 +-
 .../tests/domain-globalIdx/src/globalIdx.h    |   5 +-
 .../tests/domain-globalIdx/src/gtests.cpp     |  15 +-
 libNeonDomain/tests/domain-map/src/gtests.cpp |   9 +
 libNeonDomain/tests/domain-map/src/map.cu     |   2 +
 libNeonDomain/tests/domain-map/src/map.h      |   3 +
 22 files changed, 365 insertions(+), 238 deletions(-)
 create mode 100644 libNeonDomain/include/Neon/domain/dGridSoA.h

diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
index d28688d1..29c7573d 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
@@ -315,7 +315,7 @@ auto run(Config& config,
         return details::runFilterStoreType<Neon::bGrid>(config, report);
     }
     if (config.gridType == "dGridSoA") {
-        return details::runFilterStoreType<Neon::domain::details::dGridSoA::dSpanSoA>(config, report);
+        return details::runFilterStoreType<Neon::domain::details::dGridSoA::dGridSoA>(config, report);
     }
 }
 }  // namespace CavityTwoPop
diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h
index acdae410..ae475c6e 100644
--- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h
+++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h
@@ -56,6 +56,10 @@ class Vec_3d<IntegerType_ta, true, false>
         num_axis = 3
     };
 
+    static constexpr int directionX = axis_e::x_axis;
+    static constexpr int directionY = axis_e::y_axis;
+    static constexpr int directionZ = axis_e::z_axis;
+
     union
     {
         Integer v[axis_e::num_axis]{0, 0, 0};
@@ -120,7 +124,7 @@ class Vec_3d<IntegerType_ta, true, false>
 
     NEON_CUDA_HOST_DEVICE inline void constexpr set(Integer p[self_t::num_axis]);
 
-    NEON_CUDA_HOST_DEVICE inline void  constexpr set(const self_t& other);
+    NEON_CUDA_HOST_DEVICE inline void constexpr set(const self_t& other);
 
     NEON_CUDA_HOST_DEVICE inline void constexpr set(const Integer& xyz);
 
diff --git a/libNeonDomain/include/Neon/domain/Grids.h b/libNeonDomain/include/Neon/domain/Grids.h
index aad0cda5..7c899b98 100644
--- a/libNeonDomain/include/Neon/domain/Grids.h
+++ b/libNeonDomain/include/Neon/domain/Grids.h
@@ -3,3 +3,4 @@
 #include "Neon/domain/aGrid.h"
 #include "Neon/domain/eGrid.h"
 #include "Neon/domain/bGrid.h"
+#include "Neon/domain/dGridSoA.h"
diff --git a/libNeonDomain/include/Neon/domain/dGridSoA.h b/libNeonDomain/include/Neon/domain/dGridSoA.h
new file mode 100644
index 00000000..bdd63f25
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/dGridSoA.h
@@ -0,0 +1,7 @@
+#pragma once
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
+
+
+namespace Neon {
+using dGridSoA = Neon::domain::details::dGridSoA::dGridSoA;
+}
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h
index 3291e622..a2c57cdb 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h
@@ -37,9 +37,9 @@ struct dIndex
 
     NEON_CUDA_HOST_DEVICE inline explicit dIndex(const Location& location);
 
-    NEON_CUDA_HOST_DEVICE inline auto set() -> Location&;
+    NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&;
 
-    NEON_CUDA_HOST_DEVICE inline auto get() const -> const Location&;
+    NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&;
 };
 
 }  // namespace Neon::domain::details::dGrid
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h
index 4389fb3f..6426e43a 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h
@@ -16,11 +16,11 @@ NEON_CUDA_HOST_DEVICE inline dIndex::dIndex(const Location::Integer &x,
     mLocation.z = z;
 }
 
-NEON_CUDA_HOST_DEVICE inline auto dIndex::set() -> Location&
+NEON_CUDA_HOST_DEVICE inline auto dIndex::setLocation() -> Location&
 {
     return mLocation;
 }
-NEON_CUDA_HOST_DEVICE inline auto dIndex::get() const -> const Location&
+NEON_CUDA_HOST_DEVICE inline auto dIndex::getLocation() const -> const Location&
 {
     return mLocation;
 }
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
index 31e480aa..86faf619 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
@@ -44,16 +44,16 @@ class dPartition
                         int            cardinality,
                         Neon::index_3d fullGridSize,
                         NghIdx*        stencil = nullptr)
-        : m_dataView(dataView),
-          m_mem(mem),
-          m_dim(dim),
-          m_zHaloRadius(zHaloRadius),
-          m_zBoundaryRadius(zBoundaryRadius),
-          m_pitch(pitch),
-          m_prtID(prtID),
-          m_origin(origin),
-          m_cardinality(cardinality),
-          m_fullGridSize(fullGridSize),
+        : mDataView(dataView),
+          mMem(mem),
+          mDim(dim),
+          mZHaloRadius(zHaloRadius),
+          mZBoundaryRadius(zBoundaryRadius),
+          mPitch(pitch),
+          mPrtID(prtID),
+          mOrigin(origin),
+          mCardinality(cardinality),
+          mFullGridSize(fullGridSize),
           mPeriodicZ(false),
           mStencil(stencil)
     {
@@ -70,21 +70,21 @@ class dPartition
     prtID()
         const -> int
     {
-        return m_prtID;
+        return mPrtID;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     cardinality()
         const -> int
     {
-        return m_cardinality;
+        return mCardinality;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     getPitchData()
         const -> const Pitch&
     {
-        return m_pitch;
+        return mPitch;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
@@ -92,76 +92,76 @@ class dPartition
              int        cardinalityIdx = 0)
         const -> int64_t
     {
-        return idx.get().x * int64_t(m_pitch.x) +
-               idx.get().y * int64_t(m_pitch.y) +
-               idx.get().z * int64_t(m_pitch.z) +
-               cardinalityIdx * int64_t(m_pitch.w);
+        return idx.getLocation().x * int64_t(mPitch.x) +
+               idx.getLocation().y * int64_t(mPitch.y) +
+               idx.getLocation().z * int64_t(mPitch.z) +
+               cardinalityIdx * int64_t(mPitch.w);
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     dim()
         const -> const Neon::index_3d
     {
-        return m_dim;
+        return mDim;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     halo()
         const -> const Neon::index_3d
     {
-        return Neon::index_3d(0, 0, m_zHaloRadius);
+        return Neon::index_3d(0, 0, mZHaloRadius);
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     origin()
         const -> const Neon::index_3d
     {
-        return m_origin;
+        return mOrigin;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(const Idx& eId,
+    getNghData(const Idx& gidx,
                NghIdx     nghOffset,
                int        card,
                const T&   alternativeVal)
         const -> NghData
     {
-        Idx        cellNgh;
-        const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh);
         T          val = alternativeVal;
         if (isValidNeighbour) {
-            val = operator()(cellNgh, card);
+            val = operator()(gidxNgh, card);
         }
         return NghData(val, isValidNeighbour);
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(const Idx& eId,
+    getNghData(const Idx& gidx,
                NghIdx     nghOffset,
                int        card)
         const -> NghData
     {
-        Idx        cellNgh;
-        const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh);
         T          val;
         if (isValidNeighbour) {
-            val = operator()(cellNgh, card);
+            val = operator()(gidxNgh, card);
         }
         return NghData(val, isValidNeighbour);
     }
 
     template <int xOff, int yOff, int zOff, typename LambdaVALID, typename LambdaNOTValid = void*>
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(const Idx&     eId,
+    getNghData(const Idx&     gidx,
                int            card,
                LambdaVALID    funIfValid,
                LambdaNOTValid funIfNOTValid = nullptr)
         const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T>, void>
     {
-        Idx        cellNgh;
-        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(eId, cellNgh);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
         if (isValidNeighbour) {
-            T val = this->operator()(cellNgh, card);
+            T val = this->operator()(gidxNgh, card);
             funIfValid(val);
         }
         if constexpr (!std::is_same_v<LambdaNOTValid, void*>) {
@@ -171,131 +171,130 @@ class dPartition
         }
     }
 
-    template <int xOff, int yOff, int zOff>
+    template <int xOff,
+              int yOff,
+              int zOff>
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(const Idx& eId,
+    getNghData(const Idx& gidx,
                int        card)
         const -> NghData
     {
-        NghData    res;
-        Idx        cellNgh;
-        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(eId, cellNgh);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        T          val;
         if (isValidNeighbour) {
-            T val = operator()(cellNgh, card);
-            res.set(val, true);
-        } else {
-            res.invalidate();
+            val = operator()(gidxNgh, card);
         }
-        return res;
+        return NghData(val, isValidNeighbour);
     }
 
     template <int xOff, int yOff, int zOff>
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(const Idx& eId,
+    getNghData(const Idx& gidx,
                int        card,
                T const&   defaultValue)
         const -> NghData
     {
         NghData    res(defaultValue, false);
-        Idx        cellNgh;
-        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(eId, cellNgh);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
         if (isValidNeighbour) {
-            T val = operator()(cellNgh, card);
+            T val = operator()(gidxNgh, card);
             res.set(val, true);
         }
         return res;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
-    nghVal(const Idx& eId,
+    nghVal(const Idx& gidx,
            uint8_t    nghID,
            int        card,
            const T&   alternativeVal)
         const -> NghData
     {
         NghIdx nghOffset = mStencil[nghID];
-        return getNghData(eId, nghOffset, card, alternativeVal);
+        return getNghData(gidx, nghOffset, card, alternativeVal);
     }
     /**
      * Get the index of the neighbor given the offset
      * @tparam dataView_ta
-     * @param[in] eId Index of the current element
+     * @param[in] gidx Index of the current element
      * @param[in] nghOffset Offset of the neighbor of interest from the current element
      * @param[in,out] neighbourIdx Index of the neighbor
      * @return Whether the neighbour is valid
      */
     NEON_CUDA_HOST_DEVICE inline auto
-    nghIdx(const Idx&    eId,
-           const NghIdx& nghOffset,
-           Idx&          neighbourIdx)
+    helpGetNghIdx(const Idx&    gidx,
+                  const NghIdx& nghOffset,
+                  Idx&          neighbourIdx)
         const -> bool
     {
-        Idx cellNgh(eId.get().x + nghOffset.x,
-                    eId.get().y + nghOffset.y,
-                    eId.get().z + nghOffset.z);
+        Idx gidxNgh(gidx.getLocation().x + nghOffset.x,
+                    gidx.getLocation().y + nghOffset.y,
+                    gidx.getLocation().z + nghOffset.z);
 
-        const auto cellNghGlobal = getGlobalIndex(cellNgh);
+        const auto gidxNghGlobal = getGlobalIndex(gidxNgh);
 
         bool isValidNeighbour = true;
 
-        if (mPeriodicZ) {
-            printf("Error, periodic not implemented yet");
-            assert(false);
-        }
-
-        isValidNeighbour = (cellNghGlobal.x >= 0) &&
-                           (cellNghGlobal.y >= 0) &&
-                           (cellNghGlobal.z >= 0);
-
-        //        isValidNeighbour = (cellNgh.get().x < m_dim.x) &&
-        //                           (cellNgh.get().y < m_dim.y) &&
-        //                           (cellNgh.get().z < m_dim.z + 2 * m_zHaloRadius) && isValidNeighbour;
+        isValidNeighbour = (gidxNghGlobal.x >= 0) &&
+                           (gidxNghGlobal.y >= 0) &&
+                           (gidxNghGlobal.z >= 0);
 
-        isValidNeighbour = (cellNghGlobal.x < m_fullGridSize.x) &&
-                           (cellNghGlobal.y < m_fullGridSize.y) &&
-                           (cellNghGlobal.z < m_fullGridSize.z) &&
+        isValidNeighbour = (gidxNghGlobal.x < mFullGridSize.x) &&
+                           (gidxNghGlobal.y < mFullGridSize.y) &&
+                           (gidxNghGlobal.z < mFullGridSize.z) &&
                            isValidNeighbour;
 
         if (isValidNeighbour) {
-            neighbourIdx = cellNgh;
+            neighbourIdx = gidxNgh;
         }
         return isValidNeighbour;
     }
 
     template <int xOff, int yOff, int zOff>
     NEON_CUDA_HOST_DEVICE inline auto
-    nghIdx(const Idx& eId,
-           Idx&       cellNgh)
+    helpGetNghIdx(const Idx& gidx,
+                  Idx&       gidxNgh)
         const -> bool
     {
-        cellNgh = Idx(eId.get().x + xOff,
-                      eId.get().y + yOff,
-                      eId.get().z + zOff);
-        Idx cellNgh_global(cellNgh.get() + m_origin);
-        // const bool isValidNeighbour = (cellNgh_global >= 0 && cellNgh < (m_dim + m_halo) && cellNgh_global < m_fullGridSize);
-        bool isValidNeighbour = true;
-        if constexpr (xOff > 0) {
-            isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour;
-            isValidNeighbour = cellNgh_global.get().x <= m_fullGridSize.x && isValidNeighbour;
-        }
-        if constexpr (xOff < 0) {
-            isValidNeighbour = cellNgh_global.get().x >= 0 && isValidNeighbour;
-        }
-        if constexpr (yOff > 0) {
-            isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour;
-            isValidNeighbour = cellNgh_global.get().y <= m_fullGridSize.y && isValidNeighbour;
-        }
-        if constexpr (yOff < 0) {
-            isValidNeighbour = cellNgh_global.get().y >= 0 && isValidNeighbour;
-        }
-        if constexpr (zOff > 0) {
-            isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour;
-            isValidNeighbour = cellNgh_global.get().z <= m_fullGridSize.z && isValidNeighbour;
-        }
-        if constexpr (zOff < 0) {
-            isValidNeighbour = cellNgh_global.get().z >= m_zHaloRadius && isValidNeighbour;
-        }
-        return isValidNeighbour;
+        return helpGetNghIdx(gidx, NghIdx{xOff, yOff, zOff}, gidxNgh);
+        //        gidxNgh = Idx(gidx.getLocation().x + xOff,
+        //                      gidx.getLocation().y + yOff,
+        //                      gidx.getLocation().z + zOff);
+        //
+        //        bool isValidNeighbour = true;
+        //        if constexpr (xOff > 0) {
+        //            int constexpr direction = Neon::index_3d::directionX;
+        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+        //            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
+        //        }
+        //        if constexpr (xOff < 0) {
+        //            int constexpr direction = Neon::index_3d::directionX;
+        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+        //            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
+        //        }
+        //        if constexpr (yOff > 0) {
+        //            int constexpr direction = Neon::index_3d::directionY;
+        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+        //            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
+        //        }
+        //        if constexpr (yOff < 0) {
+        //            int constexpr direction = Neon::index_3d::directionY;
+        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+        //            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
+        //        }
+        //        if constexpr (zOff > 0) {
+        //            int constexpr direction = Neon::index_3d::directionZ;
+        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+        //            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
+        //        }
+        //        if constexpr (zOff < 0) {
+        //            int constexpr direction = Neon::index_3d::directionZ;
+        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+        //            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
+        //        }
+        //        return isValidNeighbour;
     }
 
 
@@ -303,7 +302,7 @@ class dPartition
     mem()
         -> T*
     {
-        return m_mem;
+        return mMem;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -311,7 +310,7 @@ class dPartition
         const
         -> const T*
     {
-        return m_mem;
+        return mMem;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -319,7 +318,7 @@ class dPartition
         int        cardinalityIdx) -> T*
     {
         int64_t p = getPitch(cell, cardinalityIdx);
-        return m_mem[p];
+        return mMem[p];
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -327,7 +326,7 @@ class dPartition
                int        cardinalityIdx) -> T&
     {
         int64_t p = getPitch(cell, cardinalityIdx);
-        return m_mem[p];
+        return mMem[p];
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -335,7 +334,7 @@ class dPartition
                int        cardinalityIdx) const -> const T&
     {
         int64_t p = getPitch(cell, cardinalityIdx);
-        return m_mem[p];
+        return mMem[p];
     }
 
     template <typename ComputeType>
@@ -386,22 +385,35 @@ class dPartition
         //               local.mLocation.y < m_dim.y &&
         //               local.mLocation.z < m_dim.z + m_zHaloRadius);
 
-        Neon::index_3d result = local.mLocation + m_origin;
-        result.z -= m_zHaloRadius;
+        Neon::index_3d result = local.mLocation;
+        result.z = result.z + mOrigin.z - mZHaloRadius;
         return result;
     }
 
+    template <int direction>
+    NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local)
+        const -> int
+    {
+        if constexpr (Neon::index_3d::directionZ != direction) {
+            return local.mLocation.v[direction];
+        } else {
+            return local.mLocation.v[Neon::index_3d::directionZ] +
+                   mOrigin.v[Neon::index_3d::directionZ] -
+                   mZHaloRadius;
+        }
+    }
+
     NEON_CUDA_HOST_DEVICE inline auto getDomainSize()
         const -> Neon::index_3d
     {
-        return m_fullGridSize;
+        return mFullGridSize;
     }
 
     auto ioToVti(std::string const& fname, std::string const& fieldName)
     {
-        auto fnameCommplete = fname + "_" + std::to_string(m_prtID);
-        auto haloOrigin = Vec_3d<double>(m_origin.x, m_origin.y, m_origin.z - m_zHaloRadius);
-        auto haloDim = m_dim + Neon::index_3d(0, 0, 2 * m_zHaloRadius) + 1;
+        auto fnameCommplete = fname + "_" + std::to_string(mPrtID);
+        auto haloOrigin = Vec_3d<double>(mOrigin.x, mOrigin.y, mOrigin.z - mZHaloRadius);
+        auto haloDim = mDim + Neon::index_3d(0, 0, 2 * mZHaloRadius) + 1;
 
         IoToVTK<int, int64_t> io(fnameCommplete,
                                  haloDim,
@@ -413,35 +425,35 @@ class dPartition
         io.addField([&](const Neon::index_3d& idx, int i) {
             return operator()(dIndex(idx), i);
         },
-                    m_cardinality, "Partition", ioToVTKns::VtiDataType_e::voxel);
+                    mCardinality, "Partition", ioToVTKns::VtiDataType_e::voxel);
 
         io.flushAndClear();
         return;
     }
 
     auto getDataView()
-        -> Neon::DataView
+        const -> Neon::DataView
     {
-        return m_dataView;
+        return mDataView;
     }
 
-    auto helpGetGlobalToLocalOffets() const
-        -> NghIdx const*
+    auto helpGetGlobalToLocalOffets()
+        const -> NghIdx*
     {
         return mStencil;
     }
 
    private:
-    Neon::DataView        m_dataView;
-    T* NEON_RESTRICT      m_mem;
-    Neon::index_3d        m_dim;
-    int                   m_zHaloRadius;
-    int                   m_zBoundaryRadius;
-    Pitch                 m_pitch;
-    int                   m_prtID;
-    Neon::index_3d        m_origin;
-    int                   m_cardinality;
-    Neon::index_3d        m_fullGridSize;
+    Neon::DataView        mDataView;
+    T* NEON_RESTRICT      mMem;
+    Neon::index_3d        mDim;
+    int                   mZHaloRadius;
+    int                   mZBoundaryRadius;
+    Pitch                 mPitch;
+    int                   mPrtID;
+    Neon::index_3d        mOrigin;
+    int                   mCardinality;
+    Neon::index_3d        mFullGridSize;
     bool                  mPeriodicZ;
     NghIdx* NEON_RESTRICT mStencil;
 };
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h
index 8f6f9fea..9fb56572 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h
@@ -10,29 +10,29 @@ dSpan::setAndValidate(Idx&            idx,
     const -> bool
 {
     bool res = false;
-    idx.set().x = int(x);
-    idx.set().y = int(y);
-    idx.set().z = int(z);
+    idx.setLocation().x = int(x);
+    idx.setLocation().y = int(y);
+    idx.setLocation().z = int(z);
 
-    if (idx.get() < mDim) {
+    if (idx.getLocation() < mDim) {
         res = true;
     }
 
     switch (mDataView) {
         case Neon::DataView::STANDARD: {
-            idx.set().z += mZHaloRadius;
+            idx.setLocation().z += mZHaloRadius;
             return res;
         }
         case Neon::DataView::INTERNAL: {
-            idx.set().z += mZHaloRadius + mZBoundaryRadius;
+            idx.setLocation().z += mZHaloRadius + mZBoundaryRadius;
             return res;
         }
         case Neon::DataView::BOUNDARY: {
 
-            idx.set().z += idx.get().z < mZBoundaryRadius
+            idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius
                                ? 0
                                : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */);
-            idx.set().z += mZHaloRadius;
+            idx.setLocation().z += mZHaloRadius;
 
             return res;
         }
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h
index 61b182b2..7ce3e582 100644
--- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h
@@ -28,21 +28,22 @@
 #include "Neon/domain/patterns/PatternScalar.h"
 
 #include "dPartitionSoA.h"
+#include "dSpanSoA.h"
 
 namespace Neon::domain::details::dGridSoA {
 
 namespace details {
 struct dGridSoATransformation
 {
+    using FoundationGrid = Neon::domain::details::dGrid::dGrid;
+    using Idx = dIndexSoA;
+    using Span = dSpanSoA;
     template <typename T, int C>
     using Partition = dPartitionSoA<T, C>;
-    using Span = Neon::domain::details::eGrid::eSpan;
-    static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on;
 
-    using FoundationGrid = Neon::domain::details::eGrid::eGrid;
-    static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan;
+    static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on;
+    static constexpr Neon::set::details::ExecutionThreadSpan            executionThreadSpan = FoundationGrid::executionThreadSpan;
     using ExecutionThreadSpanIndexType = int32_t;
-    using Idx = FoundationGrid::Idx;
 
     static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const&
     {
@@ -55,7 +56,7 @@ struct dGridSoATransformation
                                            Neon::SetIdx    setIdx,
                                            Neon::DataView  dw,
                                            Span&           span) {
-            span = foundationGrid.getSpan(execution, setIdx, dw);
+            span.helpInit(foundationGrid.getSpan(execution, setIdx, dw));
         });
     }
 
@@ -67,14 +68,14 @@ struct dGridSoATransformation
         return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem);
     }
 
-    static auto helpGetGridIdx(FoundationGrid&,
-                               Neon::SetIdx const&,
-                               FoundationGrid::Idx const& fgIdx)
-        -> GridTransformation::Idx
-    {
-        GridTransformation::Idx tgIdx = fgIdx;
-        return tgIdx;
-    }
+    //    static auto helpGetGridIdx(FoundationGrid&,
+    //                               Neon::SetIdx const&,
+    //                               FoundationGrid::Idx const& fgIdx)
+    //        -> dGridSoATransformation::Idx
+    //    {
+    //        dGridSoATransformation::Idx tgIdx = fgIdx;
+    //        return tgIdx;
+    //    }
 
     template <typename T, int C>
     static auto initFieldPartition(FoundationGrid::Field<T, C>&                         foundationField,
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
index fc4c3642..1cdd75db 100644
--- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
@@ -19,12 +19,13 @@ class dPartitionSoA
     using Idx = dIndexSoA;
     using NghData = Neon::domain::NghData<T>;
     using Pitch = uint32_4d;
+    using NghIdx = int8_3d;
 
     dPartitionSoA()
     {
     }
 
-    dPartitionSoA(Neon::domain::details::dGrid::dPartition<T, C> const& dPartitionOriginal)
+    dPartitionSoA(Neon::domain::details::dGrid::dPartition<T, C>& dPartitionOriginal)
     {
         mDataView = dPartitionOriginal.getDataView();
         mMem = dPartitionOriginal.mem();
@@ -34,22 +35,22 @@ class dPartitionSoA
         mPrtID = dPartitionOriginal.prtID();
         mOrigin = dPartitionOriginal.origin();
         mCardinality = dPartitionOriginal.cardinality();
-        mFullGridSize = dPartitionOriginal.fullGridSize();
-        NghIdx* mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets();
+        mFullGridSize = dPartitionOriginal.getDomainSize();
+        mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets();
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     prtID()
         const -> int
     {
-        return mPrtID();
+        return mPrtID;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     cardinality()
         const -> int
     {
-        return mCardinality();
+        return mCardinality;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
@@ -62,30 +63,30 @@ class dPartitionSoA
     inline NEON_CUDA_HOST_DEVICE auto
     getPitch(const Idx& idx,
              int        cardinality)
-        -> Idx::Offset
+        const -> Idx::Offset
     {
-        return idx.getLocationOffset() + cardinality * mPitch.w;
+        return idx.getOffset() + cardinality * mPitch.w;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     dim()
         const -> const Neon::index_3d
     {
-        return mDim();
+        return mDim;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     halo()
         const -> const Neon::index_3d
     {
-        return mDPartition.halo();
+        return Neon::index_3d(0, 0, mZHaloRadius);
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     origin()
         const -> const Neon::index_3d
     {
-        return m_ormDPartition.origin();
+        return mOrigin;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -96,7 +97,7 @@ class dPartitionSoA
         const -> NghData
     {
         Idx        gidxNgh;
-        const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh);
+        const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh);
         T          val = alternativeVal;
         if (isValidNeighbour) {
             val = operator()(gidxNgh, card);
@@ -111,7 +112,7 @@ class dPartitionSoA
         const -> NghData
     {
         Idx        gidxNgh;
-        const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh);
+        const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh);
         T          val;
         if (isValidNeighbour) {
             val = operator()(gidxNgh, card);
@@ -132,7 +133,7 @@ class dPartitionSoA
         const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T>, void>
     {
         Idx        gidxNgh;
-        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
         if (isValidNeighbour) {
             T val = this->operator()(gidxNgh, card);
             funIfValid(val);
@@ -152,7 +153,7 @@ class dPartitionSoA
     {
         NghData    res;
         Idx        gidxNgh;
-        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
         if (isValidNeighbour) {
             T val = operator()(gidxNgh, card);
             res.set(val, true);
@@ -171,7 +172,7 @@ class dPartitionSoA
     {
         NghData    res(defaultValue, false);
         Idx        gidxNgh;
-        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
         if (isValidNeighbour) {
             T val = operator()(gidxNgh, card);
             res.set(val, true);
@@ -199,31 +200,31 @@ class dPartitionSoA
      * @return Whether the neighbour is valid
      */
     NEON_CUDA_HOST_DEVICE inline auto
-    nghIdx(const Idx&    gidx,
-           const NghIdx& nghOffset,
-           Idx&          neighbourIdx)
+    helpGetNghIdx(const Idx&    gidx,
+                  const NghIdx& nghOffset,
+                  Idx&          neighbourIdx)
         const -> bool
     {
-        Neon::index_3d cartesian(gidx.get().x + nghOffset.x,
-                                 gidx.get().y + nghOffset.y,
-                                 gidx.get().z + nghOffset.z);
+        Neon::index_3d cartesian(gidx.getLocation().x + nghOffset.x,
+                                 gidx.getLocation().y + nghOffset.y,
+                                 gidx.getLocation().z + nghOffset.z);
 
-        neighbourIdx = Idx(cartesian,
-                           gidx.getOffset() + nghOffset.x * getPitchData().x +
-                               nghOffset.y * getPitchData().y +
-                               nghOffset.z * getPitchData().z);
+        neighbourIdx = Idx(cartesian, gidx.getOffset() +
+                                          nghOffset.x * getPitchData().x +
+                                          nghOffset.y * getPitchData().y +
+                                          nghOffset.z * getPitchData().z);
 
-        Idx::Location nghCartesianGlobal = getGlobalIndex(gidxNgh);
+        Neon::index_3d const nghCartesianIdx = getGlobalIndex(neighbourIdx);
 
         bool isValidNeighbour = true;
 
-        isValidNeighbour = (gidxNghGlobal.x >= 0) &&
-                           (gidxNghGlobal.y >= 0) &&
-                           (gidxNghGlobal.z >= 0);
+        isValidNeighbour = (nghCartesianIdx.x >= 0) &&
+                           (nghCartesianIdx.y >= 0) &&
+                           (nghCartesianIdx.z >= 0);
 
-        isValidNeighbour = (gidxNghGlobal.x < m_fullGridSize.x) &&
-                           (gidxNghGlobal.y < m_fullGridSize.y) &&
-                           (gidxNghGlobal.z < m_fullGridSize.z) &&
+        isValidNeighbour = (nghCartesianIdx.x < mFullGridSize.x) &&
+                           (nghCartesianIdx.y < mFullGridSize.y) &&
+                           (nghCartesianIdx.z < mFullGridSize.z) &&
                            isValidNeighbour;
 
         return isValidNeighbour;
@@ -235,37 +236,46 @@ class dPartitionSoA
                   Idx&       gidxNgh)
         const -> bool
     {
-        Neon::index_3d cartesian(gidx.get().x + xOff,
-                                 gidx.get().y + yOff,
-                                 gidx.get().z + zOff);
-        gidxNgh = Idx(cartesian,
-                      gidx.getOffset() + xOff * getPitchData().x +
-                          yOff * getPitchData().y +
-                          zOff * getPitchData().z);
-
-        Idx::Location nghCartesianGlobal(getGlobalIndex(gidxNgh));
+        {
+            Neon::index_3d cartesian(gidx.getLocation().x + xOff,
+                                     gidx.getLocation().y + yOff,
+                                     gidx.getLocation().z + zOff);
+            gidxNgh = Idx(cartesian, gidx.getOffset() +
+                                         xOff * getPitchData().x +
+                                         yOff * getPitchData().y +
+                                         zOff * getPitchData().z);
+        }
 
         bool isValidNeighbour = true;
         if constexpr (xOff > 0) {
-            isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour;
-            isValidNeighbour = nghCartesianGlobal.x <= mDPartition.m_fullGridSize.x && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionX;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
         }
         if constexpr (xOff < 0) {
-            isValidNeighbour = nghCartesianGlobal.x >= 0 && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionX;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
         }
         if constexpr (yOff > 0) {
-            isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour;
-            isValidNeighbour = nghCartesianGlobal.y <= mDPartition.m_fullGridSize.y && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionY;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
         }
         if constexpr (yOff < 0) {
-            isValidNeighbour = nghCartesianGlobal.y >= 0 && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionY;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
         }
         if constexpr (zOff > 0) {
-            isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour;
-            isValidNeighbour = nghCartesianGlobal.z <= mDPartition.m_fullGridSize.z && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionZ;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
         }
         if constexpr (zOff < 0) {
-            isValidNeighbour = nghCartesianGlobal.z >= mDPartition.m_zHaloRadius && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionZ;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
         }
         return isValidNeighbour;
     }
@@ -274,14 +284,14 @@ class dPartitionSoA
     mem()
         -> T*
     {
-        return mDPartition.m_mem;
+        return mMem;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
     mem() const
         -> const T*
     {
-        return mDPartition.m_mem;
+        return mMem;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -290,7 +300,7 @@ class dPartitionSoA
         -> T*
     {
         Idx::Offset p = getPitch(cell, cardinalityIdx);
-        return mDPartition.m_mem[p];
+        return mMem[p];
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -299,7 +309,7 @@ class dPartitionSoA
         -> T&
     {
         Idx::Offset p = getPitch(cell, cardinalityIdx);
-        return mDPartition.m_mem[p];
+        return mMem[p];
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -308,21 +318,35 @@ class dPartitionSoA
         const -> const T&
     {
         Idx::Offset p = getPitch(cell, cardinalityIdx);
-        return mDPartition.m_mem[p];
+        return mMem[p];
     }
 
     NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local)
         const -> Neon::index_3d
     {
-        Neon::index_3d result = local.mLocation + m_origin;
-        result.z -= mDPartition.m_zHaloRadius;
+        Neon::index_3d result = local.mLocation + mOrigin;
+        result.z -= mZHaloRadius;
         return result;
     }
 
+    template <int direction>
+    NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local)
+        const -> int
+    {
+        if constexpr (Neon::index_3d::directionZ != direction) {
+            return local.mLocation.v[direction] +
+                   mOrigin.v[direction];
+        } else {
+            return local.mLocation.v[Neon::index_3d::directionZ] +
+                   mOrigin.v[Neon::index_3d::directionZ] -
+                   mZHaloRadius;
+        }
+    }
+
     NEON_CUDA_HOST_DEVICE inline auto getDomainSize()
         const -> Neon::index_3d
     {
-        return mDPartition.m_fullGridSize;
+        return mFullGridSize;
     }
 
     Neon::DataView        mDataView;
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h
index 83d5a2dc..3aee038c 100644
--- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h
@@ -1,6 +1,8 @@
 #pragma once
 #include "Neon/set/DevSet.h"
 #include "dIndexSoA.h"
+#include "Neon/domain/details/dGrid/dSpan.h"
+
 namespace Neon::domain::details::dGridSoA {
 
 /**
@@ -40,6 +42,9 @@ class dSpanSoA
     helpGetDim()
         const -> Neon::index_3d const&;
 
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpInit(Neon::domain::details::dGrid::dSpan const&) ->void;
+
    private:
     Neon::DataView mDataView;
     int            mZHaloRadius;
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h
index a3dff4cf..421a3f27 100644
--- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h
@@ -14,19 +14,19 @@ dSpanSoA::setAndValidate(Idx&            idx,
     idx.setLocation().y = int(y);
     idx.setLocation().z = int(z);
 
-    if (idx.get() < mDim) {
+    if (idx.getLocation() < mDim) {
         res = true;
     }
 
     switch (mDataView) {
         case Neon::DataView::STANDARD: {
             idx.setLocation().z += mZHaloRadius;
-            idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
+            idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
             return res;
         }
         case Neon::DataView::INTERNAL: {
             idx.setLocation().z += mZHaloRadius + mZBoundaryRadius;
-            idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
+            idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
             return res;
         }
         case Neon::DataView::BOUNDARY: {
@@ -35,7 +35,7 @@ dSpanSoA::setAndValidate(Idx&            idx,
                                ? 0
                                : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */);
             idx.setLocation().z += mZHaloRadius;
-            idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
+            idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
             return res;
         }
         default: {
@@ -68,4 +68,13 @@ NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDim()
     return mDim;
 }
 
+NEON_CUDA_HOST_DEVICE inline auto  dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) ->void
+{
+    mDataView = dspan.helpGetDataView();
+    mZHaloRadius = dspan.helpGetZHaloRadius();
+    mZBoundaryRadius = dspan.helpGetZBoundaryRadius();
+    mDim = dspan.helpGetDim();
+}
+
+
 }  // namespace Neon::domain::details::dGrid
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
index 012a3588..62b75981 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
@@ -59,7 +59,7 @@ class ePartition
      *  |
      *  |   Connectivity table has the same layout of a field with cardinality equal to
      *  |   the number of neighbours and an SoA layout. Let's call this field nghField.
-     *  |   nghField(e, nghIdx) is the eIdx_t of the neighbour element as in a STANDARD
+     *  |   nghField(e, helpGetNghIdx) is the eIdx_t of the neighbour element as in a STANDARD
      *  |   view.
      *  |--)
      */
diff --git a/libNeonDomain/include/Neon/domain/tools/GridTransformer.h b/libNeonDomain/include/Neon/domain/tools/GridTransformer.h
index 90556fb9..47518f7a 100644
--- a/libNeonDomain/include/Neon/domain/tools/GridTransformer.h
+++ b/libNeonDomain/include/Neon/domain/tools/GridTransformer.h
@@ -1,10 +1,10 @@
 #pragma once
 
+#include "Neon/domain/tools/PartitionTable.h"
+#include "Neon/domain/tools/SpanTable.h"
 #include "Neon/domain/tools/gridTransformer/tField.h"
 #include "Neon/domain/tools/gridTransformer/tGrid.h"
 #include "Neon/domain/tools/gridTransformer/tGrid_ti.h"
-#include "Neon/domain/tools/PartitionTable.h"
-#include "Neon/domain/tools/SpanTable.h"
 
 namespace Neon::domain::tool {
 
@@ -24,9 +24,10 @@ template <typename GridTransformation>
 class GridTransformer
 {
    public:
+    using Idx = typename GridTransformation::Idx;
+    using Span = typename GridTransformation::Span;
     template <typename T, int C>
     using Partition = typename GridTransformation::template Partition<T, C>;
-    using Span = typename GridTransformation::Span;
     using FoundationGrid = typename GridTransformation::FoundationGrid;
 
     using Grid = details::tGrid<GridTransformation>;
diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h
index d6d98be1..bd28e8f5 100644
--- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h
+++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h
@@ -54,6 +54,15 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate<tGrid<GridTransfo
     tGrid();
     virtual ~tGrid();
     explicit tGrid(FoundationGrid& foundationGrid);
+
+    template <typename SparsityPattern>
+    tGrid(const Neon::Backend&         backend /**< Target for computation */,
+          const Neon::int32_3d&        dimension /**< Dimension of the bounding box containing the domain */,
+          const SparsityPattern&       activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */,
+          const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */,
+          const Vec_3d<double>&        spacing = Vec_3d<double>(1, 1, 1) /**< Spacing, i.e. size of a voxel */,
+          const Vec_3d<double>&        origin = Vec_3d<double>(0, 0, 0) /**< Origin  */);
+
     tGrid(const tGrid& other);                 // copy constructor
     tGrid(tGrid&& other) noexcept;             // move constructor
     tGrid& operator=(const tGrid& other);      // copy assignment
@@ -109,7 +118,7 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate<tGrid<GridTransfo
     struct Data
     {
         Data() = default;
-        explicit Data(Neon::Backend& bk)
+        explicit Data(Neon::Backend const& bk)
         {
             spanTable = Neon::domain::tool::SpanTable<Span>(bk);
         }
diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h
index 4ba1403d..0a0249d7 100644
--- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h
+++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h
@@ -30,6 +30,34 @@ tGrid<GridTransformation>::tGrid(FoundationGrid& foundationGrid)
                           foundationGrid.getOrigin());
 }
 
+template <typename GridTransformation>
+template <typename SparsityPattern>
+tGrid<GridTransformation>::tGrid(const Neon::Backend&         bk,
+                                 const Neon::int32_3d&        dimension,
+                                 const SparsityPattern&       activeCellLambda,
+                                 const Neon::domain::Stencil& stencil,
+                                 const Vec_3d<double>&        spacing,
+                                 const Vec_3d<double>&        origin)
+{
+    mData = std::make_shared<Data>(bk);
+    mData->foundationGrid = FoundationGrid(bk,
+                                           dimension,
+                                           activeCellLambda,
+                                           stencil,
+                                           spacing,
+                                           origin);
+    GridTransformation::initSpan(mData->foundationGrid,
+                                 NEON_OUT mData->spanTable);
+    tGrid::GridBase::init("tGrid",
+                          bk,
+                          mData->foundationGrid.getDimension(),
+                          mData->foundationGrid.getStencil(),
+                          mData->foundationGrid.getNumActiveCellsPerPartition(),
+                          mData->foundationGrid.getDefaultBlock(),
+                          mData->foundationGrid.getSpacing(),
+                          mData->foundationGrid.getOrigin());
+}
+
 template <typename GridTransformation>
 tGrid<GridTransformation>::tGrid()
 {
diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu
index 158d3e05..1b94b566 100644
--- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu
+++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu
@@ -1,5 +1,6 @@
 #include <functional>
 #include "Neon/domain/Grids.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 
 #include "Neon/domain/tools/TestData.h"
 #include "TestInformation.h"
@@ -27,18 +28,18 @@ auto defContainer(int    streamIdx,
             return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable {
                 // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val);
                 Neon::index_3d globalPoint = a.getGlobalIndex(e);
-                a(e, 0) = globalPoint.x ;
+                a(e, 0) = globalPoint.x;
                 b(e, 0) = globalPoint.y;
                 c(e, 0) = globalPoint.z;
-//                if constexpr (std::is_same_v<typename Field::Grid, Neon::bGrid>) {
-//                    printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx,
-//                           e.mInDataBlockIdx.x,
-//                           e.mInDataBlockIdx.y,
-//                           e.mInDataBlockIdx.z,
-//                           globalPoint.x,
-//                           globalPoint.y,
-//                           globalPoint.z);
-//                }
+                //                if constexpr (std::is_same_v<typename Field::Grid, Neon::bGrid>) {
+                //                    printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx,
+                //                           e.mInDataBlockIdx.x,
+                //                           e.mInDataBlockIdx.y,
+                //                           e.mInDataBlockIdx.z,
+                //                           globalPoint.x,
+                //                           globalPoint.y,
+                //                           globalPoint.z);
+                //                }
             };
         });
 }
@@ -98,5 +99,6 @@ auto run(TestData<G, T, C>& data) -> void
 template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto run<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>(TestData<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>&) -> void;
 
 }  // namespace globalIdx
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h
index 0a3b87eb..c766f7ca 100644
--- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h
+++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h
@@ -3,9 +3,9 @@
 #include <functional>
 
 #include "Neon/domain/Grids.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 #include "Neon/domain/tools/TestData.h"
 
-
 namespace globalIdx {
 using namespace Neon::domain::tool::testing;
 
@@ -15,6 +15,7 @@ auto run(TestData<G, T, C>& data) -> void;
 extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>(TestData<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>&) -> void;
 
 
-}  // namespace map
+}  // namespace globalIdx
diff --git a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp
index 783830ca..f0ecce78 100644
--- a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp
@@ -4,7 +4,7 @@
 #include "globalIdx.h"
 #include "runHelper.h"
 
-TEST(domain_unit_test_globalIdx, dGrid)
+TEST(domain_globalIdx, dGrid)
 {
     int nGpus = 3;
     using Type = int64_t;
@@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid)
                             1);
 }
 
-TEST(domain_unit_test_globalIdx, eGrid)
+TEST(domain_globalIdx, eGrid)
 {
     int nGpus = 3;
     using Type = int64_t;
@@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid)
                             1);
 }
 
-TEST(domain_unit_test_globalIdx, bGrid)
+TEST(domain_globalIdx, bGrid)
 {
     int nGpus = 3;
     using Type = int64_t;
@@ -31,6 +31,15 @@ TEST(domain_unit_test_globalIdx, bGrid)
                             1);
 }
 
+TEST(domain_globalIdx, dGridSoA)
+{
+    int nGpus = 3;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::run<Neon::domain::details::dGridSoA::dGridSoA , Type, 0>),
+                            nGpus,
+                            1);
+}
+
 int main(int argc, char** argv)
 {
     ::testing::InitGoogleTest(&argc, argv);
diff --git a/libNeonDomain/tests/domain-map/src/gtests.cpp b/libNeonDomain/tests/domain-map/src/gtests.cpp
index d0d43b60..50d6e34d 100644
--- a/libNeonDomain/tests/domain-map/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-map/src/gtests.cpp
@@ -31,6 +31,15 @@ TEST(domain_map, bGrid)
                             1);
 }
 
+TEST(domain_map, dGridSoA)
+{
+    int nGpus = 1;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(map::run<Neon::domain::details::dGridSoA::dGridSoA, Type, 0>),
+                            nGpus,
+                            1);
+}
+
 int main(int argc, char** argv)
 {
     ::testing::InitGoogleTest(&argc, argv);
diff --git a/libNeonDomain/tests/domain-map/src/map.cu b/libNeonDomain/tests/domain-map/src/map.cu
index bd25f178..b001d832 100644
--- a/libNeonDomain/tests/domain-map/src/map.cu
+++ b/libNeonDomain/tests/domain-map/src/map.cu
@@ -4,6 +4,7 @@
 #include "Neon/domain/tools/TestData.h"
 #include "TestInformation.h"
 #include "gtest/gtest.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 
 
 namespace map {
@@ -75,6 +76,7 @@ auto run(TestData<G, T, C>& data) -> void
 template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto run<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>(TestData<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>&) -> void;
 
 
 }  // namespace map
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-map/src/map.h b/libNeonDomain/tests/domain-map/src/map.h
index 611f2046..16073657 100644
--- a/libNeonDomain/tests/domain-map/src/map.h
+++ b/libNeonDomain/tests/domain-map/src/map.h
@@ -4,6 +4,7 @@
 
 #include "Neon/domain/Grids.h"
 #include "Neon/domain/tools/TestData.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 
 
 namespace map {
@@ -14,6 +15,8 @@ auto run(TestData<G, T, C>& data) -> void;
 
 extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>(TestData<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>&) -> void;
 
 
 }  // namespace map

From ceab2a6f62dd72d4faedfadaea2be33b3ab4f565 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 26 Jun 2023 11:32:35 -0400
Subject: [PATCH 16/25] domain_neighbour_globalIdx for dGridSoA

---
 .../Neon/domain/details/dGrid/dPartition.h    |  75 +++---
 .../domain-neighbour-globalIdx/src/gtests.cpp |  55 ++++-
 .../src/runHelper.h                           |   1 +
 .../src/testsAndContainers.cu                 | 220 ++++++++++++++++--
 .../src/testsAndContainers.h                  |   9 +
 5 files changed, 306 insertions(+), 54 deletions(-)

diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
index 86faf619..2becc97d 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
@@ -258,43 +258,44 @@ class dPartition
                   Idx&       gidxNgh)
         const -> bool
     {
-        return helpGetNghIdx(gidx, NghIdx{xOff, yOff, zOff}, gidxNgh);
-        //        gidxNgh = Idx(gidx.getLocation().x + xOff,
-        //                      gidx.getLocation().y + yOff,
-        //                      gidx.getLocation().z + zOff);
-        //
-        //        bool isValidNeighbour = true;
-        //        if constexpr (xOff > 0) {
-        //            int constexpr direction = Neon::index_3d::directionX;
-        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
-        //            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
-        //        }
-        //        if constexpr (xOff < 0) {
-        //            int constexpr direction = Neon::index_3d::directionX;
-        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
-        //            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
-        //        }
-        //        if constexpr (yOff > 0) {
-        //            int constexpr direction = Neon::index_3d::directionY;
-        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
-        //            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
-        //        }
-        //        if constexpr (yOff < 0) {
-        //            int constexpr direction = Neon::index_3d::directionY;
-        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
-        //            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
-        //        }
-        //        if constexpr (zOff > 0) {
-        //            int constexpr direction = Neon::index_3d::directionZ;
-        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
-        //            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
-        //        }
-        //        if constexpr (zOff < 0) {
-        //            int constexpr direction = Neon::index_3d::directionZ;
-        //            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
-        //            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
-        //        }
-        //        return isValidNeighbour;
+        //        NghIdx offset(xOff, yOff, zOff);
+        //        return helpGetNghIdx(gidx, offset, gidxNgh);
+        gidxNgh = Idx(gidx.getLocation().x + xOff,
+                      gidx.getLocation().y + yOff,
+                      gidx.getLocation().z + zOff);
+
+        bool isValidNeighbour = true;
+        if constexpr (xOff > 0) {
+            int constexpr direction = Neon::index_3d::directionX;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
+        }
+        if constexpr (xOff < 0) {
+            int constexpr direction = Neon::index_3d::directionX;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
+        }
+        if constexpr (yOff > 0) {
+            int constexpr direction = Neon::index_3d::directionY;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
+        }
+        if constexpr (yOff < 0) {
+            int constexpr direction = Neon::index_3d::directionY;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
+        }
+        if constexpr (zOff > 0) {
+            int constexpr direction = Neon::index_3d::directionZ;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
+        }
+        if constexpr (zOff < 0) {
+            int constexpr direction = Neon::index_3d::directionZ;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
+        }
+        return isValidNeighbour;
     }
 
 
diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp
index feba5a9b..21bba9b5 100644
--- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp
@@ -1,10 +1,10 @@
 
+#include "./testsAndContainers.h"
 #include "Neon/Neon.h"
 #include "gtest/gtest.h"
-#include "./testsAndContainers.h"
 #include "runHelper.h"
 
-TEST(domain_unit_test_globalIdx, dGrid)
+TEST(domain_neighbour_globalIdx, dGrid)
 {
     int nGpus = 5;
     using Type = int64_t;
@@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid)
                             1);
 }
 
-TEST(domain_unit_test_globalIdx, eGrid)
+TEST(domain_neighbour_globalIdx, eGrid)
 {
     int nGpus = 5;
     using Type = int64_t;
@@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid)
                             1);
 }
 
-TEST(domain_unit_test_globalIdx, bGrid)
+TEST(domain_neighbour_globalIdx, bGrid)
 {
     int nGpus = 5;
     using Type = int64_t;
@@ -31,6 +31,53 @@ TEST(domain_unit_test_globalIdx, bGrid)
                             1);
 }
 
+TEST(domain_neighbour_globalIdx, dGridSoA)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::run<Neon::dGridSoA, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+///////////////////////////////////////////
+
+TEST(domain_neighbour_globalIdx, dGrid_template)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::runTemplate<Neon::dGrid, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_neighbour_globalIdx, eGrid_template)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::runTemplate<Neon::eGrid, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_neighbour_globalIdx, bGrid_template)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::runTemplate<Neon::bGrid, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_neighbour_globalIdx, dGridSoA_template)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::runTemplate<Neon::dGridSoA, Type, 0>),
+                            nGpus,
+                            1);
+}
+
 int main(int argc, char** argv)
 {
     ::testing::InitGoogleTest(&argc, argv);
diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h
index 0014594c..32a078d6 100644
--- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h
+++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h
@@ -9,6 +9,7 @@
 
 #include "Neon/domain/dGrid.h"
 #include "Neon/domain/eGrid.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 #include "Neon/domain/tools/Geometries.h"
 #include "Neon/domain/tools/TestData.h"
 
diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu
index 49dd3bd2..7b2c3fef 100644
--- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu
+++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu
@@ -1,5 +1,6 @@
 #include <functional>
 #include "Neon/domain/Grids.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 
 #include "Neon/domain/tools/TestData.h"
 #include "TestInformation.h"
@@ -61,15 +62,15 @@ auto checkNeighbourData(Field const&   filedA,
                         Field const&   filedB,
                         Field const&   filedC,
                         Neon::index_3d testDirection,
-                        Field const&   checkFlatA,
-                        Field const&   checkFlatB,
-                        Field const&   checkFlatC)
+                        Field&         checkFlatA,
+                        Field&         checkFlatB,
+                        Field&         checkFlatC)
     -> Neon::set::Container
 {
     const auto& grid = filedA.getGrid();
     return grid.newContainer(
         "defContainer",
-        [&](Neon::set::Loader& loader) {
+        [&, testDirection](Neon::set::Loader& loader) {
             auto a = loader.load(filedA, Neon::Pattern::STENCIL);
             auto b = loader.load(filedB, Neon::Pattern::STENCIL);
             auto c = loader.load(filedC, Neon::Pattern::STENCIL);
@@ -102,6 +103,58 @@ auto checkNeighbourData(Field const&   filedA,
         });
 }
 
+template <int xOff,
+          int yOff,
+          int zOff,
+          typename Field>
+auto checkNeighbourDataTemplate(Field const& filedA,
+                                Field const& filedB,
+                                Field const& filedC,
+                                Field&       checkFlatA,
+                                Field&       checkFlatB,
+                                Field&       checkFlatC)
+    -> Neon::set::Container
+{
+    const auto& grid = filedA.getGrid();
+    return grid.newContainer(
+        "defContainer",
+        [&](Neon::set::Loader& loader) {
+            auto a = loader.load(filedA, Neon::Pattern::STENCIL);
+            auto b = loader.load(filedB, Neon::Pattern::STENCIL);
+            auto c = loader.load(filedC, Neon::Pattern::STENCIL);
+
+            auto resA = loader.load(checkFlatA, Neon::Pattern::MAP);
+            auto resB = loader.load(checkFlatB, Neon::Pattern::MAP);
+            auto resC = loader.load(checkFlatC, Neon::Pattern::MAP);
+
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable {
+                constexpr Neon::index_3d testDirection(xOff, yOff, zOff);
+
+                // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val);
+                Neon::index_3d globalPoint = a.getGlobalIndex(e);
+                auto           ngh = globalPoint + testDirection;
+
+                decltype(a)* nghInfo[3] = {&a, &b, &c};
+                decltype(a)* results[3] = {&resA, &resB, &resC};
+
+                for (int i = 0; i < 3; i++) {
+                    auto d = nghInfo[i]->template getNghData<testDirection.x, testDirection.y, testDirection.z>(e, 0);
+                    // auto d = nghInfo[i]->getNghData(e, testDirection.newType<int8_t>(), 0);
+
+                    if (d.isValid()) {
+                        results[i]->operator()(e, 0) = d.getData() == ngh.v[i] ? +1 : -1;
+                        if (d.getData() != ngh.v[i]) {
+                            printf("ERROR: %d %d %d %d %d %d\n", globalPoint.x, globalPoint.y, globalPoint.z, ngh.v[0], ngh.v[1], ngh.v[2]);
+                            d = nghInfo[i]->getNghData(e, testDirection.newType<int8_t>(), 0);
+                        }
+                    } else {
+                        results[i]->operator()(e, 0) = 0;
+                    }
+                }
+            };
+        });
+}
+
 using namespace Neon::domain::tool::testing;
 
 template <typename G, typename T, int C>
@@ -165,15 +218,15 @@ auto run(TestData<G, T, C>& data) -> void
                                    X, Y, Z);
     };
 
-    //    constexpr std::array<const Ngh3DIdx, 6>
-    //        stencil{Ngh3DIdx(1, 0, 0),
-    //                Ngh3DIdx(-1, 0, 0),
-    //                Ngh3DIdx(0, 1, 0),
-    //                Ngh3DIdx(0, -1, 0),
-    //                Ngh3DIdx(0, 0, 1),
-    //                Ngh3DIdx(0, 0, -1)};
-    constexpr std::array<const Ngh3DIdx, 1>
-        stencil{Ngh3DIdx(0, 0, -1)};
+    constexpr std::array<const Ngh3DIdx, 6>
+        stencil{Ngh3DIdx(1, 0, 0),
+                Ngh3DIdx(-1, 0, 0),
+                Ngh3DIdx(0, 1, 0),
+                Ngh3DIdx(0, -1, 0),
+                Ngh3DIdx(0, 0, 1),
+                Ngh3DIdx(0, 0, -1)};
+    //    constexpr std::array<const Ngh3DIdx, 1>
+    //        stencil{Ngh3DIdx(0, 0, -1)};
 
     for (auto const& direction : stencil) {
         reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx);
@@ -214,8 +267,149 @@ auto run(TestData<G, T, C>& data) -> void
     }
 }
 
+template <typename G, typename T, int C>
+auto runTemplate(TestData<G, T, C>& data) -> void
+{
+
+    using Type = typename TestData<G, T, C>::Type;
+    auto&             grid = data.getGrid();
+    const std::string appName = TestInformation::fullName(grid.getImplementationName());
+
+    data.resetValuesToLinear(1, 100);
+
+    auto aField = grid.template newField<int64_t>("a", 1, 0);
+    auto bField = grid.template newField<int64_t>("a", 1, 0);
+    auto cField = grid.template newField<int64_t>("a", 1, 0);
+
+    auto& X = data.getField(FieldNames::X);
+    auto& Y = data.getField(FieldNames::Y);
+    auto& Z = data.getField(FieldNames::Z);
+
+    const Neon::index_3d dim = grid.getDimension();
+    auto                 bk = grid.getBackend();
+
+    {  // NEON
+        {
+            initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx);
+            bk.sync(Neon::Backend::mainStreamIdx);
+            aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bk.sync(Neon::Backend::mainStreamIdx);
+        }
+    }
+    using Ngh3DIdx = Neon::int32_3d;
+
+    auto setGolden = [&](Ngh3DIdx const& direction) {  // Golden data
+        auto& X = data.getIODomain(FieldNames::X);
+        auto& Y = data.getIODomain(FieldNames::Y);
+        auto& Z = data.getIODomain(FieldNames::Z);
+
+        data.forEachActiveIODomain([&](const Neon::index_3d& idx,
+                                       int                   cardinality,
+                                       Type&                 a,
+                                       Type&                 b,
+                                       Type&                 c) {
+            a = 1;
+            b = 1;
+            c = 1;
+            auto ngh = direction + idx;
+            if (!(ngh >= 0)) {
+                a = 0;
+                b = 0;
+                c = 0;
+            }
+            if (!(dim > ngh)) {
+                a = 0;
+                b = 0;
+                c = 0;
+            }
+        },
+                                   X, Y, Z);
+    };
+
+    constexpr std::array<const Ngh3DIdx, 6>
+        stencil{Ngh3DIdx(1, 0, 0),
+                Ngh3DIdx(-1, 0, 0),
+                Ngh3DIdx(0, 1, 0),
+                Ngh3DIdx(0, -1, 0),
+                Ngh3DIdx(0, 0, 1),
+                Ngh3DIdx(0, 0, -1)};
+    //    constexpr std::array<const Ngh3DIdx, 1>
+    //        stencil{Ngh3DIdx(0, 0, -1)};
+
+    for (auto const& direction : stencil) {
+        reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx);
+        reset(X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        {  // Updating halo with wrong data
+            bk.sync(Neon::Backend::mainStreamIdx);
+            aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bk.sync(Neon::Backend::mainStreamIdx);
+        }
+        {
+            initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx);
+            bk.sync(Neon::Backend::mainStreamIdx);
+            aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bk.sync(Neon::Backend::mainStreamIdx);
+        }
+
+
+        // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+
+        if (direction == Neon::index_3d(1, 0, 0)) {
+            checkNeighbourDataTemplate<1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else if (direction == Neon::index_3d(-1, 0, 0)) {
+            checkNeighbourDataTemplate<-1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else if (direction == Neon::index_3d(0, 1, 0)) {
+            checkNeighbourDataTemplate<0, 1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else if (direction == Neon::index_3d(0, -1, 0)) {
+            checkNeighbourDataTemplate<0, -1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else if (direction == Neon::index_3d(0, 0, 1)) {
+            checkNeighbourDataTemplate<0, 0, 1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else if (direction == Neon::index_3d(0, 0, -1)) {
+            checkNeighbourDataTemplate<0, 0, -1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else {
+            std::cout << "Direction not implemented " << direction << std::endl;
+            exit(99);
+        }
+        setGolden(direction);
+
+        bk.sync(Neon::Backend::mainStreamIdx);
+        bool isOk = data.compare(FieldNames::X);
+        isOk = isOk && data.compare(FieldNames::Y);
+        isOk = isOk && data.compare(FieldNames::Z);
+
+        if (!isOk) {
+            std::cout << "Direction with errors " << direction << std::endl;
+            data.getField(FieldNames::X).ioToVtk(grid.getImplementationName() + "X", "X", true);
+            data.getField(FieldNames::Y).ioToVtk(grid.getImplementationName() + "Y", "Y", true);
+            data.getField(FieldNames::Z).ioToVtk(grid.getImplementationName() + "Z", "Z", true);
+            exit(77);
+            ASSERT_TRUE(isOk);
+        }
+    }
+}
+
+
 template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto run<Neon::dGridSoA, int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
+
+
+template auto runTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::dGridSoA, int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
 }  // namespace globalIdx
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h
index 0a3b87eb..bcf503f2 100644
--- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h
+++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h
@@ -4,6 +4,7 @@
 
 #include "Neon/domain/Grids.h"
 #include "Neon/domain/tools/TestData.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 
 
 namespace globalIdx {
@@ -12,9 +13,17 @@ using namespace Neon::domain::tool::testing;
 template <typename G, typename T, int C>
 auto run(TestData<G, T, C>& data) -> void;
 
+template <typename G, typename T, int C>
+auto runTemplate(TestData<G, T, C>& data) -> void;
+
 extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
+extern template auto runTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
 }  // namespace map

From 13377a4af18430dfc9bf7ec16afe2fcb2d209520 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 27 Jun 2023 10:08:19 -0400
Subject: [PATCH 17/25] Testing block sizes on bGrid

---
 .../lbm-lid-driven-cavity-flow/src/LbmTools.h |   8 +-
 .../src/LbmToolsTemplateOnly.h                | 440 ++++++++++++++++++
 .../src/RunCavityTwoPop.cu                    |  27 +-
 .../domain/details/dGridSoA/dSpanSoA_imp.h    |  52 ++-
 .../tests/domain-map/src/runHelper.h          |   4 +-
 5 files changed, 501 insertions(+), 30 deletions(-)
 create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h

diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h
index 5728a5d3..ab79ed2a 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h
@@ -35,7 +35,6 @@ struct LbmContainers<D3Q19Template<typename PopulationField::Type, LbmComputeTyp
     {                                                                                                                   \
         { /*GO*/                                                                                                        \
             if (wallBitFlag & (uint32_t(1) << GOid)) {                                                                  \
-                /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \
                 popIn[GOid] = fin(i, BKid);                                                                             \
             } else {                                                                                                    \
                 popIn[GOid] = fin.template nghVal<BKx, BKy, BKz>(i, GOid, 0.0).value;                                   \
@@ -101,8 +100,6 @@ struct LbmContainers<D3Q19Template<typename PopulationField::Type, LbmComputeTyp
                typename PopulationField::Partition const& fin,
                NEON_OUT LbmStoreType                      popIn[19])
     {
-        // #pragma omp critical
-        //        {
 #if 0
         using TopologyByDirection = std::tuple<Neon::int32_3d, int, Neon::int32_3d, int>;
         constexpr std::array<TopologyByDirection, 9> stencil{
@@ -160,7 +157,6 @@ struct LbmContainers<D3Q19Template<typename PopulationField::Type, LbmComputeTyp
         PULL_STREAM(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /*  BKid */ 17);
         PULL_STREAM(0, -1, 1, /*  GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18);
 
-        //  }
         // Treat the case of the center (c[k] = {0, 0, 0,}).
         {
             popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection);
@@ -266,7 +262,9 @@ struct LbmContainers<D3Q19Template<typename PopulationField::Type, LbmComputeTyp
         COMPUTE_GO_AND_BACK(3, 13)
         COMPUTE_GO_AND_BACK(4, 14)
         COMPUTE_GO_AND_BACK(5, 15)
-        COMPUTE_GO_AND_BACK(6, 16)
+        // COMPUTE_GO_AND_BACK(6, 16)
+        fOut(i, 6) = static_cast<LbmStoreType>(pop_out_06);
+        fOut(i, 16) = static_cast<LbmStoreType>(pop_out_opp_06);
         COMPUTE_GO_AND_BACK(7, 17)
         COMPUTE_GO_AND_BACK(8, 18)
 
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h
new file mode 100644
index 00000000..fc4d7806
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h
@@ -0,0 +1,440 @@
+#include "CellType.h"
+#include "D3Q19.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Containter.h"
+
+#define COMPUTE_CAST(VAR) static_cast<LbmComputeType>((VAR))
+
+template <typename Lattice,
+          typename PopulationField,
+          typename LbmComputeType>
+struct LbmContainersTemplateOnly
+{
+};
+
+/**
+ * Specialization for Lattice
+ * @tparam PopulationField
+ * @tparam LbmComputeType
+ */
+template <typename PopulationField,
+          typename LbmComputeType>
+struct LbmContainersTemplateOnly<D3Q19Template<typename PopulationField::Type, LbmComputeType>,
+                                 PopulationField,
+                                 LbmComputeType>
+{
+    using LbmStoreType = typename PopulationField::Type;
+    using CellTypeField = typename PopulationField::Grid::template Field<CellType, 1>;
+    using Lattice = D3Q19Template<LbmStoreType, LbmComputeType>;
+    using Idx = typename PopulationField::Idx;
+    using Grid = typename PopulationField::Grid;
+    using Rho = typename Grid::template Field<LbmStoreType, 1>;
+    using U = typename Grid::template Field<LbmStoreType, 3>;
+
+#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid)                                                               \
+    {                                                                                                                   \
+        { /*GO*/                                                                                                        \
+            if (wallBitFlag & (uint32_t(1) << GOid)) {                                                                  \
+                /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \
+                popIn[GOid] = fin.template read<BKid>(gidx);                                                            \
+            } else {                                                                                                    \
+                popIn[GOid] = fin.template nghVal<BKx, BKy, BKz, GOid>(gidx).value;                                     \
+            }                                                                                                           \
+        }                                                                                                               \
+        { /*BK*/                                                                                                        \
+            if (wallBitFlag & (uint32_t(1) << BKid)) {                                                                  \
+                popIn[BKid] = fin.template read<GOid>(gidx);                                                            \
+            } else {                                                                                                    \
+                popIn[BKid] = fin.template nghVal<GOx, GOy, GOz, BKid>(gidx).value;                                     \
+            }                                                                                                           \
+        }                                                                                                               \
+    }
+    static inline NEON_CUDA_HOST_DEVICE auto
+    loadPopulation(Idx const&                                 gidx,
+                   const uint32_t&                            wallBitFlag,
+                   typename PopulationField::Partition const& fin,
+                   NEON_OUT LbmStoreType                      popIn[19])
+    {
+        // #pragma omp critical
+        //        {
+
+        LOADPOP(-1, 0, 0, /*  GOid */ 0, /* --- */ 1, 0, 0, /*  BKid */ 10);
+        LOADPOP(0, -1, 0, /*  GOid */ 1, /* --- */ 0, 1, 0, /*  BKid */ 11);
+        LOADPOP(0, 0, -1, /*  GOid */ 2, /* --- */ 0, 0, 1, /*  BKid */ 12);
+        LOADPOP(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /*  BKid */ 13);
+        LOADPOP(-1, 1, 0, /*  GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14);
+        LOADPOP(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /*  BKid */ 15);
+        LOADPOP(-1, 0, 1, /*  GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16);
+        LOADPOP(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /*  BKid */ 17);
+        LOADPOP(0, -1, 1, /*  GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18);
+        //  }
+        // Treat the case of the center (c[k] = {0, 0, 0,}).
+        {
+            popIn[Lattice::centerDirection] = fin(i, Lattice::centerDirection);
+        }
+    }
+#undef LOADPOP
+
+#define PULL_STREAM(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid)                                                           \
+    {                                                                                                                   \
+        { /*GO*/                                                                                                        \
+            if (wallBitFlag & (uint32_t(1) << GOid)) {                                                                  \
+                /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \
+                popIn[GOid] = fin(gidx, BKid) +                                                                         \
+                              fin.template getNghData<BKx, BKy, BKz>(gidx, BKid)();                                     \
+            } else {                                                                                                    \
+                popIn[GOid] = fin.template getNghData<BKx, BKy, BKz>(gidx, GOid)();                                     \
+            }                                                                                                           \
+        }                                                                                                               \
+        { /*BK*/                                                                                                        \
+            if (wallBitFlag & (uint32_t(1) << BKid)) {                                                                  \
+                popIn[BKid] = fin(gidx, GOid) + fin.template getNghData<GOx, GOy, GOz>(gidx, GOid)();                   \
+            } else {                                                                                                    \
+                popIn[BKid] = fin.template getNghData<GOx, GOy, GOz>(gidx, BKid)();                                     \
+            }                                                                                                           \
+        }                                                                                                               \
+    }
+
+    static inline NEON_CUDA_HOST_DEVICE auto
+    pullStream(Idx const&                                 gidx,
+               const uint32_t&                            wallBitFlag,
+               typename PopulationField::Partition const& fin,
+               NEON_OUT LbmStoreType                      popIn[19])
+    {
+        // #pragma omp critical
+        //        {
+#if 0
+        using TopologyByDirection = std::tuple<Neon::int32_3d, int, Neon::int32_3d, int>;
+        constexpr std::array<TopologyByDirection, 9> stencil{
+            std::make_tuple(Neon::int32_3d(-1, 0, 0), /*  GOid */ 0, /* --- */ Neon::int32_3d(1, 0, 0), /*  BKid */ 10),
+            std::make_tuple(Neon::int32_3d(0, -1, 0), /*  GOid */ 1, /* --- */ Neon::int32_3d(0, 1, 0), /*  BKid */ 11),
+            std::make_tuple(Neon::int32_3d(0, 0, -1), /*  GOid */ 2, /* --- */ Neon::int32_3d(0, 0, 1), /*  BKid */ 12),
+            std::make_tuple(Neon::int32_3d(-1, -1, 0), /* GOid */ 3, /* --- */ Neon::int32_3d(1, 1, 0), /*  BKid */ 13),
+            std::make_tuple(Neon::int32_3d(-1, 1, 0), /*  GOid */ 4, /* --- */ Neon::int32_3d(1, -1, 0), /* BKid */ 14),
+            std::make_tuple(Neon::int32_3d(-1, 0, -1), /* GOid */ 5, /* --- */ Neon::int32_3d(1, 0, 1), /*  BKid */ 15),
+            std::make_tuple(Neon::int32_3d(-1, 0, 1), /*  GOid */ 6, /* --- */ Neon::int32_3d(1, 0, -1), /* BKid */ 16),
+            std::make_tuple(Neon::int32_3d(0, -1, -1), /* GOid */ 7, /* --- */ Neon::int32_3d(0, 1, 1), /*  BKid */ 17),
+            std::make_tuple(Neon::int32_3d(0, -1, 1), /*  GOid */ 8, /* --- */ Neon::int32_3d(0, 1, -1), /* BKid */ 18)};
+
+
+        auto pullStream = [&]<int stencilIdx>() {
+            static_assert(stencilIdx < 9);
+            constexpr int            GOid = std::get<1>(stencil[stencilIdx]);
+            constexpr int            BKid = std::get<3>(stencil[stencilIdx]);
+            constexpr Neon::int32_3d GoOffset = std::get<0>(stencil[stencilIdx]);
+            constexpr Neon::int32_3d BkOffset = std::get<2>(stencil[stencilIdx]);
+            {
+                if (wallBitFlag & (uint32_t(1) << GOid)) {
+                    popIn[GOid] = fin(gidx, BKid) +
+                                  fin.template getNghData<BkOffset.x, BkOffset.y, BkOffset.z>(gidx, BKid)();
+                } else {
+                    popIn[GOid] = fin.template getNghData<BkOffset.x, BkOffset.y, BkOffset.z>(gidx, GOid)();
+                }
+            }
+            { /*BK*/
+                if (wallBitFlag & (uint32_t(1) << BKid)) {
+                    popIn[BKid] = fin(gidx, GOid) +
+                                  fin.template getNghData<GoOffset.x, GoOffset.y, GoOffset.z>(gidx, GOid)();
+                } else {
+                    popIn[BKid] = fin.template getNghData<GoOffset.x, GoOffset.y, GoOffset.z>(gidx, BKid)();
+                }
+            }
+        };
+        pullStream.template operator()<0>();
+        pullStream.template operator()<1>();
+        pullStream.template operator()<2>();
+        pullStream.template operator()<3>();
+        pullStream.template operator()<4>();
+        pullStream.template operator()<5>();
+        pullStream.template operator()<6>();
+        pullStream.template operator()<7>();
+        pullStream.template operator()<8>();
+#endif
+        PULL_STREAM(-1, 0, 0, /*  GOid */ 0, /* --- */ 1, 0, 0, /*  BKid */ 10);
+        PULL_STREAM(0, -1, 0, /*  GOid */ 1, /* --- */ 0, 1, 0, /*  BKid */ 11);
+        PULL_STREAM(0, 0, -1, /*  GOid */ 2, /* --- */ 0, 0, 1, /*  BKid */ 12);
+        PULL_STREAM(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /*  BKid */ 13);
+        PULL_STREAM(-1, 1, 0, /*  GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14);
+        PULL_STREAM(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /*  BKid */ 15);
+        PULL_STREAM(-1, 0, 1, /*  GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16);
+        PULL_STREAM(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /*  BKid */ 17);
+        PULL_STREAM(0, -1, 1, /*  GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18);
+
+        //  }
+        // Treat the case of the center (c[k] = {0, 0, 0,}).
+        {
+            popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection);
+        }
+    }
+#undef PULL_STREAM
+
+    static inline NEON_CUDA_HOST_DEVICE auto
+    macroscopic(const LbmStoreType       pop[Lattice::Q],
+                NEON_OUT LbmComputeType& rho,
+                NEON_OUT std::array<LbmComputeType, 3>& u)
+        -> void
+    {
+#define POP(IDX) static_cast<LbmComputeType>(pop[IDX])
+
+        const LbmComputeType X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6);
+        const LbmComputeType X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16);
+        const LbmComputeType X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18);
+
+        const LbmComputeType Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14);
+        const LbmComputeType Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18);
+
+        const LbmComputeType Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18);
+        const LbmComputeType Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17);
+
+#undef POP
+
+        rho = X_M1 + X_P1 + X_0;
+        u[0] = (X_P1 - X_M1) / rho;
+        u[1] = (Y_P1 - Y_M1) / rho;
+        u[2] = (Z_P1 - Z_M1) / rho;
+    }
+
+
+    static inline NEON_CUDA_HOST_DEVICE auto
+    collideBgkUnrolled(Idx const&                           i /*!     LbmComputeType iterator   */,
+                       const LbmStoreType                   pop[Lattice::Q],
+                       LbmComputeType const&                rho /*!   Density            */,
+                       std::array<LbmComputeType, 3> const& u /*!     Velocity           */,
+                       LbmComputeType const&                usqr /*!  Usqr               */,
+                       LbmComputeType const&                omega /*! Omega              */,
+                       typename PopulationField::Partition& fOut /*!  Population         */)
+
+        -> void
+    {
+        const LbmComputeType ck_u03 = u[0] + u[1];
+        const LbmComputeType ck_u04 = u[0] - u[1];
+        const LbmComputeType ck_u05 = u[0] + u[2];
+        const LbmComputeType ck_u06 = u[0] - u[2];
+        const LbmComputeType ck_u07 = u[1] + u[2];
+        const LbmComputeType ck_u08 = u[1] - u[2];
+
+        const LbmComputeType eq_00 = rho * (1. / 18.) * (1. - 3. * u[0] + 4.5 * u[0] * u[0] - usqr);
+        const LbmComputeType eq_01 = rho * (1. / 18.) * (1. - 3. * u[1] + 4.5 * u[1] * u[1] - usqr);
+        const LbmComputeType eq_02 = rho * (1. / 18.) * (1. - 3. * u[2] + 4.5 * u[2] * u[2] - usqr);
+        const LbmComputeType eq_03 = rho * (1. / 36.) * (1. - 3. * ck_u03 + 4.5 * ck_u03 * ck_u03 - usqr);
+        const LbmComputeType eq_04 = rho * (1. / 36.) * (1. - 3. * ck_u04 + 4.5 * ck_u04 * ck_u04 - usqr);
+        const LbmComputeType eq_05 = rho * (1. / 36.) * (1. - 3. * ck_u05 + 4.5 * ck_u05 * ck_u05 - usqr);
+        const LbmComputeType eq_06 = rho * (1. / 36.) * (1. - 3. * ck_u06 + 4.5 * ck_u06 * ck_u06 - usqr);
+        const LbmComputeType eq_07 = rho * (1. / 36.) * (1. - 3. * ck_u07 + 4.5 * ck_u07 * ck_u07 - usqr);
+        const LbmComputeType eq_08 = rho * (1. / 36.) * (1. - 3. * ck_u08 + 4.5 * ck_u08 * ck_u08 - usqr);
+
+        const LbmComputeType eqopp_00 = eq_00 + rho * (1. / 18.) * 6. * u[0];
+        const LbmComputeType eqopp_01 = eq_01 + rho * (1. / 18.) * 6. * u[1];
+        const LbmComputeType eqopp_02 = eq_02 + rho * (1. / 18.) * 6. * u[2];
+        const LbmComputeType eqopp_03 = eq_03 + rho * (1. / 36.) * 6. * ck_u03;
+        const LbmComputeType eqopp_04 = eq_04 + rho * (1. / 36.) * 6. * ck_u04;
+        const LbmComputeType eqopp_05 = eq_05 + rho * (1. / 36.) * 6. * ck_u05;
+        const LbmComputeType eqopp_06 = eq_06 + rho * (1. / 36.) * 6. * ck_u06;
+        const LbmComputeType eqopp_07 = eq_07 + rho * (1. / 36.) * 6. * ck_u07;
+        const LbmComputeType eqopp_08 = eq_08 + rho * (1. / 36.) * 6. * ck_u08;
+
+        const LbmComputeType pop_out_00 = (1. - omega) * static_cast<LbmComputeType>(pop[0]) + omega * eq_00;
+        const LbmComputeType pop_out_01 = (1. - omega) * static_cast<LbmComputeType>(pop[1]) + omega * eq_01;
+        const LbmComputeType pop_out_02 = (1. - omega) * static_cast<LbmComputeType>(pop[2]) + omega * eq_02;
+        const LbmComputeType pop_out_03 = (1. - omega) * static_cast<LbmComputeType>(pop[3]) + omega * eq_03;
+        const LbmComputeType pop_out_04 = (1. - omega) * static_cast<LbmComputeType>(pop[4]) + omega * eq_04;
+        const LbmComputeType pop_out_05 = (1. - omega) * static_cast<LbmComputeType>(pop[5]) + omega * eq_05;
+        const LbmComputeType pop_out_06 = (1. - omega) * static_cast<LbmComputeType>(pop[6]) + omega * eq_06;
+        const LbmComputeType pop_out_07 = (1. - omega) * static_cast<LbmComputeType>(pop[7]) + omega * eq_07;
+        const LbmComputeType pop_out_08 = (1. - omega) * static_cast<LbmComputeType>(pop[8]) + omega * eq_08;
+
+        const LbmComputeType pop_out_opp_00 = (1. - omega) * static_cast<LbmComputeType>(pop[10]) + omega * eqopp_00;
+        const LbmComputeType pop_out_opp_01 = (1. - omega) * static_cast<LbmComputeType>(pop[11]) + omega * eqopp_01;
+        const LbmComputeType pop_out_opp_02 = (1. - omega) * static_cast<LbmComputeType>(pop[12]) + omega * eqopp_02;
+        const LbmComputeType pop_out_opp_03 = (1. - omega) * static_cast<LbmComputeType>(pop[13]) + omega * eqopp_03;
+        const LbmComputeType pop_out_opp_04 = (1. - omega) * static_cast<LbmComputeType>(pop[14]) + omega * eqopp_04;
+        const LbmComputeType pop_out_opp_05 = (1. - omega) * static_cast<LbmComputeType>(pop[15]) + omega * eqopp_05;
+        const LbmComputeType pop_out_opp_06 = (1. - omega) * static_cast<LbmComputeType>(pop[16]) + omega * eqopp_06;
+        const LbmComputeType pop_out_opp_07 = (1. - omega) * static_cast<LbmComputeType>(pop[17]) + omega * eqopp_07;
+        const LbmComputeType pop_out_opp_08 = (1. - omega) * static_cast<LbmComputeType>(pop[18]) + omega * eqopp_08;
+
+
+#define COMPUTE_GO_AND_BACK(GOid, BKid)                                 \
+    {                                                                   \
+        fOut(i, GOid) = static_cast<LbmStoreType>(pop_out_0##GOid);     \
+        fOut(i, BKid) = static_cast<LbmStoreType>(pop_out_opp_0##GOid); \
+    }
+
+        COMPUTE_GO_AND_BACK(0, 10)
+        COMPUTE_GO_AND_BACK(1, 11)
+        COMPUTE_GO_AND_BACK(2, 12)
+        COMPUTE_GO_AND_BACK(3, 13)
+        COMPUTE_GO_AND_BACK(4, 14)
+        COMPUTE_GO_AND_BACK(5, 15)
+        COMPUTE_GO_AND_BACK(6, 16)
+        COMPUTE_GO_AND_BACK(7, 17)
+        COMPUTE_GO_AND_BACK(8, 18)
+
+#undef COMPUTE_GO_AND_BACK
+
+        {
+            const LbmComputeType eq_09 = rho * (1. / 3.) * (1. - usqr);
+            const LbmComputeType pop_out_09 = (1. - omega) *
+                                                  static_cast<LbmComputeType>(pop[Lattice::centerDirection]) +
+                                              omega * eq_09;
+            fOut(i, Lattice::centerDirection) = static_cast<LbmStoreType>(pop_out_09);
+        }
+    }
+
+    static auto
+    iteration(Neon::set::StencilSemantic stencilSemantic,
+              const PopulationField&     fInField /*!   inpout population field */,
+              const CellTypeField&       cellTypeField /*!       Cell type field     */,
+              const LbmComputeType       omega /*! LBM omega parameter */,
+              PopulationField&           fOutField /*!  output Population field */)
+        -> Neon::set::Container
+    {
+
+        Neon::set::Container container = fInField.getGrid().newContainer(
+            "LBM_iteration",
+            [&, omega](Neon::set::Loader& L) -> auto {
+                auto&       fIn = L.load(fInField,
+                                         Neon::Pattern::STENCIL, stencilSemantic);
+                auto&       fOut = L.load(fOutField);
+                const auto& cellInfoPartition = L.load(cellTypeField);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable {
+                    CellType cellInfo = cellInfoPartition(gidx, 0);
+                    if (cellInfo.classification == CellType::bulk) {
+
+                        LbmStoreType popIn[Lattice::Q];
+                        pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn);
+
+                        LbmComputeType                rho;
+                        std::array<LbmComputeType, 3> u{.0, .0, .0};
+                        macroscopic(popIn, NEON_OUT rho, NEON_OUT u);
+
+                        LbmComputeType usqr = 1.5 * (u[0] * u[0] +
+                                                     u[1] * u[1] +
+                                                     u[2] * u[2]);
+
+                        collideBgkUnrolled(gidx,
+                                           popIn,
+                                           rho, u,
+                                           usqr, omega,
+                                           NEON_OUT fOut);
+                    }
+                };
+            });
+        return container;
+    }
+
+#define COMPUTE_MASK_WALL(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid)                                           \
+    {                                                                                                         \
+        { /*GO*/                                                                                              \
+            CellType nghCellType = infoIn.template getNghData<BKx, BKy, BKz>(gidx, 0, CellType::undefined)(); \
+            if (nghCellType.classification != CellType::bulk) {                                               \
+                cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid));                  \
+            }                                                                                                 \
+        }                                                                                                     \
+        { /*BK*/                                                                                              \
+            CellType nghCellType = infoIn.template getNghData<GOx, GOy, GOz>(gidx, 0, CellType::undefined)(); \
+            if (nghCellType.classification != CellType::bulk) {                                               \
+                cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << BKid));                  \
+            }                                                                                                 \
+        }                                                                                                     \
+    }
+
+    static auto
+    computeWallNghMask(const CellTypeField& infoInField,
+                       CellTypeField&       infoOutpeField)
+
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = infoInField.getGrid().newContainer(
+            "LBM_iteration",
+            [&](Neon::set::Loader& L) -> auto {
+                auto& infoIn = L.load(infoInField,
+                                      Neon::Pattern::STENCIL);
+                auto& infoOut = L.load(infoOutpeField);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable {
+                    CellType cellType = infoIn(gidx, 0);
+                    cellType.wallNghBitflag = 0;
+
+                    if (cellType.classification == CellType::bulk) {
+                        COMPUTE_MASK_WALL(-1, 0, 0, /*  GOid */ 0, /* --- */ 1, 0, 0, /*  BKid */ 10)
+                        COMPUTE_MASK_WALL(0, -1, 0, /*  GOid */ 1, /* --- */ 0, 1, 0, /*  BKid */ 11)
+                        COMPUTE_MASK_WALL(0, 0, -1, /*  GOid */ 2, /* --- */ 0, 0, 1, /*  BKid */ 12)
+                        COMPUTE_MASK_WALL(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /*  BKid */ 13)
+                        COMPUTE_MASK_WALL(-1, 1, 0, /*  GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14)
+                        COMPUTE_MASK_WALL(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /*  BKid */ 15)
+                        COMPUTE_MASK_WALL(-1, 0, 1, /*  GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16)
+                        COMPUTE_MASK_WALL(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /*  BKid */ 17)
+                        COMPUTE_MASK_WALL(0, -1, 1, /*  GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18)
+
+                        infoOut(gidx, 0) = cellType;
+                    }
+                };
+            });
+        return container;
+    }
+#undef COMPUTE_MASK_WALL
+
+#define BC_LOAD(GOID, DKID)        \
+    popIn[GOID] = fIn(gidx, GOID); \
+    popIn[DKID] = fIn(gidx, DKID);
+
+    static auto
+    computeRhoAndU([[maybe_unused]] const PopulationField& fInField /*!   inpout population field */,
+                   const CellTypeField&                    cellTypeField /*!       Cell type field     */,
+                   Rho&                                    rhoField /*!  output Population field */,
+                   U&                                      uField /*!  output Population field */)
+
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = fInField.getGrid().newContainer(
+            "LBM_iteration",
+            [&](Neon::set::Loader& L) -> auto {
+                auto& fIn = L.load(fInField,
+                                   Neon::Pattern::STENCIL);
+                auto& rhoXpu = L.load(rhoField);
+                auto& uXpu = L.load(uField);
+
+                const auto& cellInfoPartition = L.load(cellTypeField);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable {
+                    CellType                      cellInfo = cellInfoPartition(gidx, 0);
+                    LbmComputeType                rho = 0;
+                    std::array<LbmComputeType, 3> u{.0, .0, .0};
+                    LbmStoreType                  popIn[Lattice::Q];
+
+                    if (cellInfo.classification == CellType::bulk) {
+                        pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn);
+                        macroscopic(popIn, NEON_OUT rho, NEON_OUT u);
+                    } else {
+                        if (cellInfo.classification == CellType::movingWall) {
+                            BC_LOAD(0, 10)
+                            BC_LOAD(1, 11)
+                            BC_LOAD(2, 12)
+                            BC_LOAD(3, 13)
+                            BC_LOAD(4, 14)
+                            BC_LOAD(5, 15)
+                            BC_LOAD(6, 16)
+                            BC_LOAD(7, 17)
+                            BC_LOAD(8, 18)
+                            popIn[9] = fIn(gidx, 9);
+
+                            rho = 1.0;
+                            u = std::array<LbmComputeType, 3>{COMPUTE_CAST(popIn[0]) / COMPUTE_CAST(6. * 1. / 18.),
+                                                              COMPUTE_CAST(popIn[1]) / COMPUTE_CAST(6. * 1. / 18.),
+                                                              COMPUTE_CAST(popIn[2]) / COMPUTE_CAST(6. * 1. / 18.)};
+                        }
+                    }
+
+                    rhoXpu(gidx, 0) = static_cast<LbmStoreType>(rho);
+                    uXpu(gidx, 0) = static_cast<LbmStoreType>(u[0]);
+                    uXpu(gidx, 1) = static_cast<LbmStoreType>(u[1]);
+                    uXpu(gidx, 2) = static_cast<LbmStoreType>(u[2]);
+                };
+            });
+        return container;
+    }
+};
+
+#undef COMPUTE_CAST
\ No newline at end of file
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
index 29c7573d..e91055f9 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
@@ -2,8 +2,8 @@
 #include "D3Q19.h"
 #include "Neon/domain/bGrid.h"
 #include "Neon/domain/dGrid.h"
-#include "Neon/domain/eGrid.h"
 #include "Neon/domain/details/dGridSoA/dGridSoA.h"
+#include "Neon/domain/eGrid.h"
 
 #include "CellType.h"
 #include "LbmIteration.h"
@@ -314,6 +314,31 @@ auto run(Config& config,
     if (config.gridType == "bGrid") {
         return details::runFilterStoreType<Neon::bGrid>(config, report);
     }
+    if (config.gridType == "bGrid_4_4_4") {
+        using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>;
+        using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+        return details::runFilterStoreType<Grid>(config, report);
+    }
+    if (config.gridType == "bGrid_32_8_4") {
+        using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>;
+        using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+        return details::runFilterStoreType<Grid>(config, report);
+    }
+    if (config.gridType == "bGrid_32_8_4") {
+        using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 4, 8>;
+        using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+        return details::runFilterStoreType<Grid>(config, report);
+    }
+    if (config.gridType == "bGrid_32_2_8") {
+        using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>;
+        using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+        return details::runFilterStoreType<Grid>(config, report);
+    }
+    if (config.gridType == "bGrid_32_8_2") {
+        using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>;
+        using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+        return details::runFilterStoreType<Grid>(config, report);
+    }
     if (config.gridType == "dGridSoA") {
         return details::runFilterStoreType<Neon::domain::details::dGridSoA::dGridSoA>(config, report);
     }
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h
index 421a3f27..f760adb5 100644
--- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h
@@ -4,71 +4,77 @@ namespace Neon::domain::details::dGridSoA {
 
 NEON_CUDA_HOST_DEVICE inline auto
 dSpanSoA::setAndValidate(Idx&            idx,
-                      const uint32_t& x,
-                      const uint32_t& y,
-                      const uint32_t& z)
+                         const uint32_t& x,
+                         const uint32_t& y,
+                         const uint32_t& z)
     const -> bool
 {
-    bool res = false;
     idx.setLocation().x = int(x);
     idx.setLocation().y = int(y);
     idx.setLocation().z = int(z);
 
-    if (idx.getLocation() < mDim) {
-        res = true;
-    }
+    bool  isValid = idx.getLocation() < mDim;
 
     switch (mDataView) {
         case Neon::DataView::STANDARD: {
             idx.setLocation().z += mZHaloRadius;
-            idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
-            return res;
+            idx.setOffset() = idx.getLocation().x +
+                              idx.getLocation().y * mDim.x +
+                              idx.getLocation().z * mDim.x * mDim.y;
+            break ;
         }
         case Neon::DataView::INTERNAL: {
             idx.setLocation().z += mZHaloRadius + mZBoundaryRadius;
-            idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
-            return res;
+            idx.setOffset() = idx.getLocation().x +
+                              idx.getLocation().y * mDim.x +
+                              idx.getLocation().z * mDim.x * mDim.y;
+            break ;
         }
         case Neon::DataView::BOUNDARY: {
-
             idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius
-                               ? 0
-                               : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */);
+                                       ? 0
+                                       : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */);
             idx.setLocation().z += mZHaloRadius;
-            idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y;
-            return res;
+            idx.setOffset() = idx.getLocation().x +
+                              idx.getLocation().y * mDim.x +
+                              idx.getLocation().z * mDim.x * mDim.y;
+            break ;
         }
         default: {
         }
     }
-    return false;
+    return isValid;
 }
 
-NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDataView()
+NEON_CUDA_HOST_DEVICE inline auto
+dSpanSoA::helpGetDataView()
     const -> Neon::DataView const&
 {
     return mDataView;
 }
 
-NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZHaloRadius()
+NEON_CUDA_HOST_DEVICE inline auto
+dSpanSoA::helpGetZHaloRadius()
     const -> int const&
 {
     return mZHaloRadius;
 }
 
-NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZBoundaryRadius()
+NEON_CUDA_HOST_DEVICE inline auto
+dSpanSoA::helpGetZBoundaryRadius()
     const -> int const&
 {
     return mZBoundaryRadius;
 }
 
-NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDim()
+NEON_CUDA_HOST_DEVICE inline auto
+dSpanSoA::helpGetDim()
     const -> Neon::index_3d const&
 {
     return mDim;
 }
 
-NEON_CUDA_HOST_DEVICE inline auto  dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) ->void
+NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) -> void
 {
     mDataView = dspan.helpGetDataView();
     mZHaloRadius = dspan.helpGetZHaloRadius();
@@ -77,4 +83,4 @@ NEON_CUDA_HOST_DEVICE inline auto  dSpanSoA::helpInit(Neon::domain::details::dGr
 }
 
 
-}  // namespace Neon::domain::details::dGrid
\ No newline at end of file
+}  // namespace Neon::domain::details::dGridSoA
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-map/src/runHelper.h b/libNeonDomain/tests/domain-map/src/runHelper.h
index 53ea8681..593e31c2 100644
--- a/libNeonDomain/tests/domain-map/src/runHelper.h
+++ b/libNeonDomain/tests/domain-map/src/runHelper.h
@@ -31,7 +31,7 @@ void runAllTestConfiguration(
         nGpuTest.push_back(i);
     }
     // std::vector<int> nGpuTest{2,4,6,8};
-    std::vector<int> cardinalityTest{1};
+    std::vector<int> cardinalityTest{1,3,19};
 
     std::vector<Neon::index_3d> dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}};
     std::vector<Neon::Runtime>  runtimeE{Neon::Runtime::openmp};
@@ -95,6 +95,7 @@ void runAllTestConfiguration(
     }
 }
 
+#if 0
 
 template <typename G, typename T, int C>
 void runOneTestConfiguration(const std::string&                      gname,
@@ -144,3 +145,4 @@ void runOneTestConfiguration(const std::string&                      gname,
         }
     }
 }
+#endif
\ No newline at end of file

From 3a36f0c81e830a170712227b463a8c4d7631cf26 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 28 Jun 2023 12:55:22 -0400
Subject: [PATCH 18/25] Adding dGridSoA to the stencil tests

---
 .../Neon/domain/details/dGridSoA/dPartitionSoA.h      |  2 --
 .../Neon/domain/tools/gridTransformer/tField.h        |  1 +
 libNeonDomain/tests/domain-stencil/src/gtests.cpp     | 11 ++++++++++-
 libNeonDomain/tests/domain-stencil/src/runHelper.h    |  2 +-
 libNeonDomain/tests/domain-stencil/src/stencil.cu     |  1 +
 libNeonDomain/tests/domain-stencil/src/stencil.h      |  3 ++-
 6 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
index 1cdd75db..62fdc9a4 100644
--- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
@@ -353,13 +353,11 @@ class dPartitionSoA
     T* NEON_RESTRICT      mMem;
     Neon::index_3d        mDim;
     int                   mZHaloRadius;
-    int                   mZBoundaryRadius;
     Pitch                 mPitch;
     int                   mPrtID;
     Neon::index_3d        mOrigin;
     int                   mCardinality;
     Neon::index_3d        mFullGridSize;
-    bool                  mPeriodicZ;
     NghIdx* NEON_RESTRICT mStencil;
 };
 
diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h
index c9ca59b9..a1b4c90d 100644
--- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h
+++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h
@@ -26,6 +26,7 @@ class tField : public Neon::domain::interface::FieldBaseTemplate<T,
     using Partition = typename GridTransformation::template Partition<T, C>;
     using Idx = typename Partition::Idx;
     using NghIdx = typename Partition::NghIdx;  // for compatibility with eGrid
+    using NghData = typename Partition::NghData;  // for compatibility with eGrid
 
    private:
     using FoundationGrid = typename GridTransformation::FoundationGrid;
diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp
index ec6f892a..15816da3 100644
--- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp
@@ -22,7 +22,7 @@ TEST(domain_stencil, eGrid)
                             1);
 }
 
-TEST(domain_stencil, bGri )
+TEST(domain_stencil, bGri)
 {
     int nGpus = 5;
     using Type = int64_t;
@@ -31,6 +31,15 @@ TEST(domain_stencil, bGri )
                             1);
 }
 
+TEST(domain_stencil, dGridSoA)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(map::run<Neon::dGridSoA, Type, 0>),
+                            nGpus,
+                            1);
+}
+
 int main(int argc, char** argv)
 {
     ::testing::InitGoogleTest(&argc, argv);
diff --git a/libNeonDomain/tests/domain-stencil/src/runHelper.h b/libNeonDomain/tests/domain-stencil/src/runHelper.h
index e8f286ae..16cefb0f 100644
--- a/libNeonDomain/tests/domain-stencil/src/runHelper.h
+++ b/libNeonDomain/tests/domain-stencil/src/runHelper.h
@@ -33,7 +33,7 @@ void runAllTestConfiguration(
     // std::vector<int> nGpuTest{2,4,6,8};
     std::vector<int> cardinalityTest{1};
 
-    std::vector<Neon::index_3d> dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}};
+    std::vector<Neon::index_3d> dimTest{{10, 17, 90}, {1, 1, 100}, {17, 1, 77}};
     std::vector<Neon::Runtime>  runtimeE{Neon::Runtime::openmp};
     if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) {
         runtimeE.push_back(Neon::Runtime::stream);
diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu
index a86f1def..d0f19c67 100644
--- a/libNeonDomain/tests/domain-stencil/src/stencil.cu
+++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu
@@ -203,6 +203,7 @@ auto run(TestData<G, T, C>& data) -> void
 template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto run<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
 
 }  // namespace map
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.h b/libNeonDomain/tests/domain-stencil/src/stencil.h
index a35d8011..7d74196a 100644
--- a/libNeonDomain/tests/domain-stencil/src/stencil.h
+++ b/libNeonDomain/tests/domain-stencil/src/stencil.h
@@ -15,5 +15,6 @@ auto run(TestData<G, T, C>& data) -> void;
 
 extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
-
+extern template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 }  // namespace map

From a49b27aeaeb83dfdd1ed47debba0fed99221a834 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 29 Jun 2023 11:27:58 -0400
Subject: [PATCH 19/25] WIP

---
 .../Neon/domain/details/bGrid/bPartition.h    |  15 ++
 .../domain/details/bGrid/bPartition_imp.h     |  34 +++-
 .../domain/details/dGridSoA/dPartitionSoA.h   |   1 +
 .../Neon/domain/details/eGrid/ePartition.h    |  13 ++
 .../domain/details/eGrid/ePartition_imp.h     |  87 ++++++----
 .../tests/domain-stencil/src/gtests.cpp       |  52 +++++-
 .../tests/domain-stencil/src/stencil.cu       | 158 +++++++++++++-----
 .../tests/domain-stencil/src/stencil.h        |  20 ++-
 8 files changed, 291 insertions(+), 89 deletions(-)

diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
index 73ccb914..a03af559 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
@@ -98,6 +98,19 @@ class bPartition
                T          defaultValue)
         const -> NghData;
 
+    template <int xOff,
+              int yOff,
+              int zOff,
+              typename LambdaVALID,
+              typename LambdaNOTValid = void*>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx&     gidx,
+               int            card,
+               LambdaVALID    funIfValid,
+               LambdaNOTValid funIfNOTValid = nullptr)
+        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T> &&( std::is_invocable_v<LambdaNOTValid, T> || std::is_same_v<LambdaNOTValid, void*>), void>;
+
+
     /**
      * Gets the global coordinates of the cartesian point.
      */
@@ -134,6 +147,8 @@ class bPartition
     helpGetNghIdx(const Idx& idx)
         const -> Idx;
 
+
+
     int                                             mCardinality;
     T*                                              mMem;
     NghIdx const* NEON_RESTRICT                     mStencilNghIndex;
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
index dc4c5880..5fa6f260 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
@@ -45,10 +45,10 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     location.x += gidx.mInDataBlockIdx.x;
     location.y += gidx.mInDataBlockIdx.y;
     location.z += gidx.mInDataBlockIdx.z;
-    if constexpr (SBlock::isMultiResMode){
+    if constexpr (SBlock::isMultiResMode) {
         return location * mMultiResDiscreteIdxSpacing;
     }
-    return location ;
+    return location;
 }
 
 template <typename T, int C, typename SBlock>
@@ -354,4 +354,34 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     result.set(value, true);
     return result;
 }
+
+template <typename T, int C, typename SBlock>
+
+template <int xOff,
+          int yOff,
+          int zOff,
+          typename LambdaVALID,
+          typename LambdaNOTValid>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
+    getNghData(const Idx&     gidx,
+               int            card,
+               LambdaVALID    funIfValid,
+               LambdaNOTValid funIfNOTValid)
+        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T> && (std::is_invocable_v<LambdaNOTValid, T> || std::is_same_v<LambdaNOTValid, void*>), void>
+{
+    NghData result;
+    bIndex  nghIdx = helpGetNghIdx<xOff, yOff, zOff>(gidx);
+    auto [isValid, pitch] = helpNghPitch(nghIdx, card);
+
+    if (isValid) {
+        auto const& value = mMem[pitch];
+        funIfValid(value);
+        return;
+    }
+
+    if constexpr (!std::is_same_v<LambdaNOTValid, void*>) {
+        funIfNOTValid();
+    }
+    return;
+}
 }  // namespace Neon::domain::details::bGrid
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
index 62fdc9a4..0572302b 100644
--- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
@@ -20,6 +20,7 @@ class dPartitionSoA
     using NghData = Neon::domain::NghData<T>;
     using Pitch = uint32_4d;
     using NghIdx = int8_3d;
+    using Type = T;
 
     dPartitionSoA()
     {
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
index 62b75981..05f3101b 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
@@ -188,6 +188,19 @@ class ePartition
                int card,
                T defaultValue)
         const -> NghData;
+
+    template <int xOff,
+              int yOff,
+              int zOff,
+              typename LambdaVALID,
+              typename LambdaNOTValid = void*>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx&     gidx,
+               int            card,
+               LambdaVALID    funIfValid,
+               LambdaNOTValid funIfNOTValid = nullptr)
+        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T> &&( std::is_invocable_v<LambdaNOTValid, T> || std::is_same_v<LambdaNOTValid, void*>), void>;
+
     /**
      * Check is the
      * @tparam dataView_ta
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h
index 0063ee9e..8565cdc1 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h
@@ -37,34 +37,34 @@ ePartition<T, C>::cardinality() const
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::operator()(eIndex eId, int cardinalityIdx) const
+ePartition<T, C>::operator()(eIndex gidx, int cardinalityIdx) const
     -> T
 {
-    Offset jump = getOffset(eId, cardinalityIdx);
+    Offset jump = getOffset(gidx, cardinalityIdx);
     return mMem[jump];
 }
 
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::operator()(eIndex eId, int cardinalityIdx) -> T&
+ePartition<T, C>::operator()(eIndex gidx, int cardinalityIdx) -> T&
 {
-    Offset jump = getOffset(eId, cardinalityIdx);
+    Offset jump = getOffset(gidx, cardinalityIdx);
     return mMem[jump];
 }
 
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getNghData(eIndex eId,
+ePartition<T, C>::getNghData(eIndex gidx,
                              NghIdx nghIdx,
                              int    card)
     const -> NghData
 {
-    eIndex     eIdxNgh;
-    const bool isValidNeighbour = isValidNgh(eId, nghIdx, eIdxNgh);
+    eIndex     gidxxNgh;
+    const bool isValidNeighbour = isValidNgh(gidx, nghIdx, gidxxNgh);
     if (isValidNeighbour) {
-        T val = this->operator()(eIdxNgh, card);
+        T val = this->operator()(gidxxNgh, card);
         return NghData(val, isValidNeighbour);
     }
     return NghData(isValidNeighbour);
@@ -73,7 +73,7 @@ ePartition<T, C>::getNghData(eIndex eId,
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getNghData(eIndex               eId,
+ePartition<T, C>::getNghData(eIndex               gidx,
                              const Neon::int8_3d& ngh3dIdx,
                              int                  card)
     const -> NghData
@@ -82,7 +82,7 @@ ePartition<T, C>::getNghData(eIndex               eId,
                      (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch +
                      (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
     NghIdx  nghIdx = mStencil3dTo1dOffset[tablePithc];
-    NghData res = getNghData(eId, nghIdx, card);
+    NghData res = getNghData(gidx, nghIdx, card);
 
     return res;
 }
@@ -91,15 +91,15 @@ template <typename T,
           int C>
 template <int xOff, int yOff, int zOff>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getNghData(eIndex               eId,
-                             int                  card)
+ePartition<T, C>::getNghData(eIndex gidx,
+                             int    card)
     const -> NghData
 {
     int tablePithc = (xOff + mStencilRadius) +
                      (yOff + mStencilRadius) * mStencilTableYPitch +
                      (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
     NghIdx  nghIdx = mStencil3dTo1dOffset[tablePithc];
-    NghData res = getNghData(eId, nghIdx, card);
+    NghData res = getNghData(gidx, nghIdx, card);
 
     return res;
 }
@@ -108,37 +108,66 @@ template <typename T,
           int C>
 template <int xOff, int yOff, int zOff>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getNghData(eIndex               eId,
-                             int                  card,
-                             T defaultVal)
+ePartition<T, C>::getNghData(eIndex gidx,
+                             int    card,
+                             T      defaultVal)
     const -> NghData
 {
     int tablePithc = (xOff + mStencilRadius) +
                      (yOff + mStencilRadius) * mStencilTableYPitch +
                      (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
     NghIdx  nghIdx = mStencil3dTo1dOffset[tablePithc];
-    NghData res = getNghData(eId, nghIdx, card);
+    NghData res = getNghData(gidx, nghIdx, card);
     if (!res.isValid()) {
         res.set(defaultVal, false);
     }
     return res;
 }
 
+template <typename T,
+          int C>
+template <int xOff,
+          int yOff,
+          int zOff,
+          typename LambdaVALID,
+          typename LambdaNOTValid>
+NEON_CUDA_HOST_DEVICE inline auto
+ePartition<T, C>::getNghData(const Idx&     gidx,
+                             int            card,
+                             LambdaVALID    funIfValid,
+                             LambdaNOTValid funIfNOTValid)
+    const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T> && (std::is_invocable_v<LambdaNOTValid, T> || std::is_same_v<LambdaNOTValid, void*>), void>
+{
+    int tablePithc = (xOff + mStencilRadius) +
+                     (yOff + mStencilRadius) * mStencilTableYPitch +
+                     (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
+    NghIdx  nghIdx = mStencil3dTo1dOffset[tablePithc];
+    NghData res = getNghData(gidx, nghIdx, card);
+    if (res.isValid()) {
+        funIfValid(res.getData());
+        return;
+    }
+    if constexpr (!std::is_same_v<LambdaNOTValid, void*>) {
+        funIfNOTValid();
+    }
+    return;
+}
+
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getNghIndex(eIndex               eId,
+ePartition<T, C>::getNghIndex(eIndex               gidx,
                               const Neon::int8_3d& ngh3dIdx,
-                              eIndex&              eIdxNgh) const -> bool
+                              eIndex&              gidxxNgh) const -> bool
 {
     int tablePithc = (ngh3dIdx.x + mStencilRadius) +
                      (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch +
                      (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
     NghIdx     nghIdx = mStencil3dTo1dOffset[tablePithc];
     eIndex     tmpEIdxNgh;
-    const bool isValidNeighbour = isValidNgh(eId, nghIdx, tmpEIdxNgh);
+    const bool isValidNeighbour = isValidNgh(gidx, nghIdx, tmpEIdxNgh);
     if (isValidNeighbour) {
-        eIdxNgh = tmpEIdxNgh;
+        gidxxNgh = tmpEIdxNgh;
     }
     return isValidNeighbour;
 }
@@ -146,17 +175,17 @@ ePartition<T, C>::getNghIndex(eIndex               eId,
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::isValidNgh(eIndex  eId,
+ePartition<T, C>::isValidNgh(eIndex  gidx,
                              NghIdx  nghIdx,
                              eIndex& neighbourIdx) const
     -> bool
 {
-    const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + eId.helpGet();
+    const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + gidx.helpGet();
     neighbourIdx.helpSet() = NEON_CUDA_CONST_LOAD((mConnectivity + connectivityJumo));
     const bool isValidNeighbour = (neighbourIdx.mIdx > -1);
-    //    printf("(prtId %d) getNghData id %d eIdxNgh %d connectivityJumo %d\n",
+    //    printf("(prtId %d) getNghData id %d gidxxNgh %d connectivityJumo %d\n",
     //           mPrtID,
-    //           eId.mIdx, neighbourIdx.mIdx, connectivityJumo);
+    //           gidx.mIdx, neighbourIdx.mIdx, connectivityJumo);
     return isValidNeighbour;
 }
 
@@ -201,20 +230,20 @@ ePartition<T, C>::ePartition(int             prtId,
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE auto
-ePartition<T, C>::pointer(eIndex eId, int cardinalityIdx) const
+ePartition<T, C>::pointer(eIndex gidx, int cardinalityIdx) const
     -> const Type*
 {
-    Offset jump = getOffset(eId, cardinalityIdx);
+    Offset jump = getOffset(gidx, cardinalityIdx);
     return mMem + jump;
 }
 
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getOffset(eIndex eId, int cardinalityIdx) const
+ePartition<T, C>::getOffset(eIndex gidx, int cardinalityIdx) const
     -> Offset
 {
-    return Offset(eId.helpGet() * mPitch.x + cardinalityIdx * mPitch.y);
+    return Offset(gidx.helpGet() * mPitch.x + cardinalityIdx * mPitch.y);
 }
 
 template <typename T,
diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp
index 15816da3..9fed3354 100644
--- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp
@@ -4,38 +4,74 @@
 #include "runHelper.h"
 #include "stencil.h"
 
-TEST(domain_stencil, dGrid)
+TEST(domain_stencil, dGrid_NoTemplate)
 {
     int nGpus = 3;
     using Type = int64_t;
-    runAllTestConfiguration(std::function(map::run<Neon::dGrid, Type, 0>),
+    runAllTestConfiguration(std::function(map::runNoTemplate<Neon::dGrid, Type, 0>),
                             nGpus,
                             1);
 }
 
-TEST(domain_stencil, eGrid)
+TEST(domain_stencil, eGrid_NoTemplate)
 {
     int nGpus = 3;
     using Type = int64_t;
-    runAllTestConfiguration(std::function(map::run<Neon::eGrid, Type, 0>),
+    runAllTestConfiguration(std::function(map::runNoTemplate<Neon::eGrid, Type, 0>),
                             nGpus,
                             1);
 }
 
-TEST(domain_stencil, bGri)
+TEST(domain_stencil, bGri_NoTemplate)
 {
     int nGpus = 5;
     using Type = int64_t;
-    runAllTestConfiguration(std::function(map::run<Neon::bGrid, Type, 0>),
+    runAllTestConfiguration(std::function(map::runNoTemplate<Neon::bGrid, Type, 0>),
                             nGpus,
                             1);
 }
 
-TEST(domain_stencil, dGridSoA)
+TEST(domain_stencil, dGridSoA_NoTemplate)
 {
     int nGpus = 5;
     using Type = int64_t;
-    runAllTestConfiguration(std::function(map::run<Neon::dGridSoA, Type, 0>),
+    runAllTestConfiguration(std::function(map::runNoTemplate<Neon::dGridSoA, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_stencil, dGrid_Template)
+{
+    int nGpus = 3;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(map::runTemplate<Neon::dGrid, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_stencil, eGrid_Template)
+{
+    int nGpus = 3;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(map::runTemplate<Neon::eGrid, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_stencil, bGri_Template)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(map::runTemplate<Neon::bGrid, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_stencil, dGridSoA_Template)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(map::runTemplate<Neon::dGridSoA, Type, 0>),
                             nGpus,
                             1);
 }
diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu
index d0f19c67..926153fa 100644
--- a/libNeonDomain/tests/domain-stencil/src/stencil.cu
+++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu
@@ -9,8 +9,8 @@
 namespace map {
 
 template <typename Field>
-auto stencilContainer_laplace(const Field& filedA,
-                              Field&       fieldB)
+auto laplaceNoTemplate(const Field& filedA,
+                       Field&       fieldB)
     -> Neon::set::Container
 {
     const auto& grid = filedA.getGrid();
@@ -59,15 +59,22 @@ static constexpr std::array<const Ngh3DIdx, 6> stencil{
     Ngh3DIdx(0, 0, 1),
     Ngh3DIdx(0, 0, -1)};
 
-template<int stencilIdx, typename IDX, typename Field>
-inline auto viaTemplate (const IDX& idx, int i, const Field& a, int& partial, int& count){
-        a.template getNghData<stencil[stencilIdx].x,
-                              stencil[stencilIdx].y,
-                              stencil[stencilIdx].z>(idx, i,
-                                                     [&](typename Field::Type const& val) {
-                                                         partial += val;
-                                                         count++;
-                                                     });
+template <int X, int Y, int Z, typename IDX, typename Partition, typename Partial>
+NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Partition& a, Partial& partial, int& count)
+{
+    Neon::index_3d direction(X, Y, Z);
+    auto           nghData = a.getNghData(idx, direction.newType<int8_t>(), i);
+    if (nghData.isValid()) {
+        partial += nghData.getData();
+        count++;
+    }
+    //    a.template getNghData<stencil[stencilIdx].x,
+    //                          stencil[stencilIdx].y,
+    //                          stencil[stencilIdx].z>(idx, i,
+    //                                                 [&](typename Partition::Type const& val) {
+    //                                                     partial += val;
+    //                                                     count++;
+    //                                                 });
 };
 
 template <typename Field>
@@ -88,36 +95,18 @@ auto stencilContainerLaplaceTemplate(const Field& filedA,
                     // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val);
                     typename Field::Type partial = 0;
                     int                  count = 0;
+                    using Ngh3DIdx = Neon::int8_3d;
 
-                    constexpr std::array<const Ngh3DIdx, 6> stencil{
-                        Ngh3DIdx(1, 0, 0),
-                        Ngh3DIdx(-1, 0, 0),
-                        Ngh3DIdx(0, 1, 0),
-                        Ngh3DIdx(0, -1, 0),
-                        Ngh3DIdx(0, 0, 1),
-                        Ngh3DIdx(0, 0, -1)};
 
-#if 0
-                    auto viaTemplate = [&]<int stencilIdx>() {
-                        if constexpr (std::is_same_v<typename Field::Grid, Neon::dGrid>) {
-                            a.template getNghData<stencil[stencilIdx].x,
-                                                  stencil[stencilIdx].y,
-                                                  stencil[stencilIdx].z>(idx, i,
-                                                                         [&](Field::Type const& val) {
-                                                                             partial += val;
-                                                                             count++;
-                                                                         });
-                        }
-                    };
-#endif
-                    viaTemplate<0>(idx, i, a, partial, count);
-                    viaTemplate<1>(idx, i, a, partial, count);
-                    viaTemplate<2>(idx, i, a, partial, count);
-                    viaTemplate<3>(idx, i, a, partial, count);
-                    viaTemplate<4>(idx, i, a, partial, count);
-                    viaTemplate<5>(idx, i, a, partial, count);
+                    viaTemplate<1, 0, 0>(idx, i, a, partial, count);
+                    viaTemplate<-1, 0, 0>(idx, i, a, partial, count);
+                    viaTemplate<0, 1, 0>(idx, i, a, partial, count);
+                    viaTemplate<0, -1, 0>(idx, i, a, partial, count);
+                    viaTemplate<0, 0, 1>(idx, i, a, partial, count);
+                    viaTemplate<0, 0, -1>(idx, i, a, partial, count);
 
-                    b(idx, i) = a(idx, i) - count * partial;
+
+                    b(idx, i) = a(idx, i) - count * partial ;
                 }
             };
         });
@@ -126,7 +115,82 @@ auto stencilContainerLaplaceTemplate(const Field& filedA,
 using namespace Neon::domain::tool::testing;
 
 template <typename G, typename T, int C>
-auto run(TestData<G, T, C>& data) -> void
+auto runNoTemplate(TestData<G, T, C>& data) -> void
+{
+
+    using Type = typename TestData<G, T, C>::Type;
+    auto&             grid = data.getGrid();
+    const std::string appName = TestInformation::fullName(grid.getImplementationName());
+    const int         maxIters = 1;
+
+    NEON_INFO(grid.toString());
+
+    // data.resetValuesToLinear(1, 100);
+    data.resetValuesToMasked(1);
+
+    {  // NEON
+        const Neon::index_3d        dim = grid.getDimension();
+        std::vector<Neon::index_3d> elements;
+        auto                        bk = grid.getBackend();
+        auto&                       X = data.getField(FieldNames::X);
+        auto&                       Y = data.getField(FieldNames::Y);
+        for (int iter = maxIters; iter > 0; iter--) {
+            bk.sync(Neon::Backend::mainStreamIdx);
+            X.newHaloUpdate(Neon::set::StencilSemantic::standard,
+                            Neon::set::TransferMode::put,
+                            Neon::Execution::device)
+                .run(Neon::Backend::mainStreamIdx);
+
+            bk.sync(Neon::Backend::mainStreamIdx);
+            laplaceNoTemplate(X, Y).run(Neon::Backend::mainStreamIdx);
+
+            bk.sync(Neon::Backend::mainStreamIdx);
+            Y.newHaloUpdate(Neon::set::StencilSemantic::standard,
+                            Neon::set::TransferMode::get,
+                            Neon::Execution::device)
+                .run(Neon::Backend::mainStreamIdx);
+
+            bk.sync(Neon::Backend::mainStreamIdx);
+            laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx);
+        }
+        data.getBackend().sync(0);
+    }
+
+    {  // Golden data
+        auto& X = data.getIODomain(FieldNames::X);
+        auto& Y = data.getIODomain(FieldNames::Y);
+        for (int iter = maxIters; iter > 0; iter--) {
+            data.laplace(X, Y);
+            data.laplace(Y, X);
+        }
+    }
+
+    data.updateHostData();
+
+    data.getField(FieldNames::X).ioToVtk("X", "X", true);
+    //    data.getField(FieldNames::Y).ioToVtk("Y", "Y", false);
+    //    data.getField(FieldNames::Z).ioToVtk("Z", "Z", false);
+    //
+    data.getIODomain(FieldNames::X).ioToVti("X_", "X_");
+    //    data.getField(FieldNames::Y).ioVtiAllocator("Y_");
+    //    data.getField(FieldNames::Z).ioVtiAllocator("Z_");
+
+    bool isOk = data.compare(FieldNames::X);
+    isOk = data.compare(FieldNames::Y);
+    if (!isOk) {
+        auto flagField = data.compareAndGetField(FieldNames::X);
+        flagField.ioToVti("X_diffFlag", "X_diffFlag");
+        flagField = data.compareAndGetField(FieldNames::Y);
+        flagField.ioToVti("Y_diffFlag", "Y_diffFlag");
+    }
+    ASSERT_TRUE(isOk);
+    if (!isOk) {
+        exit(99);
+    }
+}
+
+template <typename G, typename T, int C>
+auto runTemplate(TestData<G, T, C>& data) -> void
 {
 
     using Type = typename TestData<G, T, C>::Type;
@@ -153,7 +217,7 @@ auto run(TestData<G, T, C>& data) -> void
                 .run(Neon::Backend::mainStreamIdx);
 
             bk.sync(Neon::Backend::mainStreamIdx);
-            stencilContainer_laplace(X, Y).run(Neon::Backend::mainStreamIdx);
+            stencilContainerLaplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx);
 
             bk.sync(Neon::Backend::mainStreamIdx);
             Y.newHaloUpdate(Neon::set::StencilSemantic::standard,
@@ -162,7 +226,7 @@ auto run(TestData<G, T, C>& data) -> void
                 .run(Neon::Backend::mainStreamIdx);
 
             bk.sync(Neon::Backend::mainStreamIdx);
-            stencilContainer_laplace(Y, X).run(Neon::Backend::mainStreamIdx);
+            laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx);
         }
         data.getBackend().sync(0);
     }
@@ -200,10 +264,14 @@ auto run(TestData<G, T, C>& data) -> void
     }
 }
 
-template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
-template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
-template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
-template auto run<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
+template auto runNoTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+template auto runNoTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+template auto runNoTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto runNoTemplate<Neon::dGridSoA, int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
+template auto runTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::dGridSoA, int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
 }  // namespace map
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.h b/libNeonDomain/tests/domain-stencil/src/stencil.h
index 7d74196a..456f5f01 100644
--- a/libNeonDomain/tests/domain-stencil/src/stencil.h
+++ b/libNeonDomain/tests/domain-stencil/src/stencil.h
@@ -11,10 +11,20 @@ namespace map {
 using namespace Neon::domain::tool::testing;
 
 template <typename G, typename T, int C>
-auto run(TestData<G, T, C>& data) -> void;
+auto runNoTemplate(TestData<G, T, C>& data) -> void;
+
+template <typename G, typename T, int C>
+auto runTemplate(TestData<G, T, C>& data) -> void;
+
+
+extern template auto runNoTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+extern template auto runNoTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+extern template auto runNoTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto runNoTemplate<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
+
+extern template auto runTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
-extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
-extern template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
-extern template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
-extern template auto run<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 }  // namespace map

From fde014d67b87529c5ae18e297b307e4381b4bd65 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 29 Jun 2023 11:33:43 -0400
Subject: [PATCH 20/25] Extending unit test for stencil to dGridSoA

---
 .../tests/domain-stencil/src/stencil.cu       | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu
index 926153fa..14ae82b1 100644
--- a/libNeonDomain/tests/domain-stencil/src/stencil.cu
+++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu
@@ -59,22 +59,22 @@ static constexpr std::array<const Ngh3DIdx, 6> stencil{
     Ngh3DIdx(0, 0, 1),
     Ngh3DIdx(0, 0, -1)};
 
-template <int X, int Y, int Z, typename IDX, typename Partition, typename Partial>
+template <int sIdx, typename IDX, typename Partition, typename Partial>
 NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Partition& a, Partial& partial, int& count)
 {
-    Neon::index_3d direction(X, Y, Z);
-    auto           nghData = a.getNghData(idx, direction.newType<int8_t>(), i);
-    if (nghData.isValid()) {
-        partial += nghData.getData();
-        count++;
-    }
-    //    a.template getNghData<stencil[stencilIdx].x,
-    //                          stencil[stencilIdx].y,
-    //                          stencil[stencilIdx].z>(idx, i,
-    //                                                 [&](typename Partition::Type const& val) {
-    //                                                     partial += val;
-    //                                                     count++;
-    //                                                 });
+    //    Neon::index_3d direction(X, Y, Z);
+    //    auto           nghData = a.getNghData(idx, direction.newType<int8_t>(), i);
+    //    if (nghData.isValid()) {
+    //        partial += nghData.getData();
+    //        count++;
+    //    }
+    a.template getNghData<stencil[sIdx].x,
+                          stencil[sIdx].y,
+                          stencil[sIdx].z>(idx, i,
+                                           [&](typename Partition::Type const& val) {
+                                               partial += val;
+                                               count++;
+                                           });
 };
 
 template <typename Field>
@@ -98,15 +98,15 @@ auto stencilContainerLaplaceTemplate(const Field& filedA,
                     using Ngh3DIdx = Neon::int8_3d;
 
 
-                    viaTemplate<1, 0, 0>(idx, i, a, partial, count);
-                    viaTemplate<-1, 0, 0>(idx, i, a, partial, count);
-                    viaTemplate<0, 1, 0>(idx, i, a, partial, count);
-                    viaTemplate<0, -1, 0>(idx, i, a, partial, count);
-                    viaTemplate<0, 0, 1>(idx, i, a, partial, count);
-                    viaTemplate<0, 0, -1>(idx, i, a, partial, count);
+                    viaTemplate<0>(idx, i, a, partial, count);
+                    viaTemplate<1>(idx, i, a, partial, count);
+                    viaTemplate<2>(idx, i, a, partial, count);
+                    viaTemplate<3>(idx, i, a, partial, count);
+                    viaTemplate<4>(idx, i, a, partial, count);
+                    viaTemplate<5>(idx, i, a, partial, count);
 
 
-                    b(idx, i) = a(idx, i) - count * partial ;
+                    b(idx, i) = a(idx, i) - count * partial;
                 }
             };
         });

From b0e74e6c3dc62179c84a9d7d899efa461ecbc115 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 29 Jun 2023 17:14:38 -0400
Subject: [PATCH 21/25] WIP

---
 libNeonDomain/tests/domain-stencil/src/stencil.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu
index 14ae82b1..31e937e1 100644
--- a/libNeonDomain/tests/domain-stencil/src/stencil.cu
+++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu
@@ -78,7 +78,7 @@ NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Parti
 };
 
 template <typename Field>
-auto stencilContainerLaplaceTemplate(const Field& filedA,
+auto laplaceTemplate(const Field& filedA,
                                      Field&       fieldB)
     -> Neon::set::Container
 {
@@ -217,7 +217,7 @@ auto runTemplate(TestData<G, T, C>& data) -> void
                 .run(Neon::Backend::mainStreamIdx);
 
             bk.sync(Neon::Backend::mainStreamIdx);
-            stencilContainerLaplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx);
+            laplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx);
 
             bk.sync(Neon::Backend::mainStreamIdx);
             Y.newHaloUpdate(Neon::set::StencilSemantic::standard,
@@ -226,7 +226,7 @@ auto runTemplate(TestData<G, T, C>& data) -> void
                 .run(Neon::Backend::mainStreamIdx);
 
             bk.sync(Neon::Backend::mainStreamIdx);
-            laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx);
+            laplaceTemplate(Y, X).run(Neon::Backend::mainStreamIdx);
         }
         data.getBackend().sync(0);
     }

From 1dd5abc612caa5b3dc6f0896fea36f02e73f42dc Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 30 Jun 2023 09:07:11 -0400
Subject: [PATCH 22/25] WIP

---
 .../include/Neon/domain/details/bGrid/bPartition_imp.h          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
index 5fa6f260..9a0bab8e 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
@@ -100,7 +100,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
     helpGetValidIdxPitchExplicit(const Idx& idx, int card)
         const -> uint32_t
 {
-    uint32_t const blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ;
+    uint32_t constexpr blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ;
     uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x +
                                         SBlock::memBlockSizeX * idx.mInDataBlockIdx.y +
                                         (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z;

From 81b352696731adfd70786292d8f0e107a3f0958d Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 30 Jun 2023 10:43:24 -0400
Subject: [PATCH 23/25] WIP

---
 .../tests/domain-stencil/src/stencil.cu       | 36 ++++++++++++++-----
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu
index 31e937e1..f6865999 100644
--- a/libNeonDomain/tests/domain-stencil/src/stencil.cu
+++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu
@@ -77,9 +77,19 @@ NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Parti
                                            });
 };
 
+
+template <auto Start, auto End, auto Inc, class F>
+constexpr void constexpr_for(F&& f)
+{
+    if constexpr (Start < End) {
+        f(std::integral_constant<decltype(Start), Start>());
+        constexpr_for<Start + Inc, End, Inc>(f);
+    }
+}
+
 template <typename Field>
 auto laplaceTemplate(const Field& filedA,
-                                     Field&       fieldB)
+                     Field&       fieldB)
     -> Neon::set::Container
 {
     const auto& grid = filedA.getGrid();
@@ -97,13 +107,23 @@ auto laplaceTemplate(const Field& filedA,
                     int                  count = 0;
                     using Ngh3DIdx = Neon::int8_3d;
 
-
-                    viaTemplate<0>(idx, i, a, partial, count);
-                    viaTemplate<1>(idx, i, a, partial, count);
-                    viaTemplate<2>(idx, i, a, partial, count);
-                    viaTemplate<3>(idx, i, a, partial, count);
-                    viaTemplate<4>(idx, i, a, partial, count);
-                    viaTemplate<5>(idx, i, a, partial, count);
+                    constexpr_for<0, 6, 1>([&](auto sIdx) {
+                        a.template getNghData<stencil[sIdx].x,
+                                              stencil[sIdx].y,
+                                              stencil[sIdx].z>(idx, i,
+                                                               [&](auto const& val) {
+                                                                   partial += val;
+                                                                   count++;
+                                                               });
+                    });
+
+
+//                    viaTemplate<0>(idx, i, a, partial, count);
+//                    viaTemplate<1>(idx, i, a, partial, count);
+//                    viaTemplate<2>(idx, i, a, partial, count);
+//                    viaTemplate<3>(idx, i, a, partial, count);
+//                    viaTemplate<4>(idx, i, a, partial, count);
+//                    viaTemplate<5>(idx, i, a, partial, count);
 
 
                     b(idx, i) = a(idx, i) - count * partial;

From 2a2caf7d83bb0c401cc5d7839e2d212132a966c1 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 30 Jun 2023 11:04:00 -0400
Subject: [PATCH 24/25] WIP

---
 .../include/Neon/core/tools/metaprogramming.h |  1 +
 .../core/tools/metaprogramming/ConstexprFor.h | 14 +++++++++
 .../tests/domain-stencil/src/stencil.cu       | 29 +++++++------------
 3 files changed, 25 insertions(+), 19 deletions(-)
 create mode 100644 libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h

diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming.h b/libNeonCore/include/Neon/core/tools/metaprogramming.h
index 53678ed6..ea004a43 100644
--- a/libNeonCore/include/Neon/core/tools/metaprogramming.h
+++ b/libNeonCore/include/Neon/core/tools/metaprogramming.h
@@ -4,3 +4,4 @@
 #include "Neon/core/tools/metaprogramming/debugHelp.h"
 #include "Neon/core/tools/metaprogramming/extractTupleVecType.h"
 #include "Neon/core/tools/metaprogramming/tupleVecTable.h"
+#include "Neon/core/tools/metaprogramming/ConstexprFor.h"
\ No newline at end of file
diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h
new file mode 100644
index 00000000..2e8161e6
--- /dev/null
+++ b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h
@@ -0,0 +1,14 @@
+#pragma once
+
+namespace Neon {
+
+template <auto Start, auto End, auto Inc, class F>
+constexpr void ConstexprFor(F&& f)
+{
+    if constexpr (Start < End) {
+        f(std::integral_constant<decltype(Start), Start>());
+        ConstexprFor<Start + Inc, End, Inc>(f);
+    }
+}
+
+}  // namespace Neon
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu
index f6865999..6cd4f6ff 100644
--- a/libNeonDomain/tests/domain-stencil/src/stencil.cu
+++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu
@@ -78,14 +78,14 @@ NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Parti
 };
 
 
-template <auto Start, auto End, auto Inc, class F>
-constexpr void constexpr_for(F&& f)
-{
-    if constexpr (Start < End) {
-        f(std::integral_constant<decltype(Start), Start>());
-        constexpr_for<Start + Inc, End, Inc>(f);
-    }
-}
+//template <auto Start, auto End, auto Inc, class F>
+//constexpr void constexpr_for(F&& f)
+//{
+//    if constexpr (Start < End) {
+//        f(std::integral_constant<decltype(Start), Start>());
+//        constexpr_for<Start + Inc, End, Inc>(f);
+//    }
+//}
 
 template <typename Field>
 auto laplaceTemplate(const Field& filedA,
@@ -107,7 +107,7 @@ auto laplaceTemplate(const Field& filedA,
                     int                  count = 0;
                     using Ngh3DIdx = Neon::int8_3d;
 
-                    constexpr_for<0, 6, 1>([&](auto sIdx) {
+                    Neon::ConstexprFor<0, 6, 1>([&](auto sIdx) {
                         a.template getNghData<stencil[sIdx].x,
                                               stencil[sIdx].y,
                                               stencil[sIdx].z>(idx, i,
@@ -116,16 +116,7 @@ auto laplaceTemplate(const Field& filedA,
                                                                    count++;
                                                                });
                     });
-
-
-//                    viaTemplate<0>(idx, i, a, partial, count);
-//                    viaTemplate<1>(idx, i, a, partial, count);
-//                    viaTemplate<2>(idx, i, a, partial, count);
-//                    viaTemplate<3>(idx, i, a, partial, count);
-//                    viaTemplate<4>(idx, i, a, partial, count);
-//                    viaTemplate<5>(idx, i, a, partial, count);
-
-
+                    
                     b(idx, i) = a(idx, i) - count * partial;
                 }
             };

From b63b90beece180c75f39695b20437bcc3b29a1fb Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 30 Jun 2023 15:58:06 -0400
Subject: [PATCH 25/25] WIP

---
 .../lbm-lid-driven-cavity-flow/src/LbmTools.h | 125 +++++++++---------
 1 file changed, 65 insertions(+), 60 deletions(-)

diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h
index ab79ed2a..4a12ca18 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h
@@ -31,22 +31,22 @@ struct LbmContainers<D3Q19Template<typename PopulationField::Type, LbmComputeTyp
     using Rho = typename Grid::template Field<LbmStoreType, 1>;
     using U = typename Grid::template Field<LbmStoreType, 3>;
 
-#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid)                                                               \
-    {                                                                                                                   \
-        { /*GO*/                                                                                                        \
-            if (wallBitFlag & (uint32_t(1) << GOid)) {                                                                  \
-                popIn[GOid] = fin(i, BKid);                                                                             \
-            } else {                                                                                                    \
-                popIn[GOid] = fin.template nghVal<BKx, BKy, BKz>(i, GOid, 0.0).value;                                   \
-            }                                                                                                           \
-        }                                                                                                               \
-        { /*BK*/                                                                                                        \
-            if (wallBitFlag & (uint32_t(1) << BKid)) {                                                                  \
-                popIn[BKid] = fin(i, GOid);                                                                             \
-            } else {                                                                                                    \
-                popIn[BKid] = fin.template nghVal<GOx, GOy, GOz>(i, BKid, 0.0).value;                                   \
-            }                                                                                                           \
-        }                                                                                                               \
+#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid)                             \
+    {                                                                                 \
+        { /*GO*/                                                                      \
+            if (wallBitFlag & (uint32_t(1) << GOid)) {                                \
+                popIn[GOid] = fin(i, BKid);                                           \
+            } else {                                                                  \
+                popIn[GOid] = fin.template nghVal<BKx, BKy, BKz>(i, GOid, 0.0).value; \
+            }                                                                         \
+        }                                                                             \
+        { /*BK*/                                                                      \
+            if (wallBitFlag & (uint32_t(1) << BKid)) {                                \
+                popIn[BKid] = fin(i, GOid);                                           \
+            } else {                                                                  \
+                popIn[BKid] = fin.template nghVal<GOx, GOy, GOz>(i, BKid, 0.0).value; \
+            }                                                                         \
+        }                                                                             \
     }
     static inline NEON_CUDA_HOST_DEVICE auto
     loadPopulation(Idx const&                                 i,
@@ -209,45 +209,52 @@ struct LbmContainers<D3Q19Template<typename PopulationField::Type, LbmComputeTyp
         const LbmComputeType ck_u07 = u[1] + u[2];
         const LbmComputeType ck_u08 = u[1] - u[2];
 
-        const LbmComputeType eq_00 = rho * (1. / 18.) * (1. - 3. * u[0] + 4.5 * u[0] * u[0] - usqr);
-        const LbmComputeType eq_01 = rho * (1. / 18.) * (1. - 3. * u[1] + 4.5 * u[1] * u[1] - usqr);
-        const LbmComputeType eq_02 = rho * (1. / 18.) * (1. - 3. * u[2] + 4.5 * u[2] * u[2] - usqr);
-        const LbmComputeType eq_03 = rho * (1. / 36.) * (1. - 3. * ck_u03 + 4.5 * ck_u03 * ck_u03 - usqr);
-        const LbmComputeType eq_04 = rho * (1. / 36.) * (1. - 3. * ck_u04 + 4.5 * ck_u04 * ck_u04 - usqr);
-        const LbmComputeType eq_05 = rho * (1. / 36.) * (1. - 3. * ck_u05 + 4.5 * ck_u05 * ck_u05 - usqr);
-        const LbmComputeType eq_06 = rho * (1. / 36.) * (1. - 3. * ck_u06 + 4.5 * ck_u06 * ck_u06 - usqr);
-        const LbmComputeType eq_07 = rho * (1. / 36.) * (1. - 3. * ck_u07 + 4.5 * ck_u07 * ck_u07 - usqr);
-        const LbmComputeType eq_08 = rho * (1. / 36.) * (1. - 3. * ck_u08 + 4.5 * ck_u08 * ck_u08 - usqr);
-
-        const LbmComputeType eqopp_00 = eq_00 + rho * (1. / 18.) * 6. * u[0];
-        const LbmComputeType eqopp_01 = eq_01 + rho * (1. / 18.) * 6. * u[1];
-        const LbmComputeType eqopp_02 = eq_02 + rho * (1. / 18.) * 6. * u[2];
-        const LbmComputeType eqopp_03 = eq_03 + rho * (1. / 36.) * 6. * ck_u03;
-        const LbmComputeType eqopp_04 = eq_04 + rho * (1. / 36.) * 6. * ck_u04;
-        const LbmComputeType eqopp_05 = eq_05 + rho * (1. / 36.) * 6. * ck_u05;
-        const LbmComputeType eqopp_06 = eq_06 + rho * (1. / 36.) * 6. * ck_u06;
-        const LbmComputeType eqopp_07 = eq_07 + rho * (1. / 36.) * 6. * ck_u07;
-        const LbmComputeType eqopp_08 = eq_08 + rho * (1. / 36.) * 6. * ck_u08;
-
-        const LbmComputeType pop_out_00 = (1. - omega) * static_cast<LbmComputeType>(pop[0]) + omega * eq_00;
-        const LbmComputeType pop_out_01 = (1. - omega) * static_cast<LbmComputeType>(pop[1]) + omega * eq_01;
-        const LbmComputeType pop_out_02 = (1. - omega) * static_cast<LbmComputeType>(pop[2]) + omega * eq_02;
-        const LbmComputeType pop_out_03 = (1. - omega) * static_cast<LbmComputeType>(pop[3]) + omega * eq_03;
-        const LbmComputeType pop_out_04 = (1. - omega) * static_cast<LbmComputeType>(pop[4]) + omega * eq_04;
-        const LbmComputeType pop_out_05 = (1. - omega) * static_cast<LbmComputeType>(pop[5]) + omega * eq_05;
-        const LbmComputeType pop_out_06 = (1. - omega) * static_cast<LbmComputeType>(pop[6]) + omega * eq_06;
-        const LbmComputeType pop_out_07 = (1. - omega) * static_cast<LbmComputeType>(pop[7]) + omega * eq_07;
-        const LbmComputeType pop_out_08 = (1. - omega) * static_cast<LbmComputeType>(pop[8]) + omega * eq_08;
-
-        const LbmComputeType pop_out_opp_00 = (1. - omega) * static_cast<LbmComputeType>(pop[10]) + omega * eqopp_00;
-        const LbmComputeType pop_out_opp_01 = (1. - omega) * static_cast<LbmComputeType>(pop[11]) + omega * eqopp_01;
-        const LbmComputeType pop_out_opp_02 = (1. - omega) * static_cast<LbmComputeType>(pop[12]) + omega * eqopp_02;
-        const LbmComputeType pop_out_opp_03 = (1. - omega) * static_cast<LbmComputeType>(pop[13]) + omega * eqopp_03;
-        const LbmComputeType pop_out_opp_04 = (1. - omega) * static_cast<LbmComputeType>(pop[14]) + omega * eqopp_04;
-        const LbmComputeType pop_out_opp_05 = (1. - omega) * static_cast<LbmComputeType>(pop[15]) + omega * eqopp_05;
-        const LbmComputeType pop_out_opp_06 = (1. - omega) * static_cast<LbmComputeType>(pop[16]) + omega * eqopp_06;
-        const LbmComputeType pop_out_opp_07 = (1. - omega) * static_cast<LbmComputeType>(pop[17]) + omega * eqopp_07;
-        const LbmComputeType pop_out_opp_08 = (1. - omega) * static_cast<LbmComputeType>(pop[18]) + omega * eqopp_08;
+        constexpr LbmComputeType c1over18 = 1. / 18.;
+        constexpr LbmComputeType c1over36 = 1. / 36.;
+        constexpr LbmComputeType c4dot5 = 4.5;
+        constexpr LbmComputeType c3 = 3.;
+        constexpr LbmComputeType c1 = 1.;
+        constexpr LbmComputeType c6 = 6.;
+
+        const LbmComputeType eq_00 = rho * c1over18 * (c1 - c6 * u[0] + c4dot5 * u[0] * u[0] - usqr);
+        const LbmComputeType eq_01 = rho * c1over18 * (c1 - c6 * u[1] + c4dot5 * u[1] * u[1] - usqr);
+        const LbmComputeType eq_02 = rho * c1over18 * (c1 - c6 * u[2] + c4dot5 * u[2] * u[2] - usqr);
+        const LbmComputeType eq_03 = rho * c1over36 * (c1 - c6 * ck_u03 + c4dot5 * ck_u03 * ck_u03 - usqr);
+        const LbmComputeType eq_04 = rho * c1over36 * (c1 - c6 * ck_u04 + c4dot5 * ck_u04 * ck_u04 - usqr);
+        const LbmComputeType eq_05 = rho * c1over36 * (c1 - c6 * ck_u05 + c4dot5 * ck_u05 * ck_u05 - usqr);
+        const LbmComputeType eq_06 = rho * c1over36 * (c1 - c6 * ck_u06 + c4dot5 * ck_u06 * ck_u06 - usqr);
+        const LbmComputeType eq_07 = rho * c1over36 * (c1 - c6 * ck_u07 + c4dot5 * ck_u07 * ck_u07 - usqr);
+        const LbmComputeType eq_08 = rho * c1over36 * (c1 - c6 * ck_u08 + c4dot5 * ck_u08 * ck_u08 - usqr);
+
+        const LbmComputeType eqopp_00 = eq_00 + rho * c1over18 * c6 * u[0];
+        const LbmComputeType eqopp_01 = eq_01 + rho * c1over18 * c6 * u[1];
+        const LbmComputeType eqopp_02 = eq_02 + rho * c1over18 * c6 * u[2];
+        const LbmComputeType eqopp_03 = eq_03 + rho * c1over36 * c6 * ck_u03;
+        const LbmComputeType eqopp_04 = eq_04 + rho * c1over36 * c6 * ck_u04;
+        const LbmComputeType eqopp_05 = eq_05 + rho * c1over36 * c6 * ck_u05;
+        const LbmComputeType eqopp_06 = eq_06 + rho * c1over36 * c6 * ck_u06;
+        const LbmComputeType eqopp_07 = eq_07 + rho * c1over36 * c6 * ck_u07;
+        const LbmComputeType eqopp_08 = eq_08 + rho * c1over36 * c6 * ck_u08;
+
+        const LbmComputeType pop_out_00 = (c1 - omega) * static_cast<LbmComputeType>(pop[0]) + omega * eq_00;
+        const LbmComputeType pop_out_01 = (c1 - omega) * static_cast<LbmComputeType>(pop[1]) + omega * eq_01;
+        const LbmComputeType pop_out_02 = (c1 - omega) * static_cast<LbmComputeType>(pop[2]) + omega * eq_02;
+        const LbmComputeType pop_out_03 = (c1 - omega) * static_cast<LbmComputeType>(pop[3]) + omega * eq_03;
+        const LbmComputeType pop_out_04 = (c1 - omega) * static_cast<LbmComputeType>(pop[4]) + omega * eq_04;
+        const LbmComputeType pop_out_05 = (c1 - omega) * static_cast<LbmComputeType>(pop[5]) + omega * eq_05;
+        const LbmComputeType pop_out_06 = (c1 - omega) * static_cast<LbmComputeType>(pop[6]) + omega * eq_06;
+        const LbmComputeType pop_out_07 = (c1 - omega) * static_cast<LbmComputeType>(pop[7]) + omega * eq_07;
+        const LbmComputeType pop_out_08 = (c1 - omega) * static_cast<LbmComputeType>(pop[8]) + omega * eq_08;
+
+        const LbmComputeType pop_out_opp_00 = (c1 - omega) * static_cast<LbmComputeType>(pop[10]) + omega * eqopp_00;
+        const LbmComputeType pop_out_opp_01 = (c1 - omega) * static_cast<LbmComputeType>(pop[11]) + omega * eqopp_01;
+        const LbmComputeType pop_out_opp_02 = (c1 - omega) * static_cast<LbmComputeType>(pop[12]) + omega * eqopp_02;
+        const LbmComputeType pop_out_opp_03 = (c1 - omega) * static_cast<LbmComputeType>(pop[13]) + omega * eqopp_03;
+        const LbmComputeType pop_out_opp_04 = (c1 - omega) * static_cast<LbmComputeType>(pop[14]) + omega * eqopp_04;
+        const LbmComputeType pop_out_opp_05 = (c1 - omega) * static_cast<LbmComputeType>(pop[15]) + omega * eqopp_05;
+        const LbmComputeType pop_out_opp_06 = (c1 - omega) * static_cast<LbmComputeType>(pop[16]) + omega * eqopp_06;
+        const LbmComputeType pop_out_opp_07 = (c1 - omega) * static_cast<LbmComputeType>(pop[17]) + omega * eqopp_07;
+        const LbmComputeType pop_out_opp_08 = (c1 - omega) * static_cast<LbmComputeType>(pop[18]) + omega * eqopp_08;
 
 
 #define COMPUTE_GO_AND_BACK(GOid, BKid)                                 \
@@ -262,17 +269,15 @@ struct LbmContainers<D3Q19Template<typename PopulationField::Type, LbmComputeTyp
         COMPUTE_GO_AND_BACK(3, 13)
         COMPUTE_GO_AND_BACK(4, 14)
         COMPUTE_GO_AND_BACK(5, 15)
-        // COMPUTE_GO_AND_BACK(6, 16)
-        fOut(i, 6) = static_cast<LbmStoreType>(pop_out_06);
-        fOut(i, 16) = static_cast<LbmStoreType>(pop_out_opp_06);
+        COMPUTE_GO_AND_BACK(6, 16)
         COMPUTE_GO_AND_BACK(7, 17)
         COMPUTE_GO_AND_BACK(8, 18)
 
 #undef COMPUTE_GO_AND_BACK
 
         {
-            const LbmComputeType eq_09 = rho * (1. / 3.) * (1. - usqr);
-            const LbmComputeType pop_out_09 = (1. - omega) *
+            const LbmComputeType eq_09 = rho * (c1 / c3) * (c1 - usqr);
+            const LbmComputeType pop_out_09 = (c1 - omega) *
                                                   static_cast<LbmComputeType>(pop[Lattice::centerDirection]) +
                                               omega * eq_09;
             fOut(i, Lattice::centerDirection) = static_cast<LbmStoreType>(pop_out_09);