diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index 5aebe104..677aefba 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -4,7 +4,8 @@ GRID_LIST = "dGrid bGrid eGrid".split() STORAGE_FP_LIST = "double float".split() COMPUTE_FP_LIST = "double float".split() -OCC_LIST = "nOCC".split() +OCC_LIST = "nOCC sOCC".split() +HU_LIST = "huGrid huLattice".split() WARM_UP_ITER = 10 MAX_ITER = 100 REPETITIONS = 5 @@ -38,60 +39,69 @@ def countAll(): for COMPUTE_FP in COMPUTE_FP_LIST: for DEVICE_SET in DEVICE_SET_LIST: for GRID in GRID_LIST: - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue + for HU in HU_LIST: + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue - counter += 1 + counter += 1 return counter SAMPLES = countAll() counter = 0 command = './lbm-lid-driven-cavity-flow' +# command = 'echo' with open(command + '.log', 'w') as fp: for DEVICE_TYPE in DEVICE_TYPE_LIST: DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] if DEVICE_TYPE == 'gpu': for DEVICE in DEVICE_ID_LIST[1:]: DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) - for OCC in OCC_LIST: - for DOMAIN_SIZE in DOMAIN_SIZE_LIST: - for STORAGE_FP in STORAGE_FP_LIST: - for COMPUTE_FP in COMPUTE_FP_LIST: - for DEVICE_SET in DEVICE_SET_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: for GRID in GRID_LIST: - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue + for HU in HU_LIST: + + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue - parameters = [] - parameters.append('--deviceType ' + DEVICE_TYPE) - parameters.append('--deviceIds ' + DEVICE_SET) - parameters.append('--grid ' + GRID) - parameters.append('--domain-size ' + DOMAIN_SIZE) - parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) - parameters.append('--repetitions ' + str(REPETITIONS)) - parameters.append('--max-iter ' + str(MAX_ITER)) - parameters.append( - '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + - DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' + - STORAGE_FP + '_' + COMPUTE_FP + '_' + - DEVICE_SET.replace(' ', '_') + '_' + OCC) - parameters.append('--computeFP ' + COMPUTE_FP) - parameters.append('--storageFP ' + STORAGE_FP) - parameters.append('--benchmark') - parameters.append('--' + OCC) + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append( + '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + + DEVICE_TYPE + '_' + + DEVICE_SET.replace(' ', '_') + '-' + + GRID + '_' + + DOMAIN_SIZE + '-' + + STORAGE_FP + '-' + COMPUTE_FP + '-' + + OCC) + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--benchmark') + parameters.append('--' + OCC) + parameters.append('--' + HU) - commandList = [] - commandList.append(command) - for el in parameters: - for s in el.split(): - commandList.append(s) + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) - fp.write("\n-------------------------------------------\n") - fp.write(' '.join(commandList)) - fp.write("\n-------------------------------------------\n") - fp.flush() - subprocess.run(commandList, text=True, stdout=fp) + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + print(' '.join(commandList)) + subprocess.run(commandList, text=True, stdout=fp) - counter += 1 - printProgressBar(counter * 100.0 / SAMPLES, 'Progress') + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h index 5728a5d3..4a12ca18 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h @@ -31,23 +31,22 @@ struct LbmContainers; using U = typename Grid::template Field; -#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ - { \ - { /*GO*/ \ - if (wallBitFlag & (uint32_t(1) << GOid)) { \ - /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ - popIn[GOid] = fin(i, BKid); \ - } else { \ - popIn[GOid] = fin.template nghVal(i, GOid, 0.0).value; \ - } \ - } \ - { /*BK*/ \ - if (wallBitFlag & (uint32_t(1) << BKid)) { \ - popIn[BKid] = fin(i, GOid); \ - } else { \ - popIn[BKid] = fin.template nghVal(i, BKid, 0.0).value; \ - } \ - } \ +#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + popIn[GOid] = fin(i, BKid); \ + } else { \ + popIn[GOid] = fin.template nghVal(i, GOid, 0.0).value; \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin(i, GOid); \ + } else { \ + popIn[BKid] = fin.template nghVal(i, BKid, 0.0).value; \ + } \ + } \ } static inline NEON_CUDA_HOST_DEVICE auto loadPopulation(Idx const& i, @@ -101,8 +100,6 @@ struct LbmContainers; constexpr std::array stencil{ @@ -160,7 +157,6 @@ struct LbmContainers(pop[0]) + omega * eq_00; - const LbmComputeType pop_out_01 = (1. - omega) * static_cast(pop[1]) + omega * eq_01; - const LbmComputeType pop_out_02 = (1. - omega) * static_cast(pop[2]) + omega * eq_02; - const LbmComputeType pop_out_03 = (1. - omega) * static_cast(pop[3]) + omega * eq_03; - const LbmComputeType pop_out_04 = (1. - omega) * static_cast(pop[4]) + omega * eq_04; - const LbmComputeType pop_out_05 = (1. - omega) * static_cast(pop[5]) + omega * eq_05; - const LbmComputeType pop_out_06 = (1. - omega) * static_cast(pop[6]) + omega * eq_06; - const LbmComputeType pop_out_07 = (1. - omega) * static_cast(pop[7]) + omega * eq_07; - const LbmComputeType pop_out_08 = (1. - omega) * static_cast(pop[8]) + omega * eq_08; - - const LbmComputeType pop_out_opp_00 = (1. - omega) * static_cast(pop[10]) + omega * eqopp_00; - const LbmComputeType pop_out_opp_01 = (1. - omega) * static_cast(pop[11]) + omega * eqopp_01; - const LbmComputeType pop_out_opp_02 = (1. - omega) * static_cast(pop[12]) + omega * eqopp_02; - const LbmComputeType pop_out_opp_03 = (1. - omega) * static_cast(pop[13]) + omega * eqopp_03; - const LbmComputeType pop_out_opp_04 = (1. - omega) * static_cast(pop[14]) + omega * eqopp_04; - const LbmComputeType pop_out_opp_05 = (1. - omega) * static_cast(pop[15]) + omega * eqopp_05; - const LbmComputeType pop_out_opp_06 = (1. - omega) * static_cast(pop[16]) + omega * eqopp_06; - const LbmComputeType pop_out_opp_07 = (1. - omega) * static_cast(pop[17]) + omega * eqopp_07; - const LbmComputeType pop_out_opp_08 = (1. - omega) * static_cast(pop[18]) + omega * eqopp_08; + constexpr LbmComputeType c1over18 = 1. / 18.; + constexpr LbmComputeType c1over36 = 1. / 36.; + constexpr LbmComputeType c4dot5 = 4.5; + constexpr LbmComputeType c3 = 3.; + constexpr LbmComputeType c1 = 1.; + constexpr LbmComputeType c6 = 6.; + + const LbmComputeType eq_00 = rho * c1over18 * (c1 - c6 * u[0] + c4dot5 * u[0] * u[0] - usqr); + const LbmComputeType eq_01 = rho * c1over18 * (c1 - c6 * u[1] + c4dot5 * u[1] * u[1] - usqr); + const LbmComputeType eq_02 = rho * c1over18 * (c1 - c6 * u[2] + c4dot5 * u[2] * u[2] - usqr); + const LbmComputeType eq_03 = rho * c1over36 * (c1 - c6 * ck_u03 + c4dot5 * ck_u03 * ck_u03 - usqr); + const LbmComputeType eq_04 = rho * c1over36 * (c1 - c6 * ck_u04 + c4dot5 * ck_u04 * ck_u04 - usqr); + const LbmComputeType eq_05 = rho * c1over36 * (c1 - c6 * ck_u05 + c4dot5 * ck_u05 * ck_u05 - usqr); + const LbmComputeType eq_06 = rho * c1over36 * (c1 - c6 * ck_u06 + c4dot5 * ck_u06 * ck_u06 - usqr); + const LbmComputeType eq_07 = rho * c1over36 * (c1 - c6 * ck_u07 + c4dot5 * ck_u07 * ck_u07 - usqr); + const LbmComputeType eq_08 = rho * c1over36 * (c1 - c6 * ck_u08 + c4dot5 * ck_u08 * ck_u08 - usqr); + + const LbmComputeType eqopp_00 = eq_00 + rho * c1over18 * c6 * u[0]; + const LbmComputeType eqopp_01 = eq_01 + rho * c1over18 * c6 * u[1]; + const LbmComputeType eqopp_02 = eq_02 + rho * c1over18 * c6 * u[2]; + const LbmComputeType eqopp_03 = eq_03 + rho * c1over36 * c6 * ck_u03; + const LbmComputeType eqopp_04 = eq_04 + rho * c1over36 * c6 * ck_u04; + const LbmComputeType eqopp_05 = eq_05 + rho * c1over36 * c6 * ck_u05; + const LbmComputeType eqopp_06 = eq_06 + rho * c1over36 * c6 * ck_u06; + const LbmComputeType eqopp_07 = eq_07 + rho * c1over36 * c6 * ck_u07; + const LbmComputeType eqopp_08 = eq_08 + rho * c1over36 * c6 * ck_u08; + + const LbmComputeType pop_out_00 = (c1 - omega) * static_cast(pop[0]) + omega * eq_00; + const LbmComputeType pop_out_01 = (c1 - omega) * static_cast(pop[1]) + omega * eq_01; + const LbmComputeType pop_out_02 = (c1 - omega) * static_cast(pop[2]) + omega * eq_02; + const LbmComputeType pop_out_03 = (c1 - omega) * static_cast(pop[3]) + omega * eq_03; + const LbmComputeType pop_out_04 = (c1 - omega) * static_cast(pop[4]) + omega * eq_04; + const LbmComputeType pop_out_05 = (c1 - omega) * static_cast(pop[5]) + omega * eq_05; + const LbmComputeType pop_out_06 = (c1 - omega) * static_cast(pop[6]) + omega * eq_06; + const LbmComputeType pop_out_07 = (c1 - omega) * static_cast(pop[7]) + omega * eq_07; + const LbmComputeType pop_out_08 = (c1 - omega) * static_cast(pop[8]) + omega * eq_08; + + const LbmComputeType pop_out_opp_00 = (c1 - omega) * static_cast(pop[10]) + omega * eqopp_00; + const LbmComputeType pop_out_opp_01 = (c1 - omega) * static_cast(pop[11]) + omega * eqopp_01; + const LbmComputeType pop_out_opp_02 = (c1 - omega) * static_cast(pop[12]) + omega * eqopp_02; + const LbmComputeType pop_out_opp_03 = (c1 - omega) * static_cast(pop[13]) + omega * eqopp_03; + const LbmComputeType pop_out_opp_04 = (c1 - omega) * static_cast(pop[14]) + omega * eqopp_04; + const LbmComputeType pop_out_opp_05 = (c1 - omega) * static_cast(pop[15]) + omega * eqopp_05; + const LbmComputeType pop_out_opp_06 = (c1 - omega) * static_cast(pop[16]) + omega * eqopp_06; + const LbmComputeType pop_out_opp_07 = (c1 - omega) * static_cast(pop[17]) + omega * eqopp_07; + const LbmComputeType pop_out_opp_08 = (c1 - omega) * static_cast(pop[18]) + omega * eqopp_08; #define COMPUTE_GO_AND_BACK(GOid, BKid) \ @@ -273,8 +276,8 @@ struct LbmContainers(pop[Lattice::centerDirection]) + omega * eq_09; fOut(i, Lattice::centerDirection) = static_cast(pop_out_09); diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h new file mode 100644 index 00000000..fc4d7806 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h @@ -0,0 +1,440 @@ +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +#define COMPUTE_CAST(VAR) static_cast((VAR)) + +template +struct LbmContainersTemplateOnly +{ +}; + +/** + * Specialization for Lattice + * @tparam PopulationField + * @tparam LbmComputeType + */ +template +struct LbmContainersTemplateOnly, + PopulationField, + LbmComputeType> +{ + using LbmStoreType = typename PopulationField::Type; + using CellTypeField = typename PopulationField::Grid::template Field; + using Lattice = D3Q19Template; + using Idx = typename PopulationField::Idx; + using Grid = typename PopulationField::Grid; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + +#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ + popIn[GOid] = fin.template read(gidx); \ + } else { \ + popIn[GOid] = fin.template nghVal(gidx).value; \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin.template read(gidx); \ + } else { \ + popIn[BKid] = fin.template nghVal(gidx).value; \ + } \ + } \ + } + static inline NEON_CUDA_HOST_DEVICE auto + loadPopulation(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopulationField::Partition const& fin, + NEON_OUT LbmStoreType popIn[19]) + { + // #pragma omp critical + // { + + LOADPOP(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); + LOADPOP(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); + LOADPOP(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); + LOADPOP(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); + LOADPOP(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); + LOADPOP(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); + LOADPOP(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); + LOADPOP(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); + LOADPOP(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); + // } + // Treat the case of the center (c[k] = {0, 0, 0,}). + { + popIn[Lattice::centerDirection] = fin(i, Lattice::centerDirection); + } + } +#undef LOADPOP + +#define PULL_STREAM(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ + popIn[GOid] = fin(gidx, BKid) + \ + fin.template getNghData(gidx, BKid)(); \ + } else { \ + popIn[GOid] = fin.template getNghData(gidx, GOid)(); \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin(gidx, GOid) + fin.template getNghData(gidx, GOid)(); \ + } else { \ + popIn[BKid] = fin.template getNghData(gidx, BKid)(); \ + } \ + } \ + } + + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopulationField::Partition const& fin, + NEON_OUT LbmStoreType popIn[19]) + { + // #pragma omp critical + // { +#if 0 + using TopologyByDirection = std::tuple; + constexpr std::array stencil{ + std::make_tuple(Neon::int32_3d(-1, 0, 0), /* GOid */ 0, /* --- */ Neon::int32_3d(1, 0, 0), /* BKid */ 10), + std::make_tuple(Neon::int32_3d(0, -1, 0), /* GOid */ 1, /* --- */ Neon::int32_3d(0, 1, 0), /* BKid */ 11), + std::make_tuple(Neon::int32_3d(0, 0, -1), /* GOid */ 2, /* --- */ Neon::int32_3d(0, 0, 1), /* BKid */ 12), + std::make_tuple(Neon::int32_3d(-1, -1, 0), /* GOid */ 3, /* --- */ Neon::int32_3d(1, 1, 0), /* BKid */ 13), + std::make_tuple(Neon::int32_3d(-1, 1, 0), /* GOid */ 4, /* --- */ Neon::int32_3d(1, -1, 0), /* BKid */ 14), + std::make_tuple(Neon::int32_3d(-1, 0, -1), /* GOid */ 5, /* --- */ Neon::int32_3d(1, 0, 1), /* BKid */ 15), + std::make_tuple(Neon::int32_3d(-1, 0, 1), /* GOid */ 6, /* --- */ Neon::int32_3d(1, 0, -1), /* BKid */ 16), + std::make_tuple(Neon::int32_3d(0, -1, -1), /* GOid */ 7, /* --- */ Neon::int32_3d(0, 1, 1), /* BKid */ 17), + std::make_tuple(Neon::int32_3d(0, -1, 1), /* GOid */ 8, /* --- */ Neon::int32_3d(0, 1, -1), /* BKid */ 18)}; + + + auto pullStream = [&]() { + static_assert(stencilIdx < 9); + constexpr int GOid = std::get<1>(stencil[stencilIdx]); + constexpr int BKid = std::get<3>(stencil[stencilIdx]); + constexpr Neon::int32_3d GoOffset = std::get<0>(stencil[stencilIdx]); + constexpr Neon::int32_3d BkOffset = std::get<2>(stencil[stencilIdx]); + { + if (wallBitFlag & (uint32_t(1) << GOid)) { + popIn[GOid] = fin(gidx, BKid) + + fin.template getNghData(gidx, BKid)(); + } else { + popIn[GOid] = fin.template getNghData(gidx, GOid)(); + } + } + { /*BK*/ + if (wallBitFlag & (uint32_t(1) << BKid)) { + popIn[BKid] = fin(gidx, GOid) + + fin.template getNghData(gidx, GOid)(); + } else { + popIn[BKid] = fin.template getNghData(gidx, BKid)(); + } + } + }; + pullStream.template operator()<0>(); + pullStream.template operator()<1>(); + pullStream.template operator()<2>(); + pullStream.template operator()<3>(); + pullStream.template operator()<4>(); + pullStream.template operator()<5>(); + pullStream.template operator()<6>(); + pullStream.template operator()<7>(); + pullStream.template operator()<8>(); +#endif + PULL_STREAM(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); + PULL_STREAM(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); + PULL_STREAM(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); + PULL_STREAM(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); + PULL_STREAM(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); + PULL_STREAM(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); + PULL_STREAM(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); + PULL_STREAM(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); + PULL_STREAM(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); + + // } + // Treat the case of the center (c[k] = {0, 0, 0,}). + { + popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection); + } + } +#undef PULL_STREAM + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const LbmStoreType pop[Lattice::Q], + NEON_OUT LbmComputeType& rho, + NEON_OUT std::array& u) + -> void + { +#define POP(IDX) static_cast(pop[IDX]) + + const LbmComputeType X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); + const LbmComputeType X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); + const LbmComputeType X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); + + const LbmComputeType Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); + const LbmComputeType Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); + + const LbmComputeType Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); + const LbmComputeType Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); + +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! LbmComputeType iterator */, + const LbmStoreType pop[Lattice::Q], + LbmComputeType const& rho /*! Density */, + std::array const& u /*! Velocity */, + LbmComputeType const& usqr /*! Usqr */, + LbmComputeType const& omega /*! Omega */, + typename PopulationField::Partition& fOut /*! Population */) + + -> void + { + const LbmComputeType ck_u03 = u[0] + u[1]; + const LbmComputeType ck_u04 = u[0] - u[1]; + const LbmComputeType ck_u05 = u[0] + u[2]; + const LbmComputeType ck_u06 = u[0] - u[2]; + const LbmComputeType ck_u07 = u[1] + u[2]; + const LbmComputeType ck_u08 = u[1] - u[2]; + + const LbmComputeType eq_00 = rho * (1. / 18.) * (1. - 3. * u[0] + 4.5 * u[0] * u[0] - usqr); + const LbmComputeType eq_01 = rho * (1. / 18.) * (1. - 3. * u[1] + 4.5 * u[1] * u[1] - usqr); + const LbmComputeType eq_02 = rho * (1. / 18.) * (1. - 3. * u[2] + 4.5 * u[2] * u[2] - usqr); + const LbmComputeType eq_03 = rho * (1. / 36.) * (1. - 3. * ck_u03 + 4.5 * ck_u03 * ck_u03 - usqr); + const LbmComputeType eq_04 = rho * (1. / 36.) * (1. - 3. * ck_u04 + 4.5 * ck_u04 * ck_u04 - usqr); + const LbmComputeType eq_05 = rho * (1. / 36.) * (1. - 3. * ck_u05 + 4.5 * ck_u05 * ck_u05 - usqr); + const LbmComputeType eq_06 = rho * (1. / 36.) * (1. - 3. * ck_u06 + 4.5 * ck_u06 * ck_u06 - usqr); + const LbmComputeType eq_07 = rho * (1. / 36.) * (1. - 3. * ck_u07 + 4.5 * ck_u07 * ck_u07 - usqr); + const LbmComputeType eq_08 = rho * (1. / 36.) * (1. - 3. * ck_u08 + 4.5 * ck_u08 * ck_u08 - usqr); + + const LbmComputeType eqopp_00 = eq_00 + rho * (1. / 18.) * 6. * u[0]; + const LbmComputeType eqopp_01 = eq_01 + rho * (1. / 18.) * 6. * u[1]; + const LbmComputeType eqopp_02 = eq_02 + rho * (1. / 18.) * 6. * u[2]; + const LbmComputeType eqopp_03 = eq_03 + rho * (1. / 36.) * 6. * ck_u03; + const LbmComputeType eqopp_04 = eq_04 + rho * (1. / 36.) * 6. * ck_u04; + const LbmComputeType eqopp_05 = eq_05 + rho * (1. / 36.) * 6. * ck_u05; + const LbmComputeType eqopp_06 = eq_06 + rho * (1. / 36.) * 6. * ck_u06; + const LbmComputeType eqopp_07 = eq_07 + rho * (1. / 36.) * 6. * ck_u07; + const LbmComputeType eqopp_08 = eq_08 + rho * (1. / 36.) * 6. * ck_u08; + + const LbmComputeType pop_out_00 = (1. - omega) * static_cast(pop[0]) + omega * eq_00; + const LbmComputeType pop_out_01 = (1. - omega) * static_cast(pop[1]) + omega * eq_01; + const LbmComputeType pop_out_02 = (1. - omega) * static_cast(pop[2]) + omega * eq_02; + const LbmComputeType pop_out_03 = (1. - omega) * static_cast(pop[3]) + omega * eq_03; + const LbmComputeType pop_out_04 = (1. - omega) * static_cast(pop[4]) + omega * eq_04; + const LbmComputeType pop_out_05 = (1. - omega) * static_cast(pop[5]) + omega * eq_05; + const LbmComputeType pop_out_06 = (1. - omega) * static_cast(pop[6]) + omega * eq_06; + const LbmComputeType pop_out_07 = (1. - omega) * static_cast(pop[7]) + omega * eq_07; + const LbmComputeType pop_out_08 = (1. - omega) * static_cast(pop[8]) + omega * eq_08; + + const LbmComputeType pop_out_opp_00 = (1. - omega) * static_cast(pop[10]) + omega * eqopp_00; + const LbmComputeType pop_out_opp_01 = (1. - omega) * static_cast(pop[11]) + omega * eqopp_01; + const LbmComputeType pop_out_opp_02 = (1. - omega) * static_cast(pop[12]) + omega * eqopp_02; + const LbmComputeType pop_out_opp_03 = (1. - omega) * static_cast(pop[13]) + omega * eqopp_03; + const LbmComputeType pop_out_opp_04 = (1. - omega) * static_cast(pop[14]) + omega * eqopp_04; + const LbmComputeType pop_out_opp_05 = (1. - omega) * static_cast(pop[15]) + omega * eqopp_05; + const LbmComputeType pop_out_opp_06 = (1. - omega) * static_cast(pop[16]) + omega * eqopp_06; + const LbmComputeType pop_out_opp_07 = (1. - omega) * static_cast(pop[17]) + omega * eqopp_07; + const LbmComputeType pop_out_opp_08 = (1. - omega) * static_cast(pop[18]) + omega * eqopp_08; + + +#define COMPUTE_GO_AND_BACK(GOid, BKid) \ + { \ + fOut(i, GOid) = static_cast(pop_out_0##GOid); \ + fOut(i, BKid) = static_cast(pop_out_opp_0##GOid); \ + } + + COMPUTE_GO_AND_BACK(0, 10) + COMPUTE_GO_AND_BACK(1, 11) + COMPUTE_GO_AND_BACK(2, 12) + COMPUTE_GO_AND_BACK(3, 13) + COMPUTE_GO_AND_BACK(4, 14) + COMPUTE_GO_AND_BACK(5, 15) + COMPUTE_GO_AND_BACK(6, 16) + COMPUTE_GO_AND_BACK(7, 17) + COMPUTE_GO_AND_BACK(8, 18) + +#undef COMPUTE_GO_AND_BACK + + { + const LbmComputeType eq_09 = rho * (1. / 3.) * (1. - usqr); + const LbmComputeType pop_out_09 = (1. - omega) * + static_cast(pop[Lattice::centerDirection]) + + omega * eq_09; + fOut(i, Lattice::centerDirection) = static_cast(pop_out_09); + } + } + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopulationField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const LbmComputeType omega /*! LBM omega parameter */, + PopulationField& fOutField /*! output Population field */) + -> Neon::set::Container + { + + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + LbmStoreType popIn[Lattice::Q]; + pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + LbmComputeType rho; + std::array u{.0, .0, .0}; + macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + LbmComputeType usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } + +#define COMPUTE_MASK_WALL(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ + if (nghCellType.classification != CellType::bulk) { \ + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid)); \ + } \ + } \ + { /*BK*/ \ + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ + if (nghCellType.classification != CellType::bulk) { \ + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << BKid)); \ + } \ + } \ + } + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, + Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + COMPUTE_MASK_WALL(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10) + COMPUTE_MASK_WALL(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11) + COMPUTE_MASK_WALL(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12) + COMPUTE_MASK_WALL(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13) + COMPUTE_MASK_WALL(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14) + COMPUTE_MASK_WALL(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15) + COMPUTE_MASK_WALL(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16) + COMPUTE_MASK_WALL(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17) + COMPUTE_MASK_WALL(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18) + + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } +#undef COMPUTE_MASK_WALL + +#define BC_LOAD(GOID, DKID) \ + popIn[GOID] = fIn(gidx, GOID); \ + popIn[DKID] = fIn(gidx, DKID); + + static auto + computeRhoAndU([[maybe_unused]] const PopulationField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + LbmComputeType rho = 0; + std::array u{.0, .0, .0}; + LbmStoreType popIn[Lattice::Q]; + + if (cellInfo.classification == CellType::bulk) { + pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + BC_LOAD(0, 10) + BC_LOAD(1, 11) + BC_LOAD(2, 12) + BC_LOAD(3, 13) + BC_LOAD(4, 14) + BC_LOAD(5, 15) + BC_LOAD(6, 16) + BC_LOAD(7, 17) + BC_LOAD(8, 18) + popIn[9] = fIn(gidx, 9); + + rho = 1.0; + u = std::array{COMPUTE_CAST(popIn[0]) / COMPUTE_CAST(6. * 1. / 18.), + COMPUTE_CAST(popIn[1]) / COMPUTE_CAST(6. * 1. / 18.), + COMPUTE_CAST(popIn[2]) / COMPUTE_CAST(6. * 1. / 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } +}; + +#undef COMPUTE_CAST \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index c603415c..e91055f9 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -2,6 +2,7 @@ #include "D3Q19.h" #include "Neon/domain/bGrid.h" #include "Neon/domain/dGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/eGrid.h" #include "CellType.h" @@ -313,5 +314,33 @@ auto run(Config& config, if (config.gridType == "bGrid") { return details::runFilterStoreType(config, report); } + if (config.gridType == "bGrid_4_4_4") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_8_4") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_8_4") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 4, 8>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_2_8") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_8_2") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "dGridSoA") { + return details::runFilterStoreType(config, report); + } } } // namespace CavityTwoPop diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming.h b/libNeonCore/include/Neon/core/tools/metaprogramming.h index 53678ed6..ea004a43 100644 --- a/libNeonCore/include/Neon/core/tools/metaprogramming.h +++ b/libNeonCore/include/Neon/core/tools/metaprogramming.h @@ -4,3 +4,4 @@ #include "Neon/core/tools/metaprogramming/debugHelp.h" #include "Neon/core/tools/metaprogramming/extractTupleVecType.h" #include "Neon/core/tools/metaprogramming/tupleVecTable.h" +#include "Neon/core/tools/metaprogramming/ConstexprFor.h" \ No newline at end of file diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h new file mode 100644 index 00000000..2e8161e6 --- /dev/null +++ b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h @@ -0,0 +1,14 @@ +#pragma once + +namespace Neon { + +template +constexpr void ConstexprFor(F&& f) +{ + if constexpr (Start < End) { + f(std::integral_constant()); + ConstexprFor(f); + } +} + +} // namespace Neon \ No newline at end of file diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h index acdae410..ae475c6e 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h +++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h @@ -56,6 +56,10 @@ class Vec_3d num_axis = 3 }; + static constexpr int directionX = axis_e::x_axis; + static constexpr int directionY = axis_e::y_axis; + static constexpr int directionZ = axis_e::z_axis; + union { Integer v[axis_e::num_axis]{0, 0, 0}; @@ -120,7 +124,7 @@ class Vec_3d NEON_CUDA_HOST_DEVICE inline void constexpr set(Integer p[self_t::num_axis]); - NEON_CUDA_HOST_DEVICE inline void constexpr set(const self_t& other); + NEON_CUDA_HOST_DEVICE inline void constexpr set(const self_t& other); NEON_CUDA_HOST_DEVICE inline void constexpr set(const Integer& xyz); diff --git a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h index 788291a6..940c6d2c 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h +++ b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h @@ -58,6 +58,7 @@ template class Vec_4d { public: + using Integer = IntegerType_ta; using element_t = IntegerType_ta; using self_t = Vec_4d; diff --git a/libNeonDomain/include/Neon/domain/Grids.h b/libNeonDomain/include/Neon/domain/Grids.h index aad0cda5..7c899b98 100644 --- a/libNeonDomain/include/Neon/domain/Grids.h +++ b/libNeonDomain/include/Neon/domain/Grids.h @@ -3,3 +3,4 @@ #include "Neon/domain/aGrid.h" #include "Neon/domain/eGrid.h" #include "Neon/domain/bGrid.h" +#include "Neon/domain/dGridSoA.h" diff --git a/libNeonDomain/include/Neon/domain/dGridSoA.h b/libNeonDomain/include/Neon/domain/dGridSoA.h new file mode 100644 index 00000000..bdd63f25 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/dGridSoA.h @@ -0,0 +1,7 @@ +#pragma once +#include "Neon/domain/details/dGridSoA/dGridSoA.h" + + +namespace Neon { +using dGridSoA = Neon::domain::details::dGridSoA::dGridSoA; +} \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 73ccb914..a03af559 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -98,6 +98,19 @@ class bPartition T defaultValue) const -> NghData; + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t &&( std::is_invocable_v || std::is_same_v), void>; + + /** * Gets the global coordinates of the cartesian point. */ @@ -134,6 +147,8 @@ class bPartition helpGetNghIdx(const Idx& idx) const -> Idx; + + int mCardinality; T* mMem; NghIdx const* NEON_RESTRICT mStencilNghIndex; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index dc4c5880..9a0bab8e 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -45,10 +45,10 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: location.x += gidx.mInDataBlockIdx.x; location.y += gidx.mInDataBlockIdx.y; location.z += gidx.mInDataBlockIdx.z; - if constexpr (SBlock::isMultiResMode){ + if constexpr (SBlock::isMultiResMode) { return location * mMultiResDiscreteIdxSpacing; } - return location ; + return location; } template @@ -100,7 +100,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition:: helpGetValidIdxPitchExplicit(const Idx& idx, int card) const -> uint32_t { - uint32_t const blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + uint32_t constexpr blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x + SBlock::memBlockSizeX * idx.mInDataBlockIdx.y + (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z; @@ -354,4 +354,34 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: result.set(value, true); return result; } + +template + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + + if (isValid) { + auto const& value = mMem[pitch]; + funIfValid(value); + return; + } + + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h index 3291e622..a2c57cdb 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h @@ -37,9 +37,9 @@ struct dIndex NEON_CUDA_HOST_DEVICE inline explicit dIndex(const Location& location); - NEON_CUDA_HOST_DEVICE inline auto set() -> Location&; + NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&; - NEON_CUDA_HOST_DEVICE inline auto get() const -> const Location&; + NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&; }; } // namespace Neon::domain::details::dGrid diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h index 4389fb3f..6426e43a 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h @@ -16,11 +16,11 @@ NEON_CUDA_HOST_DEVICE inline dIndex::dIndex(const Location::Integer &x, mLocation.z = z; } -NEON_CUDA_HOST_DEVICE inline auto dIndex::set() -> Location& +NEON_CUDA_HOST_DEVICE inline auto dIndex::setLocation() -> Location& { return mLocation; } -NEON_CUDA_HOST_DEVICE inline auto dIndex::get() const -> const Location& +NEON_CUDA_HOST_DEVICE inline auto dIndex::getLocation() const -> const Location& { return mLocation; } diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 196f6b70..2becc97d 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -44,16 +44,16 @@ class dPartition int cardinality, Neon::index_3d fullGridSize, NghIdx* stencil = nullptr) - : m_dataView(dataView), - m_mem(mem), - m_dim(dim), - m_zHaloRadius(zHaloRadius), - m_zBoundaryRadius(zBoundaryRadius), - m_pitch(pitch), - m_prtID(prtID), - m_origin(origin), - m_cardinality(cardinality), - m_fullGridSize(fullGridSize), + : mDataView(dataView), + mMem(mem), + mDim(dim), + mZHaloRadius(zHaloRadius), + mZBoundaryRadius(zBoundaryRadius), + mPitch(pitch), + mPrtID(prtID), + mOrigin(origin), + mCardinality(cardinality), + mFullGridSize(fullGridSize), mPeriodicZ(false), mStencil(stencil) { @@ -70,21 +70,21 @@ class dPartition prtID() const -> int { - return m_prtID; + return mPrtID; } inline NEON_CUDA_HOST_DEVICE auto cardinality() const -> int { - return m_cardinality; + return mCardinality; } inline NEON_CUDA_HOST_DEVICE auto getPitchData() const -> const Pitch& { - return m_pitch; + return mPitch; } inline NEON_CUDA_HOST_DEVICE auto @@ -92,76 +92,76 @@ class dPartition int cardinalityIdx = 0) const -> int64_t { - return idx.get().x * int64_t(m_pitch.x) + - idx.get().y * int64_t(m_pitch.y) + - idx.get().z * int64_t(m_pitch.z) + - cardinalityIdx * int64_t(m_pitch.w); + return idx.getLocation().x * int64_t(mPitch.x) + + idx.getLocation().y * int64_t(mPitch.y) + + idx.getLocation().z * int64_t(mPitch.z) + + cardinalityIdx * int64_t(mPitch.w); } inline NEON_CUDA_HOST_DEVICE auto dim() const -> const Neon::index_3d { - return m_dim; + return mDim; } inline NEON_CUDA_HOST_DEVICE auto halo() const -> const Neon::index_3d { - return Neon::index_3d(0, 0, m_zHaloRadius); + return Neon::index_3d(0, 0, mZHaloRadius); } inline NEON_CUDA_HOST_DEVICE auto origin() const -> const Neon::index_3d { - return m_origin; + return mOrigin; } NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, NghIdx nghOffset, int card, const T& alternativeVal) const -> NghData { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val = alternativeVal; if (isValidNeighbour) { - val = operator()(cellNgh, card); + val = operator()(gidxNgh, card); } return NghData(val, isValidNeighbour); } NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, NghIdx nghOffset, int card) const -> NghData { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val; if (isValidNeighbour) { - val = operator()(cellNgh, card); + val = operator()(gidxNgh, card); } return NghData(val, isValidNeighbour); } - template + template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card, LambdaVALID funIfValid, LambdaNOTValid funIfNOTValid = nullptr) - const -> std::enable_if_t , void> + const -> std::enable_if_t, void> { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { - T val = this->operator()(cellNgh, card); + T val = this->operator()(gidxNgh, card); funIfValid(val); } if constexpr (!std::is_same_v) { @@ -171,129 +171,129 @@ class dPartition } } - template + template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card) const -> NghData { - NghData res; - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); + T val; if (isValidNeighbour) { - T val = operator()(cellNgh, card); - res.set(val, true); - } else { - res.invalidate(); + val = operator()(gidxNgh, card); } - return res; + return NghData(val, isValidNeighbour); } template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card, T const& defaultValue) const -> NghData { NghData res(defaultValue, false); - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { - T val = operator()(cellNgh, card); + T val = operator()(gidxNgh, card); res.set(val, true); } return res; } NEON_CUDA_HOST_DEVICE inline auto - nghVal(const Idx& eId, + nghVal(const Idx& gidx, uint8_t nghID, int card, const T& alternativeVal) const -> NghData { NghIdx nghOffset = mStencil[nghID]; - return getNghData(eId, nghOffset, card, alternativeVal); + return getNghData(gidx, nghOffset, card, alternativeVal); } /** * Get the index of the neighbor given the offset * @tparam dataView_ta - * @param[in] eId Index of the current element + * @param[in] gidx Index of the current element * @param[in] nghOffset Offset of the neighbor of interest from the current element * @param[in,out] neighbourIdx Index of the neighbor * @return Whether the neighbour is valid */ NEON_CUDA_HOST_DEVICE inline auto - nghIdx(const Idx& eId, - const NghIdx& nghOffset, - Idx& neighbourIdx) + helpGetNghIdx(const Idx& gidx, + const NghIdx& nghOffset, + Idx& neighbourIdx) const -> bool { - Idx cellNgh(eId.get().x + nghOffset.x, - eId.get().y + nghOffset.y, - eId.get().z + nghOffset.z); + Idx gidxNgh(gidx.getLocation().x + nghOffset.x, + gidx.getLocation().y + nghOffset.y, + gidx.getLocation().z + nghOffset.z); - const auto cellNghGlobal = getGlobalIndex(cellNgh); + const auto gidxNghGlobal = getGlobalIndex(gidxNgh); bool isValidNeighbour = true; - if (mPeriodicZ) { - printf("Error, periodic not implemented yet"); - assert(false); - } - - isValidNeighbour = (cellNghGlobal.x >= 0) && - (cellNghGlobal.y >= 0) && - (cellNghGlobal.z >= 0); - - // isValidNeighbour = (cellNgh.get().x < m_dim.x) && - // (cellNgh.get().y < m_dim.y) && - // (cellNgh.get().z < m_dim.z + 2 * m_zHaloRadius) && isValidNeighbour; + isValidNeighbour = (gidxNghGlobal.x >= 0) && + (gidxNghGlobal.y >= 0) && + (gidxNghGlobal.z >= 0); - isValidNeighbour = (cellNghGlobal.x < m_fullGridSize.x) && - (cellNghGlobal.y < m_fullGridSize.y) && - (cellNghGlobal.z < m_fullGridSize.z) && + isValidNeighbour = (gidxNghGlobal.x < mFullGridSize.x) && + (gidxNghGlobal.y < mFullGridSize.y) && + (gidxNghGlobal.z < mFullGridSize.z) && isValidNeighbour; if (isValidNeighbour) { - neighbourIdx = cellNgh; + neighbourIdx = gidxNgh; } return isValidNeighbour; } template NEON_CUDA_HOST_DEVICE inline auto - nghIdx(const Idx& eId, - Idx& cellNgh) + helpGetNghIdx(const Idx& gidx, + Idx& gidxNgh) const -> bool { - cellNgh = Idx(eId.get().x + xOff, - eId.get().y + yOff, - eId.get().z + zOff); - Idx cellNgh_global(cellNgh.get() + m_origin); - // const bool isValidNeighbour = (cellNgh_global >= 0 && cellNgh < (m_dim + m_halo) && cellNgh_global < m_fullGridSize); + // NghIdx offset(xOff, yOff, zOff); + // return helpGetNghIdx(gidx, offset, gidxNgh); + gidxNgh = Idx(gidx.getLocation().x + xOff, + gidx.getLocation().y + yOff, + gidx.getLocation().z + zOff); + bool isValidNeighbour = true; if constexpr (xOff > 0) { - isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().x <= m_fullGridSize.x && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (xOff < 0) { - isValidNeighbour = cellNgh_global.get().x >= 0 && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } if constexpr (yOff > 0) { - isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().y <= m_fullGridSize.y && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (yOff < 0) { - isValidNeighbour = cellNgh_global.get().y >= 0 && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } if constexpr (zOff > 0) { - isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().z <= m_fullGridSize.z && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (zOff < 0) { - isValidNeighbour = cellNgh_global.get().z >= m_zHaloRadius && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } return isValidNeighbour; } @@ -303,7 +303,7 @@ class dPartition mem() -> T* { - return m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto @@ -311,7 +311,7 @@ class dPartition const -> const T* { - return m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto @@ -319,7 +319,7 @@ class dPartition int cardinalityIdx) -> T* { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -327,7 +327,7 @@ class dPartition int cardinalityIdx) -> T& { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -335,7 +335,7 @@ class dPartition int cardinalityIdx) const -> const T& { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } template @@ -386,22 +386,35 @@ class dPartition // local.mLocation.y < m_dim.y && // local.mLocation.z < m_dim.z + m_zHaloRadius); - Neon::index_3d result = local.mLocation + m_origin; - result.z -= m_zHaloRadius; + Neon::index_3d result = local.mLocation; + result.z = result.z + mOrigin.z - mZHaloRadius; return result; } + template + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local) + const -> int + { + if constexpr (Neon::index_3d::directionZ != direction) { + return local.mLocation.v[direction]; + } else { + return local.mLocation.v[Neon::index_3d::directionZ] + + mOrigin.v[Neon::index_3d::directionZ] - + mZHaloRadius; + } + } + NEON_CUDA_HOST_DEVICE inline auto getDomainSize() const -> Neon::index_3d { - return m_fullGridSize; + return mFullGridSize; } auto ioToVti(std::string const& fname, std::string const& fieldName) { - auto fnameCommplete = fname + "_" + std::to_string(m_prtID); - auto haloOrigin = Vec_3d(m_origin.x, m_origin.y, m_origin.z - m_zHaloRadius); - auto haloDim = m_dim + Neon::index_3d(0, 0, 2 * m_zHaloRadius) + 1; + auto fnameCommplete = fname + "_" + std::to_string(mPrtID); + auto haloOrigin = Vec_3d(mOrigin.x, mOrigin.y, mOrigin.z - mZHaloRadius); + auto haloDim = mDim + Neon::index_3d(0, 0, 2 * mZHaloRadius) + 1; IoToVTK io(fnameCommplete, haloDim, @@ -413,25 +426,37 @@ class dPartition io.addField([&](const Neon::index_3d& idx, int i) { return operator()(dIndex(idx), i); }, - m_cardinality, "Partition", ioToVTKns::VtiDataType_e::voxel); + mCardinality, "Partition", ioToVTKns::VtiDataType_e::voxel); io.flushAndClear(); return; } + auto getDataView() + const -> Neon::DataView + { + return mDataView; + } + + auto helpGetGlobalToLocalOffets() + const -> NghIdx* + { + return mStencil; + } + private: - Neon::DataView m_dataView; - T* m_mem; - Neon::index_3d m_dim; - int m_zHaloRadius; - int m_zBoundaryRadius; - Pitch m_pitch; - int m_prtID; - Neon::index_3d m_origin; - int m_cardinality; - Neon::index_3d m_fullGridSize; - bool mPeriodicZ; - NghIdx* mStencil; + Neon::DataView mDataView; + T* NEON_RESTRICT mMem; + Neon::index_3d mDim; + int mZHaloRadius; + int mZBoundaryRadius; + Pitch mPitch; + int mPrtID; + Neon::index_3d mOrigin; + int mCardinality; + Neon::index_3d mFullGridSize; + bool mPeriodicZ; + NghIdx* NEON_RESTRICT mStencil; }; diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h index 8f6f9fea..9fb56572 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h @@ -10,29 +10,29 @@ dSpan::setAndValidate(Idx& idx, const -> bool { bool res = false; - idx.set().x = int(x); - idx.set().y = int(y); - idx.set().z = int(z); + idx.setLocation().x = int(x); + idx.setLocation().y = int(y); + idx.setLocation().z = int(z); - if (idx.get() < mDim) { + if (idx.getLocation() < mDim) { res = true; } switch (mDataView) { case Neon::DataView::STANDARD: { - idx.set().z += mZHaloRadius; + idx.setLocation().z += mZHaloRadius; return res; } case Neon::DataView::INTERNAL: { - idx.set().z += mZHaloRadius + mZBoundaryRadius; + idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; return res; } case Neon::DataView::BOUNDARY: { - idx.set().z += idx.get().z < mZBoundaryRadius + idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius ? 0 : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); - idx.set().z += mZHaloRadius; + idx.setLocation().z += mZHaloRadius; return res; } diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h new file mode 100644 index 00000000..7ce3e582 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h @@ -0,0 +1,98 @@ +#pragma once +#include + +#include "Neon/core/core.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/BlockConfig.h" +#include "Neon/set/Containter.h" +#include "Neon/set/DevSet.h" +#include "Neon/set/MemoryOptions.h" + +#include "Neon/sys/memory/MemDevice.h" + +#include "Neon/domain/aGrid.h" + +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/interface/GridConcept.h" +#include "Neon/domain/interface/KernelConfig.h" +#include "Neon/domain/interface/LaunchConfig.h" +#include "Neon/domain/interface/Stencil.h" +#include "Neon/domain/interface/common.h" + +#include "Neon/domain/tools/GridTransformer.h" +#include "Neon/domain/tools/SpanTable.h" + +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/patterns/PatternScalar.h" + +#include "dPartitionSoA.h" +#include "dSpanSoA.h" + +namespace Neon::domain::details::dGridSoA { + +namespace details { +struct dGridSoATransformation +{ + using FoundationGrid = Neon::domain::details::dGrid::dGrid; + using Idx = dIndexSoA; + using Span = dSpanSoA; + template + using Partition = dPartitionSoA; + + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + using ExecutionThreadSpanIndexType = int32_t; + + static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& + { + return foundationGrid.getDefaultBlock(); + } + + static auto initSpan(FoundationGrid& foundationGrid, Neon::domain::tool::SpanTable& spanTable) -> void + { + spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span.helpInit(foundationGrid.getSpan(execution, setIdx, dw)); + }); + } + + static auto initLaunchParameters(FoundationGrid& foundationGrid, + Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& shareMem) -> Neon::set::LaunchParameters + { + return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem); + } + + // static auto helpGetGridIdx(FoundationGrid&, + // Neon::SetIdx const&, + // FoundationGrid::Idx const& fgIdx) + // -> dGridSoATransformation::Idx + // { + // dGridSoATransformation::Idx tgIdx = fgIdx; + // return tgIdx; + // } + + template + static auto initFieldPartition(FoundationGrid::Field& foundationField, + Neon::domain::tool::PartitionTable>& partitionTable) -> void + { + partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Partition& partition) { + auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw); + partition = Partition(foundationPartition); + }); + } +}; + +} // namespace details +using dGridSoA = Neon::domain::tool::GridTransformer::Grid; + +} // namespace Neon::domain::details::dGridSoA diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h new file mode 100644 index 00000000..2ed82d86 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h @@ -0,0 +1,53 @@ +#pragma once + +#include "Neon/core/core.h" +#include "Neon/domain/details/dGridSoA/dIndexSoA.h" + +namespace Neon::domain::details::dGridSoA { + +// Common forward declarations +class dSpanSoA; +template +class dPartitionSoA; + +struct dIndexSoA +{ + using OuterIdx = dIndexSoA; + + template + friend class dPartition; + friend dSpanSoA; + + template + friend class dField; + + // dGrid specific types + using Offset = int32_t; + using Location = index_3d; + using Count = int32_t; + + dIndexSoA() = default; + Location mLocation = 0; + Offset mOffset = 0; + + NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location const& location, + Offset const& offset); + + NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location::Integer const& x, + Location::Integer const& y, + Location::Integer const& z, + Offset const& offset); + + NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&; + + NEON_CUDA_HOST_DEVICE inline auto setOffset() -> Offset&; + + NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&; + + NEON_CUDA_HOST_DEVICE inline auto getOffset() const -> const Offset&; +}; + +} // namespace Neon::domain::details::dGridSoA + +#include "dIndexSoA_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h new file mode 100644 index 00000000..790608c7 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h @@ -0,0 +1,50 @@ +#pragma once +#include "Neon/core/core.h" + +namespace Neon::domain::details::dGridSoA { + +NEON_CUDA_HOST_DEVICE inline dIndexSoA:: + dIndexSoA(const Location& location, + Offset const& offset) +{ + mLocation = location; + mOffset = offset; +} + +NEON_CUDA_HOST_DEVICE inline dIndexSoA:: + dIndexSoA(const Location::Integer& x, + const Location::Integer& y, + const Location::Integer& z, + Offset const& offset) +{ + mLocation.x = x; + mLocation.y = y; + mLocation.z = z; + mOffset = offset; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + setLocation() -> Location& +{ + return mLocation; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + setOffset() -> Offset& +{ + return mOffset; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + getLocation() const -> const Location& +{ + return mLocation; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + getOffset() + const -> const Offset& +{ + return mOffset; +} +} // namespace Neon::domain::details::dGridSoA \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h new file mode 100644 index 00000000..0572302b --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -0,0 +1,365 @@ +#pragma once +#include +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" +#include "Neon/domain/details/dGrid/dGrid.h" +#include "Neon/domain/interface/NghData.h" +#include "Neon/set/DevSet.h" +#include "Neon/sys/memory/CudaIntrinsics.h" +#include "cuda_fp16.h" +#include "dIndexSoA.h" + +namespace Neon::domain::details::dGridSoA { + +template +class dPartitionSoA +{ + public: + using Idx = dIndexSoA; + using NghData = Neon::domain::NghData; + using Pitch = uint32_4d; + using NghIdx = int8_3d; + using Type = T; + + dPartitionSoA() + { + } + + dPartitionSoA(Neon::domain::details::dGrid::dPartition& dPartitionOriginal) + { + mDataView = dPartitionOriginal.getDataView(); + mMem = dPartitionOriginal.mem(); + mDim = dPartitionOriginal.dim(); + mZHaloRadius = dPartitionOriginal.halo().z; + mPitch = dPartitionOriginal.getPitchData().template newType(); + mPrtID = dPartitionOriginal.prtID(); + mOrigin = dPartitionOriginal.origin(); + mCardinality = dPartitionOriginal.cardinality(); + mFullGridSize = dPartitionOriginal.getDomainSize(); + mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets(); + } + + inline NEON_CUDA_HOST_DEVICE auto + prtID() + const -> int + { + return mPrtID; + } + + inline NEON_CUDA_HOST_DEVICE auto + cardinality() + const -> int + { + return mCardinality; + } + + inline NEON_CUDA_HOST_DEVICE auto + getPitchData() + const -> const Pitch& + { + return mPitch; + } + + inline NEON_CUDA_HOST_DEVICE auto + getPitch(const Idx& idx, + int cardinality) + const -> Idx::Offset + { + return idx.getOffset() + cardinality * mPitch.w; + } + + inline NEON_CUDA_HOST_DEVICE auto + dim() + const -> const Neon::index_3d + { + return mDim; + } + + inline NEON_CUDA_HOST_DEVICE auto + halo() + const -> const Neon::index_3d + { + return Neon::index_3d(0, 0, mZHaloRadius); + } + + inline NEON_CUDA_HOST_DEVICE auto + origin() + const -> const Neon::index_3d + { + return mOrigin; + } + + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + NghIdx nghOffset, + int card, + const T& alternativeVal) + const -> NghData + { + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); + T val = alternativeVal; + if (isValidNeighbour) { + val = operator()(gidxNgh, card); + } + return NghData(val, isValidNeighbour); + } + + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + NghIdx nghOffset, + int card) + const -> NghData + { + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); + T val; + if (isValidNeighbour) { + val = operator()(gidxNgh, card); + } + return NghData(val, isValidNeighbour); + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t, void> + { + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = this->operator()(gidxNgh, card); + funIfValid(val); + } + if constexpr (!std::is_same_v) { + if (!isValidNeighbour) { + funIfNOTValid(); + } + } + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card) + const -> NghData + { + NghData res; + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = operator()(gidxNgh, card); + res.set(val, true); + } else { + res.invalidate(); + } + return res; + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + T const& defaultValue) + const -> NghData + { + NghData res(defaultValue, false); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = operator()(gidxNgh, card); + res.set(val, true); + } + return res; + } + + NEON_CUDA_HOST_DEVICE inline auto + nghVal(const Idx& gidx, + uint8_t nghID, + int card, + const T& alternativeVal) + const -> NghData + { + NghIdx nghOffset = mStencil[nghID]; + return getNghData(gidx, nghOffset, card, alternativeVal); + } + + /** + * Get the index of the neighbor given the offset + * @tparam dataView_ta + * @param[in] gidx Index of the current element + * @param[in] nghOffset Offset of the neighbor of interest from the current element + * @param[in,out] neighbourIdx Index of the neighbor + * @return Whether the neighbour is valid + */ + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& gidx, + const NghIdx& nghOffset, + Idx& neighbourIdx) + const -> bool + { + Neon::index_3d cartesian(gidx.getLocation().x + nghOffset.x, + gidx.getLocation().y + nghOffset.y, + gidx.getLocation().z + nghOffset.z); + + neighbourIdx = Idx(cartesian, gidx.getOffset() + + nghOffset.x * getPitchData().x + + nghOffset.y * getPitchData().y + + nghOffset.z * getPitchData().z); + + Neon::index_3d const nghCartesianIdx = getGlobalIndex(neighbourIdx); + + bool isValidNeighbour = true; + + isValidNeighbour = (nghCartesianIdx.x >= 0) && + (nghCartesianIdx.y >= 0) && + (nghCartesianIdx.z >= 0); + + isValidNeighbour = (nghCartesianIdx.x < mFullGridSize.x) && + (nghCartesianIdx.y < mFullGridSize.y) && + (nghCartesianIdx.z < mFullGridSize.z) && + isValidNeighbour; + + return isValidNeighbour; + } + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& gidx, + Idx& gidxNgh) + const -> bool + { + { + Neon::index_3d cartesian(gidx.getLocation().x + xOff, + gidx.getLocation().y + yOff, + gidx.getLocation().z + zOff); + gidxNgh = Idx(cartesian, gidx.getOffset() + + xOff * getPitchData().x + + yOff * getPitchData().y + + zOff * getPitchData().z); + } + + bool isValidNeighbour = true; + if constexpr (xOff > 0) { + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (xOff < 0) { + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + if constexpr (yOff > 0) { + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (yOff < 0) { + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + if constexpr (zOff > 0) { + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (zOff < 0) { + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + return isValidNeighbour; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem() + -> T* + { + return mMem; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem() const + -> const T* + { + return mMem; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem(const Idx& cell, + int cardinalityIdx) + -> T* + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mMem[p]; + } + + NEON_CUDA_HOST_DEVICE inline auto + operator()(const Idx& cell, + int cardinalityIdx) + -> T& + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mMem[p]; + } + + NEON_CUDA_HOST_DEVICE inline auto + operator()(const Idx& cell, + int cardinalityIdx) + const -> const T& + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mMem[p]; + } + + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local) + const -> Neon::index_3d + { + Neon::index_3d result = local.mLocation + mOrigin; + result.z -= mZHaloRadius; + return result; + } + + template + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local) + const -> int + { + if constexpr (Neon::index_3d::directionZ != direction) { + return local.mLocation.v[direction] + + mOrigin.v[direction]; + } else { + return local.mLocation.v[Neon::index_3d::directionZ] + + mOrigin.v[Neon::index_3d::directionZ] - + mZHaloRadius; + } + } + + NEON_CUDA_HOST_DEVICE inline auto getDomainSize() + const -> Neon::index_3d + { + return mFullGridSize; + } + + Neon::DataView mDataView; + T* NEON_RESTRICT mMem; + Neon::index_3d mDim; + int mZHaloRadius; + Pitch mPitch; + int mPrtID; + Neon::index_3d mOrigin; + int mCardinality; + Neon::index_3d mFullGridSize; + NghIdx* NEON_RESTRICT mStencil; +}; + +} // namespace Neon::domain::details::dGridSoA diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h new file mode 100644 index 00000000..3aee038c --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h @@ -0,0 +1,57 @@ +#pragma once +#include "Neon/set/DevSet.h" +#include "dIndexSoA.h" +#include "Neon/domain/details/dGrid/dSpan.h" + +namespace Neon::domain::details::dGridSoA { + +/** + * Abstraction that represents the Cell space of a partition + * This abstraction is used by the neon lambda executor to + * run a containers on aGrid + */ +class dSpanSoA +{ + public: + using Idx = dIndexSoA; + + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d3; + using ExecutionThreadSpanIndexType = int32_t; + + + NEON_CUDA_HOST_DEVICE inline auto + setAndValidate(Idx& idx, + const uint32_t& x, + const uint32_t& y, + const uint32_t& z) const + -> bool; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetDataView() + const -> Neon::DataView const&; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetZHaloRadius() + const -> int const&; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetZBoundaryRadius() + const -> int const&; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetDim() + const -> Neon::index_3d const&; + + NEON_CUDA_HOST_DEVICE inline auto + helpInit(Neon::domain::details::dGrid::dSpan const&) ->void; + + private: + Neon::DataView mDataView; + int mZHaloRadius; + int mZBoundaryRadius; + Neon::index_3d mDim /** Dimension of the span, its values depends on the mDataView*/; +}; + +} // namespace Neon::domain::details::dGrid + +#include "dSpanSoA_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h new file mode 100644 index 00000000..f760adb5 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h @@ -0,0 +1,86 @@ +#pragma once + +namespace Neon::domain::details::dGridSoA { + +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::setAndValidate(Idx& idx, + const uint32_t& x, + const uint32_t& y, + const uint32_t& z) + const -> bool +{ + idx.setLocation().x = int(x); + idx.setLocation().y = int(y); + idx.setLocation().z = int(z); + + bool isValid = idx.getLocation() < mDim; + + switch (mDataView) { + case Neon::DataView::STANDARD: { + idx.setLocation().z += mZHaloRadius; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; + } + case Neon::DataView::INTERNAL: { + idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; + } + case Neon::DataView::BOUNDARY: { + idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius + ? 0 + : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); + idx.setLocation().z += mZHaloRadius; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; + } + default: { + } + } + return isValid; +} + +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetDataView() + const -> Neon::DataView const& +{ + return mDataView; +} + +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetZHaloRadius() + const -> int const& +{ + return mZHaloRadius; +} + +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetZBoundaryRadius() + const -> int const& +{ + return mZBoundaryRadius; +} + +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetDim() + const -> Neon::index_3d const& +{ + return mDim; +} + +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) -> void +{ + mDataView = dspan.helpGetDataView(); + mZHaloRadius = dspan.helpGetZHaloRadius(); + mZBoundaryRadius = dspan.helpGetZBoundaryRadius(); + mDim = dspan.helpGetDim(); +} + + +} // namespace Neon::domain::details::dGridSoA \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h index 012a3588..05f3101b 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h @@ -59,7 +59,7 @@ class ePartition * | * | Connectivity table has the same layout of a field with cardinality equal to * | the number of neighbours and an SoA layout. Let's call this field nghField. - * | nghField(e, nghIdx) is the eIdx_t of the neighbour element as in a STANDARD + * | nghField(e, helpGetNghIdx) is the eIdx_t of the neighbour element as in a STANDARD * | view. * |--) */ @@ -188,6 +188,19 @@ class ePartition int card, T defaultValue) const -> NghData; + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t &&( std::is_invocable_v || std::is_same_v), void>; + /** * Check is the * @tparam dataView_ta diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h index 0063ee9e..8565cdc1 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h @@ -37,34 +37,34 @@ ePartition::cardinality() const template NEON_CUDA_HOST_DEVICE inline auto -ePartition::operator()(eIndex eId, int cardinalityIdx) const +ePartition::operator()(eIndex gidx, int cardinalityIdx) const -> T { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem[jump]; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::operator()(eIndex eId, int cardinalityIdx) -> T& +ePartition::operator()(eIndex gidx, int cardinalityIdx) -> T& { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem[jump]; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, +ePartition::getNghData(eIndex gidx, NghIdx nghIdx, int card) const -> NghData { - eIndex eIdxNgh; - const bool isValidNeighbour = isValidNgh(eId, nghIdx, eIdxNgh); + eIndex gidxxNgh; + const bool isValidNeighbour = isValidNgh(gidx, nghIdx, gidxxNgh); if (isValidNeighbour) { - T val = this->operator()(eIdxNgh, card); + T val = this->operator()(gidxxNgh, card); return NghData(val, isValidNeighbour); } return NghData(isValidNeighbour); @@ -73,7 +73,7 @@ ePartition::getNghData(eIndex eId, template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, +ePartition::getNghData(eIndex gidx, const Neon::int8_3d& ngh3dIdx, int card) const -> NghData @@ -82,7 +82,7 @@ ePartition::getNghData(eIndex eId, (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); return res; } @@ -91,15 +91,15 @@ template template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, - int card) +ePartition::getNghData(eIndex gidx, + int card) const -> NghData { int tablePithc = (xOff + mStencilRadius) + (yOff + mStencilRadius) * mStencilTableYPitch + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); return res; } @@ -108,37 +108,66 @@ template template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, - int card, - T defaultVal) +ePartition::getNghData(eIndex gidx, + int card, + T defaultVal) const -> NghData { int tablePithc = (xOff + mStencilRadius) + (yOff + mStencilRadius) * mStencilTableYPitch + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); if (!res.isValid()) { res.set(defaultVal, false); } return res; } +template +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + int tablePithc = (xOff + mStencilRadius) + + (yOff + mStencilRadius) * mStencilTableYPitch + + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(gidx, nghIdx, card); + if (res.isValid()) { + funIfValid(res.getData()); + return; + } + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} + template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghIndex(eIndex eId, +ePartition::getNghIndex(eIndex gidx, const Neon::int8_3d& ngh3dIdx, - eIndex& eIdxNgh) const -> bool + eIndex& gidxxNgh) const -> bool { int tablePithc = (ngh3dIdx.x + mStencilRadius) + (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; eIndex tmpEIdxNgh; - const bool isValidNeighbour = isValidNgh(eId, nghIdx, tmpEIdxNgh); + const bool isValidNeighbour = isValidNgh(gidx, nghIdx, tmpEIdxNgh); if (isValidNeighbour) { - eIdxNgh = tmpEIdxNgh; + gidxxNgh = tmpEIdxNgh; } return isValidNeighbour; } @@ -146,17 +175,17 @@ ePartition::getNghIndex(eIndex eId, template NEON_CUDA_HOST_DEVICE inline auto -ePartition::isValidNgh(eIndex eId, +ePartition::isValidNgh(eIndex gidx, NghIdx nghIdx, eIndex& neighbourIdx) const -> bool { - const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + eId.helpGet(); + const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + gidx.helpGet(); neighbourIdx.helpSet() = NEON_CUDA_CONST_LOAD((mConnectivity + connectivityJumo)); const bool isValidNeighbour = (neighbourIdx.mIdx > -1); - // printf("(prtId %d) getNghData id %d eIdxNgh %d connectivityJumo %d\n", + // printf("(prtId %d) getNghData id %d gidxxNgh %d connectivityJumo %d\n", // mPrtID, - // eId.mIdx, neighbourIdx.mIdx, connectivityJumo); + // gidx.mIdx, neighbourIdx.mIdx, connectivityJumo); return isValidNeighbour; } @@ -201,20 +230,20 @@ ePartition::ePartition(int prtId, template NEON_CUDA_HOST_DEVICE auto -ePartition::pointer(eIndex eId, int cardinalityIdx) const +ePartition::pointer(eIndex gidx, int cardinalityIdx) const -> const Type* { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem + jump; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getOffset(eIndex eId, int cardinalityIdx) const +ePartition::getOffset(eIndex gidx, int cardinalityIdx) const -> Offset { - return Offset(eId.helpGet() * mPitch.x + cardinalityIdx * mPitch.y); + return Offset(gidx.helpGet() * mPitch.x + cardinalityIdx * mPitch.y); } template class GridTransformer { public: + using Idx = typename GridTransformation::Idx; + using Span = typename GridTransformation::Span; template using Partition = typename GridTransformation::template Partition; - using Span = typename GridTransformation::Span; using FoundationGrid = typename GridTransformation::FoundationGrid; using Grid = details::tGrid; diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h index c9ca59b9..a1b4c90d 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h @@ -26,6 +26,7 @@ class tField : public Neon::domain::interface::FieldBaseTemplate; using Idx = typename Partition::Idx; using NghIdx = typename Partition::NghIdx; // for compatibility with eGrid + using NghData = typename Partition::NghData; // for compatibility with eGrid private: using FoundationGrid = typename GridTransformation::FoundationGrid; diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h index d6d98be1..bd28e8f5 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h @@ -54,6 +54,15 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate + tGrid(const Neon::Backend& backend /**< Target for computation */, + const Neon::int32_3d& dimension /**< Dimension of the bounding box containing the domain */, + const SparsityPattern& activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */, + const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */, + const Vec_3d& spacing = Vec_3d(1, 1, 1) /**< Spacing, i.e. size of a voxel */, + const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */); + tGrid(const tGrid& other); // copy constructor tGrid(tGrid&& other) noexcept; // move constructor tGrid& operator=(const tGrid& other); // copy assignment @@ -109,7 +118,7 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate(bk); } diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h index 4ba1403d..0a0249d7 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h @@ -30,6 +30,34 @@ tGrid::tGrid(FoundationGrid& foundationGrid) foundationGrid.getOrigin()); } +template +template +tGrid::tGrid(const Neon::Backend& bk, + const Neon::int32_3d& dimension, + const SparsityPattern& activeCellLambda, + const Neon::domain::Stencil& stencil, + const Vec_3d& spacing, + const Vec_3d& origin) +{ + mData = std::make_shared(bk); + mData->foundationGrid = FoundationGrid(bk, + dimension, + activeCellLambda, + stencil, + spacing, + origin); + GridTransformation::initSpan(mData->foundationGrid, + NEON_OUT mData->spanTable); + tGrid::GridBase::init("tGrid", + bk, + mData->foundationGrid.getDimension(), + mData->foundationGrid.getStencil(), + mData->foundationGrid.getNumActiveCellsPerPartition(), + mData->foundationGrid.getDefaultBlock(), + mData->foundationGrid.getSpacing(), + mData->foundationGrid.getOrigin()); +} + template tGrid::tGrid() { diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu index 158d3e05..1b94b566 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu +++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu @@ -1,5 +1,6 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" @@ -27,18 +28,18 @@ auto defContainer(int streamIdx, return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); Neon::index_3d globalPoint = a.getGlobalIndex(e); - a(e, 0) = globalPoint.x ; + a(e, 0) = globalPoint.x; b(e, 0) = globalPoint.y; c(e, 0) = globalPoint.z; -// if constexpr (std::is_same_v) { -// printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx, -// e.mInDataBlockIdx.x, -// e.mInDataBlockIdx.y, -// e.mInDataBlockIdx.z, -// globalPoint.x, -// globalPoint.y, -// globalPoint.z); -// } + // if constexpr (std::is_same_v) { + // printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx, + // e.mInDataBlockIdx.x, + // e.mInDataBlockIdx.y, + // e.mInDataBlockIdx.z, + // globalPoint.x, + // globalPoint.y, + // globalPoint.z); + // } }; }); } @@ -98,5 +99,6 @@ auto run(TestData& data) -> void template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace globalIdx \ No newline at end of file diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h index 0a3b87eb..c766f7ca 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h +++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h @@ -3,9 +3,9 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" - namespace globalIdx { using namespace Neon::domain::tool::testing; @@ -15,6 +15,7 @@ auto run(TestData& data) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; -} // namespace map +} // namespace globalIdx diff --git a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp index 783830ca..f0ecce78 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp @@ -4,7 +4,7 @@ #include "globalIdx.h" #include "runHelper.h" -TEST(domain_unit_test_globalIdx, dGrid) +TEST(domain_globalIdx, dGrid) { int nGpus = 3; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid) 1); } -TEST(domain_unit_test_globalIdx, eGrid) +TEST(domain_globalIdx, eGrid) { int nGpus = 3; using Type = int64_t; @@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid) 1); } -TEST(domain_unit_test_globalIdx, bGrid) +TEST(domain_globalIdx, bGrid) { int nGpus = 3; using Type = int64_t; @@ -31,6 +31,15 @@ TEST(domain_unit_test_globalIdx, bGrid) 1); } +TEST(domain_globalIdx, dGridSoA) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/gtests.cpp b/libNeonDomain/tests/domain-map/src/gtests.cpp index d0d43b60..50d6e34d 100644 --- a/libNeonDomain/tests/domain-map/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map/src/gtests.cpp @@ -31,6 +31,15 @@ TEST(domain_map, bGrid) 1); } +TEST(domain_map, dGridSoA) +{ + int nGpus = 1; + using Type = int64_t; + runAllTestConfiguration(std::function(map::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/map.cu b/libNeonDomain/tests/domain-map/src/map.cu index bd25f178..b001d832 100644 --- a/libNeonDomain/tests/domain-map/src/map.cu +++ b/libNeonDomain/tests/domain-map/src/map.cu @@ -4,6 +4,7 @@ #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" #include "gtest/gtest.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace map { @@ -75,6 +76,7 @@ auto run(TestData& data) -> void template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map/src/map.h b/libNeonDomain/tests/domain-map/src/map.h index 611f2046..16073657 100644 --- a/libNeonDomain/tests/domain-map/src/map.h +++ b/libNeonDomain/tests/domain-map/src/map.h @@ -4,6 +4,7 @@ #include "Neon/domain/Grids.h" #include "Neon/domain/tools/TestData.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace map { @@ -14,6 +15,8 @@ auto run(TestData& data) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; } // namespace map diff --git a/libNeonDomain/tests/domain-map/src/runHelper.h b/libNeonDomain/tests/domain-map/src/runHelper.h index 53ea8681..593e31c2 100644 --- a/libNeonDomain/tests/domain-map/src/runHelper.h +++ b/libNeonDomain/tests/domain-map/src/runHelper.h @@ -31,7 +31,7 @@ void runAllTestConfiguration( nGpuTest.push_back(i); } // std::vector nGpuTest{2,4,6,8}; - std::vector cardinalityTest{1}; + std::vector cardinalityTest{1,3,19}; std::vector dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}}; std::vector runtimeE{Neon::Runtime::openmp}; @@ -95,6 +95,7 @@ void runAllTestConfiguration( } } +#if 0 template void runOneTestConfiguration(const std::string& gname, @@ -144,3 +145,4 @@ void runOneTestConfiguration(const std::string& gname, } } } +#endif \ No newline at end of file diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp index feba5a9b..21bba9b5 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp @@ -1,10 +1,10 @@ +#include "./testsAndContainers.h" #include "Neon/Neon.h" #include "gtest/gtest.h" -#include "./testsAndContainers.h" #include "runHelper.h" -TEST(domain_unit_test_globalIdx, dGrid) +TEST(domain_neighbour_globalIdx, dGrid) { int nGpus = 5; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid) 1); } -TEST(domain_unit_test_globalIdx, eGrid) +TEST(domain_neighbour_globalIdx, eGrid) { int nGpus = 5; using Type = int64_t; @@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid) 1); } -TEST(domain_unit_test_globalIdx, bGrid) +TEST(domain_neighbour_globalIdx, bGrid) { int nGpus = 5; using Type = int64_t; @@ -31,6 +31,53 @@ TEST(domain_unit_test_globalIdx, bGrid) 1); } +TEST(domain_neighbour_globalIdx, dGridSoA) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::run), + nGpus, + 1); +} + +/////////////////////////////////////////// + +TEST(domain_neighbour_globalIdx, dGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, eGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, bGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, dGridSoA_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h index 0014594c..32a078d6 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h @@ -9,6 +9,7 @@ #include "Neon/domain/dGrid.h" #include "Neon/domain/eGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/Geometries.h" #include "Neon/domain/tools/TestData.h" diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu index 49dd3bd2..7b2c3fef 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu @@ -1,5 +1,6 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" @@ -61,15 +62,15 @@ auto checkNeighbourData(Field const& filedA, Field const& filedB, Field const& filedC, Neon::index_3d testDirection, - Field const& checkFlatA, - Field const& checkFlatB, - Field const& checkFlatC) + Field& checkFlatA, + Field& checkFlatB, + Field& checkFlatC) -> Neon::set::Container { const auto& grid = filedA.getGrid(); return grid.newContainer( "defContainer", - [&](Neon::set::Loader& loader) { + [&, testDirection](Neon::set::Loader& loader) { auto a = loader.load(filedA, Neon::Pattern::STENCIL); auto b = loader.load(filedB, Neon::Pattern::STENCIL); auto c = loader.load(filedC, Neon::Pattern::STENCIL); @@ -102,6 +103,58 @@ auto checkNeighbourData(Field const& filedA, }); } +template +auto checkNeighbourDataTemplate(Field const& filedA, + Field const& filedB, + Field const& filedC, + Field& checkFlatA, + Field& checkFlatB, + Field& checkFlatC) + -> Neon::set::Container +{ + const auto& grid = filedA.getGrid(); + return grid.newContainer( + "defContainer", + [&](Neon::set::Loader& loader) { + auto a = loader.load(filedA, Neon::Pattern::STENCIL); + auto b = loader.load(filedB, Neon::Pattern::STENCIL); + auto c = loader.load(filedC, Neon::Pattern::STENCIL); + + auto resA = loader.load(checkFlatA, Neon::Pattern::MAP); + auto resB = loader.load(checkFlatB, Neon::Pattern::MAP); + auto resC = loader.load(checkFlatC, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { + constexpr Neon::index_3d testDirection(xOff, yOff, zOff); + + // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); + Neon::index_3d globalPoint = a.getGlobalIndex(e); + auto ngh = globalPoint + testDirection; + + decltype(a)* nghInfo[3] = {&a, &b, &c}; + decltype(a)* results[3] = {&resA, &resB, &resC}; + + for (int i = 0; i < 3; i++) { + auto d = nghInfo[i]->template getNghData(e, 0); + // auto d = nghInfo[i]->getNghData(e, testDirection.newType(), 0); + + if (d.isValid()) { + results[i]->operator()(e, 0) = d.getData() == ngh.v[i] ? +1 : -1; + if (d.getData() != ngh.v[i]) { + printf("ERROR: %d %d %d %d %d %d\n", globalPoint.x, globalPoint.y, globalPoint.z, ngh.v[0], ngh.v[1], ngh.v[2]); + d = nghInfo[i]->getNghData(e, testDirection.newType(), 0); + } + } else { + results[i]->operator()(e, 0) = 0; + } + } + }; + }); +} + using namespace Neon::domain::tool::testing; template @@ -165,15 +218,15 @@ auto run(TestData& data) -> void X, Y, Z); }; - // constexpr std::array - // stencil{Ngh3DIdx(1, 0, 0), - // Ngh3DIdx(-1, 0, 0), - // Ngh3DIdx(0, 1, 0), - // Ngh3DIdx(0, -1, 0), - // Ngh3DIdx(0, 0, 1), - // Ngh3DIdx(0, 0, -1)}; - constexpr std::array - stencil{Ngh3DIdx(0, 0, -1)}; + constexpr std::array + stencil{Ngh3DIdx(1, 0, 0), + Ngh3DIdx(-1, 0, 0), + Ngh3DIdx(0, 1, 0), + Ngh3DIdx(0, -1, 0), + Ngh3DIdx(0, 0, 1), + Ngh3DIdx(0, 0, -1)}; + // constexpr std::array + // stencil{Ngh3DIdx(0, 0, -1)}; for (auto const& direction : stencil) { reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx); @@ -214,8 +267,149 @@ auto run(TestData& data) -> void } } +template +auto runTemplate(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + + data.resetValuesToLinear(1, 100); + + auto aField = grid.template newField("a", 1, 0); + auto bField = grid.template newField("a", 1, 0); + auto cField = grid.template newField("a", 1, 0); + + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + auto& Z = data.getField(FieldNames::Z); + + const Neon::index_3d dim = grid.getDimension(); + auto bk = grid.getBackend(); + + { // NEON + { + initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + } + using Ngh3DIdx = Neon::int32_3d; + + auto setGolden = [&](Ngh3DIdx const& direction) { // Golden data + auto& X = data.getIODomain(FieldNames::X); + auto& Y = data.getIODomain(FieldNames::Y); + auto& Z = data.getIODomain(FieldNames::Z); + + data.forEachActiveIODomain([&](const Neon::index_3d& idx, + int cardinality, + Type& a, + Type& b, + Type& c) { + a = 1; + b = 1; + c = 1; + auto ngh = direction + idx; + if (!(ngh >= 0)) { + a = 0; + b = 0; + c = 0; + } + if (!(dim > ngh)) { + a = 0; + b = 0; + c = 0; + } + }, + X, Y, Z); + }; + + constexpr std::array + stencil{Ngh3DIdx(1, 0, 0), + Ngh3DIdx(-1, 0, 0), + Ngh3DIdx(0, 1, 0), + Ngh3DIdx(0, -1, 0), + Ngh3DIdx(0, 0, 1), + Ngh3DIdx(0, 0, -1)}; + // constexpr std::array + // stencil{Ngh3DIdx(0, 0, -1)}; + + for (auto const& direction : stencil) { + reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + reset(X, Y, Z).run(Neon::Backend::mainStreamIdx); + { // Updating halo with wrong data + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + { + initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + + + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + + if (direction == Neon::index_3d(1, 0, 0)) { + checkNeighbourDataTemplate<1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(-1, 0, 0)) { + checkNeighbourDataTemplate<-1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 1, 0)) { + checkNeighbourDataTemplate<0, 1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, -1, 0)) { + checkNeighbourDataTemplate<0, -1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 0, 1)) { + checkNeighbourDataTemplate<0, 0, 1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 0, -1)) { + checkNeighbourDataTemplate<0, 0, -1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else { + std::cout << "Direction not implemented " << direction << std::endl; + exit(99); + } + setGolden(direction); + + bk.sync(Neon::Backend::mainStreamIdx); + bool isOk = data.compare(FieldNames::X); + isOk = isOk && data.compare(FieldNames::Y); + isOk = isOk && data.compare(FieldNames::Z); + + if (!isOk) { + std::cout << "Direction with errors " << direction << std::endl; + data.getField(FieldNames::X).ioToVtk(grid.getImplementationName() + "X", "X", true); + data.getField(FieldNames::Y).ioToVtk(grid.getImplementationName() + "Y", "Y", true); + data.getField(FieldNames::Z).ioToVtk(grid.getImplementationName() + "Z", "Z", true); + exit(77); + ASSERT_TRUE(isOk); + } + } +} + + template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; + + +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; } // namespace globalIdx \ No newline at end of file diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h index 0a3b87eb..bcf503f2 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h @@ -4,6 +4,7 @@ #include "Neon/domain/Grids.h" #include "Neon/domain/tools/TestData.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace globalIdx { @@ -12,9 +13,17 @@ using namespace Neon::domain::tool::testing; template auto run(TestData& data) -> void; +template +auto runTemplate(TestData& data) -> void; + extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; } // namespace map diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp index ec6f892a..9fed3354 100644 --- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp +++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp @@ -4,29 +4,74 @@ #include "runHelper.h" #include "stencil.h" -TEST(domain_stencil, dGrid) +TEST(domain_stencil, dGrid_NoTemplate) { int nGpus = 3; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), nGpus, 1); } -TEST(domain_stencil, eGrid) +TEST(domain_stencil, eGrid_NoTemplate) { int nGpus = 3; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), nGpus, 1); } -TEST(domain_stencil, bGri ) +TEST(domain_stencil, bGri_NoTemplate) { int nGpus = 5; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, dGridSoA_NoTemplate) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runNoTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, dGrid_Template) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, eGrid_Template) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, bGri_Template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, dGridSoA_Template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), nGpus, 1); } diff --git a/libNeonDomain/tests/domain-stencil/src/runHelper.h b/libNeonDomain/tests/domain-stencil/src/runHelper.h index e8f286ae..16cefb0f 100644 --- a/libNeonDomain/tests/domain-stencil/src/runHelper.h +++ b/libNeonDomain/tests/domain-stencil/src/runHelper.h @@ -33,7 +33,7 @@ void runAllTestConfiguration( // std::vector nGpuTest{2,4,6,8}; std::vector cardinalityTest{1}; - std::vector dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}}; + std::vector dimTest{{10, 17, 90}, {1, 1, 100}, {17, 1, 77}}; std::vector runtimeE{Neon::Runtime::openmp}; if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) { runtimeE.push_back(Neon::Runtime::stream); diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index a86f1def..6cd4f6ff 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -9,8 +9,8 @@ namespace map { template -auto stencilContainer_laplace(const Field& filedA, - Field& fieldB) +auto laplaceNoTemplate(const Field& filedA, + Field& fieldB) -> Neon::set::Container { const auto& grid = filedA.getGrid(); @@ -59,20 +59,37 @@ static constexpr std::array stencil{ Ngh3DIdx(0, 0, 1), Ngh3DIdx(0, 0, -1)}; -template -inline auto viaTemplate (const IDX& idx, int i, const Field& a, int& partial, int& count){ - a.template getNghData(idx, i, - [&](typename Field::Type const& val) { - partial += val; - count++; - }); +template +NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Partition& a, Partial& partial, int& count) +{ + // Neon::index_3d direction(X, Y, Z); + // auto nghData = a.getNghData(idx, direction.newType(), i); + // if (nghData.isValid()) { + // partial += nghData.getData(); + // count++; + // } + a.template getNghData(idx, i, + [&](typename Partition::Type const& val) { + partial += val; + count++; + }); }; + +//template +//constexpr void constexpr_for(F&& f) +//{ +// if constexpr (Start < End) { +// f(std::integral_constant()); +// constexpr_for(f); +// } +//} + template -auto stencilContainerLaplaceTemplate(const Field& filedA, - Field& fieldB) +auto laplaceTemplate(const Field& filedA, + Field& fieldB) -> Neon::set::Container { const auto& grid = filedA.getGrid(); @@ -88,35 +105,18 @@ auto stencilContainerLaplaceTemplate(const Field& filedA, // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); typename Field::Type partial = 0; int count = 0; + using Ngh3DIdx = Neon::int8_3d; - constexpr std::array stencil{ - Ngh3DIdx(1, 0, 0), - Ngh3DIdx(-1, 0, 0), - Ngh3DIdx(0, 1, 0), - Ngh3DIdx(0, -1, 0), - Ngh3DIdx(0, 0, 1), - Ngh3DIdx(0, 0, -1)}; - -#if 0 - auto viaTemplate = [&]() { - if constexpr (std::is_same_v) { - a.template getNghData(idx, i, - [&](Field::Type const& val) { - partial += val; - count++; - }); - } - }; -#endif - viaTemplate<0>(idx, i, a, partial, count); - viaTemplate<1>(idx, i, a, partial, count); - viaTemplate<2>(idx, i, a, partial, count); - viaTemplate<3>(idx, i, a, partial, count); - viaTemplate<4>(idx, i, a, partial, count); - viaTemplate<5>(idx, i, a, partial, count); - + Neon::ConstexprFor<0, 6, 1>([&](auto sIdx) { + a.template getNghData(idx, i, + [&](auto const& val) { + partial += val; + count++; + }); + }); + b(idx, i) = a(idx, i) - count * partial; } }; @@ -126,7 +126,82 @@ auto stencilContainerLaplaceTemplate(const Field& filedA, using namespace Neon::domain::tool::testing; template -auto run(TestData& data) -> void +auto runNoTemplate(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + const int maxIters = 1; + + NEON_INFO(grid.toString()); + + // data.resetValuesToLinear(1, 100); + data.resetValuesToMasked(1); + + { // NEON + const Neon::index_3d dim = grid.getDimension(); + std::vector elements; + auto bk = grid.getBackend(); + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + for (int iter = maxIters; iter > 0; iter--) { + bk.sync(Neon::Backend::mainStreamIdx); + X.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::put, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + laplaceNoTemplate(X, Y).run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + Y.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx); + } + data.getBackend().sync(0); + } + + { // Golden data + auto& X = data.getIODomain(FieldNames::X); + auto& Y = data.getIODomain(FieldNames::Y); + for (int iter = maxIters; iter > 0; iter--) { + data.laplace(X, Y); + data.laplace(Y, X); + } + } + + data.updateHostData(); + + data.getField(FieldNames::X).ioToVtk("X", "X", true); + // data.getField(FieldNames::Y).ioToVtk("Y", "Y", false); + // data.getField(FieldNames::Z).ioToVtk("Z", "Z", false); + // + data.getIODomain(FieldNames::X).ioToVti("X_", "X_"); + // data.getField(FieldNames::Y).ioVtiAllocator("Y_"); + // data.getField(FieldNames::Z).ioVtiAllocator("Z_"); + + bool isOk = data.compare(FieldNames::X); + isOk = data.compare(FieldNames::Y); + if (!isOk) { + auto flagField = data.compareAndGetField(FieldNames::X); + flagField.ioToVti("X_diffFlag", "X_diffFlag"); + flagField = data.compareAndGetField(FieldNames::Y); + flagField.ioToVti("Y_diffFlag", "Y_diffFlag"); + } + ASSERT_TRUE(isOk); + if (!isOk) { + exit(99); + } +} + +template +auto runTemplate(TestData& data) -> void { using Type = typename TestData::Type; @@ -153,7 +228,7 @@ auto run(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - stencilContainer_laplace(X, Y).run(Neon::Backend::mainStreamIdx); + laplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); Y.newHaloUpdate(Neon::set::StencilSemantic::standard, @@ -162,7 +237,7 @@ auto run(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - stencilContainer_laplace(Y, X).run(Neon::Backend::mainStreamIdx); + laplaceTemplate(Y, X).run(Neon::Backend::mainStreamIdx); } data.getBackend().sync(0); } @@ -200,9 +275,14 @@ auto run(TestData& data) -> void } } -template auto run(TestData&) -> void; -template auto run(TestData&) -> void; -template auto run(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.h b/libNeonDomain/tests/domain-stencil/src/stencil.h index a35d8011..456f5f01 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.h +++ b/libNeonDomain/tests/domain-stencil/src/stencil.h @@ -11,9 +11,20 @@ namespace map { using namespace Neon::domain::tool::testing; template -auto run(TestData& data) -> void; +auto runNoTemplate(TestData& data) -> void; -extern template auto run(TestData&) -> void; -extern template auto run(TestData&) -> void; +template +auto runTemplate(TestData& data) -> void; + + +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; + +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; } // namespace map