diff --git a/Makefile b/Makefile index 3e76c25d2c..428191ab59 100644 --- a/Makefile +++ b/Makefile @@ -111,7 +111,7 @@ OMP_SIMD_FLAG.oneAPI := $(OMP_SIMD_FLAG.clang) SYCL_FLAG.gcc := SYCL_FLAG.clang := -fsycl SYCL_FLAG.icc := -SYCL_FLAG.oneAPI := -fsycl +SYCL_FLAG.oneAPI := -fsycl -fno-sycl-id-queries-fit-in-int OPT.gcc := -g -ffp-contract=fast OPT.clang := $(OPT.gcc) OPT.oneAPI := $(OPT.clang) @@ -275,6 +275,7 @@ hip-gen.cpp := $(sort $(wildcard backends/hip-gen/*.cpp)) sycl-core.cpp := $(sort $(wildcard backends/sycl/*.sycl.cpp)) sycl-ref.cpp := $(sort $(wildcard backends/sycl-ref/*.sycl.cpp)) sycl-shared.cpp:= $(sort $(wildcard backends/sycl-shared/*.sycl.cpp)) +sycl-gen.cpp := $(sort $(wildcard backends/sycl-gen/*.sycl.cpp)) hip-all.c := interface/ceed-hip.c $(hip.c) $(hip-ref.c) $(hip-shared.c) $(hip-gen.c) hip-all.cpp := $(hip.cpp) $(hip-ref.cpp) $(hip-gen.cpp) @@ -453,7 +454,7 @@ ifneq ($(HIP_LIB_DIR),) endif # SYCL Backends -SYCL_BACKENDS = /gpu/sycl/ref /gpu/sycl/shared +SYCL_BACKENDS = /gpu/sycl/ref /gpu/sycl/shared /gpu/sycl/gen ifneq ($(SYCL_DIR),) SYCL_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(SYCL_DIR)/$d/libsycl.${SO_EXT})) SYCL_LIB_DIR := $(patsubst %/,%,$(dir $(firstword $(SYCL_LIB_DIR)))) @@ -461,7 +462,7 @@ endif ifneq ($(SYCL_LIB_DIR),) PKG_LIBS += $(SYCL_FLAG) -lze_loader LIBCEED_CONTAINS_CXX = 1 - libceed.sycl += $(sycl-core.cpp) $(sycl-ref.cpp) $(sycl-shared.cpp) + libceed.sycl += $(sycl-core.cpp) $(sycl-ref.cpp) $(sycl-shared.cpp) $(sycl-gen.cpp) BACKENDS_MAKE += $(SYCL_BACKENDS) endif @@ -556,6 +557,9 @@ $(OBJDIR)/%.o : $(CURDIR)/%.hip.cpp | $$(@D)/.DIR $(OBJDIR)/%.o : $(CURDIR)/%.sycl.cpp | $$(@D)/.DIR $(call quiet,SYCLCXX) $(SYCLFLAGS) $(CPPFLAGS) -c -o $@ $(abspath $<) +$(OBJDIR)/%.o : $(CURDIR)/%.sycl.cpp | $$(@D)/.DIR + $(call quiet,SYCLCXX) $(SYCLFLAGS) $(CPPFLAGS) -c -o $@ $(abspath $<) + $(OBJDIR)/%$(EXE_SUFFIX) : tests/%.c | $$(@D)/.DIR $(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) 
$(LDLIBS) diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h index 23989a0a27..34489a7cd0 100644 --- a/backends/ceed-backend-list.h +++ b/backends/ceed-backend-list.h @@ -21,6 +21,7 @@ CEED_BACKEND(CeedRegister_Hip_Gen, 1, "/gpu/hip/gen") CEED_BACKEND(CeedRegister_Hip_Shared, 1, "/gpu/hip/shared") CEED_BACKEND(CeedRegister_Sycl, 1, "/gpu/sycl/ref") CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared") +CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen") CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma") CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det") CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked") diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp new file mode 100644 index 0000000000..ca4052739c --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp @@ -0,0 +1,14 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#ifndef _ceed_sycl_gen_operator_build_h +#define _ceed_sycl_gen_operator_build_h + +CEED_INTERN int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes); +CEED_INTERN int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op); + +#endif // _ceed_sycl_gen_operator_build_h diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp new file mode 100644 index 0000000000..2f96d6fe0d --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp @@ -0,0 +1,751 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed

+#define CEED_DEBUG_COLOR 12
+
+#include <ceed/ceed.h>
+#include <ceed/backend.h>
+#include <ceed/jit-tools.h>
+#include <sycl/sycl.hpp>
+
+#include <map>
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "../sycl-ref/ceed-sycl-ref.hpp"
+#include "../sycl-shared/ceed-sycl-shared.hpp"
+#include "../sycl/ceed-sycl-compile.hpp"
+
+#include "ceed-sycl-gen.hpp"
+
+//------------------------------------------------------------------------------
+// Calculate the block size used for launching the operator kernel
+//------------------------------------------------------------------------------
+extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
+  const CeedInt thread1d = CeedIntMax(Q_1d, P_1d);
+  if (dim == 1) {
+    CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64;
+    elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
+    block_sizes[0] = thread1d;
+    block_sizes[1] = 1;
+    block_sizes[2] = elems_per_block;
+  } else if (dim == 2) {
+    const CeedInt elems_per_block = thread1d < 4 ? 16 : 2;
+    block_sizes[0] = thread1d;
+    block_sizes[1] = thread1d;
+    block_sizes[2] = elems_per_block;
+  } else if (dim == 3) {
+    const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1);
+    block_sizes[0] = thread1d;
+    block_sizes[1] = thread1d;
+    block_sizes[2] = elems_per_block;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Build single operator kernel
+// - [ ] Check arguments to device functions reused from sycl-shared-basis are correct
+// - [ ] Do kernel jitting!
+//------------------------------------------------------------------------------ +extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { + bool is_setup_done; + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); + if (is_setup_done) return CEED_ERROR_SUCCESS; + + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed_Sycl *sycl_data; + CeedCallBackend(CeedGetData(ceed, &sycl_data)); + + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); + Fields_Sycl h_B, h_G; + FieldsInt_Sycl h_indices; + CeedQFunction qf; + CeedQFunction_Sycl_gen *qf_impl; + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); + CeedSize lsize; + CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + Q_1d = Q; + + CeedOperatorField *op_input_fields, *op_output_fields; + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + CeedEvalMode eval_mode; + CeedBasis basis; + CeedBasis_Sycl_shared *basis_impl; + CeedElemRestriction Erestrict; + CeedElemRestriction_Sycl *restr_impl; + + // Check for restriction only identity operator + bool is_identity_qf; + CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); + if (is_identity_qf) { + CeedEvalMode eval_mode_in, eval_mode_out; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); + if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement restriction only identity operators"); + 
// LCOV_EXCL_STOP + } + } + + std::ostringstream code; + // TODO: generalize to accept different device functions? + { + char *tensor_basis_kernel_path, *tensor_basis_code; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); + CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_code)); + code << tensor_basis_code; + CeedCallBackend(CeedFree(&tensor_basis_kernel_path)); + CeedCallBackend(CeedFree(&tensor_basis_code)); + } + { + char *sycl_gen_template_path, *sycl_gen_template_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-gen-templates.h", &sycl_gen_template_path)); + CeedDebug256(ceed, 2, "----- Loading Sycl-Gen Template Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, sycl_gen_template_path, &sycl_gen_template_source)); + code << sycl_gen_template_source; + CeedCallBackend(CeedFree(&sycl_gen_template_path)); + CeedCallBackend(CeedFree(&sycl_gen_template_source)); + } + + std::string_view q_function_source(qf_impl->q_function_source); + std::string_view q_function_name(qf_impl->q_function_name); + const std::string operator_name = "CeedKernelSyclGenOperator_" + std::string(q_function_name); + + // Find dim, P_1d, Q_1d + impl->max_P_1d = 0; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_COLLOCATED) { + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + + // Collect dim, P_1d, and Q_1d + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + bool isTensor; + CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); + if (isTensor) { + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + 
CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + if (P_1d > impl->max_P_1d) impl->max_P_1d = P_1d; + } else { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); + // LCOV_EXCL_STOP + } + } + } + // Check output bases for Q_1d, dim as well + // The only input basis might be CEED_BASIS_COLLOCATED + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + + if (basis != CEED_BASIS_COLLOCATED) { + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + + // Collect Q_1d + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + bool isTensor; + CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); + if (isTensor) { + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + } else { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); + // LCOV_EXCL_STOP + } + } + } + impl->dim = dim; + impl->Q_1d = Q_1d; + + // Only use 3D collocated gradient parallelization strategy when gradient is computed + // TODO: put in a function? + bool use_collograd_parallelization = false; + if (dim == 3) { + bool was_grad_found = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_GRAD) { + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + use_collograd_parallelization = !!basis_impl->d_collo_grad_1d && (was_grad_found ? 
use_collograd_parallelization : true);
+        was_grad_found = true;
+      }
+    }
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_GRAD) {
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+        CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
+        use_collograd_parallelization = !!basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
+        was_grad_found = true;
+      }
+    }
+  }
+
+  CeedInt block_sizes[3];
+  CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes));
+
+  // Define CEED_Q_VLA
+  code << "\n#undef CEED_Q_VLA\n";
+  if (dim != 3 || use_collograd_parallelization) {
+    code << "#define CEED_Q_VLA 1\n\n";
+  } else {
+    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+  }
+
+  // Determine subgroup size based on supported sizes : Default : 16 (if supported)
+  std::vector<size_t> allowed_sg_sizes = sycl_data->sycl_device.get_info<sycl::info::device::sub_group_sizes>();
+  CeedInt sub_group_size_op = allowed_sg_sizes[allowed_sg_sizes.size() - 1];
+  for (const auto &s : allowed_sg_sizes) {
+    if (s == 16) {
+      sub_group_size_op = s;
+      break;
+    }
+  }
+
+  code << q_function_source;
+
+  // Kernel function
+  code << "\n// -----------------------------------------------------------------------------\n";
+  code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z), intel_reqd_sub_group_size(" << sub_group_size_op << ")))\n";
+  code << "kernel void " << operator_name << "(";
+  code << "const CeedInt num_elem, ";
+  code << "global void* ctx, ";
+  code << "global const FieldsInt_Sycl* indices, ";
+  code << "global Fields_Sycl* fields, ";
+  code << "global const Fields_Sycl* B, ";
+  code << "global const Fields_Sycl* G, ";
+  code << "global const CeedScalar * restrict W";
+  code << ") {\n";
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT + code << " global const CeedScalar* d_u_" << i << " = fields->inputs[" << i << "];\n"; + } + } + + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " global CeedScalar* d_v_" << i << " = fields->outputs[" << i << "];\n"; + } + + // TODO: Convert these to defined constants to save on GRF + code << " const CeedInt DIM = " << dim << ";\n"; + code << " const CeedInt Q_1D = " << Q_1d << ";\n"; + + const CeedInt scratch_size = block_sizes[0] * block_sizes[1] * block_sizes[2]; + code << " local CeedScalar scratch[" << scratch_size << "];\n"; + code << " local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; + + code << "\n // -- Input field constants and basis data --\n"; + // Initialize constants, and matrices B and G + for (CeedInt i = 0; i < num_input_fields; i++) { + code << " // ---- Input field " << i << " ----\n"; + // Get elem_size, eval_mode, num_comp + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + + // Set field constants + if (eval_mode != CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_COLLOCATED) { + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " const CeedInt P_in_" << i << " = " << P_1d << ";\n"; + } else { + code << " const CeedInt P_in_" << i << " = " << Q_1d << ";\n"; + } + code << " const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n"; + } + + // Load basis data + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedBasisGetData(basis, 
&basis_impl)); + h_B.inputs[i] = basis_impl->d_interp_1d; + code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + h_B.inputs[i] = basis_impl->d_interp_1d; + code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n"; + if (use_collograd_parallelization) { + h_G.inputs[i] = basis_impl->d_collo_grad_1d; + code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n"; + code << " loadMatrix(Q_1D*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n"; + } else { + bool has_collo_grad = !!basis_impl->d_collo_grad_1d; + h_G.inputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; + code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; + code << " loadMatrix(" << (has_collo_grad ? 
"Q_1D" : ("P_in_" + std::to_string(i))) << "*Q_1D, G->inputs[" << i << "], s_G_in_" << i + << ");\n"; + } + break; + case CEED_EVAL_WEIGHT: + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + } + } + + code << "\n // -- Output field constants and basis data --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + // Get elem_size, eval_mode, num_comp + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + + // Set field constants + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_COLLOCATED) { + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " const CeedInt P_out_" << i << " = " << P_1d << ";\n"; + } else { + code << " const CeedInt P_out_" << i << " = " << Q_1d << ";\n"; + } + code << " const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n"; + + // Load basis data + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + break; // No action + case CEED_EVAL_INTERP: + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + h_B.outputs[i] = basis_impl->d_interp_1d; + code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + h_B.outputs[i] = basis_impl->d_interp_1d; + code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i 
<< ");\n"; + if (use_collograd_parallelization) { + h_G.outputs[i] = basis_impl->d_collo_grad_1d; + code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n"; + code << " loadMatrix(Q_1D*Q_1D, G->outputs[" << i << "], s_G_out_" << i << ");\n"; + } else { + bool has_collo_grad = !!basis_impl->d_collo_grad_1d; + h_G.outputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; + code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; + code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" << i + << ");\n"; + } + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + break; // Should not occur + } + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + code << "\n // -- Element loop --\n"; + code << " work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"; + code << " {\n"; + // Input basis apply if needed + // Generate the correct eval mode code for each input + code << " // -- Input field restrictions and basis actions --\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + code << " // ---- Input field " << i << " ----\n"; + // Get elem_size, eval_mode, num_comp + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + + // Restriction + if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { + code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << 
"*P_in_" << i << "];\n"; + + bool is_strided; + CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + if (!is_strided) { + CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); + code << " const CeedInt lsize_in_" << i << " = " << lsize << ";\n"; + CeedInt comp_stride; + CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + code << " // CompStride: " << comp_stride << "\n"; + CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); + h_indices.inputs[i] = restr_impl->d_ind; + code << " readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i + << "], d_u_" << i << ", r_u_" << i << ");\n"; + } else { + bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + CeedInt num_elem; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + } + code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; + code << " readDofsStrided" << dim << "d(num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] + << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n"; + } + } + + // Basis action + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + if (!use_collograd_parallelization) { + code << " private CeedScalar* r_t_" << i << " = r_u_" << i << ";\n"; + } + break; + case CEED_EVAL_INTERP: + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; + code << " Interp" << (dim > 1 ? 
"Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i + << ", r_t_" << i << ", elem_scratch);\n"; + break; + case CEED_EVAL_GRAD: + if (use_collograd_parallelization) { + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; + code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" + << i << ", r_t_" << i << ", elem_scratch);\n"; + } else { + CeedInt P_1d; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n"; + code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i + << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i + << ", r_t_" << i << ", elem_scratch);\n"; + } + break; + case CEED_EVAL_WEIGHT: + code << " CeedScalar r_t_" << i << "[Q_1D];\n"; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + impl->W = basis_impl->d_q_weight_1d; + code << " Weight" << (dim > 1 ? 
"Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n"; + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + } + } + + // Q function + code << "\n // -- Output field setup --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << "\n // ---- Output field " << i << " ----\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_GRAD) { + if (use_collograd_parallelization) { + // Accumulator for gradient slices + code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; + code << " for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n"; + code << " for (CeedInt j = 0; j < Q_1D; ++j) {\n"; + code << " r_tt_" << i << "[j + i*Q_1D] = 0.0;\n"; + code << " }\n"; + code << " }\n"; + } else { + code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM*Q_1D];\n"; + } + } + if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) { + code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; + } + } + // We treat quadrature points per slice in 3d to save registers + if (use_collograd_parallelization) { + code << "\n // Note: Using planes of 3D elements\n"; + code << " for (CeedInt q = 0; q < Q_1D; q++) {\n"; + code << " // -- Input fields --\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + code << " // ---- Input field " << i << " ----\n"; + // Get elem_size, eval_mode, num_comp + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + // Basis action + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; + + bool is_strided; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + if (!is_strided) { + 
CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); + code << " const CeedInt lsize_in_" << i << " = " << lsize << ";\n"; + CeedInt comp_stride; + CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + code << " // CompStride: " << comp_stride << "\n"; + CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); + h_indices.inputs[i] = restr_impl->d_ind; + code << " readSliceQuadsOffset" + << "3d(num_comp_in_" << i << ", " << comp_stride << ", Q_1D, lsize_in_" << i << ", num_elem, q, indices->inputs[" << i << "], d_u_" + << i << ", r_q_" << i << ");\n"; + } else { + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + CeedInt num_elem; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + } + code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; + code << " readSliceQuadsStrided" + << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i + << ", r_q_" << i << ");\n"; + } + break; + case CEED_EVAL_INTERP: + code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; + code << " for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n"; + code << " r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1D];\n"; + code << " }\n"; + break; + case CEED_EVAL_GRAD: + code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "*DIM];\n"; + code << " gradCollo3d(num_comp_in_" << i << ", Q_1D, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ", elem_scratch);\n"; + break; + case CEED_EVAL_WEIGHT: + code << " CeedScalar r_q_" << i << "[1];\n"; + code << " r_q_" << i << "[0] = r_t_" << i << 
"[q];\n"; + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + } + } + code << "\n // -- Output fields --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + // Basis action + switch (eval_mode) { + case CEED_EVAL_NONE: + code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; + break; // No action + case CEED_EVAL_INTERP: + code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; + break; + case CEED_EVAL_GRAD: + code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*DIM];\n"; + break; + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + } + } + } else { + code << "\n // Note: Using full elements\n"; + code << " // -- Input fields --\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + code << " // ---- Input field " << i << " ----\n"; + code << " private CeedScalar* r_q_" << i << " = r_t_" << i << ";\n"; + } + code << " // -- Output fields --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + code << " private CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n"; + } + } + //-------------------------------------------------- + code << "\n // -- QFunction Inputs and outputs --\n"; + code << " const CeedScalar * in[" << num_input_fields << "];\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + code << " // ---- Input field " << i << " ----\n"; + code << " in[" << i << "] = r_q_" << i << ";\n"; + } + code << " CeedScalar * out[" << num_output_fields << "];\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + code << " out[" << i << "] = r_qq_" << i << ";\n"; + } + + code 
<< "\n // -- Apply QFunction --\n"; + code << " " << q_function_name << "(ctx, "; + if (dim != 3 || use_collograd_parallelization) { + code << "1"; + } else { + code << "Q_1D"; + } + code << ", in, out);\n"; + //-------------------------------------------------- + + if (use_collograd_parallelization) { + code << " // -- Output fields --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + // Basis action + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; + code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n"; + code << " }\n"; + break; // No action + case CEED_EVAL_INTERP: + code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; + code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n"; + code << " }\n"; + break; + case CEED_EVAL_GRAD: + code << " gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i + << ", elem_scratch);\n"; + break; + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + } + } + code << " }\n"; + } + + // Output basis apply if needed + // Generate the correct eval mode code for each output + code << "\n // -- Output field basis action and restrictions --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + // Get elem_size, eval_mode, num_comp + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + 
CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + // Basis action + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + code << " private CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n"; + break; // No action + case CEED_EVAL_INTERP: + code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; + code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i + << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; + break; + case CEED_EVAL_GRAD: + code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; + if (use_collograd_parallelization) { + code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i + << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; + } else { + CeedInt P_1d; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i + << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? 
std::to_string(i) : "") << ", s_G_out_" << i + << ", r_v_" << i << ", elem_scratch);\n"; + } + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + break; // Should not occur + } + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + // Restriction + bool is_strided; + CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + if (!is_strided) { + CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); + code << " const CeedInt lsize_out_" << i << " = " << lsize << ";\n"; + CeedInt comp_stride; + CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + code << " // CompStride: " << comp_stride << "\n"; + CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); + h_indices.outputs[i] = restr_impl->d_ind; + code << " writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i + << "], r_v_" << i << ", d_v_" << i << ");\n"; + } else { + bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + CeedInt num_elem; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + } + code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; + code << " writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] + << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n"; + } + } + + code << " }\n"; + code << "}\n"; + code << "// 
-----------------------------------------------------------------------------\n\n"; + + // Copy the struct (containing device addresses) from the host to the device + sycl::event copy_B = sycl_data->sycl_queue.copy(&h_B, impl->B, 1); + sycl::event copy_G = sycl_data->sycl_queue.copy(&h_G, impl->G, 1); + sycl::event copy_indices = sycl_data->sycl_queue.copy(&h_indices, impl->indices, 1); + // These copies can happen while the JIT is being done + CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices})); + + // View kernel for debugging + CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); + CeedDebug(ceed, code.str().c_str()); + + std::map jit_constants; + jit_constants["T_1D"] = block_sizes[0]; + jit_constants["GROUP_SIZE_X"] = block_sizes[0]; + jit_constants["GROUP_SIZE_Y"] = block_sizes[1]; + jit_constants["GROUP_SIZE_Z"] = block_sizes[2]; + + // Compile kernel into a kernel bundle + CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module, jit_constants)); + + // Load kernel function + CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op)); + + CeedCallBackend(CeedOperatorSetSetupDone(op)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp new file mode 100644 index 0000000000..80a63a4b0a --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp @@ -0,0 +1,188 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include + +#include "../sycl/ceed-sycl-compile.hpp" +#include "ceed-sycl-gen-operator-build.hpp" +#include "ceed-sycl-gen.hpp" + +//------------------------------------------------------------------------------ +// Destroy operator +//------------------------------------------------------------------------------ +static int CeedOperatorDestroy_Sycl_gen(CeedOperator op) { + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedFree(&impl)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Apply and add to output +//------------------------------------------------------------------------------ +static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed_Sycl *ceed_Sycl; + CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedQFunction qf; + CeedQFunction_Sycl_gen *qf_impl; + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); + CeedInt num_elem, num_input_fields, num_output_fields; + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedOperatorField *op_input_fields, *op_output_fields; + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedEvalMode eval_mode; + CeedVector vec, output_vecs[CEED_FIELD_MAX] = {}; + + // Creation of the operator + CeedCallBackend(CeedOperatorBuildKernel_Sycl_gen(op)); + + // Input vectors + 
for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + impl->fields->inputs[i] = NULL; + } else { + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &impl->fields->inputs[i])); + } + } + + // Output vectors + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + impl->fields->outputs[i] = NULL; + } else { + // Get output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; + output_vecs[i] = vec; + // Check for multiple output modes + CeedInt index = -1; + for (CeedInt j = 0; j < i; j++) { + if (vec == output_vecs[j]) { + index = j; + break; + } + } + if (index == -1) { + CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &impl->fields->outputs[i])); + } else { + impl->fields->outputs[i] = impl->fields->outputs[index]; + } + } + } + + // Get context data + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_impl->d_c)); + + // Apply operator + const CeedInt dim = impl->dim; + const CeedInt Q_1d = impl->Q_1d; + const CeedInt P_1d = impl->max_P_1d; + CeedInt block_sizes[3], grid = 0; + CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes)); + if (dim == 1) { + grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + } else if (dim == 2) { + grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 
1 : 0); + // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + } else if (dim == 3) { + grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + } + + sycl::range<3> local_range(block_sizes[2], block_sizes[1], block_sizes[0]); + sycl::range<3> global_range(grid * block_sizes[2], block_sizes[1], block_sizes[0]); + sycl::nd_range<3> kernel_range(global_range, local_range); + + //----------- + // Order queue + sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier(); + + CeedCallSycl(ceed, ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(e); + cgh.set_args(num_elem, qf_impl->d_c, impl->indices, impl->fields, impl->B, impl->G, impl->W); + cgh.parallel_for(kernel_range, *(impl->op)); + })); + CeedCallSycl(ceed, ceed_Sycl->sycl_queue.wait_and_throw()); + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; + CeedCallBackend(CeedVectorRestoreArrayRead(vec, &impl->fields->inputs[i])); + } + } + + // Restore output arrays + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; + // Check for multiple output modes + CeedInt index = -1; + for (CeedInt j = 0; j < i; j++) { + if (vec == output_vecs[j]) { + index = j; + break; + } + } + if 
(index == -1) { + CeedCallBackend(CeedVectorRestoreArray(vec, &impl->fields->outputs[i])); + } + } + } + + // Restore context data + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_impl->d_c)); + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Create operator +//------------------------------------------------------------------------------ +int CeedOperatorCreate_Sycl_gen(CeedOperator op) { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed_Sycl *sycl_data; + CeedCallBackend(CeedGetData(ceed, &sycl_data)); + + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); + + impl->indices = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); + impl->fields = sycl::malloc_host(1, sycl_data->sycl_context); + impl->B = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); + impl->G = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); + impl->W = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); + + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl_gen)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl_gen)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ diff --git a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp new file mode 100644 index 0000000000..951474476a --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp @@ -0,0 +1,71 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include +#include +#include + +#include "ceed-sycl-gen.hpp" + +//------------------------------------------------------------------------------ +// Apply QFunction +//------------------------------------------------------------------------------ +static int CeedQFunctionApply_Sycl_gen(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { + Ceed ceed; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement QFunctionApply"); +} + +//------------------------------------------------------------------------------ +// Destroy QFunction +//------------------------------------------------------------------------------ +static int CeedQFunctionDestroy_Sycl_gen(CeedQFunction qf) { + Ceed ceed; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + CeedQFunction_Sycl_gen *impl; + CeedCallBackend(CeedQFunctionGetData(qf, &impl)); + Ceed_Sycl *data; + CeedCallBackend(CeedGetData(ceed, &data)); + + // Wait for all work to finish before freeing memory + CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); + CeedCallSycl(ceed, sycl::free(impl->d_c, data->sycl_context)); + + CeedCallBackend(CeedFree(&impl->q_function_source)); + CeedCallBackend(CeedFree(&impl)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Create QFunction +//------------------------------------------------------------------------------ +int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf) { + Ceed ceed; + CeedQFunctionGetCeed(qf, &ceed); + CeedQFunction_Sycl_gen *impl; + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedQFunctionSetData(qf, impl)); + + // Read QFunction source + CeedCallBackend(CeedQFunctionGetKernelName(qf, &impl->q_function_name)); + CeedDebug256(ceed, 2, "----- Loading QFunction User Source -----\n"); + 
CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &impl->q_function_source)); + CeedDebug256(ceed, 2, "----- Loading QFunction User Source Complete! -----\n"); + if (!impl->q_function_source) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "/gpu/sycl/gen backend requires QFunction source code file"); + // LCOV_EXCL_STOP + } + + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl_gen)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl_gen)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ diff --git a/backends/sycl-gen/ceed-sycl-gen.hpp b/backends/sycl-gen/ceed-sycl-gen.hpp new file mode 100644 index 0000000000..e4e716393c --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen.hpp @@ -0,0 +1,41 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#ifndef _ceed_sycl_gen_h +#define _ceed_sycl_gen_h + +#include +#include +#include + +#include "../sycl/ceed-sycl-common.hpp" +#include "../sycl/ceed-sycl-compile.hpp" + +typedef struct { + CeedInt dim; + CeedInt Q_1d; + CeedInt max_P_1d; + SyclModule_t *sycl_module; + sycl::kernel *op; + FieldsInt_Sycl *indices; + Fields_Sycl *fields; + Fields_Sycl *B; + Fields_Sycl *G; + CeedScalar *W; +} CeedOperator_Sycl_gen; + +typedef struct { + char *q_function_name; + char *q_function_source; + void *d_c; +} CeedQFunction_Sycl_gen; + +CEED_INTERN int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf); + +CEED_INTERN int CeedOperatorCreate_Sycl_gen(CeedOperator op); + +#endif // _ceed_sycl_gen_h diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp new file mode 100644 index 0000000000..a5084ff6b7 --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp @@ -0,0 +1,59 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "ceed-sycl-gen.hpp" + +#include +#include + +#include +#include +#include + +//------------------------------------------------------------------------------ +// Backend init +//------------------------------------------------------------------------------ +static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) { + char *resource_root; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":device_id=", &resource_root)); + if (strcmp(resource_root, "/gpu/sycl") && strcmp(resource_root, "/gpu/sycl/gen")) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedFree(&resource_root)); + + Ceed_Sycl *data; + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedSetData(ceed, data)); + CeedCallBackend(CeedInit_Sycl(ceed, resource)); + + Ceed ceed_shared; + CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared)); + + Ceed_Sycl *shared_data; + CeedCallBackend(CeedGetData(ceed_shared, &shared_data)); + // Need to use the same queue everywhere for correct synchronization + shared_data->sycl_queue = data->sycl_queue; + + CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); + + const char fallbackresource[] = "/gpu/sycl/ref"; + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); + + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Sycl_gen)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Sycl_gen)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Sycl)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Register backend +//------------------------------------------------------------------------------ 
+CEED_INTERN int CeedRegister_Sycl_Gen(void) { return CeedRegister("/gpu/sycl/gen", CeedInit_Sycl_gen, 20); } + +//------------------------------------------------------------------------------ diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp index 6967c1442f..601ebdda95 100644 --- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp @@ -178,11 +178,10 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t & const CeedInt v_comp_stride = num_elem * v_stride; const CeedInt u_dim_stride = transpose ? num_elem * num_qpts * num_comp : 0; const CeedInt v_dim_stride = transpose ? 0 : num_elem * num_qpts * num_comp; - - sycl::group work_group = work_item.get_group(); - const CeedInt i = work_item.get_local_linear_id(); - const CeedInt group_size = work_group.get_local_linear_range(); - const CeedInt elem = work_group.get_group_linear_id(); + sycl::group work_group = work_item.get_group(); + const CeedInt i = work_item.get_local_linear_id(); + const CeedInt group_size = work_group.get_local_linear_range(); + const CeedInt elem = work_group.get_group_linear_id(); CeedScalar *s_interp_1d = s_mem.get_pointer(); CeedScalar *s_grad_1d = s_interp_1d + P * Q; diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp index 0f1f1f1de6..7aa3222a96 100644 --- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp @@ -787,7 +787,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const std::vector copy_events; if (evalNone) { CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity)); - for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; + for (CeedSize i = 0; i < (nnodes < nqpts ? 
nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device(iLen, sycl_data->sycl_device, sycl_data->sycl_context)); sycl::event identity_copy = sycl_data->sycl_queue.copy(identity, diag->d_identity, iLen, {e}); copy_events.push_back(identity_copy); @@ -838,11 +838,11 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const //------------------------------------------------------------------------------ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool pointBlock, const CeedInt nelem, const CeedOperatorDiag_Sycl *diag, const CeedScalar *assembledqfarray, CeedScalar *elemdiagarray) { - const CeedInt nnodes = diag->nnodes; - const CeedInt nqpts = diag->nqpts; - const CeedInt ncomp = diag->ncomp; - const CeedInt numemodein = diag->numemodein; - const CeedInt numemodeout = diag->numemodeout; + const CeedSize nnodes = diag->nnodes; + const CeedSize nqpts = diag->nqpts; + const CeedSize ncomp = diag->ncomp; + const CeedSize numemodein = diag->numemodein; + const CeedSize numemodeout = diag->numemodeout; const CeedScalar *identity = diag->d_identity; const CeedScalar *interpin = diag->d_interpin; @@ -864,23 +864,23 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool p // Each element CeedInt dout = -1; // Each basis eval mode pair - for (CeedInt eout = 0; eout < numemodeout; eout++) { + for (CeedSize eout = 0; eout < numemodeout; eout++) { const CeedScalar *bt = NULL; if (emodeout[eout] == CEED_EVAL_GRAD) ++dout; CeedOperatorGetBasisPointer_Sycl(&bt, emodeout[eout], identity, interpout, &gradout[dout * nqpts * nnodes]); CeedInt din = -1; - for (CeedInt ein = 0; ein < numemodein; ein++) { + for (CeedSize ein = 0; ein < numemodein; ein++) { const CeedScalar *b = NULL; if (emodein[ein] == CEED_EVAL_GRAD) ++din; CeedOperatorGetBasisPointer_Sycl(&b, emodein[ein], identity, interpin, &gradin[din * nqpts * nnodes]); // Each component - for 
(CeedInt compOut = 0; compOut < ncomp; compOut++) { + for (CeedSize compOut = 0; compOut < ncomp; compOut++) { // Each qpoint/node pair if (pointBlock) { // Point Block Diagonal for (CeedInt compIn = 0; compIn < ncomp; compIn++) { CeedScalar evalue = 0.0; - for (CeedInt q = 0; q < nqpts; q++) { + for (CeedSize q = 0; q < nqpts; q++) { const CeedScalar qfvalue = assembledqfarray[((((ein * ncomp + compIn) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q]; evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid]; @@ -890,7 +890,7 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool p } else { // Diagonal Only CeedScalar evalue = 0.0; - for (CeedInt q = 0; q < nqpts; q++) { + for (CeedSize q = 0; q < nqpts; q++) { const CeedScalar qfvalue = assembledqfarray[((((ein * ncomp + compOut) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q]; evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid]; @@ -1187,23 +1187,23 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp // TODO: expand to more general cases CeedOperatorAssemble_Sycl *asmb = impl->asmb; const CeedInt nelem = asmb->nelem; - const CeedInt nnodes = asmb->nnodes; - const CeedInt ncomp = asmb->ncomp; - const CeedInt nqpts = asmb->nqpts; - const CeedInt numemodein = asmb->numemodein; - const CeedInt numemodeout = asmb->numemodeout; + const CeedSize nnodes = asmb->nnodes; + const CeedSize ncomp = asmb->ncomp; + const CeedSize nqpts = asmb->nqpts; + const CeedSize numemodein = asmb->numemodein; + const CeedSize numemodeout = asmb->numemodeout; // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element, // comp_in, comp_out, node_row, node_col - const CeedInt comp_out_stride = nnodes * nnodes; - const CeedInt comp_in_stride = comp_out_stride * ncomp; - const CeedInt e_stride = comp_in_stride * ncomp; + const CeedSize 
comp_out_stride = nnodes * nnodes; + const CeedSize comp_in_stride = comp_out_stride * ncomp; + const CeedSize e_stride = comp_in_stride * ncomp; // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt - const CeedInt qe_stride = nqpts; - const CeedInt qcomp_out_stride = nelem * qe_stride; - const CeedInt qemode_out_stride = qcomp_out_stride * ncomp; - const CeedInt qcomp_in_stride = qemode_out_stride * numemodeout; - const CeedInt qemode_in_stride = qcomp_in_stride * ncomp; + const CeedSize qe_stride = nqpts; + const CeedSize qcomp_out_stride = nelem * qe_stride; + const CeedSize qemode_out_stride = qcomp_out_stride * ncomp; + const CeedSize qcomp_in_stride = qemode_out_stride * numemodeout; + const CeedSize qemode_in_stride = qcomp_in_stride * ncomp; CeedScalar *B_in, *B_out; B_in = asmb->d_B_in; @@ -1220,22 +1220,22 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp const int l = idx.get(1); // The output column index of each B^TDB operation const int i = idx.get(2); // The output row index of each B^TDB operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il - for (CeedInt comp_in = 0; comp_in < ncomp; comp_in++) { - for (CeedInt comp_out = 0; comp_out < ncomp; comp_out++) { + for (CeedSize comp_in = 0; comp_in < ncomp; comp_in++) { + for (CeedSize comp_out = 0; comp_out < ncomp; comp_out++) { CeedScalar result = 0.0; - CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; - for (CeedInt emode_in = 0; emode_in < numemodein; emode_in++) { - CeedInt b_in_index = emode_in * nqpts * nnodes; - for (CeedInt emode_out = 0; emode_out < numemodeout; emode_out++) { - CeedInt b_out_index = emode_out * nqpts * nnodes; - CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + CeedSize qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; + for (CeedSize emode_in = 0; emode_in < 
numemodein; emode_in++) { + CeedSize b_in_index = emode_in * nqpts * nnodes; + for (CeedSize emode_out = 0; emode_out < numemodeout; emode_out++) { + CeedSize b_out_index = emode_out * nqpts * nnodes; + CeedSize qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; // Perform the B^T D B operation for this 'chunk' of D (the qf_array) - for (CeedInt j = 0; j < nqpts; j++) { + for (CeedSize j = 0; j < nqpts; j++) { result += B_out[b_out_index + j * nnodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * nnodes + l]; } } // end of emode_out } // end of emode_in - CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l; + CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l; values_array[val_index] = result; } // end of out component } // end of in component diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp index 02b91fdd58..5c0d8b5f4b 100644 --- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp @@ -306,15 +306,15 @@ static int CeedVectorSetArray_Sycl(const CeedVector vec, const CeedMemType mem_t //------------------------------------------------------------------------------ // Set host array to value //------------------------------------------------------------------------------ -static int CeedHostSetValue_Sycl(CeedScalar *h_array, CeedInt length, CeedScalar val) { - for (int i = 0; i < length; i++) h_array[i] = val; +static int CeedHostSetValue_Sycl(CeedScalar *h_array, CeedSize length, CeedScalar val) { + for (CeedSize i = 0; i < length; i++) h_array[i] = val; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Set device array to value //------------------------------------------------------------------------------ -static int CeedDeviceSetValue_Sycl(sycl::queue 
&sycl_queue, CeedScalar *d_array, CeedInt length, CeedScalar val) { +static int CeedDeviceSetValue_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedSize length, CeedScalar val) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.fill(d_array, val, length, {e}); @@ -527,8 +527,8 @@ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *no //------------------------------------------------------------------------------ // Take reciprocal of a vector on host //------------------------------------------------------------------------------ -static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedInt length) { - for (int i = 0; i < length; i++) { +static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedSize length) { + for (CeedSize i = 0; i < length; i++) { if (std::fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i]; } return CEED_ERROR_SUCCESS; @@ -537,7 +537,7 @@ static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedInt length) { //------------------------------------------------------------------------------ // Take reciprocal of a vector on device //------------------------------------------------------------------------------ -static int CeedDeviceReciprocal_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedInt length) { +static int CeedDeviceReciprocal_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedSize length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { @@ -569,15 +569,15 @@ static int CeedVectorReciprocal_Sycl(CeedVector vec) { //------------------------------------------------------------------------------ // Compute x = alpha x on the host //------------------------------------------------------------------------------ -static int CeedHostScale_Sycl(CeedScalar *x_array, CeedScalar alpha, CeedInt length) { - for (int i = 0; i < length; i++) x_array[i] *= alpha; +static int 
CeedHostScale_Sycl(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { + for (CeedSize i = 0; i < length; i++) x_array[i] *= alpha; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute x = alpha x on device //------------------------------------------------------------------------------ -static int CeedDeviceScale_Sycl(sycl::queue &sycl_queue, CeedScalar *x_array, CeedScalar alpha, CeedInt length) { +static int CeedDeviceScale_Sycl(sycl::queue &sycl_queue, CeedScalar *x_array, CeedScalar alpha, CeedSize length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { x_array[i] *= alpha; }); @@ -607,15 +607,15 @@ static int CeedVectorScale_Sycl(CeedVector x, CeedScalar alpha) { //------------------------------------------------------------------------------ // Compute y = alpha x + y on the host //------------------------------------------------------------------------------ -static int CeedHostAXPY_Sycl(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) { - for (int i = 0; i < length; i++) y_array[i] += alpha * x_array[i]; +static int CeedHostAXPY_Sycl(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { + for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i]; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute y = alpha x + y on device //------------------------------------------------------------------------------ -static int CeedDeviceAXPY_Sycl(sycl::queue &sycl_queue, CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) { +static int CeedDeviceAXPY_Sycl(sycl::queue &sycl_queue, CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, 
{e}, [=](sycl::id<1> i) { y_array[i] += alpha * x_array[i]; }); @@ -652,15 +652,15 @@ static int CeedVectorAXPY_Sycl(CeedVector y, CeedScalar alpha, CeedVector x) { //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on the host //------------------------------------------------------------------------------ -static int CeedHostPointwiseMult_Sycl(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) { - for (int i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i]; +static int CeedHostPointwiseMult_Sycl(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { + for (CeedSize i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i]; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on device (impl in .cu file) //------------------------------------------------------------------------------ -static int CeedDevicePointwiseMult_Sycl(sycl::queue &sycl_queue, CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) { +static int CeedDevicePointwiseMult_Sycl(sycl::queue &sycl_queue, CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { w_array[i] = x_array[i] * y_array[i]; }); diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp index d7d0a1cfab..d6e7067eee 100644 --- a/backends/sycl/ceed-sycl-common.sycl.cpp +++ b/backends/sycl/ceed-sycl-common.sycl.cpp @@ -92,6 +92,7 @@ int CeedSetStream_Sycl(Ceed ceed, void *handle) { Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); + CeedCheck(handle, ceed, CEED_ERROR_BACKEND, "Stream handle is null"); sycl::queue *q = static_cast(handle); // Ensure we are 
using the expected device @@ -111,6 +112,17 @@ int CeedSetStream_Sycl(Ceed ceed, void *handle) { delegate_data->sycl_queue = *q; } + // Set queue and context for Ceed Fallback object + Ceed ceed_fallback = NULL; + CeedGetOperatorFallbackCeed(ceed, &ceed_fallback); + if (ceed_fallback) { + Ceed_Sycl *fallback_data; + CeedCallBackend(CeedGetData(ceed_fallback, &fallback_data)); + fallback_data->sycl_device = q->get_device(); + fallback_data->sycl_context = q->get_context(); + fallback_data->sycl_queue = *q; + } + return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl/ceed-sycl-compile.hpp b/backends/sycl/ceed-sycl-compile.hpp index 57ae351d8d..f4a95b8168 100644 --- a/backends/sycl/ceed-sycl-compile.hpp +++ b/backends/sycl/ceed-sycl-compile.hpp @@ -20,4 +20,7 @@ CEED_INTERN int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source const std::map &constants = {}); CEED_INTERN int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, sycl::kernel **sycl_kernel); +CEED_INTERN int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **args); + #endif // _ceed_sycl_compile_h diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp index 03d725b231..845c46b5c5 100644 --- a/backends/sycl/ceed-sycl-compile.sycl.cpp +++ b/backends/sycl/ceed-sycl-compile.sycl.cpp @@ -22,7 +22,7 @@ using ByteVector_t = std::vector; //------------------------------------------------------------------------------ -// +// Add defined constants at the beginning of kernel source //------------------------------------------------------------------------------ static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_source, std::string &jit_source, const std::map &constants = {}) { @@ -77,24 +77,34 @@ static inline int CeedJitCompileSource_Sycl(Ceed ceed, const 
sycl::device &sycl_ // ------------------------------------------------------------------------------ // Load (compile) SPIR-V source and wrap in sycl kernel_bundle -// TODO: determine appropriate flags -// TODO: Error handle lz calls // ------------------------------------------------------------------------------ -static int CeedJitLoadModule_Sycl(const sycl::context &sycl_context, const sycl::device &sycl_device, const ByteVector_t &il_binary, - SyclModule_t **sycl_module) { +static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, const sycl::device &sycl_device, const ByteVector_t &il_binary, + SyclModule_t **sycl_module) { auto lz_context = sycl::get_native(sycl_context); auto lz_device = sycl::get_native(sycl_device); ze_module_desc_t lz_mod_desc = {ZE_STRUCTURE_TYPE_MODULE_DESC, - nullptr, + nullptr, // extension specific structs ZE_MODULE_FORMAT_IL_SPIRV, il_binary.size(), il_binary.data(), " -ze-opt-large-register-file", // flags - nullptr}; // build log + nullptr}; // specialization constants + + ze_module_handle_t lz_module; + ze_module_build_log_handle_t lz_log; + ze_result_t lz_err = zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, &lz_log); + + if (ZE_RESULT_SUCCESS != lz_err) { + size_t log_size = 0; + zeModuleBuildLogGetString(lz_log, &log_size, nullptr); - ze_module_handle_t lz_module; - zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, nullptr); + char *log_message; + CeedCall(CeedCalloc(log_size, &log_message)); + zeModuleBuildLogGetString(lz_log, &log_size, log_message); + + return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to compile Level Zero module:\n%s", log_message); + } // sycl make_ only throws errors for backend mismatch--assume we have vetted this already *sycl_module = new SyclModule_t(sycl::make_kernel_bundle( @@ -119,7 +129,7 @@ int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule ByteVector_t il_binary; CeedCallBackend(CeedJitCompileSource_Sycl(ceed, 
data->sycl_device, jit_source, il_binary, flags)); - CeedCallBackend(CeedJitLoadModule_Sycl(data->sycl_context, data->sycl_device, il_binary, sycl_module)); + CeedCallBackend(CeedLoadModule_Sycl(ceed, data->sycl_context, data->sycl_device, il_binary, sycl_module)); return CEED_ERROR_SUCCESS; } @@ -139,10 +149,38 @@ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::st ze_kernel_desc_t lz_kernel_desc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernel_name.c_str()}; ze_kernel_handle_t lz_kernel; - zeKernelCreate(lz_module, &lz_kernel_desc, &lz_kernel); + ze_result_t lz_err = zeKernelCreate(lz_module, &lz_kernel_desc, &lz_kernel); + + if (ZE_RESULT_SUCCESS != lz_err) { + return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to retrieve kernel from Level Zero module"); + } *sycl_kernel = new sycl::kernel(sycl::make_kernel( {*sycl_module, lz_kernel, sycl::ext::oneapi::level_zero::ownership::transfer}, data->sycl_context)); return CEED_ERROR_SUCCESS; } + +//------------------------------------------------------------------------------ +// Run SYCL kernel for spatial dimension with shared memory +//------------------------------------------------------------------------------ +int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **kernel_args) { + sycl::range<3> local_range(block_size_z, block_size_y, block_size_x); + sycl::range<3> global_range(grid_size * block_size_z, block_size_y, block_size_x); + sycl::nd_range<3> kernel_range(global_range, local_range); + + //----------- + // Order queue + Ceed_Sycl *ceed_Sycl; + CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); + sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier(); + + ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(e); + cgh.set_args(*kernel_args); + cgh.parallel_for(kernel_range, *kernel); + }); + + return 
CEED_ERROR_SUCCESS; +} diff --git a/include/ceed/jit-source/sycl/sycl-gen-templates.h b/include/ceed/jit-source/sycl/sycl-gen-templates.h new file mode 100644 index 0000000000..502ef562c0 --- /dev/null +++ b/include/ceed/jit-source/sycl/sycl-gen-templates.h @@ -0,0 +1,360 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for SYCL backend macro and type definitions for JiT source +#ifndef _ceed_sycl_gen_templates_h +#define _ceed_sycl_gen_templates_h + +#include + +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable +// TODO: Handle FP32 case +typedef atomic_double CeedAtomicScalar; + +//------------------------------------------------------------------------------ +// Load matrices for basis actions +//------------------------------------------------------------------------------ +inline void loadMatrix(const CeedInt N, const CeedScalar* restrict d_B, CeedScalar* restrict B) { + const CeedInt item_id = get_local_linear_id(); + const CeedInt group_size = get_local_size(0) * get_local_size(1) * get_local_size(2); + for (CeedInt i = item_id; i < N; i += group_size) B[i] = d_B[i]; +} + +//------------------------------------------------------------------------------ +// 1D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, offsets provided +//------------------------------------------------------------------------------ +inline void readDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const 
global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && elem < num_elem) { + const CeedInt node = item_id_x; + const CeedInt ind = indices[node + elem * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) { + r_u[comp] = d_u[ind + strides_comp * comp]; + } + } +} + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, strided +//------------------------------------------------------------------------------ +inline void readDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, global const CeedScalar* restrict d_u, + private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && elem < num_elem) { + const CeedInt node = item_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; comp++) { + r_u[comp] = d_u[ind + comp * strides_comp]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, offsets provided +//------------------------------------------------------------------------------ +inline void writeDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && elem < num_elem) { + const CeedInt node = item_id_x; + const CeedInt ind = indices[node + elem * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) + atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], 
memory_order_relaxed, memory_scope_device); + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, strided +//------------------------------------------------------------------------------ +inline void writeDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, private const CeedScalar* restrict r_v, + global CeedScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && elem < num_elem) { + const CeedInt node = item_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; comp++) { + d_v[ind + comp * strides_comp] = r_v[comp]; + } + } +} + +//------------------------------------------------------------------------------ +// 2D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, offsets provided +//------------------------------------------------------------------------------ +inline void readDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + const CeedInt node = item_id_x + item_id_y * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp]; + } +} + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, 
strided +//------------------------------------------------------------------------------ +inline void readDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar* restrict d_u, + private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + const CeedInt node = item_id_x + item_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp]; + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, offsets provided +//------------------------------------------------------------------------------ +inline void writeDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + const CeedInt node = item_id_x + item_id_y * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) + atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device); + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, strided +//------------------------------------------------------------------------------ +inline void writeDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt 
strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar* restrict r_v, + global CeedScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + const CeedInt node = item_id_x + item_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[comp]; + } +} + +//------------------------------------------------------------------------------ +// 3D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, offsets provided +//------------------------------------------------------------------------------ +inline void readDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + for (CeedInt z = 0; z < P_1D; ++z) { + const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); + const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + strides_comp * comp]; + } + } +} + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, strided +//------------------------------------------------------------------------------ +inline void readDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt 
strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar* restrict d_u, + private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + for (CeedInt z = 0; z < P_1D; ++z) { + const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> Q-vector, offests provided +//------------------------------------------------------------------------------ +inline void readSliceQuadsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt Q_1D, const CeedInt num_elem, const CeedInt q, + const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) { + const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q); + const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp]; + } +} + +//------------------------------------------------------------------------------ +// E-vector -> Q-vector, strided +//------------------------------------------------------------------------------ +inline void readSliceQuadsStrided3d(const CeedInt num_comp, const CeedInt Q_1D, CeedInt strides_node, CeedInt strides_comp, CeedInt strides_elem, + const CeedInt num_elem, const CeedInt q, const global CeedScalar* restrict d_u, + 
private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) { + const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q); + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp]; + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, offsets provided +//------------------------------------------------------------------------------ +inline void writeDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + for (CeedInt z = 0; z < P_1D; ++z) { + const CeedInt node = item_id_x + item_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) + atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[z + comp * P_1D], memory_order_relaxed, memory_scope_device); + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, strided +//------------------------------------------------------------------------------ +inline void writeDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar* restrict r_v, + global CeedScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + 
const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + for (CeedInt z = 0; z < P_1D; ++z) { + const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; + } + } +} + +//------------------------------------------------------------------------------ +// 3D collocated derivatives computation +//------------------------------------------------------------------------------ +inline void gradCollo3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar* restrict r_U, + const local CeedScalar* s_G, private CeedScalar* restrict r_V, local CeedScalar* restrict scratch) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + + for (CeedInt comp = 0; comp < num_comp; ++comp) { + if (item_id_x < Q_1D && item_id_y < Q_1D) { + scratch[item_id_x + item_id_y * T_1D] = r_U[q + comp * Q_1D]; + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (item_id_x < Q_1D && item_id_y < Q_1D) { + // X derivative + r_V[comp + 0 * num_comp] = 0.0; + for (CeedInt i = 0; i < Q_1D; ++i) + r_V[comp + 0 * num_comp] += s_G[i + item_id_x * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative) + + // Y derivative + r_V[comp + 1 * num_comp] = 0.0; + for (CeedInt i = 0; i < Q_1D; ++i) + r_V[comp + 1 * num_comp] += s_G[i + item_id_y * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative) + + // Z derivative + r_V[comp + 2 * num_comp] = 0.0; + for (CeedInt i = 0; i < Q_1D; ++i) r_V[comp + 2 * num_comp] += s_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; // Contract z direction (Z derivative) + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + } +} + +//------------------------------------------------------------------------------ 
+// 3D collocated derivatives transpose +//------------------------------------------------------------------------------ +inline void gradColloTranspose3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar* restrict r_U, + const local CeedScalar* restrict s_G, private CeedScalar* restrict r_V, local CeedScalar* restrict scratch) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + + for (CeedInt comp = 0; comp < num_comp; ++comp) { + // X derivative + if (item_id_x < Q_1D && item_id_y < Q_1D) { + scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 0 * num_comp]; + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (item_id_x < Q_1D && item_id_y < Q_1D) { + for (CeedInt i = 0; i < Q_1D; ++i) + r_V[q + comp * Q_1D] += s_G[item_id_x + i * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative) + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + // Y derivative + if (item_id_x < Q_1D && item_id_y < Q_1D) { + scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 1 * num_comp]; + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (item_id_x < Q_1D && item_id_y < Q_1D) { + for (CeedInt i = 0; i < Q_1D; ++i) + r_V[q + comp * Q_1D] += s_G[item_id_y + i * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative) + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + // Z derivative + if (item_id_x < Q_1D && item_id_y < Q_1D) { + for (CeedInt i = 0; i < Q_1D; ++i) + r_V[i + comp * Q_1D] += s_G[i + q * Q_1D] * r_U[comp + 2 * num_comp]; // PARTIAL contract z direction (Z derivative) + } + } +} + +#endif diff --git a/include/ceed/jit-source/sycl/sycl-types.h b/include/ceed/jit-source/sycl/sycl-types.h index e81e4cd7c2..c47b333a7a 100644 --- a/include/ceed/jit-source/sycl/sycl-types.h +++ b/include/ceed/jit-source/sycl/sycl-types.h @@ -14,23 +14,26 @@ #define CEED_SYCL_NUMBER_FIELDS 16 +#ifdef __OPENCL_C_VERSION__ typedef struct { - const CeedScalar* 
inputs[CEED_SYCL_NUMBER_FIELDS]; - CeedScalar* outputs[CEED_SYCL_NUMBER_FIELDS]; + global const CeedScalar* inputs[CEED_SYCL_NUMBER_FIELDS]; + global CeedScalar* outputs[CEED_SYCL_NUMBER_FIELDS]; } Fields_Sycl; typedef struct { - CeedInt* inputs[CEED_SYCL_NUMBER_FIELDS]; - CeedInt* outputs[CEED_SYCL_NUMBER_FIELDS]; + global const CeedInt* inputs[CEED_SYCL_NUMBER_FIELDS]; + global CeedInt* outputs[CEED_SYCL_NUMBER_FIELDS]; } FieldsInt_Sycl; +#else +typedef struct { + const CeedScalar* inputs[CEED_SYCL_NUMBER_FIELDS]; + CeedScalar* outputs[CEED_SYCL_NUMBER_FIELDS]; +} Fields_Sycl; -// TODO: Determine if slice should have `__local` qualifier typedef struct { - CeedInt t_id_x; - CeedInt t_id_y; - CeedInt t_id_z; - CeedInt t_id; - CeedScalar* slice; -} SharedData_Sycl; + const CeedInt* inputs[CEED_SYCL_NUMBER_FIELDS]; + CeedInt* outputs[CEED_SYCL_NUMBER_FIELDS]; +} FieldsInt_Sycl; +#endif #endif