diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp index 1d3cf330ad..e7280b721d 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp @@ -45,30 +45,8 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { // QFunction kernel generation CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); - std::vector input_sizes(num_input_fields); - CeedQFunctionField *input_i = input_fields; - - for (auto &size_i : input_sizes) { - CeedCallBackend(CeedQFunctionFieldGetSize(*input_i, &size_i)); - ++input_i; - } - - std::vector output_sizes(num_output_fields); - CeedQFunctionField *output_i = output_fields; - - for (auto &size_i : output_sizes) { - CeedCallBackend(CeedQFunctionFieldGetSize(*output_i, &size_i)); - ++output_i; - } - - CeedCallBackend(CeedQFunctionGetKernelName(qf, &qfunction_name)); - - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n"); - CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &qfunction_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n"); - - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-ref-qfunction.h", &read_write_kernel_path)); - + // Build strings for final kernel function + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-ref-qfunction.h", &read_write_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n"); { char *source; @@ -77,12 +55,27 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { read_write_kernel_source = source; } CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n"); - - std::string_view qf_name_view(qfunction_name); - std::string_view qf_source_view(qfunction_source); + std::string_view qf_name_view(impl->qfunction_name); + std::string_view qf_source_view(impl->qfunction_source); std::string_view rw_source_view(read_write_kernel_source); const std::string kernel_name = "CeedKernelSyclRefQFunction_" + std::string(qf_name_view); + // std::vector input_sizes(num_input_fields); + // CeedQFunctionField *input_i = input_fields; + + // for (auto &size_i : input_sizes) { + // CeedCallBackend(CeedQFunctionFieldGetSize(*input_i, &size_i)); + // ++input_i; + // } + + // std::vector output_sizes(num_output_fields); + // CeedQFunctionField *output_i = output_fields; + + // for (auto &size_i : output_sizes) { + // CeedCallBackend(CeedQFunctionFieldGetSize(*output_i, &size_i)); + // ++output_i; + // } + // Defintions std::ostringstream code; code << rw_source_view; @@ -92,75 +85,88 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { // Kernel function // Here we are fixing a lower sub-group size value to avoid register spills // This needs to be revisited if all qfunctions require this. - code << "__attribute__((intel_reqd_sub_group_size(" << SUB_GROUP_SIZE_QF << "))) __kernel void " << kernel_name - << "(__global void *ctx, CeedInt Q,\n"; + // code << "__attribute__((intel_reqd_sub_group_size(" << SUB_GROUP_SIZE_QF << "))) extern \"C\" void " << kernel_name + code << "#include \n\n"; + code << "extern \"C\" void " << kernel_name + << "(sycl::queue &sycl_queue, sycl::nd_range<1> kernel_range, void *ctx, CeedInt Q, Fields_Sycl fields) {\n"; // OpenCL doesn't allow for structs with pointers. // We will need to pass all of the arguments individually. // Input parameters + code << " " + << "const CeedScalar *fields_inputs[" << num_input_fields << "];\n"; for (CeedInt i = 0; i < num_input_fields; ++i) { code << " " - << "__global const CeedScalar *in_" << i << ",\n"; + << "fields_inputs[" << i << "] = fields.inputs[" << i << "];\n"; } // Output parameters code << " " - << "__global CeedScalar *out_0"; - for (CeedInt i = 1; i < num_output_fields; ++i) { - code << "\n, " - << "__global CeedScalar *out_" << i; + << "const CeedScalar *fields_outputs[" << num_output_fields << "];\n"; + for (CeedInt i = 0; i < num_output_fields; ++i) { + code << " " + << "fields_outputs[" << i << "] = fields.outputs[" << i << "];\n"; } + code << "\n"; + + // Insert SYCL barrier for out-of-order queues + code << " std::vector e;\n"; + code << " if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};\n\n"; + // Begin kernel function body - code << ") {\n\n"; + code << " " + << "sycl_queue.parallel_for(kernel_range, e, " + << "[=](sycl::id<1> id) {\n"; // Inputs - code << " // Input fields\n"; + code << " // Input fields\n"; for (CeedInt i = 0; i < num_input_fields; ++i) { - code << " CeedScalar U_" << i << "[" << input_sizes[i] << "];\n"; + code << " CeedScalar U_" << i << "[" << input_sizes[i] << "];\n"; } - code << " const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "] = {U_0"; + code << " const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "] = {U_0"; for (CeedInt i = 1; i < num_input_fields; i++) { code << ", U_" << i << "\n"; } code << "};\n\n"; // Outputs - code << " // Output fields\n"; + code << " // Output fields\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " CeedScalar V_" << i << "[" << output_sizes[i] << "];\n"; + code << " CeedScalar V_" << i << "[" << output_sizes[i] << "];\n"; } - code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "] = {V_0"; + code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "] = {V_0"; for (CeedInt i = 1; i < num_output_fields; i++) { code << ", V_" << i << "\n"; } code << "};\n\n"; - code << " const CeedInt q = get_global_linear_id();\n\n"; + code << " const CeedInt q = id;\n\n"; - code << "if(q < Q){ \n\n"; + code << " if(q < Q) { \n\n"; // Load inputs - code << " // -- Load inputs\n"; + code << " // -- Load inputs\n"; for (CeedInt i = 0; i < num_input_fields; i++) { - code << " readQuads(" << input_sizes[i] << ", Q, q, " - << "in_" << i << ", U_" << i << ");\n"; + code << " readQuads<" << input_sizes[i] << ">(q, Q, " + << "fields_inputs[" << i << "], U_" << i << ");\n"; } code << "\n"; // QFunction - code << " // -- Call QFunction\n"; - code << " " << qf_name_view << "(ctx, 1, inputs, outputs);\n\n"; + code << " // -- Call QFunction\n"; + code << " " << qf_name_view << "(ctx, 1, inputs, outputs);\n\n"; // Write outputs - code << " // -- Write outputs\n"; + code << " // -- Write outputs\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " writeQuads(" << output_sizes[i] << ", Q, q, " - << "V_" << i << ", out_" << i << ");\n"; + code << " writeQuads<" << output_sizes[i] << ">(q, Q, " + << "V_" << i << ", fields_outputs[" << i << "]);\n"; } - code << "\n"; + code << " }\n"; // End kernel function body - code << "}\n"; + code <<" });\n"; + // End launcher function code << "}\n"; // View kernel for debugging diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp index 4de8fcf379..ced56cb005 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp @@ -41,18 +41,24 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); // Read vectors - std::vector inputs(num_input_fields); - const CeedVector *U_i = U; - for (auto &input_i : inputs) { - CeedCallBackend(CeedVectorGetArrayRead(*U_i, CEED_MEM_DEVICE, &input_i)); - ++U_i; + // std::vector inputs(num_input_fields); + // const CeedVector *U_i = U; + // for (auto &input_i : inputs) { + // CeedCallBackend(CeedVectorGetArrayRead(*U_i, CEED_MEM_DEVICE, &input_i)); + // ++U_i; + // } + + // std::vector outputs(num_output_fields); + // CeedVector *V_i = V; + // for (auto &output_i : outputs) { + // CeedCallBackend(CeedVectorGetArrayWrite(*V_i, CEED_MEM_DEVICE, &output_i)); + // ++V_i; + // } + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_DEVICE, &impl->fields.inputs[i])); } - - std::vector outputs(num_output_fields); - CeedVector *V_i = V; - for (auto &output_i : outputs) { - CeedCallBackend(CeedVectorGetArrayWrite(*V_i, CEED_MEM_DEVICE, &output_i)); - ++V_i; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedVectorGetArrayRead(V[i], CEED_MEM_DEVICE, &impl->fields.outputs[i])); } // Get context data @@ -88,17 +94,26 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C cgh.parallel_for(kernel_range, *(impl->QFunction)); }); + // Call launcher function that executes kernel + *(impl->QFunction)(sycl_queue, context_data, Q, fields); + // Restore vectors - U_i = U; - for (auto &input_i : inputs) { - CeedCallBackend(CeedVectorRestoreArrayRead(*U_i, &input_i)); - ++U_i; + // U_i = U; + // for (auto &input_i : inputs) { + // CeedCallBackend(CeedVectorRestoreArrayRead(*U_i, &input_i)); + // ++U_i; + // } + + // V_i = V; + // for (auto &output_i : outputs) { + // CeedCallBackend(CeedVectorRestoreArray(*V_i, &output_i)); + // ++V_i; + // } + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedVectorRestoreArrayRead(U[i], &impl->fields.inputs[i])); } - - V_i = V; - for (auto &output_i : outputs) { - CeedCallBackend(CeedVectorRestoreArray(*V_i, &output_i)); - ++V_i; + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedVectorRestoreArray(V[i], &impl->fields.outputs[i])); } // Restore context @@ -131,6 +146,13 @@ int CeedQFunctionCreate_Sycl(CeedQFunction qf) { CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionSetData(qf, impl)); + + // Read QFunction source + CeedCallBackend(CeedQFunctionGetKernelName(qf, &impl->qfunction_name)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n"); + CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &impl->qfunction_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n"); + // Register backend functions CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl)); diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp index ae765dbafc..2b08f1f95a 100644 --- a/backends/sycl-ref/ceed-sycl-ref.hpp +++ b/backends/sycl-ref/ceed-sycl-ref.hpp @@ -68,9 +68,14 @@ typedef struct { CeedScalar *d_q_weight; } CeedBasisNonTensor_Sycl; +using SyclQfunctionKernel_t = std::function, void*, CeedInt, Fields_Sycl)>; + typedef struct { SyclModule_t *sycl_module; - sycl::kernel *QFunction; + const char *qfunction_name; + const char *qfunction_source; + SyclQfunctionKernel_t *QFunction; + Fields_Sycl fields; } CeedQFunction_Sycl; typedef struct { diff --git a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h index d62de2533a..f3efdafa35 100644 --- a/include/ceed/jit-source/sycl/sycl-ref-qfunction.h +++ b/include/ceed/jit-source/sycl/sycl-ref-qfunction.h @@ -13,13 +13,15 @@ //------------------------------------------------------------------------------ // Read from quadrature points //------------------------------------------------------------------------------ -inline void readQuads(CeedInt N, CeedInt stride, CeedInt offset, const CeedScalar *src, CeedScalar *dest) { - for (CeedInt i = 0; i < N; ++i) dest[i] = src[stride * i + offset]; +template +inline void readQuads(CeedInt offset, CeedInt stride, const CeedScalar *src, CeedScalar *dest) { + for (CeedInt i = 0; i < SIZE; ++i) dest[i] = src[offset + stride * i]; } //------------------------------------------------------------------------------ // Write at quadrature points //------------------------------------------------------------------------------ -inline void writeQuads(CeedInt N, CeedInt stride, CeedInt offset, const CeedScalar *src, CeedScalar *dest) { - for (CeedInt i = 0; i < N; ++i) dest[stride * i + offset] = src[i]; +template +inline void writeQuads(CeedInt offset, CeedInt stride, const CeedScalar *src, CeedScalar *dest) { + for (CeedInt i = 0; i < SIZE; ++i) dest[offset + stride * i] = src[i]; }