Skip to content

Commit

Permalink
Added changes to sycl-ref qfunction generation
Browse files Browse the repository at this point in the history
  • Loading branch information
uumesh committed Oct 2, 2024
1 parent 5026b11 commit 77317b1
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 78 deletions.
112 changes: 59 additions & 53 deletions backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,30 +45,8 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
// QFunction kernel generation
CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields));

std::vector<CeedInt> input_sizes(num_input_fields);
CeedQFunctionField *input_i = input_fields;

for (auto &size_i : input_sizes) {
CeedCallBackend(CeedQFunctionFieldGetSize(*input_i, &size_i));
++input_i;
}

std::vector<CeedInt> output_sizes(num_output_fields);
CeedQFunctionField *output_i = output_fields;

for (auto &size_i : output_sizes) {
CeedCallBackend(CeedQFunctionFieldGetSize(*output_i, &size_i));
++output_i;
}

CeedCallBackend(CeedQFunctionGetKernelName(qf, &qfunction_name));

CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &qfunction_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");

CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-ref-qfunction.h", &read_write_kernel_path));

// Build strings for final kernel function
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-ref-qfunction.h", &read_write_kernel_path));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n");
{
char *source;
Expand All @@ -77,12 +55,27 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
read_write_kernel_source = source;
}
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n");

std::string_view qf_name_view(qfunction_name);
std::string_view qf_source_view(qfunction_source);
std::string_view qf_name_view(impl->qfunction_name);
std::string_view qf_source_view(impl->qfunction_source);
std::string_view rw_source_view(read_write_kernel_source);
const std::string kernel_name = "CeedKernelSyclRefQFunction_" + std::string(qf_name_view);

// std::vector<CeedInt> input_sizes(num_input_fields);
// CeedQFunctionField *input_i = input_fields;

// for (auto &size_i : input_sizes) {
// CeedCallBackend(CeedQFunctionFieldGetSize(*input_i, &size_i));
// ++input_i;
// }

// std::vector<CeedInt> output_sizes(num_output_fields);
// CeedQFunctionField *output_i = output_fields;

// for (auto &size_i : output_sizes) {
// CeedCallBackend(CeedQFunctionFieldGetSize(*output_i, &size_i));
// ++output_i;
// }

// Defintions
std::ostringstream code;
code << rw_source_view;
Expand All @@ -92,75 +85,88 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
// Kernel function
// Here we are fixing a lower sub-group size value to avoid register spills
// This needs to be revisited if all qfunctions require this.
code << "__attribute__((intel_reqd_sub_group_size(" << SUB_GROUP_SIZE_QF << "))) __kernel void " << kernel_name
<< "(__global void *ctx, CeedInt Q,\n";
// code << "__attribute__((intel_reqd_sub_group_size(" << SUB_GROUP_SIZE_QF << "))) extern \"C\" void " << kernel_name
code << "#include <vector>\n\n";
code << "extern \"C\" void " << kernel_name
<< "(sycl::queue &sycl_queue, sycl::nd_range<1> kernel_range, void *ctx, CeedInt Q, Fields_Sycl fields) {\n";

// OpenCL doesn't allow for structs with pointers.
// We will need to pass all of the arguments individually.
// Input parameters
code << " "
<< "const CeedScalar *fields_inputs[" << num_input_fields << "];\n";
for (CeedInt i = 0; i < num_input_fields; ++i) {
code << " "
<< "__global const CeedScalar *in_" << i << ",\n";
<< "fields_inputs[" << i << "] = fields.inputs[" << i << "];\n";
}

// Output parameters
code << " "
<< "__global CeedScalar *out_0";
for (CeedInt i = 1; i < num_output_fields; ++i) {
code << "\n, "
<< "__global CeedScalar *out_" << i;
<< "const CeedScalar *fields_outputs[" << num_output_fields << "];\n";
for (CeedInt i = 0; i < num_output_fields; ++i) {
code << " "
<< "fields_outputs[" << i << "] = fields.outputs[" << i << "];\n";
}
code << "\n";

// Insert SYCL barrier for out-of-order queues
code << " std::vector<sycl::event> e;\n";
code << " if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};\n\n";

// Begin kernel function body
code << ") {\n\n";
code << " "
<< "sycl_queue.parallel_for<CeedQFunction_" << qf_name_view << ">(kernel_range, e, "
<< "[=](sycl::id<1> id) {\n";

// Inputs
code << " // Input fields\n";
code << " // Input fields\n";
for (CeedInt i = 0; i < num_input_fields; ++i) {
code << " CeedScalar U_" << i << "[" << input_sizes[i] << "];\n";
code << " CeedScalar U_" << i << "[" << input_sizes[i] << "];\n";
}
code << " const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "] = {U_0";
code << " const CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "] = {U_0";
for (CeedInt i = 1; i < num_input_fields; i++) {
code << ", U_" << i << "\n";
}
code << "};\n\n";

// Outputs
code << " // Output fields\n";
code << " // Output fields\n";
for (CeedInt i = 0; i < num_output_fields; i++) {
code << " CeedScalar V_" << i << "[" << output_sizes[i] << "];\n";
code << " CeedScalar V_" << i << "[" << output_sizes[i] << "];\n";
}
code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "] = {V_0";
code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "] = {V_0";
for (CeedInt i = 1; i < num_output_fields; i++) {
code << ", V_" << i << "\n";
}
code << "};\n\n";

code << " const CeedInt q = get_global_linear_id();\n\n";
code << " const CeedInt q = id;\n\n";

code << "if(q < Q){ \n\n";
code << " if(q < Q) { \n\n";

// Load inputs
code << " // -- Load inputs\n";
code << " // -- Load inputs\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
code << " readQuads(" << input_sizes[i] << ", Q, q, "
<< "in_" << i << ", U_" << i << ");\n";
code << " readQuads<" << input_sizes[i] << ">(q, Q, "
<< "fields_inputs[" << i << "], U_" << i << ");\n";
}
code << "\n";

// QFunction
code << " // -- Call QFunction\n";
code << " " << qf_name_view << "(ctx, 1, inputs, outputs);\n\n";
code << " // -- Call QFunction\n";
code << " " << qf_name_view << "(ctx, 1, inputs, outputs);\n\n";

// Write outputs
code << " // -- Write outputs\n";
code << " // -- Write outputs\n";
for (CeedInt i = 0; i < num_output_fields; i++) {
code << " writeQuads(" << output_sizes[i] << ", Q, q, "
<< "V_" << i << ", out_" << i << ");\n";
code << " writeQuads<" << output_sizes[i] << ">(q, Q, "
<< "V_" << i << ", fields_outputs[" << i << "]);\n";
}
code << "\n";
code << " }\n";

// End kernel function body
code << "}\n";
code <<" });\n";
// End launcher function
code << "}\n";

// View kernel for debugging
Expand Down
62 changes: 42 additions & 20 deletions backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,24 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C
CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields));

// Read vectors
std::vector<const CeedScalar *> inputs(num_input_fields);
const CeedVector *U_i = U;
for (auto &input_i : inputs) {
CeedCallBackend(CeedVectorGetArrayRead(*U_i, CEED_MEM_DEVICE, &input_i));
++U_i;
// std::vector<const CeedScalar *> inputs(num_input_fields);
// const CeedVector *U_i = U;
// for (auto &input_i : inputs) {
// CeedCallBackend(CeedVectorGetArrayRead(*U_i, CEED_MEM_DEVICE, &input_i));
// ++U_i;
// }

// std::vector<CeedScalar *> outputs(num_output_fields);
// CeedVector *V_i = V;
// for (auto &output_i : outputs) {
// CeedCallBackend(CeedVectorGetArrayWrite(*V_i, CEED_MEM_DEVICE, &output_i));
// ++V_i;
// }
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_DEVICE, &impl->fields.inputs[i]));
}

std::vector<CeedScalar *> outputs(num_output_fields);
CeedVector *V_i = V;
for (auto &output_i : outputs) {
CeedCallBackend(CeedVectorGetArrayWrite(*V_i, CEED_MEM_DEVICE, &output_i));
++V_i;
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedCallBackend(CeedVectorGetArrayRead(V[i], CEED_MEM_DEVICE, &impl->fields.outputs[i]));
}

// Get context data
Expand Down Expand Up @@ -88,17 +94,26 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C
cgh.parallel_for(kernel_range, *(impl->QFunction));
});

// Call launcher function that executes kernel
*(impl->QFunction)(sycl_queue, context_data, Q, fields);

// Restore vectors
U_i = U;
for (auto &input_i : inputs) {
CeedCallBackend(CeedVectorRestoreArrayRead(*U_i, &input_i));
++U_i;
// U_i = U;
// for (auto &input_i : inputs) {
// CeedCallBackend(CeedVectorRestoreArrayRead(*U_i, &input_i));
// ++U_i;
// }

// V_i = V;
// for (auto &output_i : outputs) {
// CeedCallBackend(CeedVectorRestoreArray(*V_i, &output_i));
// ++V_i;
// }
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedCallBackend(CeedVectorRestoreArrayRead(U[i], &impl->fields.inputs[i]));
}

V_i = V;
for (auto &output_i : outputs) {
CeedCallBackend(CeedVectorRestoreArray(*V_i, &output_i));
++V_i;
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedCallBackend(CeedVectorRestoreArray(V[i], &impl->fields.outputs[i]));
}

// Restore context
Expand Down Expand Up @@ -131,6 +146,13 @@ int CeedQFunctionCreate_Sycl(CeedQFunction qf) {
CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
CeedCallBackend(CeedCalloc(1, &impl));
CeedCallBackend(CeedQFunctionSetData(qf, impl));

// Read QFunction source
CeedCallBackend(CeedQFunctionGetKernelName(qf, &impl->qfunction_name));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &impl->qfunction_source));
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");

// Register backend functions
CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl));
CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl));
Expand Down
7 changes: 6 additions & 1 deletion backends/sycl-ref/ceed-sycl-ref.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,14 @@ typedef struct {
CeedScalar *d_q_weight;
} CeedBasisNonTensor_Sycl;

using SyclQfunctionKernel_t = std::function<void(sycl::queue&, sycl::nd_range<1>, void*, CeedInt, Fields_Sycl)>;

typedef struct {
SyclModule_t *sycl_module;
sycl::kernel *QFunction;
const char *qfunction_name;
const char *qfunction_source;
SyclQfunctionKernel_t *QFunction;
Fields_Sycl fields;
} CeedQFunction_Sycl;

typedef struct {
Expand Down
10 changes: 6 additions & 4 deletions include/ceed/jit-source/sycl/sycl-ref-qfunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@
//------------------------------------------------------------------------------
// Read from quadrature points
//------------------------------------------------------------------------------
inline void readQuads(CeedInt N, CeedInt stride, CeedInt offset, const CeedScalar *src, CeedScalar *dest) {
for (CeedInt i = 0; i < N; ++i) dest[i] = src[stride * i + offset];
template <int SIZE>
inline void readQuads(CeedInt offset, CeedInt stride, const CeedScalar *src, CeedScalar *dest) {
for (CeedInt i = 0; i < SIZE; ++i) dest[i] = src[offset + stride * i];
}

//------------------------------------------------------------------------------
// Write at quadrature points
//------------------------------------------------------------------------------
inline void writeQuads(CeedInt N, CeedInt stride, CeedInt offset, const CeedScalar *src, CeedScalar *dest) {
for (CeedInt i = 0; i < N; ++i) dest[stride * i + offset] = src[i];
template <int SIZE>
inline void writeQuads(CeedInt offset, CeedInt stride, const CeedScalar *src, CeedScalar *dest) {
for (CeedInt i = 0; i < SIZE; ++i) dest[offset + stride * i] = src[i];
}

0 comments on commit 77317b1

Please sign in to comment.