Skip to content

Commit

Permalink
runsOnPolaris but not using any merged code
Browse files Browse the repository at this point in the history
  • Loading branch information
KennethEJansen committed Oct 9, 2023
1 parent 936b246 commit b624c09
Show file tree
Hide file tree
Showing 28 changed files with 678 additions and 556 deletions.
2 changes: 1 addition & 1 deletion backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@
#define _ceed_sycl_gen_operator_build_h

CEED_INTERN int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes);
CEED_INTERN int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op);
CEED_INTERN int CeedSyclGenOperatorBuild(CeedOperator op);

#endif // _ceed_sycl_gen_operator_build_h
149 changes: 65 additions & 84 deletions backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp

Large diffs are not rendered by default.

53 changes: 27 additions & 26 deletions backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
CeedCallBackend(CeedGetData(ceed, &ceed_Sycl));
CeedOperator_Sycl_gen *impl;
CeedCallBackend(CeedOperatorGetData(op, &impl));
CeedQFunction qf;
CeedQFunction qf;
CeedQFunction_Sycl_gen *qf_impl;
CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl));
Expand All @@ -47,7 +47,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
CeedVector vec, output_vecs[CEED_FIELD_MAX] = {};

// Creation of the operator
CeedCallBackend(CeedOperatorBuildKernel_Sycl_gen(op));
CeedCallBackend(CeedSyclGenOperatorBuild(op));

// Input vectors
for (CeedInt i = 0; i < num_input_fields; i++) {
Expand Down Expand Up @@ -92,36 +92,38 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_impl->d_c));

// Apply operator
const CeedInt dim = impl->dim;
const CeedInt Q_1d = impl->Q_1d;
const CeedInt P_1d = impl->max_P_1d;
const CeedInt dim = impl->dim;
const CeedInt Q_1d = impl->Q_1d;
const CeedInt P_1d = impl->max_P_1d;
const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
CeedInt block_sizes[3], grid = 0;
CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes));
if (dim == 1) {
grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
// CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
//CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
} else if (dim == 2) {
grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
// CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
//CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
} else if (dim == 3) {
grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
// CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
//CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
}

sycl::range<3> local_range(block_sizes[2], block_sizes[1], block_sizes[0]);
sycl::range<3> global_range(grid * block_sizes[2], block_sizes[1], block_sizes[0]);
sycl::nd_range<3> kernel_range(global_range, local_range);

sycl::range<3> local_range(block_sizes[2], block_sizes[1], block_sizes[0]);
sycl::range<3> global_range(grid*block_sizes[2], block_sizes[1], block_sizes[0]);
sycl::nd_range<3> kernel_range(global_range,local_range);
//-----------
// Order queue
//Order queue
sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();

CeedCallSycl(ceed, ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {

CeedCallSycl(ceed,
ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){
cgh.depends_on(e);
cgh.set_args(num_elem, qf_impl->d_c, impl->indices, impl->fields, impl->B, impl->G, impl->W);
cgh.parallel_for(kernel_range, *(impl->op));
cgh.parallel_for(kernel_range,*(impl->op));
}));
CeedCallSycl(ceed, ceed_Sycl->sycl_queue.wait_and_throw());
CeedCallSycl(ceed,ceed_Sycl->sycl_queue.wait_and_throw());

// Restore input arrays
for (CeedInt i = 0; i < num_input_fields; i++) {
Expand Down Expand Up @@ -174,15 +176,14 @@ int CeedOperatorCreate_Sycl_gen(CeedOperator op) {
CeedCallBackend(CeedCalloc(1, &impl));
CeedCallBackend(CeedOperatorSetData(op, impl));

impl->indices = sycl::malloc_device<FieldsInt_Sycl>(1, sycl_data->sycl_device, sycl_data->sycl_context);
impl->fields = sycl::malloc_host<Fields_Sycl>(1, sycl_data->sycl_context);
impl->B = sycl::malloc_device<Fields_Sycl>(1, sycl_data->sycl_device, sycl_data->sycl_context);
impl->G = sycl::malloc_device<Fields_Sycl>(1, sycl_data->sycl_device, sycl_data->sycl_context);
impl->W = sycl::malloc_device<CeedScalar>(1, sycl_data->sycl_device, sycl_data->sycl_context);
impl->indices = sycl::malloc_device<FieldsInt_Sycl>(1,sycl_data->sycl_device,sycl_data->sycl_context);
impl->fields = sycl::malloc_host<Fields_Sycl>(1,sycl_data->sycl_context);
impl->B = sycl::malloc_device<Fields_Sycl>(1,sycl_data->sycl_device,sycl_data->sycl_context);
impl->G = sycl::malloc_device<Fields_Sycl>(1,sycl_data->sycl_device,sycl_data->sycl_context);
impl->W = sycl::malloc_device<CeedScalar>(1,sycl_data->sycl_device,sycl_data->sycl_context);

CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl_gen));
CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl_gen));
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
3 changes: 1 addition & 2 deletions backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ static int CeedQFunctionDestroy_Sycl_gen(CeedQFunction qf) {
// Wait for all work to finish before freeing memory
CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
CeedCallSycl(ceed, sycl::free(impl->d_c, data->sycl_context));

CeedCallBackend(CeedFree(&impl->q_function_source));
CeedCallBackend(CeedFree(&impl));
return CEED_ERROR_SUCCESS;
Expand Down Expand Up @@ -67,5 +67,4 @@ int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf) {
CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl_gen));
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
12 changes: 6 additions & 6 deletions backends/sycl-gen/ceed-sycl-gen.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@
#include "../sycl/ceed-sycl-compile.hpp"

typedef struct {
CeedInt dim;
CeedInt Q_1d;
CeedInt max_P_1d;
SyclModule_t *sycl_module;
sycl::kernel *op;
CeedInt dim;
CeedInt Q_1d;
CeedInt max_P_1d;
SyclModule_t* sycl_module;
sycl::kernel* op;
FieldsInt_Sycl *indices;
Fields_Sycl *fields;
Fields_Sycl *B;
Fields_Sycl *G;
CeedScalar *W;
CeedScalar *W;
} CeedOperator_Sycl_gen;

typedef struct {
Expand Down
7 changes: 3 additions & 4 deletions backends/sycl-gen/ceed-sycl-gen.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@
#include <ceed/backend.h>
#include <ceed/ceed.h>

#include <string.h>
#include <string>
#include <string_view>
#include <string.h>

//------------------------------------------------------------------------------
// Backend init
//------------------------------------------------------------------------------
static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
char *resource_root;
CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":device_id=", &resource_root));
CeedCallBackend(CeedSyclGetResourceRoot(ceed, resource, &resource_root));
if (strcmp(resource_root, "/gpu/sycl") && strcmp(resource_root, "/gpu/sycl/gen")) {
// LCOV_EXCL_START
return CeedError(ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource);
Expand All @@ -30,7 +30,7 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
Ceed_Sycl *data;
CeedCallBackend(CeedCalloc(1, &data));
CeedCallBackend(CeedSetData(ceed, data));
CeedCallBackend(CeedInit_Sycl(ceed, resource));
CeedCallBackend(CeedSyclInit(ceed, resource));

Ceed ceed_shared;
CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared));
Expand All @@ -55,5 +55,4 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
// Register backend
//------------------------------------------------------------------------------
CEED_INTERN int CeedRegister_Sycl_Gen(void) { return CeedRegister("/gpu/sycl/gen", CeedInit_Sycl_gen, 20); }

//------------------------------------------------------------------------------
88 changes: 45 additions & 43 deletions backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
#include "../sycl/ceed-sycl-compile.hpp"
#include "ceed-sycl-ref.hpp"

template <int>
class CeedBasisSyclInterp;
template <int>
class CeedBasisSyclGrad;
template<int> class CeedBasisSyclInterp;
template<int> class CeedBasisSyclGrad;
class CeedBasisSyclWeight;

class CeedBasisSyclInterpNT;
Expand All @@ -35,11 +33,11 @@ static constexpr SpecID BASIS_Q_1D_ID;
//------------------------------------------------------------------------------
// Interpolation kernel - tensor
//------------------------------------------------------------------------------
template <int transpose>
static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
const CeedScalar *u, CeedScalar *v) {
const CeedInt buf_len = impl->buf_len;
const CeedInt op_len = impl->op_len;
template<int transpose>
static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t& sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, const CeedScalar *u,
CeedScalar *v) {
const CeedInt buf_len = impl->buf_len;
const CeedInt op_len = impl->op_len;
const CeedScalar *interp_1d = impl->d_interp_1d;

const sycl::device &sycl_device = sycl_queue.get_device();
Expand All @@ -65,8 +63,8 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t
const CeedInt P_1d = kh.get_specialization_constant<BASIS_P_1D_ID>();
const CeedInt Q_1d = kh.get_specialization_constant<BASIS_Q_1D_ID>();
//-------------------------------------------------------------->
const CeedInt num_nodes = CeedIntPow(P_1d, dim);
const CeedInt num_qpts = CeedIntPow(Q_1d, dim);
const CeedInt num_nodes = CeedIntPow(P_1d,dim);
const CeedInt num_qpts = CeedIntPow(Q_1d,dim);
const CeedInt P = transpose ? Q_1d : P_1d;
const CeedInt Q = transpose ? P_1d : Q_1d;
const CeedInt stride_0 = transpose ? 1 : P_1d;
Expand Down Expand Up @@ -136,16 +134,16 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t
//------------------------------------------------------------------------------
// Gradient kernel - tensor
//------------------------------------------------------------------------------
template <int transpose>
static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
const CeedScalar *u, CeedScalar *v) {
const CeedInt buf_len = impl->buf_len;
const CeedInt op_len = impl->op_len;
template<int transpose>
static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t& sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, const CeedScalar *u,
CeedScalar *v) {
const CeedInt buf_len = impl->buf_len;
const CeedInt op_len = impl->op_len;
const CeedScalar *interp_1d = impl->d_interp_1d;
const CeedScalar *grad_1d = impl->d_grad_1d;

const sycl::device &sycl_device = sycl_queue.get_device();
const CeedInt work_group_size = 32;
const sycl::device &sycl_device = sycl_queue.get_device();
const CeedInt work_group_size = 32;
sycl::range<1> local_range(work_group_size);
sycl::range<1> global_range(num_elem * work_group_size);
sycl::nd_range<1> kernel_range(global_range, local_range);
Expand All @@ -166,22 +164,23 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &
const CeedInt P_1d = kh.get_specialization_constant<BASIS_P_1D_ID>();
const CeedInt Q_1d = kh.get_specialization_constant<BASIS_Q_1D_ID>();
//-------------------------------------------------------------->
const CeedInt num_nodes = CeedIntPow(P_1d, dim);
const CeedInt num_qpts = CeedIntPow(Q_1d, dim);
const CeedInt P = transpose ? Q_1d : P_1d;
const CeedInt Q = transpose ? P_1d : Q_1d;
const CeedInt stride_0 = transpose ? 1 : P_1d;
const CeedInt stride_1 = transpose ? P_1d : 1;
const CeedInt u_stride = transpose ? num_qpts : num_nodes;
const CeedInt v_stride = transpose ? num_nodes : num_qpts;
const CeedInt num_nodes = CeedIntPow(P_1d,dim);
const CeedInt num_qpts = CeedIntPow(Q_1d,dim);
const CeedInt P = transpose ? Q_1d : P_1d;
const CeedInt Q = transpose ? P_1d : Q_1d;
const CeedInt stride_0 = transpose ? 1 : P_1d;
const CeedInt stride_1 = transpose ? P_1d : 1;
const CeedInt u_stride = transpose ? num_qpts : num_nodes;
const CeedInt v_stride = transpose ? num_nodes : num_qpts;
const CeedInt u_comp_stride = num_elem * u_stride;
const CeedInt v_comp_stride = num_elem * v_stride;
const CeedInt u_dim_stride = transpose ? num_elem * num_qpts * num_comp : 0;
const CeedInt v_dim_stride = transpose ? 0 : num_elem * num_qpts * num_comp;
sycl::group work_group = work_item.get_group();
const CeedInt i = work_item.get_local_linear_id();
const CeedInt group_size = work_group.get_local_linear_range();
const CeedInt elem = work_group.get_group_linear_id();

sycl::group work_group = work_item.get_group();
const CeedInt i = work_item.get_local_linear_id();
const CeedInt group_size = work_group.get_local_linear_range();
const CeedInt elem = work_group.get_group_linear_id();

CeedScalar *s_interp_1d = s_mem.get_pointer();
CeedScalar *s_grad_1d = s_interp_1d + P * Q;
Expand All @@ -204,7 +203,7 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &
for (CeedInt dim_2 = 0; dim_2 < dim; dim_2++) {
// Use older version of sycl workgroup barrier for performance reasons
// Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
// sycl::group_barrier(work_group);
//sycl::group_barrier(work_group);
work_item.barrier(sycl::access::fence_space::local_space);

pre /= P;
Expand Down Expand Up @@ -292,14 +291,14 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran
// Basis action
switch (eval_mode) {
case CEED_EVAL_INTERP: {
if (transpose) {
if(transpose) {
CeedCallBackend(CeedBasisApplyInterp_Sycl<CEED_TRANSPOSE>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
} else {
CeedCallBackend(CeedBasisApplyInterp_Sycl<CEED_NOTRANSPOSE>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
}
} break;
case CEED_EVAL_GRAD: {
if (transpose) {
if(transpose) {
CeedCallBackend(CeedBasisApplyGrad_Sycl<1>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
} else {
CeedCallBackend(CeedBasisApplyGrad_Sycl<0>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
Expand Down Expand Up @@ -582,27 +581,31 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();

CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, {e});
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d,{e});

const CeedInt interp_length = Q_1d * P_1d;
CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, {e});
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length,{e});

CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, {e});
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length,{e});

CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));

std::vector<sycl::kernel_id> kernel_ids = {sycl::get_kernel_id<CeedBasisSyclInterp<1>>(), sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
sycl::get_kernel_id<CeedBasisSyclGrad<1>>(), sycl::get_kernel_id<CeedBasisSyclGrad<0>>()};
std::vector<sycl::kernel_id> kernel_ids = {
sycl::get_kernel_id<CeedBasisSyclInterp<1>>(),
sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
sycl::get_kernel_id<CeedBasisSyclGrad<1>>(),
sycl::get_kernel_id<CeedBasisSyclGrad<0>>()
};

sycl::kernel_bundle<sycl::bundle_state::input> input_bundle = sycl::get_kernel_bundle<sycl::bundle_state::input>(data->sycl_context, kernel_ids);
input_bundle.set_specialization_constant<BASIS_DIM_ID>(dim);
input_bundle.set_specialization_constant<BASIS_NUM_COMP_ID>(num_comp);
input_bundle.set_specialization_constant<BASIS_Q_1D_ID>(Q_1d);
input_bundle.set_specialization_constant<BASIS_P_1D_ID>(P_1d);

CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));
CeedCallSycl(ceed,impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));

CeedCallBackend(CeedBasisSetData(basis, impl));

Expand Down Expand Up @@ -636,15 +639,15 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();

CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, {e});
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts,{e});

const CeedInt interp_length = num_qpts * num_nodes;
CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, {e});
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length,{e});

const CeedInt grad_length = num_qpts * num_nodes * dim;
CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, {e});
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length,{e});

CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));

Expand All @@ -655,5 +658,4 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl));
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
Loading

0 comments on commit b624c09

Please sign in to comment.