diff --git a/Makefile b/Makefile index 3e76c25d2c..428191ab59 100644 --- a/Makefile +++ b/Makefile @@ -111,7 +111,7 @@ OMP_SIMD_FLAG.oneAPI := $(OMP_SIMD_FLAG.clang) SYCL_FLAG.gcc := SYCL_FLAG.clang := -fsycl SYCL_FLAG.icc := -SYCL_FLAG.oneAPI := -fsycl +SYCL_FLAG.oneAPI := -fsycl -fno-sycl-id-queries-fit-in-int OPT.gcc := -g -ffp-contract=fast OPT.clang := $(OPT.gcc) OPT.oneAPI := $(OPT.clang) @@ -275,6 +275,7 @@ hip-gen.cpp := $(sort $(wildcard backends/hip-gen/*.cpp)) sycl-core.cpp := $(sort $(wildcard backends/sycl/*.sycl.cpp)) sycl-ref.cpp := $(sort $(wildcard backends/sycl-ref/*.sycl.cpp)) sycl-shared.cpp:= $(sort $(wildcard backends/sycl-shared/*.sycl.cpp)) +sycl-gen.cpp := $(sort $(wildcard backends/sycl-gen/*.sycl.cpp)) hip-all.c := interface/ceed-hip.c $(hip.c) $(hip-ref.c) $(hip-shared.c) $(hip-gen.c) hip-all.cpp := $(hip.cpp) $(hip-ref.cpp) $(hip-gen.cpp) @@ -453,7 +454,7 @@ ifneq ($(HIP_LIB_DIR),) endif # SYCL Backends -SYCL_BACKENDS = /gpu/sycl/ref /gpu/sycl/shared +SYCL_BACKENDS = /gpu/sycl/ref /gpu/sycl/shared /gpu/sycl/gen ifneq ($(SYCL_DIR),) SYCL_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(SYCL_DIR)/$d/libsycl.${SO_EXT})) SYCL_LIB_DIR := $(patsubst %/,%,$(dir $(firstword $(SYCL_LIB_DIR)))) @@ -461,7 +462,7 @@ endif ifneq ($(SYCL_LIB_DIR),) PKG_LIBS += $(SYCL_FLAG) -lze_loader LIBCEED_CONTAINS_CXX = 1 - libceed.sycl += $(sycl-core.cpp) $(sycl-ref.cpp) $(sycl-shared.cpp) + libceed.sycl += $(sycl-core.cpp) $(sycl-ref.cpp) $(sycl-shared.cpp) $(sycl-gen.cpp) BACKENDS_MAKE += $(SYCL_BACKENDS) endif @@ -556,6 +557,9 @@ $(OBJDIR)/%.o : $(CURDIR)/%.hip.cpp | $$(@D)/.DIR $(OBJDIR)/%.o : $(CURDIR)/%.sycl.cpp | $$(@D)/.DIR $(call quiet,SYCLCXX) $(SYCLFLAGS) $(CPPFLAGS) -c -o $@ $(abspath $<) +$(OBJDIR)/%.o : $(CURDIR)/%.sycl.cpp | $$(@D)/.DIR + $(call quiet,SYCLCXX) $(SYCLFLAGS) $(CPPFLAGS) -c -o $@ $(abspath $<) + $(OBJDIR)/%$(EXE_SUFFIX) : tests/%.c | $$(@D)/.DIR $(call quiet,LINK.c) $(CEED_LDFLAGS) -o $@ $(abspath $<) $(CEED_LIBS) $(CEED_LDLIBS) 
$(LDLIBS) diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h index 23989a0a27..34489a7cd0 100644 --- a/backends/ceed-backend-list.h +++ b/backends/ceed-backend-list.h @@ -21,6 +21,7 @@ CEED_BACKEND(CeedRegister_Hip_Gen, 1, "/gpu/hip/gen") CEED_BACKEND(CeedRegister_Hip_Shared, 1, "/gpu/hip/shared") CEED_BACKEND(CeedRegister_Sycl, 1, "/gpu/sycl/ref") CEED_BACKEND(CeedRegister_Sycl_Shared, 1, "/gpu/sycl/shared") +CEED_BACKEND(CeedRegister_Sycl_Gen, 1, "/gpu/sycl/gen") CEED_BACKEND(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma") CEED_BACKEND(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det") CEED_BACKEND(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked") diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp new file mode 100644 index 0000000000..ca4052739c --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp @@ -0,0 +1,14 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#ifndef _ceed_sycl_gen_operator_build_h +#define _ceed_sycl_gen_operator_build_h + +CEED_INTERN int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes); +CEED_INTERN int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op); + +#endif // _ceed_sycl_gen_operator_build_h diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp new file mode 100644 index 0000000000..2f96d6fe0d --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp @@ -0,0 +1,751 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed

+#define CEED_DEBUG_COLOR 12
+
+#include <ceed/ceed.h>
+#include <ceed/backend.h>
+#include <ceed/jit-tools.h>
+#include <sycl/sycl.hpp>
+
+#include <map>
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "../sycl-ref/ceed-sycl-ref.hpp"
+#include "../sycl-shared/ceed-sycl-shared.hpp"
+#include "../sycl/ceed-sycl-compile.hpp"
+
+#include "ceed-sycl-gen.hpp"
+
+//------------------------------------------------------------------------------
+// Calculate the block size used for launching the operator kernel
+//------------------------------------------------------------------------------
+extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
+  const CeedInt thread1d = CeedIntMax(Q_1d, P_1d);
+  if (dim == 1) {
+    CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64;
+    elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
+    block_sizes[0] = thread1d;
+    block_sizes[1] = 1;
+    block_sizes[2] = elems_per_block;
+  } else if (dim == 2) {
+    const CeedInt elems_per_block = thread1d < 4 ? 16 : 2;
+    block_sizes[0] = thread1d;
+    block_sizes[1] = thread1d;
+    block_sizes[2] = elems_per_block;
+  } else if (dim == 3) {
+    const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1);
+    block_sizes[0] = thread1d;
+    block_sizes[1] = thread1d;
+    block_sizes[2] = elems_per_block;
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Build single operator kernel
+// - [ ] Check arguments to device functions reused from sycl-shared-basis are correct
+// - [ ] Do kernel jitting!
+//------------------------------------------------------------------------------ +extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { + bool is_setup_done; + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); + if (is_setup_done) return CEED_ERROR_SUCCESS; + + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed_Sycl *sycl_data; + CeedCallBackend(CeedGetData(ceed, &sycl_data)); + + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); + Fields_Sycl h_B, h_G; + FieldsInt_Sycl h_indices; + CeedQFunction qf; + CeedQFunction_Sycl_gen *qf_impl; + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); + CeedSize lsize; + CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + Q_1d = Q; + + CeedOperatorField *op_input_fields, *op_output_fields; + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + + CeedEvalMode eval_mode; + CeedBasis basis; + CeedBasis_Sycl_shared *basis_impl; + CeedElemRestriction Erestrict; + CeedElemRestriction_Sycl *restr_impl; + + // Check for restriction only identity operator + bool is_identity_qf; + CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); + if (is_identity_qf) { + CeedEvalMode eval_mode_in, eval_mode_out; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); + if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement restriction only identity operators"); + 
// LCOV_EXCL_STOP + } + } + + std::ostringstream code; + // TODO: generalize to accept different device functions? + { + char *tensor_basis_kernel_path, *tensor_basis_code; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); + CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_code)); + code << tensor_basis_code; + CeedCallBackend(CeedFree(&tensor_basis_kernel_path)); + CeedCallBackend(CeedFree(&tensor_basis_code)); + } + { + char *sycl_gen_template_path, *sycl_gen_template_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-gen-templates.h", &sycl_gen_template_path)); + CeedDebug256(ceed, 2, "----- Loading Sycl-Gen Template Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, sycl_gen_template_path, &sycl_gen_template_source)); + code << sycl_gen_template_source; + CeedCallBackend(CeedFree(&sycl_gen_template_path)); + CeedCallBackend(CeedFree(&sycl_gen_template_source)); + } + + std::string_view q_function_source(qf_impl->q_function_source); + std::string_view q_function_name(qf_impl->q_function_name); + const std::string operator_name = "CeedKernelSyclGenOperator_" + std::string(q_function_name); + + // Find dim, P_1d, Q_1d + impl->max_P_1d = 0; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_COLLOCATED) { + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + + // Collect dim, P_1d, and Q_1d + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + bool isTensor; + CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); + if (isTensor) { + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + 
CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + if (P_1d > impl->max_P_1d) impl->max_P_1d = P_1d; + } else { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); + // LCOV_EXCL_STOP + } + } + } + // Check output bases for Q_1d, dim as well + // The only input basis might be CEED_BASIS_COLLOCATED + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + + if (basis != CEED_BASIS_COLLOCATED) { + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + + // Collect Q_1d + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + bool isTensor; + CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); + if (isTensor) { + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + } else { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); + // LCOV_EXCL_STOP + } + } + } + impl->dim = dim; + impl->Q_1d = Q_1d; + + // Only use 3D collocated gradient parallelization strategy when gradient is computed + // TODO: put in a function? + bool use_collograd_parallelization = false; + if (dim == 3) { + bool was_grad_found = false; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_GRAD) { + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + use_collograd_parallelization = !!basis_impl->d_collo_grad_1d && (was_grad_found ? 
use_collograd_parallelization : true);
+        was_grad_found = true;
+      }
+    }
+    for (CeedInt i = 0; i < num_output_fields; i++) {
+      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
+      if (eval_mode == CEED_EVAL_GRAD) {
+        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
+        CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
+        use_collograd_parallelization = !!basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
+        was_grad_found = true;
+      }
+    }
+  }
+
+  CeedInt block_sizes[3];
+  CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes));
+
+  // Define CEED_Q_VLA
+  code << "\n#undef CEED_Q_VLA\n";
+  if (dim != 3 || use_collograd_parallelization) {
+    code << "#define CEED_Q_VLA 1\n\n";
+  } else {
+    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
+  }
+
+  // Determine subgroup size based on supported sizes : Default : 16 (if supported)
+  std::vector<size_t> allowed_sg_sizes = sycl_data->sycl_device.get_info<sycl::info::device::sub_group_sizes>();
+  CeedInt sub_group_size_op = allowed_sg_sizes[allowed_sg_sizes.size() - 1];
+  for (const auto &s : allowed_sg_sizes) {
+    if (s == 16) {
+      sub_group_size_op = s;
+      break;
+    }
+  }
+
+  code << q_function_source;
+
+  // Kernel function
+  code << "\n// -----------------------------------------------------------------------------\n";
+  code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z), intel_reqd_sub_group_size(" << sub_group_size_op << ")))\n";
+  code << "kernel void " << operator_name << "(";
+  code << "const CeedInt num_elem, ";
+  code << "global void* ctx, ";
+  code << "global const FieldsInt_Sycl* indices, ";
+  code << "global Fields_Sycl* fields, ";
+  code << "global const Fields_Sycl* B, ";
+  code << "global const Fields_Sycl* G, ";
+  code << "global const CeedScalar * restrict W";
+  code << ") {\n";
+
+  for (CeedInt i = 0; i < num_input_fields; i++) {
+    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
+
if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT + code << " global const CeedScalar* d_u_" << i << " = fields->inputs[" << i << "];\n"; + } + } + + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " global CeedScalar* d_v_" << i << " = fields->outputs[" << i << "];\n"; + } + + // TODO: Convert these to defined constants to save on GRF + code << " const CeedInt DIM = " << dim << ";\n"; + code << " const CeedInt Q_1D = " << Q_1d << ";\n"; + + const CeedInt scratch_size = block_sizes[0] * block_sizes[1] * block_sizes[2]; + code << " local CeedScalar scratch[" << scratch_size << "];\n"; + code << " local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; + + code << "\n // -- Input field constants and basis data --\n"; + // Initialize constants, and matrices B and G + for (CeedInt i = 0; i < num_input_fields; i++) { + code << " // ---- Input field " << i << " ----\n"; + // Get elem_size, eval_mode, num_comp + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + + // Set field constants + if (eval_mode != CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_COLLOCATED) { + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " const CeedInt P_in_" << i << " = " << P_1d << ";\n"; + } else { + code << " const CeedInt P_in_" << i << " = " << Q_1d << ";\n"; + } + code << " const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n"; + } + + // Load basis data + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedBasisGetData(basis, 
&basis_impl)); + h_B.inputs[i] = basis_impl->d_interp_1d; + code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + h_B.inputs[i] = basis_impl->d_interp_1d; + code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n"; + if (use_collograd_parallelization) { + h_G.inputs[i] = basis_impl->d_collo_grad_1d; + code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n"; + code << " loadMatrix(Q_1D*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n"; + } else { + bool has_collo_grad = !!basis_impl->d_collo_grad_1d; + h_G.inputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; + code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; + code << " loadMatrix(" << (has_collo_grad ? 
"Q_1D" : ("P_in_" + std::to_string(i))) << "*Q_1D, G->inputs[" << i << "], s_G_in_" << i + << ");\n"; + } + break; + case CEED_EVAL_WEIGHT: + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + } + } + + code << "\n // -- Output field constants and basis data --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + // Get elem_size, eval_mode, num_comp + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + + // Set field constants + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_COLLOCATED) { + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " const CeedInt P_out_" << i << " = " << P_1d << ";\n"; + } else { + code << " const CeedInt P_out_" << i << " = " << Q_1d << ";\n"; + } + code << " const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n"; + + // Load basis data + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + break; // No action + case CEED_EVAL_INTERP: + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + h_B.outputs[i] = basis_impl->d_interp_1d; + code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + h_B.outputs[i] = basis_impl->d_interp_1d; + code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i 
<< ");\n"; + if (use_collograd_parallelization) { + h_G.outputs[i] = basis_impl->d_collo_grad_1d; + code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n"; + code << " loadMatrix(Q_1D*Q_1D, G->outputs[" << i << "], s_G_out_" << i << ");\n"; + } else { + bool has_collo_grad = !!basis_impl->d_collo_grad_1d; + h_G.outputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; + code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; + code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" << i + << ");\n"; + } + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + break; // Should not occur + } + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + } + code << "\n // -- Element loop --\n"; + code << " work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"; + code << " {\n"; + // Input basis apply if needed + // Generate the correct eval mode code for each input + code << " // -- Input field restrictions and basis actions --\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + code << " // ---- Input field " << i << " ----\n"; + // Get elem_size, eval_mode, num_comp + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + + // Restriction + if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { + code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << 
"*P_in_" << i << "];\n"; + + bool is_strided; + CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + if (!is_strided) { + CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); + code << " const CeedInt lsize_in_" << i << " = " << lsize << ";\n"; + CeedInt comp_stride; + CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + code << " // CompStride: " << comp_stride << "\n"; + CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); + h_indices.inputs[i] = restr_impl->d_ind; + code << " readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i + << "], d_u_" << i << ", r_u_" << i << ");\n"; + } else { + bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + CeedInt num_elem; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + } + code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; + code << " readDofsStrided" << dim << "d(num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] + << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n"; + } + } + + // Basis action + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + if (!use_collograd_parallelization) { + code << " private CeedScalar* r_t_" << i << " = r_u_" << i << ";\n"; + } + break; + case CEED_EVAL_INTERP: + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; + code << " Interp" << (dim > 1 ? 
"Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i + << ", r_t_" << i << ", elem_scratch);\n"; + break; + case CEED_EVAL_GRAD: + if (use_collograd_parallelization) { + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; + code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" + << i << ", r_t_" << i << ", elem_scratch);\n"; + } else { + CeedInt P_1d; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n"; + code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i + << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i + << ", r_t_" << i << ", elem_scratch);\n"; + } + break; + case CEED_EVAL_WEIGHT: + code << " CeedScalar r_t_" << i << "[Q_1D];\n"; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); + impl->W = basis_impl->d_q_weight_1d; + code << " Weight" << (dim > 1 ? 
"Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n"; + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + } + } + + // Q function + code << "\n // -- Output field setup --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << "\n // ---- Output field " << i << " ----\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_GRAD) { + if (use_collograd_parallelization) { + // Accumulator for gradient slices + code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; + code << " for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n"; + code << " for (CeedInt j = 0; j < Q_1D; ++j) {\n"; + code << " r_tt_" << i << "[j + i*Q_1D] = 0.0;\n"; + code << " }\n"; + code << " }\n"; + } else { + code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM*Q_1D];\n"; + } + } + if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) { + code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; + } + } + // We treat quadrature points per slice in 3d to save registers + if (use_collograd_parallelization) { + code << "\n // Note: Using planes of 3D elements\n"; + code << " for (CeedInt q = 0; q < Q_1D; q++) {\n"; + code << " // -- Input fields --\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + code << " // ---- Input field " << i << " ----\n"; + // Get elem_size, eval_mode, num_comp + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + // Basis action + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; + + bool is_strided; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + if (!is_strided) { + 
CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); + code << " const CeedInt lsize_in_" << i << " = " << lsize << ";\n"; + CeedInt comp_stride; + CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + code << " // CompStride: " << comp_stride << "\n"; + CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); + h_indices.inputs[i] = restr_impl->d_ind; + code << " readSliceQuadsOffset" + << "3d(num_comp_in_" << i << ", " << comp_stride << ", Q_1D, lsize_in_" << i << ", num_elem, q, indices->inputs[" << i << "], d_u_" + << i << ", r_q_" << i << ");\n"; + } else { + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + CeedInt num_elem; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + } + code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; + code << " readSliceQuadsStrided" + << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i + << ", r_q_" << i << ");\n"; + } + break; + case CEED_EVAL_INTERP: + code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; + code << " for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n"; + code << " r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1D];\n"; + code << " }\n"; + break; + case CEED_EVAL_GRAD: + code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "*DIM];\n"; + code << " gradCollo3d(num_comp_in_" << i << ", Q_1D, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ", elem_scratch);\n"; + break; + case CEED_EVAL_WEIGHT: + code << " CeedScalar r_q_" << i << "[1];\n"; + code << " r_q_" << i << "[0] = r_t_" << i << 
"[q];\n"; + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + } + } + code << "\n // -- Output fields --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + // Basis action + switch (eval_mode) { + case CEED_EVAL_NONE: + code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; + break; // No action + case CEED_EVAL_INTERP: + code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; + break; + case CEED_EVAL_GRAD: + code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*DIM];\n"; + break; + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + } + } + } else { + code << "\n // Note: Using full elements\n"; + code << " // -- Input fields --\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + code << " // ---- Input field " << i << " ----\n"; + code << " private CeedScalar* r_q_" << i << " = r_t_" << i << ";\n"; + } + code << " // -- Output fields --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + code << " private CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n"; + } + } + //-------------------------------------------------- + code << "\n // -- QFunction Inputs and outputs --\n"; + code << " const CeedScalar * in[" << num_input_fields << "];\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { + code << " // ---- Input field " << i << " ----\n"; + code << " in[" << i << "] = r_q_" << i << ";\n"; + } + code << " CeedScalar * out[" << num_output_fields << "];\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + code << " out[" << i << "] = r_qq_" << i << ";\n"; + } + + code 
<< "\n // -- Apply QFunction --\n"; + code << " " << q_function_name << "(ctx, "; + if (dim != 3 || use_collograd_parallelization) { + code << "1"; + } else { + code << "Q_1D"; + } + code << ", in, out);\n"; + //-------------------------------------------------- + + if (use_collograd_parallelization) { + code << " // -- Output fields --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + // Basis action + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; + code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n"; + code << " }\n"; + break; // No action + case CEED_EVAL_INTERP: + code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; + code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n"; + code << " }\n"; + break; + case CEED_EVAL_GRAD: + code << " gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i + << ", elem_scratch);\n"; + break; + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + } + } + code << " }\n"; + } + + // Output basis apply if needed + // Generate the correct eval mode code for each output + code << "\n // -- Output field basis action and restrictions --\n"; + for (CeedInt i = 0; i < num_output_fields; i++) { + code << " // ---- Output field " << i << " ----\n"; + // Get elem_size, eval_mode, num_comp + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + 
CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + // Basis action + code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; + switch (eval_mode) { + case CEED_EVAL_NONE: + code << " private CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n"; + break; // No action + case CEED_EVAL_INTERP: + code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; + code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i + << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; + break; + case CEED_EVAL_GRAD: + code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; + if (use_collograd_parallelization) { + code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i + << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; + } else { + CeedInt P_1d; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i + << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? 
std::to_string(i) : "") << ", s_G_out_" << i + << ", r_v_" << i << ", elem_scratch);\n"; + } + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + break; // Should not occur + } + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP + } + // Restriction + bool is_strided; + CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + if (!is_strided) { + CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); + code << " const CeedInt lsize_out_" << i << " = " << lsize << ";\n"; + CeedInt comp_stride; + CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + code << " // CompStride: " << comp_stride << "\n"; + CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); + h_indices.outputs[i] = restr_impl->d_ind; + code << " writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i + << "], r_v_" << i << ", d_v_" << i << ");\n"; + } else { + bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + CeedInt num_elem; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + } + code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; + code << " writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] + << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n"; + } + } + + code << " }\n"; + code << "}\n"; + code << "// 
-----------------------------------------------------------------------------\n\n"; + + // Copy the struct (containing device addresses) from the host to the device + sycl::event copy_B = sycl_data->sycl_queue.copy(&h_B, impl->B, 1); + sycl::event copy_G = sycl_data->sycl_queue.copy(&h_G, impl->G, 1); + sycl::event copy_indices = sycl_data->sycl_queue.copy(&h_indices, impl->indices, 1); + // These copies can happen while the JIT is being done + CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices})); + + // View kernel for debugging + CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); + CeedDebug(ceed, code.str().c_str()); + + std::map jit_constants; + jit_constants["T_1D"] = block_sizes[0]; + jit_constants["GROUP_SIZE_X"] = block_sizes[0]; + jit_constants["GROUP_SIZE_Y"] = block_sizes[1]; + jit_constants["GROUP_SIZE_Z"] = block_sizes[2]; + + // Compile kernel into a kernel bundle + CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module, jit_constants)); + + // Load kernel function + CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op)); + + CeedCallBackend(CeedOperatorSetSetupDone(op)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp new file mode 100644 index 0000000000..80a63a4b0a --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp @@ -0,0 +1,188 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include + +#include "../sycl/ceed-sycl-compile.hpp" +#include "ceed-sycl-gen-operator-build.hpp" +#include "ceed-sycl-gen.hpp" + +//------------------------------------------------------------------------------ +// Destroy operator +//------------------------------------------------------------------------------ +static int CeedOperatorDestroy_Sycl_gen(CeedOperator op) { + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedFree(&impl)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Apply and add to output +//------------------------------------------------------------------------------ +static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed_Sycl *ceed_Sycl; + CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedQFunction qf; + CeedQFunction_Sycl_gen *qf_impl; + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); + CeedInt num_elem, num_input_fields, num_output_fields; + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedOperatorField *op_input_fields, *op_output_fields; + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedEvalMode eval_mode; + CeedVector vec, output_vecs[CEED_FIELD_MAX] = {}; + + // Creation of the operator + CeedCallBackend(CeedOperatorBuildKernel_Sycl_gen(op)); + + // Input vectors + 
for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + impl->fields->inputs[i] = NULL; + } else { + // Get input vector + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &impl->fields->inputs[i])); + } + } + + // Output vectors + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + impl->fields->outputs[i] = NULL; + } else { + // Get output vector + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; + output_vecs[i] = vec; + // Check for multiple output modes + CeedInt index = -1; + for (CeedInt j = 0; j < i; j++) { + if (vec == output_vecs[j]) { + index = j; + break; + } + } + if (index == -1) { + CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &impl->fields->outputs[i])); + } else { + impl->fields->outputs[i] = impl->fields->outputs[index]; + } + } + } + + // Get context data + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_impl->d_c)); + + // Apply operator + const CeedInt dim = impl->dim; + const CeedInt Q_1d = impl->Q_1d; + const CeedInt P_1d = impl->max_P_1d; + CeedInt block_sizes[3], grid = 0; + CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes)); + if (dim == 1) { + grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + } else if (dim == 2) { + grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 
1 : 0); + // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + } else if (dim == 3) { + grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + } + + sycl::range<3> local_range(block_sizes[2], block_sizes[1], block_sizes[0]); + sycl::range<3> global_range(grid * block_sizes[2], block_sizes[1], block_sizes[0]); + sycl::nd_range<3> kernel_range(global_range, local_range); + + //----------- + // Order queue + sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier(); + + CeedCallSycl(ceed, ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(e); + cgh.set_args(num_elem, qf_impl->d_c, impl->indices, impl->fields, impl->B, impl->G, impl->W); + cgh.parallel_for(kernel_range, *(impl->op)); + })); + CeedCallSycl(ceed, ceed_Sycl->sycl_queue.wait_and_throw()); + + // Restore input arrays + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; + CeedCallBackend(CeedVectorRestoreArrayRead(vec, &impl->fields->inputs[i])); + } + } + + // Restore output arrays + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; + // Check for multiple output modes + CeedInt index = -1; + for (CeedInt j = 0; j < i; j++) { + if (vec == output_vecs[j]) { + index = j; + break; + } + } + if 
(index == -1) { + CeedCallBackend(CeedVectorRestoreArray(vec, &impl->fields->outputs[i])); + } + } + } + + // Restore context data + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_impl->d_c)); + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Create operator +//------------------------------------------------------------------------------ +int CeedOperatorCreate_Sycl_gen(CeedOperator op) { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed_Sycl *sycl_data; + CeedCallBackend(CeedGetData(ceed, &sycl_data)); + + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); + + impl->indices = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); + impl->fields = sycl::malloc_host(1, sycl_data->sycl_context); + impl->B = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); + impl->G = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); + impl->W = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); + + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl_gen)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl_gen)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ diff --git a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp new file mode 100644 index 0000000000..951474476a --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp @@ -0,0 +1,71 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include +#include +#include +#include +#include + +#include "ceed-sycl-gen.hpp" + +//------------------------------------------------------------------------------ +// Apply QFunction +//------------------------------------------------------------------------------ +static int CeedQFunctionApply_Sycl_gen(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { + Ceed ceed; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement QFunctionApply"); +} + +//------------------------------------------------------------------------------ +// Destroy QFunction +//------------------------------------------------------------------------------ +static int CeedQFunctionDestroy_Sycl_gen(CeedQFunction qf) { + Ceed ceed; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + CeedQFunction_Sycl_gen *impl; + CeedCallBackend(CeedQFunctionGetData(qf, &impl)); + Ceed_Sycl *data; + CeedCallBackend(CeedGetData(ceed, &data)); + + // Wait for all work to finish before freeing memory + CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); + CeedCallSycl(ceed, sycl::free(impl->d_c, data->sycl_context)); + + CeedCallBackend(CeedFree(&impl->q_function_source)); + CeedCallBackend(CeedFree(&impl)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Create QFunction +//------------------------------------------------------------------------------ +int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf) { + Ceed ceed; + CeedQFunctionGetCeed(qf, &ceed); + CeedQFunction_Sycl_gen *impl; + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedQFunctionSetData(qf, impl)); + + // Read QFunction source + CeedCallBackend(CeedQFunctionGetKernelName(qf, &impl->q_function_name)); + CeedDebug256(ceed, 2, "----- Loading QFunction User Source -----\n"); + 
CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &impl->q_function_source)); + CeedDebug256(ceed, 2, "----- Loading QFunction User Source Complete! -----\n"); + if (!impl->q_function_source) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "/gpu/sycl/gen backend requires QFunction source code file"); + // LCOV_EXCL_STOP + } + + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl_gen)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl_gen)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ diff --git a/backends/sycl-gen/ceed-sycl-gen.hpp b/backends/sycl-gen/ceed-sycl-gen.hpp new file mode 100644 index 0000000000..e4e716393c --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen.hpp @@ -0,0 +1,41 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#ifndef _ceed_sycl_gen_h +#define _ceed_sycl_gen_h + +#include +#include +#include + +#include "../sycl/ceed-sycl-common.hpp" +#include "../sycl/ceed-sycl-compile.hpp" + +typedef struct { + CeedInt dim; + CeedInt Q_1d; + CeedInt max_P_1d; + SyclModule_t *sycl_module; + sycl::kernel *op; + FieldsInt_Sycl *indices; + Fields_Sycl *fields; + Fields_Sycl *B; + Fields_Sycl *G; + CeedScalar *W; +} CeedOperator_Sycl_gen; + +typedef struct { + char *q_function_name; + char *q_function_source; + void *d_c; +} CeedQFunction_Sycl_gen; + +CEED_INTERN int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf); + +CEED_INTERN int CeedOperatorCreate_Sycl_gen(CeedOperator op); + +#endif // _ceed_sycl_gen_h diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp new file mode 100644 index 0000000000..a5084ff6b7 --- /dev/null +++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp @@ -0,0 +1,59 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include "ceed-sycl-gen.hpp" + +#include +#include + +#include +#include +#include + +//------------------------------------------------------------------------------ +// Backend init +//------------------------------------------------------------------------------ +static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) { + char *resource_root; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":device_id=", &resource_root)); + if (strcmp(resource_root, "/gpu/sycl") && strcmp(resource_root, "/gpu/sycl/gen")) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedFree(&resource_root)); + + Ceed_Sycl *data; + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedSetData(ceed, data)); + CeedCallBackend(CeedInit_Sycl(ceed, resource)); + + Ceed ceed_shared; + CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared)); + + Ceed_Sycl *shared_data; + CeedCallBackend(CeedGetData(ceed_shared, &shared_data)); + // Need to use the same queue everywhere for correct synchronization + shared_data->sycl_queue = data->sycl_queue; + + CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); + + const char fallbackresource[] = "/gpu/sycl/ref"; + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); + + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Sycl_gen)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Sycl_gen)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Sycl)); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Register backend +//------------------------------------------------------------------------------ 
+CEED_INTERN int CeedRegister_Sycl_Gen(void) { return CeedRegister("/gpu/sycl/gen", CeedInit_Sycl_gen, 20); } + +//------------------------------------------------------------------------------ diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp index 6967c1442f..601ebdda95 100644 --- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp @@ -178,11 +178,10 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t & const CeedInt v_comp_stride = num_elem * v_stride; const CeedInt u_dim_stride = transpose ? num_elem * num_qpts * num_comp : 0; const CeedInt v_dim_stride = transpose ? 0 : num_elem * num_qpts * num_comp; - - sycl::group work_group = work_item.get_group(); - const CeedInt i = work_item.get_local_linear_id(); - const CeedInt group_size = work_group.get_local_linear_range(); - const CeedInt elem = work_group.get_group_linear_id(); + sycl::group work_group = work_item.get_group(); + const CeedInt i = work_item.get_local_linear_id(); + const CeedInt group_size = work_group.get_local_linear_range(); + const CeedInt elem = work_group.get_group_linear_id(); CeedScalar *s_interp_1d = s_mem.get_pointer(); CeedScalar *s_grad_1d = s_interp_1d + P * Q; diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp index 0f1f1f1de6..7aa3222a96 100644 --- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp @@ -787,7 +787,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const std::vector copy_events; if (evalNone) { CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity)); - for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; + for (CeedSize i = 0; i < (nnodes < nqpts ? 
nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device(iLen, sycl_data->sycl_device, sycl_data->sycl_context)); sycl::event identity_copy = sycl_data->sycl_queue.copy(identity, diag->d_identity, iLen, {e}); copy_events.push_back(identity_copy); @@ -838,11 +838,11 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const //------------------------------------------------------------------------------ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool pointBlock, const CeedInt nelem, const CeedOperatorDiag_Sycl *diag, const CeedScalar *assembledqfarray, CeedScalar *elemdiagarray) { - const CeedInt nnodes = diag->nnodes; - const CeedInt nqpts = diag->nqpts; - const CeedInt ncomp = diag->ncomp; - const CeedInt numemodein = diag->numemodein; - const CeedInt numemodeout = diag->numemodeout; + const CeedSize nnodes = diag->nnodes; + const CeedSize nqpts = diag->nqpts; + const CeedSize ncomp = diag->ncomp; + const CeedSize numemodein = diag->numemodein; + const CeedSize numemodeout = diag->numemodeout; const CeedScalar *identity = diag->d_identity; const CeedScalar *interpin = diag->d_interpin; @@ -864,23 +864,23 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool p // Each element CeedInt dout = -1; // Each basis eval mode pair - for (CeedInt eout = 0; eout < numemodeout; eout++) { + for (CeedSize eout = 0; eout < numemodeout; eout++) { const CeedScalar *bt = NULL; if (emodeout[eout] == CEED_EVAL_GRAD) ++dout; CeedOperatorGetBasisPointer_Sycl(&bt, emodeout[eout], identity, interpout, &gradout[dout * nqpts * nnodes]); CeedInt din = -1; - for (CeedInt ein = 0; ein < numemodein; ein++) { + for (CeedSize ein = 0; ein < numemodein; ein++) { const CeedScalar *b = NULL; if (emodein[ein] == CEED_EVAL_GRAD) ++din; CeedOperatorGetBasisPointer_Sycl(&b, emodein[ein], identity, interpin, &gradin[din * nqpts * nnodes]); // Each component - for 
(CeedInt compOut = 0; compOut < ncomp; compOut++) { + for (CeedSize compOut = 0; compOut < ncomp; compOut++) { // Each qpoint/node pair if (pointBlock) { // Point Block Diagonal for (CeedInt compIn = 0; compIn < ncomp; compIn++) { CeedScalar evalue = 0.0; - for (CeedInt q = 0; q < nqpts; q++) { + for (CeedSize q = 0; q < nqpts; q++) { const CeedScalar qfvalue = assembledqfarray[((((ein * ncomp + compIn) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q]; evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid]; @@ -890,7 +890,7 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool p } else { // Diagonal Only CeedScalar evalue = 0.0; - for (CeedInt q = 0; q < nqpts; q++) { + for (CeedSize q = 0; q < nqpts; q++) { const CeedScalar qfvalue = assembledqfarray[((((ein * ncomp + compOut) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q]; evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid]; @@ -1187,23 +1187,23 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp // TODO: expand to more general cases CeedOperatorAssemble_Sycl *asmb = impl->asmb; const CeedInt nelem = asmb->nelem; - const CeedInt nnodes = asmb->nnodes; - const CeedInt ncomp = asmb->ncomp; - const CeedInt nqpts = asmb->nqpts; - const CeedInt numemodein = asmb->numemodein; - const CeedInt numemodeout = asmb->numemodeout; + const CeedSize nnodes = asmb->nnodes; + const CeedSize ncomp = asmb->ncomp; + const CeedSize nqpts = asmb->nqpts; + const CeedSize numemodein = asmb->numemodein; + const CeedSize numemodeout = asmb->numemodeout; // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element, // comp_in, comp_out, node_row, node_col - const CeedInt comp_out_stride = nnodes * nnodes; - const CeedInt comp_in_stride = comp_out_stride * ncomp; - const CeedInt e_stride = comp_in_stride * ncomp; + const CeedSize 
comp_out_stride = nnodes * nnodes; + const CeedSize comp_in_stride = comp_out_stride * ncomp; + const CeedSize e_stride = comp_in_stride * ncomp; // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt - const CeedInt qe_stride = nqpts; - const CeedInt qcomp_out_stride = nelem * qe_stride; - const CeedInt qemode_out_stride = qcomp_out_stride * ncomp; - const CeedInt qcomp_in_stride = qemode_out_stride * numemodeout; - const CeedInt qemode_in_stride = qcomp_in_stride * ncomp; + const CeedSize qe_stride = nqpts; + const CeedSize qcomp_out_stride = nelem * qe_stride; + const CeedSize qemode_out_stride = qcomp_out_stride * ncomp; + const CeedSize qcomp_in_stride = qemode_out_stride * numemodeout; + const CeedSize qemode_in_stride = qcomp_in_stride * ncomp; CeedScalar *B_in, *B_out; B_in = asmb->d_B_in; @@ -1220,22 +1220,22 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp const int l = idx.get(1); // The output column index of each B^TDB operation const int i = idx.get(2); // The output row index of each B^TDB operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il - for (CeedInt comp_in = 0; comp_in < ncomp; comp_in++) { - for (CeedInt comp_out = 0; comp_out < ncomp; comp_out++) { + for (CeedSize comp_in = 0; comp_in < ncomp; comp_in++) { + for (CeedSize comp_out = 0; comp_out < ncomp; comp_out++) { CeedScalar result = 0.0; - CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; - for (CeedInt emode_in = 0; emode_in < numemodein; emode_in++) { - CeedInt b_in_index = emode_in * nqpts * nnodes; - for (CeedInt emode_out = 0; emode_out < numemodeout; emode_out++) { - CeedInt b_out_index = emode_out * nqpts * nnodes; - CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + CeedSize qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; + for (CeedSize emode_in = 0; emode_in < 
numemodein; emode_in++) { + CeedSize b_in_index = emode_in * nqpts * nnodes; + for (CeedSize emode_out = 0; emode_out < numemodeout; emode_out++) { + CeedSize b_out_index = emode_out * nqpts * nnodes; + CeedSize qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; // Perform the B^T D B operation for this 'chunk' of D (the qf_array) - for (CeedInt j = 0; j < nqpts; j++) { + for (CeedSize j = 0; j < nqpts; j++) { result += B_out[b_out_index + j * nnodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * nnodes + l]; } } // end of emode_out } // end of emode_in - CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l; + CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l; values_array[val_index] = result; } // end of out component } // end of in component diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp index 02b91fdd58..5c0d8b5f4b 100644 --- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp @@ -306,15 +306,15 @@ static int CeedVectorSetArray_Sycl(const CeedVector vec, const CeedMemType mem_t //------------------------------------------------------------------------------ // Set host array to value //------------------------------------------------------------------------------ -static int CeedHostSetValue_Sycl(CeedScalar *h_array, CeedInt length, CeedScalar val) { - for (int i = 0; i < length; i++) h_array[i] = val; +static int CeedHostSetValue_Sycl(CeedScalar *h_array, CeedSize length, CeedScalar val) { + for (CeedSize i = 0; i < length; i++) h_array[i] = val; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Set device array to value //------------------------------------------------------------------------------ -static int CeedDeviceSetValue_Sycl(sycl::queue 
&sycl_queue, CeedScalar *d_array, CeedInt length, CeedScalar val) { +static int CeedDeviceSetValue_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedSize length, CeedScalar val) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.fill(d_array, val, length, {e}); @@ -527,8 +527,8 @@ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *no //------------------------------------------------------------------------------ // Take reciprocal of a vector on host //------------------------------------------------------------------------------ -static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedInt length) { - for (int i = 0; i < length; i++) { +static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedSize length) { + for (CeedSize i = 0; i < length; i++) { if (std::fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i]; } return CEED_ERROR_SUCCESS; @@ -537,7 +537,7 @@ static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedInt length) { //------------------------------------------------------------------------------ // Take reciprocal of a vector on device //------------------------------------------------------------------------------ -static int CeedDeviceReciprocal_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedInt length) { +static int CeedDeviceReciprocal_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedSize length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { @@ -569,15 +569,15 @@ static int CeedVectorReciprocal_Sycl(CeedVector vec) { //------------------------------------------------------------------------------ // Compute x = alpha x on the host //------------------------------------------------------------------------------ -static int CeedHostScale_Sycl(CeedScalar *x_array, CeedScalar alpha, CeedInt length) { - for (int i = 0; i < length; i++) x_array[i] *= alpha; +static int 
CeedHostScale_Sycl(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { + for (CeedSize i = 0; i < length; i++) x_array[i] *= alpha; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute x = alpha x on device //------------------------------------------------------------------------------ -static int CeedDeviceScale_Sycl(sycl::queue &sycl_queue, CeedScalar *x_array, CeedScalar alpha, CeedInt length) { +static int CeedDeviceScale_Sycl(sycl::queue &sycl_queue, CeedScalar *x_array, CeedScalar alpha, CeedSize length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { x_array[i] *= alpha; }); @@ -607,15 +607,15 @@ static int CeedVectorScale_Sycl(CeedVector x, CeedScalar alpha) { //------------------------------------------------------------------------------ // Compute y = alpha x + y on the host //------------------------------------------------------------------------------ -static int CeedHostAXPY_Sycl(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) { - for (int i = 0; i < length; i++) y_array[i] += alpha * x_array[i]; +static int CeedHostAXPY_Sycl(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { + for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i]; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute y = alpha x + y on device //------------------------------------------------------------------------------ -static int CeedDeviceAXPY_Sycl(sycl::queue &sycl_queue, CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) { +static int CeedDeviceAXPY_Sycl(sycl::queue &sycl_queue, CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, 
{e}, [=](sycl::id<1> i) { y_array[i] += alpha * x_array[i]; }); @@ -652,15 +652,15 @@ static int CeedVectorAXPY_Sycl(CeedVector y, CeedScalar alpha, CeedVector x) { //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on the host //------------------------------------------------------------------------------ -static int CeedHostPointwiseMult_Sycl(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) { - for (int i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i]; +static int CeedHostPointwiseMult_Sycl(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { + for (CeedSize i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i]; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on device (impl in .cu file) //------------------------------------------------------------------------------ -static int CeedDevicePointwiseMult_Sycl(sycl::queue &sycl_queue, CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) { +static int CeedDevicePointwiseMult_Sycl(sycl::queue &sycl_queue, CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { w_array[i] = x_array[i] * y_array[i]; }); diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp index d7d0a1cfab..d6e7067eee 100644 --- a/backends/sycl/ceed-sycl-common.sycl.cpp +++ b/backends/sycl/ceed-sycl-common.sycl.cpp @@ -92,6 +92,7 @@ int CeedSetStream_Sycl(Ceed ceed, void *handle) { Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); + CeedCheck(handle, ceed, CEED_ERROR_BACKEND, "Stream handle is null"); sycl::queue *q = static_cast(handle); // Ensure we are 
using the expected device @@ -111,6 +112,17 @@ int CeedSetStream_Sycl(Ceed ceed, void *handle) { delegate_data->sycl_queue = *q; } + // Set queue and context for Ceed Fallback object + Ceed ceed_fallback = NULL; + CeedGetOperatorFallbackCeed(ceed, &ceed_fallback); + if (ceed_fallback) { + Ceed_Sycl *fallback_data; + CeedCallBackend(CeedGetData(ceed_fallback, &fallback_data)); + fallback_data->sycl_device = q->get_device(); + fallback_data->sycl_context = q->get_context(); + fallback_data->sycl_queue = *q; + } + return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl/ceed-sycl-compile.hpp b/backends/sycl/ceed-sycl-compile.hpp index 57ae351d8d..f4a95b8168 100644 --- a/backends/sycl/ceed-sycl-compile.hpp +++ b/backends/sycl/ceed-sycl-compile.hpp @@ -20,4 +20,7 @@ CEED_INTERN int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source const std::map &constants = {}); CEED_INTERN int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, sycl::kernel **sycl_kernel); +CEED_INTERN int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **args); + #endif // _ceed_sycl_compile_h diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp index 03d725b231..845c46b5c5 100644 --- a/backends/sycl/ceed-sycl-compile.sycl.cpp +++ b/backends/sycl/ceed-sycl-compile.sycl.cpp @@ -22,7 +22,7 @@ using ByteVector_t = std::vector; //------------------------------------------------------------------------------ -// +// Add defined constants at the beginning of kernel source //------------------------------------------------------------------------------ static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_source, std::string &jit_source, const std::map &constants = {}) { @@ -77,24 +77,34 @@ static inline int CeedJitCompileSource_Sycl(Ceed ceed, const 
sycl::device &sycl_ // ------------------------------------------------------------------------------ // Load (compile) SPIR-V source and wrap in sycl kernel_bundle -// TODO: determine appropriate flags -// TODO: Error handle lz calls // ------------------------------------------------------------------------------ -static int CeedJitLoadModule_Sycl(const sycl::context &sycl_context, const sycl::device &sycl_device, const ByteVector_t &il_binary, - SyclModule_t **sycl_module) { +static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, const sycl::device &sycl_device, const ByteVector_t &il_binary, + SyclModule_t **sycl_module) { auto lz_context = sycl::get_native(sycl_context); auto lz_device = sycl::get_native(sycl_device); ze_module_desc_t lz_mod_desc = {ZE_STRUCTURE_TYPE_MODULE_DESC, - nullptr, + nullptr, // extension specific structs ZE_MODULE_FORMAT_IL_SPIRV, il_binary.size(), il_binary.data(), " -ze-opt-large-register-file", // flags - nullptr}; // build log + nullptr}; // specialization constants + + ze_module_handle_t lz_module; + ze_module_build_log_handle_t lz_log; + ze_result_t lz_err = zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, &lz_log); + + if (ZE_RESULT_SUCCESS != lz_err) { + size_t log_size = 0; + zeModuleBuildLogGetString(lz_log, &log_size, nullptr); - ze_module_handle_t lz_module; - zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, nullptr); + char *log_message; + CeedCall(CeedCalloc(log_size, &log_message)); + zeModuleBuildLogGetString(lz_log, &log_size, log_message); + + return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to compile Level Zero module:\n%s", log_message); + } // sycl make_ only throws errors for backend mismatch--assume we have vetted this already *sycl_module = new SyclModule_t(sycl::make_kernel_bundle( @@ -119,7 +129,7 @@ int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule ByteVector_t il_binary; CeedCallBackend(CeedJitCompileSource_Sycl(ceed, 
data->sycl_device, jit_source, il_binary, flags)); - CeedCallBackend(CeedJitLoadModule_Sycl(data->sycl_context, data->sycl_device, il_binary, sycl_module)); + CeedCallBackend(CeedLoadModule_Sycl(ceed, data->sycl_context, data->sycl_device, il_binary, sycl_module)); return CEED_ERROR_SUCCESS; } @@ -139,10 +149,38 @@ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::st ze_kernel_desc_t lz_kernel_desc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernel_name.c_str()}; ze_kernel_handle_t lz_kernel; - zeKernelCreate(lz_module, &lz_kernel_desc, &lz_kernel); + ze_result_t lz_err = zeKernelCreate(lz_module, &lz_kernel_desc, &lz_kernel); + + if (ZE_RESULT_SUCCESS != lz_err) { + return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to retrieve kernel from Level Zero module"); + } *sycl_kernel = new sycl::kernel(sycl::make_kernel( {*sycl_module, lz_kernel, sycl::ext::oneapi::level_zero::ownership::transfer}, data->sycl_context)); return CEED_ERROR_SUCCESS; } + +//------------------------------------------------------------------------------ +// Run SYCL kernel for spatial dimension with shared memory +//------------------------------------------------------------------------------ +int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **kernel_args) { + sycl::range<3> local_range(block_size_z, block_size_y, block_size_x); + sycl::range<3> global_range(grid_size * block_size_z, block_size_y, block_size_x); + sycl::nd_range<3> kernel_range(global_range, local_range); + + //----------- + // Order queue + Ceed_Sycl *ceed_Sycl; + CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); + sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier(); + + ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(e); + cgh.set_args(*kernel_args); + cgh.parallel_for(kernel_range, *kernel); + }); + + return 
CEED_ERROR_SUCCESS; +} diff --git a/include/ceed/jit-source/sycl/sycl-gen-templates.h b/include/ceed/jit-source/sycl/sycl-gen-templates.h new file mode 100644 index 0000000000..502ef562c0 --- /dev/null +++ b/include/ceed/jit-source/sycl/sycl-gen-templates.h @@ -0,0 +1,360 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for SYCL backend macro and type definitions for JiT source +#ifndef _ceed_sycl_gen_templates_h +#define _ceed_sycl_gen_templates_h + +#include + +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable +// TODO: Handle FP32 case +typedef atomic_double CeedAtomicScalar; + +//------------------------------------------------------------------------------ +// Load matrices for basis actions +//------------------------------------------------------------------------------ +inline void loadMatrix(const CeedInt N, const CeedScalar* restrict d_B, CeedScalar* restrict B) { + const CeedInt item_id = get_local_linear_id(); + const CeedInt group_size = get_local_size(0) * get_local_size(1) * get_local_size(2); + for (CeedInt i = item_id; i < N; i += group_size) B[i] = d_B[i]; +} + +//------------------------------------------------------------------------------ +// 1D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, offsets provided +//------------------------------------------------------------------------------ +inline void readDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const 
global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && elem < num_elem) { + const CeedInt node = item_id_x; + const CeedInt ind = indices[node + elem * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) { + r_u[comp] = d_u[ind + strides_comp * comp]; + } + } +} + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, strided +//------------------------------------------------------------------------------ +inline void readDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, global const CeedScalar* restrict d_u, + private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && elem < num_elem) { + const CeedInt node = item_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; comp++) { + r_u[comp] = d_u[ind + comp * strides_comp]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, offsets provided +//------------------------------------------------------------------------------ +inline void writeDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && elem < num_elem) { + const CeedInt node = item_id_x; + const CeedInt ind = indices[node + elem * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) + atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], 
memory_order_relaxed, memory_scope_device); + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, strided +//------------------------------------------------------------------------------ +inline void writeDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, private const CeedScalar* restrict r_v, + global CeedScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && elem < num_elem) { + const CeedInt node = item_id_x; + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; comp++) { + d_v[ind + comp * strides_comp] = r_v[comp]; + } + } +} + +//------------------------------------------------------------------------------ +// 2D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, offsets provided +//------------------------------------------------------------------------------ +inline void readDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + const CeedInt node = item_id_x + item_id_y * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp]; + } +} + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, 
strided +//------------------------------------------------------------------------------ +inline void readDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar* restrict d_u, + private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + const CeedInt node = item_id_x + item_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp]; + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, offsets provided +//------------------------------------------------------------------------------ +inline void writeDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + const CeedInt node = item_id_x + item_id_y * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) + atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device); + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, strided +//------------------------------------------------------------------------------ +inline void writeDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt 
strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar* restrict r_v, + global CeedScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + const CeedInt node = item_id_x + item_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[comp]; + } +} + +//------------------------------------------------------------------------------ +// 3D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, offsets provided +//------------------------------------------------------------------------------ +inline void readDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + for (CeedInt z = 0; z < P_1D; ++z) { + const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); + const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + strides_comp * comp]; + } + } +} + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, strided +//------------------------------------------------------------------------------ +inline void readDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt 
strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar* restrict d_u, + private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + for (CeedInt z = 0; z < P_1D; ++z) { + const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> Q-vector, offests provided +//------------------------------------------------------------------------------ +inline void readSliceQuadsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt Q_1D, const CeedInt num_elem, const CeedInt q, + const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) { + const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q); + const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp]; + } +} + +//------------------------------------------------------------------------------ +// E-vector -> Q-vector, strided +//------------------------------------------------------------------------------ +inline void readSliceQuadsStrided3d(const CeedInt num_comp, const CeedInt Q_1D, CeedInt strides_node, CeedInt strides_comp, CeedInt strides_elem, + const CeedInt num_elem, const CeedInt q, const global CeedScalar* restrict d_u, + 
private CeedScalar* restrict r_u) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) { + const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q); + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp]; + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, offsets provided +//------------------------------------------------------------------------------ +inline void writeDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, + const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + for (CeedInt z = 0; z < P_1D; ++z) { + const CeedInt node = item_id_x + item_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; + for (CeedInt comp = 0; comp < num_comp; ++comp) + atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[z + comp * P_1D], memory_order_relaxed, memory_scope_device); + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, strided +//------------------------------------------------------------------------------ +inline void writeDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar* restrict r_v, + global CeedScalar* restrict d_v) { + const CeedInt item_id_x = get_local_id(0); + 
const CeedInt item_id_y = get_local_id(1); + const CeedInt elem = get_global_id(2); + + if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { + for (CeedInt z = 0; z < P_1D; ++z) { + const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); + const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; + } + } +} + +//------------------------------------------------------------------------------ +// 3D collocated derivatives computation +//------------------------------------------------------------------------------ +inline void gradCollo3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar* restrict r_U, + const local CeedScalar* s_G, private CeedScalar* restrict r_V, local CeedScalar* restrict scratch) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + + for (CeedInt comp = 0; comp < num_comp; ++comp) { + if (item_id_x < Q_1D && item_id_y < Q_1D) { + scratch[item_id_x + item_id_y * T_1D] = r_U[q + comp * Q_1D]; + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (item_id_x < Q_1D && item_id_y < Q_1D) { + // X derivative + r_V[comp + 0 * num_comp] = 0.0; + for (CeedInt i = 0; i < Q_1D; ++i) + r_V[comp + 0 * num_comp] += s_G[i + item_id_x * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative) + + // Y derivative + r_V[comp + 1 * num_comp] = 0.0; + for (CeedInt i = 0; i < Q_1D; ++i) + r_V[comp + 1 * num_comp] += s_G[i + item_id_y * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative) + + // Z derivative + r_V[comp + 2 * num_comp] = 0.0; + for (CeedInt i = 0; i < Q_1D; ++i) r_V[comp + 2 * num_comp] += s_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; // Contract z direction (Z derivative) + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + } +} + +//------------------------------------------------------------------------------ 
+// 3D collocated derivatives transpose +//------------------------------------------------------------------------------ +inline void gradColloTranspose3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar* restrict r_U, + const local CeedScalar* restrict s_G, private CeedScalar* restrict r_V, local CeedScalar* restrict scratch) { + const CeedInt item_id_x = get_local_id(0); + const CeedInt item_id_y = get_local_id(1); + + for (CeedInt comp = 0; comp < num_comp; ++comp) { + // X derivative + if (item_id_x < Q_1D && item_id_y < Q_1D) { + scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 0 * num_comp]; + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (item_id_x < Q_1D && item_id_y < Q_1D) { + for (CeedInt i = 0; i < Q_1D; ++i) + r_V[q + comp * Q_1D] += s_G[item_id_x + i * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative) + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + // Y derivative + if (item_id_x < Q_1D && item_id_y < Q_1D) { + scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 1 * num_comp]; + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + if (item_id_x < Q_1D && item_id_y < Q_1D) { + for (CeedInt i = 0; i < Q_1D; ++i) + r_V[q + comp * Q_1D] += s_G[item_id_y + i * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative) + } + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + // Z derivative + if (item_id_x < Q_1D && item_id_y < Q_1D) { + for (CeedInt i = 0; i < Q_1D; ++i) + r_V[i + comp * Q_1D] += s_G[i + q * Q_1D] * r_U[comp + 2 * num_comp]; // PARTIAL contract z direction (Z derivative) + } + } +} + +#endif diff --git a/include/ceed/jit-source/sycl/sycl-types.h b/include/ceed/jit-source/sycl/sycl-types.h index e81e4cd7c2..c47b333a7a 100644 --- a/include/ceed/jit-source/sycl/sycl-types.h +++ b/include/ceed/jit-source/sycl/sycl-types.h @@ -14,23 +14,26 @@ #define CEED_SYCL_NUMBER_FIELDS 16 +#ifdef __OPENCL_C_VERSION__ typedef struct { - const CeedScalar* 
inputs[CEED_SYCL_NUMBER_FIELDS]; - CeedScalar* outputs[CEED_SYCL_NUMBER_FIELDS]; + global const CeedScalar* inputs[CEED_SYCL_NUMBER_FIELDS]; + global CeedScalar* outputs[CEED_SYCL_NUMBER_FIELDS]; } Fields_Sycl; typedef struct { - CeedInt* inputs[CEED_SYCL_NUMBER_FIELDS]; - CeedInt* outputs[CEED_SYCL_NUMBER_FIELDS]; + global const CeedInt* inputs[CEED_SYCL_NUMBER_FIELDS]; + global CeedInt* outputs[CEED_SYCL_NUMBER_FIELDS]; } FieldsInt_Sycl; +#else +typedef struct { + const CeedScalar* inputs[CEED_SYCL_NUMBER_FIELDS]; + CeedScalar* outputs[CEED_SYCL_NUMBER_FIELDS]; +} Fields_Sycl; -// TODO: Determine if slice should have `__local` qualifier typedef struct { - CeedInt t_id_x; - CeedInt t_id_y; - CeedInt t_id_z; - CeedInt t_id; - CeedScalar* slice; -} SharedData_Sycl; + const CeedInt* inputs[CEED_SYCL_NUMBER_FIELDS]; + CeedInt* outputs[CEED_SYCL_NUMBER_FIELDS]; +} FieldsInt_Sycl; +#endif #endif