diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
index ca4052739c..3edc12e3b7 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp
@@ -9,6 +9,6 @@
 #define _ceed_sycl_gen_operator_build_h
 
 CEED_INTERN int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes);
-CEED_INTERN int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op);
+CEED_INTERN int CeedSyclGenOperatorBuild(CeedOperator op);
 
 #endif  // _ceed_sycl_gen_operator_build_h
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
index 608635ab01..e126df6dd3 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
@@ -41,9 +41,9 @@ extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d
     block_sizes[1]                = thread1d;
     block_sizes[2]                = elems_per_block;
   } else if (dim == 3) {
-    const CeedInt elems_per_block = 1; //thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1);
+    const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1);
     block_sizes[0]                = thread1d;
-    block_sizes[1]                = thread1d * thread1d;
+    block_sizes[1]                = thread1d;
     block_sizes[2]                = elems_per_block;
   }
   return CEED_ERROR_SUCCESS;
@@ -54,11 +54,11 @@ extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d
 // - [ ] Check arguments to device functions reudsed from sycl-shared-basis are correct
 // - [ ] Do kernel jitting!
 //------------------------------------------------------------------------------
-extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
+extern "C" int CeedSyclGenOperatorBuild(CeedOperator op) {
   bool is_setup_done;
   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
   if (is_setup_done) return CEED_ERROR_SUCCESS;
-
+ 
   Ceed ceed;
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   Ceed_Sycl *sycl_data;
@@ -66,9 +66,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
 
   CeedOperator_Sycl_gen *impl;
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  Fields_Sycl             h_B, h_G;
-  FieldsInt_Sycl          h_indices;
-  CeedQFunction           qf;
+  Fields_Sycl h_B, h_G;
+  FieldsInt_Sycl h_indices;
+  CeedQFunction          qf;
   CeedQFunction_Sycl_gen *qf_impl;
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl));
@@ -81,11 +81,11 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-
-  CeedEvalMode              eval_mode;
-  CeedBasis                 basis;
+  
+  CeedEvalMode             eval_mode;
+  CeedBasis                basis;
   CeedBasis_Sycl_shared    *basis_impl;
-  CeedElemRestriction       Erestrict;
+  CeedElemRestriction      Erestrict;
   CeedElemRestriction_Sycl *restr_impl;
 
   // Check for restriction only identity operator
@@ -123,8 +123,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     CeedCallBackend(CeedFree(&sycl_gen_template_source));
   }
 
-  std::string_view  q_function_source(qf_impl->q_function_source);
-  std::string_view  q_function_name(qf_impl->q_function_name);
+  std::string_view q_function_source(qf_impl->q_function_source);
+  std::string_view q_function_name(qf_impl->q_function_name);
   const std::string operator_name = "CeedKernelSyclGenOperator_" + std::string(q_function_name);
 
   // Find dim, P_1d, Q_1d
@@ -199,9 +199,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
       }
     }
   }
-  use_collograd_parallelization = false;  // Enforcing non-collograd mode for 3D threads
 
-  CeedInt block_sizes[3];
+  CeedInt       block_sizes[3];
   CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes));
 
   // Define CEED_Q_VLA
@@ -209,25 +208,14 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
   if (dim != 3 || use_collograd_parallelization) {
     code << "#define CEED_Q_VLA 1\n\n";
   } else {
-    // code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
-    code << "#define CEED_Q_VLA " << 1 << "\n\n";
-  }
-
-  // Determine subgroup size based on supported sizes : Default : 16 (if supported)
-  std::vector allowed_sg_sizes  = sycl_data->sycl_device.get_info<sycl::info::device::sub_group_sizes>();
-  CeedInt     sub_group_size_op = allowed_sg_sizes[allowed_sg_sizes.size() - 1];
-  for (const auto &s : allowed_sg_sizes) {
-    if (s == 16) {
-      sub_group_size_op = s;
-      break;
-    }
+    code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
   }
 
   code << q_function_source;
-
-  // Kernel function
+  
+  // Kernel function 
   code << "\n// -----------------------------------------------------------------------------\n";
-  code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z), intel_reqd_sub_group_size(" << sub_group_size_op << ")))\n";
+  code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z)))\n";
   code << "kernel void " << operator_name << "(";
   code << "const CeedInt num_elem, ";
   code << "global void* ctx, ";
@@ -236,8 +224,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
   code << "global const Fields_Sycl* B, ";
   code << "global const Fields_Sycl* G, ";
   code << "global const CeedScalar * restrict W";
-  code << ") {\n";
-
+  code << ") {\n"; 
+  
   for (CeedInt i = 0; i < num_input_fields; i++) {
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
@@ -248,14 +236,14 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
   for (CeedInt i = 0; i < num_output_fields; i++) {
     code << "  global CeedScalar* d_v_" << i << " = fields->outputs[" << i << "];\n";
   }
-
+  
   // TODO: Convert these to defined constants to save on GRF
   code << "  const CeedInt DIM = " << dim << ";\n";
   code << "  const CeedInt Q_1D = " << Q_1d << ";\n";
 
   const CeedInt scratch_size = block_sizes[0] * block_sizes[1] * block_sizes[2];
   code << "  local CeedScalar scratch[" << scratch_size << "];\n";
-  code << "  local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << (dim > 2 ? "*T_1D" : "") << ";\n";
+  code << "  local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
 
   code << "\n  // -- Input field constants and basis data --\n";
   // Initialize constants, and matrices B and G
@@ -301,14 +289,14 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
           code << "  loadMatrix(Q_1D*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n";
         } else {
           bool has_collo_grad = !!basis_impl->d_collo_grad_1d;
-          h_G.inputs[i]       = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d;
+          h_G.inputs[i]   = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d;
           code << "  local CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
           code << "  loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_in_" + std::to_string(i))) << "*Q_1D, G->inputs[" << i << "], s_G_in_" << i
                << ");\n";
         }
         break;
       case CEED_EVAL_WEIGHT:
-        break;  // No action
+        break;  // No action 
       case CEED_EVAL_DIV:
         break;  // TODO: Not implemented
       case CEED_EVAL_CURL:
@@ -357,10 +345,10 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
           code << "  loadMatrix(Q_1D*Q_1D, G->outputs[" << i << "], s_G_out_" << i << ");\n";
         } else {
           bool has_collo_grad = !!basis_impl->d_collo_grad_1d;
-          h_G.outputs[i]      = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d;
+          h_G.outputs[i]  = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d;
           code << "  local CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
-          code << "  loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" << i
-               << ");\n";
+          code << "  loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_"
+               << i << ");\n";
         }
         break;
       // LCOV_EXCL_START
@@ -393,7 +381,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
 
     // Restriction
     if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
-      code << "    CeedScalar r_u_" << i << "[num_comp_in_" << i << "];\n";
+      code << "    CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n";
 
       bool is_strided;
       CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided));
@@ -405,8 +393,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         code << "    // CompStride: " << comp_stride << "\n";
         CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl));
         h_indices.inputs[i] = restr_impl->d_ind;
-        code << "    readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i
-             << "], d_u_" << i << ", r_u_" << i << ");\n";
+        code << "    readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i
+             << ", num_elem, indices->inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n";
       } else {
         bool has_backend_strides;
         CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides));
@@ -431,23 +419,22 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         }
         break;
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "];\n";
-        code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i
-             << ", r_t_" << i << ", elem_scratch);\n";
+        code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n";
+        code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_"
+             << i << ", r_t_" << i << ", elem_scratch);\n";
         break;
       case CEED_EVAL_GRAD:
         if (use_collograd_parallelization) {
-          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "];\n";
-          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_"
-               << i << ", r_t_" << i << ", elem_scratch);\n";
+          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n";
+          code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i
+               << ", s_B_in_" << i << ", r_t_" << i << ", elem_scratch);\n";
         } else {
           CeedInt P_1d;
           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
           CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
-          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM];\n";
+          code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n";
           code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i
-               << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i
-               << ", r_t_" << i << ", elem_scratch);\n";
+               << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i << ", s_G_in_" << i << ", r_t_" << i << ", elem_scratch);\n";
         }
         break;
       case CEED_EVAL_WEIGHT:
@@ -472,26 +459,25 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
     if (eval_mode == CEED_EVAL_GRAD) {
       if (use_collograd_parallelization) {
         // Accumulator for gradient slices
-        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "];\n";
+        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n";
         code << "    for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n";
-        code << "      //for (CeedInt j = 0; j < Q_1D; ++j) {\n";
-        code << "        //r_tt_" << i << "[j + i*Q_1D] = 0.0;\n";
-        code << "        r_tt_" << i << "[i] = 0.0;\n";
-        code << "      //}\n";
+        code << "      for (CeedInt j = 0; j < Q_1D; ++j) {\n";
+        code << "        r_tt_" << i << "[j + i*Q_1D] = 0.0;\n";
+        code << "      }\n";
         code << "    }\n";
       } else {
-        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM];\n";
+        code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM*Q_1D];\n";
       }
     }
     if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
-      code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "];\n";
+      code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n";
     }
   }
   // We treat quadrature points per slice in 3d to save registers
   if (use_collograd_parallelization) {
     code << "\n    // Note: Using planes of 3D elements\n";
-    code << "    //for (CeedInt q = 0; q < Q_1D; q++) {\n";
-    code << "    q = get_local_id(1) / T_1D;\n    {\n";
+    code << "    __attribute__((opencl_unroll_hint))\n";
+    code << "    for (CeedInt q = 0; q < Q_1D; q++) {\n";
     code << "      // -- Input fields --\n";
     for (CeedInt i = 0; i < num_input_fields; i++) {
       code << "      // ---- Input field " << i << " ----\n";
@@ -529,8 +515,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
             }
             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
             code << "      readSliceQuadsStrided"
-                 << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i
-                 << ", r_q_" << i << ");\n";
+                 << "3d(num_comp_in_" << i
+                 << ", Q_1D,"
+                 << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i << ", r_q_" << i << ");\n";
           }
           break;
         case CEED_EVAL_INTERP:
@@ -607,8 +594,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
   if (dim != 3 || use_collograd_parallelization) {
     code << "1";
   } else {
-    // code << "Q_1D";
-    code << "1";
+    code << "Q_1D";
   }
   code << ", in, out);\n";
   //--------------------------------------------------
@@ -632,8 +618,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
           code << "      }\n";
           break;
         case CEED_EVAL_GRAD:
-          code << "      gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i
-               << ", elem_scratch);\n";
+          code << "      gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i << ", elem_scratch);\n";
           break;
         case CEED_EVAL_WEIGHT:
           break;  // Should not occur
@@ -663,12 +648,12 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
         code << "    private CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n";
         break;  // No action
       case CEED_EVAL_INTERP:
-        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "];\n";
+        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
         code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i
              << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n";
         break;
       case CEED_EVAL_GRAD:
-        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "];\n";
+        code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
         if (use_collograd_parallelization) {
           code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i
                << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n";
@@ -677,8 +662,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
           CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
           CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
           code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i
-               << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_out_" << i
-               << ", r_v_" << i << ", elem_scratch);\n";
+               << ",P_out_" << i << ", Q_1D, r_tt_" << i << ", s_B_out_" << i << ", s_G_out_" << i << ", r_v_" << i << ", elem_scratch);\n";
         }
         break;
       // LCOV_EXCL_START
@@ -705,8 +689,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
       code << "    // CompStride: " << comp_stride << "\n";
       CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl));
       h_indices.outputs[i] = restr_impl->d_ind;
-      code << "    writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i
-           << "], r_v_" << i << ", d_v_" << i << ");\n";
+      code << "    writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i
+           << ", num_elem, indices->outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n";
     } else {
       bool has_backend_strides;
       CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides));
@@ -727,32 +711,29 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
   code << "// -----------------------------------------------------------------------------\n\n";
 
   // Copy the struct (containing device addresses) from the host to the device
-  sycl::event copy_B       = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1);
-  sycl::event copy_G       = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1);
-  sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1);
+  sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B,impl->B,1);
+  sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G,impl->G,1);
+  sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices,impl->indices,1);
   // These copies can happen while the JIT is being done
-  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices}));
+  CeedCallSycl(ceed,sycl::event::wait_and_throw({copy_B, copy_G, copy_indices}));
 
   // View kernel for debugging
   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
   CeedDebug(ceed, code.str().c_str());
-  // std::cout<<" \n Generated Operator Kernels:\n";
-  // std::cout<<code.str();
 
   std::map<std::string, CeedInt> jit_constants;
-  jit_constants["T_1D"]         = block_sizes[0];
+  jit_constants["T_1D"] = block_sizes[0];
   jit_constants["GROUP_SIZE_X"] = block_sizes[0];
   jit_constants["GROUP_SIZE_Y"] = block_sizes[1];
   jit_constants["GROUP_SIZE_Z"] = block_sizes[2];
 
   // Compile kernel into a kernel bundle
-  CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module, jit_constants));
-
+  CeedCallBackend(CeedJitBuildModule_Sycl(ceed, code.str(), &impl->sycl_module,jit_constants));
+  
   // Load kernel function
-  CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op));
-
+  CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op));
+  
   CeedCallBackend(CeedOperatorSetSetupDone(op));
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
index 80a63a4b0a..08dd747a11 100644
--- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
@@ -33,7 +33,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
   CeedCallBackend(CeedGetData(ceed, &ceed_Sycl));
   CeedOperator_Sycl_gen *impl;
   CeedCallBackend(CeedOperatorGetData(op, &impl));
-  CeedQFunction           qf;
+  CeedQFunction          qf;
   CeedQFunction_Sycl_gen *qf_impl;
   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
   CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl));
@@ -47,7 +47,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
   CeedVector   vec, output_vecs[CEED_FIELD_MAX] = {};
 
   // Creation of the operator
-  CeedCallBackend(CeedOperatorBuildKernel_Sycl_gen(op));
+  CeedCallBackend(CeedSyclGenOperatorBuild(op));
 
   // Input vectors
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -92,36 +92,38 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_impl->d_c));
 
   // Apply operator
-  const CeedInt dim  = impl->dim;
-  const CeedInt Q_1d = impl->Q_1d;
-  const CeedInt P_1d = impl->max_P_1d;
+  const CeedInt dim       = impl->dim;
+  const CeedInt Q_1d      = impl->Q_1d;
+  const CeedInt P_1d      = impl->max_P_1d;
+  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
   CeedInt       block_sizes[3], grid = 0;
   CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes));
   if (dim == 1) {
-    grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
+    grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+    //CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
   } else if (dim == 2) {
-    grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
+    grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+    //CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
   } else if (dim == 3) {
-    grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
+    grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
+    //CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs));
   }
 
-  sycl::range<3>    local_range(block_sizes[2], block_sizes[1], block_sizes[0]);
-  sycl::range<3>    global_range(grid * block_sizes[2], block_sizes[1], block_sizes[0]);
-  sycl::nd_range<3> kernel_range(global_range, local_range);
-
+  sycl::range<3> local_range(block_sizes[2], block_sizes[1], block_sizes[0]);
+  sycl::range<3> global_range(grid*block_sizes[2], block_sizes[1], block_sizes[0]);
+  sycl::nd_range<3> kernel_range(global_range,local_range);
+  
   //-----------
-  // Order queue
+  //Order queue
   sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
-
-  CeedCallSycl(ceed, ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
+  
+  CeedCallSycl(ceed,
+  ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){
     cgh.depends_on(e);
     cgh.set_args(num_elem, qf_impl->d_c, impl->indices, impl->fields, impl->B, impl->G, impl->W);
-    cgh.parallel_for(kernel_range, *(impl->op));
+    cgh.parallel_for(kernel_range,*(impl->op));
   }));
-  CeedCallSycl(ceed, ceed_Sycl->sycl_queue.wait_and_throw());
+  CeedCallSycl(ceed,ceed_Sycl->sycl_queue.wait_and_throw());
 
   // Restore input arrays
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -174,15 +176,14 @@ int CeedOperatorCreate_Sycl_gen(CeedOperator op) {
   CeedCallBackend(CeedCalloc(1, &impl));
   CeedCallBackend(CeedOperatorSetData(op, impl));
 
-  impl->indices = sycl::malloc_device<FieldsInt_Sycl>(1, sycl_data->sycl_device, sycl_data->sycl_context);
-  impl->fields  = sycl::malloc_host<Fields_Sycl>(1, sycl_data->sycl_context);
-  impl->B       = sycl::malloc_device<Fields_Sycl>(1, sycl_data->sycl_device, sycl_data->sycl_context);
-  impl->G       = sycl::malloc_device<Fields_Sycl>(1, sycl_data->sycl_device, sycl_data->sycl_context);
-  impl->W       = sycl::malloc_device<CeedScalar>(1, sycl_data->sycl_device, sycl_data->sycl_context);
+  impl->indices = sycl::malloc_device<FieldsInt_Sycl>(1,sycl_data->sycl_device,sycl_data->sycl_context);
+  impl->fields = sycl::malloc_host<Fields_Sycl>(1,sycl_data->sycl_context);
+  impl->B = sycl::malloc_device<Fields_Sycl>(1,sycl_data->sycl_device,sycl_data->sycl_context);
+  impl->G = sycl::malloc_device<Fields_Sycl>(1,sycl_data->sycl_device,sycl_data->sycl_context);
+  impl->W = sycl::malloc_device<CeedScalar>(1,sycl_data->sycl_device,sycl_data->sycl_context);
 
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl_gen));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl_gen));
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
index 951474476a..e180638205 100644
--- a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp
@@ -36,7 +36,7 @@ static int CeedQFunctionDestroy_Sycl_gen(CeedQFunction qf) {
   // Wait for all work to finish before freeing memory
   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
   CeedCallSycl(ceed, sycl::free(impl->d_c, data->sycl_context));
-
+  
   CeedCallBackend(CeedFree(&impl->q_function_source));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
@@ -67,5 +67,4 @@ int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf) {
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl_gen));
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-gen/ceed-sycl-gen.hpp b/backends/sycl-gen/ceed-sycl-gen.hpp
index e4e716393c..c65a9fe3b7 100644
--- a/backends/sycl-gen/ceed-sycl-gen.hpp
+++ b/backends/sycl-gen/ceed-sycl-gen.hpp
@@ -16,16 +16,16 @@
 #include "../sycl/ceed-sycl-compile.hpp"
 
 typedef struct {
-  CeedInt         dim;
-  CeedInt         Q_1d;
-  CeedInt         max_P_1d;
-  SyclModule_t   *sycl_module;
-  sycl::kernel   *op;
+  CeedInt       dim;
+  CeedInt       Q_1d;
+  CeedInt       max_P_1d;
+  SyclModule_t* sycl_module;
+  sycl::kernel* op;
   FieldsInt_Sycl *indices;
   Fields_Sycl    *fields;
   Fields_Sycl    *B;
   Fields_Sycl    *G;
-  CeedScalar     *W;
+  CeedScalar   *W;
 } CeedOperator_Sycl_gen;
 
 typedef struct {
diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
index a5084ff6b7..c84133de26 100644
--- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
+++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp
@@ -10,16 +10,16 @@
 #include <ceed/backend.h>
 #include <ceed/ceed.h>
 
+#include <string.h>
 #include <string>
 #include <string_view>
-#include <string.h>
 
 //------------------------------------------------------------------------------
 // Backend init
 //------------------------------------------------------------------------------
 static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
   char *resource_root;
-  CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":device_id=", &resource_root));
+  CeedCallBackend(CeedSyclGetResourceRoot(ceed, resource, &resource_root));
   if (strcmp(resource_root, "/gpu/sycl") && strcmp(resource_root, "/gpu/sycl/gen")) {
     // LCOV_EXCL_START
     return CeedError(ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource);
@@ -30,7 +30,7 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
   Ceed_Sycl *data;
   CeedCallBackend(CeedCalloc(1, &data));
   CeedCallBackend(CeedSetData(ceed, data));
-  CeedCallBackend(CeedInit_Sycl(ceed, resource));
+  CeedCallBackend(CeedSyclInit(ceed, resource));
 
   Ceed ceed_shared;
   CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared));
@@ -55,5 +55,4 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
 // Register backend
 //------------------------------------------------------------------------------
 CEED_INTERN int CeedRegister_Sycl_Gen(void) { return CeedRegister("/gpu/sycl/gen", CeedInit_Sycl_gen, 20); }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
index 601ebdda95..80d17788cb 100644
--- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
@@ -15,10 +15,8 @@
 #include "../sycl/ceed-sycl-compile.hpp"
 #include "ceed-sycl-ref.hpp"
 
-template <int>
-class CeedBasisSyclInterp;
-template <int>
-class CeedBasisSyclGrad;
+template<int> class CeedBasisSyclInterp;
+template<int> class CeedBasisSyclGrad;
 class CeedBasisSyclWeight;
 
 class CeedBasisSyclInterpNT;
@@ -35,11 +33,11 @@ static constexpr SpecID BASIS_Q_1D_ID;
 //------------------------------------------------------------------------------
 // Interpolation kernel - tensor
 //------------------------------------------------------------------------------
-template <int transpose>
-static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
-                                     const CeedScalar *u, CeedScalar *v) {
-  const CeedInt     buf_len   = impl->buf_len;
-  const CeedInt     op_len    = impl->op_len;
+template<int transpose>
+static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t& sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, const CeedScalar *u,
+                                     CeedScalar *v) {
+  const CeedInt buf_len       = impl->buf_len;
+  const CeedInt op_len        = impl->op_len;
   const CeedScalar *interp_1d = impl->d_interp_1d;
 
   const sycl::device &sycl_device         = sycl_queue.get_device();
@@ -65,8 +63,8 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t
       const CeedInt P_1d     = kh.get_specialization_constant<BASIS_P_1D_ID>();
       const CeedInt Q_1d     = kh.get_specialization_constant<BASIS_Q_1D_ID>();
       //-------------------------------------------------------------->
-      const CeedInt num_nodes     = CeedIntPow(P_1d, dim);
-      const CeedInt num_qpts      = CeedIntPow(Q_1d, dim);
+      const CeedInt num_nodes     = CeedIntPow(P_1d,dim);
+      const CeedInt num_qpts      = CeedIntPow(Q_1d,dim);
       const CeedInt P             = transpose ? Q_1d : P_1d;
       const CeedInt Q             = transpose ? P_1d : Q_1d;
       const CeedInt stride_0      = transpose ? 1 : P_1d;
@@ -136,16 +134,16 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t
 //------------------------------------------------------------------------------
 // Gradient kernel - tensor
 //------------------------------------------------------------------------------
-template <int transpose>
-static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
-                                   const CeedScalar *u, CeedScalar *v) {
-  const CeedInt     buf_len   = impl->buf_len;
-  const CeedInt     op_len    = impl->op_len;
+template<int transpose>
+static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t& sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, const CeedScalar *u,
+                                   CeedScalar *v) {  
+  const CeedInt buf_len       = impl->buf_len;
+  const CeedInt op_len        = impl->op_len;
   const CeedScalar *interp_1d = impl->d_interp_1d;
   const CeedScalar *grad_1d   = impl->d_grad_1d;
 
-  const sycl::device &sycl_device     = sycl_queue.get_device();
-  const CeedInt       work_group_size = 32;
+  const sycl::device &sycl_device         = sycl_queue.get_device();
+  const CeedInt       work_group_size     = 32;
   sycl::range<1>      local_range(work_group_size);
   sycl::range<1>      global_range(num_elem * work_group_size);
   sycl::nd_range<1>   kernel_range(global_range, local_range);
@@ -166,22 +164,23 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &
       const CeedInt P_1d     = kh.get_specialization_constant<BASIS_P_1D_ID>();
       const CeedInt Q_1d     = kh.get_specialization_constant<BASIS_Q_1D_ID>();
       //-------------------------------------------------------------->
-      const CeedInt num_nodes     = CeedIntPow(P_1d, dim);
-      const CeedInt num_qpts      = CeedIntPow(Q_1d, dim);
-      const CeedInt P             = transpose ? Q_1d : P_1d;
-      const CeedInt Q             = transpose ? P_1d : Q_1d;
-      const CeedInt stride_0      = transpose ? 1 : P_1d;
-      const CeedInt stride_1      = transpose ? P_1d : 1;
-      const CeedInt u_stride      = transpose ? num_qpts : num_nodes;
-      const CeedInt v_stride      = transpose ? num_nodes : num_qpts;
+      const CeedInt num_nodes = CeedIntPow(P_1d,dim);
+      const CeedInt num_qpts  = CeedIntPow(Q_1d,dim);
+      const CeedInt P = transpose ? Q_1d : P_1d;
+      const CeedInt Q = transpose ? P_1d : Q_1d;
+      const CeedInt stride_0 = transpose ? 1 : P_1d;
+      const CeedInt stride_1 = transpose ? P_1d : 1;
+      const CeedInt u_stride = transpose ? num_qpts : num_nodes;
+      const CeedInt v_stride = transpose ? num_nodes : num_qpts;
       const CeedInt u_comp_stride = num_elem * u_stride;
       const CeedInt v_comp_stride = num_elem * v_stride;
       const CeedInt u_dim_stride  = transpose ? num_elem * num_qpts * num_comp : 0;
       const CeedInt v_dim_stride  = transpose ? 0 : num_elem * num_qpts * num_comp;
-      sycl::group   work_group    = work_item.get_group();
-      const CeedInt i             = work_item.get_local_linear_id();
-      const CeedInt group_size    = work_group.get_local_linear_range();
-      const CeedInt elem          = work_group.get_group_linear_id();
+      
+      sycl::group   work_group = work_item.get_group();
+      const CeedInt i          = work_item.get_local_linear_id();
+      const CeedInt group_size = work_group.get_local_linear_range();
+      const CeedInt elem       = work_group.get_group_linear_id();
 
       CeedScalar *s_interp_1d = s_mem.get_pointer();
       CeedScalar *s_grad_1d   = s_interp_1d + P * Q;
@@ -204,7 +203,7 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &
           for (CeedInt dim_2 = 0; dim_2 < dim; dim_2++) {
             // Use older version of sycl workgroup barrier for performance reasons
             // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
-            // sycl::group_barrier(work_group);
+            //sycl::group_barrier(work_group);
             work_item.barrier(sycl::access::fence_space::local_space);
 
             pre /= P;
@@ -292,14 +291,14 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      if (transpose) {
+      if(transpose) {
         CeedCallBackend(CeedBasisApplyInterp_Sycl<CEED_TRANSPOSE>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
       } else {
         CeedCallBackend(CeedBasisApplyInterp_Sycl<CEED_NOTRANSPOSE>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
       }
     } break;
     case CEED_EVAL_GRAD: {
-      if (transpose) {
+      if(transpose) {
         CeedCallBackend(CeedBasisApplyGrad_Sycl<1>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
       } else {
         CeedCallBackend(CeedBasisApplyGrad_Sycl<0>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
@@ -582,19 +581,23 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
 
   CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
-  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, {e});
+  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d,{e});
 
   const CeedInt interp_length = Q_1d * P_1d;
   CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
-  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, {e});
+  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length,{e});
 
   CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
-  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, {e});
+  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length,{e});
 
   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
 
-  std::vector<sycl::kernel_id> kernel_ids = {sycl::get_kernel_id<CeedBasisSyclInterp<1>>(), sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
-                                             sycl::get_kernel_id<CeedBasisSyclGrad<1>>(), sycl::get_kernel_id<CeedBasisSyclGrad<0>>()};
+  std::vector<sycl::kernel_id> kernel_ids = {
+    sycl::get_kernel_id<CeedBasisSyclInterp<1>>(),
+    sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
+    sycl::get_kernel_id<CeedBasisSyclGrad<1>>(),
+    sycl::get_kernel_id<CeedBasisSyclGrad<0>>()
+  };
 
   sycl::kernel_bundle<sycl::bundle_state::input> input_bundle = sycl::get_kernel_bundle<sycl::bundle_state::input>(data->sycl_context, kernel_ids);
   input_bundle.set_specialization_constant<BASIS_DIM_ID>(dim);
@@ -602,7 +605,7 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
   input_bundle.set_specialization_constant<BASIS_Q_1D_ID>(Q_1d);
   input_bundle.set_specialization_constant<BASIS_P_1D_ID>(P_1d);
 
-  CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));
+  CeedCallSycl(ceed,impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));
 
   CeedCallBackend(CeedBasisSetData(basis, impl));
 
@@ -636,15 +639,15 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
 
   CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
-  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, {e});
+  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts,{e});
 
   const CeedInt interp_length = num_qpts * num_nodes;
   CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
-  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, {e});
+  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length,{e});
 
   const CeedInt grad_length = num_qpts * num_nodes * dim;
   CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
-  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, {e});
+  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length,{e});
 
   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
 
@@ -655,5 +658,4 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl));
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
index 285d199ddd..6b97689dee 100644
--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
@@ -348,7 +348,7 @@ static inline int CeedOperatorRestoreInputs_Sycl(CeedInt numinputfields, CeedQFu
     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode));
     if (emode == CEED_EVAL_WEIGHT) {  // Skip
     } else {
-      if (!impl->evecs[i]) {  // This was a skiprestrict case
+      if (!impl->evecs[i]) {          // This was a skiprestrict case
         CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec));
         CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&edata[i]));
       } else {
@@ -481,8 +481,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op,
   Ceed        ceed, ceedparent;
   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedGetOperatorFallbackParentCeed(ceed, &ceedparent));
-  ceedparent                            = ceedparent ? ceedparent : ceed;
-  CeedScalar *edata[2 * CEED_FIELD_MAX] = {NULL};
+  ceedparent = ceedparent ? ceedparent : ceed;
+  CeedScalar *edata[2 * CEED_FIELD_MAX];
 
   // Setup
   CeedCallBackend(CeedOperatorSetup_Sycl(op));
@@ -787,7 +787,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const
   std::vector<sycl::event> copy_events;
   if (evalNone) {
     CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity));
-    for (CeedSize i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0;
+    for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0;
     CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device<CeedScalar>(iLen, sycl_data->sycl_device, sycl_data->sycl_context));
     sycl::event identity_copy = sycl_data->sycl_queue.copy<CeedScalar>(identity, diag->d_identity, iLen, {e});
     copy_events.push_back(identity_copy);
@@ -838,11 +838,11 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool pointBlock, const CeedInt nelem, const CeedOperatorDiag_Sycl *diag,
                                            const CeedScalar *assembledqfarray, CeedScalar *elemdiagarray) {
-  const CeedSize nnodes      = diag->nnodes;
-  const CeedSize nqpts       = diag->nqpts;
-  const CeedSize ncomp       = diag->ncomp;
-  const CeedSize numemodein  = diag->numemodein;
-  const CeedSize numemodeout = diag->numemodeout;
+  const CeedInt nnodes      = diag->nnodes;
+  const CeedInt nqpts       = diag->nqpts;
+  const CeedInt ncomp       = diag->ncomp;
+  const CeedInt numemodein  = diag->numemodein;
+  const CeedInt numemodeout = diag->numemodeout;
 
   const CeedScalar   *identity  = diag->d_identity;
   const CeedScalar   *interpin  = diag->d_interpin;
@@ -864,23 +864,23 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool p
     // Each element
     CeedInt dout = -1;
     // Each basis eval mode pair
-    for (CeedSize eout = 0; eout < numemodeout; eout++) {
+    for (CeedInt eout = 0; eout < numemodeout; eout++) {
       const CeedScalar *bt = NULL;
       if (emodeout[eout] == CEED_EVAL_GRAD) ++dout;
       CeedOperatorGetBasisPointer_Sycl(&bt, emodeout[eout], identity, interpout, &gradout[dout * nqpts * nnodes]);
       CeedInt din = -1;
-      for (CeedSize ein = 0; ein < numemodein; ein++) {
+      for (CeedInt ein = 0; ein < numemodein; ein++) {
         const CeedScalar *b = NULL;
         if (emodein[ein] == CEED_EVAL_GRAD) ++din;
         CeedOperatorGetBasisPointer_Sycl(&b, emodein[ein], identity, interpin, &gradin[din * nqpts * nnodes]);
         // Each component
-        for (CeedSize compOut = 0; compOut < ncomp; compOut++) {
+        for (CeedInt compOut = 0; compOut < ncomp; compOut++) {
           // Each qpoint/node pair
           if (pointBlock) {
             // Point Block Diagonal
             for (CeedInt compIn = 0; compIn < ncomp; compIn++) {
               CeedScalar evalue = 0.0;
-              for (CeedSize q = 0; q < nqpts; q++) {
+              for (CeedInt q = 0; q < nqpts; q++) {
                 const CeedScalar qfvalue =
                     assembledqfarray[((((ein * ncomp + compIn) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q];
                 evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid];
@@ -890,7 +890,7 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool p
           } else {
             // Diagonal Only
             CeedScalar evalue = 0.0;
-            for (CeedSize q = 0; q < nqpts; q++) {
+            for (CeedInt q = 0; q < nqpts; q++) {
               const CeedScalar qfvalue =
                   assembledqfarray[((((ein * ncomp + compOut) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q];
               evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid];
@@ -916,8 +916,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVec
   CeedCallBackend(CeedGetData(ceed, &sycl_data));
 
   // Assemble QFunction
-  CeedVector          assembledqf = NULL;
-  CeedElemRestriction rstr        = NULL;
+  CeedVector          assembledqf;
+  CeedElemRestriction rstr;
   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr));
 
@@ -1187,23 +1187,23 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp
   // TODO: expand to more general cases
   CeedOperatorAssemble_Sycl *asmb        = impl->asmb;
   const CeedInt              nelem       = asmb->nelem;
-  const CeedSize             nnodes      = asmb->nnodes;
-  const CeedSize             ncomp       = asmb->ncomp;
-  const CeedSize             nqpts       = asmb->nqpts;
-  const CeedSize             numemodein  = asmb->numemodein;
-  const CeedSize             numemodeout = asmb->numemodeout;
+  const CeedInt              nnodes      = asmb->nnodes;
+  const CeedInt              ncomp       = asmb->ncomp;
+  const CeedInt              nqpts       = asmb->nqpts;
+  const CeedInt              numemodein  = asmb->numemodein;
+  const CeedInt              numemodeout = asmb->numemodeout;
 
   // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element,
   // comp_in, comp_out, node_row, node_col
-  const CeedSize comp_out_stride = nnodes * nnodes;
-  const CeedSize comp_in_stride  = comp_out_stride * ncomp;
-  const CeedSize e_stride        = comp_in_stride * ncomp;
+  const CeedInt comp_out_stride = nnodes * nnodes;
+  const CeedInt comp_in_stride  = comp_out_stride * ncomp;
+  const CeedInt e_stride        = comp_in_stride * ncomp;
   // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt
-  const CeedSize qe_stride         = nqpts;
-  const CeedSize qcomp_out_stride  = nelem * qe_stride;
-  const CeedSize qemode_out_stride = qcomp_out_stride * ncomp;
-  const CeedSize qcomp_in_stride   = qemode_out_stride * numemodeout;
-  const CeedSize qemode_in_stride  = qcomp_in_stride * ncomp;
+  const CeedInt qe_stride         = nqpts;
+  const CeedInt qcomp_out_stride  = nelem * qe_stride;
+  const CeedInt qemode_out_stride = qcomp_out_stride * ncomp;
+  const CeedInt qcomp_in_stride   = qemode_out_stride * numemodeout;
+  const CeedInt qemode_in_stride  = qcomp_in_stride * ncomp;
 
   CeedScalar *B_in, *B_out;
   B_in                       = asmb->d_B_in;
@@ -1220,22 +1220,22 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp
     const int l = idx.get(1);  // The output column index of each B^TDB operation
     const int i = idx.get(2);  // The output row index of each B^TDB operation
                                // such that we have (Bout^T)_ij D_jk Bin_kl = C_il
-    for (CeedSize comp_in = 0; comp_in < ncomp; comp_in++) {
-      for (CeedSize comp_out = 0; comp_out < ncomp; comp_out++) {
+    for (CeedInt comp_in = 0; comp_in < ncomp; comp_in++) {
+      for (CeedInt comp_out = 0; comp_out < ncomp; comp_out++) {
         CeedScalar result        = 0.0;
-        CeedSize   qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
-        for (CeedSize emode_in = 0; emode_in < numemodein; emode_in++) {
-          CeedSize b_in_index = emode_in * nqpts * nnodes;
-          for (CeedSize emode_out = 0; emode_out < numemodeout; emode_out++) {
-            CeedSize b_out_index = emode_out * nqpts * nnodes;
-            CeedSize qf_index    = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
+        CeedInt    qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e;
+        for (CeedInt emode_in = 0; emode_in < numemodein; emode_in++) {
+          CeedInt b_in_index = emode_in * nqpts * nnodes;
+          for (CeedInt emode_out = 0; emode_out < numemodeout; emode_out++) {
+            CeedInt b_out_index = emode_out * nqpts * nnodes;
+            CeedInt qf_index    = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in;
             // Perform the B^T D B operation for this 'chunk' of D (the qf_array)
-            for (CeedSize j = 0; j < nqpts; j++) {
+            for (CeedInt j = 0; j < nqpts; j++) {
               result += B_out[b_out_index + j * nnodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * nnodes + l];
             }
           }  // end of emode_out
         }    // end of emode_in
-        CeedSize val_index      = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l;
+        CeedInt val_index       = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l;
         values_array[val_index] = result;
       }  // end of out component
     }    // end of in component
@@ -1343,8 +1343,8 @@ static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, Ceed
   }
 
   // Assemble QFunction
-  CeedVector          assembled_qf = NULL;
-  CeedElemRestriction rstr_q       = NULL;
+  CeedVector          assembled_qf;
+  CeedElemRestriction rstr_q;
   CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE));
   CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q));
   CeedScalar *values_array;
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
index b53d46a71f..d23cbe36ee 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp
@@ -8,6 +8,6 @@
 #ifndef _ceed_sycl_qfunction_load_hpp
 #define _ceed_sycl_qfunction_load_hpp
 
-CEED_INTERN int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf);
+CEED_INTERN int CeedSyclBuildQFunction(CeedQFunction qf);
 
 #endif  // _ceed_sycl_qfunction_load_hpp
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
index 35b3699518..c1cbae731f 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
@@ -26,7 +26,7 @@
 //
 // TODO: Refactor
 //------------------------------------------------------------------------------
-extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
+extern "C" int CeedSyclBuildQFunction(CeedQFunction qf) {
   CeedQFunction_Sycl* impl;
   CeedCallBackend(CeedQFunctionGetData(qf, (void**)&impl));
   // QFunction is built
@@ -60,17 +60,17 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   CeedCallBackend(CeedQFunctionGetKernelName(qf, &qfunction_name));
 
   char* qfunction_source;
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n");
+  CeedDebug256(ceed, 2, "----- Loading QFunction User Source -----\n");
   CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &qfunction_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n");
+  CeedDebug256(ceed, 2, "----- Loading QFunction User Source Complete! -----\n");
 
   char* read_write_kernel_path;
   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-ref-qfunction.h", &read_write_kernel_path));
 
   char* read_write_kernel_source;
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n");
+  CeedDebug256(ceed, 2, "----- Loading QFunction Read/Write Kernel Source -----\n");
   CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n");
+  CeedDebug256(ceed, 2, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n");
 
   std::string_view  qf_name_view(qfunction_name);
   std::string_view  qf_source_view(qfunction_source);
@@ -86,8 +86,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   // Kernel function
   // Here we are fixing a lower sub-group size value to avoid register spills
   // This needs to be revisited if all qfunctions require this.
-  code << "__attribute__((intel_reqd_sub_group_size(" << SUB_GROUP_SIZE_QF << "))) __kernel void " << kernel_name
-       << "(__global void *ctx, CeedInt Q,\n";
+  code << "__attribute__((intel_reqd_sub_group_size(" << SUB_GROUP_SIZE_QF << "))) __kernel void " << kernel_name << "(__global void *ctx, CeedInt Q,\n";
 
   // OpenCL doesn't allow for structs with pointers.
   // We will need to pass all of the arguments individually.
@@ -158,12 +157,12 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
   code << "}\n";
 
   // View kernel for debugging
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated QFunction Kernels:\n");
+  CeedDebug256(ceed, 2, "Generated QFunction Kernels:\n");
   CeedDebug(ceed, code.str().c_str());
 
   // Compile kernel
-  CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module));
-  CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, kernel_name, &impl->QFunction));
+  CeedCallBackend(CeedJitBuildModule_Sycl(ceed, code.str(), &impl->sycl_module));
+  CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, kernel_name, &impl->QFunction));
 
   // Cleanup
   CeedCallBackend(CeedFree(&qfunction_source));
@@ -172,5 +171,4 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
 
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
index 10e090b58c..6822c333ba 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
@@ -28,7 +28,7 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C
   CeedCallBackend(CeedQFunctionGetData(qf, &impl));
 
   // Build and compile kernel, if not done
-  if (!impl->QFunction) CeedCallBackend(CeedQFunctionBuildKernel_Sycl(qf));
+  if (!impl->QFunction) CeedCallBackend(CeedSyclBuildQFunction(qf));
 
   Ceed ceed;
   CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed));
@@ -80,8 +80,8 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C
     // Hard-coding the work-group size for now
     // We could use the Level Zero API to query and set an appropriate size in future
     // Equivalent of CUDA Occupancy Calculator
-    int               wg_size   = WG_SIZE_QF;
-    sycl::range<1>    rounded_Q = ((Q + (wg_size - 1)) / wg_size) * wg_size;
+    int wg_size = WG_SIZE_QF;
+    sycl::range<1> rounded_Q = ((Q + (wg_size- 1)) / wg_size) * wg_size;
     sycl::nd_range<1> kernel_range(rounded_Q, wg_size);
     cgh.parallel_for(kernel_range, *(impl->QFunction));
   });
@@ -139,5 +139,4 @@ int CeedQFunctionCreate_Sycl(CeedQFunction qf) {
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl));
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
index 0c6d2a0637..b3848d9883 100644
--- a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
@@ -269,7 +269,7 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con
   CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
   CeedQFunctionContext_Sycl *impl;
   CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
-
+  
   Ceed_Sycl *ceedSycl;
   CeedCallBackend(CeedGetData(ceed, &ceedSycl));
 
@@ -399,5 +399,4 @@ int CeedQFunctionContextCreate_Sycl(CeedQFunctionContext ctx) {
 
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp
index 8cdaa3531e..9c9be231c2 100644
--- a/backends/sycl-ref/ceed-sycl-ref.hpp
+++ b/backends/sycl-ref/ceed-sycl-ref.hpp
@@ -44,18 +44,18 @@ typedef struct {
 } CeedElemRestriction_Sycl;
 
 typedef struct {
-  CeedInt       dim;
-  CeedInt       P_1d;
-  CeedInt       Q_1d;
-  CeedInt       num_comp;
-  CeedInt       num_nodes;
-  CeedInt       num_qpts;
-  CeedInt       buf_len;
-  CeedInt       op_len;
+  CeedInt     dim;
+  CeedInt     P_1d;
+  CeedInt     Q_1d;
+  CeedInt     num_comp;
+  CeedInt     num_nodes;
+  CeedInt     num_qpts;
+  CeedInt     buf_len;
+  CeedInt     op_len;
   SyclModule_t *sycl_module;
-  CeedScalar   *d_interp_1d;
-  CeedScalar   *d_grad_1d;
-  CeedScalar   *d_q_weight_1d;
+  CeedScalar *d_interp_1d;
+  CeedScalar *d_grad_1d;
+  CeedScalar *d_q_weight_1d;
 } CeedBasis_Sycl;
 
 typedef struct {
@@ -112,16 +112,25 @@ typedef struct {
   CeedOperatorAssemble_Sycl *asmb;
 } CeedOperator_Sycl;
 
+// CEED_INTERN int CeedSyclGetCublasHandle(Ceed ceed, cublasHandle_t *handle);
+
 CEED_INTERN int CeedVectorCreate_Sycl(CeedSize n, CeedVector vec);
 
+CEED_INTERN int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r);
+
+CEED_INTERN int CeedElemRestrictionCreateBlocked_Sycl(const CeedMemType mem_type, const CeedCopyMode copy_mode, const CeedInt *indices,
+                                                      const CeedElemRestriction res);
+
+CEED_INTERN int CeedBasisApplyElems_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
+                                         const CeedVector u, CeedVector v);
+
+CEED_INTERN int CeedQFunctionApplyElems_Sycl(CeedQFunction qf, const CeedInt Q, const CeedVector *const u, const CeedVector *v);
+
 CEED_INTERN int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                              const CeedScalar *qref_1d, const CeedScalar *qweight_1d, CeedBasis basis);
 
-CEED_INTERN int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp,
-                                       const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis);
-
-CEED_INTERN int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients,
-                                               const CeedInt8 *curl_orients, CeedElemRestriction r);
+CEED_INTERN int CeedBasisCreateH1_Sycl(CeedElemTopology, CeedInt, CeedInt, CeedInt, const CeedScalar *, const CeedScalar *, const CeedScalar *,
+                                       const CeedScalar *, CeedBasis);
 
 CEED_INTERN int CeedQFunctionCreate_Sycl(CeedQFunction qf);
 
diff --git a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
index 65d4fead36..b0e45614f1 100644
--- a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp
@@ -25,25 +25,28 @@ static int CeedGetPreferredMemType_Sycl(CeedMemType *mem_type) {
 //------------------------------------------------------------------------------
 // Backend Init
 //------------------------------------------------------------------------------
-static int CeedInit_Sycl_ref(const char *resource, Ceed ceed) {
+static int CeedInit_Sycl(const char *resource, Ceed ceed) {
   char *resource_root;
-  CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
-  CeedCheck(!std::strcmp(resource_root, "/gpu/sycl/ref") || !std::strcmp(resource_root, "/cpu/sycl/ref"), ceed, CEED_ERROR_BACKEND,
-            "Sycl backend cannot use resource: %s", resource);
+  CeedCallBackend(CeedSyclGetResourceRoot(ceed, resource, &resource_root));
+  if (std::strcmp(resource_root, "/gpu/sycl/ref") && std::strcmp(resource_root, "/cpu/sycl/ref")) {
+    // LCOV_EXCL_START
+    return CeedError(ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource);
+    // LCOV_EXCL_STOP
+  }
   CeedCallBackend(CeedFree(&resource_root));
   CeedCallBackend(CeedSetDeterministic(ceed, true));
 
   Ceed_Sycl *data;
   CeedCallBackend(CeedCalloc(1, &data));
   CeedCallBackend(CeedSetData(ceed, data));
-  CeedCallBackend(CeedInit_Sycl(ceed, resource));
+  CeedCallBackend(CeedSyclInit(ceed, resource));
 
-  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "SetStream", CeedSetStream_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "GetPreferredMemType", CeedGetPreferredMemType_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "VectorCreate", &CeedVectorCreate_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "BasisCreateTensorH1", &CeedBasisCreateTensorH1_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "BasisCreateH1", &CeedBasisCreateH1_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "ElemRestrictionCreate", &CeedElemRestrictionCreate_Sycl));
+  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "ElemRestrictionCreateBlocked", &CeedElemRestrictionCreateBlocked_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", &CeedQFunctionCreate_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionContextCreate", &CeedQFunctionContextCreate_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", &CeedOperatorCreate_Sycl));
@@ -56,9 +59,8 @@ static int CeedInit_Sycl_ref(const char *resource, Ceed ceed) {
 // Backend Register
 //------------------------------------------------------------------------------
 CEED_INTERN int CeedRegister_Sycl(void) {
-  CeedCallBackend(CeedRegister("/gpu/sycl/ref", CeedInit_Sycl_ref, 40));
-  CeedCallBackend(CeedRegister("/cpu/sycl/ref", CeedInit_Sycl_ref, 50));
+  CeedCallBackend(CeedRegister("/gpu/sycl/ref", CeedInit_Sycl, 40));
+  CeedCallBackend(CeedRegister("/cpu/sycl/ref", CeedInit_Sycl, 50));
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
index ef0a1f4c47..4407791703 100644
--- a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp
@@ -196,6 +196,18 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction r, CeedTransposeMod
   return CEED_ERROR_SUCCESS;
 }
 
+//------------------------------------------------------------------------------
+// Blocked not supported
+//------------------------------------------------------------------------------
+int CeedElemRestrictionApplyBlock_Sycl(CeedElemRestriction r, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
+                                       CeedRequest *request) {
+  // LCOV_EXCL_START
+  Ceed ceed;
+  CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
+  return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions");
+  // LCOV_EXCL_STOP
+}
+
 //------------------------------------------------------------------------------
 // Get offsets
 //------------------------------------------------------------------------------
@@ -333,8 +345,7 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const Cee
 //------------------------------------------------------------------------------
 // Create restriction
 //------------------------------------------------------------------------------
-int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients,
-                                   const CeedInt8 *curl_orients, CeedElemRestriction r) {
+int CeedElemRestrictionCreate_Sycl(CeedMemType m_type, CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r) {
   Ceed ceed;
   CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
   Ceed_Sycl *data;
@@ -349,11 +360,6 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedInt strides[3]  = {1, size, elem_size};
   CeedInt comp_stride = 1;
 
-  CeedRestrictionType rstr_type;
-  CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type));
-  CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND,
-            "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented");
-
   // Stride data
   bool is_strided;
   CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided));
@@ -386,7 +392,7 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode,
   CeedCallBackend(CeedElemRestrictionSetELayout(r, layout));
 
   // Set up device indices/offset arrays
-  if (mem_type == CEED_MEM_HOST) {
+  if (m_type == CEED_MEM_HOST) {
     switch (copy_mode) {
       case CEED_OWN_POINTER:
         impl->h_ind_allocated = (CeedInt *)indices;
@@ -414,7 +420,7 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode,
       CeedCallSycl(ceed, copy_event.wait_and_throw());
       CeedCallBackend(CeedElemRestrictionOffset_Sycl(r, indices));
     }
-  } else if (mem_type == CEED_MEM_DEVICE) {
+  } else if (m_type == CEED_MEM_DEVICE) {
     switch (copy_mode) {
       case CEED_COPY_VALUES:
         if (indices != NULL) {
@@ -453,9 +459,18 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode,
 
   // Register backend functions
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Sycl));
-  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Sycl));
-  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Sycl));
+  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyBlock", CeedElemRestrictionApplyBlock_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl));
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Sycl));
   return CEED_ERROR_SUCCESS;
 }
+
+//------------------------------------------------------------------------------
+// Blocked not supported
+//------------------------------------------------------------------------------
+int CeedElemRestrictionCreateBlocked_Sycl(const CeedMemType m_type, const CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r) {
+  Ceed ceed;
+  CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed));
+  return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions");
+}
+//------------------------------------------------------------------------------
diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
index 5c0d8b5f4b..d806e37e0a 100644
--- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
+++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp
@@ -84,7 +84,11 @@ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) {
   Ceed_Sycl *data;
   CeedCallBackend(CeedGetData(ceed, &data));
 
-  CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
+      if (!impl->d_array) {
+    // LCOV_EXCL_START
+    return CeedError(ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
+    // LCOV_EXCL_STOP
+  }
 
   CeedSize length;
   CeedCallBackend(CeedVectorGetLength(vec, &length));
@@ -306,15 +310,15 @@ static int CeedVectorSetArray_Sycl(const CeedVector vec, const CeedMemType mem_t
 //------------------------------------------------------------------------------
 // Set host array to value
 //------------------------------------------------------------------------------
-static int CeedHostSetValue_Sycl(CeedScalar *h_array, CeedSize length, CeedScalar val) {
-  for (CeedSize i = 0; i < length; i++) h_array[i] = val;
+static int CeedHostSetValue_Sycl(CeedScalar *h_array, CeedInt length, CeedScalar val) {
+  for (int i = 0; i < length; i++) h_array[i] = val;
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Set device array to value
 //------------------------------------------------------------------------------
-static int CeedDeviceSetValue_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedSize length, CeedScalar val) {
+static int CeedDeviceSetValue_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedInt length, CeedScalar val) {
   // Order queue
   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
   sycl_queue.fill(d_array, val, length, {e});
@@ -420,7 +424,6 @@ static int CeedVectorGetArrayCore_Sycl(const CeedVector vec, const CeedMemType m
 
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
 // Get read-only access to a vector via the specified mem_type
 //------------------------------------------------------------------------------
@@ -505,13 +508,15 @@ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *no
     case CEED_NORM_2: {
       // Order queue
       sycl::event e            = data->sycl_queue.ext_oneapi_submit_barrier();
-      auto        sumReduction = sycl::reduction(impl->reduction_norm, sycl::plus<>(), {sycl::property::reduction::initialize_to_identity{}});
+      auto        sumReduction = sycl::reduction(impl->reduction_norm, sycl::plus<>(),
+      {sycl::property::reduction::initialize_to_identity{}});
       data->sycl_queue.parallel_for(length, {e}, sumReduction, [=](sycl::id<1> i, auto &sum) { sum += (d_array[i] * d_array[i]); }).wait_and_throw();
     } break;
     case CEED_NORM_MAX: {
       // Order queue
       sycl::event e            = data->sycl_queue.ext_oneapi_submit_barrier();
-      auto        maxReduction = sycl::reduction(impl->reduction_norm, sycl::maximum<>(), {sycl::property::reduction::initialize_to_identity{}});
+      auto        maxReduction = sycl::reduction(impl->reduction_norm, sycl::maximum<>(),
+      {sycl::property::reduction::initialize_to_identity{}});
       data->sycl_queue.parallel_for(length, {e}, maxReduction, [=](sycl::id<1> i, auto &max) { max.combine(abs(d_array[i])); }).wait_and_throw();
     } break;
   }
@@ -527,8 +532,8 @@ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *no
 //------------------------------------------------------------------------------
 // Take reciprocal of a vector on host
 //------------------------------------------------------------------------------
-static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedSize length) {
-  for (CeedSize i = 0; i < length; i++) {
+static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedInt length) {
+  for (int i = 0; i < length; i++) {
     if (std::fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i];
   }
   return CEED_ERROR_SUCCESS;
@@ -537,7 +542,7 @@ static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedSize length) {
 //------------------------------------------------------------------------------
 // Take reciprocal of a vector on device
 //------------------------------------------------------------------------------
-static int CeedDeviceReciprocal_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedSize length) {
+static int CeedDeviceReciprocal_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedInt length) {
   // Order queue
   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
   sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) {
@@ -569,15 +574,15 @@ static int CeedVectorReciprocal_Sycl(CeedVector vec) {
 //------------------------------------------------------------------------------
 // Compute x = alpha x on the host
 //------------------------------------------------------------------------------
-static int CeedHostScale_Sycl(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
-  for (CeedSize i = 0; i < length; i++) x_array[i] *= alpha;
+static int CeedHostScale_Sycl(CeedScalar *x_array, CeedScalar alpha, CeedInt length) {
+  for (int i = 0; i < length; i++) x_array[i] *= alpha;
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Compute x = alpha x on device
 //------------------------------------------------------------------------------
-static int CeedDeviceScale_Sycl(sycl::queue &sycl_queue, CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
+static int CeedDeviceScale_Sycl(sycl::queue &sycl_queue, CeedScalar *x_array, CeedScalar alpha, CeedInt length) {
   // Order queue
   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
   sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { x_array[i] *= alpha; });
@@ -607,15 +612,15 @@ static int CeedVectorScale_Sycl(CeedVector x, CeedScalar alpha) {
 //------------------------------------------------------------------------------
 // Compute y = alpha x + y on the host
 //------------------------------------------------------------------------------
-static int CeedHostAXPY_Sycl(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
-  for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i];
+static int CeedHostAXPY_Sycl(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) {
+  for (int i = 0; i < length; i++) y_array[i] += alpha * x_array[i];
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Compute y = alpha x + y on device
 //------------------------------------------------------------------------------
-static int CeedDeviceAXPY_Sycl(sycl::queue &sycl_queue, CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
+static int CeedDeviceAXPY_Sycl(sycl::queue &sycl_queue, CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) {
   // Order queue
   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
   sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { y_array[i] += alpha * x_array[i]; });
@@ -652,15 +657,15 @@ static int CeedVectorAXPY_Sycl(CeedVector y, CeedScalar alpha, CeedVector x) {
 //------------------------------------------------------------------------------
 // Compute the pointwise multiplication w = x .* y on the host
 //------------------------------------------------------------------------------
-static int CeedHostPointwiseMult_Sycl(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
-  for (CeedSize i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i];
+static int CeedHostPointwiseMult_Sycl(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) {
+  for (int i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i];
   return CEED_ERROR_SUCCESS;
 }
 
 //------------------------------------------------------------------------------
 // Compute the pointwise multiplication w = x .* y on device (impl in .cu file)
 //------------------------------------------------------------------------------
-static int CeedDevicePointwiseMult_Sycl(sycl::queue &sycl_queue, CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
+static int CeedDevicePointwiseMult_Sycl(sycl::queue &sycl_queue, CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) {
   // Order queue
   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
   sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { w_array[i] = x_array[i] * y_array[i]; });
diff --git a/backends/sycl-ref/kernels/sycl-ref-vector.cpp b/backends/sycl-ref/kernels/sycl-ref-vector.hpp
similarity index 98%
rename from backends/sycl-ref/kernels/sycl-ref-vector.cpp
rename to backends/sycl-ref/kernels/sycl-ref-vector.hpp
index 308d74d209..855bddead9 100644
--- a/backends/sycl-ref/kernels/sycl-ref-vector.cpp
+++ b/backends/sycl-ref/kernels/sycl-ref-vector.hpp
@@ -5,8 +5,11 @@
 // SPDX-License-Identifier: BSD-2-Clause
 //
 // This file is part of CEED:  http://github.com/ceed
+#ifndef _ceed_sycl_kernels_ref_vector_hpp
+#define _ceed_sycl_kernels_ref_vector_hpp
 
 #include <ceed/ceed.h>
+
 #include <sycl/sycl.hpp>
 
 //------------------------------------------------------------------------------
@@ -119,4 +122,4 @@ extern "C" int CeedDevicePointwiseMult_Sycl(CeedScalar *w_array, CeedScalar *x_a
   return 0;
 }
 
-//------------------------------------------------------------------------------
+#endif
diff --git a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
index 1d3149d087..bbdcf9b01d 100644
--- a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp
@@ -19,18 +19,17 @@
 //------------------------------------------------------------------------------
 // Compute the local range of for basis kernels
 //------------------------------------------------------------------------------
-static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt *local_range, CeedInt max_group_size = 128) {
-  local_range[0] = thread_1d;
-  local_range[1] = (dim > 2 ? thread_1d : 1) * (dim > 1 ? thread_1d : 1);
+static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt *local_range, CeedInt max_group_size = 256) {
 
+  local_range[0] = thread_1d;
+  local_range[1] = (dim > 1) ? thread_1d : 1; 
+  
   const CeedInt min_group_size = local_range[0] * local_range[1];
+  if (min_group_size > max_group_size) {
+    return CeedError(ceed, CEED_ERROR_BACKEND,"Requested group size is smaller than the required minimum.");
+  }
 
-  if (min_group_size > max_group_size) max_group_size = 256;
-  if (min_group_size > max_group_size) max_group_size = 512;
-  if (min_group_size > max_group_size) max_group_size = 1024;
-  CeedCheck(min_group_size <= max_group_size, ceed, CEED_ERROR_BACKEND, "Requested group size is smaller than the required minimum.");
-
-  local_range[2] = max_group_size / min_group_size;  // elements per group
+  local_range[2] = max_group_size / min_group_size; //elements per group
   return CEED_ERROR_SUCCESS;
 }
 
@@ -38,7 +37,7 @@ static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt
 // Apply basis
 //------------------------------------------------------------------------------
 int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
-                                     CeedVector v) {
+                                    CeedVector v) {
   Ceed ceed;
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   Ceed_Sycl *ceed_Sycl;
@@ -57,62 +56,62 @@ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, Ce
   // Apply basis operation
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      CeedInt       *lrange         = impl->interp_local_range;
-      const CeedInt &elem_per_group = lrange[2];
-      const CeedInt  group_count    = (num_elem / elem_per_group) + !!(num_elem % elem_per_group);
+      CeedInt* lrange = impl->interp_local_range;
+      const CeedInt& elem_per_group = lrange[2];
+      const CeedInt group_count = (num_elem / elem_per_group) + !!(num_elem % elem_per_group);
       //-----------
-      sycl::range<3>    local_range(lrange[2], lrange[1], lrange[0]);
-      sycl::range<3>    global_range(group_count * lrange[2], lrange[1], lrange[0]);
-      sycl::nd_range<3> kernel_range(global_range, local_range);
+      sycl::range<3> local_range(lrange[2],lrange[1],lrange[0]);
+      sycl::range<3> global_range(group_count * lrange[2],lrange[1],lrange[0]);
+      sycl::nd_range<3> kernel_range(global_range,local_range);
       //-----------
-      sycl::kernel *interp_kernel = (t_mode == CEED_TRANSPOSE) ? impl->interp_transpose_kernel : impl->interp_kernel;
+      sycl::kernel* interp_kernel = (t_mode == CEED_TRANSPOSE) ? impl->interp_transpose_kernel : impl->interp_kernel;
 
-      // Order queue
+      //Order queue
       sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
-
-      ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
+      
+      ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){
         cgh.depends_on(e);
         cgh.set_args(num_elem, impl->d_interp_1d, d_u, d_v);
-        cgh.parallel_for(kernel_range, *interp_kernel);
+        cgh.parallel_for(kernel_range,*interp_kernel);
       });
 
     } break;
     case CEED_EVAL_GRAD: {
-      CeedInt       *lrange         = impl->grad_local_range;
-      const CeedInt &elem_per_group = lrange[2];
-      const CeedInt  group_count    = (num_elem / elem_per_group) + !!(num_elem % elem_per_group);
+      CeedInt* lrange = impl->grad_local_range;
+      const CeedInt& elem_per_group = lrange[2];
+      const CeedInt group_count = (num_elem / elem_per_group) + !!(num_elem % elem_per_group);
       //-----------
-      sycl::range<3>    local_range(lrange[2], lrange[1], lrange[0]);
-      sycl::range<3>    global_range(group_count * lrange[2], lrange[1], lrange[0]);
-      sycl::nd_range<3> kernel_range(global_range, local_range);
+      sycl::range<3> local_range(lrange[2],lrange[1],lrange[0]);
+      sycl::range<3> global_range(group_count * lrange[2],lrange[1],lrange[0]);
+      sycl::nd_range<3> kernel_range(global_range,local_range);
       //-----------
-      sycl::kernel     *grad_kernel = (t_mode == CEED_TRANSPOSE) ? impl->grad_transpose_kernel : impl->grad_kernel;
-      const CeedScalar *d_grad_1d   = (impl->d_collo_grad_1d) ? impl->d_collo_grad_1d : impl->d_grad_1d;
-      // Order queue
+      sycl::kernel* grad_kernel = (t_mode == CEED_TRANSPOSE) ? impl->grad_transpose_kernel : impl->grad_kernel;
+      const CeedScalar* d_grad_1d = (impl->d_collo_grad_1d) ? impl->d_collo_grad_1d : impl->d_grad_1d;
+      //Order queue
       sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
-
-      ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
+      
+      ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){
         cgh.depends_on(e);
         cgh.set_args(num_elem, impl->d_interp_1d, d_grad_1d, d_u, d_v);
-        cgh.parallel_for(kernel_range, *grad_kernel);
+        cgh.parallel_for(kernel_range,*grad_kernel);
       });
     } break;
     case CEED_EVAL_WEIGHT: {
-      CeedInt       *lrange         = impl->weight_local_range;
-      const CeedInt &elem_per_group = lrange[2];
-      const CeedInt  group_count    = (num_elem / elem_per_group) + !!(num_elem % elem_per_group);
+      CeedInt* lrange = impl->weight_local_range;
+      const CeedInt& elem_per_group = lrange[2];
+      const CeedInt group_count = (num_elem / elem_per_group) + !!(num_elem % elem_per_group);
       //-----------
-      sycl::range<3>    local_range(lrange[2], lrange[1], lrange[0]);
-      sycl::range<3>    global_range(group_count * lrange[2], lrange[1], lrange[0]);
-      sycl::nd_range<3> kernel_range(global_range, local_range);
+      sycl::range<3> local_range(lrange[2],lrange[1],lrange[0]);
+      sycl::range<3> global_range(group_count * lrange[2],lrange[1],lrange[0]);
+      sycl::nd_range<3> kernel_range(global_range,local_range);
       //-----------
-      // Order queue
+      //Order queue
       sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
-
-      ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
+      
+      ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){
         cgh.depends_on(e);
         cgh.set_args(num_elem, impl->d_q_weight_1d, d_v);
-        cgh.parallel_for(kernel_range, *(impl->weight_kernel));
+        cgh.parallel_for(kernel_range,*(impl->weight_kernel));
       });
     } break;
     // LCOV_EXCL_START
@@ -148,9 +147,9 @@ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) {
   CeedCallBackend(CeedGetData(ceed, &data));
 
   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
-  CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
-  CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context));
-  CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
+  CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d,   data->sycl_context));
+  CeedCallSycl(ceed, sycl::free(impl->d_interp_1d,     data->sycl_context));
+  CeedCallSycl(ceed, sycl::free(impl->d_grad_1d,       data->sycl_context));
   CeedCallSycl(ceed, sycl::free(impl->d_collo_grad_1d, data->sycl_context));
 
   delete impl->interp_kernel;
@@ -183,17 +182,17 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
 
   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
   const CeedInt num_nodes = CeedIntPow(P_1d, dim);
-  const CeedInt num_qpts  = CeedIntPow(Q_1d, dim);
-
-  CeedInt *interp_lrange = impl->interp_local_range;
-  CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, interp_lrange));
-  const CeedInt interp_group_size = interp_lrange[0] * interp_lrange[1] * interp_lrange[2];
+  const CeedInt num_qpts = CeedIntPow(Q_1d, dim);
 
-  CeedInt *grad_lrange = impl->grad_local_range;
-  CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, grad_lrange));
-  const CeedInt grad_group_size = grad_lrange[0] * grad_lrange[1] * grad_lrange[2];
+  CeedInt* interp_lrange = impl->interp_local_range;
+  CeedCallBackend(ComputeLocalRange(ceed,dim,thread_1d,interp_lrange));
+  const CeedInt interp_group_size = interp_lrange[0] * interp_lrange[1] * interp_lrange[2]; 
 
-  CeedCallBackend(ComputeLocalRange(ceed, dim, Q_1d, impl->weight_local_range));
+  CeedInt* grad_lrange = impl->grad_local_range;
+  CeedCallBackend(ComputeLocalRange(ceed,dim,thread_1d,grad_lrange));
+  const CeedInt grad_group_size = grad_lrange[0] * grad_lrange[1] * grad_lrange[2]; 
+  
+  CeedCallBackend(ComputeLocalRange(ceed,dim,Q_1d,impl->weight_local_range));
 
   // Copy basis data to GPU
   CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
@@ -209,48 +208,48 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
 
   // Compute collocated gradient and copy to GPU
-  impl->d_collo_grad_1d          = NULL;
+  impl->d_collo_grad_1d    = NULL;
   const bool has_collocated_grad = (dim == 3) && (Q_1d >= P_1d);
   if (has_collocated_grad) {
     CeedScalar *collo_grad_1d;
     CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d));
     CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d));
     const CeedInt cgrad_length = Q_1d * Q_1d;
-    CeedCallSycl(ceed, impl->d_collo_grad_1d = sycl::malloc_device<CeedScalar>(cgrad_length, data->sycl_device, data->sycl_context));
-    CeedCallSycl(ceed, data->sycl_queue.copy<CeedScalar>(collo_grad_1d, impl->d_collo_grad_1d, cgrad_length).wait_and_throw());
+    CeedCallSycl(ceed, impl->d_collo_grad_1d = sycl::malloc_device<CeedScalar>(cgrad_length,data->sycl_device,data->sycl_context));
+    CeedCallSycl(ceed, data->sycl_queue.copy<CeedScalar>(collo_grad_1d,impl->d_collo_grad_1d,cgrad_length).wait_and_throw());
     CeedCallBackend(CeedFree(&collo_grad_1d));
   }
 
   // ---[Refactor into separate function]------>
   // Define compile-time constants
   std::map<std::string, CeedInt> jit_constants;
-  jit_constants["BASIS_DIM"]                 = dim;
-  jit_constants["BASIS_Q_1D"]                = Q_1d;
-  jit_constants["BASIS_P_1D"]                = P_1d;
-  jit_constants["T_1D"]                      = thread_1d;
-  jit_constants["BASIS_NUM_COMP"]            = num_comp;
-  jit_constants["BASIS_NUM_NODES"]           = num_nodes;
-  jit_constants["BASIS_NUM_QPTS"]            = num_qpts;
+  jit_constants["BASIS_DIM"] = dim; 
+  jit_constants["BASIS_Q_1D"] = Q_1d; 
+  jit_constants["BASIS_P_1D"] = P_1d;
+  jit_constants["T_1D"] = thread_1d;
+  jit_constants["BASIS_NUM_COMP"] = num_comp;
+  jit_constants["BASIS_NUM_NODES"] = num_nodes;
+  jit_constants["BASIS_NUM_QPTS"] = num_qpts; 
   jit_constants["BASIS_HAS_COLLOCATED_GRAD"] = has_collocated_grad;
   jit_constants["BASIS_INTERP_SCRATCH_SIZE"] = interp_group_size;
-  jit_constants["BASIS_GRAD_SCRATCH_SIZE"]   = grad_group_size;
+  jit_constants["BASIS_GRAD_SCRATCH_SIZE"] = grad_group_size;
 
   // Load kernel source
   char *basis_kernel_path, *basis_kernel_source;
   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor.h", &basis_kernel_path));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
-  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete -----\n");
-
+  CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n");
+  
   // Compile kernels into a kernel bundle
-  CeedCallBackend(CeedBuildModule_Sycl(ceed, basis_kernel_source, &impl->sycl_module, jit_constants));
+  CeedCallBackend(CeedJitBuildModule_Sycl(ceed, basis_kernel_source, &impl->sycl_module,jit_constants));
 
   // Load kernel functions
-  CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "Interp", &impl->interp_kernel));
-  CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "InterpTranspose", &impl->interp_transpose_kernel));
-  CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "Grad", &impl->grad_kernel));
-  CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "GradTranspose", &impl->grad_transpose_kernel));
-  CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "Weight", &impl->weight_kernel));
+  CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, "Interp", &impl->interp_kernel));
+  CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, "InterpTranspose", &impl->interp_transpose_kernel));
+  CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, "Grad", &impl->grad_kernel));
+  CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, "GradTranspose", &impl->grad_transpose_kernel));
+  CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, "Weight", &impl->weight_kernel));
 
   // Clean-up
   CeedCallBackend(CeedFree(&basis_kernel_path));
@@ -264,5 +263,4 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl_shared));
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-shared/ceed-sycl-shared.hpp b/backends/sycl-shared/ceed-sycl-shared.hpp
index 79ef10d4ab..8ee7b2b0ce 100644
--- a/backends/sycl-shared/ceed-sycl-shared.hpp
+++ b/backends/sycl-shared/ceed-sycl-shared.hpp
@@ -17,19 +17,19 @@
 #include "../sycl/ceed-sycl-compile.hpp"
 
 typedef struct {
-  CeedInt       interp_local_range[3];
-  CeedInt       grad_local_range[3];
-  CeedInt       weight_local_range[3];
+  CeedInt interp_local_range[3];
+  CeedInt grad_local_range[3];
+  CeedInt weight_local_range[3];
   SyclModule_t *sycl_module;
   sycl::kernel *interp_kernel;
   sycl::kernel *interp_transpose_kernel;
   sycl::kernel *grad_kernel;
   sycl::kernel *grad_transpose_kernel;
   sycl::kernel *weight_kernel;
-  CeedScalar   *d_interp_1d;
-  CeedScalar   *d_grad_1d;
-  CeedScalar   *d_collo_grad_1d;  // eliminate
-  CeedScalar   *d_q_weight_1d;
+  CeedScalar *d_interp_1d;
+  CeedScalar *d_grad_1d;
+  CeedScalar *d_collo_grad_1d; //eliminate 
+  CeedScalar *d_q_weight_1d;
 } CeedBasis_Sycl_shared;
 
 CEED_INTERN int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
diff --git a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
index be6b5c8d42..5898fb6388 100644
--- a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
+++ b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp
@@ -19,9 +19,12 @@
 //------------------------------------------------------------------------------
 static int CeedInit_Sycl_shared(const char *resource, Ceed ceed) {
   char *resource_root;
-  CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root));
-  CeedCheck(!std::strcmp(resource_root, "/gpu/sycl/shared") || !std::strcmp(resource_root, "/cpu/sycl/shared"), ceed, CEED_ERROR_BACKEND,
-            "Sycl backend cannot use resource: %s", resource);
+  CeedCallBackend(CeedSyclGetResourceRoot(ceed, resource, &resource_root));
+  if (std::strcmp(resource_root, "/gpu/sycl/shared") && std::strcmp(resource_root, "/cpu/sycl/shared")) {
+    // LCOV_EXCL_START
+    return CeedError(ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource);
+    // LCOV_EXCL_STOP
+  }
   std::string_view root_view = resource_root;
 
   auto suffix_length = root_view.size() - root_view.rfind("shared");
@@ -32,23 +35,23 @@ static int CeedInit_Sycl_shared(const char *resource, Ceed ceed) {
 
   CeedCallBackend(CeedFree(&resource_root));
   CeedCallBackend(CeedSetDeterministic(ceed, true));
-
+  
   Ceed_Sycl *data;
   CeedCallBackend(CeedCalloc(1, &data));
   CeedCallBackend(CeedSetData(ceed, data));
-  CeedCallBackend(CeedInit_Sycl(ceed, resource));
+  CeedCallBackend(CeedSyclInit(ceed, resource));
 
   Ceed ceed_ref;
   CeedCallBackend(CeedInit(ref_resource.str().c_str(), &ceed_ref));
-
+  
   Ceed_Sycl *ref_data;
   CeedCallBackend(CeedGetData(ceed_ref, &ref_data));
-
+  
   // Need to use the same queue everywhere for correct synchronization
   ref_data->sycl_queue   = data->sycl_queue;
   ref_data->sycl_context = data->sycl_context;
   ref_data->sycl_device  = data->sycl_device;
-
+  
   CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
 
   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Sycl_shared));
@@ -64,5 +67,4 @@ CEED_INTERN int CeedRegister_Sycl_Shared(void) {
   CeedCallBackend(CeedRegister("/cpu/sycl/shared", CeedInit_Sycl_shared, 35));
   return CEED_ERROR_SUCCESS;
 }
-
 //------------------------------------------------------------------------------
diff --git a/backends/sycl-shared/kernels/sycl-shared-basis.hpp b/backends/sycl-shared/kernels/sycl-shared-basis.hpp
new file mode 100644
index 0000000000..9d82fc827f
--- /dev/null
+++ b/backends/sycl-shared/kernels/sycl-shared-basis.hpp
@@ -0,0 +1,56 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+#ifndef _ceed_sycl_kernels_shared_basis_hpp
+#define _ceed_sycl_kernels_shared_basis_hpp
+
+#include <ceed/ceed.h>
+
+#include <sycl/sycl.hpp>
+
+const int               sizeMax = 16;
+__constant__ CeedScalar c_B[sizeMax * sizeMax];
+__constant__ CeedScalar c_G[sizeMax * sizeMax];
+
+//------------------------------------------------------------------------------
+// Interp device initalization
+//------------------------------------------------------------------------------
+extern "C" int CeedSyclInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr) {
+  const int bytes = P_1d * Q_1d * sizeof(CeedScalar);
+  syclMemcpyToSymbol(c_B, d_B, bytes, 0, syclMemcpyDeviceToDevice);
+  syclGetSymbolAddress((void **)c_B_ptr, c_B);
+
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Grad device initalization
+//------------------------------------------------------------------------------
+extern "C" int CeedSyclInitGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
+  const int bytes = P_1d * Q_1d * sizeof(CeedScalar);
+  syclMemcpyToSymbol(c_B, d_B, bytes, 0, syclMemcpyDeviceToDevice);
+  syclGetSymbolAddress((void **)c_B_ptr, c_B);
+  syclMemcpyToSymbol(c_G, d_G, bytes, 0, syclMemcpyDeviceToDevice);
+  syclGetSymbolAddress((void **)c_G_ptr, c_G);
+
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Collocated grad device initalization
+//------------------------------------------------------------------------------
+extern "C" int CeedSyclInitCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) {
+  const int bytes_interp = P_1d * Q_1d * sizeof(CeedScalar);
+  syclMemcpyToSymbol(c_B, d_B, bytes_interp, 0, syclMemcpyDeviceToDevice);
+  syclGetSymbolAddress((void **)c_B_ptr, c_B);
+  const int bytes_grad = Q_1d * Q_1d * sizeof(CeedScalar);
+  syclMemcpyToSymbol(c_G, d_G, bytes_grad, 0, syclMemcpyDeviceToDevice);
+  syclGetSymbolAddress((void **)c_G_ptr, c_G);
+
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/sycl/ceed-sycl-common.hpp b/backends/sycl/ceed-sycl-common.hpp
index e845286055..4c97e5d245 100644
--- a/backends/sycl/ceed-sycl-common.hpp
+++ b/backends/sycl/ceed-sycl-common.hpp
@@ -39,10 +39,12 @@ typedef struct {
   sycl::queue   sycl_queue;
 } Ceed_Sycl;
 
-CEED_INTERN int CeedInit_Sycl(Ceed ceed, const char *resource);
+CEED_INTERN int CeedSyclGetResourceRoot(Ceed ceed, const char *resource, char **resource_root);
+
+CEED_INTERN int CeedSyclInit(Ceed ceed, const char *resource);
 
 CEED_INTERN int CeedDestroy_Sycl(Ceed ceed);
 
-CEED_INTERN int CeedSetStream_Sycl(Ceed ceed, void *handle);
+CEED_EXTERN int CeedSetStream_Sycl(Ceed ceed, void* handle);
 
 #endif  // _ceed_sycl_common_h
diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp
index d6e7067eee..d6af11eb19 100644
--- a/backends/sycl/ceed-sycl-common.sycl.cpp
+++ b/backends/sycl/ceed-sycl-common.sycl.cpp
@@ -11,10 +11,22 @@
 #include <string>
 #include <sycl/sycl.hpp>
 
+//------------------------------------------------------------------------------
+// Get root resource without device spec
+//------------------------------------------------------------------------------
+int CeedSyclGetResourceRoot(Ceed ceed, const char *resource, char **resource_root) {
+  const char *device_spec       = std::strstr(resource, ":device_id=");
+  size_t      resource_root_len = device_spec ? (size_t)(device_spec - resource) + 1 : strlen(resource) + 1;
+  CeedCallBackend(CeedCalloc(resource_root_len, resource_root));
+  memcpy(*resource_root, resource, resource_root_len - 1);
+
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Device information backend init
 //------------------------------------------------------------------------------
-int CeedInit_Sycl(Ceed ceed, const char *resource) {
+int CeedSyclInit(Ceed ceed, const char *resource) {
   const char *device_spec = std::strstr(resource, ":device_id=");
   const int   device_id   = (device_spec) ? atoi(device_spec + 11) : 0;
 
@@ -88,39 +100,31 @@ int CeedDestroy_Sycl(Ceed ceed) {
 //------------------------------------------------------------------------------
 // Use an external queue
 //------------------------------------------------------------------------------
-int CeedSetStream_Sycl(Ceed ceed, void *handle) {
+int CeedSetStream_Sycl(Ceed ceed, void* handle) {
   Ceed_Sycl *data;
   CeedCallBackend(CeedGetData(ceed, &data));
 
-  CeedCheck(handle, ceed, CEED_ERROR_BACKEND, "Stream handle is null");
-  sycl::queue *q = static_cast<sycl::queue *>(handle);
+  if (!handle) {
+    return CeedError(ceed, CEED_ERROR_BACKEND, "Stream handle is null");;
+  }
+  sycl::queue* q = static_cast<sycl::queue*>(handle);
 
   // Ensure we are using the expected device
-  CeedCheck(data->sycl_device == q->get_device(), ceed, CEED_ERROR_BACKEND, "Device mismatch between provided queue and ceed object");
-  data->sycl_device  = q->get_device();
+  if (data->sycl_device != q->get_device()) {
+    return CeedError(ceed, CEED_ERROR_BACKEND, "Device mismatch between provided queue and ceed object");;
+  }
+  data->sycl_device = q->get_device();
   data->sycl_context = q->get_context();
-  data->sycl_queue   = *q;
+  data->sycl_queue = *q;
 
   // Revisit this when we have a hierarchy of delegates
-  Ceed ceed_delegate = NULL;
-  CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
-  if (ceed_delegate) {
+  Ceed ceed_delegate;
+  if(!CeedGetDelegate(ceed,&ceed_delegate)) {
     Ceed_Sycl *delegate_data;
     CeedCallBackend(CeedGetData(ceed_delegate, &delegate_data));
-    delegate_data->sycl_device  = q->get_device();
+    delegate_data->sycl_device = q->get_device();
     delegate_data->sycl_context = q->get_context();
-    delegate_data->sycl_queue   = *q;
-  }
-
-  // Set queue and context for Ceed Fallback object
-  Ceed ceed_fallback = NULL;
-  CeedGetOperatorFallbackCeed(ceed, &ceed_fallback);
-  if (ceed_fallback) {
-    Ceed_Sycl *fallback_data;
-    CeedCallBackend(CeedGetData(ceed_fallback, &fallback_data));
-    fallback_data->sycl_device  = q->get_device();
-    fallback_data->sycl_context = q->get_context();
-    fallback_data->sycl_queue   = *q;
+    delegate_data->sycl_queue = *q;
   }
 
   return CEED_ERROR_SUCCESS;
diff --git a/backends/sycl/ceed-sycl-compile.hpp b/backends/sycl/ceed-sycl-compile.hpp
index f4a95b8168..a90233dd2a 100644
--- a/backends/sycl/ceed-sycl-compile.hpp
+++ b/backends/sycl/ceed-sycl-compile.hpp
@@ -16,11 +16,11 @@
 
 using SyclModule_t = sycl::kernel_bundle<sycl::bundle_state::executable>;
 
-CEED_INTERN int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module,
-                                     const std::map<std::string, CeedInt> &constants = {});
-CEED_INTERN int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, sycl::kernel **sycl_kernel);
+CEED_INTERN int CeedJitBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module,
+                                        const std::map<std::string, CeedInt> &constants = {});
+CEED_INTERN int CeedJitGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, sycl::kernel **sycl_kernel);
 
-CEED_INTERN int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_size, const int block_size_x, const int block_size_y,
+CEED_INTERN int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel* kernel, const int grid_size, const int block_size_x, const int block_size_y,
                                            const int block_size_z, const int shared_mem_size, void **args);
 
 #endif  // _ceed_sycl_compile_h
diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp
index 845c46b5c5..144e5b478f 100644
--- a/backends/sycl/ceed-sycl-compile.sycl.cpp
+++ b/backends/sycl/ceed-sycl-compile.sycl.cpp
@@ -14,15 +14,16 @@
 
 #include <map>
 #include <sstream>
+// #include <sycl/ext/intel/experimental/online_compiler.hpp>
 #include <sycl/sycl.hpp>
 
-#include "./online_compiler.hpp"
 #include "ceed-sycl-common.hpp"
+#include "./online_compiler.hpp"
 
 using ByteVector_t = std::vector<unsigned char>;
 
 //------------------------------------------------------------------------------
-// Add defined constants at the beginning of kernel source
+//
 //------------------------------------------------------------------------------
 static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_source, std::string &jit_source,
                                       const std::map<std::string, CeedInt> &constants = {}) {
@@ -63,7 +64,9 @@ static inline int CeedJitGetFlags_Sycl(std::vector<std::string> &flags) {
 //------------------------------------------------------------------------------
 // Compile an OpenCL source to SPIR-V using Intel's online compiler extension
 //------------------------------------------------------------------------------
-static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_device, const std::string &opencl_source, ByteVector_t &il_binary,
+static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_device, 
+                                            const std::string &opencl_source, 
+                                            ByteVector_t &il_binary,
                                             const std::vector<std::string> &flags = {}) {
   sycl::ext::libceed::online_compiler<sycl::ext::libceed::source_language::opencl_c> compiler(sycl_device);
 
@@ -77,34 +80,26 @@ static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_
 
 // ------------------------------------------------------------------------------
 // Load (compile) SPIR-V source and wrap in sycl kernel_bundle
+// TODO: determine appropriate flags
+// TODO: Error handle lz calls
 // ------------------------------------------------------------------------------
-static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, const sycl::device &sycl_device, const ByteVector_t &il_binary,
-                               SyclModule_t **sycl_module) {
+static int CeedJitLoadModule_Sycl(const sycl::context &sycl_context, 
+                                  const sycl::device &sycl_device, 
+                                  const ByteVector_t &il_binary,
+                                  SyclModule_t **sycl_module) {
   auto lz_context = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_context);
   auto lz_device  = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_device);
 
   ze_module_desc_t lz_mod_desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
-                                  nullptr,  // extension specific structs
+                                  nullptr,
                                   ZE_MODULE_FORMAT_IL_SPIRV,
                                   il_binary.size(),
                                   il_binary.data(),
-                                  " -ze-opt-large-register-file",  // flags
-                                  nullptr};                        // specialization constants
+                                  " -ze-opt-large-register-file",   // flags
+                                  nullptr};  // build log
 
-  ze_module_handle_t           lz_module;
-  ze_module_build_log_handle_t lz_log;
-  ze_result_t                  lz_err = zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, &lz_log);
-
-  if (ZE_RESULT_SUCCESS != lz_err) {
-    size_t log_size = 0;
-    zeModuleBuildLogGetString(lz_log, &log_size, nullptr);
-
-    char *log_message;
-    CeedCall(CeedCalloc(log_size, &log_message));
-    zeModuleBuildLogGetString(lz_log, &log_size, log_message);
-
-    return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to compile Level Zero module:\n%s", log_message);
-  }
+  ze_module_handle_t lz_module;
+  zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, nullptr);
 
   // sycl make_<type> only throws errors for backend mismatch--assume we have vetted this already
   *sycl_module = new SyclModule_t(sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero, sycl::bundle_state::executable>(
@@ -116,7 +111,8 @@ static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, con
 // ------------------------------------------------------------------------------
 // Compile kernel source to an executable `sycl::kernel_bundle`
 // ------------------------------------------------------------------------------
-int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module, const std::map<std::string, CeedInt> &constants) {
+int CeedJitBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module,
+                            const std::map<std::string, CeedInt> &constants) {
   Ceed_Sycl *data;
   CeedCallBackend(CeedGetData(ceed, &data));
 
@@ -129,7 +125,7 @@ int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule
   ByteVector_t il_binary;
   CeedCallBackend(CeedJitCompileSource_Sycl(ceed, data->sycl_device, jit_source, il_binary, flags));
 
-  CeedCallBackend(CeedLoadModule_Sycl(ceed, data->sycl_context, data->sycl_device, il_binary, sycl_module));
+  CeedCallBackend(CeedJitLoadModule_Sycl(data->sycl_context, data->sycl_device, il_binary, sycl_module));
 
   return CEED_ERROR_SUCCESS;
 }
@@ -139,7 +135,9 @@ int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule
 //
 // TODO: Error handle lz calls
 // ------------------------------------------------------------------------------
-int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, sycl::kernel **sycl_kernel) {
+int CeedJitGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, 
+                          const std::string &kernel_name, 
+                          sycl::kernel **sycl_kernel) {
   Ceed_Sycl *data;
   CeedCallBackend(CeedGetData(ceed, &data));
 
@@ -149,11 +147,7 @@ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::st
 
   ze_kernel_desc_t   lz_kernel_desc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernel_name.c_str()};
   ze_kernel_handle_t lz_kernel;
-  ze_result_t        lz_err = zeKernelCreate(lz_module, &lz_kernel_desc, &lz_kernel);
-
-  if (ZE_RESULT_SUCCESS != lz_err) {
-    return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to retrieve kernel from Level Zero module");
-  }
+  zeKernelCreate(lz_module, &lz_kernel_desc, &lz_kernel);
 
   *sycl_kernel = new sycl::kernel(sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
       {*sycl_module, lz_kernel, sycl::ext::oneapi::level_zero::ownership::transfer}, data->sycl_context));
@@ -164,23 +158,23 @@ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::st
 //------------------------------------------------------------------------------
 // Run SYCL kernel for spatial dimension with shared memory
 //------------------------------------------------------------------------------
-int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_size, const int block_size_x, const int block_size_y,
+int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel* kernel, const int grid_size, const int block_size_x, const int block_size_y,
                                const int block_size_z, const int shared_mem_size, void **kernel_args) {
-  sycl::range<3>    local_range(block_size_z, block_size_y, block_size_x);
-  sycl::range<3>    global_range(grid_size * block_size_z, block_size_y, block_size_x);
-  sycl::nd_range<3> kernel_range(global_range, local_range);
-
+  sycl::range<3> local_range(block_size_z, block_size_y, block_size_x);
+  sycl::range<3> global_range(grid_size*block_size_z, block_size_y, block_size_x);
+  sycl::nd_range<3> kernel_range(global_range,local_range);
+  
   //-----------
-  // Order queue
+  //Order queue
   Ceed_Sycl *ceed_Sycl;
   CeedCallBackend(CeedGetData(ceed, &ceed_Sycl));
   sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
-
-  ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
+  
+  ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){
     cgh.depends_on(e);
     cgh.set_args(*kernel_args);
-    cgh.parallel_for(kernel_range, *kernel);
+    cgh.parallel_for(kernel_range,*kernel);
   });
 
   return CEED_ERROR_SUCCESS;
-}
+}
\ No newline at end of file
diff --git a/backends/sycl/ocloc_api.h b/backends/sycl/ocloc_api.h
index 80b7bf6672..f073c5c524 100644
--- a/backends/sycl/ocloc_api.h
+++ b/backends/sycl/ocloc_api.h
@@ -13,12 +13,13 @@
 
 #ifndef OCLOC_MAKE_VERSION
 /// Generates ocloc API versions
-#define OCLOC_MAKE_VERSION(_major, _minor) ((_major << 16) | (_minor & 0x0000ffff))
-#endif  // OCLOC_MAKE_VERSION
+#define OCLOC_MAKE_VERSION(_major, _minor)                                     \
+  ((_major << 16) | (_minor & 0x0000ffff))
+#endif // OCLOC_MAKE_VERSION
 
 typedef enum _ocloc_version_t {
-  OCLOC_VERSION_1_0          = OCLOC_MAKE_VERSION(1, 0),  ///< version 1.0
-  OCLOC_VERSION_CURRENT      = OCLOC_MAKE_VERSION(1, 0),  ///< latest known version
+  OCLOC_VERSION_1_0 = OCLOC_MAKE_VERSION(1, 0),     ///< version 1.0
+  OCLOC_VERSION_CURRENT = OCLOC_MAKE_VERSION(1, 0), ///< latest known version
   OCLOC_VERSION_FORCE_UINT32 = 0x7fffffff
 } ocloc_version_t;
 
@@ -81,9 +82,14 @@ extern "C" {
 /// messages generated by ocloc (e.g. compiler errors/warnings).
 ///
 /// \returns 0 on succes. Returns non-0 in case of failure.
-SIGNATURE oclocInvoke(uint32_t NumArgs, const char *Argv[], uint32_t NumSources, const uint8_t **DataSources, const uint64_t *LenSources,
-                      const char **NameSources, uint32_t NumInputHeaders, const uint8_t **DataInputHeaders, const uint64_t *LenInputHeaders,
-                      const char **NameInputHeaders, uint32_t *NumOutputs, uint8_t ***DataOutputs, uint64_t **LenOutputs, char ***NameOutputs);
+SIGNATURE oclocInvoke(uint32_t NumArgs, const char *Argv[], uint32_t NumSources,
+                      const uint8_t **DataSources, const uint64_t *LenSources,
+                      const char **NameSources, uint32_t NumInputHeaders,
+                      const uint8_t **DataInputHeaders,
+                      const uint64_t *LenInputHeaders,
+                      const char **NameInputHeaders, uint32_t *NumOutputs,
+                      uint8_t ***DataOutputs, uint64_t **LenOutputs,
+                      char ***NameOutputs);
 
 /// Frees results of oclocInvoke
 ///
@@ -96,7 +102,8 @@ SIGNATURE oclocInvoke(uint32_t NumArgs, const char *Argv[], uint32_t NumSources,
 /// \param NameOutputs is array of names of outputs as returned by oclocInvoke()
 ///
 /// \returns 0 on succes. Returns non-0 in case of failure.
-SIGNATURE oclocFreeOutput(uint32_t *NumOutputs, uint8_t ***DataOutputs, uint64_t **LenOutputs, char ***NameOutputs);
+SIGNATURE oclocFreeOutput(uint32_t *NumOutputs, uint8_t ***DataOutputs,
+                          uint64_t **LenOutputs, char ***NameOutputs);
 
 /// Returns the current version of ocloc.
 SIGNATURE oclocVersion();
diff --git a/backends/sycl/online_compiler.hpp b/backends/sycl/online_compiler.hpp
index edc14ab92e..f259bd52ea 100644
--- a/backends/sycl/online_compiler.hpp
+++ b/backends/sycl/online_compiler.hpp
@@ -19,11 +19,11 @@ namespace ext::libceed {
 using byte = unsigned char;
 
 enum class compiled_code_format {
-  spir_v = 0  // the only format supported for now
+  spir_v = 0 // the only format supported for now
 };
 
 class device_arch {
- public:
+public:
   static constexpr int any = 0;
 
   device_arch(int Val) : Val(Val) {}
@@ -33,17 +33,17 @@ class device_arch {
   // The API must allow user to define the target GPU option even if it is
   // not listed in this enumerator below.
   enum gpu {
-    gpu_any    = 1,
-    gpu_gen9   = 2,
-    gpu_skl    = gpu_gen9,
+    gpu_any = 1,
+    gpu_gen9 = 2,
+    gpu_skl = gpu_gen9,
     gpu_gen9_5 = 3,
-    gpu_kbl    = gpu_gen9_5,
-    gpu_cfl    = gpu_gen9_5,
-    gpu_gen11  = 4,
-    gpu_icl    = gpu_gen11,
-    gpu_gen12  = 5,
-    gpu_tgl    = gpu_gen12,
-    gpu_tgllp  = gpu_gen12
+    gpu_kbl = gpu_gen9_5,
+    gpu_cfl = gpu_gen9_5,
+    gpu_gen11 = 4,
+    gpu_icl = gpu_gen11,
+    gpu_gen12 = 5,
+    gpu_tgl = gpu_gen12,
+    gpu_tgllp = gpu_gen12
   };
 
   enum cpu {
@@ -56,13 +56,13 @@ class device_arch {
 
   operator int() { return Val; }
 
- private:
+private:
   int Val;
 };
 
 /// Represents an error happend during online compilation.
 class online_compile_error : public sycl::exception {
- public:
+public:
   online_compile_error() = default;
   online_compile_error(const std::string &Msg) : sycl::exception(Msg) {}
 };
@@ -72,40 +72,35 @@ enum class source_language { opencl_c = 0, cm = 1 };
 
 /// Represents an online compiler for the language given as template
 /// parameter.
-template <source_language Lang>
-class online_compiler {
- public:
+template <source_language Lang> class online_compiler {
+public:
   /// Constructs online compiler which can target any device and produces
   /// given compiled code format. Produces 64-bit device code.
   /// The created compiler is "optimistic" - it assumes all applicable SYCL
   /// device capabilities are supported by the target device(s).
   online_compiler(compiled_code_format fmt = compiled_code_format::spir_v)
-      : OutputFormat(fmt),
-        OutputFormatVersion({0, 0}),
-        DeviceType(sycl::info::device_type::all),
-        DeviceArch(device_arch::any),
-        Is64Bit(true),
-        DeviceStepping("") {}
+      : OutputFormat(fmt), OutputFormatVersion({0, 0}),
+        DeviceType(sycl::info::device_type::all), DeviceArch(device_arch::any),
+        Is64Bit(true), DeviceStepping("") {}
 
   /// Constructs online compiler which targets given architecture and produces
   /// given compiled code format. Produces 64-bit device code.
   /// Throws online_compile_error if values of constructor arguments are
   /// contradictory or not supported - e.g. if the source language is not
   /// supported for given device type.
-  online_compiler(sycl::info::device_type dev_type, device_arch arch, compiled_code_format fmt = compiled_code_format::spir_v)
-      : OutputFormat(fmt), OutputFormatVersion({0, 0}), DeviceType(dev_type), DeviceArch(arch), Is64Bit(true), DeviceStepping("") {}
+  online_compiler(sycl::info::device_type dev_type, device_arch arch,
+                  compiled_code_format fmt = compiled_code_format::spir_v)
+      : OutputFormat(fmt), OutputFormatVersion({0, 0}), DeviceType(dev_type),
+        DeviceArch(arch), Is64Bit(true), DeviceStepping("") {}
 
   /// Constructs online compiler for the target specified by given SYCL device.
   // TODO: the initial version generates the generic code (SKL now), need
   // to do additional device::info calls to determine the device by it's
   // features.
   online_compiler(const sycl::device &)
-      : OutputFormat(compiled_code_format::spir_v),
-        OutputFormatVersion({0, 0}),
-        DeviceType(sycl::info::device_type::all),
-        DeviceArch(device_arch::any),
-        Is64Bit(true),
-        DeviceStepping("") {}
+      : OutputFormat(compiled_code_format::spir_v), OutputFormatVersion({0, 0}),
+        DeviceType(sycl::info::device_type::all), DeviceArch(device_arch::any),
+        Is64Bit(true), DeviceStepping("") {}
 
   /// Compiles given in-memory \c Lang source to a binary blob. Blob format,
   /// other parameters are set in the constructor by the compilation target
@@ -160,7 +155,7 @@ class online_compiler {
     return *this;
   }
 
- private:
+private:
   /// Compiled code format.
   compiled_code_format OutputFormat;
 
@@ -180,7 +175,7 @@ class online_compiler {
   std::string DeviceStepping;
 
   /// Handles to helper functions used by the implementation.
-  void *CompileToSPIRVHandle   = nullptr;
+  void *CompileToSPIRVHandle = nullptr;
   void *FreeSPIRVOutputsHandle = nullptr;
 };
 
@@ -193,10 +188,12 @@ class online_compiler {
 ///   OpenCL JIT compiler options must be supported.
 template <>
 template <>
-std::vector<byte> online_compiler<source_language::opencl_c>::compile(const std::string &src, const std::vector<std::string> &options);
+std::vector<byte>
+online_compiler<source_language::opencl_c>::compile(
+    const std::string &src, const std::vector<std::string> &options);
 
-// /// Compiles the given OpenCL source. May throw \c online_compile_error.
-// /// @param src - contents of the source.
+/// Compiles the given OpenCL source. May throw \c online_compile_error.
+/// @param src - contents of the source.
 // template <>
 // template <>
 // std::vector<byte>
@@ -209,7 +206,8 @@ std::vector<byte> online_compiler<source_language::opencl_c>::compile(const std:
 /// @param options - compilation options (implementation defined).
 template <>
 template <>
-std::vector<byte> online_compiler<source_language::cm>::compile(const std::string &src, const std::vector<std::string> &options);
+std::vector<byte> online_compiler<source_language::cm>::compile(
+    const std::string &src, const std::vector<std::string> &options);
 
 /// Compiles the given CM source \p src.
 // template <>
@@ -218,5 +216,5 @@ std::vector<byte> online_compiler<source_language::cm>::compile(const std::strin
 //   return compile(src, std::vector<std::string>{});
 // }
 
-}  // namespace ext::libceed
-}  // namespace sycl
+} // namespace ext::libceed
+} // namespace sycl
diff --git a/backends/sycl/online_compiler.sycl.cpp b/backends/sycl/online_compiler.sycl.cpp
index d8459ce938..9e4fb539d8 100644
--- a/backends/sycl/online_compiler.sycl.cpp
+++ b/backends/sycl/online_compiler.sycl.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 #include <cstring>
-#include <string>
 #include <dlfcn.h>
+#include <string>
 
 #include <sycl/sycl.hpp>
-#include "ocloc_api.h"
 #include "online_compiler.hpp"
+#include "ocloc_api.h"
 
 namespace sycl {
 namespace ext::libceed {
@@ -22,7 +22,8 @@ void *loadOsLibrary(const std::string &PluginPath) {
   void *so = dlopen(PluginPath.c_str(), RTLD_NOW);
   if (!so) {
     char *Error = dlerror();
-    std::cerr << "dlopen(" << PluginPath << ") failed with <" << (Error ? Error : "unknown error") << ">" << std::endl;
+    std::cerr << "dlopen(" << PluginPath << ") failed with <"
+              << (Error ? Error : "unknown error") << ">" << std::endl;
   }
   return so;
 }
@@ -35,32 +36,36 @@ void *loadOsLibrary(const std::string &PluginPath) {
 //   return dlclose(Library);
 // }
 
-void *getOsLibraryFuncAddress(void *Library, const std::string &FunctionName) { return dlsym(Library, FunctionName.c_str()); }
+void *getOsLibraryFuncAddress(void *Library, const std::string &FunctionName) {
+  return dlsym(Library, FunctionName.c_str());
+}
 
-static std::vector<const char *> prepareOclocArgs(sycl::info::device_type DeviceType, device_arch DeviceArch, bool Is64Bit,
-                                                  const std::string &DeviceStepping, const std::string &UserArgs) {
+static std::vector<const char *>
+prepareOclocArgs(sycl::info::device_type DeviceType, device_arch DeviceArch,
+                 bool Is64Bit, const std::string &DeviceStepping,
+                 const std::string &UserArgs) {
   std::vector<const char *> Args = {"ocloc", "-q", "-spv_only", "-device"};
 
   if (DeviceType == sycl::info::device_type::gpu) {
     switch (DeviceArch) {
-      case device_arch::gpu_gen9:
-        Args.push_back("skl");
-        break;
+    case device_arch::gpu_gen9:
+      Args.push_back("skl");
+      break;
 
-      case device_arch::gpu_gen9_5:
-        Args.push_back("cfl");
-        break;
+    case device_arch::gpu_gen9_5:
+      Args.push_back("cfl");
+      break;
 
-      case device_arch::gpu_gen11:
-        Args.push_back("icllp");
-        break;
+    case device_arch::gpu_gen11:
+      Args.push_back("icllp");
+      break;
 
-      case device_arch::gpu_gen12:
-        Args.push_back("tgllp");
-        break;
+    case device_arch::gpu_gen12:
+      Args.push_back("tgllp");
+      break;
 
-      default:
-        Args.push_back("pvc");
+    default:
+      Args.push_back("pvc");
     }
   } else {
     // TODO: change that to generic device when ocloc adds support for it.
@@ -97,9 +102,13 @@ static std::vector<const char *> prepareOclocArgs(sycl::info::device_type Device
 ///                                 of the library function freeing memory
 ///                                 allocated during the compilation.
 /// @param UserArgs - User's options to ocloc compiler.
-static std::vector<byte> compileToSPIRV(const std::string &Source, sycl::info::device_type DeviceType, device_arch DeviceArch, bool Is64Bit,
-                                        const std::string &DeviceStepping, void *&CompileToSPIRVHandle, void *&FreeSPIRVOutputsHandle,
-                                        const std::vector<std::string> &UserArgs) {
+static std::vector<byte>
+compileToSPIRV(const std::string &Source, sycl::info::device_type DeviceType,
+               device_arch DeviceArch, bool Is64Bit,
+               const std::string &DeviceStepping, void *&CompileToSPIRVHandle,
+               void *&FreeSPIRVOutputsHandle,
+               const std::vector<std::string> &UserArgs) {
+
   if (!CompileToSPIRVHandle) {
 #ifdef __SYCL_RT_OS_WINDOWS
     static const std::string OclocLibraryName = "ocloc64.dll";
@@ -107,62 +116,83 @@ static std::vector<byte> compileToSPIRV(const std::string &Source, sycl::info::d
     static const std::string OclocLibraryName = "libocloc.so";
 #endif
     void *OclocLibrary = loadOsLibrary(OclocLibraryName);
-    if (!OclocLibrary) throw online_compile_error("Cannot load ocloc library: " + OclocLibraryName);
-    void *OclocVersionHandle = getOsLibraryFuncAddress(OclocLibrary, "oclocVersion");
+    if (!OclocLibrary)
+      throw online_compile_error("Cannot load ocloc library: " +
+                                 OclocLibraryName);
+    void *OclocVersionHandle =
+        getOsLibraryFuncAddress(OclocLibrary, "oclocVersion");
     // The initial versions of ocloc library did not have the oclocVersion()
     // function. Those versions had the same API as the first version of ocloc
     // library having that oclocVersion() function.
     int LoadedVersion = ocloc_version_t::OCLOC_VERSION_1_0;
     if (OclocVersionHandle) {
-      decltype(::oclocVersion) *OclocVersionFunc = reinterpret_cast<decltype(::oclocVersion) *>(OclocVersionHandle);
-      LoadedVersion                              = OclocVersionFunc();
+      decltype(::oclocVersion) *OclocVersionFunc =
+          reinterpret_cast<decltype(::oclocVersion) *>(OclocVersionHandle);
+      LoadedVersion = OclocVersionFunc();
     }
     // The loaded library with version (A.B) is compatible with expected API/ABI
     // version (X.Y) used here if A == B and B >= Y.
-    int LoadedVersionMajor  = LoadedVersion >> 16;
-    int LoadedVersionMinor  = LoadedVersion & 0xffff;
+    int LoadedVersionMajor = LoadedVersion >> 16;
+    int LoadedVersionMinor = LoadedVersion & 0xffff;
     int CurrentVersionMajor = ocloc_version_t::OCLOC_VERSION_CURRENT >> 16;
     int CurrentVersionMinor = ocloc_version_t::OCLOC_VERSION_CURRENT & 0xffff;
-    if (LoadedVersionMajor != CurrentVersionMajor || LoadedVersionMinor < CurrentVersionMinor)
-      throw online_compile_error(std::string("Found incompatible version of ocloc library: (") + std::to_string(LoadedVersionMajor) + "." +
-                                 std::to_string(LoadedVersionMinor) + "). The supported versions are (" + std::to_string(CurrentVersionMajor) +
-                                 ".N), where (N >= " + std::to_string(CurrentVersionMinor) + ").");
-
-    CompileToSPIRVHandle = getOsLibraryFuncAddress(OclocLibrary, "oclocInvoke");
-    if (!CompileToSPIRVHandle) throw online_compile_error("Cannot load oclocInvoke() function");
-    FreeSPIRVOutputsHandle = getOsLibraryFuncAddress(OclocLibrary, "oclocFreeOutput");
-    if (!FreeSPIRVOutputsHandle) throw online_compile_error("Cannot load oclocFreeOutput() function");
+    if (LoadedVersionMajor != CurrentVersionMajor ||
+        LoadedVersionMinor < CurrentVersionMinor)
+      throw online_compile_error(
+          std::string("Found incompatible version of ocloc library: (") +
+          std::to_string(LoadedVersionMajor) + "." +
+          std::to_string(LoadedVersionMinor) +
+          "). The supported versions are (" +
+          std::to_string(CurrentVersionMajor) +
+          ".N), where (N >= " + std::to_string(CurrentVersionMinor) + ").");
+
+    CompileToSPIRVHandle =
+        getOsLibraryFuncAddress(OclocLibrary, "oclocInvoke");
+    if (!CompileToSPIRVHandle)
+      throw online_compile_error("Cannot load oclocInvoke() function");
+    FreeSPIRVOutputsHandle = getOsLibraryFuncAddress(
+        OclocLibrary, "oclocFreeOutput");
+    if (!FreeSPIRVOutputsHandle)
+      throw online_compile_error("Cannot load oclocFreeOutput() function");
   }
 
   std::string CombinedUserArgs;
   for (auto UserArg : UserArgs) {
-    if (UserArg == "") continue;
-    if (CombinedUserArgs != "") CombinedUserArgs = CombinedUserArgs + " " + UserArg;
-    else CombinedUserArgs = UserArg;
+    if (UserArg == "")
+      continue;
+    if (CombinedUserArgs != "")
+      CombinedUserArgs = CombinedUserArgs + " " + UserArg;
+    else
+      CombinedUserArgs = UserArg;
   }
-  std::vector<const char *> Args = prepareOclocArgs(DeviceType, DeviceArch, Is64Bit, DeviceStepping, CombinedUserArgs);
+  std::vector<const char *> Args = prepareOclocArgs(
+      DeviceType, DeviceArch, Is64Bit, DeviceStepping, CombinedUserArgs);
 
-  uint32_t  NumOutputs    = 0;
-  byte    **Outputs       = nullptr;
+  uint32_t NumOutputs = 0;
+  byte **Outputs = nullptr;
   uint64_t *OutputLengths = nullptr;
-  char    **OutputNames   = nullptr;
+  char **OutputNames = nullptr;
 
-  const byte    *Sources[]       = {reinterpret_cast<const byte *>(Source.c_str())};
-  const char    *SourceName      = "main.cl";
+  const byte *Sources[] = {reinterpret_cast<const byte *>(Source.c_str())};
+  const char *SourceName = "main.cl";
   const uint64_t SourceLengths[] = {Source.length() + 1};
 
   Args.push_back("-file");
   Args.push_back(SourceName);
 
-  decltype(::oclocInvoke) *OclocInvokeFunc = reinterpret_cast<decltype(::oclocInvoke) *>(CompileToSPIRVHandle);
-  int CompileError = OclocInvokeFunc(Args.size(), Args.data(), 1, Sources, SourceLengths, &SourceName, 0, nullptr, nullptr, nullptr, &NumOutputs,
-                                     &Outputs, &OutputLengths, &OutputNames);
+  decltype(::oclocInvoke) *OclocInvokeFunc =
+      reinterpret_cast<decltype(::oclocInvoke) *>(CompileToSPIRVHandle);
+  int CompileError =
+      OclocInvokeFunc(Args.size(), Args.data(), 1, Sources, SourceLengths,
+                      &SourceName, 0, nullptr, nullptr, nullptr, &NumOutputs,
+                      &Outputs, &OutputLengths, &OutputNames);
 
   std::vector<byte> SpirV;
-  std::string       CompileLog;
+  std::string CompileLog;
   for (uint32_t I = 0; I < NumOutputs; I++) {
     size_t NameLen = strlen(OutputNames[I]);
-    if (NameLen >= 4 && strstr(OutputNames[I], ".spv") != nullptr && Outputs[I] != nullptr) {
+    if (NameLen >= 4 && strstr(OutputNames[I], ".spv") != nullptr &&
+        Outputs[I] != nullptr) {
       assert(SpirV.size() == 0 && "More than one SPIR-V output found.");
       SpirV = std::vector<byte>(Outputs[I], Outputs[I] + OutputLengths[I]);
     } else if (!strcmp(OutputNames[I], "stdout.log")) {
@@ -171,39 +201,58 @@ static std::vector<byte> compileToSPIRV(const std::string &Source, sycl::info::d
   }
 
   // Try to free memory before reporting possible error.
-  decltype(::oclocFreeOutput) *OclocFreeOutputFunc = reinterpret_cast<decltype(::oclocFreeOutput) *>(FreeSPIRVOutputsHandle);
-  int                          MemFreeError        = OclocFreeOutputFunc(&NumOutputs, &Outputs, &OutputLengths, &OutputNames);
+  decltype(::oclocFreeOutput) *OclocFreeOutputFunc =
+      reinterpret_cast<decltype(::oclocFreeOutput) *>(FreeSPIRVOutputsHandle);
+  int MemFreeError =
+      OclocFreeOutputFunc(&NumOutputs, &Outputs, &OutputLengths, &OutputNames);
 
-  if (CompileError) throw online_compile_error("ocloc reported compilation errors: {\n" + CompileLog + "\n}");
-  if (SpirV.empty()) throw online_compile_error("Unexpected output: ocloc did not return SPIR-V");
-  if (MemFreeError) throw online_compile_error("ocloc cannot safely free resources");
+  if (CompileError)
+    throw online_compile_error("ocloc reported compilation errors: {\n" +
+                               CompileLog + "\n}");
+  if (SpirV.empty())
+    throw online_compile_error(
+        "Unexpected output: ocloc did not return SPIR-V");
+  if (MemFreeError)
+    throw online_compile_error("ocloc cannot safely free resources");
 
   return SpirV;
 }
 
 template <>
 template <>
-std::vector<byte> online_compiler<source_language::opencl_c>::compile(const std::string &Source, const std::vector<std::string> &UserArgs) {
+std::vector<byte> online_compiler<source_language::opencl_c>::compile(
+    const std::string &Source, const std::vector<std::string> &UserArgs) {
+
   if (OutputFormatVersion != std::pair<int, int>{0, 0}) {
-    std::string Version = std::to_string(OutputFormatVersion.first) + ", " + std::to_string(OutputFormatVersion.second);
-    throw online_compile_error(std::string("The output format version (") + Version + ") is not supported yet");
+    std::string Version = std::to_string(OutputFormatVersion.first) + ", " +
+                          std::to_string(OutputFormatVersion.second);
+    throw online_compile_error(std::string("The output format version (") +
+                               Version + ") is not supported yet");
   }
 
-  return compileToSPIRV(Source, DeviceType, DeviceArch, Is64Bit, DeviceStepping, CompileToSPIRVHandle, FreeSPIRVOutputsHandle, UserArgs);
+  return compileToSPIRV(Source, DeviceType, DeviceArch, Is64Bit,
+                                DeviceStepping, CompileToSPIRVHandle,
+                                FreeSPIRVOutputsHandle, UserArgs);
 }
 
 template <>
 template <>
-std::vector<byte> online_compiler<source_language::cm>::compile(const std::string &Source, const std::vector<std::string> &UserArgs) {
+std::vector<byte> online_compiler<source_language::cm>::compile(
+    const std::string &Source, const std::vector<std::string> &UserArgs) {
+
   if (OutputFormatVersion != std::pair<int, int>{0, 0}) {
-    std::string Version = std::to_string(OutputFormatVersion.first) + ", " + std::to_string(OutputFormatVersion.second);
-    throw online_compile_error(std::string("The output format version (") + Version + ") is not supported yet");
+    std::string Version = std::to_string(OutputFormatVersion.first) + ", " +
+                          std::to_string(OutputFormatVersion.second);
+    throw online_compile_error(std::string("The output format version (") +
+                               Version + ") is not supported yet");
   }
 
   std::vector<std::string> CMUserArgs = UserArgs;
   CMUserArgs.push_back("-cmc");
-  return compileToSPIRV(Source, DeviceType, DeviceArch, Is64Bit, DeviceStepping, CompileToSPIRVHandle, FreeSPIRVOutputsHandle, CMUserArgs);
+  return compileToSPIRV(Source, DeviceType, DeviceArch, Is64Bit,
+                                DeviceStepping, CompileToSPIRVHandle,
+                                FreeSPIRVOutputsHandle, CMUserArgs);
 }
 
-}  // namespace ext::libceed
-}  // namespace sycl
+} // namespace ext::libceed
+} // namespace sycl