diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp index ca4052739c..3edc12e3b7 100644 --- a/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp +++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.hpp @@ -9,6 +9,6 @@ #define _ceed_sycl_gen_operator_build_h CEED_INTERN int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes); -CEED_INTERN int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op); +CEED_INTERN int CeedSyclGenOperatorBuild(CeedOperator op); #endif // _ceed_sycl_gen_operator_build_h diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp index 608635ab01..e126df6dd3 100644 --- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp @@ -41,9 +41,9 @@ extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d block_sizes[1] = thread1d; block_sizes[2] = elems_per_block; } else if (dim == 3) { - const CeedInt elems_per_block = 1; //thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); + const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); block_sizes[0] = thread1d; - block_sizes[1] = thread1d * thread1d; + block_sizes[1] = thread1d; block_sizes[2] = elems_per_block; } return CEED_ERROR_SUCCESS; @@ -54,11 +54,11 @@ extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d // - [ ] Check arguments to device functions reudsed from sycl-shared-basis are correct // - [ ] Do kernel jitting! //------------------------------------------------------------------------------ -extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { +extern "C" int CeedSyclGenOperatorBuild(CeedOperator op) { bool is_setup_done; CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - + Ceed ceed; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); Ceed_Sycl *sycl_data; @@ -66,9 +66,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { CeedOperator_Sycl_gen *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - Fields_Sycl h_B, h_G; - FieldsInt_Sycl h_indices; - CeedQFunction qf; + Fields_Sycl h_B, h_G; + FieldsInt_Sycl h_indices; + CeedQFunction qf; CeedQFunction_Sycl_gen *qf_impl; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); @@ -81,11 +81,11 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - - CeedEvalMode eval_mode; - CeedBasis basis; + + CeedEvalMode eval_mode; + CeedBasis basis; CeedBasis_Sycl_shared *basis_impl; - CeedElemRestriction Erestrict; + CeedElemRestriction Erestrict; CeedElemRestriction_Sycl *restr_impl; // Check for restriction only identity operator @@ -123,8 +123,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { CeedCallBackend(CeedFree(&sycl_gen_template_source)); } - std::string_view q_function_source(qf_impl->q_function_source); - std::string_view q_function_name(qf_impl->q_function_name); + std::string_view q_function_source(qf_impl->q_function_source); + std::string_view q_function_name(qf_impl->q_function_name); const std::string operator_name = "CeedKernelSyclGenOperator_" + std::string(q_function_name); // Find dim, P_1d, Q_1d @@ -199,9 +199,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { } } } - use_collograd_parallelization = false; // Enforcing non-collograd mode for 3D threads - CeedInt block_sizes[3]; + CeedInt block_sizes[3]; CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes)); // Define CEED_Q_VLA @@ -209,25 +208,14 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { if (dim != 3 || use_collograd_parallelization) { code << "#define CEED_Q_VLA 1\n\n"; } else { - // code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; - code << "#define CEED_Q_VLA " << 1 << "\n\n"; - } - - // Determine subgroup size based on supported sizes : Default : 16 (if supported) - std::vector allowed_sg_sizes = sycl_data->sycl_device.get_info(); - CeedInt sub_group_size_op = allowed_sg_sizes[allowed_sg_sizes.size() - 1]; - for (const auto &s : allowed_sg_sizes) { - if (s == 16) { - sub_group_size_op = s; - break; - } + code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; } code << q_function_source; - - // Kernel function + + // Kernel function code << "\n// -----------------------------------------------------------------------------\n"; - code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z), intel_reqd_sub_group_size(" << sub_group_size_op << ")))\n"; + code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z)))\n"; code << "kernel void " << operator_name << "("; code << "const CeedInt num_elem, "; code << "global void* ctx, "; @@ -236,8 +224,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << "global const Fields_Sycl* B, "; code << "global const Fields_Sycl* G, "; code << "global const CeedScalar * restrict W"; - code << ") {\n"; - + code << ") {\n"; + for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT @@ -248,14 +236,14 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { for (CeedInt i = 0; i < num_output_fields; i++) { code << " global CeedScalar* d_v_" << i << " = fields->outputs[" << i << "];\n"; } - + // TODO: Convert these to defined constants to save on GRF code << " const CeedInt DIM = " << dim << ";\n"; code << " const CeedInt Q_1D = " << Q_1d << ";\n"; const CeedInt scratch_size = block_sizes[0] * block_sizes[1] * block_sizes[2]; code << " local CeedScalar scratch[" << scratch_size << "];\n"; - code << " local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << (dim > 2 ? "*T_1D" : "") << ";\n"; + code << " local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; code << "\n // -- Input field constants and basis data --\n"; // Initialize constants, and matrices B and G @@ -301,14 +289,14 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << " loadMatrix(Q_1D*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n"; } else { bool has_collo_grad = !!basis_impl->d_collo_grad_1d; - h_G.inputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; + h_G.inputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_in_" + std::to_string(i))) << "*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n"; } break; case CEED_EVAL_WEIGHT: - break; // No action + break; // No action case CEED_EVAL_DIV: break; // TODO: Not implemented case CEED_EVAL_CURL: @@ -357,10 +345,10 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << " loadMatrix(Q_1D*Q_1D, G->outputs[" << i << "], s_G_out_" << i << ");\n"; } else { bool has_collo_grad = !!basis_impl->d_collo_grad_1d; - h_G.outputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; + h_G.outputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; - code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" << i - << ");\n"; + code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" + << i << ");\n"; } break; // LCOV_EXCL_START @@ -393,7 +381,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { // Restriction if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { - code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "];\n"; + code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n"; bool is_strided; CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); @@ -405,8 +393,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << " // CompStride: " << comp_stride << "\n"; CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); h_indices.inputs[i] = restr_impl->d_ind; - code << " readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i - << "], d_u_" << i << ", r_u_" << i << ");\n"; + code << " readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i + << ", num_elem, indices->inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n"; } else { bool has_backend_strides; CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); @@ -431,23 +419,22 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { } break; case CEED_EVAL_INTERP: - code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "];\n"; - code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i - << ", r_t_" << i << ", elem_scratch);\n"; + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; + code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" + << i << ", r_t_" << i << ", elem_scratch);\n"; break; case CEED_EVAL_GRAD: if (use_collograd_parallelization) { - code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "];\n"; - code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" - << i << ", r_t_" << i << ", elem_scratch);\n"; + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; + code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i + << ", s_B_in_" << i << ", r_t_" << i << ", elem_scratch);\n"; } else { CeedInt P_1d; CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM];\n"; + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n"; code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i - << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i - << ", r_t_" << i << ", elem_scratch);\n"; + << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i << ", s_G_in_" << i << ", r_t_" << i << ", elem_scratch);\n"; } break; case CEED_EVAL_WEIGHT: @@ -472,26 +459,25 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { if (eval_mode == CEED_EVAL_GRAD) { if (use_collograd_parallelization) { // Accumulator for gradient slices - code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "];\n"; + code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; code << " for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n"; - code << " //for (CeedInt j = 0; j < Q_1D; ++j) {\n"; - code << " //r_tt_" << i << "[j + i*Q_1D] = 0.0;\n"; - code << " r_tt_" << i << "[i] = 0.0;\n"; - code << " //}\n"; + code << " for (CeedInt j = 0; j < Q_1D; ++j) {\n"; + code << " r_tt_" << i << "[j + i*Q_1D] = 0.0;\n"; + code << " }\n"; code << " }\n"; } else { - code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM];\n"; + code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM*Q_1D];\n"; } } if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) { - code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "];\n"; + code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; } } // We treat quadrature points per slice in 3d to save registers if (use_collograd_parallelization) { code << "\n // Note: Using planes of 3D elements\n"; - code << " //for (CeedInt q = 0; q < Q_1D; q++) {\n"; - code << " q = get_local_id(1) / T_1D;\n {\n"; + code << " __attribute__((opencl_unroll_hint))\n"; + code << " for (CeedInt q = 0; q < Q_1D; q++) {\n"; code << " // -- Input fields --\n"; for (CeedInt i = 0; i < num_input_fields; i++) { code << " // ---- Input field " << i << " ----\n"; @@ -529,8 +515,9 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { } code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; code << " readSliceQuadsStrided" - << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i - << ", r_q_" << i << ");\n"; + << "3d(num_comp_in_" << i + << ", Q_1D," + << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i << ", r_q_" << i << ");\n"; } break; case CEED_EVAL_INTERP: @@ -607,8 +594,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { if (dim != 3 || use_collograd_parallelization) { code << "1"; } else { - // code << "Q_1D"; - code << "1"; + code << "Q_1D"; } code << ", in, out);\n"; //-------------------------------------------------- @@ -632,8 +618,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << " }\n"; break; case CEED_EVAL_GRAD: - code << " gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i - << ", elem_scratch);\n"; + code << " gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i << ", elem_scratch);\n"; break; case CEED_EVAL_WEIGHT: break; // Should not occur @@ -663,12 +648,12 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << " private CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n"; break; // No action case CEED_EVAL_INTERP: - code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "];\n"; + code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; break; case CEED_EVAL_GRAD: - code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "];\n"; + code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; if (use_collograd_parallelization) { code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; @@ -677,8 +662,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i - << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_out_" << i - << ", r_v_" << i << ", elem_scratch);\n"; + << ",P_out_" << i << ", Q_1D, r_tt_" << i << ", s_B_out_" << i << ", s_G_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; } break; // LCOV_EXCL_START @@ -705,8 +689,8 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << " // CompStride: " << comp_stride << "\n"; CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); h_indices.outputs[i] = restr_impl->d_ind; - code << " writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i - << "], r_v_" << i << ", d_v_" << i << ");\n"; + code << " writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i + << ", num_elem, indices->outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n"; } else { bool has_backend_strides; CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); @@ -727,32 +711,29 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << "// -----------------------------------------------------------------------------\n\n"; // Copy the struct (containing device addresses) from the host to the device - sycl::event copy_B = sycl_data->sycl_queue.copy(&h_B, impl->B, 1); - sycl::event copy_G = sycl_data->sycl_queue.copy(&h_G, impl->G, 1); - sycl::event copy_indices = sycl_data->sycl_queue.copy(&h_indices, impl->indices, 1); + sycl::event copy_B = sycl_data->sycl_queue.copy(&h_B,impl->B,1); + sycl::event copy_G = sycl_data->sycl_queue.copy(&h_G,impl->G,1); + sycl::event copy_indices = sycl_data->sycl_queue.copy(&h_indices,impl->indices,1); // These copies can happen while the JIT is being done - CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices})); + CeedCallSycl(ceed,sycl::event::wait_and_throw({copy_B, copy_G, copy_indices})); // View kernel for debugging CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); CeedDebug(ceed, code.str().c_str()); - // std::cout<<" \n Generated Operator Kernels:\n"; - // std::cout< jit_constants; - jit_constants["T_1D"] = block_sizes[0]; + jit_constants["T_1D"] = block_sizes[0]; jit_constants["GROUP_SIZE_X"] = block_sizes[0]; jit_constants["GROUP_SIZE_Y"] = block_sizes[1]; jit_constants["GROUP_SIZE_Z"] = block_sizes[2]; // Compile kernel into a kernel bundle - CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module, jit_constants)); - + CeedCallBackend(CeedJitBuildModule_Sycl(ceed, code.str(), &impl->sycl_module,jit_constants)); + // Load kernel function - CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op)); - + CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op)); + CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp index 80a63a4b0a..08dd747a11 100644 --- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp @@ -33,7 +33,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); CeedOperator_Sycl_gen *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; + CeedQFunction qf; CeedQFunction_Sycl_gen *qf_impl; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); @@ -47,7 +47,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedVector vec, output_vecs[CEED_FIELD_MAX] = {}; // Creation of the operator - CeedCallBackend(CeedOperatorBuildKernel_Sycl_gen(op)); + CeedCallBackend(CeedSyclGenOperatorBuild(op)); // Input vectors for (CeedInt i = 0; i < num_input_fields; i++) { @@ -92,36 +92,38 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_impl->d_c)); // Apply operator - const CeedInt dim = impl->dim; - const CeedInt Q_1d = impl->Q_1d; - const CeedInt P_1d = impl->max_P_1d; + const CeedInt dim = impl->dim; + const CeedInt Q_1d = impl->Q_1d; + const CeedInt P_1d = impl->max_P_1d; + const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); CeedInt block_sizes[3], grid = 0; CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes)); if (dim == 1) { - grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); - // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + //CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); } else if (dim == 2) { - grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); - // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + //CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); } else if (dim == 3) { - grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); - // CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + //CeedCallBackend(CeedRunKernelDimSharedSycl(ceed, impl->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); } - sycl::range<3> local_range(block_sizes[2], block_sizes[1], block_sizes[0]); - sycl::range<3> global_range(grid * block_sizes[2], block_sizes[1], block_sizes[0]); - sycl::nd_range<3> kernel_range(global_range, local_range); - + sycl::range<3> local_range(block_sizes[2], block_sizes[1], block_sizes[0]); + sycl::range<3> global_range(grid*block_sizes[2], block_sizes[1], block_sizes[0]); + sycl::nd_range<3> kernel_range(global_range,local_range); + //----------- - // Order queue + //Order queue sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier(); - - CeedCallSycl(ceed, ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { + + CeedCallSycl(ceed, + ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){ cgh.depends_on(e); cgh.set_args(num_elem, qf_impl->d_c, impl->indices, impl->fields, impl->B, impl->G, impl->W); - cgh.parallel_for(kernel_range, *(impl->op)); + cgh.parallel_for(kernel_range,*(impl->op)); })); - CeedCallSycl(ceed, ceed_Sycl->sycl_queue.wait_and_throw()); + CeedCallSycl(ceed,ceed_Sycl->sycl_queue.wait_and_throw()); // Restore input arrays for (CeedInt i = 0; i < num_input_fields; i++) { @@ -174,15 +176,14 @@ int CeedOperatorCreate_Sycl_gen(CeedOperator op) { CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); - impl->indices = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); - impl->fields = sycl::malloc_host(1, sycl_data->sycl_context); - impl->B = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); - impl->G = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); - impl->W = sycl::malloc_device(1, sycl_data->sycl_device, sycl_data->sycl_context); + impl->indices = sycl::malloc_device(1,sycl_data->sycl_device,sycl_data->sycl_context); + impl->fields = sycl::malloc_host(1,sycl_data->sycl_context); + impl->B = sycl::malloc_device(1,sycl_data->sycl_device,sycl_data->sycl_context); + impl->G = sycl::malloc_device(1,sycl_data->sycl_device,sycl_data->sycl_context); + impl->W = sycl::malloc_device(1,sycl_data->sycl_device,sycl_data->sycl_context); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Sycl_gen)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Sycl_gen)); return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp index 951474476a..e180638205 100644 --- a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp @@ -36,7 +36,7 @@ static int CeedQFunctionDestroy_Sycl_gen(CeedQFunction qf) { // Wait for all work to finish before freeing memory CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); CeedCallSycl(ceed, sycl::free(impl->d_c, data->sycl_context)); - + CeedCallBackend(CeedFree(&impl->q_function_source)); CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; @@ -67,5 +67,4 @@ int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf) { CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl_gen)); return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-gen/ceed-sycl-gen.hpp b/backends/sycl-gen/ceed-sycl-gen.hpp index e4e716393c..c65a9fe3b7 100644 --- a/backends/sycl-gen/ceed-sycl-gen.hpp +++ b/backends/sycl-gen/ceed-sycl-gen.hpp @@ -16,16 +16,16 @@ #include "../sycl/ceed-sycl-compile.hpp" typedef struct { - CeedInt dim; - CeedInt Q_1d; - CeedInt max_P_1d; - SyclModule_t *sycl_module; - sycl::kernel *op; + CeedInt dim; + CeedInt Q_1d; + CeedInt max_P_1d; + SyclModule_t* sycl_module; + sycl::kernel* op; FieldsInt_Sycl *indices; Fields_Sycl *fields; Fields_Sycl *B; Fields_Sycl *G; - CeedScalar *W; + CeedScalar *W; } CeedOperator_Sycl_gen; typedef struct { diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp index a5084ff6b7..c84133de26 100644 --- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp @@ -10,16 +10,16 @@ #include #include +#include #include #include -#include //------------------------------------------------------------------------------ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) { char *resource_root; - CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":device_id=", &resource_root)); + CeedCallBackend(CeedSyclGetResourceRoot(ceed, resource, &resource_root)); if (strcmp(resource_root, "/gpu/sycl") && strcmp(resource_root, "/gpu/sycl/gen")) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource); @@ -30,7 +30,7 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) { Ceed_Sycl *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); - CeedCallBackend(CeedInit_Sycl(ceed, resource)); + CeedCallBackend(CeedSyclInit(ceed, resource)); Ceed ceed_shared; CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared)); @@ -55,5 +55,4 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) { // Register backend //------------------------------------------------------------------------------ CEED_INTERN int CeedRegister_Sycl_Gen(void) { return CeedRegister("/gpu/sycl/gen", CeedInit_Sycl_gen, 20); } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp index 601ebdda95..80d17788cb 100644 --- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp @@ -15,10 +15,8 @@ #include "../sycl/ceed-sycl-compile.hpp" #include "ceed-sycl-ref.hpp" -template -class CeedBasisSyclInterp; -template -class CeedBasisSyclGrad; +template class CeedBasisSyclInterp; +template class CeedBasisSyclGrad; class CeedBasisSyclWeight; class CeedBasisSyclInterpNT; @@ -35,11 +33,11 @@ static constexpr SpecID BASIS_Q_1D_ID; //------------------------------------------------------------------------------ // Interpolation kernel - tensor //------------------------------------------------------------------------------ -template -static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, - const CeedScalar *u, CeedScalar *v) { - const CeedInt buf_len = impl->buf_len; - const CeedInt op_len = impl->op_len; +template +static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t& sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, const CeedScalar *u, + CeedScalar *v) { + const CeedInt buf_len = impl->buf_len; + const CeedInt op_len = impl->op_len; const CeedScalar *interp_1d = impl->d_interp_1d; const sycl::device &sycl_device = sycl_queue.get_device(); @@ -65,8 +63,8 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t const CeedInt P_1d = kh.get_specialization_constant(); const CeedInt Q_1d = kh.get_specialization_constant(); //--------------------------------------------------------------> - const CeedInt num_nodes = CeedIntPow(P_1d, dim); - const CeedInt num_qpts = CeedIntPow(Q_1d, dim); + const CeedInt num_nodes = CeedIntPow(P_1d,dim); + const CeedInt num_qpts = CeedIntPow(Q_1d,dim); const CeedInt P = transpose ? Q_1d : P_1d; const CeedInt Q = transpose ? P_1d : Q_1d; const CeedInt stride_0 = transpose ? 1 : P_1d; @@ -136,16 +134,16 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t //------------------------------------------------------------------------------ // Gradient kernel - tensor //------------------------------------------------------------------------------ -template -static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, - const CeedScalar *u, CeedScalar *v) { - const CeedInt buf_len = impl->buf_len; - const CeedInt op_len = impl->op_len; +template +static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t& sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, const CeedScalar *u, + CeedScalar *v) { + const CeedInt buf_len = impl->buf_len; + const CeedInt op_len = impl->op_len; const CeedScalar *interp_1d = impl->d_interp_1d; const CeedScalar *grad_1d = impl->d_grad_1d; - const sycl::device &sycl_device = sycl_queue.get_device(); - const CeedInt work_group_size = 32; + const sycl::device &sycl_device = sycl_queue.get_device(); + const CeedInt work_group_size = 32; sycl::range<1> local_range(work_group_size); sycl::range<1> global_range(num_elem * work_group_size); sycl::nd_range<1> kernel_range(global_range, local_range); @@ -166,22 +164,23 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t & const CeedInt P_1d = kh.get_specialization_constant(); const CeedInt Q_1d = kh.get_specialization_constant(); //--------------------------------------------------------------> - const CeedInt num_nodes = CeedIntPow(P_1d, dim); - const CeedInt num_qpts = CeedIntPow(Q_1d, dim); - const CeedInt P = transpose ? Q_1d : P_1d; - const CeedInt Q = transpose ? P_1d : Q_1d; - const CeedInt stride_0 = transpose ? 1 : P_1d; - const CeedInt stride_1 = transpose ? P_1d : 1; - const CeedInt u_stride = transpose ? num_qpts : num_nodes; - const CeedInt v_stride = transpose ? num_nodes : num_qpts; + const CeedInt num_nodes = CeedIntPow(P_1d,dim); + const CeedInt num_qpts = CeedIntPow(Q_1d,dim); + const CeedInt P = transpose ? Q_1d : P_1d; + const CeedInt Q = transpose ? P_1d : Q_1d; + const CeedInt stride_0 = transpose ? 1 : P_1d; + const CeedInt stride_1 = transpose ? P_1d : 1; + const CeedInt u_stride = transpose ? num_qpts : num_nodes; + const CeedInt v_stride = transpose ? num_nodes : num_qpts; const CeedInt u_comp_stride = num_elem * u_stride; const CeedInt v_comp_stride = num_elem * v_stride; const CeedInt u_dim_stride = transpose ? num_elem * num_qpts * num_comp : 0; const CeedInt v_dim_stride = transpose ? 0 : num_elem * num_qpts * num_comp; - sycl::group work_group = work_item.get_group(); - const CeedInt i = work_item.get_local_linear_id(); - const CeedInt group_size = work_group.get_local_linear_range(); - const CeedInt elem = work_group.get_group_linear_id(); + + sycl::group work_group = work_item.get_group(); + const CeedInt i = work_item.get_local_linear_id(); + const CeedInt group_size = work_group.get_local_linear_range(); + const CeedInt elem = work_group.get_group_linear_id(); CeedScalar *s_interp_1d = s_mem.get_pointer(); CeedScalar *s_grad_1d = s_interp_1d + P * Q; @@ -204,7 +203,7 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t & for (CeedInt dim_2 = 0; dim_2 < dim; dim_2++) { // Use older version of sycl workgroup barrier for performance reasons // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed - // sycl::group_barrier(work_group); + //sycl::group_barrier(work_group); work_item.barrier(sycl::access::fence_space::local_space); pre /= P; @@ -292,14 +291,14 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - if (transpose) { + if(transpose) { CeedCallBackend(CeedBasisApplyInterp_Sycl(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); } else { CeedCallBackend(CeedBasisApplyInterp_Sycl(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); } } break; case CEED_EVAL_GRAD: { - if (transpose) { + if(transpose) { CeedCallBackend(CeedBasisApplyGrad_Sycl<1>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); } else { CeedCallBackend(CeedBasisApplyGrad_Sycl<0>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); @@ -582,19 +581,23 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device(Q_1d, data->sycl_device, data->sycl_context)); - sycl::event copy_weight = data->sycl_queue.copy(q_weight_1d, impl->d_q_weight_1d, Q_1d, {e}); + sycl::event copy_weight = data->sycl_queue.copy(q_weight_1d, impl->d_q_weight_1d, Q_1d,{e}); const CeedInt interp_length = Q_1d * P_1d; CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device(interp_length, data->sycl_device, data->sycl_context)); - sycl::event copy_interp = data->sycl_queue.copy(interp_1d, impl->d_interp_1d, interp_length, {e}); + sycl::event copy_interp = data->sycl_queue.copy(interp_1d, impl->d_interp_1d, interp_length,{e}); CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device(interp_length, data->sycl_device, data->sycl_context)); - sycl::event copy_grad = data->sycl_queue.copy(grad_1d, impl->d_grad_1d, interp_length, {e}); + sycl::event copy_grad = data->sycl_queue.copy(grad_1d, impl->d_grad_1d, interp_length,{e}); CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad})); - std::vector kernel_ids = {sycl::get_kernel_id>(), sycl::get_kernel_id>(), - sycl::get_kernel_id>(), sycl::get_kernel_id>()}; + std::vector kernel_ids = { + sycl::get_kernel_id>(), + sycl::get_kernel_id>(), + sycl::get_kernel_id>(), + sycl::get_kernel_id>() + }; sycl::kernel_bundle input_bundle = sycl::get_kernel_bundle(data->sycl_context, kernel_ids); input_bundle.set_specialization_constant(dim); @@ -602,7 +605,7 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const input_bundle.set_specialization_constant(Q_1d); input_bundle.set_specialization_constant(P_1d); - CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle))); + CeedCallSycl(ceed,impl->sycl_module = new SyclModule_t(sycl::build(input_bundle))); CeedCallBackend(CeedBasisSetData(basis, impl)); @@ -636,15 +639,15 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device(num_qpts, data->sycl_device, data->sycl_context)); - sycl::event copy_weight = data->sycl_queue.copy(q_weight, impl->d_q_weight, num_qpts, {e}); + sycl::event copy_weight = data->sycl_queue.copy(q_weight, impl->d_q_weight, num_qpts,{e}); const CeedInt interp_length = num_qpts * num_nodes; CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device(interp_length, data->sycl_device, data->sycl_context)); - sycl::event copy_interp = data->sycl_queue.copy(interp, impl->d_interp, interp_length, {e}); + sycl::event copy_interp = data->sycl_queue.copy(interp, impl->d_interp, interp_length,{e}); const CeedInt grad_length = num_qpts * num_nodes * dim; CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device(grad_length, data->sycl_device, data->sycl_context)); - sycl::event copy_grad = data->sycl_queue.copy(grad, impl->d_grad, grad_length, {e}); + sycl::event copy_grad = data->sycl_queue.copy(grad, impl->d_grad, grad_length,{e}); CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad})); @@ -655,5 +658,4 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl)); return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp index 285d199ddd..6b97689dee 100644 --- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp @@ -348,7 +348,7 @@ static inline int CeedOperatorRestoreInputs_Sycl(CeedInt numinputfields, CeedQFu CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); if (emode == CEED_EVAL_WEIGHT) { // Skip } else { - if (!impl->evecs[i]) { // This was a skiprestrict case + if (!impl->evecs[i]) { // This was a skiprestrict case CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&edata[i])); } else { @@ -481,8 +481,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, Ceed ceed, ceedparent; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedGetOperatorFallbackParentCeed(ceed, &ceedparent)); - ceedparent = ceedparent ? ceedparent : ceed; - CeedScalar *edata[2 * CEED_FIELD_MAX] = {NULL}; + ceedparent = ceedparent ? ceedparent : ceed; + CeedScalar *edata[2 * CEED_FIELD_MAX]; // Setup CeedCallBackend(CeedOperatorSetup_Sycl(op)); @@ -787,7 +787,7 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const std::vector copy_events; if (evalNone) { CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity)); - for (CeedSize i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; + for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device(iLen, sycl_data->sycl_device, sycl_data->sycl_context)); sycl::event identity_copy = sycl_data->sycl_queue.copy(identity, diag->d_identity, iLen, {e}); copy_events.push_back(identity_copy); @@ -838,11 +838,11 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const //------------------------------------------------------------------------------ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool pointBlock, const CeedInt nelem, const CeedOperatorDiag_Sycl *diag, const CeedScalar *assembledqfarray, CeedScalar *elemdiagarray) { - const CeedSize nnodes = diag->nnodes; - const CeedSize nqpts = diag->nqpts; - const CeedSize ncomp = diag->ncomp; - const CeedSize numemodein = diag->numemodein; - const CeedSize numemodeout = diag->numemodeout; + const CeedInt nnodes = diag->nnodes; + const CeedInt nqpts = diag->nqpts; + const CeedInt ncomp = diag->ncomp; + const CeedInt numemodein = diag->numemodein; + const CeedInt numemodeout = diag->numemodeout; const CeedScalar *identity = diag->d_identity; const CeedScalar *interpin = diag->d_interpin; @@ -864,23 +864,23 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool p // Each element CeedInt dout = -1; // Each basis eval mode pair - for (CeedSize eout = 0; eout < numemodeout; eout++) { + for (CeedInt eout = 0; eout < numemodeout; eout++) { const CeedScalar *bt = NULL; if (emodeout[eout] == CEED_EVAL_GRAD) ++dout; CeedOperatorGetBasisPointer_Sycl(&bt, emodeout[eout], identity, interpout, &gradout[dout * nqpts * nnodes]); CeedInt din = -1; - for (CeedSize ein = 0; ein < numemodein; ein++) { + for (CeedInt ein = 0; ein < numemodein; ein++) { const CeedScalar *b = NULL; if (emodein[ein] == CEED_EVAL_GRAD) ++din; CeedOperatorGetBasisPointer_Sycl(&b, emodein[ein], identity, interpin, &gradin[din * nqpts * nnodes]); // Each component - for (CeedSize compOut = 0; compOut < ncomp; compOut++) { + for (CeedInt compOut = 0; compOut < ncomp; compOut++) { // Each qpoint/node pair if (pointBlock) { // Point Block Diagonal for (CeedInt compIn = 0; compIn < ncomp; compIn++) { CeedScalar evalue = 0.0; - for (CeedSize q = 0; q < nqpts; q++) { + for (CeedInt q = 0; q < nqpts; q++) { const CeedScalar qfvalue = assembledqfarray[((((ein * ncomp + compIn) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q]; evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid]; @@ -890,7 +890,7 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool p } else { // Diagonal Only CeedScalar evalue = 0.0; - for (CeedSize q = 0; q < nqpts; q++) { + for (CeedInt q = 0; q < nqpts; q++) { const CeedScalar qfvalue = assembledqfarray[((((ein * ncomp + compOut) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q]; evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid]; @@ -916,8 +916,8 @@ static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVec CeedCallBackend(CeedGetData(ceed, &sycl_data)); // Assemble QFunction - CeedVector assembledqf = NULL; - CeedElemRestriction rstr = NULL; + CeedVector assembledqf; + CeedElemRestriction rstr; CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request)); CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); @@ -1187,23 +1187,23 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp // TODO: expand to more general cases CeedOperatorAssemble_Sycl *asmb = impl->asmb; const CeedInt nelem = asmb->nelem; - const CeedSize nnodes = asmb->nnodes; - const CeedSize ncomp = asmb->ncomp; - const CeedSize nqpts = asmb->nqpts; - const CeedSize numemodein = asmb->numemodein; - const CeedSize numemodeout = asmb->numemodeout; + const CeedInt nnodes = asmb->nnodes; + const CeedInt ncomp = asmb->ncomp; + const CeedInt nqpts = asmb->nqpts; + const CeedInt numemodein = asmb->numemodein; + const CeedInt numemodeout = asmb->numemodeout; // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element, // comp_in, comp_out, node_row, node_col - const CeedSize comp_out_stride = nnodes * nnodes; - const CeedSize comp_in_stride = comp_out_stride * ncomp; - const CeedSize e_stride = comp_in_stride * ncomp; + const CeedInt comp_out_stride = nnodes * nnodes; + const CeedInt comp_in_stride = comp_out_stride * ncomp; + const CeedInt e_stride = comp_in_stride * ncomp; // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt - const CeedSize qe_stride = nqpts; - const CeedSize qcomp_out_stride = nelem * qe_stride; - const CeedSize qemode_out_stride = qcomp_out_stride * ncomp; - const CeedSize qcomp_in_stride = qemode_out_stride * numemodeout; - const CeedSize qemode_in_stride = qcomp_in_stride * ncomp; + const CeedInt qe_stride = nqpts; + const CeedInt qcomp_out_stride = nelem * qe_stride; + const CeedInt qemode_out_stride = qcomp_out_stride * ncomp; + const CeedInt qcomp_in_stride = qemode_out_stride * numemodeout; + const CeedInt qemode_in_stride = qcomp_in_stride * ncomp; CeedScalar *B_in, *B_out; B_in = asmb->d_B_in; @@ -1220,22 +1220,22 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp const int l = idx.get(1); // The output column index of each B^TDB operation const int i = idx.get(2); // The output row index of each B^TDB operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il - for (CeedSize comp_in = 0; comp_in < ncomp; comp_in++) { - for (CeedSize comp_out = 0; comp_out < ncomp; comp_out++) { + for (CeedInt comp_in = 0; comp_in < ncomp; comp_in++) { + for (CeedInt comp_out = 0; comp_out < ncomp; comp_out++) { CeedScalar result = 0.0; - CeedSize qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; - for (CeedSize emode_in = 0; emode_in < numemodein; emode_in++) { - CeedSize b_in_index = emode_in * nqpts * nnodes; - for (CeedSize emode_out = 0; emode_out < numemodeout; emode_out++) { - CeedSize b_out_index = emode_out * nqpts * nnodes; - CeedSize qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; + for (CeedInt emode_in = 0; emode_in < numemodein; emode_in++) { + CeedInt b_in_index = emode_in * nqpts * nnodes; + for (CeedInt emode_out = 0; emode_out < numemodeout; emode_out++) { + CeedInt b_out_index = emode_out * nqpts * nnodes; + CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; // Perform the B^T D B operation for this 'chunk' of D (the qf_array) - for (CeedSize j = 0; j < nqpts; j++) { + for (CeedInt j = 0; j < nqpts; j++) { result += B_out[b_out_index + j * nnodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * nnodes + l]; } } // end of emode_out } // end of emode_in - CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l; + CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l; values_array[val_index] = result; } // end of out component } // end of in component @@ -1343,8 +1343,8 @@ static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, Ceed } // Assemble QFunction - CeedVector assembled_qf = NULL; - CeedElemRestriction rstr_q = NULL; + CeedVector assembled_qf; + CeedElemRestriction rstr_q; CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); CeedScalar *values_array; diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp index b53d46a71f..d23cbe36ee 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.hpp @@ -8,6 +8,6 @@ #ifndef _ceed_sycl_qfunction_load_hpp #define _ceed_sycl_qfunction_load_hpp -CEED_INTERN int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf); +CEED_INTERN int CeedSyclBuildQFunction(CeedQFunction qf); #endif // _ceed_sycl_qfunction_load_hpp diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp index 35b3699518..c1cbae731f 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp @@ -26,7 +26,7 @@ // // TODO: Refactor //------------------------------------------------------------------------------ -extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { +extern "C" int CeedSyclBuildQFunction(CeedQFunction qf) { CeedQFunction_Sycl* impl; CeedCallBackend(CeedQFunctionGetData(qf, (void**)&impl)); // QFunction is built @@ -60,17 +60,17 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { CeedCallBackend(CeedQFunctionGetKernelName(qf, &qfunction_name)); char* qfunction_source; - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n"); + CeedDebug256(ceed, 2, "----- Loading QFunction User Source -----\n"); CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &qfunction_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n"); + CeedDebug256(ceed, 2, "----- Loading QFunction User Source Complete! -----\n"); char* read_write_kernel_path; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-ref-qfunction.h", &read_write_kernel_path)); char* read_write_kernel_source; - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n"); + CeedDebug256(ceed, 2, "----- Loading QFunction Read/Write Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n"); + CeedDebug256(ceed, 2, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n"); std::string_view qf_name_view(qfunction_name); std::string_view qf_source_view(qfunction_source); @@ -86,8 +86,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { // Kernel function // Here we are fixing a lower sub-group size value to avoid register spills // This needs to be revisited if all qfunctions require this. - code << "__attribute__((intel_reqd_sub_group_size(" << SUB_GROUP_SIZE_QF << "))) __kernel void " << kernel_name - << "(__global void *ctx, CeedInt Q,\n"; + code << "__attribute__((intel_reqd_sub_group_size(" << SUB_GROUP_SIZE_QF << "))) __kernel void " << kernel_name << "(__global void *ctx, CeedInt Q,\n"; // OpenCL doesn't allow for structs with pointers. // We will need to pass all of the arguments individually. @@ -158,12 +157,12 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { code << "}\n"; // View kernel for debugging - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated QFunction Kernels:\n"); + CeedDebug256(ceed, 2, "Generated QFunction Kernels:\n"); CeedDebug(ceed, code.str().c_str()); // Compile kernel - CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module)); - CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, kernel_name, &impl->QFunction)); + CeedCallBackend(CeedJitBuildModule_Sycl(ceed, code.str(), &impl->sycl_module)); + CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, kernel_name, &impl->QFunction)); // Cleanup CeedCallBackend(CeedFree(&qfunction_source)); @@ -172,5 +171,4 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp index 10e090b58c..6822c333ba 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp @@ -28,7 +28,7 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C CeedCallBackend(CeedQFunctionGetData(qf, &impl)); // Build and compile kernel, if not done - if (!impl->QFunction) CeedCallBackend(CeedQFunctionBuildKernel_Sycl(qf)); + if (!impl->QFunction) CeedCallBackend(CeedSyclBuildQFunction(qf)); Ceed ceed; CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); @@ -80,8 +80,8 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C // Hard-coding the work-group size for now // We could use the Level Zero API to query and set an appropriate size in future // Equivalent of CUDA Occupancy Calculator - int wg_size = WG_SIZE_QF; - sycl::range<1> rounded_Q = ((Q + (wg_size - 1)) / wg_size) * wg_size; + int wg_size = WG_SIZE_QF; + sycl::range<1> rounded_Q = ((Q + (wg_size- 1)) / wg_size) * wg_size; sycl::nd_range<1> kernel_range(rounded_Q, wg_size); cgh.parallel_for(kernel_range, *(impl->QFunction)); }); @@ -139,5 +139,4 @@ int CeedQFunctionCreate_Sycl(CeedQFunction qf) { CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl)); return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp index 0c6d2a0637..b3848d9883 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp @@ -269,7 +269,7 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Sycl *impl; CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - + Ceed_Sycl *ceedSycl; CeedCallBackend(CeedGetData(ceed, &ceedSycl)); @@ -399,5 +399,4 @@ int CeedQFunctionContextCreate_Sycl(CeedQFunctionContext ctx) { return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp index 8cdaa3531e..9c9be231c2 100644 --- a/backends/sycl-ref/ceed-sycl-ref.hpp +++ b/backends/sycl-ref/ceed-sycl-ref.hpp @@ -44,18 +44,18 @@ typedef struct { } CeedElemRestriction_Sycl; typedef struct { - CeedInt dim; - CeedInt P_1d; - CeedInt Q_1d; - CeedInt num_comp; - CeedInt num_nodes; - CeedInt num_qpts; - CeedInt buf_len; - CeedInt op_len; + CeedInt dim; + CeedInt P_1d; + CeedInt Q_1d; + CeedInt num_comp; + CeedInt num_nodes; + CeedInt num_qpts; + CeedInt buf_len; + CeedInt op_len; SyclModule_t *sycl_module; - CeedScalar *d_interp_1d; - CeedScalar *d_grad_1d; - CeedScalar *d_q_weight_1d; + CeedScalar *d_interp_1d; + CeedScalar *d_grad_1d; + CeedScalar *d_q_weight_1d; } CeedBasis_Sycl; typedef struct { @@ -112,16 +112,25 @@ typedef struct { CeedOperatorAssemble_Sycl *asmb; } CeedOperator_Sycl; +// CEED_INTERN int CeedSyclGetCublasHandle(Ceed ceed, cublasHandle_t *handle); + CEED_INTERN int CeedVectorCreate_Sycl(CeedSize n, CeedVector vec); +CEED_INTERN int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r); + +CEED_INTERN int CeedElemRestrictionCreateBlocked_Sycl(const CeedMemType mem_type, const CeedCopyMode copy_mode, const CeedInt *indices, + const CeedElemRestriction res); + +CEED_INTERN int CeedBasisApplyElems_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + const CeedVector u, CeedVector v); + +CEED_INTERN int CeedQFunctionApplyElems_Sycl(CeedQFunction qf, const CeedInt Q, const CeedVector *const u, const CeedVector *v); + CEED_INTERN int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *qref_1d, const CeedScalar *qweight_1d, CeedBasis basis); -CEED_INTERN int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis); - -CEED_INTERN int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, - const CeedInt8 *curl_orients, CeedElemRestriction r); +CEED_INTERN int CeedBasisCreateH1_Sycl(CeedElemTopology, CeedInt, CeedInt, CeedInt, const CeedScalar *, const CeedScalar *, const CeedScalar *, + const CeedScalar *, CeedBasis); CEED_INTERN int CeedQFunctionCreate_Sycl(CeedQFunction qf); diff --git a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp index 65d4fead36..b0e45614f1 100644 --- a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp @@ -25,25 +25,28 @@ static int CeedGetPreferredMemType_Sycl(CeedMemType *mem_type) { //------------------------------------------------------------------------------ // Backend Init //------------------------------------------------------------------------------ -static int CeedInit_Sycl_ref(const char *resource, Ceed ceed) { +static int CeedInit_Sycl(const char *resource, Ceed ceed) { char *resource_root; - CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); - CeedCheck(!std::strcmp(resource_root, "/gpu/sycl/ref") || !std::strcmp(resource_root, "/cpu/sycl/ref"), ceed, CEED_ERROR_BACKEND, - "Sycl backend cannot use resource: %s", resource); + CeedCallBackend(CeedSyclGetResourceRoot(ceed, resource, &resource_root)); + if (std::strcmp(resource_root, "/gpu/sycl/ref") && std::strcmp(resource_root, "/cpu/sycl/ref")) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedSetDeterministic(ceed, true)); Ceed_Sycl *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); - CeedCallBackend(CeedInit_Sycl(ceed, resource)); + CeedCallBackend(CeedSyclInit(ceed, resource)); - CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "SetStream", CeedSetStream_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "GetPreferredMemType", CeedGetPreferredMemType_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "VectorCreate", &CeedVectorCreate_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "BasisCreateTensorH1", &CeedBasisCreateTensorH1_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "BasisCreateH1", &CeedBasisCreateH1_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "ElemRestrictionCreate", &CeedElemRestrictionCreate_Sycl)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "ElemRestrictionCreateBlocked", &CeedElemRestrictionCreateBlocked_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", &CeedQFunctionCreate_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionContextCreate", &CeedQFunctionContextCreate_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", &CeedOperatorCreate_Sycl)); @@ -56,9 +59,8 @@ static int CeedInit_Sycl_ref(const char *resource, Ceed ceed) { // Backend Register //------------------------------------------------------------------------------ CEED_INTERN int CeedRegister_Sycl(void) { - CeedCallBackend(CeedRegister("/gpu/sycl/ref", CeedInit_Sycl_ref, 40)); - CeedCallBackend(CeedRegister("/cpu/sycl/ref", CeedInit_Sycl_ref, 50)); + CeedCallBackend(CeedRegister("/gpu/sycl/ref", CeedInit_Sycl, 40)); + CeedCallBackend(CeedRegister("/cpu/sycl/ref", CeedInit_Sycl, 50)); return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp index ef0a1f4c47..4407791703 100644 --- a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp @@ -196,6 +196,18 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction r, CeedTransposeMod return CEED_ERROR_SUCCESS; } +//------------------------------------------------------------------------------ +// Blocked not supported +//------------------------------------------------------------------------------ +int CeedElemRestrictionApplyBlock_Sycl(CeedElemRestriction r, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v, + CeedRequest *request) { + // LCOV_EXCL_START + Ceed ceed; + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); + // LCOV_EXCL_STOP +} + //------------------------------------------------------------------------------ // Get offsets //------------------------------------------------------------------------------ @@ -333,8 +345,7 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const Cee //------------------------------------------------------------------------------ // Create restriction //------------------------------------------------------------------------------ -int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, - const CeedInt8 *curl_orients, CeedElemRestriction r) { +int CeedElemRestrictionCreate_Sycl(CeedMemType m_type, CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r) { Ceed ceed; CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); Ceed_Sycl *data; @@ -349,11 +360,6 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, CeedInt strides[3] = {1, size, elem_size}; CeedInt comp_stride = 1; - CeedRestrictionType rstr_type; - CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); - CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, - "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); - // Stride data bool is_strided; CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); @@ -386,7 +392,7 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); // Set up device indices/offset arrays - if (mem_type == CEED_MEM_HOST) { + if (m_type == CEED_MEM_HOST) { switch (copy_mode) { case CEED_OWN_POINTER: impl->h_ind_allocated = (CeedInt *)indices; @@ -414,7 +420,7 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallSycl(ceed, copy_event.wait_and_throw()); CeedCallBackend(CeedElemRestrictionOffset_Sycl(r, indices)); } - } else if (mem_type == CEED_MEM_DEVICE) { + } else if (m_type == CEED_MEM_DEVICE) { switch (copy_mode) { case CEED_COPY_VALUES: if (indices != NULL) { @@ -453,9 +459,18 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, // Register backend functions CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Sycl)); - CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Sycl)); - CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Sycl)); + CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyBlock", CeedElemRestrictionApplyBlock_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Sycl)); return CEED_ERROR_SUCCESS; } + +//------------------------------------------------------------------------------ +// Blocked not supported +//------------------------------------------------------------------------------ +int CeedElemRestrictionCreateBlocked_Sycl(const CeedMemType m_type, const CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r) { + Ceed ceed; + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); +} +//------------------------------------------------------------------------------ diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp index 5c0d8b5f4b..d806e37e0a 100644 --- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp @@ -84,7 +84,11 @@ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) { Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); - CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); + if (!impl->d_array) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); + // LCOV_EXCL_STOP + } CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); @@ -306,15 +310,15 @@ static int CeedVectorSetArray_Sycl(const CeedVector vec, const CeedMemType mem_t //------------------------------------------------------------------------------ // Set host array to value //------------------------------------------------------------------------------ -static int CeedHostSetValue_Sycl(CeedScalar *h_array, CeedSize length, CeedScalar val) { - for (CeedSize i = 0; i < length; i++) h_array[i] = val; +static int CeedHostSetValue_Sycl(CeedScalar *h_array, CeedInt length, CeedScalar val) { + for (int i = 0; i < length; i++) h_array[i] = val; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Set device array to value //------------------------------------------------------------------------------ -static int CeedDeviceSetValue_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedSize length, CeedScalar val) { +static int CeedDeviceSetValue_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedInt length, CeedScalar val) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.fill(d_array, val, length, {e}); @@ -420,7 +424,6 @@ static int CeedVectorGetArrayCore_Sycl(const CeedVector vec, const CeedMemType m return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ // Get read-only access to a vector via the specified mem_type //------------------------------------------------------------------------------ @@ -505,13 +508,15 @@ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *no case CEED_NORM_2: { // Order queue sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); - auto sumReduction = sycl::reduction(impl->reduction_norm, sycl::plus<>(), {sycl::property::reduction::initialize_to_identity{}}); + auto sumReduction = sycl::reduction(impl->reduction_norm, sycl::plus<>(), + {sycl::property::reduction::initialize_to_identity{}}); data->sycl_queue.parallel_for(length, {e}, sumReduction, [=](sycl::id<1> i, auto &sum) { sum += (d_array[i] * d_array[i]); }).wait_and_throw(); } break; case CEED_NORM_MAX: { // Order queue sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); - auto maxReduction = sycl::reduction(impl->reduction_norm, sycl::maximum<>(), {sycl::property::reduction::initialize_to_identity{}}); + auto maxReduction = sycl::reduction(impl->reduction_norm, sycl::maximum<>(), + {sycl::property::reduction::initialize_to_identity{}}); data->sycl_queue.parallel_for(length, {e}, maxReduction, [=](sycl::id<1> i, auto &max) { max.combine(abs(d_array[i])); }).wait_and_throw(); } break; } @@ -527,8 +532,8 @@ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *no //------------------------------------------------------------------------------ // Take reciprocal of a vector on host //------------------------------------------------------------------------------ -static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedSize length) { - for (CeedSize i = 0; i < length; i++) { +static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedInt length) { + for (int i = 0; i < length; i++) { if (std::fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i]; } return CEED_ERROR_SUCCESS; @@ -537,7 +542,7 @@ static int CeedHostReciprocal_Sycl(CeedScalar *h_array, CeedSize length) { //------------------------------------------------------------------------------ // Take reciprocal of a vector on device //------------------------------------------------------------------------------ -static int CeedDeviceReciprocal_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedSize length) { +static int CeedDeviceReciprocal_Sycl(sycl::queue &sycl_queue, CeedScalar *d_array, CeedInt length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { @@ -569,15 +574,15 @@ static int CeedVectorReciprocal_Sycl(CeedVector vec) { //------------------------------------------------------------------------------ // Compute x = alpha x on the host //------------------------------------------------------------------------------ -static int CeedHostScale_Sycl(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { - for (CeedSize i = 0; i < length; i++) x_array[i] *= alpha; +static int CeedHostScale_Sycl(CeedScalar *x_array, CeedScalar alpha, CeedInt length) { + for (int i = 0; i < length; i++) x_array[i] *= alpha; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute x = alpha x on device //------------------------------------------------------------------------------ -static int CeedDeviceScale_Sycl(sycl::queue &sycl_queue, CeedScalar *x_array, CeedScalar alpha, CeedSize length) { +static int CeedDeviceScale_Sycl(sycl::queue &sycl_queue, CeedScalar *x_array, CeedScalar alpha, CeedInt length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { x_array[i] *= alpha; }); @@ -607,15 +612,15 @@ static int CeedVectorScale_Sycl(CeedVector x, CeedScalar alpha) { //------------------------------------------------------------------------------ // Compute y = alpha x + y on the host //------------------------------------------------------------------------------ -static int CeedHostAXPY_Sycl(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { - for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i]; +static int CeedHostAXPY_Sycl(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) { + for (int i = 0; i < length; i++) y_array[i] += alpha * x_array[i]; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute y = alpha x + y on device //------------------------------------------------------------------------------ -static int CeedDeviceAXPY_Sycl(sycl::queue &sycl_queue, CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { +static int CeedDeviceAXPY_Sycl(sycl::queue &sycl_queue, CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { y_array[i] += alpha * x_array[i]; }); @@ -652,15 +657,15 @@ static int CeedVectorAXPY_Sycl(CeedVector y, CeedScalar alpha, CeedVector x) { //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on the host //------------------------------------------------------------------------------ -static int CeedHostPointwiseMult_Sycl(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { - for (CeedSize i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i]; +static int CeedHostPointwiseMult_Sycl(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) { + for (int i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i]; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on device (impl in .cu file) //------------------------------------------------------------------------------ -static int CeedDevicePointwiseMult_Sycl(sycl::queue &sycl_queue, CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { +static int CeedDevicePointwiseMult_Sycl(sycl::queue &sycl_queue, CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) { // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(length, {e}, [=](sycl::id<1> i) { w_array[i] = x_array[i] * y_array[i]; }); diff --git a/backends/sycl-ref/kernels/sycl-ref-vector.cpp b/backends/sycl-ref/kernels/sycl-ref-vector.hpp similarity index 98% rename from backends/sycl-ref/kernels/sycl-ref-vector.cpp rename to backends/sycl-ref/kernels/sycl-ref-vector.hpp index 308d74d209..855bddead9 100644 --- a/backends/sycl-ref/kernels/sycl-ref-vector.cpp +++ b/backends/sycl-ref/kernels/sycl-ref-vector.hpp @@ -5,8 +5,11 @@ // SPDX-License-Identifier: BSD-2-Clause // // This file is part of CEED: http://github.com/ceed +#ifndef _ceed_sycl_kernels_ref_vector_hpp +#define _ceed_sycl_kernels_ref_vector_hpp #include + #include //------------------------------------------------------------------------------ @@ -119,4 +122,4 @@ extern "C" int CeedDevicePointwiseMult_Sycl(CeedScalar *w_array, CeedScalar *x_a return 0; } -//------------------------------------------------------------------------------ +#endif diff --git a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp index 1d3149d087..bbdcf9b01d 100644 --- a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp +++ b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp @@ -19,18 +19,17 @@ //------------------------------------------------------------------------------ // Compute the local range of for basis kernels //------------------------------------------------------------------------------ -static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt *local_range, CeedInt max_group_size = 128) { - local_range[0] = thread_1d; - local_range[1] = (dim > 2 ? thread_1d : 1) * (dim > 1 ? thread_1d : 1); +static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt *local_range, CeedInt max_group_size = 256) { + local_range[0] = thread_1d; + local_range[1] = (dim > 1) ? thread_1d : 1; + const CeedInt min_group_size = local_range[0] * local_range[1]; + if (min_group_size > max_group_size) { + return CeedError(ceed, CEED_ERROR_BACKEND,"Requested group size is smaller than the required minimum."); + } - if (min_group_size > max_group_size) max_group_size = 256; - if (min_group_size > max_group_size) max_group_size = 512; - if (min_group_size > max_group_size) max_group_size = 1024; - CeedCheck(min_group_size <= max_group_size, ceed, CEED_ERROR_BACKEND, "Requested group size is smaller than the required minimum."); - - local_range[2] = max_group_size / min_group_size; // elements per group + local_range[2] = max_group_size / min_group_size; //elements per group return CEED_ERROR_SUCCESS; } @@ -38,7 +37,7 @@ static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt // Apply basis //------------------------------------------------------------------------------ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { + CeedVector v) { Ceed ceed; CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); Ceed_Sycl *ceed_Sycl; @@ -57,62 +56,62 @@ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, Ce // Apply basis operation switch (eval_mode) { case CEED_EVAL_INTERP: { - CeedInt *lrange = impl->interp_local_range; - const CeedInt &elem_per_group = lrange[2]; - const CeedInt group_count = (num_elem / elem_per_group) + !!(num_elem % elem_per_group); + CeedInt* lrange = impl->interp_local_range; + const CeedInt& elem_per_group = lrange[2]; + const CeedInt group_count = (num_elem / elem_per_group) + !!(num_elem % elem_per_group); //----------- - sycl::range<3> local_range(lrange[2], lrange[1], lrange[0]); - sycl::range<3> global_range(group_count * lrange[2], lrange[1], lrange[0]); - sycl::nd_range<3> kernel_range(global_range, local_range); + sycl::range<3> local_range(lrange[2],lrange[1],lrange[0]); + sycl::range<3> global_range(group_count * lrange[2],lrange[1],lrange[0]); + sycl::nd_range<3> kernel_range(global_range,local_range); //----------- - sycl::kernel *interp_kernel = (t_mode == CEED_TRANSPOSE) ? impl->interp_transpose_kernel : impl->interp_kernel; + sycl::kernel* interp_kernel = (t_mode == CEED_TRANSPOSE) ? impl->interp_transpose_kernel : impl->interp_kernel; - // Order queue + //Order queue sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier(); - - ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { + + ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){ cgh.depends_on(e); cgh.set_args(num_elem, impl->d_interp_1d, d_u, d_v); - cgh.parallel_for(kernel_range, *interp_kernel); + cgh.parallel_for(kernel_range,*interp_kernel); }); } break; case CEED_EVAL_GRAD: { - CeedInt *lrange = impl->grad_local_range; - const CeedInt &elem_per_group = lrange[2]; - const CeedInt group_count = (num_elem / elem_per_group) + !!(num_elem % elem_per_group); + CeedInt* lrange = impl->grad_local_range; + const CeedInt& elem_per_group = lrange[2]; + const CeedInt group_count = (num_elem / elem_per_group) + !!(num_elem % elem_per_group); //----------- - sycl::range<3> local_range(lrange[2], lrange[1], lrange[0]); - sycl::range<3> global_range(group_count * lrange[2], lrange[1], lrange[0]); - sycl::nd_range<3> kernel_range(global_range, local_range); + sycl::range<3> local_range(lrange[2],lrange[1],lrange[0]); + sycl::range<3> global_range(group_count * lrange[2],lrange[1],lrange[0]); + sycl::nd_range<3> kernel_range(global_range,local_range); //----------- - sycl::kernel *grad_kernel = (t_mode == CEED_TRANSPOSE) ? impl->grad_transpose_kernel : impl->grad_kernel; - const CeedScalar *d_grad_1d = (impl->d_collo_grad_1d) ? impl->d_collo_grad_1d : impl->d_grad_1d; - // Order queue + sycl::kernel* grad_kernel = (t_mode == CEED_TRANSPOSE) ? impl->grad_transpose_kernel : impl->grad_kernel; + const CeedScalar* d_grad_1d = (impl->d_collo_grad_1d) ? impl->d_collo_grad_1d : impl->d_grad_1d; + //Order queue sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier(); - - ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { + + ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){ cgh.depends_on(e); cgh.set_args(num_elem, impl->d_interp_1d, d_grad_1d, d_u, d_v); - cgh.parallel_for(kernel_range, *grad_kernel); + cgh.parallel_for(kernel_range,*grad_kernel); }); } break; case CEED_EVAL_WEIGHT: { - CeedInt *lrange = impl->weight_local_range; - const CeedInt &elem_per_group = lrange[2]; - const CeedInt group_count = (num_elem / elem_per_group) + !!(num_elem % elem_per_group); + CeedInt* lrange = impl->weight_local_range; + const CeedInt& elem_per_group = lrange[2]; + const CeedInt group_count = (num_elem / elem_per_group) + !!(num_elem % elem_per_group); //----------- - sycl::range<3> local_range(lrange[2], lrange[1], lrange[0]); - sycl::range<3> global_range(group_count * lrange[2], lrange[1], lrange[0]); - sycl::nd_range<3> kernel_range(global_range, local_range); + sycl::range<3> local_range(lrange[2],lrange[1],lrange[0]); + sycl::range<3> global_range(group_count * lrange[2],lrange[1],lrange[0]); + sycl::nd_range<3> kernel_range(global_range,local_range); //----------- - // Order queue + //Order queue sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier(); - - ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { + + ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){ cgh.depends_on(e); cgh.set_args(num_elem, impl->d_q_weight_1d, d_v); - cgh.parallel_for(kernel_range, *(impl->weight_kernel)); + cgh.parallel_for(kernel_range,*(impl->weight_kernel)); }); } break; // LCOV_EXCL_START @@ -148,9 +147,9 @@ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) { CeedCallBackend(CeedGetData(ceed, &data)); CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); - CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context)); - CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context)); - CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context)); + CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context)); + CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context)); + CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->d_collo_grad_1d, data->sycl_context)); delete impl->interp_kernel; @@ -183,17 +182,17 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); const CeedInt num_nodes = CeedIntPow(P_1d, dim); - const CeedInt num_qpts = CeedIntPow(Q_1d, dim); - - CeedInt *interp_lrange = impl->interp_local_range; - CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, interp_lrange)); - const CeedInt interp_group_size = interp_lrange[0] * interp_lrange[1] * interp_lrange[2]; + const CeedInt num_qpts = CeedIntPow(Q_1d, dim); - CeedInt *grad_lrange = impl->grad_local_range; - CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, grad_lrange)); - const CeedInt grad_group_size = grad_lrange[0] * grad_lrange[1] * grad_lrange[2]; + CeedInt* interp_lrange = impl->interp_local_range; + CeedCallBackend(ComputeLocalRange(ceed,dim,thread_1d,interp_lrange)); + const CeedInt interp_group_size = interp_lrange[0] * interp_lrange[1] * interp_lrange[2]; - CeedCallBackend(ComputeLocalRange(ceed, dim, Q_1d, impl->weight_local_range)); + CeedInt* grad_lrange = impl->grad_local_range; + CeedCallBackend(ComputeLocalRange(ceed,dim,thread_1d,grad_lrange)); + const CeedInt grad_group_size = grad_lrange[0] * grad_lrange[1] * grad_lrange[2]; + + CeedCallBackend(ComputeLocalRange(ceed,dim,Q_1d,impl->weight_local_range)); // Copy basis data to GPU CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device(Q_1d, data->sycl_device, data->sycl_context)); @@ -209,48 +208,48 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad})); // Compute collocated gradient and copy to GPU - impl->d_collo_grad_1d = NULL; + impl->d_collo_grad_1d = NULL; const bool has_collocated_grad = (dim == 3) && (Q_1d >= P_1d); if (has_collocated_grad) { CeedScalar *collo_grad_1d; CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); const CeedInt cgrad_length = Q_1d * Q_1d; - CeedCallSycl(ceed, impl->d_collo_grad_1d = sycl::malloc_device(cgrad_length, data->sycl_device, data->sycl_context)); - CeedCallSycl(ceed, data->sycl_queue.copy(collo_grad_1d, impl->d_collo_grad_1d, cgrad_length).wait_and_throw()); + CeedCallSycl(ceed, impl->d_collo_grad_1d = sycl::malloc_device(cgrad_length,data->sycl_device,data->sycl_context)); + CeedCallSycl(ceed, data->sycl_queue.copy(collo_grad_1d,impl->d_collo_grad_1d,cgrad_length).wait_and_throw()); CeedCallBackend(CeedFree(&collo_grad_1d)); } // ---[Refactor into separate function]------> // Define compile-time constants std::map jit_constants; - jit_constants["BASIS_DIM"] = dim; - jit_constants["BASIS_Q_1D"] = Q_1d; - jit_constants["BASIS_P_1D"] = P_1d; - jit_constants["T_1D"] = thread_1d; - jit_constants["BASIS_NUM_COMP"] = num_comp; - jit_constants["BASIS_NUM_NODES"] = num_nodes; - jit_constants["BASIS_NUM_QPTS"] = num_qpts; + jit_constants["BASIS_DIM"] = dim; + jit_constants["BASIS_Q_1D"] = Q_1d; + jit_constants["BASIS_P_1D"] = P_1d; + jit_constants["T_1D"] = thread_1d; + jit_constants["BASIS_NUM_COMP"] = num_comp; + jit_constants["BASIS_NUM_NODES"] = num_nodes; + jit_constants["BASIS_NUM_QPTS"] = num_qpts; jit_constants["BASIS_HAS_COLLOCATED_GRAD"] = has_collocated_grad; jit_constants["BASIS_INTERP_SCRATCH_SIZE"] = interp_group_size; - jit_constants["BASIS_GRAD_SCRATCH_SIZE"] = grad_group_size; + jit_constants["BASIS_GRAD_SCRATCH_SIZE"] = grad_group_size; // Load kernel source char *basis_kernel_path, *basis_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete -----\n"); - + CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n"); + // Compile kernels into a kernel bundle - CeedCallBackend(CeedBuildModule_Sycl(ceed, basis_kernel_source, &impl->sycl_module, jit_constants)); + CeedCallBackend(CeedJitBuildModule_Sycl(ceed, basis_kernel_source, &impl->sycl_module,jit_constants)); // Load kernel functions - CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "Interp", &impl->interp_kernel)); - CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "InterpTranspose", &impl->interp_transpose_kernel)); - CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "Grad", &impl->grad_kernel)); - CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "GradTranspose", &impl->grad_transpose_kernel)); - CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, "Weight", &impl->weight_kernel)); + CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, "Interp", &impl->interp_kernel)); + CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, "InterpTranspose", &impl->interp_transpose_kernel)); + CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, "Grad", &impl->grad_kernel)); + CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, "GradTranspose", &impl->grad_transpose_kernel)); + CeedCallBackend(CeedJitGetKernel_Sycl(ceed, impl->sycl_module, "Weight", &impl->weight_kernel)); // Clean-up CeedCallBackend(CeedFree(&basis_kernel_path)); @@ -264,5 +263,4 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl_shared)); return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-shared/ceed-sycl-shared.hpp b/backends/sycl-shared/ceed-sycl-shared.hpp index 79ef10d4ab..8ee7b2b0ce 100644 --- a/backends/sycl-shared/ceed-sycl-shared.hpp +++ b/backends/sycl-shared/ceed-sycl-shared.hpp @@ -17,19 +17,19 @@ #include "../sycl/ceed-sycl-compile.hpp" typedef struct { - CeedInt interp_local_range[3]; - CeedInt grad_local_range[3]; - CeedInt weight_local_range[3]; + CeedInt interp_local_range[3]; + CeedInt grad_local_range[3]; + CeedInt weight_local_range[3]; SyclModule_t *sycl_module; sycl::kernel *interp_kernel; sycl::kernel *interp_transpose_kernel; sycl::kernel *grad_kernel; sycl::kernel *grad_transpose_kernel; sycl::kernel *weight_kernel; - CeedScalar *d_interp_1d; - CeedScalar *d_grad_1d; - CeedScalar *d_collo_grad_1d; // eliminate - CeedScalar *d_q_weight_1d; + CeedScalar *d_interp_1d; + CeedScalar *d_grad_1d; + CeedScalar *d_collo_grad_1d; //eliminate + CeedScalar *d_q_weight_1d; } CeedBasis_Sycl_shared; CEED_INTERN int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, diff --git a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp index be6b5c8d42..5898fb6388 100644 --- a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp +++ b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp @@ -19,9 +19,12 @@ //------------------------------------------------------------------------------ static int CeedInit_Sycl_shared(const char *resource, Ceed ceed) { char *resource_root; - CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); - CeedCheck(!std::strcmp(resource_root, "/gpu/sycl/shared") || !std::strcmp(resource_root, "/cpu/sycl/shared"), ceed, CEED_ERROR_BACKEND, - "Sycl backend cannot use resource: %s", resource); + CeedCallBackend(CeedSyclGetResourceRoot(ceed, resource, &resource_root)); + if (std::strcmp(resource_root, "/gpu/sycl/shared") && std::strcmp(resource_root, "/cpu/sycl/shared")) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } std::string_view root_view = resource_root; auto suffix_length = root_view.size() - root_view.rfind("shared"); @@ -32,23 +35,23 @@ static int CeedInit_Sycl_shared(const char *resource, Ceed ceed) { CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedSetDeterministic(ceed, true)); - + Ceed_Sycl *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); - CeedCallBackend(CeedInit_Sycl(ceed, resource)); + CeedCallBackend(CeedSyclInit(ceed, resource)); Ceed ceed_ref; CeedCallBackend(CeedInit(ref_resource.str().c_str(), &ceed_ref)); - + Ceed_Sycl *ref_data; CeedCallBackend(CeedGetData(ceed_ref, &ref_data)); - + // Need to use the same queue everywhere for correct synchronization ref_data->sycl_queue = data->sycl_queue; ref_data->sycl_context = data->sycl_context; ref_data->sycl_device = data->sycl_device; - + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Sycl_shared)); @@ -64,5 +67,4 @@ CEED_INTERN int CeedRegister_Sycl_Shared(void) { CeedCallBackend(CeedRegister("/cpu/sycl/shared", CeedInit_Sycl_shared, 35)); return CEED_ERROR_SUCCESS; } - //------------------------------------------------------------------------------ diff --git a/backends/sycl-shared/kernels/sycl-shared-basis.hpp b/backends/sycl-shared/kernels/sycl-shared-basis.hpp new file mode 100644 index 0000000000..9d82fc827f --- /dev/null +++ b/backends/sycl-shared/kernels/sycl-shared-basis.hpp @@ -0,0 +1,56 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed +#ifndef _ceed_sycl_kernels_shared_basis_hpp +#define _ceed_sycl_kernels_shared_basis_hpp + +#include + +#include + +const int sizeMax = 16; +__constant__ CeedScalar c_B[sizeMax * sizeMax]; +__constant__ CeedScalar c_G[sizeMax * sizeMax]; + +//------------------------------------------------------------------------------ +// Interp device initalization +//------------------------------------------------------------------------------ +extern "C" int CeedSyclInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr) { + const int bytes = P_1d * Q_1d * sizeof(CeedScalar); + syclMemcpyToSymbol(c_B, d_B, bytes, 0, syclMemcpyDeviceToDevice); + syclGetSymbolAddress((void **)c_B_ptr, c_B); + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Grad device initalization +//------------------------------------------------------------------------------ +extern "C" int CeedSyclInitGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) { + const int bytes = P_1d * Q_1d * sizeof(CeedScalar); + syclMemcpyToSymbol(c_B, d_B, bytes, 0, syclMemcpyDeviceToDevice); + syclGetSymbolAddress((void **)c_B_ptr, c_B); + syclMemcpyToSymbol(c_G, d_G, bytes, 0, syclMemcpyDeviceToDevice); + syclGetSymbolAddress((void **)c_G_ptr, c_G); + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Collocated grad device initalization +//------------------------------------------------------------------------------ +extern "C" int CeedSyclInitCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) { + const int bytes_interp = P_1d * Q_1d * sizeof(CeedScalar); + syclMemcpyToSymbol(c_B, d_B, bytes_interp, 0, syclMemcpyDeviceToDevice); + syclGetSymbolAddress((void **)c_B_ptr, c_B); + const int bytes_grad = Q_1d * Q_1d * sizeof(CeedScalar); + syclMemcpyToSymbol(c_G, d_G, bytes_grad, 0, syclMemcpyDeviceToDevice); + syclGetSymbolAddress((void **)c_G_ptr, c_G); + + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ diff --git a/backends/sycl/ceed-sycl-common.hpp b/backends/sycl/ceed-sycl-common.hpp index e845286055..4c97e5d245 100644 --- a/backends/sycl/ceed-sycl-common.hpp +++ b/backends/sycl/ceed-sycl-common.hpp @@ -39,10 +39,12 @@ typedef struct { sycl::queue sycl_queue; } Ceed_Sycl; -CEED_INTERN int CeedInit_Sycl(Ceed ceed, const char *resource); +CEED_INTERN int CeedSyclGetResourceRoot(Ceed ceed, const char *resource, char **resource_root); + +CEED_INTERN int CeedSyclInit(Ceed ceed, const char *resource); CEED_INTERN int CeedDestroy_Sycl(Ceed ceed); -CEED_INTERN int CeedSetStream_Sycl(Ceed ceed, void *handle); +CEED_EXTERN int CeedSetStream_Sycl(Ceed ceed, void* handle); #endif // _ceed_sycl_common_h diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp index d6e7067eee..d6af11eb19 100644 --- a/backends/sycl/ceed-sycl-common.sycl.cpp +++ b/backends/sycl/ceed-sycl-common.sycl.cpp @@ -11,10 +11,22 @@ #include #include +//------------------------------------------------------------------------------ +// Get root resource without device spec +//------------------------------------------------------------------------------ +int CeedSyclGetResourceRoot(Ceed ceed, const char *resource, char **resource_root) { + const char *device_spec = std::strstr(resource, ":device_id="); + size_t resource_root_len = device_spec ? (size_t)(device_spec - resource) + 1 : strlen(resource) + 1; + CeedCallBackend(CeedCalloc(resource_root_len, resource_root)); + memcpy(*resource_root, resource, resource_root_len - 1); + + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Device information backend init //------------------------------------------------------------------------------ -int CeedInit_Sycl(Ceed ceed, const char *resource) { +int CeedSyclInit(Ceed ceed, const char *resource) { const char *device_spec = std::strstr(resource, ":device_id="); const int device_id = (device_spec) ? atoi(device_spec + 11) : 0; @@ -88,39 +100,31 @@ int CeedDestroy_Sycl(Ceed ceed) { //------------------------------------------------------------------------------ // Use an external queue //------------------------------------------------------------------------------ -int CeedSetStream_Sycl(Ceed ceed, void *handle) { +int CeedSetStream_Sycl(Ceed ceed, void* handle) { Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); - CeedCheck(handle, ceed, CEED_ERROR_BACKEND, "Stream handle is null"); - sycl::queue *q = static_cast(handle); + if (!handle) { + return CeedError(ceed, CEED_ERROR_BACKEND, "Stream handle is null");; + } + sycl::queue* q = static_cast(handle); // Ensure we are using the expected device - CeedCheck(data->sycl_device == q->get_device(), ceed, CEED_ERROR_BACKEND, "Device mismatch between provided queue and ceed object"); - data->sycl_device = q->get_device(); + if (data->sycl_device != q->get_device()) { + return CeedError(ceed, CEED_ERROR_BACKEND, "Device mismatch between provided queue and ceed object");; + } + data->sycl_device = q->get_device(); data->sycl_context = q->get_context(); - data->sycl_queue = *q; + data->sycl_queue = *q; // Revisit this when we have a hierarchy of delegates - Ceed ceed_delegate = NULL; - CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); - if (ceed_delegate) { + Ceed ceed_delegate; + if(!CeedGetDelegate(ceed,&ceed_delegate)) { Ceed_Sycl *delegate_data; CeedCallBackend(CeedGetData(ceed_delegate, &delegate_data)); - delegate_data->sycl_device = q->get_device(); + delegate_data->sycl_device = q->get_device(); delegate_data->sycl_context = q->get_context(); - delegate_data->sycl_queue = *q; - } - - // Set queue and context for Ceed Fallback object - Ceed ceed_fallback = NULL; - CeedGetOperatorFallbackCeed(ceed, &ceed_fallback); - if (ceed_fallback) { - Ceed_Sycl *fallback_data; - CeedCallBackend(CeedGetData(ceed_fallback, &fallback_data)); - fallback_data->sycl_device = q->get_device(); - fallback_data->sycl_context = q->get_context(); - fallback_data->sycl_queue = *q; + delegate_data->sycl_queue = *q; } return CEED_ERROR_SUCCESS; diff --git a/backends/sycl/ceed-sycl-compile.hpp b/backends/sycl/ceed-sycl-compile.hpp index f4a95b8168..a90233dd2a 100644 --- a/backends/sycl/ceed-sycl-compile.hpp +++ b/backends/sycl/ceed-sycl-compile.hpp @@ -16,11 +16,11 @@ using SyclModule_t = sycl::kernel_bundle; -CEED_INTERN int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module, - const std::map &constants = {}); -CEED_INTERN int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, sycl::kernel **sycl_kernel); +CEED_INTERN int CeedJitBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module, + const std::map &constants = {}); +CEED_INTERN int CeedJitGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, sycl::kernel **sycl_kernel); -CEED_INTERN int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_size, const int block_size_x, const int block_size_y, +CEED_INTERN int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel* kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, const int shared_mem_size, void **args); #endif // _ceed_sycl_compile_h diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp index 845c46b5c5..144e5b478f 100644 --- a/backends/sycl/ceed-sycl-compile.sycl.cpp +++ b/backends/sycl/ceed-sycl-compile.sycl.cpp @@ -14,15 +14,16 @@ #include #include +// #include #include -#include "./online_compiler.hpp" #include "ceed-sycl-common.hpp" +#include "./online_compiler.hpp" using ByteVector_t = std::vector; //------------------------------------------------------------------------------ -// Add defined constants at the beginning of kernel source +// //------------------------------------------------------------------------------ static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_source, std::string &jit_source, const std::map &constants = {}) { @@ -63,7 +64,9 @@ static inline int CeedJitGetFlags_Sycl(std::vector &flags) { //------------------------------------------------------------------------------ // Compile an OpenCL source to SPIR-V using Intel's online compiler extension //------------------------------------------------------------------------------ -static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_device, const std::string &opencl_source, ByteVector_t &il_binary, +static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_device, + const std::string &opencl_source, + ByteVector_t &il_binary, const std::vector &flags = {}) { sycl::ext::libceed::online_compiler compiler(sycl_device); @@ -77,34 +80,26 @@ static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_ // ------------------------------------------------------------------------------ // Load (compile) SPIR-V source and wrap in sycl kernel_bundle +// TODO: determine appropriate flags +// TODO: Error handle lz calls // ------------------------------------------------------------------------------ -static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, const sycl::device &sycl_device, const ByteVector_t &il_binary, - SyclModule_t **sycl_module) { +static int CeedJitLoadModule_Sycl(const sycl::context &sycl_context, + const sycl::device &sycl_device, + const ByteVector_t &il_binary, + SyclModule_t **sycl_module) { auto lz_context = sycl::get_native(sycl_context); auto lz_device = sycl::get_native(sycl_device); ze_module_desc_t lz_mod_desc = {ZE_STRUCTURE_TYPE_MODULE_DESC, - nullptr, // extension specific structs + nullptr, ZE_MODULE_FORMAT_IL_SPIRV, il_binary.size(), il_binary.data(), - " -ze-opt-large-register-file", // flags - nullptr}; // specialization constants + " -ze-opt-large-register-file", // flags + nullptr}; // build log - ze_module_handle_t lz_module; - ze_module_build_log_handle_t lz_log; - ze_result_t lz_err = zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, &lz_log); - - if (ZE_RESULT_SUCCESS != lz_err) { - size_t log_size = 0; - zeModuleBuildLogGetString(lz_log, &log_size, nullptr); - - char *log_message; - CeedCall(CeedCalloc(log_size, &log_message)); - zeModuleBuildLogGetString(lz_log, &log_size, log_message); - - return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to compile Level Zero module:\n%s", log_message); - } + ze_module_handle_t lz_module; + zeModuleCreate(lz_context, lz_device, &lz_mod_desc, &lz_module, nullptr); // sycl make_ only throws errors for backend mismatch--assume we have vetted this already *sycl_module = new SyclModule_t(sycl::make_kernel_bundle( @@ -116,7 +111,8 @@ static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, con // ------------------------------------------------------------------------------ // Compile kernel source to an executable `sycl::kernel_bundle` // ------------------------------------------------------------------------------ -int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module, const std::map &constants) { +int CeedJitBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module, + const std::map &constants) { Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); @@ -129,7 +125,7 @@ int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule ByteVector_t il_binary; CeedCallBackend(CeedJitCompileSource_Sycl(ceed, data->sycl_device, jit_source, il_binary, flags)); - CeedCallBackend(CeedLoadModule_Sycl(ceed, data->sycl_context, data->sycl_device, il_binary, sycl_module)); + CeedCallBackend(CeedJitLoadModule_Sycl(data->sycl_context, data->sycl_device, il_binary, sycl_module)); return CEED_ERROR_SUCCESS; } @@ -139,7 +135,9 @@ int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule // // TODO: Error handle lz calls // ------------------------------------------------------------------------------ -int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, sycl::kernel **sycl_kernel) { +int CeedJitGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, + const std::string &kernel_name, + sycl::kernel **sycl_kernel) { Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); @@ -149,11 +147,7 @@ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::st ze_kernel_desc_t lz_kernel_desc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernel_name.c_str()}; ze_kernel_handle_t lz_kernel; - ze_result_t lz_err = zeKernelCreate(lz_module, &lz_kernel_desc, &lz_kernel); - - if (ZE_RESULT_SUCCESS != lz_err) { - return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to retrieve kernel from Level Zero module"); - } + zeKernelCreate(lz_module, &lz_kernel_desc, &lz_kernel); *sycl_kernel = new sycl::kernel(sycl::make_kernel( {*sycl_module, lz_kernel, sycl::ext::oneapi::level_zero::ownership::transfer}, data->sycl_context)); @@ -164,23 +158,23 @@ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::st //------------------------------------------------------------------------------ // Run SYCL kernel for spatial dimension with shared memory //------------------------------------------------------------------------------ -int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_size, const int block_size_x, const int block_size_y, +int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel* kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, const int shared_mem_size, void **kernel_args) { - sycl::range<3> local_range(block_size_z, block_size_y, block_size_x); - sycl::range<3> global_range(grid_size * block_size_z, block_size_y, block_size_x); - sycl::nd_range<3> kernel_range(global_range, local_range); - + sycl::range<3> local_range(block_size_z, block_size_y, block_size_x); + sycl::range<3> global_range(grid_size*block_size_z, block_size_y, block_size_x); + sycl::nd_range<3> kernel_range(global_range,local_range); + //----------- - // Order queue + //Order queue Ceed_Sycl *ceed_Sycl; CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier(); - - ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) { + + ceed_Sycl->sycl_queue.submit([&](sycl::handler& cgh){ cgh.depends_on(e); cgh.set_args(*kernel_args); - cgh.parallel_for(kernel_range, *kernel); + cgh.parallel_for(kernel_range,*kernel); }); return CEED_ERROR_SUCCESS; -} +} \ No newline at end of file diff --git a/backends/sycl/ocloc_api.h b/backends/sycl/ocloc_api.h index 80b7bf6672..f073c5c524 100644 --- a/backends/sycl/ocloc_api.h +++ b/backends/sycl/ocloc_api.h @@ -13,12 +13,13 @@ #ifndef OCLOC_MAKE_VERSION /// Generates ocloc API versions -#define OCLOC_MAKE_VERSION(_major, _minor) ((_major << 16) | (_minor & 0x0000ffff)) -#endif // OCLOC_MAKE_VERSION +#define OCLOC_MAKE_VERSION(_major, _minor) \ + ((_major << 16) | (_minor & 0x0000ffff)) +#endif // OCLOC_MAKE_VERSION typedef enum _ocloc_version_t { - OCLOC_VERSION_1_0 = OCLOC_MAKE_VERSION(1, 0), ///< version 1.0 - OCLOC_VERSION_CURRENT = OCLOC_MAKE_VERSION(1, 0), ///< latest known version + OCLOC_VERSION_1_0 = OCLOC_MAKE_VERSION(1, 0), ///< version 1.0 + OCLOC_VERSION_CURRENT = OCLOC_MAKE_VERSION(1, 0), ///< latest known version OCLOC_VERSION_FORCE_UINT32 = 0x7fffffff } ocloc_version_t; @@ -81,9 +82,14 @@ extern "C" { /// messages generated by ocloc (e.g. compiler errors/warnings). /// /// \returns 0 on succes. Returns non-0 in case of failure. -SIGNATURE oclocInvoke(uint32_t NumArgs, const char *Argv[], uint32_t NumSources, const uint8_t **DataSources, const uint64_t *LenSources, - const char **NameSources, uint32_t NumInputHeaders, const uint8_t **DataInputHeaders, const uint64_t *LenInputHeaders, - const char **NameInputHeaders, uint32_t *NumOutputs, uint8_t ***DataOutputs, uint64_t **LenOutputs, char ***NameOutputs); +SIGNATURE oclocInvoke(uint32_t NumArgs, const char *Argv[], uint32_t NumSources, + const uint8_t **DataSources, const uint64_t *LenSources, + const char **NameSources, uint32_t NumInputHeaders, + const uint8_t **DataInputHeaders, + const uint64_t *LenInputHeaders, + const char **NameInputHeaders, uint32_t *NumOutputs, + uint8_t ***DataOutputs, uint64_t **LenOutputs, + char ***NameOutputs); /// Frees results of oclocInvoke /// @@ -96,7 +102,8 @@ SIGNATURE oclocInvoke(uint32_t NumArgs, const char *Argv[], uint32_t NumSources, /// \param NameOutputs is array of names of outputs as returned by oclocInvoke() /// /// \returns 0 on succes. Returns non-0 in case of failure. -SIGNATURE oclocFreeOutput(uint32_t *NumOutputs, uint8_t ***DataOutputs, uint64_t **LenOutputs, char ***NameOutputs); +SIGNATURE oclocFreeOutput(uint32_t *NumOutputs, uint8_t ***DataOutputs, + uint64_t **LenOutputs, char ***NameOutputs); /// Returns the current version of ocloc. SIGNATURE oclocVersion(); diff --git a/backends/sycl/online_compiler.hpp b/backends/sycl/online_compiler.hpp index edc14ab92e..f259bd52ea 100644 --- a/backends/sycl/online_compiler.hpp +++ b/backends/sycl/online_compiler.hpp @@ -19,11 +19,11 @@ namespace ext::libceed { using byte = unsigned char; enum class compiled_code_format { - spir_v = 0 // the only format supported for now + spir_v = 0 // the only format supported for now }; class device_arch { - public: +public: static constexpr int any = 0; device_arch(int Val) : Val(Val) {} @@ -33,17 +33,17 @@ class device_arch { // The API must allow user to define the target GPU option even if it is // not listed in this enumerator below. enum gpu { - gpu_any = 1, - gpu_gen9 = 2, - gpu_skl = gpu_gen9, + gpu_any = 1, + gpu_gen9 = 2, + gpu_skl = gpu_gen9, gpu_gen9_5 = 3, - gpu_kbl = gpu_gen9_5, - gpu_cfl = gpu_gen9_5, - gpu_gen11 = 4, - gpu_icl = gpu_gen11, - gpu_gen12 = 5, - gpu_tgl = gpu_gen12, - gpu_tgllp = gpu_gen12 + gpu_kbl = gpu_gen9_5, + gpu_cfl = gpu_gen9_5, + gpu_gen11 = 4, + gpu_icl = gpu_gen11, + gpu_gen12 = 5, + gpu_tgl = gpu_gen12, + gpu_tgllp = gpu_gen12 }; enum cpu { @@ -56,13 +56,13 @@ class device_arch { operator int() { return Val; } - private: +private: int Val; }; /// Represents an error happend during online compilation. class online_compile_error : public sycl::exception { - public: +public: online_compile_error() = default; online_compile_error(const std::string &Msg) : sycl::exception(Msg) {} }; @@ -72,40 +72,35 @@ enum class source_language { opencl_c = 0, cm = 1 }; /// Represents an online compiler for the language given as template /// parameter. -template -class online_compiler { - public: +template class online_compiler { +public: /// Constructs online compiler which can target any device and produces /// given compiled code format. Produces 64-bit device code. /// The created compiler is "optimistic" - it assumes all applicable SYCL /// device capabilities are supported by the target device(s). online_compiler(compiled_code_format fmt = compiled_code_format::spir_v) - : OutputFormat(fmt), - OutputFormatVersion({0, 0}), - DeviceType(sycl::info::device_type::all), - DeviceArch(device_arch::any), - Is64Bit(true), - DeviceStepping("") {} + : OutputFormat(fmt), OutputFormatVersion({0, 0}), + DeviceType(sycl::info::device_type::all), DeviceArch(device_arch::any), + Is64Bit(true), DeviceStepping("") {} /// Constructs online compiler which targets given architecture and produces /// given compiled code format. Produces 64-bit device code. /// Throws online_compile_error if values of constructor arguments are /// contradictory or not supported - e.g. if the source language is not /// supported for given device type. - online_compiler(sycl::info::device_type dev_type, device_arch arch, compiled_code_format fmt = compiled_code_format::spir_v) - : OutputFormat(fmt), OutputFormatVersion({0, 0}), DeviceType(dev_type), DeviceArch(arch), Is64Bit(true), DeviceStepping("") {} + online_compiler(sycl::info::device_type dev_type, device_arch arch, + compiled_code_format fmt = compiled_code_format::spir_v) + : OutputFormat(fmt), OutputFormatVersion({0, 0}), DeviceType(dev_type), + DeviceArch(arch), Is64Bit(true), DeviceStepping("") {} /// Constructs online compiler for the target specified by given SYCL device. // TODO: the initial version generates the generic code (SKL now), need // to do additional device::info calls to determine the device by it's // features. online_compiler(const sycl::device &) - : OutputFormat(compiled_code_format::spir_v), - OutputFormatVersion({0, 0}), - DeviceType(sycl::info::device_type::all), - DeviceArch(device_arch::any), - Is64Bit(true), - DeviceStepping("") {} + : OutputFormat(compiled_code_format::spir_v), OutputFormatVersion({0, 0}), + DeviceType(sycl::info::device_type::all), DeviceArch(device_arch::any), + Is64Bit(true), DeviceStepping("") {} /// Compiles given in-memory \c Lang source to a binary blob. Blob format, /// other parameters are set in the constructor by the compilation target @@ -160,7 +155,7 @@ class online_compiler { return *this; } - private: +private: /// Compiled code format. compiled_code_format OutputFormat; @@ -180,7 +175,7 @@ class online_compiler { std::string DeviceStepping; /// Handles to helper functions used by the implementation. - void *CompileToSPIRVHandle = nullptr; + void *CompileToSPIRVHandle = nullptr; void *FreeSPIRVOutputsHandle = nullptr; }; @@ -193,10 +188,12 @@ class online_compiler { /// OpenCL JIT compiler options must be supported. template <> template <> -std::vector online_compiler::compile(const std::string &src, const std::vector &options); +std::vector +online_compiler::compile( + const std::string &src, const std::vector &options); -// /// Compiles the given OpenCL source. May throw \c online_compile_error. -// /// @param src - contents of the source. +/// Compiles the given OpenCL source. May throw \c online_compile_error. +/// @param src - contents of the source. // template <> // template <> // std::vector @@ -209,7 +206,8 @@ std::vector online_compiler::compile(const std: /// @param options - compilation options (implementation defined). template <> template <> -std::vector online_compiler::compile(const std::string &src, const std::vector &options); +std::vector online_compiler::compile( + const std::string &src, const std::vector &options); /// Compiles the given CM source \p src. // template <> @@ -218,5 +216,5 @@ std::vector online_compiler::compile(const std::strin // return compile(src, std::vector{}); // } -} // namespace ext::libceed -} // namespace sycl +} // namespace ext::libceed +} // namespace sycl diff --git a/backends/sycl/online_compiler.sycl.cpp b/backends/sycl/online_compiler.sycl.cpp index d8459ce938..9e4fb539d8 100644 --- a/backends/sycl/online_compiler.sycl.cpp +++ b/backends/sycl/online_compiler.sycl.cpp @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// #include -#include #include +#include #include -#include "ocloc_api.h" #include "online_compiler.hpp" +#include "ocloc_api.h" namespace sycl { namespace ext::libceed { @@ -22,7 +22,8 @@ void *loadOsLibrary(const std::string &PluginPath) { void *so = dlopen(PluginPath.c_str(), RTLD_NOW); if (!so) { char *Error = dlerror(); - std::cerr << "dlopen(" << PluginPath << ") failed with <" << (Error ? Error : "unknown error") << ">" << std::endl; + std::cerr << "dlopen(" << PluginPath << ") failed with <" + << (Error ? Error : "unknown error") << ">" << std::endl; } return so; } @@ -35,32 +36,36 @@ void *loadOsLibrary(const std::string &PluginPath) { // return dlclose(Library); // } -void *getOsLibraryFuncAddress(void *Library, const std::string &FunctionName) { return dlsym(Library, FunctionName.c_str()); } +void *getOsLibraryFuncAddress(void *Library, const std::string &FunctionName) { + return dlsym(Library, FunctionName.c_str()); +} -static std::vector prepareOclocArgs(sycl::info::device_type DeviceType, device_arch DeviceArch, bool Is64Bit, - const std::string &DeviceStepping, const std::string &UserArgs) { +static std::vector +prepareOclocArgs(sycl::info::device_type DeviceType, device_arch DeviceArch, + bool Is64Bit, const std::string &DeviceStepping, + const std::string &UserArgs) { std::vector Args = {"ocloc", "-q", "-spv_only", "-device"}; if (DeviceType == sycl::info::device_type::gpu) { switch (DeviceArch) { - case device_arch::gpu_gen9: - Args.push_back("skl"); - break; + case device_arch::gpu_gen9: + Args.push_back("skl"); + break; - case device_arch::gpu_gen9_5: - Args.push_back("cfl"); - break; + case device_arch::gpu_gen9_5: + Args.push_back("cfl"); + break; - case device_arch::gpu_gen11: - Args.push_back("icllp"); - break; + case device_arch::gpu_gen11: + Args.push_back("icllp"); + break; - case device_arch::gpu_gen12: - Args.push_back("tgllp"); - break; + case device_arch::gpu_gen12: + Args.push_back("tgllp"); + break; - default: - Args.push_back("pvc"); + default: + Args.push_back("pvc"); } } else { // TODO: change that to generic device when ocloc adds support for it. @@ -97,9 +102,13 @@ static std::vector prepareOclocArgs(sycl::info::device_type Device /// of the library function freeing memory /// allocated during the compilation. /// @param UserArgs - User's options to ocloc compiler. -static std::vector compileToSPIRV(const std::string &Source, sycl::info::device_type DeviceType, device_arch DeviceArch, bool Is64Bit, - const std::string &DeviceStepping, void *&CompileToSPIRVHandle, void *&FreeSPIRVOutputsHandle, - const std::vector &UserArgs) { +static std::vector +compileToSPIRV(const std::string &Source, sycl::info::device_type DeviceType, + device_arch DeviceArch, bool Is64Bit, + const std::string &DeviceStepping, void *&CompileToSPIRVHandle, + void *&FreeSPIRVOutputsHandle, + const std::vector &UserArgs) { + if (!CompileToSPIRVHandle) { #ifdef __SYCL_RT_OS_WINDOWS static const std::string OclocLibraryName = "ocloc64.dll"; @@ -107,62 +116,83 @@ static std::vector compileToSPIRV(const std::string &Source, sycl::info::d static const std::string OclocLibraryName = "libocloc.so"; #endif void *OclocLibrary = loadOsLibrary(OclocLibraryName); - if (!OclocLibrary) throw online_compile_error("Cannot load ocloc library: " + OclocLibraryName); - void *OclocVersionHandle = getOsLibraryFuncAddress(OclocLibrary, "oclocVersion"); + if (!OclocLibrary) + throw online_compile_error("Cannot load ocloc library: " + + OclocLibraryName); + void *OclocVersionHandle = + getOsLibraryFuncAddress(OclocLibrary, "oclocVersion"); // The initial versions of ocloc library did not have the oclocVersion() // function. Those versions had the same API as the first version of ocloc // library having that oclocVersion() function. int LoadedVersion = ocloc_version_t::OCLOC_VERSION_1_0; if (OclocVersionHandle) { - decltype(::oclocVersion) *OclocVersionFunc = reinterpret_cast(OclocVersionHandle); - LoadedVersion = OclocVersionFunc(); + decltype(::oclocVersion) *OclocVersionFunc = + reinterpret_cast(OclocVersionHandle); + LoadedVersion = OclocVersionFunc(); } // The loaded library with version (A.B) is compatible with expected API/ABI // version (X.Y) used here if A == B and B >= Y. - int LoadedVersionMajor = LoadedVersion >> 16; - int LoadedVersionMinor = LoadedVersion & 0xffff; + int LoadedVersionMajor = LoadedVersion >> 16; + int LoadedVersionMinor = LoadedVersion & 0xffff; int CurrentVersionMajor = ocloc_version_t::OCLOC_VERSION_CURRENT >> 16; int CurrentVersionMinor = ocloc_version_t::OCLOC_VERSION_CURRENT & 0xffff; - if (LoadedVersionMajor != CurrentVersionMajor || LoadedVersionMinor < CurrentVersionMinor) - throw online_compile_error(std::string("Found incompatible version of ocloc library: (") + std::to_string(LoadedVersionMajor) + "." + - std::to_string(LoadedVersionMinor) + "). The supported versions are (" + std::to_string(CurrentVersionMajor) + - ".N), where (N >= " + std::to_string(CurrentVersionMinor) + ")."); - - CompileToSPIRVHandle = getOsLibraryFuncAddress(OclocLibrary, "oclocInvoke"); - if (!CompileToSPIRVHandle) throw online_compile_error("Cannot load oclocInvoke() function"); - FreeSPIRVOutputsHandle = getOsLibraryFuncAddress(OclocLibrary, "oclocFreeOutput"); - if (!FreeSPIRVOutputsHandle) throw online_compile_error("Cannot load oclocFreeOutput() function"); + if (LoadedVersionMajor != CurrentVersionMajor || + LoadedVersionMinor < CurrentVersionMinor) + throw online_compile_error( + std::string("Found incompatible version of ocloc library: (") + + std::to_string(LoadedVersionMajor) + "." + + std::to_string(LoadedVersionMinor) + + "). The supported versions are (" + + std::to_string(CurrentVersionMajor) + + ".N), where (N >= " + std::to_string(CurrentVersionMinor) + ")."); + + CompileToSPIRVHandle = + getOsLibraryFuncAddress(OclocLibrary, "oclocInvoke"); + if (!CompileToSPIRVHandle) + throw online_compile_error("Cannot load oclocInvoke() function"); + FreeSPIRVOutputsHandle = getOsLibraryFuncAddress( + OclocLibrary, "oclocFreeOutput"); + if (!FreeSPIRVOutputsHandle) + throw online_compile_error("Cannot load oclocFreeOutput() function"); } std::string CombinedUserArgs; for (auto UserArg : UserArgs) { - if (UserArg == "") continue; - if (CombinedUserArgs != "") CombinedUserArgs = CombinedUserArgs + " " + UserArg; - else CombinedUserArgs = UserArg; + if (UserArg == "") + continue; + if (CombinedUserArgs != "") + CombinedUserArgs = CombinedUserArgs + " " + UserArg; + else + CombinedUserArgs = UserArg; } - std::vector Args = prepareOclocArgs(DeviceType, DeviceArch, Is64Bit, DeviceStepping, CombinedUserArgs); + std::vector Args = prepareOclocArgs( + DeviceType, DeviceArch, Is64Bit, DeviceStepping, CombinedUserArgs); - uint32_t NumOutputs = 0; - byte **Outputs = nullptr; + uint32_t NumOutputs = 0; + byte **Outputs = nullptr; uint64_t *OutputLengths = nullptr; - char **OutputNames = nullptr; + char **OutputNames = nullptr; - const byte *Sources[] = {reinterpret_cast(Source.c_str())}; - const char *SourceName = "main.cl"; + const byte *Sources[] = {reinterpret_cast(Source.c_str())}; + const char *SourceName = "main.cl"; const uint64_t SourceLengths[] = {Source.length() + 1}; Args.push_back("-file"); Args.push_back(SourceName); - decltype(::oclocInvoke) *OclocInvokeFunc = reinterpret_cast(CompileToSPIRVHandle); - int CompileError = OclocInvokeFunc(Args.size(), Args.data(), 1, Sources, SourceLengths, &SourceName, 0, nullptr, nullptr, nullptr, &NumOutputs, - &Outputs, &OutputLengths, &OutputNames); + decltype(::oclocInvoke) *OclocInvokeFunc = + reinterpret_cast(CompileToSPIRVHandle); + int CompileError = + OclocInvokeFunc(Args.size(), Args.data(), 1, Sources, SourceLengths, + &SourceName, 0, nullptr, nullptr, nullptr, &NumOutputs, + &Outputs, &OutputLengths, &OutputNames); std::vector SpirV; - std::string CompileLog; + std::string CompileLog; for (uint32_t I = 0; I < NumOutputs; I++) { size_t NameLen = strlen(OutputNames[I]); - if (NameLen >= 4 && strstr(OutputNames[I], ".spv") != nullptr && Outputs[I] != nullptr) { + if (NameLen >= 4 && strstr(OutputNames[I], ".spv") != nullptr && + Outputs[I] != nullptr) { assert(SpirV.size() == 0 && "More than one SPIR-V output found."); SpirV = std::vector(Outputs[I], Outputs[I] + OutputLengths[I]); } else if (!strcmp(OutputNames[I], "stdout.log")) { @@ -171,39 +201,58 @@ static std::vector compileToSPIRV(const std::string &Source, sycl::info::d } // Try to free memory before reporting possible error. - decltype(::oclocFreeOutput) *OclocFreeOutputFunc = reinterpret_cast(FreeSPIRVOutputsHandle); - int MemFreeError = OclocFreeOutputFunc(&NumOutputs, &Outputs, &OutputLengths, &OutputNames); + decltype(::oclocFreeOutput) *OclocFreeOutputFunc = + reinterpret_cast(FreeSPIRVOutputsHandle); + int MemFreeError = + OclocFreeOutputFunc(&NumOutputs, &Outputs, &OutputLengths, &OutputNames); - if (CompileError) throw online_compile_error("ocloc reported compilation errors: {\n" + CompileLog + "\n}"); - if (SpirV.empty()) throw online_compile_error("Unexpected output: ocloc did not return SPIR-V"); - if (MemFreeError) throw online_compile_error("ocloc cannot safely free resources"); + if (CompileError) + throw online_compile_error("ocloc reported compilation errors: {\n" + + CompileLog + "\n}"); + if (SpirV.empty()) + throw online_compile_error( + "Unexpected output: ocloc did not return SPIR-V"); + if (MemFreeError) + throw online_compile_error("ocloc cannot safely free resources"); return SpirV; } template <> template <> -std::vector online_compiler::compile(const std::string &Source, const std::vector &UserArgs) { +std::vector online_compiler::compile( + const std::string &Source, const std::vector &UserArgs) { + if (OutputFormatVersion != std::pair{0, 0}) { - std::string Version = std::to_string(OutputFormatVersion.first) + ", " + std::to_string(OutputFormatVersion.second); - throw online_compile_error(std::string("The output format version (") + Version + ") is not supported yet"); + std::string Version = std::to_string(OutputFormatVersion.first) + ", " + + std::to_string(OutputFormatVersion.second); + throw online_compile_error(std::string("The output format version (") + + Version + ") is not supported yet"); } - return compileToSPIRV(Source, DeviceType, DeviceArch, Is64Bit, DeviceStepping, CompileToSPIRVHandle, FreeSPIRVOutputsHandle, UserArgs); + return compileToSPIRV(Source, DeviceType, DeviceArch, Is64Bit, + DeviceStepping, CompileToSPIRVHandle, + FreeSPIRVOutputsHandle, UserArgs); } template <> template <> -std::vector online_compiler::compile(const std::string &Source, const std::vector &UserArgs) { +std::vector online_compiler::compile( + const std::string &Source, const std::vector &UserArgs) { + if (OutputFormatVersion != std::pair{0, 0}) { - std::string Version = std::to_string(OutputFormatVersion.first) + ", " + std::to_string(OutputFormatVersion.second); - throw online_compile_error(std::string("The output format version (") + Version + ") is not supported yet"); + std::string Version = std::to_string(OutputFormatVersion.first) + ", " + + std::to_string(OutputFormatVersion.second); + throw online_compile_error(std::string("The output format version (") + + Version + ") is not supported yet"); } std::vector CMUserArgs = UserArgs; CMUserArgs.push_back("-cmc"); - return compileToSPIRV(Source, DeviceType, DeviceArch, Is64Bit, DeviceStepping, CompileToSPIRVHandle, FreeSPIRVOutputsHandle, CMUserArgs); + return compileToSPIRV(Source, DeviceType, DeviceArch, Is64Bit, + DeviceStepping, CompileToSPIRVHandle, + FreeSPIRVOutputsHandle, CMUserArgs); } -} // namespace ext::libceed -} // namespace sycl +} // namespace ext::libceed +} // namespace sycl