Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Efficiently use both in-order and out-of-order sycl queues. #59

Merged
merged 6 commits into from
Sep 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -722,9 +722,12 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
code << "// -----------------------------------------------------------------------------\n\n";

// Copy the struct (containing device addresses) from the host to the device
sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1);
sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1);
sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1);
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};

sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1, e);
sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1, e);
sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1, e);
// These copies can happen while the JIT is being done
CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices}));

Expand Down
4 changes: 2 additions & 2 deletions backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
sycl::nd_range<3> kernel_range(global_range, local_range);

//-----------
// Order queue
sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};

CeedCallSycl(ceed, ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on(e);
Expand Down
11 changes: 5 additions & 6 deletions backends/sycl-gen/ceed-sycl-gen.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,16 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {

Ceed ceed_shared;
CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared));

Ceed_Sycl *shared_data;
CeedCallBackend(CeedGetData(ceed_shared, &shared_data));
// Need to use the same queue everywhere for correct synchronization
shared_data->sycl_queue = data->sycl_queue;

CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
CeedCallBackend(CeedSetStream_Sycl(ceed_shared,&(data->sycl_queue)));

const char fallbackresource[] = "/gpu/sycl/ref";
CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource));

Ceed ceed_fallback = NULL;
CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
CeedCallBackend(CeedSetStream_Sycl(ceed_fallback,&(data->sycl_queue)));

CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Sycl_gen));
CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Sycl_gen));
CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Sycl));
Expand Down
68 changes: 37 additions & 31 deletions backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,11 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t
sycl::range<1> global_range(num_elem * work_group_size);
sycl::nd_range<1> kernel_range(global_range, local_range);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on({e});
cgh.depends_on(e);
cgh.use_kernel_bundle(sycl_module);

sycl::local_accessor<CeedScalar> s_mem(op_len + 2 * buf_len, cgh);
Expand Down Expand Up @@ -150,10 +151,11 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &
sycl::range<1> global_range(num_elem * work_group_size);
sycl::nd_range<1> kernel_range(global_range, local_range);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on({e});
cgh.depends_on(e);
cgh.use_kernel_bundle(sycl_module);

sycl::local_accessor<CeedScalar> s_mem(2 * (op_len + buf_len), cgh);
Expand Down Expand Up @@ -248,9 +250,10 @@ static int CeedBasisApplyWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem,
const CeedInt num_quad_z = (dim > 2) ? Q_1d : 1;
sycl::range<3> kernel_range(num_elem * num_quad_z, num_quad_y, num_quad_x);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, {e}, [=](sycl::item<3> work_item) {
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, e, [=](sycl::item<3> work_item) {
if (dim == 1) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]];
if (dim == 2) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]];
if (dim == 3) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]] * q_weight_1d[work_item[0] % Q_1d];
Expand Down Expand Up @@ -284,9 +287,9 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran
if (t_mode == CEED_TRANSPOSE) {
CeedSize length;
CeedCallBackend(CeedVectorGetLength(v, &length));
// Order queue
sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e});
std::vector<sycl::event> e;
if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
data->sycl_queue.fill<CeedScalar>(d_v, 0, length, e);
}

// Basis action
Expand Down Expand Up @@ -349,9 +352,10 @@ static int CeedBasisApplyNonTensorInterp_Sycl(sycl::queue &sycl_queue, CeedInt n

sycl::range<2> kernel_range(num_elem, v_size);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, e, [=](sycl::id<2> indx) {
const CeedInt i = indx[1];
const CeedInt elem = indx[0];

Expand Down Expand Up @@ -393,9 +397,10 @@ static int CeedBasisApplyNonTensorGrad_Sycl(sycl::queue &sycl_queue, CeedInt num

sycl::range<2> kernel_range(num_elem, v_size);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, e, [=](sycl::id<2> indx) {
const CeedInt i = indx[1];
const CeedInt elem = indx[0];

Expand Down Expand Up @@ -431,9 +436,10 @@ static int CeedBasisApplyNonTensorWeight_Sycl(sycl::queue &sycl_queue, CeedInt n

sycl::range<2> kernel_range(num_elem, num_qpts);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, e, [=](sycl::id<2> indx) {
const CeedInt i = indx[1];
const CeedInt elem = indx[0];
d_V[i + elem * num_qpts] = q_weight[i];
Expand Down Expand Up @@ -578,18 +584,18 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
impl->buf_len = num_comp * CeedIntMax(num_nodes, num_qpts);
impl->op_len = Q_1d * P_1d;

// Order queue
sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};

CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, {e});
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);

const CeedInt interp_length = Q_1d * P_1d;
CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, {e});
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e);

CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, {e});
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e);

CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));

Expand Down Expand Up @@ -632,19 +638,19 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
impl->num_nodes = num_nodes;
impl->num_qpts = num_qpts;

// Order queue
sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};

CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, {e});
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, e);

const CeedInt interp_length = num_qpts * num_nodes;
CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, {e});
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, e);

const CeedInt grad_length = num_qpts * num_nodes * dim;
CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, {e});
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, e);

CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));

Expand Down
52 changes: 27 additions & 25 deletions backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -781,47 +781,47 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const
for (CeedInt i = 0; i < numemodein; i++) evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE);
for (CeedInt i = 0; i < numemodeout; i++) evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE);

// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};

std::vector<sycl::event> copy_events;
if (evalNone) {
CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity));
for (CeedSize i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0;
CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device<CeedScalar>(iLen, sycl_data->sycl_device, sycl_data->sycl_context));
sycl::event identity_copy = sycl_data->sycl_queue.copy<CeedScalar>(identity, diag->d_identity, iLen, {e});
sycl::event identity_copy = sycl_data->sycl_queue.copy<CeedScalar>(identity, diag->d_identity, iLen, e);
copy_events.push_back(identity_copy);
}

// CEED_EVAL_INTERP
CeedCallBackend(CeedBasisGetInterp(basisin, &interpin));
CeedCallSycl(ceed, diag->d_interpin = sycl::malloc_device<CeedScalar>(iLen, sycl_data->sycl_device, sycl_data->sycl_context));
sycl::event interpin_copy = sycl_data->sycl_queue.copy<CeedScalar>(interpin, diag->d_interpin, iLen, {e});
sycl::event interpin_copy = sycl_data->sycl_queue.copy<CeedScalar>(interpin, diag->d_interpin, iLen, e);
copy_events.push_back(interpin_copy);

CeedCallBackend(CeedBasisGetInterp(basisout, &interpout));
CeedCallSycl(ceed, diag->d_interpout = sycl::malloc_device<CeedScalar>(iLen, sycl_data->sycl_device, sycl_data->sycl_context));
sycl::event interpout_copy = sycl_data->sycl_queue.copy<CeedScalar>(interpout, diag->d_interpout, iLen, {e});
sycl::event interpout_copy = sycl_data->sycl_queue.copy<CeedScalar>(interpout, diag->d_interpout, iLen, e);
copy_events.push_back(interpout_copy);

// CEED_EVAL_GRAD
CeedCallBackend(CeedBasisGetGrad(basisin, &gradin));
CeedCallSycl(ceed, diag->d_gradin = sycl::malloc_device<CeedScalar>(gLen, sycl_data->sycl_device, sycl_data->sycl_context));
sycl::event gradin_copy = sycl_data->sycl_queue.copy<CeedScalar>(gradin, diag->d_gradin, gLen, {e});
sycl::event gradin_copy = sycl_data->sycl_queue.copy<CeedScalar>(gradin, diag->d_gradin, gLen, e);
copy_events.push_back(gradin_copy);

CeedCallBackend(CeedBasisGetGrad(basisout, &gradout));
CeedCallSycl(ceed, diag->d_gradout = sycl::malloc_device<CeedScalar>(gLen, sycl_data->sycl_device, sycl_data->sycl_context));
sycl::event gradout_copy = sycl_data->sycl_queue.copy<CeedScalar>(gradout, diag->d_gradout, gLen, {e});
sycl::event gradout_copy = sycl_data->sycl_queue.copy<CeedScalar>(gradout, diag->d_gradout, gLen, e);
copy_events.push_back(gradout_copy);

// Arrays of emodes
CeedCallSycl(ceed, diag->d_emodein = sycl::malloc_device<CeedEvalMode>(numemodein, sycl_data->sycl_device, sycl_data->sycl_context));
sycl::event emodein_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(emodein, diag->d_emodein, numemodein, {e});
sycl::event emodein_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(emodein, diag->d_emodein, numemodein, e);
copy_events.push_back(emodein_copy);

CeedCallSycl(ceed, diag->d_emodeout = sycl::malloc_device<CeedEvalMode>(numemodeout, sycl_data->sycl_device, sycl_data->sycl_context));
sycl::event emodeout_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(emodeout, diag->d_emodeout, numemodeout, {e});
sycl::event emodeout_copy = sycl_data->sycl_queue.copy<CeedEvalMode>(emodeout, diag->d_emodeout, numemodeout, e);
copy_events.push_back(emodeout_copy);

// Restriction
Expand Down Expand Up @@ -854,9 +854,10 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool p

sycl::range<1> kernel_range(nelem * nnodes);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
sycl_queue.parallel_for<CeedOperatorSyclLinearDiagonal>(kernel_range, {e}, [=](sycl::id<1> idx) {
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.parallel_for<CeedOperatorSyclLinearDiagonal>(kernel_range, e, [=](sycl::id<1> idx) {
const CeedInt tid = idx % nnodes;
const CeedInt e = idx / nnodes;

Expand Down Expand Up @@ -1136,13 +1137,14 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
CeedEvalMode eval_mode = eval_mode_in[i];
if (eval_mode == CEED_EVAL_INTERP) {
// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
sycl_data->sycl_queue.copy<CeedScalar>(interp_in, &asmb->d_B_in[mat_start], esize * nqpts, {e});
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
sycl_data->sycl_queue.copy<CeedScalar>(interp_in, &asmb->d_B_in[mat_start], esize * nqpts, e);
mat_start += esize * nqpts;
} else if (eval_mode == CEED_EVAL_GRAD) {
// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
sycl_data->sycl_queue.copy<CeedScalar>(grad_in, &asmb->d_B_in[mat_start], dim * esize * nqpts, {e});
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
sycl_data->sycl_queue.copy<CeedScalar>(grad_in, &asmb->d_B_in[mat_start], dim * esize * nqpts, e);
mat_start += dim * esize * nqpts;
}
}
Expand All @@ -1164,14 +1166,14 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
for (int i = 0; i < num_B_out_mats_to_load; i++) {
CeedEvalMode eval_mode = eval_mode_out[i];
if (eval_mode == CEED_EVAL_INTERP) {
// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
sycl_data->sycl_queue.copy<CeedScalar>(interp_out, &asmb->d_B_out[mat_start], esize * nqpts, {e});
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
sycl_data->sycl_queue.copy<CeedScalar>(interp_out, &asmb->d_B_out[mat_start], esize * nqpts, e);
mat_start += esize * nqpts;
} else if (eval_mode == CEED_EVAL_GRAD) {
// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
sycl_data->sycl_queue.copy<CeedScalar>(grad_out, &asmb->d_B_out[mat_start], dim * esize * nqpts, {e});
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
sycl_data->sycl_queue.copy<CeedScalar>(grad_out, &asmb->d_B_out[mat_start], dim * esize * nqpts, e);
mat_start += dim * esize * nqpts;
}
}
Expand Down Expand Up @@ -1213,8 +1215,8 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp

sycl::range<3> kernel_range(nelem, block_size_y, block_size_x);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
sycl_queue.parallel_for<CeedOperatorSyclLinearAssemble>(kernel_range, {e}, [=](sycl::id<3> idx) {
const int e = idx.get(0); // Element index
const int l = idx.get(1); // The output column index of each B^TDB operation
Expand Down
6 changes: 3 additions & 3 deletions backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,12 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C
void *context_data;
CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &context_data));

// Order queue
sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};

// Launch as a basic parallel_for over Q quadrature points
ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on({e});
cgh.depends_on(e);

int iarg{};
cgh.set_arg(iarg, context_data);
Expand Down
Loading
Loading