Skip to content

Commit

Permalink
Efficiently use both in-order and out-of-order sycl queues. (#59)
Browse files Browse the repository at this point in the history
* Default to in-order queues.

* Only order queue as necessary.

* Use the same queue recursively through a hierarchy of Ceed objects.
  • Loading branch information
kris-rowe authored and uumesh committed Apr 3, 2024
1 parent a035cd8 commit aaac8d7
Show file tree
Hide file tree
Showing 12 changed files with 149 additions and 151 deletions.
9 changes: 6 additions & 3 deletions backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -741,9 +741,12 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
code << "// -----------------------------------------------------------------------------\n\n";

// Copy the struct (containing device addresses) from the host to the device
sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1);
sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1);
sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1);
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};

sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1, e);
sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1, e);
sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1, e);
// These copies can happen while the JIT is being done
CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices}));

Expand Down
4 changes: 2 additions & 2 deletions backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec,
sycl::nd_range<3> kernel_range(global_range, local_range);

//-----------
// Order queue
sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};

CeedCallSycl(ceed, ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on(e);
Expand Down
10 changes: 5 additions & 5 deletions backends/sycl-gen/ceed-sycl-gen.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) {
CeedCallBackend(CeedInit_Sycl(ceed, resource));

CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared));

CeedCallBackend(CeedGetData(ceed_shared, &shared_data));
// Need to use the same queue everywhere for correct synchronization
shared_data->sycl_queue = data->sycl_queue;

CeedCallBackend(CeedSetDelegate(ceed, ceed_shared));
CeedCallBackend(CeedSetStream_Sycl(ceed_shared,&(data->sycl_queue)));

CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource));

Ceed ceed_fallback = NULL;
CeedCallBackend(CeedGetOperatorFallbackCeed(ceed, &ceed_fallback));
CeedCallBackend(CeedSetStream_Sycl(ceed_fallback,&(data->sycl_queue)));

CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Sycl_gen));
CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Sycl_gen));
CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Sycl));
Expand Down
68 changes: 37 additions & 31 deletions backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,11 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t
sycl::range<1> global_range(num_elem * work_group_size);
sycl::nd_range<1> kernel_range(global_range, local_range);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on({e});
cgh.depends_on(e);
cgh.use_kernel_bundle(sycl_module);

sycl::local_accessor<CeedScalar> s_mem(op_len + 2 * buf_len, cgh);
Expand Down Expand Up @@ -150,10 +151,11 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &
sycl::range<1> global_range(num_elem * work_group_size);
sycl::nd_range<1> kernel_range(global_range, local_range);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on({e});
cgh.depends_on(e);
cgh.use_kernel_bundle(sycl_module);

sycl::local_accessor<CeedScalar> s_mem(2 * (op_len + buf_len), cgh);
Expand Down Expand Up @@ -248,9 +250,10 @@ static int CeedBasisApplyWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem,
const CeedInt num_quad_z = (dim > 2) ? Q_1d : 1;
sycl::range<3> kernel_range(num_elem * num_quad_z, num_quad_y, num_quad_x);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, {e}, [=](sycl::item<3> work_item) {
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, e, [=](sycl::item<3> work_item) {
if (dim == 1) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]];
if (dim == 2) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]];
if (dim == 3) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]] * q_weight_1d[work_item[0] % Q_1d];
Expand Down Expand Up @@ -283,9 +286,9 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran
if (is_transpose) {
CeedSize length;
CeedCallBackend(CeedVectorGetLength(v, &length));
// Order queue
sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e});
std::vector<sycl::event> e;
if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
data->sycl_queue.fill<CeedScalar>(d_v, 0, length, e);
}

// Basis action
Expand Down Expand Up @@ -343,9 +346,10 @@ static int CeedBasisApplyNonTensorInterp_Sycl(sycl::queue &sycl_queue, CeedInt n

sycl::range<2> kernel_range(num_elem, v_size);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, e, [=](sycl::id<2> indx) {
const CeedInt i = indx[1];
const CeedInt elem = indx[0];

Expand Down Expand Up @@ -387,9 +391,10 @@ static int CeedBasisApplyNonTensorGrad_Sycl(sycl::queue &sycl_queue, CeedInt num

sycl::range<2> kernel_range(num_elem, v_size);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, e, [=](sycl::id<2> indx) {
const CeedInt i = indx[1];
const CeedInt elem = indx[0];

Expand Down Expand Up @@ -425,9 +430,10 @@ static int CeedBasisApplyNonTensorWeight_Sycl(sycl::queue &sycl_queue, CeedInt n

sycl::range<2> kernel_range(num_elem, num_qpts);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, e, [=](sycl::id<2> indx) {
const CeedInt i = indx[1];
const CeedInt elem = indx[0];
d_V[i + elem * num_qpts] = q_weight[i];
Expand Down Expand Up @@ -564,18 +570,18 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
impl->buf_len = num_comp * CeedIntMax(num_nodes, num_qpts);
impl->op_len = Q_1d * P_1d;

// Order queue
sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};

CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, {e});
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);

const CeedInt interp_length = Q_1d * P_1d;
CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, {e});
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e);

CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, {e});
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e);

CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));

Expand Down Expand Up @@ -618,19 +624,19 @@ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes
impl->num_nodes = num_nodes;
impl->num_qpts = num_qpts;

// Order queue
sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};

CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, {e});
sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, e);

const CeedInt interp_length = num_qpts * num_nodes;
CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, {e});
sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, e);

const CeedInt grad_length = num_qpts * num_nodes * dim;
CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, {e});
sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, e);

CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));

Expand Down
38 changes: 19 additions & 19 deletions backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -749,8 +749,8 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op) {
for (CeedInt i = 0; i < num_e_mode_in; i++) has_eval_none = has_eval_none || (e_mode_in[i] == CEED_EVAL_NONE);
for (CeedInt i = 0; i < num_e_mode_out; i++) has_eval_none = has_eval_none || (e_mode_out[i] == CEED_EVAL_NONE);

// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};

std::vector<sycl::event> copy_events;
if (has_eval_none) {
Expand Down Expand Up @@ -820,11 +820,12 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool i

sycl::range<1> kernel_range(num_elem * num_nodes);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
sycl_queue.parallel_for<CeedOperatorSyclLinearDiagonal>(kernel_range, {e}, [=](sycl::id<1> idx) {
const CeedInt tid = idx % num_nodes;
const CeedInt e = idx / num_nodes;
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

sycl_queue.parallel_for<CeedOperatorSyclLinearDiagonal>(kernel_range, e, [=](sycl::id<1> idx) {
    const CeedInt tid = idx % num_nodes;
    const CeedInt e = idx / num_nodes;

// Compute the diagonal of B^T D B
// Each element
Expand Down Expand Up @@ -1078,15 +1079,14 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
CeedEvalMode eval_mode = eval_mode_in[i];

if (eval_mode == CEED_EVAL_INTERP) {
// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
sycl_data->sycl_queue.copy<CeedScalar>(interp_in, &asmb->d_B_in[mat_start], elem_size * num_qpts, {e});
mat_start += elem_size * num_qpts;
} else if (eval_mode == CEED_EVAL_GRAD) {
// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
sycl_data->sycl_queue.copy<CeedScalar>(grad_in, &asmb->d_B_in[mat_start], dim * elem_size * num_qpts, {e});
mat_start += dim * elem_size * num_qpts;
      std::vector<sycl::event> e;
      if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
      sycl_data->sycl_queue.copy<CeedScalar>(grad_in, &asmb->d_B_in[mat_start], dim * elem_size * num_qpts, {e});
      mat_start += dim * elem_size * num_qpts;
}
}

Expand All @@ -1108,13 +1108,13 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) {
CeedEvalMode eval_mode = eval_mode_out[i];

if (eval_mode == CEED_EVAL_INTERP) {
// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
sycl_data->sycl_queue.copy<CeedScalar>(interp_out, &asmb->d_B_out[mat_start], elem_size * num_qpts, {e});
mat_start += elem_size * num_qpts;
} else if (eval_mode == CEED_EVAL_GRAD) {
// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
sycl_data->sycl_queue.copy<CeedScalar>(grad_out, &asmb->d_B_out[mat_start], dim * elem_size * num_qpts, {e});
mat_start += dim * elem_size * num_qpts;
}
Expand Down Expand Up @@ -1157,8 +1157,8 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp

sycl::range<3> kernel_range(num_elem, block_size_y, block_size_x);

// Order queue
sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
sycl_queue.parallel_for<CeedOperatorSyclLinearAssemble>(kernel_range, {e}, [=](sycl::id<3> idx) {
const int e = idx.get(0); // Element index
const int l = idx.get(1); // The output column index of each B^TDB operation
Expand Down
6 changes: 3 additions & 3 deletions backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,12 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C
// Get context data
CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &context_data));

// Order queue
sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};

// Launch as a basic parallel_for over Q quadrature points
ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on({e});
cgh.depends_on(e);

int iarg{};
cgh.set_arg(iarg, context_data);
Expand Down
16 changes: 8 additions & 8 deletions backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ static inline int CeedQFunctionContextSyncH2D_Sycl(const CeedQFunctionContext ct
CeedCallSycl(ceed, impl->d_data_owned = sycl::malloc_device(ctx_size, sycl_data->sycl_device, sycl_data->sycl_context));
impl->d_data = impl->d_data_owned;
}
// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->d_data, impl->h_data, ctx_size, {e});
CeedCallSycl(ceed, copy_event.wait_and_throw());
return CEED_ERROR_SUCCESS;
Expand Down Expand Up @@ -69,8 +69,8 @@ static inline int CeedQFunctionContextSyncD2H_Sycl(const CeedQFunctionContext ct
impl->h_data = impl->h_data_owned;
}

// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->h_data, impl->d_data, ctx_size, {e});
CeedCallSycl(ceed, copy_event.wait_and_throw());
return CEED_ERROR_SUCCESS;
Expand Down Expand Up @@ -194,8 +194,8 @@ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx
CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed));
CeedCallBackend(CeedGetData(ceed, &sycl_data));

// Order queue
sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier();
std::vector<sycl::event> e;
if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};

// Wait for all work to finish before freeing memory
if (impl->d_data_owned) {
Expand Down Expand Up @@ -260,8 +260,8 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con
CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl));
CeedCallBackend(CeedGetData(ceed, &ceedSycl));

// Order queue
ceedSycl->sycl_queue.ext_oneapi_submit_barrier();
// Order queue if needed
if (!ceedSycl->sycl_queue.is_in_order()) ceedSycl->sycl_queue.ext_oneapi_submit_barrier();

// Sync data to requested mem_type
CeedCallBackend(CeedQFunctionContextNeedSync_Sycl(ctx, mem_type, &need_sync));
Expand Down
Loading

0 comments on commit aaac8d7

Please sign in to comment.