Skip to content

Commit

Permalink
Merge pull request CEED#1655 from CEED/jeremy/at-points-transpose
Browse files Browse the repository at this point in the history
AtPoints - fix transpose basis apply on GPU
  • Loading branch information
jeremylt authored Sep 5, 2024
2 parents f04c500 + 9e511c8 commit 25c4e04
Show file tree
Hide file tree
Showing 12 changed files with 220 additions and 62 deletions.
51 changes: 41 additions & 10 deletions backends/cuda-ref/ceed-cuda-ref-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <ceed/jit-tools.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <string.h>

#include "../cuda/ceed-cuda-common.h"
#include "../cuda/ceed-cuda-compile.h"
Expand Down Expand Up @@ -115,18 +116,46 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
CeedCallBackend(CeedBasisGetDimension(basis, &dim));

// Check uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) {
CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
"BasisApplyAtPoints only supported for the same number of points in each element");
}

// Weight handled separately
if (eval_mode == CEED_EVAL_WEIGHT) {
CeedCallBackend(CeedVectorSetValue(v, 1.0));
return CEED_ERROR_SUCCESS;
}

// Check padded to uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
{
CeedInt num_comp, q_comp;
CeedSize len, len_required;

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
"Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
" Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
len, len_required);
}

// Move num_points array to device
if (is_transpose) {
const CeedInt num_bytes = num_elem * sizeof(CeedInt);

if (num_elem != data->num_elem_at_points) {
data->num_elem_at_points = num_elem;

if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
}
if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
memcpy(data->h_points_per_elem, num_points, num_bytes);
CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
}
}

// Build kernels if needed
if (data->num_points != max_num_points) {
CeedInt P_1d;
Expand Down Expand Up @@ -186,14 +215,14 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
// Basis action
switch (eval_mode) {
case CEED_EVAL_INTERP: {
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
} break;
case CEED_EVAL_GRAD: {
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
} break;
Expand Down Expand Up @@ -343,6 +372,8 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) {
CeedCallCuda(ceed, cuModuleUnload(data->module));
if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
Expand Down
24 changes: 15 additions & 9 deletions backends/cuda-ref/ceed-cuda-ref-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
CeedCallBackend(CeedOperatorGetData(op, &impl));

// Apply data
CeedCallBackend(CeedFree(&impl->num_points));
CeedCallBackend(CeedFree(&impl->skip_rstr_in));
CeedCallBackend(CeedFree(&impl->skip_rstr_out));
CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
Expand Down Expand Up @@ -557,10 +558,17 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
{
CeedElemRestriction elem_rstr = NULL;
CeedElemRestriction rstr_points = NULL;

CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
CeedCallBackend(CeedCalloc(num_elem, &impl->num_points));
for (CeedInt e = 0; e < num_elem; e++) {
CeedInt num_points_elem;

CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
impl->num_points[e] = num_points_elem;
}
}
impl->max_num_points = max_num_points;

Expand Down Expand Up @@ -674,7 +682,7 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const Ce
// Apply and add to output AtPoints
//------------------------------------------------------------------------------
static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
CeedInt max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
CeedInt max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL};
CeedQFunctionField *qf_input_fields, *qf_output_fields;
CeedQFunction qf;
Expand All @@ -686,12 +694,11 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
CeedInt num_points[num_elem];

// Setup
CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
num_points = impl->num_points;
max_num_points = impl->max_num_points;
for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;

// Input Evecs and Restriction
CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
Expand Down Expand Up @@ -1616,7 +1623,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, Cee
// Assemble Linear Diagonal AtPoints
//------------------------------------------------------------------------------
static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
CeedInt max_num_points, num_elem, num_input_fields, num_output_fields;
CeedInt max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL};
CeedQFunctionField *qf_input_fields, *qf_output_fields;
CeedQFunction qf;
Expand All @@ -1628,12 +1635,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
CeedInt num_points[num_elem];

// Setup
CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
num_points = impl->num_points;
max_num_points = impl->max_num_points;
for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;

// Create separate output e-vecs
if (impl->has_shared_e_vecs) {
Expand Down
4 changes: 4 additions & 0 deletions backends/cuda-ref/ceed-cuda-ref.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ typedef struct {
CeedScalar *d_grad_1d;
CeedScalar *d_q_weight_1d;
CeedScalar *d_chebyshev_interp_1d;
CeedInt num_elem_at_points;
CeedInt *h_points_per_elem;
CeedInt *d_points_per_elem;
} CeedBasis_Cuda;

typedef struct {
Expand Down Expand Up @@ -136,6 +139,7 @@ typedef struct {
CeedInt num_inputs, num_outputs;
CeedInt num_active_in, num_active_out;
CeedInt max_num_points;
CeedInt *num_points;
CeedVector *qf_active_in, point_coords_elem;
CeedOperatorDiag_Cuda *diag;
CeedOperatorAssemble_Cuda *asmb;
Expand Down
51 changes: 41 additions & 10 deletions backends/cuda-shared/ceed-cuda-shared-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <cuda_runtime.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#include "../cuda/ceed-cuda-common.h"
#include "../cuda/ceed-cuda-compile.h"
Expand Down Expand Up @@ -221,18 +222,46 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
CeedCallBackend(CeedBasisGetDimension(basis, &dim));

// Check uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) {
CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
"BasisApplyAtPoints only supported for the same number of points in each element");
}

// Weight handled separately
if (eval_mode == CEED_EVAL_WEIGHT) {
CeedCallBackend(CeedVectorSetValue(v, 1.0));
return CEED_ERROR_SUCCESS;
}

// Check padded to uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
{
CeedInt num_comp, q_comp;
CeedSize len, len_required;

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
"Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
" Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
len, len_required);
}

// Move num_points array to device
if (is_transpose) {
const CeedInt num_bytes = num_elem * sizeof(CeedInt);

if (num_elem != data->num_elem_at_points) {
data->num_elem_at_points = num_elem;

if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
}
if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
memcpy(data->h_points_per_elem, num_points, num_bytes);
CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
}
}

// Build kernels if needed
if (data->num_points != max_num_points) {
CeedInt P_1d;
Expand Down Expand Up @@ -292,14 +321,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
// Basis action
switch (eval_mode) {
case CEED_EVAL_INTERP: {
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
} break;
case CEED_EVAL_GRAD: {
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
} break;
Expand Down Expand Up @@ -345,6 +374,8 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
CeedCallCuda(ceed, cuModuleUnload(data->module));
if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d));
Expand Down
3 changes: 3 additions & 0 deletions backends/cuda-shared/ceed-cuda-shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ typedef struct {
CeedScalar *d_chebyshev_interp_1d;
CeedScalar *c_B;
CeedScalar *c_G;
CeedInt num_elem_at_points;
CeedInt *h_points_per_elem;
CeedInt *d_points_per_elem;
} CeedBasis_Cuda_shared;

CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
Expand Down
51 changes: 41 additions & 10 deletions backends/hip-ref/ceed-hip-ref-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>
#include <string.h>
#include <hip/hip_runtime.h>

#include "../hip/ceed-hip-common.h"
Expand Down Expand Up @@ -113,18 +114,46 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
CeedCallBackend(CeedBasisGetDimension(basis, &dim));

// Check uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) {
CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
"BasisApplyAtPoints only supported for the same number of points in each element");
}

// Weight handled separately
if (eval_mode == CEED_EVAL_WEIGHT) {
CeedCallBackend(CeedVectorSetValue(v, 1.0));
return CEED_ERROR_SUCCESS;
}

// Check padded to uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
{
CeedInt num_comp, q_comp;
CeedSize len, len_required;

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
"Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
" Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
len, len_required);
}

// Move num_points array to device
if (is_transpose) {
const CeedInt num_bytes = num_elem * sizeof(CeedInt);

if (num_elem != data->num_elem_at_points) {
data->num_elem_at_points = num_elem;

if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
}
if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
memcpy(data->h_points_per_elem, num_points, num_bytes);
CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
}
}

// Build kernels if needed
if (data->num_points != max_num_points) {
CeedInt P_1d;
Expand Down Expand Up @@ -184,14 +213,14 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
// Basis action
switch (eval_mode) {
case CEED_EVAL_INTERP: {
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
} break;
case CEED_EVAL_GRAD: {
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
} break;
Expand Down Expand Up @@ -341,6 +370,8 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) {
CeedCallHip(ceed, hipModuleUnload(data->module));
if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
CeedCallHip(ceed, hipFree(data->d_interp_1d));
CeedCallHip(ceed, hipFree(data->d_grad_1d));
CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
Expand Down
Loading

0 comments on commit 25c4e04

Please sign in to comment.