Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AtPoints - fix transpose basis apply on GPU #1655

Merged
merged 2 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 41 additions & 10 deletions backends/cuda-ref/ceed-cuda-ref-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <ceed/jit-tools.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <string.h>

#include "../cuda/ceed-cuda-common.h"
#include "../cuda/ceed-cuda-compile.h"
Expand Down Expand Up @@ -115,18 +116,46 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
CeedCallBackend(CeedBasisGetDimension(basis, &dim));

// Check uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) {
CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
"BasisApplyAtPoints only supported for the same number of points in each element");
}

// Weight handled separately
if (eval_mode == CEED_EVAL_WEIGHT) {
CeedCallBackend(CeedVectorSetValue(v, 1.0));
return CEED_ERROR_SUCCESS;
}

// Check padded to uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
{
CeedInt num_comp, q_comp;
CeedSize len, len_required;

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
"Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
" Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
len, len_required);
}

// Move num_points array to device
if (is_transpose) {
const CeedInt num_bytes = num_elem * sizeof(CeedInt);

if (num_elem != data->num_elem_at_points) {
data->num_elem_at_points = num_elem;

if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
}
if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
memcpy(data->h_points_per_elem, num_points, num_bytes);
CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
}
}

// Build kernels if needed
if (data->num_points != max_num_points) {
CeedInt P_1d;
Expand Down Expand Up @@ -186,14 +215,14 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
// Basis action
switch (eval_mode) {
case CEED_EVAL_INTERP: {
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
} break;
case CEED_EVAL_GRAD: {
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
} break;
Expand Down Expand Up @@ -343,6 +372,8 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) {
CeedCallCuda(ceed, cuModuleUnload(data->module));
if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
Expand Down
24 changes: 15 additions & 9 deletions backends/cuda-ref/ceed-cuda-ref-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
CeedCallBackend(CeedOperatorGetData(op, &impl));

// Apply data
CeedCallBackend(CeedFree(&impl->num_points));
CeedCallBackend(CeedFree(&impl->skip_rstr_in));
CeedCallBackend(CeedFree(&impl->skip_rstr_out));
CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
Expand Down Expand Up @@ -557,10 +558,17 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
{
CeedElemRestriction elem_rstr = NULL;
CeedElemRestriction rstr_points = NULL;

CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
CeedCallBackend(CeedCalloc(num_elem, &impl->num_points));
for (CeedInt e = 0; e < num_elem; e++) {
CeedInt num_points_elem;

CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
impl->num_points[e] = num_points_elem;
}
}
impl->max_num_points = max_num_points;

Expand Down Expand Up @@ -674,7 +682,7 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const Ce
// Apply and add to output AtPoints
//------------------------------------------------------------------------------
static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
CeedInt max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
CeedInt max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL};
CeedQFunctionField *qf_input_fields, *qf_output_fields;
CeedQFunction qf;
Expand All @@ -686,12 +694,11 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
CeedInt num_points[num_elem];

// Setup
CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
num_points = impl->num_points;
max_num_points = impl->max_num_points;
for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;

// Input Evecs and Restriction
CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
Expand Down Expand Up @@ -1616,7 +1623,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, Cee
// Assemble Linear Diagonal AtPoints
//------------------------------------------------------------------------------
static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
CeedInt max_num_points, num_elem, num_input_fields, num_output_fields;
CeedInt max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL};
CeedQFunctionField *qf_input_fields, *qf_output_fields;
CeedQFunction qf;
Expand All @@ -1628,12 +1635,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
CeedInt num_points[num_elem];

// Setup
CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
num_points = impl->num_points;
max_num_points = impl->max_num_points;
for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;

// Create separate output e-vecs
if (impl->has_shared_e_vecs) {
Expand Down
4 changes: 4 additions & 0 deletions backends/cuda-ref/ceed-cuda-ref.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ typedef struct {
CeedScalar *d_grad_1d;
CeedScalar *d_q_weight_1d;
CeedScalar *d_chebyshev_interp_1d;
CeedInt num_elem_at_points;
CeedInt *h_points_per_elem;
CeedInt *d_points_per_elem;
} CeedBasis_Cuda;

typedef struct {
Expand Down Expand Up @@ -136,6 +139,7 @@ typedef struct {
CeedInt num_inputs, num_outputs;
CeedInt num_active_in, num_active_out;
CeedInt max_num_points;
CeedInt *num_points;
CeedVector *qf_active_in, point_coords_elem;
CeedOperatorDiag_Cuda *diag;
CeedOperatorAssemble_Cuda *asmb;
Expand Down
51 changes: 41 additions & 10 deletions backends/cuda-shared/ceed-cuda-shared-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <cuda_runtime.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#include "../cuda/ceed-cuda-common.h"
#include "../cuda/ceed-cuda-compile.h"
Expand Down Expand Up @@ -221,18 +222,46 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
CeedCallBackend(CeedBasisGetDimension(basis, &dim));

// Check uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) {
CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
"BasisApplyAtPoints only supported for the same number of points in each element");
}

// Weight handled separately
if (eval_mode == CEED_EVAL_WEIGHT) {
CeedCallBackend(CeedVectorSetValue(v, 1.0));
return CEED_ERROR_SUCCESS;
}

// Check padded to uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
{
CeedInt num_comp, q_comp;
CeedSize len, len_required;

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
"Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
" Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
len, len_required);
}

// Move num_points array to device
if (is_transpose) {
const CeedInt num_bytes = num_elem * sizeof(CeedInt);

if (num_elem != data->num_elem_at_points) {
data->num_elem_at_points = num_elem;

if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
}
if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
memcpy(data->h_points_per_elem, num_points, num_bytes);
CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
}
}

// Build kernels if needed
if (data->num_points != max_num_points) {
CeedInt P_1d;
Expand Down Expand Up @@ -292,14 +321,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
// Basis action
switch (eval_mode) {
case CEED_EVAL_INTERP: {
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
} break;
case CEED_EVAL_GRAD: {
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
} break;
Expand Down Expand Up @@ -345,6 +374,8 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
CeedCallCuda(ceed, cuModuleUnload(data->module));
if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d));
Expand Down
3 changes: 3 additions & 0 deletions backends/cuda-shared/ceed-cuda-shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ typedef struct {
CeedScalar *d_chebyshev_interp_1d;
CeedScalar *c_B;
CeedScalar *c_G;
CeedInt num_elem_at_points;
CeedInt *h_points_per_elem;
CeedInt *d_points_per_elem;
} CeedBasis_Cuda_shared;

CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
Expand Down
51 changes: 41 additions & 10 deletions backends/hip-ref/ceed-hip-ref-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>
#include <string.h>
#include <hip/hip_runtime.h>

#include "../hip/ceed-hip-common.h"
Expand Down Expand Up @@ -113,18 +114,46 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
CeedCallBackend(CeedBasisGetDimension(basis, &dim));

// Check uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) {
CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
"BasisApplyAtPoints only supported for the same number of points in each element");
}

// Weight handled separately
if (eval_mode == CEED_EVAL_WEIGHT) {
CeedCallBackend(CeedVectorSetValue(v, 1.0));
return CEED_ERROR_SUCCESS;
}

// Check padded to uniform number of points per elem
for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
{
CeedInt num_comp, q_comp;
CeedSize len, len_required;

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
"Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
" Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
len, len_required);
}

// Move num_points array to device
if (is_transpose) {
const CeedInt num_bytes = num_elem * sizeof(CeedInt);

if (num_elem != data->num_elem_at_points) {
data->num_elem_at_points = num_elem;

if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
}
if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
memcpy(data->h_points_per_elem, num_points, num_bytes);
CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
}
}

// Build kernels if needed
if (data->num_points != max_num_points) {
CeedInt P_1d;
Expand Down Expand Up @@ -184,14 +213,14 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
// Basis action
switch (eval_mode) {
case CEED_EVAL_INTERP: {
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
} break;
case CEED_EVAL_GRAD: {
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);

CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
} break;
Expand Down Expand Up @@ -341,6 +370,8 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) {
CeedCallHip(ceed, hipModuleUnload(data->module));
if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
CeedCallBackend(CeedFree(&data->h_points_per_elem));
if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
CeedCallHip(ceed, hipFree(data->d_interp_1d));
CeedCallHip(ceed, hipFree(data->d_grad_1d));
CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
Expand Down
Loading