Skip to content

Commit

Permalink
Merge pull request #1648 from CEED/jeremy/cuda-reuse-out
Browse files Browse the repository at this point in the history
Reuse GPU E-vecs
  • Loading branch information
jeremylt authored Aug 29, 2024
2 parents 29a534d + 8a21357 commit 71ed691
Show file tree
Hide file tree
Showing 4 changed files with 200 additions and 22 deletions.
109 changes: 99 additions & 10 deletions backends/cuda-ref/ceed-cuda-ref-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -264,14 +264,53 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {
impl->num_inputs = num_input_fields;
impl->num_outputs = num_output_fields;

// Set up infield and outfield e_vecs and q_vecs
// Set up infield and outfield e-vecs and q-vecs
// Infields
CeedCallBackend(
CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
// Outfields
CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
num_input_fields, num_output_fields, Q, num_elem));

// Reuse active e-vecs where able
// For each distinct element restriction found on an active input field, every active output field
//   that uses the same restriction (and is not CEED_EVAL_NONE) has its e-vec slot replaced with a
//   reference to that input's e-vec.
{
  CeedInt              num_used  = 0;
  CeedElemRestriction *rstr_used = NULL;  // Restrictions of active inputs already processed

  for (CeedInt i = 0; i < num_input_fields; i++) {
    bool                is_used = false;
    CeedVector          vec_i;
    CeedElemRestriction rstr_i;

    // Only active inputs are candidates for sharing
    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
    if (vec_i != CEED_VECTOR_ACTIVE) continue;
    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));

    // Skip restrictions that an earlier input already handled
    // Note: rstr_used holds num_used entries, so it must be indexed by j, not by the field index i
    for (CeedInt j = 0; j < num_used; j++) {
      if (rstr_i == rstr_used[j]) {
        is_used = true;
        break;
      }
    }
    if (is_used) continue;

    // Record this restriction as processed
    num_used++;
    if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
    else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
    rstr_used[num_used - 1] = rstr_i;

    // Point matching active outputs at this input's e-vec
    for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
      CeedEvalMode        eval_mode;
      CeedVector          vec_j;
      CeedElemRestriction rstr_j;

      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
      if (vec_j != CEED_VECTOR_ACTIVE) continue;
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
      if (eval_mode == CEED_EVAL_NONE) continue;
      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
      if (rstr_i == rstr_j) {
        // Output e-vecs are stored after the input e-vecs in impl->e_vecs
        CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
      }
    }
  }
  CeedCallBackend(CeedFree(&rstr_used));
}
impl->has_shared_e_vecs = true;
CeedCallBackend(CeedOperatorSetSetupDone(op));
return CEED_ERROR_SUCCESS;
}
Expand Down Expand Up @@ -310,7 +349,7 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFu
uint64_t state;

CeedCallBackend(CeedVectorGetState(vec, &state));
if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) {
if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
}
impl->input_states[i] = state;
Expand Down Expand Up @@ -435,6 +474,9 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec
// Q function
CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));

// Restore input arrays
CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));

// Output basis apply if needed
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedEvalMode eval_mode;
Expand Down Expand Up @@ -490,9 +532,6 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec

CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
}

// Restore input arrays
CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
return CEED_ERROR_SUCCESS;
}

Expand Down Expand Up @@ -536,14 +575,53 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
impl->num_inputs = num_input_fields;
impl->num_outputs = num_output_fields;

// Set up infield and outfield e_vecs and q_vecs
// Set up infield and outfield e-vecs and q-vecs
// Infields
CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
max_num_points, num_elem));
// Outfields
CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
num_input_fields, num_output_fields, max_num_points, num_elem));

// Reuse active e-vecs where able
// For each distinct element restriction found on an active input field, every active output field
//   that uses the same restriction (and is not CEED_EVAL_NONE) has its e-vec slot replaced with a
//   reference to that input's e-vec.
{
  CeedInt              num_used  = 0;
  CeedElemRestriction *rstr_used = NULL;  // Restrictions of active inputs already processed

  for (CeedInt i = 0; i < num_input_fields; i++) {
    bool                is_used = false;
    CeedVector          vec_i;
    CeedElemRestriction rstr_i;

    // Only active inputs are candidates for sharing
    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
    if (vec_i != CEED_VECTOR_ACTIVE) continue;
    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));

    // Skip restrictions that an earlier input already handled
    // Note: rstr_used holds num_used entries, so it must be indexed by j, not by the field index i
    for (CeedInt j = 0; j < num_used; j++) {
      if (rstr_i == rstr_used[j]) {
        is_used = true;
        break;
      }
    }
    if (is_used) continue;

    // Record this restriction as processed
    num_used++;
    if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
    else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
    rstr_used[num_used - 1] = rstr_i;

    // Point matching active outputs at this input's e-vec
    for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
      CeedEvalMode        eval_mode;
      CeedVector          vec_j;
      CeedElemRestriction rstr_j;

      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
      if (vec_j != CEED_VECTOR_ACTIVE) continue;
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
      if (eval_mode == CEED_EVAL_NONE) continue;
      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
      if (rstr_i == rstr_j) {
        // Output e-vecs are stored after the input e-vecs in impl->e_vecs
        CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
      }
    }
  }
  CeedCallBackend(CeedFree(&rstr_used));
}
impl->has_shared_e_vecs = true;
CeedCallBackend(CeedOperatorSetSetupDone(op));
return CEED_ERROR_SUCCESS;
}
Expand Down Expand Up @@ -646,6 +724,9 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
// Q function
CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));

// Restore input arrays
CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));

// Output basis apply if needed
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedEvalMode eval_mode;
Expand Down Expand Up @@ -703,9 +784,6 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,

CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
}

// Restore input arrays
CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
return CEED_ERROR_SUCCESS;
}

Expand Down Expand Up @@ -830,7 +908,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
}

// Un-set output q_vecs to prevent accidental overwrite of Assembled
// Un-set output q-vecs to prevent accidental overwrite of Assembled
for (CeedInt out = 0; out < num_output_fields; out++) {
CeedVector vec;

Expand Down Expand Up @@ -1557,6 +1635,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
max_num_points = impl->max_num_points;
for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;

// Create separate output e-vecs
if (impl->has_shared_e_vecs) {
for (CeedInt i = 0; i < impl->num_outputs; i++) {
CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[impl->num_inputs + i]));
}
CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
num_input_fields, num_output_fields, max_num_points, num_elem));
}
impl->has_shared_e_vecs = false;

// Input Evecs and Restriction
CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));

Expand Down
2 changes: 1 addition & 1 deletion backends/cuda-ref/ceed-cuda-ref.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ typedef struct {
} CeedOperatorAssemble_Cuda;

typedef struct {
bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out, has_shared_e_vecs;
uint64_t *input_states; // State tracking for passive inputs
CeedVector *e_vecs; // E-vectors, inputs followed by outputs
CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator
Expand Down
109 changes: 99 additions & 10 deletions backends/hip-ref/ceed-hip-ref-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -263,14 +263,53 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {
impl->num_inputs = num_input_fields;
impl->num_outputs = num_output_fields;

// Set up infield and outfield e_vecs and q_vecs
// Set up infield and outfield e-vecs and q-vecs
// Infields
CeedCallBackend(
CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
// Outfields
CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
num_input_fields, num_output_fields, Q, num_elem));

// Reuse active e-vecs where able
// For each distinct element restriction found on an active input field, every active output field
//   that uses the same restriction (and is not CEED_EVAL_NONE) has its e-vec slot replaced with a
//   reference to that input's e-vec.
{
  CeedInt              num_used  = 0;
  CeedElemRestriction *rstr_used = NULL;  // Restrictions of active inputs already processed

  for (CeedInt i = 0; i < num_input_fields; i++) {
    bool                is_used = false;
    CeedVector          vec_i;
    CeedElemRestriction rstr_i;

    // Only active inputs are candidates for sharing
    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
    if (vec_i != CEED_VECTOR_ACTIVE) continue;
    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));

    // Skip restrictions that an earlier input already handled
    // Note: rstr_used holds num_used entries, so it must be indexed by j, not by the field index i
    for (CeedInt j = 0; j < num_used; j++) {
      if (rstr_i == rstr_used[j]) {
        is_used = true;
        break;
      }
    }
    if (is_used) continue;

    // Record this restriction as processed
    num_used++;
    if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
    else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
    rstr_used[num_used - 1] = rstr_i;

    // Point matching active outputs at this input's e-vec
    for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
      CeedEvalMode        eval_mode;
      CeedVector          vec_j;
      CeedElemRestriction rstr_j;

      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
      if (vec_j != CEED_VECTOR_ACTIVE) continue;
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
      if (eval_mode == CEED_EVAL_NONE) continue;
      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
      if (rstr_i == rstr_j) {
        // Output e-vecs are stored after the input e-vecs in impl->e_vecs
        CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
      }
    }
  }
  CeedCallBackend(CeedFree(&rstr_used));
}
impl->has_shared_e_vecs = true;
CeedCallBackend(CeedOperatorSetSetupDone(op));
return CEED_ERROR_SUCCESS;
}
Expand Down Expand Up @@ -309,7 +348,7 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun
uint64_t state;

CeedCallBackend(CeedVectorGetState(vec, &state));
if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) {
if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
}
impl->input_states[i] = state;
Expand Down Expand Up @@ -434,6 +473,9 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect
// Q function
CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out));

// Restore input arrays
CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));

// Output basis apply if needed
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedEvalMode eval_mode;
Expand Down Expand Up @@ -489,9 +531,6 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect

CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
}

// Restore input arrays
CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
return CEED_ERROR_SUCCESS;
}

Expand Down Expand Up @@ -535,14 +574,53 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
impl->num_inputs = num_input_fields;
impl->num_outputs = num_output_fields;

// Set up infield and outfield e_vecs and q_vecs
// Set up infield and outfield e-vecs and q-vecs
// Infields
CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
max_num_points, num_elem));
// Outfields
CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
num_input_fields, num_output_fields, max_num_points, num_elem));

// Reuse active e-vecs where able
// For each distinct element restriction found on an active input field, every active output field
//   that uses the same restriction (and is not CEED_EVAL_NONE) has its e-vec slot replaced with a
//   reference to that input's e-vec.
{
  CeedInt              num_used  = 0;
  CeedElemRestriction *rstr_used = NULL;  // Restrictions of active inputs already processed

  for (CeedInt i = 0; i < num_input_fields; i++) {
    bool                is_used = false;
    CeedVector          vec_i;
    CeedElemRestriction rstr_i;

    // Only active inputs are candidates for sharing
    CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
    if (vec_i != CEED_VECTOR_ACTIVE) continue;
    CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));

    // Skip restrictions that an earlier input already handled
    // Note: rstr_used holds num_used entries, so it must be indexed by j, not by the field index i
    for (CeedInt j = 0; j < num_used; j++) {
      if (rstr_i == rstr_used[j]) {
        is_used = true;
        break;
      }
    }
    if (is_used) continue;

    // Record this restriction as processed
    num_used++;
    if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used));
    else CeedCallBackend(CeedRealloc(num_used, &rstr_used));
    rstr_used[num_used - 1] = rstr_i;

    // Point matching active outputs at this input's e-vec
    for (CeedInt j = num_output_fields - 1; j >= 0; j--) {
      CeedEvalMode        eval_mode;
      CeedVector          vec_j;
      CeedElemRestriction rstr_j;

      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j));
      if (vec_j != CEED_VECTOR_ACTIVE) continue;
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode));
      if (eval_mode == CEED_EVAL_NONE) continue;
      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j));
      if (rstr_i == rstr_j) {
        // Output e-vecs are stored after the input e-vecs in impl->e_vecs
        CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs]));
      }
    }
  }
  CeedCallBackend(CeedFree(&rstr_used));
}
impl->has_shared_e_vecs = true;
CeedCallBackend(CeedOperatorSetSetupDone(op));
return CEED_ERROR_SUCCESS;
}
Expand Down Expand Up @@ -645,6 +723,9 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
// Q function
CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out));

// Restore input arrays
CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));

// Output basis apply if needed
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedEvalMode eval_mode;
Expand Down Expand Up @@ -702,9 +783,6 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,

CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request));
}

// Restore input arrays
CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl));
return CEED_ERROR_SUCCESS;
}

Expand Down Expand Up @@ -829,7 +907,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b
CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out));
}

// Un-set output q_vecs to prevent accidental overwrite of Assembled
// Un-set output q-vecs to prevent accidental overwrite of Assembled
for (CeedInt out = 0; out < num_output_fields; out++) {
CeedVector vec;

Expand Down Expand Up @@ -1554,6 +1632,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
max_num_points = impl->max_num_points;
for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;

// Create separate output e-vecs
if (impl->has_shared_e_vecs) {
for (CeedInt i = 0; i < impl->num_outputs; i++) {
CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i]));
CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[impl->num_inputs + i]));
}
CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out,
num_input_fields, num_output_fields, max_num_points, num_elem));
}
impl->has_shared_e_vecs = false;

// Input Evecs and Restriction
CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request));

Expand Down
2 changes: 1 addition & 1 deletion backends/hip-ref/ceed-hip-ref.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ typedef struct {
} CeedOperatorAssemble_Hip;

typedef struct {
bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out;
bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out, has_shared_e_vecs;
uint64_t *input_states; // State tracking for passive inputs
CeedVector *e_vecs; // E-vectors, inputs followed by outputs
CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator
Expand Down

0 comments on commit 71ed691

Please sign in to comment.