CEED · jeremylt · Sep 5, 2024 · Sep 4, 2024 · Sep 4, 2024
diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -10,6 +10,7 @@
 #include <ceed/jit-tools.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <string.h>
 
 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
@@ -115,18 +116,46 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -186,14 +215,14 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
@@ -343,6 +372,8 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) {
   CeedCallCuda(ceed, cuModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));

diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -27,6 +27,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Apply data
+  CeedCallBackend(CeedFree(&impl->num_points));
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   CeedCallBackend(CeedFree(&impl->skip_rstr_out));
   CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
@@ -557,10 +558,17 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
   {
-    CeedElemRestriction elem_rstr = NULL;
+    CeedElemRestriction rstr_points = NULL;
 
-    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
-    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedCalloc(num_elem, &impl->num_points));
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt num_points_elem;
+
+      CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+      impl->num_points[e] = num_points_elem;
+    }
   }
   impl->max_num_points = max_num_points;
 
@@ -674,7 +682,7 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const Ce
 // Apply and add to output AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
+  CeedInt             max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -686,12 +694,11 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  CeedInt num_points[num_elem];
 
   // Setup
   CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
+  num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
-  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
@@ -1616,7 +1623,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, Cee
 // Assemble Linear Diagonal AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
-  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields;
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -1628,12 +1635,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  CeedInt num_points[num_elem];
 
   // Setup
   CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
+  num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
-  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
   // Create separate output e-vecs
   if (impl->has_shared_e_vecs) {

diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
@@ -75,6 +75,9 @@ typedef struct {
   CeedScalar *d_grad_1d;
   CeedScalar *d_q_weight_1d;
   CeedScalar *d_chebyshev_interp_1d;
+  CeedInt     num_elem_at_points;
+  CeedInt    *h_points_per_elem;
+  CeedInt    *d_points_per_elem;
 } CeedBasis_Cuda;
 
 typedef struct {
@@ -136,6 +139,7 @@ typedef struct {
   CeedInt                    num_inputs, num_outputs;
   CeedInt                    num_active_in, num_active_out;
   CeedInt                    max_num_points;
+  CeedInt                   *num_points;
   CeedVector                *qf_active_in, point_coords_elem;
   CeedOperatorDiag_Cuda     *diag;
   CeedOperatorAssemble_Cuda *asmb;

diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -12,6 +12,7 @@
 #include <cuda_runtime.h>
 #include <stdbool.h>
 #include <stddef.h>
+#include <string.h>
 
 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
@@ -221,18 +222,46 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -292,14 +321,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
@@ -345,6 +374,8 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
   CeedCallCuda(ceed, cuModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d));

diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
@@ -30,6 +30,9 @@ typedef struct {
   CeedScalar *d_chebyshev_interp_1d;
   CeedScalar *c_B;
   CeedScalar *c_G;
+  CeedInt     num_elem_at_points;
+  CeedInt    *h_points_per_elem;
+  CeedInt    *d_points_per_elem;
 } CeedBasis_Cuda_shared;
 
 CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,

diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -8,6 +8,7 @@
 #include <ceed.h>
 #include <ceed/backend.h>
 #include <ceed/jit-tools.h>
+#include <string.h>
 #include <hip/hip_runtime.h>
 
 #include "../hip/ceed-hip-common.h"
@@ -113,18 +114,46 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -184,14 +213,14 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
@@ -341,6 +370,8 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) {
   CeedCallHip(ceed, hipModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
   CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));